예제 #1
0
 def __init__(self, captions_path, spectrograms_path, lazy=False):
     """Load paired caption and spectrogram datasets.

     Warning: in lazy mode the caption metadata is NOT used to link a
     caption to its spectrogram. The spectrogram files (read from the
     path in sorted order and loaded as tensors) are assumed to follow
     the exact same ordering as the LMD-encoded captions.
     """
     self.lazy = lazy
     reader = lmd.Reader(captions_path)
     if lazy:
         self.captions = reader.stream_data(get_meta=False)
         self.spectrograms = SpectrogramLazyDataset(spectrograms_path)
     else:
         self.captions = reader.stream_data(get_meta=True)
         self.spectrograms = SpectrogramDataset(spectrograms_path)
예제 #2
0
    def documents(self):
        """Run the download step, then yield (cleaned_text, meta) pairs.

        Advertisement text is stripped from each document's body; the
        metadata dict is passed through unchanged.
        """
        self._download()

        reader = lmd.Reader(
            'components/openwebtext2/openwebtext2.jsonl.zst.tar')
        return ((remove_advertisement(text), meta)
                for text, meta in reader.stream_data(get_meta=True))
예제 #3
0
    def documents(self):
        """Run the download step, then yield (text, meta) pairs.

        Documents whose text is 100000 characters or longer are skipped.
        """
        self._download()

        reader = lmd.Reader('components/github/github.jsonl.zst.tar')
        return ((text, meta)
                for text, meta in reader.stream_data(get_meta=True)
                if len(text) < 100000)
예제 #4
0
    def documents(self):
        """Run the download step, then yield each document with markdown
        colon artifacts stripped."""
        self._download()

        reader = lmd.Reader('components/pubmedcentral/PMC_extracts.tar.gz')
        for doc in reader.stream_data():
            yield strip_markdown_colons(doc)
예제 #5
0
    def documents(self):
        """Run the download step, then yield each document with
        advertisement text removed."""
        self._download()

        reader = lmd.Reader(
            'components/openwebtext2/openwebtext2.jsonl.zst.tar')
        for doc in reader.stream_data():
            yield remove_advertisement(doc)
def test_tgz_read():
    """Reading a .tar.gz containing one text file yields exactly that file."""
    reader = lmd.Reader('test/blns.txt.tar.gz')
    # Context manager so the fixture file handle is closed even on failure
    # (the original bare open() leaked the handle).
    with open('test/blns.txt') as f:
        blns = f.read()

    data = list(reader.stream_data(get_meta=False))

    assert data[0] == blns
    assert len(data) == 1
def test_jsonl_tar():
    """A .jsonl.zst.tar yields each inner document with its metadata.

    The archive contains the same four-document sequence twice, so
    entries 4-7 mirror entries 0-3.
    """
    # Context manager so the fixture file handle is closed even on failure
    # (the original bare open() leaked the handle).
    with open('test/blns.txt') as f:
        blns = f.read()
    reader = lmd.Reader('test/blns.jsonl.zst.tar')

    data = list(reader.stream_data(get_meta=True))

    assert data[0] == (blns, {})
    assert data[1] == ('testing 123\n\ntesting 345', {'testing': 123})
    assert data[2] == (blns, {'testing2': 456, 'testing': ['a', 'b']})
    assert data[3] == ('testing 123456789', {})

    assert data[4] == (blns, {})
    assert data[5] == ('testing 123\n\ntesting 345', {'testing': 123})
    assert data[6] == (blns, {'testing2': 456, 'testing': ['a', 'b']})
    assert data[7] == ('testing 123456789', {})
def test_jsonl_paras():
    """Round-trip: Archive.add_data then Reader.stream_data(get_meta=True).

    A list of paragraphs is stored joined by a blank line; missing meta
    comes back as an empty dict.
    """
    try:
        archive = lmd.Archive('test_dir')
        # Context manager so the fixture file handle is closed even on
        # failure (the original bare open() leaked the handle).
        with open('test/blns.txt') as f:
            blns = f.read()
        archive.add_data(blns)
        archive.add_data(['testing 123', 'testing 345'], meta={'testing': 123})
        archive.add_data(blns, meta={'testing2': 456, 'testing': ['a', 'b']})
        archive.add_data('testing 123456789')
        archive.commit()

        reader = lmd.Reader('test_dir')

        data = list(reader.stream_data(get_meta=True))

        assert data[0] == (blns, {})
        assert data[1] == ('testing 123\n\ntesting 345', {'testing': 123})
        assert data[2] == (blns, {'testing2': 456, 'testing': ['a', 'b']})
        assert data[3] == ('testing 123456789', {})
    finally:
        # Cleanup must run even when an assert fails, or test_dir leaks
        # into subsequent runs; ignore_errors covers the case where the
        # directory was never created.
        shutil.rmtree('test_dir', ignore_errors=True)
def test_json():
    """Round-trip: JSONArchive.add_data then Reader.stream_data().

    Documents come back in insertion order, without metadata.
    """
    try:
        archive = lmd.JSONArchive('test_dir')
        # Context manager so the fixture file handle is closed even on
        # failure (the original bare open() leaked the handle).
        with open('test/blns.txt') as f:
            blns = f.read()
        archive.add_data(blns)
        archive.add_data('testing 123')
        archive.add_data(blns)
        archive.add_data('testing 123456789')
        archive.commit()

        reader = lmd.Reader('test_dir')

        data = list(reader.stream_data())

        assert data[0] == blns
        assert data[1] == 'testing 123'
        assert data[2] == blns
        assert data[3] == 'testing 123456789'
    finally:
        # Cleanup must run even when an assert fails, or test_dir leaks
        # into subsequent runs; ignore_errors covers the case where the
        # directory was never created.
        shutil.rmtree('test_dir', ignore_errors=True)
예제 #10
0
def compute_perplexity_data(model, data_path, indices=None):
    """Collect perplexity statistics for documents streamed from data_path.

    For expedience this assumes everything fits in memory, and results
    are kept as plain lists of arrays.

    Args:
        model: object exposing get_perplexity_data(doc); a falsy return
            value means the document is skipped.
        data_path: path readable by lm_dataformat.Reader.
        indices: optional collection of document indices; when given,
            only documents at those positions are processed.

    Returns:
        dict with keys "all_logprobs", "all_positions" (lists of the
        per-document arrays), "aggregate_length" and
        "aggregate_utf8_length" (summed totals).
    """
    all_logprobs = []
    all_positions = []
    aggregate_length = 0
    aggregate_utf8_length = 0.

    stream = lm_dataformat.Reader(data_path).stream_data()
    for idx, doc in enumerate(tqdm_lib.tqdm(stream)):
        if indices is not None and idx not in indices:
            continue
        output = model.get_perplexity_data(doc)
        if not output:
            continue
        all_logprobs.append(output["logprobs"])
        all_positions.append(output["positions"])
        aggregate_length += output["length"]
        aggregate_utf8_length += output["utf8_length"]

    return {
        "all_logprobs": all_logprobs,
        "all_positions": all_positions,
        "aggregate_length": aggregate_length,
        "aggregate_utf8_length": aggregate_utf8_length,
    }
예제 #11
0
    def documents(self):
        """Run the download step, then yield the EuroParl documents."""
        self._download()

        reader = lmd.Reader(
            'components/europarl/EuroParliamentProceedings_1996_2011.jsonl.zst'
        )
        for doc in reader.stream_data():
            yield doc
예제 #12
0
    def documents(self):
        """Run the download step, then yield the PhilArchive documents."""
        self._download()

        reader = lmd.Reader('components/philpapers/PhilArchive.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #13
0
    def documents(self):
        """Run the download step, then yield the USPTO documents."""
        self._download()

        reader = lmd.Reader('components/uspto/pile_uspto.jsonl.zst.tar')
        for doc in reader.stream_data():
            yield doc
예제 #14
0
 def meta_items():
     """Map `analyze` over the (text, meta) pairs read from `f`,
     distributing the work across the surrounding process pool."""
     stream = lmd.Reader(f).stream_data(get_meta=True)
     return pool.imap(analyze, stream)
예제 #15
0
    def documents(self):
        """Run the download step, then yield the GOVINFO/CZIC documents."""
        self._download()

        reader = lmd.Reader('components/czic/GOVINFO_CZIC_KL.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #16
0
    def documents(self):
        """Run the download step, then yield the OpenWebText documents."""
        self._download()

        reader = lmd.Reader('components/openwebtext/openwebtext')
        for doc in reader.stream_data():
            yield doc
예제 #17
0
    def documents(self):
        """Run the download step, then yield the FreeLaw opinion documents."""
        self._download()

        reader = lmd.Reader('components/freelaw/FreeLaw_Opinions.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #18
0
    def documents(self):
        """Run the download step, then yield the YouTube subtitle documents."""
        self._download()

        reader = lmd.Reader('components/youtubesubtitles/yt_subs.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #19
0
    def documents(self):
        """Run the download step, then yield documents shorter than
        100000 characters."""
        self._download()

        reader = lmd.Reader('components/github/github.jsonl.zst.tar')
        yield from (doc for doc in reader.stream_data()
                    if len(doc) < 100000)
예제 #20
0
    def documents(self):
        """Run the download step, then yield the filtered Common Crawl
        documents."""
        self._download()

        reader = lmd.Reader(
            'components/commoncrawl/pile_cc_filtered.jsonl.zst.tar')
        for doc in reader.stream_data():
            yield doc
예제 #21
0
    def documents(self):
        """Run the download step, then yield the Ubuntu IRC documents."""
        self._download()

        reader = lmd.Reader(
            'components/ubuntu_irc/ubuntu_irc_until_2020_9_1.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #22
0
    def documents(self):
        """Run the download step, then yield the CORD-19 documents."""
        self._download()

        reader = lmd.Reader('components/cord19/out')
        for doc in reader.stream_data():
            yield doc
예제 #23
0
    def documents(self):
        """Run the download step, then yield the Literotica documents."""
        self._download()

        reader = lmd.Reader('components/literotica/Literotica.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #24
0
    def documents(self):
        """Run the download step, then yield the Enron email documents."""
        self._download()

        reader = lmd.Reader('components/enron_emails/out')
        for doc in reader.stream_data():
            yield doc
예제 #25
0
    def documents(self):
        """Run the download step, then yield the Hacker News documents."""
        self._download()

        reader = lmd.Reader('components/hackernews/hn.tar.gz')
        for doc in reader.stream_data():
            yield doc
예제 #26
0
    def documents(self):
        """Run the download step, then yield the NIH ExPORTER grant
        documents."""
        self._download()

        reader = lmd.Reader(
            'components/exporter/NIH_ExPORTER_awarded_grant_text.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #27
0
    def documents(self):
        """Run the download step, then yield the arXiv documents."""
        self._download()

        reader = lmd.Reader('components/arxiv/arxiv.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #28
0
    def documents(self):
        """Run the download step, then yield the Stack Exchange documents."""
        self._download()

        reader = lmd.Reader('components/stackexchange/out')
        for doc in reader.stream_data():
            yield doc
예제 #29
0
    def documents(self):
        """Run the download step, then yield the OpenSubtitles documents."""
        self._download()

        reader = lmd.Reader('components/opensubtitles/out')
        for doc in reader.stream_data():
            yield doc
예제 #30
0
    def documents(self):
        """Run the download step, then yield the PubMed title/abstract
        documents."""
        self._download()

        reader = lmd.Reader(
            'components/pubmed/PUBMED_title_abstracts_2019_baseline.jsonl.zst')
        for doc in reader.stream_data():
            yield doc