예제 #1
0
 def __init__(self, captions_path, spectrograms_path, lazy=False):
     """Load paired caption and spectrogram datasets.

     Warning: in lazy mode the caption metadata is NOT used to link a
     caption to its spectrogram. The spectrogram files (read from the
     path in sorted order and loaded as tensors) are assumed to follow
     the exact same ordering as the LMD-encoded captions.
     """
     self.lazy = lazy
     reader = lmd.Reader(captions_path)
     if lazy:
         self.captions = reader.stream_data(get_meta=False)
         self.spectrograms = SpectrogramLazyDataset(spectrograms_path)
     else:
         self.captions = reader.stream_data(get_meta=True)
         self.spectrograms = SpectrogramDataset(spectrograms_path)
예제 #2
0
    def documents(self):
        """Run the download step, then yield (cleaned_text, meta) pairs.

        Advertisement text is stripped from each document's body; the
        metadata dict is passed through unchanged.
        """
        self._download()

        reader = lmd.Reader(
            'components/openwebtext2/openwebtext2.jsonl.zst.tar')
        return ((remove_advertisement(text), meta)
                for text, meta in reader.stream_data(get_meta=True))
예제 #3
0
    def documents(self):
        """Run the download step, then yield (text, meta) pairs.

        Documents whose text is 100000 characters or longer are skipped.
        """
        self._download()

        reader = lmd.Reader('components/github/github.jsonl.zst.tar')
        return ((text, meta)
                for text, meta in reader.stream_data(get_meta=True)
                if len(text) < 100000)
예제 #4
0
    def documents(self):
        """Run the download step, then yield each document with markdown
        colon artifacts stripped."""
        self._download()

        reader = lmd.Reader('components/pubmedcentral/PMC_extracts.tar.gz')
        for doc in reader.stream_data():
            yield strip_markdown_colons(doc)
예제 #5
0
    def documents(self):
        """Run the download step, then yield each document with
        advertisement text removed."""
        self._download()

        reader = lmd.Reader(
            'components/openwebtext2/openwebtext2.jsonl.zst.tar')
        for doc in reader.stream_data():
            yield remove_advertisement(doc)
def test_tgz_read():
    """Reading a .tar.gz containing one text file yields exactly that file."""
    reader = lmd.Reader('test/blns.txt.tar.gz')
    # Context manager so the fixture file handle is closed even on failure
    # (the original bare open() leaked the handle).
    with open('test/blns.txt') as f:
        blns = f.read()

    data = list(reader.stream_data(get_meta=False))

    assert data[0] == blns
    assert len(data) == 1
def test_jsonl_tar():
    """A .jsonl.zst.tar yields each inner document with its metadata.

    The archive contains the same four-document sequence twice, so
    entries 4-7 mirror entries 0-3.
    """
    # Context manager so the fixture file handle is closed even on failure
    # (the original bare open() leaked the handle).
    with open('test/blns.txt') as f:
        blns = f.read()
    reader = lmd.Reader('test/blns.jsonl.zst.tar')

    data = list(reader.stream_data(get_meta=True))

    assert data[0] == (blns, {})
    assert data[1] == ('testing 123\n\ntesting 345', {'testing': 123})
    assert data[2] == (blns, {'testing2': 456, 'testing': ['a', 'b']})
    assert data[3] == ('testing 123456789', {})

    assert data[4] == (blns, {})
    assert data[5] == ('testing 123\n\ntesting 345', {'testing': 123})
    assert data[6] == (blns, {'testing2': 456, 'testing': ['a', 'b']})
    assert data[7] == ('testing 123456789', {})
def test_jsonl_paras():
    """Round-trip: Archive.add_data then Reader.stream_data(get_meta=True).

    A list of paragraphs is stored joined by a blank line; missing meta
    comes back as an empty dict.
    """
    try:
        archive = lmd.Archive('test_dir')
        # Context manager so the fixture file handle is closed even on
        # failure (the original bare open() leaked the handle).
        with open('test/blns.txt') as f:
            blns = f.read()
        archive.add_data(blns)
        archive.add_data(['testing 123', 'testing 345'], meta={'testing': 123})
        archive.add_data(blns, meta={'testing2': 456, 'testing': ['a', 'b']})
        archive.add_data('testing 123456789')
        archive.commit()

        reader = lmd.Reader('test_dir')

        data = list(reader.stream_data(get_meta=True))

        assert data[0] == (blns, {})
        assert data[1] == ('testing 123\n\ntesting 345', {'testing': 123})
        assert data[2] == (blns, {'testing2': 456, 'testing': ['a', 'b']})
        assert data[3] == ('testing 123456789', {})
    finally:
        # Cleanup must run even when an assert fails, or test_dir leaks
        # into subsequent runs; ignore_errors covers the case where the
        # directory was never created.
        shutil.rmtree('test_dir', ignore_errors=True)
def test_json():
    """Round-trip: JSONArchive.add_data then Reader.stream_data().

    Documents come back in insertion order, without metadata.
    """
    try:
        archive = lmd.JSONArchive('test_dir')
        # Context manager so the fixture file handle is closed even on
        # failure (the original bare open() leaked the handle).
        with open('test/blns.txt') as f:
            blns = f.read()
        archive.add_data(blns)
        archive.add_data('testing 123')
        archive.add_data(blns)
        archive.add_data('testing 123456789')
        archive.commit()

        reader = lmd.Reader('test_dir')

        data = list(reader.stream_data())

        assert data[0] == blns
        assert data[1] == 'testing 123'
        assert data[2] == blns
        assert data[3] == 'testing 123456789'
    finally:
        # Cleanup must run even when an assert fails, or test_dir leaks
        # into subsequent runs; ignore_errors covers the case where the
        # directory was never created.
        shutil.rmtree('test_dir', ignore_errors=True)
예제 #10
0
def compute_perplexity_data(model, data_path, indices=None):
    """Collect perplexity statistics for documents streamed from data_path.

    For expedience this assumes everything fits in memory, and results
    are kept as plain lists of arrays.

    Args:
        model: object exposing get_perplexity_data(doc); a falsy return
            value means the document is skipped.
        data_path: path readable by lm_dataformat.Reader.
        indices: optional collection of document indices; when given,
            only documents at those positions are processed.

    Returns:
        dict with keys "all_logprobs", "all_positions" (lists of the
        per-document arrays), "aggregate_length" and
        "aggregate_utf8_length" (summed totals).
    """
    all_logprobs = []
    all_positions = []
    aggregate_length = 0
    aggregate_utf8_length = 0.

    stream = lm_dataformat.Reader(data_path).stream_data()
    for idx, doc in enumerate(tqdm_lib.tqdm(stream)):
        if indices is not None and idx not in indices:
            continue
        output = model.get_perplexity_data(doc)
        if not output:
            continue
        all_logprobs.append(output["logprobs"])
        all_positions.append(output["positions"])
        aggregate_length += output["length"]
        aggregate_utf8_length += output["utf8_length"]

    return {
        "all_logprobs": all_logprobs,
        "all_positions": all_positions,
        "aggregate_length": aggregate_length,
        "aggregate_utf8_length": aggregate_utf8_length,
    }
예제 #11
0
    def documents(self):
        """Run the download step, then yield the EuroParl documents."""
        self._download()

        reader = lmd.Reader(
            'components/europarl/EuroParliamentProceedings_1996_2011.jsonl.zst'
        )
        for doc in reader.stream_data():
            yield doc
예제 #12
0
    def documents(self):
        """Run the download step, then yield the PhilArchive documents."""
        self._download()

        reader = lmd.Reader('components/philpapers/PhilArchive.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #13
0
    def documents(self):
        """Run the download step, then yield the USPTO documents."""
        self._download()

        reader = lmd.Reader('components/uspto/pile_uspto.jsonl.zst.tar')
        for doc in reader.stream_data():
            yield doc
예제 #14
0
 def meta_items():
     """Map `analyze` over the (text, meta) pairs read from `f`,
     distributing the work across the surrounding process pool."""
     stream = lmd.Reader(f).stream_data(get_meta=True)
     return pool.imap(analyze, stream)
예제 #15
0
    def documents(self):
        """Run the download step, then yield the GOVINFO/CZIC documents."""
        self._download()

        reader = lmd.Reader('components/czic/GOVINFO_CZIC_KL.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #16
0
    def documents(self):
        """Run the download step, then yield the OpenWebText documents."""
        self._download()

        reader = lmd.Reader('components/openwebtext/openwebtext')
        for doc in reader.stream_data():
            yield doc
예제 #17
0
    def documents(self):
        """Run the download step, then yield the FreeLaw opinion documents."""
        self._download()

        reader = lmd.Reader('components/freelaw/FreeLaw_Opinions.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #18
0
    def documents(self):
        """Run the download step, then yield the YouTube subtitle documents."""
        self._download()

        reader = lmd.Reader('components/youtubesubtitles/yt_subs.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #19
0
    def documents(self):
        """Run the download step, then yield documents shorter than
        100000 characters."""
        self._download()

        reader = lmd.Reader('components/github/github.jsonl.zst.tar')
        yield from (doc for doc in reader.stream_data()
                    if len(doc) < 100000)
예제 #20
0
    def documents(self):
        """Run the download step, then yield the filtered Common Crawl
        documents."""
        self._download()

        reader = lmd.Reader(
            'components/commoncrawl/pile_cc_filtered.jsonl.zst.tar')
        for doc in reader.stream_data():
            yield doc
예제 #21
0
    def documents(self):
        """Run the download step, then yield the Ubuntu IRC documents."""
        self._download()

        reader = lmd.Reader(
            'components/ubuntu_irc/ubuntu_irc_until_2020_9_1.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #22
0
    def documents(self):
        """Run the download step, then yield the CORD-19 documents."""
        self._download()

        reader = lmd.Reader('components/cord19/out')
        for doc in reader.stream_data():
            yield doc
예제 #23
0
    def documents(self):
        """Run the download step, then yield the Literotica documents."""
        self._download()

        reader = lmd.Reader('components/literotica/Literotica.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #24
0
    def documents(self):
        """Run the download step, then yield the Enron email documents."""
        self._download()

        reader = lmd.Reader('components/enron_emails/out')
        for doc in reader.stream_data():
            yield doc
예제 #25
0
    def documents(self):
        """Run the download step, then yield the Hacker News documents."""
        self._download()

        reader = lmd.Reader('components/hackernews/hn.tar.gz')
        for doc in reader.stream_data():
            yield doc
예제 #26
0
    def documents(self):
        """Run the download step, then yield the NIH ExPORTER grant
        documents."""
        self._download()

        reader = lmd.Reader(
            'components/exporter/NIH_ExPORTER_awarded_grant_text.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #27
0
    def documents(self):
        """Run the download step, then yield the arXiv documents."""
        self._download()

        reader = lmd.Reader('components/arxiv/arxiv.jsonl.zst')
        for doc in reader.stream_data():
            yield doc
예제 #28
0
    def documents(self):
        """Run the download step, then yield the Stack Exchange documents."""
        self._download()

        reader = lmd.Reader('components/stackexchange/out')
        for doc in reader.stream_data():
            yield doc
예제 #29
0
    def documents(self):
        """Run the download step, then yield the OpenSubtitles documents."""
        self._download()

        reader = lmd.Reader('components/opensubtitles/out')
        for doc in reader.stream_data():
            yield doc
예제 #30
0
    def documents(self):
        """Run the download step, then yield the PubMed title/abstract
        documents."""
        self._download()

        reader = lmd.Reader(
            'components/pubmed/PUBMED_title_abstracts_2019_baseline.jsonl.zst')
        for doc in reader.stream_data():
            yield doc