Example #1
def get(savepath: str,
        bsize: int = 32,
        vocab_size: int = 5000) -> Tuple[DataLoader, DataLoader, Dict, Dict]:
    savepath = Path(savepath)

    print('Reading...')
    train = lfds.SmallParallelEnJa('train')
    validation = lfds.SmallParallelEnJa('dev')

    train = train.map(preprocess)
    validation = validation.map(preprocess)

    src_tokens: List = lf.flat_map(lambda x: x[0],
                                   train + validation,
                                   lazy=True)  # en
    tgt_tokens: List = lf.flat_map(lambda x: x[1],
                                   train + validation,
                                   lazy=True)  # ja

    print('Building vocabulary...')
    src_t2i, _ = build_vocab(src_tokens, savepath / 'src.vocab', vocab_size)
    tgt_t2i, _ = build_vocab(tgt_tokens, savepath / 'tgt.vocab', vocab_size)

    print(f'Source vocab size: {len(src_t2i)}')
    print(f'Target vocab size: {len(tgt_t2i)}')

    src_pad_idx = src_t2i[PAD_TOKEN]
    tgt_pad_idx = tgt_t2i[PAD_TOKEN]
    src_unk_idx = src_t2i[UNK_TOKEN]
    tgt_unk_idx = tgt_t2i[UNK_TOKEN]

    print('Postprocessing...')
    train_loader = DataLoader(
        train.map(postprocess(src_t2i, src_unk_idx, tgt_t2i, tgt_unk_idx))
             .save(savepath / 'enja.train.cache'),
        batch_size=bsize,
        shuffle=True,
        num_workers=4,
        collate_fn=get_collate_fn(src_pad_idx, tgt_pad_idx))

    validation_loader = DataLoader(
        validation.map(postprocess(src_t2i, src_unk_idx, tgt_t2i, tgt_unk_idx))
                  .save(savepath / 'enja.validation.cache'),
        batch_size=bsize,
        shuffle=False,
        num_workers=4,
        collate_fn=get_collate_fn(src_pad_idx, tgt_pad_idx))

    return train_loader, validation_loader, src_t2i, tgt_t2i
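
A minimal usage sketch for the get() function above; the cache directory, batch size, and the assumption that the collate function yields padded (source, target) LongTensor pairs are illustrative rather than taken from the original script.

if __name__ == '__main__':
    # Illustrative path and batch size, not from the original source.
    train_loader, validation_loader, src_t2i, tgt_t2i = get('./cache', bsize=64)

    # Assuming the collate function pads each side to the batch maximum and
    # returns (source, target) LongTensors, one batch looks like:
    src, tgt = next(iter(train_loader))
    print(src.shape, tgt.shape)  # e.g. (64, max_src_len) and (64, max_tgt_len)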
Example #2
def test_returns_flat_mapped_data_lazily(self):
    result = lineflow.flat_map(lambda x: [x] * 3, self.data, lazy=True)
    self.assertIsInstance(result, itertools.chain)
    expected = list(itertools.chain.from_iterable(
        [[x] * 3 for x in self.data]))
    for x, y in zip(result, expected):
        self.assertEqual(x, y)
Example #3
def build(dpath, savedir):
    '''
    1. Read dpath csv file.
    2. Preprocess. (tokenizing, stripping)
    3. Build vocab.
    4. Replace tokens with ids.
    5. Save.
    '''

    # Load csv data.
    dpath = Path(dpath)
    savedir = Path(savedir)

    # Preprocess
    tokenizer = Tokenizer()
    train = lf.CsvDataset(str(dpath / 'train.csv'),
                          header=True).map(get_preprocess(tokenizer))
    test = lf.CsvDataset(str(dpath / 'test.csv'),
                         header=True).map(get_preprocess(tokenizer))

    # Collect all tokens.
    tokens = lf.flat_map(lambda x: x['tokens'], train, lazy=True)

    # Build vocab.
    words, t2i = build_vocab(tokens)

    # Save vocab.
    with open(savedir / 'vocab.pkl', 'wb') as f:
        pickle.dump((t2i, words), f)

    # Save dataset.
    train.map(get_postprocess(t2i, t2i[UNK_TOKEN])).save(
        str(savedir / 'dataset.train.token.pkl'))
    test.map(get_postprocess(t2i, t2i[UNK_TOKEN])).save(
        str(savedir / 'dataset.test.token.pkl'))
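
For completeness, the vocabulary saved above can be read back by reversing the pickle.dump call (the path is savedir / 'vocab.pkl' from the snippet):

import pickle

with open('vocab.pkl', 'rb') as f:
    # The same (token-to-index, word list) pair dumped above.
    t2i, words = pickle.load(f)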
Example #4
def build(datapath='./data/example.txt', savedir='./'):
    datapath = Path(datapath)
    savedir = Path(savedir)

    docs = lf.TextDataset(str(datapath))
    ids = lf.Dataset(range(len(docs)))
    docs = docs.map(preprocess)
    ds = lf.zip(ids, docs)

    tokens = lf.flat_map(lambda x: x[1], ds, lazy=True)
    t2i, words = build_vocab(tokens, str(savedir / 'vocab.pkl'))

    unk_index = t2i[UNK_TOKEN]

    ds.map(postprocess(t2i, unk_index))\
        .save(str(savedir / 'dataset.token.pkl'))
Example #5
def test_returns_flat_mapped_data_eagerly(self):
    result = lineflow.flat_map(lambda x: [x] * 3, self.data)
    expected = [[x] * 3 for x in self.data]
    expected = [x for xs in expected for x in xs]
    self.assertListEqual(result, expected)
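
Taken together with the lazy test in Example #2, a minimal standalone sketch of the two modes, assuming flat_map accepts a plain Python list as the tests' self.data suggests (the data values are illustrative):

import itertools

import lineflow

data = [1, 2, 3]  # illustrative input

eager = lineflow.flat_map(lambda x: [x] * 3, data)            # returns a plain list
lazy = lineflow.flat_map(lambda x: [x] * 3, data, lazy=True)  # returns an itertools.chain

assert eager == [1, 1, 1, 2, 2, 2, 3, 3, 3]
assert isinstance(lazy, itertools.chain)
assert list(lazy) == eager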
Example #6
        padded_src = [x + [pad_index] * (src_max_length - len(x)) for x in src]
        padded_tgt = [y + [IGNORE_INDEX] * (tgt_max_length - len(y)) for y in tgt]
        return torch.LongTensor(padded_src), torch.LongTensor(padded_tgt)
    return f


if __name__ == '__main__':
    print('Reading...')
    train = lfds.SmallParallelEnJa('train')
    validation = lfds.SmallParallelEnJa('dev')

    train = train.map(preprocess)
    validation = validation.map(preprocess)

    en_tokens = lf.flat_map(lambda x: x[0],
                            train + validation,
                            lazy=True)
    ja_tokens = lf.flat_map(lambda x: x[1],
                            train + validation,
                            lazy=True)
    print('Building vocabulary...')
    en_token_to_index, _ = build_vocab(en_tokens, 'en.vocab')
    ja_token_to_index, _ = build_vocab(ja_tokens, 'ja.vocab')
    print(f'En vocab size: {len(en_token_to_index)}')
    print(f'Ja vocab size: {len(ja_token_to_index)}')

    pad_index = en_token_to_index[PAD_TOKEN]
    en_unk_index = en_token_to_index[UNK_TOKEN]
    ja_unk_index = ja_token_to_index[UNK_TOKEN]

    loader = DataLoader(
Example #7
def __iter__(self) -> Iterator[Any]:
    yield from lf.flat_map(self._map_func, self._dataset, lazy=True)
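
A hypothetical minimal iterable built around this __iter__ pattern; the class and attribute names are illustrative and not part of lineflow's API:

from typing import Any, Callable, Iterator, Sequence

import lineflow as lf


class FlatMapIterable:
    # Hypothetical wrapper, mirroring the snippet above; not lineflow's own class.
    def __init__(self, map_func: Callable, dataset: Sequence) -> None:
        self._map_func = map_func
        self._dataset = dataset

    def __iter__(self) -> Iterator[Any]:
        # The flat mapping is deferred until iteration time.
        yield from lf.flat_map(self._map_func, self._dataset, lazy=True)


# list(FlatMapIterable(lambda x: [x] * 2, [1, 2])) == [1, 1, 2, 2]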
Example #8
def get_collate_fn(pad_index):
    def f(batch):
        indices, labels = zip(*batch)
        max_length = max(len(x) for x in indices)
        padded = [x + [pad_index] * (max_length - len(x)) for x in indices]
        return torch.LongTensor(padded), torch.LongTensor(labels)
    return f


if __name__ == '__main__':
    print('Reading...')
    train = lfds.Imdb('train').map(preprocess)

    tokens = lf.flat_map(lambda x: x[0],
                         train,
                         lazy=True)
    print('Building vocabulary...')
    token_to_index, _ = build_vocab(tokens, 'vocab.pkl')
    print(f'Vocab Size: {len(token_to_index)}')

    pad_index = token_to_index[PAD_TOKEN]
    unk_index = token_to_index[UNK_TOKEN]

    loader = DataLoader(
        train
        .map(postprocess(token_to_index, unk_index))
        .save('imdb.train.cache'),
        batch_size=32,
        num_workers=4,
        collate_fn=get_collate_fn(pad_index))
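
To illustrate what get_collate_fn above produces, a tiny hand-built batch; the token indices and labels are made up, and torch is the import the script already uses:

collate = get_collate_fn(pad_index=0)
batch = [([5, 6, 7], 1), ([8, 9], 0)]  # illustrative (token indices, label) pairs
indices, labels = collate(batch)
# The shorter sequence is padded with pad_index to the batch maximum:
# indices == tensor([[5, 6, 7], [8, 9, 0]]), labels == tensor([1, 0])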
Example #9
    print('Reading...')
    train = Seq2SeqDataset(
        source_file_path='./cnndm/train.txt.src',
        target_file_path='./cnndm/train.txt.tgt.tagged') \
        .to_dict(source_field_name=SOURCE_FIELD, target_field_name=TARGET_FIELD)
    validation = Seq2SeqDataset(
        source_file_path='./cnndm/val.txt.src',
        target_file_path='./cnndm/val.txt.tgt.tagged') \
        .to_dict(source_field_name=SOURCE_FIELD, target_field_name=TARGET_FIELD)

    train = train.map(preprocess)
    validation = validation.map(preprocess)

    tokens = lf.flat_map(lambda x: x[SOURCE_FIELD] + x[TARGET_FIELD],
                         train + validation,
                         lazy=True)
    print('Building vocabulary...')
    token_to_index, words = build_vocab(tokens)
    print(f'Vocab Size: {len(token_to_index)}')

    pad_index = token_to_index[PAD_TOKEN]
    unk_index = token_to_index[UNK_TOKEN]

    loader = DataLoader(train.map(postprocess(
        token_to_index, unk_index)).save('cnndm.preprocessed'),
                        batch_size=32,
                        num_workers=4,
                        collate_fn=collate(pad_index))

    for batch in tqdm(loader):