import random
from pathlib import Path

import fastText
import lineflow as lf
from torch.utils.data import DataLoader


def get(ddir: str, ft_path: str, split: str):
    random.seed(1111)
    ddir = Path(ddir)
    # Load the pretrained fastText model and wrap it in a SWEM encoder.
    ft_model = fastText.load_model(ft_path)
    swem = SWEM(ft_model)
    # Each split is stored as three parallel text files: an integer label
    # and the two sentences, one example per line.
    quality = lf.TextDataset(str(ddir / f'quality.{split}.txt')).map(int)
    sent1 = lf.TextDataset(str(ddir / f'sent1.{split}.txt')).map(sent_preprocess(swem))
    sent2 = lf.TextDataset(str(ddir / f'sent2.{split}.txt')).map(sent_preprocess(swem))
    # Zip the three datasets so each item is a (label, sent1, sent2) triple.
    ds = lf.zip(quality, sent1, sent2)
    return ds
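# SWEM and sent_preprocess are used above but defined elsewhere. A minimal
# sketch of what they might look like, assuming the SWEM-aver variant
# (average pooling over fastText word vectors) and whitespace tokenization;
# the class and method names here are illustrative, not the project's API.
import numpy as np


class SWEM:
    """Simple Word-Embedding Model: pool word vectors into a sentence vector."""

    def __init__(self, ft_model):
        self._ft = ft_model

    def embed(self, tokens):
        # Average the fastText vector of every token (hypothetical pooling).
        return np.mean([self._ft.get_word_vector(t) for t in tokens], axis=0)


def sent_preprocess(swem):
    # Return a closure suitable for Dataset.map: raw line -> sentence vector.
    def f(line):
        return swem.embed(line.strip().split())
    return f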
def build(datapath='./data/example.txt', savedir='./'):
    datapath = Path(datapath)
    savedir = Path(savedir)
    docs = lf.TextDataset(str(datapath))
    # Pair each document with its index so the id survives later mapping.
    ids = lf.Dataset(range(len(docs)))
    docs = docs.map(preprocess)
    ds = lf.zip(ids, docs)
    # Lazily stream every token from every document to build the vocabulary
    # without materializing the whole corpus in memory.
    tokens = lf.flat_map(lambda x: x[1], ds, lazy=True)
    t2i, words = build_vocab(tokens, str(savedir / 'vocab.pkl'))
    unk_index = t2i[UNK_TOKEN]
    # Map tokens to vocabulary indices and cache the result to disk.
    ds.map(postprocess(t2i, unk_index)) \
        .save(str(savedir / 'dataset.token.pkl'))
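# build depends on preprocess, build_vocab, postprocess, and UNK_TOKEN, none
# of which appear in this section. A minimal sketch under the assumption of
# whitespace tokenization and a pickled (token-to-index, index-to-token)
# vocabulary; every name and detail below is illustrative.
import pickle
from collections import Counter

UNK_TOKEN = '<unk>'


def preprocess(line):
    # Raw line -> list of lowercased tokens.
    return line.strip().lower().split()


def build_vocab(tokens, cache_path, max_size=50000):
    # Count tokens (the input may be a lazy stream), keep the most frequent,
    # and reserve index 0 for the unknown token.
    counter = Counter(tokens)
    words = (UNK_TOKEN,) + tuple(w for w, _ in counter.most_common(max_size))
    t2i = {w: i for i, w in enumerate(words)}
    with open(cache_path, 'wb') as f:
        pickle.dump((t2i, words), f)
    return t2i, words


def postprocess(t2i, unk_index):
    # Return a closure for Dataset.map: (id, tokens) -> (id, index list).
    def f(x):
        doc_id, tokens = x
        return doc_id, [t2i.get(t, unk_index) for t in tokens]
    return f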
def test_get(ddir: str, savedir: str, bsize: int, ft_path: str):
    ddir = Path(ddir)
    savedir = Path(savedir)
    ft_model = fastText.load_model(ft_path)
    swem = SWEM(ft_model)
    quality = lf.TextDataset(str(ddir / 'quality.test.txt')).map(int)
    sent1 = lf.TextDataset(str(ddir / 'sent1.test.txt')).map(sent_preprocess(swem))
    sent2 = lf.TextDataset(str(ddir / 'sent2.test.txt')).map(sent_preprocess(swem))
    ds = lf.zip(quality, sent1, sent2)
    test_dataloader = DataLoader(
        # Cache the preprocessed dataset to disk before wrapping it in a loader.
        ds.save(str(savedir / 'swem.test.cache')),
        batch_size=bsize,
        shuffle=False,
        num_workers=4,
        collate_fn=get_collate_fn())
    return test_dataloader
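# get_collate_fn is also defined elsewhere. A plausible sketch, assuming each
# dataset item is a (label, sent1_vector, sent2_vector) triple of an int and
# two NumPy arrays that should be batched into PyTorch tensors; the real
# collate function may shape its batches differently.
import numpy as np
import torch


def get_collate_fn():
    def collate(batch):
        labels, sents1, sents2 = zip(*batch)
        return (torch.tensor(labels, dtype=torch.long),
                torch.from_numpy(np.stack(sents1)).float(),
                torch.from_numpy(np.stack(sents2)).float())
    return collate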
def setUp(self):
    self.base = range(100)
    self.n = 5
    # Zip n copies of the same dataset so each item is an n-tuple of the
    # aligned elements.
    self.data = lineflow.zip(*[Dataset(self.base)] * self.n)
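def test_zip_returns_aligned_tuples(self):
    # A sketch of an assertion this fixture supports, assuming the zipped
    # lineflow dataset supports len() and indexing and yields one tuple per
    # aligned position; the test name is illustrative, not from the source.
    self.assertEqual(len(self.data), len(self.base))
    self.assertEqual(tuple(self.data[3]), (3,) * self.n)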