def load_penn(path, batch_size, max_size=1000000, min_freq=1, gpu=False,
              shuffle=True):
    train_data = load_lines(os.path.join(path, 'train.txt'))
    valid_data = load_lines(os.path.join(path, 'valid.txt'))
    test_data = load_lines(os.path.join(path, 'test.txt'))

    d = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
             max_size=max_size, min_freq=min_freq)
    d.fit(train_data, valid_data)

    train = PairedDataset(train_data, None, {'src': d}, batch_size, gpu=gpu)
    valid = PairedDataset(valid_data, None, {'src': d}, batch_size, gpu=gpu,
                          evaluation=True)
    test = PairedDataset(test_data, None, {'src': d}, batch_size, gpu=gpu,
                         evaluation=True)

    return train.sort_(), valid.sort_(), test.sort_()
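# Usage sketch (illustrative, not part of the original module): 'data/penn' is
# a hypothetical directory holding train.txt / valid.txt / test.txt with one
# sentence per line, as `load_lines` expects.
train, valid, test = load_penn('data/penn', batch_size=20, gpu=False)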
def load_dataset(src, trg, batch_size, max_size=100000, min_freq=5, gpu=False,
                 shuffle=True, sort_key=default_sort_key, **kwargs):
    """
    Wrapper function for dataset with sensible, overwritable defaults
    """
    tweets_dict = Dict(pad_token='<pad>', eos_token='<eos>', bos_token='<bos>',
                       max_size=max_size, min_freq=min_freq)
    labels_dict = Dict(sequential=False, force_unk=False)
    tweets_dict.fit(src)
    labels_dict.fit(trg)
    d = {'src': tweets_dict, 'trg': labels_dict}
    splits = PairedDataset(src, trg, d, batch_size, gpu=gpu).splits(
        shuffle=shuffle, sort_key=sort_key, **kwargs)
    return splits
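# Usage sketch (illustrative, toy data): `src` is a list of tokenized tweets
# and `trg` the corresponding non-sequential labels; with real data you would
# pass more examples and forward split proportions through **kwargs.
src = [['nice', 'day'], ['so', 'bad'], ['meh', 'okay', 'i', 'guess']]
trg = ['pos', 'neg', 'neu']
splits = load_dataset(src, trg, batch_size=2, min_freq=1)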
def load_split_data(path, batch_size, max_size, min_freq, max_len, device,
                    processor):
    """
    Load a corpus that is already split into 'train.txt', 'valid.txt'
    and 'test.txt'
    """
    train = load_lines(os.path.join(path, 'train.txt'), max_len, processor)
    valid = load_lines(os.path.join(path, 'valid.txt'), max_len, processor)

    d = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
             max_size=max_size, min_freq=min_freq, force_unk=True
             ).fit(train, valid)

    # re-read the splits (load_lines may yield generators consumed by fit)
    train = load_lines(os.path.join(path, 'train.txt'), max_len, processor)
    valid = load_lines(os.path.join(path, 'valid.txt'), max_len, processor)
    test = load_lines(os.path.join(path, 'test.txt'), max_len, processor)

    train = PairedDataset(train, None, {'src': d}, batch_size, device=device)
    valid = PairedDataset(valid, None, {'src': d}, batch_size, device=device)
    test = PairedDataset(test, None, {'src': d}, batch_size, device=device)

    return train.sort_(), valid.sort_(), test.sort_()
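# Usage sketch (illustrative): the corpus directory and the `processor`
# argument are assumptions; `processor` stands for whatever per-line callable
# `load_lines` expects here (a tokenizer is assumed), and 'cpu' assumes a
# torch-style device string.
train, valid, test = load_split_data(
    'data/corpus', batch_size=32, max_size=50000, min_freq=5, max_len=100,
    device='cpu', processor=str.split)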
def load_penn(path, batch_size, max_size=1000000, min_freq=1, gpu=False,
              shuffle=True, sort_key=lambda pair: len(pair[0])):
    train_data = load_lines(os.path.join(path, 'train.txt'))
    train_labels = make_mock_labels(train_data)
    valid_data = load_lines(os.path.join(path, 'valid.txt'))
    valid_labels = make_mock_labels(valid_data)
    test_data = load_lines(os.path.join(path, 'test.txt'))
    test_labels = make_mock_labels(test_data)

    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq)
    ldict.fit(train_data, valid_data)
    mock = Dict().fit(train_labels)
    d = {'src': ldict, 'trg': mock}

    train = PairedDataset(train_data, train_labels, d, batch_size, gpu=gpu
                          ).sort_(sort_key=sort_key)
    valid = PairedDataset(valid_data, valid_labels, d, batch_size, gpu=gpu,
                          evaluation=True).sort_(sort_key=sort_key)
    test = PairedDataset(test_data, test_labels, d, batch_size, gpu=gpu,
                         evaluation=True).sort_(sort_key=sort_key)

    return train, valid, test
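# Usage sketch (illustrative): same call shape as the variant above; the mock
# target side keeps the splits compatible with code paths that expect
# (src, trg) pairs, and a custom sort_key can be supplied.
train, valid, test = load_penn('data/penn', batch_size=20,
                               sort_key=lambda pair: len(pair[0]))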
def load_from_lines(path, batch_size, max_size=1000000, min_freq=5, gpu=False,
                    shuffle=True, sort_key=lambda x: len(x[0]), **kwargs):
    lines = load_lines(path)

    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq)
    ldict.fit(lines)

    mock_labels = make_mock_labels(lines)
    mock = Dict()
    mock.fit(mock_labels)
    d = {'src': ldict, 'trg': mock}

    splits = PairedDataset(lines, mock_labels, d, batch_size, gpu=gpu).splits(
        shuffle=shuffle, sort_key=sort_key, **kwargs)

    return splits
def load_from_lines(path, batch_size, max_size=1000000, min_freq=5, gpu=False,
                    shuffle=True, **kwargs):
    lines = load_lines(path)
    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq).fit(lines)

    return PairedDataset(lines, None, {'src': ldict}, batch_size, gpu=gpu
                         ).splits(shuffle=shuffle, **kwargs)
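# Usage sketch (illustrative): 'data/corpus.txt' is a hypothetical
# one-example-per-line file; `dev` and `test` are forwarded to
# PairedDataset.splits, mirroring the calls elsewhere in this code.
splits = load_from_lines('data/corpus.txt', batch_size=50, min_freq=5,
                         dev=0.1, test=None)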
def shingle_dataset(args, vocab_dict=None, focus_size=None, right_size=None):
    if focus_size:
        args.focus_size = focus_size
    if right_size:
        args.right_size = right_size

    # load the data:
    if args.task == 'sentences':
        dataset = list(
            SentenceCouples(args.input, max_items=args.max_items,
                            tokenize=args.tokenize, level=args.level))
        print(f'* loaded {len(dataset)} sentences')
    elif args.task == 'snippets':
        dataset = list(
            SnippetCouples(args.input, focus_size=args.focus_size,
                           right_size=args.right_size,
                           max_items=args.max_items))
        print(f'* loaded {len(dataset)} snippets')
    else:
        raise ValueError("`Task` should be one of ('sentences', 'snippets')")

    # random shuffle:
    if args.shuffle:
        print('* shuffling batches...')
        random.seed(args.rnd_seed)
        random.shuffle(dataset)

    # preview the first couples
    for c in dataset[:10]:
        print('\t'.join(' '.join(s[:10]) for s in c))

    if vocab_dict is None:
        vocab_dict = Dict(pad_token=u.PAD, bos_token=u.BOS, eos_token=u.EOS,
                          min_freq=args.min_item_freq, sequential=True,
                          force_unk=True, max_size=args.max_vocab_size)

    focus, right = zip(*dataset)
    del dataset

    if not vocab_dict.fitted:
        # sometimes inefficient? do a partial fit in the triple store?
        vocab_dict.fit(focus, right)

    train, valid = PairedDataset(
        src=(focus,), trg=(right,),
        d={'src': (vocab_dict,), 'trg': (vocab_dict,)},
        batch_size=args.batch_size, gpu=args.gpu,
        align_right=args.reverse, fitted=False
    ).splits(sort_by='src', dev=args.dev, test=None, sort=True)

    return train, valid, vocab_dict
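# Usage sketch (illustrative): `args` is assumed to be an argparse.Namespace
# carrying the fields the function reads; every value below is a placeholder.
import argparse

args = argparse.Namespace(
    input='data/novels.txt', task='sentences', max_items=None, tokenize=True,
    level='word', shuffle=True, rnd_seed=42, min_item_freq=2,
    max_vocab_size=50000, batch_size=64, gpu=False, reverse=False, dev=0.05)
train, valid, vocab_dict = shingle_dataset(args)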
    dataset = PairedDataset.from_disk(f)
    dataset.set_batch_size(args.batch_size)
    dataset.set_gpu(args.gpu)
    train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
    src_dict = dataset.dicts['src']
else:
    src, trg = zip(*d.generate_set(size, vocab, args.min_len, args.max_len,
                                   sample_fn))
    src, trg = list(map(list, src)), list(map(list, trg))
    src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
    src_dict.fit(src, trg)
    train, valid = PairedDataset(
        src, trg, {'src': src_dict, 'trg': src_dict},
        batch_size=args.batch_size, gpu=args.gpu
    ).splits(dev=args.dev, test=None, sort_by='src')

print(' * vocabulary size. %d' % len(src_dict))
print(' * number of train batches. %d' % len(train))
print(' * maximum batch size. %d' % batch_size)

print('Building model...')
model = EncoderDecoder((args.layers, args.dec_layers), args.emb_dim,
                       args.hid_dim, args.att_dim,
parser.add_argument('--pretrained', type=str, default='empty')
# Logging
parser.add_argument('--gen_src', default=None)
parser.add_argument('--gen_tgt', default=None)
parser.add_argument('--csv', type=str, default='empty')
parser.add_argument('--logging', action='store_true')
parser.add_argument('--visdom', action='store_true')
args = parser.parse_args()

src, trg = load_data(args.path, ('.answers', '.questions'))
src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                max_size=args.max_size, min_freq=args.min_freq)
src_dict.fit(src, trg)
train, valid = PairedDataset(
    src, trg, {'src': src_dict, 'trg': src_dict},
    batch_size=args.batch_size, gpu=args.gpu
).splits(dev=args.dev, test=None, sort_key=lambda pair: len(pair[0]))

print(' * vocabulary size. %d' % len(src_dict))
print(' * number of train batches. %d' % len(train))
print(' * maximum batch size. %d' % args.batch_size)

print('Building model...')
model = EncoderDecoder(  # removed (args.hid_dim, args.hid_dim), added args.hid_dim
    (args.layers, args.layers), args.emb_dim, args.hid_dim, args.att_dim,
    src_dict, att_type=args.att_type, dropout=args.dropout,
    bidi=args.bidi, cell=args.cell)

# Load Glove Pretrained Embeddings
for target in args.targets:
    sample_fn = wrap_autoencode(getattr(d, target))
    src, trg = zip(*d.generate_set(
        args.train_len, args.vocab, args.min_len, args.max_len, sample_fn))
    src, trg = list(map(list, src)), list(map(list, trg))
    datasets[target] = {'src': src, 'trg': trg}

src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
src_dict.fit(*[data for target in datasets
               for data in datasets[target].values()])

# iterate over a snapshot of the keys: the entries are replaced inside the loop
for target in list(datasets):
    train, valid = PairedDataset(
        datasets[target]['src'], datasets[target]['trg'],
        {'src': src_dict, 'trg': src_dict},
        batch_size=args.batch_size, gpu=args.gpu
    ).splits(dev=args.dev, test=None, shuffle=True,
             sort_key=lambda pair: len(pair[0]))
    del datasets[target]
    src, trg = zip(*d.generate_set(
        int(args.train_len * 0.1), args.vocab, args.min_len, args.max_len,
        getattr(d, target)))
    src, trg = list(map(list, src)), list(map(list, trg))
    test = PairedDataset(src, trg, {'src': src_dict, 'trg': src_dict},
                         batch_size=args.batch_size, gpu=args.gpu)
    datasets[target] = {'train': train, 'valid': valid, 'test': test}

print('Building model...')
model = ForkableMultiTarget(
    (args.layers, args.layers), args.emb_dim, (args.hid_dim, args.hid_dim),
    args.att_dim, src_dict, att_type=args.att_type, dropout=args.dropout,