def setUp(self):
    self.corpus = [lorem.sentence().split() for _ in range(100)]
    self.tagged_corpus = [[fake_tags(w) for w in s] for s in self.corpus]
    self.tag1_corpus = [[tup[1] for tup in s] for s in self.tagged_corpus]
    self.tag2_corpus = [[tup[2] for tup in s] for s in self.tagged_corpus]
    # dicts
    self.seq_d = Dict(eos_token=utils.EOS, bos_token=utils.BOS,
                      force_unk=True, sequential=True)
    self.seq_d.fit(self.corpus)
    self.tag1_d = Dict(eos_token=utils.EOS, bos_token=utils.BOS,
                       force_unk=True, sequential=True)
    self.tag1_d.fit(self.tag1_corpus)
    self.tag2_d = Dict(eos_token=utils.EOS, bos_token=utils.BOS,
                       force_unk=True, sequential=True)
    self.tag2_d.fit(self.tag2_corpus)
    # props
    self.batch_size = 10
    self.bptt = 5
    # datasets
    self.simple_dataset = BlockDataset(
        self.corpus, self.seq_d, self.batch_size, self.bptt)
    words, tags1, tags2 = [], [], []
    for s in self.tagged_corpus:
        words.append([tup[0] for tup in s])
        tags1.append([tup[1] for tup in s])
        tags2.append([tup[2] for tup in s])
    self.multi_dataset = BlockDataset(
        (words, tags1, tags2), (self.seq_d, self.tag1_d, self.tag2_d),
        self.batch_size, self.bptt)
def load_twisty_dataset(src, trg, batch_size, max_size=100000, min_freq=5,
                        gpu=False, shuffle=True, **kwargs):
    """
    Wrapper function for twisty with sensible, overwritable defaults
    """
    tweets_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                       max_size=max_size, min_freq=min_freq)
    labels_dict = Dict(sequential=False, force_unk=False)
    tweets_dict.fit(src)
    labels_dict.fit(trg)
    d = {'src': tweets_dict, 'trg': labels_dict}
    splits = PairedDataset(src, trg, d, batch_size, gpu=gpu).splits(
        shuffle=shuffle, **kwargs)
    return splits
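# Hedged usage sketch for `load_twisty_dataset`: the toy tweets and labels
# below are illustrative, and the extra keyword arguments are simply forwarded
# to `PairedDataset.splits`, so the number and order of the returned splits
# depend on the `dev`/`test` proportions passed through (an assumption based
# on how `splits` is called elsewhere in this repo).
toy_tweets = [['just', 'landed', 'in', 'paris'], ['monday', 'again']] * 50
toy_labels = ['f', 'm'] * 50
splits = load_twisty_dataset(toy_tweets, toy_labels, batch_size=20,
                             max_size=50000, min_freq=1, dev=0.1, test=0.1)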
def setUp(self):
    self.corpus = [lorem.sentence().split() for _ in range(1000)]
    self.path = '/tmp/lorem.test.txt'
    with open(self.path, 'w') as f:
        for s in self.corpus:
            f.write(' '.join(s) + '\n')
    self.d = Dict(force_unk=True, sequential=True).fit(self.corpus)
def setUp(self):
    self.sents = []
    for _ in range(5000):
        sent = lorem.sentence().split()
        # avoid duplicates since `test_pairing` relies on sorting
        if sent not in self.sents:
            self.sents.append(sent)
    props = [0.1, 0.4, 0.3, 0.2]
    self.labels = np.random.multinomial(1, props, len(self.sents)).argmax(1)
    d = Dict(pad_token='<PAD>').fit(self.sents)
    ld = Dict(sequential=False).fit(self.labels)
    self.dataset = PairedDataset(
        self.sents, self.labels, {'src': d, 'trg': ld}, batch_size=10)
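# Hedged sketch (hypothetical extra test): the paired fixture above can be
# split the way the loaders in this repo do; the dev proportion is
# illustrative and only the `splits` call seen elsewhere here is assumed.
def test_splits_sketch(self):
    train, valid = self.dataset.splits(dev=0.1, test=None)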
def setUp(self):
    self.corpus = [lorem.sentence().split() for _ in range(100)]
    self.seq_vocab = Counter(w for s in self.corpus for w in s)
    self.seq_d = Dict(eos_token=utils.EOS, bos_token=utils.BOS,
                      force_unk=True, sequential=True)
    self.seq_d.fit(self.corpus)
    self.seq_transformed = list(self.seq_d.transform(self.corpus))
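# Hedged sketch of the fit/transform round trip exercised by the fixture
# above, on a toy corpus; it only assumes that `Dict.transform` yields one
# index sequence per input sentence, as the `list(...)` call above suggests.
toy = [['hello', 'world'], ['hello', 'again']]
toy_d = Dict(eos_token=utils.EOS, bos_token=utils.BOS,
             force_unk=True, sequential=True)
toy_d.fit(toy)
toy_transformed = list(toy_d.transform(toy))
assert len(toy_transformed) == len(toy)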
def setUp(self):
    self.corpus = [lorem.sentence().split() for _ in range(100)]
    self.tagged_corpus = [[fake_tags(w) for w in s] for s in self.corpus]
    self.tag1_corpus = [[tup[1] for tup in s] for s in self.tagged_corpus]
    self.tag2_corpus = [[tup[2] for tup in s] for s in self.tagged_corpus]
    self.seq_d = Dict(eos_token=utils.EOS, bos_token=utils.BOS,
                      force_unk=True, sequential=True)
    self.seq_d.fit(self.corpus)
    self.tag1_d = Dict(eos_token=utils.EOS, bos_token=utils.BOS,
                       force_unk=True, sequential=True)
    self.tag1_d.fit(self.tag1_corpus)
    self.tag2_d = Dict(eos_token=utils.EOS, bos_token=utils.BOS,
                       force_unk=True, sequential=True)
    self.tag2_d.fit(self.tag2_corpus)
def load_split_data(path, batch_size, max_size, min_freq, max_len, gpu,
                    processor):
    """
    Load a corpus that is already split into 'train.txt', 'valid.txt'
    and 'test.txt'
    """
    train_data = load_lines(os.path.join(path, 'train.txt'),
                            max_len=max_len, processor=processor)
    valid_data = load_lines(os.path.join(path, 'valid.txt'),
                            max_len=max_len, processor=processor)
    test_data = load_lines(os.path.join(path, 'test.txt'),
                           max_len=max_len, processor=processor)
    d = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
             max_size=max_size, min_freq=min_freq, force_unk=True)
    d.fit(train_data, valid_data)
    train = PairedDataset(train_data, None, {'src': d}, batch_size, gpu=gpu)
    valid = PairedDataset(valid_data, None, {'src': d}, batch_size, gpu=gpu,
                          evaluation=True)
    test = PairedDataset(test_data, None, {'src': d}, batch_size, gpu=gpu,
                         evaluation=True)
    return train.sort_(), valid.sort_(), test.sort_()
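# Hedged usage sketch for `load_split_data`: the corpus directory is
# hypothetical and the processor mirrors the `text_processor` call used in
# the preprocessing script below; all sizes are illustrative.
processor = text_processor(lower=True, num=False, level='char')
train, valid, test = load_split_data(
    '/path/to/corpus', batch_size=32, max_size=50000, min_freq=2,
    max_len=200, gpu=False, processor=processor)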
batch_size = args.batch_size
sample_fn = getattr(d, args.sample_fn)

if args.path is not None:
    # load a previously stored dataset from disk
    with open(args.path, 'rb+') as f:
        dataset = PairedDataset.from_disk(f)
    dataset.set_batch_size(args.batch_size)
    dataset.set_device(args.device)
    train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
    src_dict = dataset.dicts['src']
else:
    # otherwise, generate the dataset on the fly
    str_generator = d.generate_set(
        size, vocab, args.min_len, args.max_len, sample_fn)
    src, trg = zip(*str_generator)
    src, trg = list(map(list, src)), list(map(list, trg))
    src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
    src_dict.fit(src, trg)
    trg_dict = src_dict
    if args.reverse:
        trg_dict = copy.deepcopy(src_dict)
        trg_dict.align_right = True
    train, valid = PairedDataset(
        src, trg, {'src': src_dict, 'trg': trg_dict},
        batch_size=args.batch_size, device=args.device,
    ).splits(dev=args.dev, test=None, sort=True)
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', required=True)
    parser.add_argument('--output', help='prefix for the stored dataset',
                        required=True)
    parser.add_argument('--max_size', type=int, default=100000)
    parser.add_argument('--min_freq', default=1, type=int)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--num', action='store_true')
    parser.add_argument('--level', default='char')
    args = parser.parse_args()

    processor = text_processor(
        lower=args.lower, num=args.num, level=args.level)
    d = Dict(max_size=args.max_size, min_freq=args.min_freq,
             eos_token=u.EOS, force_unk=True)

    trainpath = os.path.join(args.path, 'train.txt')
    testpath = os.path.join(args.path, 'test.txt')

    outputformat = (args.output + ".{}.npz").format
    if os.path.isfile(outputformat("train")):
        raise ValueError("Output train file already exists")
    if os.path.isfile(outputformat("test")):
        raise ValueError("Output test file already exists")

    print("Fitting dictionary")
    d.fit(load_lines(trainpath, processor=processor),
          load_lines(testpath, processor=processor))
    u.save_model(d, args.output + '.dict')
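# Hedged sketch: the dictionary stored above can later be restored in
# downstream scripts. A `u.load_model` counterpart to the `u.save_model` call
# used here is assumed, and the prefix is illustrative.
d = u.load_model('dataset_prefix.dict')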
parser.add_argument('--patience', default=2, type=int)
parser.add_argument('--epochs', type=int, default=25)
parser.add_argument('--batch_size', type=int, default=50)
parser.add_argument('--device', default='cpu')
parser.add_argument('--checkpoint', type=int, default=100)
parser.add_argument('--hook', type=int, default=1)
parser.add_argument('--test', action='store_true')
args = parser.parse_args()

print("Loading data...")
train_conds, train = zip(
    *load_sents(args.basedir, 'train', max_lines=args.max_lines))
train_conds, train = list(train_conds), list(train)

d = Dict(eos_token=u.EOS, bos_token=u.BOS, unk_token=u.UNK, pad_token=u.PAD,
         max_size=args.max_size, force_unk=True).fit(train)
d2 = copy.deepcopy(d)
d2.align_right = args.reverse
conds_d = Dict(sequential=False).fit(train_conds)

conditional = args.cond_emb > 0

# AE+GRL+C
if args.grl and conditional:
    src, trg = (train, train_conds), (train, train_conds)
    dicts = {'src': (d, conds_d), 'trg': (d2, conds_d)}
# AE+GRL
elif args.grl:
    src, trg = (train, train_conds), train
        grl_loss = []
        for cond, grl in zip(conds, self.grls):
            cond_out = F.log_softmax(grad_reverse(grl(out)), 1)
            grl_loss.append(F.nll_loss(cond_out, cond, size_average=True))

        if not test:
            (sum(grl_loss) / len(self.grls)).backward(retain_graph=True)

        return [l.data[0] for l in grl_loss]

    GRLEncoder = type(
        'GRL{}'.format(EncoderBaseClass.__name__),
        (EncoderBaseClass,),
        {'__init__': __init__,
         'loss': loss,
         'conditional': property(lambda self: True)})

    return GRLEncoder


GRLRNNEncoder = GRLWrapper(RNNEncoder)
GRLCNNEncoder = GRLWrapper(CNNEncoder)


if __name__ == '__main__':
    import os
    from seqmod.misc import Dict
    from seqmod.modules.embedding import Embedding

    # smoke test: fit a Dict on this file's own tokens and build a wrapped
    # RNN encoder
    text = open(os.path.realpath(__file__)).read().split()
    emb = Embedding.from_dict(Dict().fit(text), 100)
    GRLRNNEncoder([10], [10], emb, 10, 1, 'LSTM', summary='mean')
parser.add_argument('--device', default='cpu')
parser.add_argument('--checkpoint', default=1000, type=int)
parser.add_argument('--hook', default=1, type=int)
parser.add_argument('--test', action='store_true', help="Don't save")
args = parser.parse_args()

print("Loading data...")
src, src_conds, trg, trg_conds = zip(
    *load_pairs(args.basedir, 'train', tt=args.tt, max_lines=args.max_lines))
src, src_conds = list(src), list(src_conds)
trg, trg_conds = list(trg), list(trg_conds)

d = Dict(eos_token=u.EOS, bos_token=u.BOS, unk_token=u.UNK, pad_token=u.PAD,
         max_size=args.max_size, force_unk=True).fit(src, trg)
d2 = copy.deepcopy(d)
d2.align_right = args.reverse
conds_d = Dict(sequential=False).fit(src_conds, trg_conds)

# S2S+GRL
if args.grl:
    if args.tt:
        raise ValueError("GRL+TT doesn't quite make sense")
    src, trg = (src, src_conds), trg
    dicts = {'src': (d, conds_d), 'trg': d2}
# S2S or TT
else: