def test_uneven_batch(self):
    """Streams of unequal length: once the shorter stream is exhausted,
    the next batch shrinks to the surviving stream only.

    The last element of each batch tuple appears to index which rows of
    the previous batch carry over — TODO confirm against BatchBuilder.
    """
    test_seqs = [
        "a b".split(),
        "b b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)
    # NOTE(review): removed unused `tokens = self.get_tokens(test_seqs)`;
    # the value was never read in this test.

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], len(tss))
    batches = iter(batches)

    # First batch: both streams contribute one (input, target) step.
    batch = next(batches)
    expectation = (
        torch.LongTensor([[0], [1]]),
        torch.LongTensor([[1], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1])) for words in test_seqs
        ]),
        torch.LongTensor([]),  # nothing carried over into the first batch
    )
    self.assertEqual(batch, expectation)

    # Second batch: only the longer stream remains; row 1 of the previous
    # batch maps onto row 0 here.
    batch = next(batches)
    expectation = (
        torch.LongTensor([[1]]),
        torch.LongTensor([[1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1])) for words in test_seqs[1:]
        ]),
        torch.LongTensor([1]),
    )
    self.assertEqual(batch, expectation)
def test_no_discard_even_lenght_small_batch(self):
    """batch_size=1 with discard_h=False: both streams are consumed one
    after the other, and the second batch reports index 0 in its
    carry-over tensor (presumably the reused hidden-state slot — TODO
    confirm against BatchBuilder).

    NOTE(review): 'lenght' typo is kept in the method name so existing
    by-name test selection keeps working.
    """
    test_seqs = [
        "b b".split(),
        "b c".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)
    # NOTE(review): removed unused `tokens = self.get_tokens(test_seqs)`;
    # the value was never read in this test.

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], 1,
                           discard_h=False)
    batches = iter(batches)

    # First batch: first stream only.
    batch = next(batches)
    expectation = (
        torch.LongTensor([[1]]),
        torch.LongTensor([[1]]),
        torch.stack([self.ivec_eetor(" ".join(test_seqs[0][:-1]))]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)

    # Second batch: second stream takes over slot 0.
    batch = next(batches)
    expectation = (
        torch.LongTensor([[1]]),
        torch.LongTensor([[2]]),
        torch.stack([self.ivec_eetor(" ".join(test_seqs[1][:-1]))]),
        torch.LongTensor([0]),
    )
    self.assertEqual(batch, expectation)
def test_even_batch_multi_sample_len(self):
    """Two 3-word streams at unroll=1 produce exactly two batches."""
    test_seqs = [
        "a b c".split(),
        "b b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)
    # NOTE(review): removed unused `tokens = self.get_tokens(test_seqs)`;
    # the value was never read in this test.

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], len(tss))
    batches = iter(batches)
    self.assertEqual(len(list(batches)), 2)
def test_reproducibility(self):
    """Iterating the same BatchBuilder twice yields identical epochs."""
    test_seqs = [
        "a b c".split(),
        "a b".split(),
        "b b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)
    # NOTE(review): removed unused `tokens = self.get_tokens(test_seqs)`;
    # the value was never read in this test.

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], 2)
    epoch1 = list(iter(batches))
    epoch2 = list(iter(batches))
    self.assertEqual(epoch1, epoch2)
def test_even_batch_single_sample_no_ivecs(self):
    """Without i-vector appenders the batch is a 3-tuple — inputs,
    targets, carry-over indices — with no stacked i-vector component.
    """
    test_seqs = [
        "a b".split(),
        "b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)
    # NOTE(review): removed unused `tokens = self.get_tokens(test_seqs)`;
    # the value was never read in this test.

    # Plain tokenized splits, no ivec_app_ctor wrapping.
    batches = BatchBuilder(tss, len(tss))
    batches = iter(batches)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[0], [1]]),
        torch.LongTensor([[1], [1]]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)
def test_even_lenght_small_batch_2(self):
    """Four 2-word streams with batch_size=2: the builder emits two
    full batches, pairing streams 0-1 and then 2-3, with empty
    carry-over tensors in both.

    NOTE(review): 'lenght' typo is kept in the method name so existing
    by-name test selection keeps working.
    """
    test_seqs = [
        "a b".split(),
        "b b".split(),
        "b c".split(),
        "c a".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)
    # NOTE(review): removed unused `tokens = self.get_tokens(test_seqs)`;
    # the value was never read in this test.

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], 2)
    batches = iter(batches)

    # First batch: streams 0 and 1.
    batch = next(batches)
    expectation = (
        torch.LongTensor([[0], [1]]),
        torch.LongTensor([[1], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1])) for words in test_seqs[0:2]
        ]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)

    # Second batch: streams 2 and 3.
    batch = next(batches)
    expectation = (
        torch.LongTensor([[1], [2]]),
        torch.LongTensor([[2], [0]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1])) for words in test_seqs[2:4]
        ]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)
def test_insufficient_stream_length(self):
    """A stream too short to yield a single step ("a") never enters a
    batch: both emitted batches are built only from seqs 0 and 2, and
    the second batch reports both rows ([0, 1]) as carried over.
    """
    test_seqs = [
        "a b c".split(),
        "a".split(),
        "b b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)
    # NOTE(review): removed unused `tokens = self.get_tokens(test_seqs)`;
    # the value was never read in this test.

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], 2)
    batches = iter(batches)

    # First batch: the two sufficiently long streams.
    batch = next(batches)
    expectation = (
        torch.LongTensor([[0], [1]]),
        torch.LongTensor([[1], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1]))
            for words in [test_seqs[0], test_seqs[2]]
        ]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)

    # Second batch: same two streams continue; both rows carry over.
    batch = next(batches)
    expectation = (
        torch.LongTensor([[1], [1]]),
        torch.LongTensor([[2], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1]))
            for words in [test_seqs[0], test_seqs[2]]
        ]),
        torch.LongTensor([0, 1]),
    )
    self.assertEqual(batch, expectation)
# Script body: evaluate a previously saved language model on a list of
# files, using domain-adaptation splits and no i-vectors.
init_seeds(args.seed, args.cuda)

print("loading LM...")
with open(args.load, 'rb') as f:
    lm = language_model.load(f)
if args.cuda:
    lm.model.cuda()
print(lm.model)

print("preparing data...")

def ivec_ts_from_file(f):
    # Build a multi-target domain-adaptation split for one file.
    # `end_portion` presumably sets what fraction of each document forms
    # the adaptation tail — TODO confirm against
    # DomainAdaptationSplitFFMultiTarget.
    da_ts = DomainAdaptationSplitFFMultiTarget(
        f, lm.vocab, lm.model.in_len, args.target_seq_len,
        end_portion=args.domain_portion,
    )
    return da_ts

tss = filelist_to_objects(args.file_list, ivec_ts_from_file)
# discard_h is the inverse of --concat-articles: when articles are
# concatenated, hidden state is kept across them.
data = BatchBuilder(tss, args.batch_size, discard_h=not args.concat_articles)
if args.cuda:
    data = CudaStream(data)

loss = evaluate_(
    lm.model, data,
    use_ivecs=False,
    custom_batches=True,
)
# Report both the average loss and its perplexity (exp of the loss).
print('loss {:5.2f} | ppl {:8.2f}'.format(loss, math.exp(loss)))
ts, ivec_extractor) print("\ttraining...") def ivec_ts_from_file(f): ts = TokenizedSplitFFBase( f, lm.vocab, lambda seq: TemporalSplits(seq, lm.model.in_len, args. target_seq_len)) return ivec_appenders.CheatingIvecAppender(ts, ivec_extractor) train_data_ivecs = filelist_to_objects(args.train_list, ivec_ts_from_file) print("\tvalidation...") valid_data_ivecs = filelist_to_objects(args.valid_list, ivec_ts_from_file) valid_data = BatchBuilder(valid_data_ivecs, args.batch_size, discard_h=not args.concat_articles) if args.cuda: valid_data = CudaStream(valid_data) print("training...") lr = args.lr best_val_loss = None for epoch in range(1, args.epochs + 1): random.shuffle(train_data_ivecs) train_data = BatchBuilder(train_data_ivecs, args.batch_size, discard_h=not args.concat_articles) if args.cuda: train_data = CudaStream(train_data)
print(ivec_extractor) translator = ivec_extractor.build_translator(lm.vocab) print("preparing data...") ivec_app_creator = lambda ts: ivec_appenders.HistoryIvecAppender( ts, ivec_extractor) print("\ttraining...") train_tss = filelist_to_tokenized_splits(args.train_list, lm.vocab, args.target_seq_len) print("\tvalidation...") valid_tss = filelist_to_tokenized_splits(args.valid_list, lm.vocab, args.target_seq_len) valid_data = BatchBuilder([ivec_app_creator(ts) for ts in valid_tss], args.batch_size, discard_h=not args.concat_articles) if args.cuda: valid_data = CudaStream(valid_data) print("training...") lr = args.lr best_val_loss = None for epoch in range(1, args.epochs + 1): random.shuffle(train_tss) train_data = BatchBuilder(train_tss, args.batch_size, discard_h=not args.concat_articles) if args.cuda: