def test_uneven_batch(self):
    test_seqs = [
        "a b".split(),
        "b b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], len(tss))
    batches = iter(batches)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[0], [1]]),
        torch.LongTensor([[1], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1])) for words in test_seqs
        ]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[1]]),
        torch.LongTensor([[1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1])) for words in test_seqs[1:]
        ]),
        torch.LongTensor([1]),
    )
    self.assertEqual(batch, expectation)
def test_no_discard_uneven_length_small_batch(self):
    test_seqs = [
        "a b c".split(),
        "a b".split(),
        "b b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], 2,
                           discard_h=False)
    batches = iter(batches)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[0], [0]]),
        torch.LongTensor([[1], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1]))
            for words in [test_seqs[0], test_seqs[1]]
        ]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[1], [1]]),
        torch.LongTensor([[2], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1]))
            for words in [test_seqs[0], test_seqs[2]]
        ]),
        torch.LongTensor([0, 1]),
    )
    self.assertEqual(batch, expectation)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[1]]),
        torch.LongTensor([[1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1]))
            for words in [test_seqs[2]]
        ]),
        torch.LongTensor([1]),
    )
    self.assertEqual(batch, expectation)
def test_even_batch_multi_sample_len(self):
    test_seqs = [
        "a b c".split(),
        "b b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], len(tss))
    batches = iter(batches)

    self.assertEqual(len(list(batches)), 2)
def test_reproducibility(self):
    test_seqs = [
        "a b c".split(),
        "a b".split(),
        "b b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], 2)

    # two passes over the same BatchBuilder must yield identical batches
    epoch1 = list(iter(batches))
    epoch2 = list(iter(batches))
    self.assertEqual(epoch1, epoch2)
def test_even_batch_single_sample_no_ivecs(self):
    test_seqs = [
        "a b".split(),
        "b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)

    # plain splits (no i-vector appenders): batches carry no i-vector part
    batches = BatchBuilder(tss, len(tss))
    batches = iter(batches)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[0], [1]]),
        torch.LongTensor([[1], [1]]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)
def test_even_length_small_batch_2(self):
    test_seqs = [
        "a b".split(),
        "b b".split(),
        "b c".split(),
        "c a".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], 2)
    batches = iter(batches)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[0], [1]]),
        torch.LongTensor([[1], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1])) for words in test_seqs[0:2]
        ]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[1], [2]]),
        torch.LongTensor([[2], [0]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1])) for words in test_seqs[2:4]
        ]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)
def test_insufficient_stream_length(self):
    test_seqs = [
        "a b c".split(),
        "a".split(),
        "b b b".split(),
    ]
    tss = self.get_tokenized_splits(test_seqs, unroll=1)

    batches = BatchBuilder([self.ivec_app_ctor(ts) for ts in tss], 2)
    batches = iter(batches)

    # "a" is too short to form a single (input, target) pair at unroll=1,
    # so only the first and third sequences ever enter a batch
    batch = next(batches)
    expectation = (
        torch.LongTensor([[0], [1]]),
        torch.LongTensor([[1], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1]))
            for words in [test_seqs[0], test_seqs[2]]
        ]),
        torch.LongTensor([]),
    )
    self.assertEqual(batch, expectation)

    batch = next(batches)
    expectation = (
        torch.LongTensor([[1], [1]]),
        torch.LongTensor([[2], [1]]),
        torch.stack([
            self.ivec_eetor(" ".join(words[:-1]))
            for words in [test_seqs[0], test_seqs[2]]
        ]),
        torch.LongTensor([0, 1]),
    )
    self.assertEqual(batch, expectation)
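# Reading aid (not part of the original suite): the assertions above imply
# that each batch yielded by BatchBuilder is a tuple of
#     (inputs, targets, ivecs, carry_over)
# where `inputs` and `targets` are (batch x unroll) LongTensors of token ids,
# `ivecs` is a stacked tensor with one i-vector per stream (this component is
# absent entirely when the builder is fed plain splits instead of i-vector
# appenders, as in test_even_batch_single_sample_no_ivecs), and `carry_over`
# is a LongTensor describing which hidden states continue from the previous
# batch; it is empty when every stream starts fresh.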
parser.add_argument('--load', type=str, required=True,
                    help='where to load a model from')
args = parser.parse_args()
print(args)

init_seeds(args.seed, args.cuda)

print("loading model...")
lm = torch.load(args.load)
if args.cuda:
    lm.cuda()
print(lm.model)

print("preparing data...")

def temp_splits_from_fn(fn):
    tokens = tokens_from_file(fn, lm.vocab, randomize=False)
    return TemporalSplits(tokens, lm.model.in_len, args.target_seq_len)

tss = filelist_to_objects(args.file_list, temp_splits_from_fn)
data = BatchBuilder(tss, args.batch_size,
                    discard_h=not args.concat_articles)
if args.cuda:
    data = CudaStream(data)

loss = evaluate(lm, data, use_ivecs=False)
print('loss {:5.2f} | ppl {:8.2f}'.format(loss, math.exp(loss)))
print("loading model...") lm = torch.load(args.load) if args.cuda: lm.cuda() print(lm.model) print("preparing data...") def temp_splits_from_fn(fn): tokens = tokens_from_file(fn, lm.vocab, randomize=False) return TemporalSplits(tokens, lm.model.in_len, args.target_seq_len) print("\ttraining...") train_tss = filelist_to_objects(args.train_list, temp_splits_from_fn) train_data = BatchBuilder(train_tss, args.batch_size, discard_h=not args.concat_articles) if args.cuda: train_data = CudaStream(train_data) print("\tvalidation...") valid_tss = filelist_to_objects(args.valid_list, temp_splits_from_fn) valid_data = BatchBuilder(valid_tss, args.batch_size, discard_h=not args.concat_articles) if args.cuda: valid_data = CudaStream(valid_data) print("training...") lr = args.lr best_val_loss = None
print(ivec_extractor)
translator = ivec_extractor.build_translator(lm.vocab)

print("preparing data...")
ivec_app_creator = lambda ts: ivec_appenders.HistoryIvecAppender(ts, ivec_extractor)

print("\ttraining...")
train_tss = filelist_to_tokenized_splits(args.train_list, lm.vocab,
                                         args.target_seq_len)

print("\tvalidation...")
valid_tss = filelist_to_tokenized_splits(args.valid_list, lm.vocab,
                                         args.target_seq_len)
valid_data = BatchBuilder([ivec_app_creator(ts) for ts in valid_tss],
                          args.batch_size,
                          discard_h=not args.concat_articles)
if args.cuda:
    valid_data = CudaStream(valid_data)

print("training...")
lr = args.lr
best_val_loss = None

for epoch in range(1, args.epochs + 1):
    # reshuffle the training streams each epoch before rebuilding batches
    random.shuffle(train_tss)
    train_data = BatchBuilder(train_tss, args.batch_size,
                              discard_h=not args.concat_articles)
    if args.cuda:
        train_data = CudaStream(train_data)