示例#1
0
    def test_uneven_batch(self):
        """Check both batches built from two streams of unequal length.

        The batch size equals the number of tokenized splits. The first batch
        is compared against values derived from both sequences, the second
        against values derived from the longer sequence only.
        """
        word_seqs = [["a", "b"], ["b", "b", "b"]]

        def ivec_for(words):
            # The extractor is fed every word but the last, joined by spaces.
            return self.ivec_eetor(" ".join(words[:-1]))

        tss = self.get_tokenized_splits(word_seqs, unroll=1)
        # NOTE(review): this result is never read below; the call is kept
        # unchanged.
        tokens = self.get_tokens(word_seqs)

        appended = [self.ivec_app_ctor(split) for split in tss]
        batch_iter = iter(BatchBuilder(appended, len(tss)))

        first = next(batch_iter)
        expected_first = (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.stack([ivec_for(words) for words in word_seqs]),
            torch.LongTensor([]),
        )
        self.assertEqual(first, expected_first)

        second = next(batch_iter)
        expected_second = (
            torch.LongTensor([[1]]),
            torch.LongTensor([[1]]),
            torch.stack([ivec_for(words) for words in word_seqs[1:]]),
            torch.LongTensor([1]),
        )
        self.assertEqual(second, expected_second)
示例#2
0
    def test_no_discard_even_lenght_small_batch(self):
        """Check two consecutive batches with batch size 1 and ``discard_h=False``.

        Two two-word sequences are fed in; the first batch is compared against
        values derived from the first sequence and the second batch against
        values derived from the second sequence.
        """
        word_seqs = [["b", "b"], ["b", "c"]]

        def ivec_for(words):
            # The extractor is fed every word but the last, joined by spaces.
            return self.ivec_eetor(" ".join(words[:-1]))

        tss = self.get_tokenized_splits(word_seqs, unroll=1)
        # NOTE(review): this result is never read below; the call is kept
        # unchanged.
        tokens = self.get_tokens(word_seqs)

        appended = [self.ivec_app_ctor(split) for split in tss]
        batch_iter = iter(BatchBuilder(appended, 1, discard_h=False))

        first = next(batch_iter)
        expected_first = (
            torch.LongTensor([[1]]),
            torch.LongTensor([[1]]),
            torch.stack([ivec_for(word_seqs[0])]),
            torch.LongTensor([]),
        )
        self.assertEqual(first, expected_first)

        second = next(batch_iter)
        expected_second = (
            torch.LongTensor([[1]]),
            torch.LongTensor([[2]]),
            torch.stack([ivec_for(word_seqs[1])]),
            torch.LongTensor([0]),
        )
        self.assertEqual(second, expected_second)
示例#3
0
    def test_even_batch_multi_sample_len(self):
        """Two three-word streams batched together must yield exactly two batches.

        The batch size equals the number of tokenized splits.
        """
        word_seqs = [["a", "b", "c"], ["b", "b", "b"]]

        tss = self.get_tokenized_splits(word_seqs, unroll=1)
        # NOTE(review): this result is never read below; the call is kept
        # unchanged.
        tokens = self.get_tokens(word_seqs)

        appended = [self.ivec_app_ctor(split) for split in tss]
        batch_iter = iter(BatchBuilder(appended, len(tss)))

        # Exhaust the iterator and count how many batches it produced.
        n_batches = sum(1 for _ in batch_iter)
        self.assertEqual(n_batches, 2)
示例#4
0
    def test_reproducibility(self):
        """Iterating the same ``BatchBuilder`` twice must give equal batch lists."""
        word_seqs = [["a", "b", "c"], ["a", "b"], ["b", "b", "b"]]

        tss = self.get_tokenized_splits(word_seqs, unroll=1)
        # NOTE(review): this result is never read below; the call is kept
        # unchanged.
        tokens = self.get_tokens(word_seqs)

        appended = [self.ivec_app_ctor(split) for split in tss]
        builder = BatchBuilder(appended, 2)

        # Two independent passes over one builder object.
        first_pass = list(iter(builder))
        second_pass = list(iter(builder))

        self.assertEqual(first_pass, second_pass)
示例#5
0
    def test_even_batch_single_sample_no_ivecs(self):
        """Check the first batch when the splits are passed in unwrapped.

        The tokenized splits go to ``BatchBuilder`` directly (no
        ``ivec_app_ctor`` wrapping), and the expected batch is a three-element
        tuple.
        """
        word_seqs = [["a", "b"], ["b", "b"]]

        tss = self.get_tokenized_splits(word_seqs, unroll=1)
        # NOTE(review): this result is never read below; the call is kept
        # unchanged.
        tokens = self.get_tokens(word_seqs)

        batch_iter = iter(BatchBuilder(tss, len(tss)))

        first = next(batch_iter)
        expected_first = (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.LongTensor([]),
        )
        self.assertEqual(first, expected_first)
示例#6
0
    def test_even_lenght_small_batch_2(self):
        """Check two batches of size 2 built from four two-word sequences.

        The first batch is compared against values derived from sequences 0-1,
        the second against values derived from sequences 2-3.
        """
        word_seqs = [["a", "b"], ["b", "b"], ["b", "c"], ["c", "a"]]

        def ivec_for(words):
            # The extractor is fed every word but the last, joined by spaces.
            return self.ivec_eetor(" ".join(words[:-1]))

        tss = self.get_tokenized_splits(word_seqs, unroll=1)
        # NOTE(review): this result is never read below; the call is kept
        # unchanged.
        tokens = self.get_tokens(word_seqs)

        appended = [self.ivec_app_ctor(split) for split in tss]
        batch_iter = iter(BatchBuilder(appended, 2))

        first = next(batch_iter)
        expected_first = (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.stack([ivec_for(words) for words in word_seqs[0:2]]),
            torch.LongTensor([]),
        )
        self.assertEqual(first, expected_first)

        second = next(batch_iter)
        expected_second = (
            torch.LongTensor([[1], [2]]),
            torch.LongTensor([[2], [0]]),
            torch.stack([ivec_for(words) for words in word_seqs[2:4]]),
            torch.LongTensor([]),
        )
        self.assertEqual(second, expected_second)
示例#7
0
    def test_insufficient_stream_length(self):
        """Check batching when one of three sequences has a single word.

        With batch size 2, both expected batches are derived from the first
        and third sequences only; the one-word sequence does not appear in
        either expectation.
        """
        word_seqs = [["a", "b", "c"], ["a"], ["b", "b", "b"]]
        # The sequences both expectations are built from.
        kept_seqs = [word_seqs[0], word_seqs[2]]

        def ivec_for(words):
            # The extractor is fed every word but the last, joined by spaces.
            return self.ivec_eetor(" ".join(words[:-1]))

        tss = self.get_tokenized_splits(word_seqs, unroll=1)
        # NOTE(review): this result is never read below; the call is kept
        # unchanged.
        tokens = self.get_tokens(word_seqs)

        appended = [self.ivec_app_ctor(split) for split in tss]
        batch_iter = iter(BatchBuilder(appended, 2))

        first = next(batch_iter)
        expected_first = (
            torch.LongTensor([[0], [1]]),
            torch.LongTensor([[1], [1]]),
            torch.stack([ivec_for(words) for words in kept_seqs]),
            torch.LongTensor([]),
        )
        self.assertEqual(first, expected_first)

        second = next(batch_iter)
        expected_second = (
            torch.LongTensor([[1], [1]]),
            torch.LongTensor([[2], [1]]),
            torch.stack([ivec_for(words) for words in kept_seqs]),
            torch.LongTensor([0, 1]),
        )
        self.assertEqual(second, expected_second)
示例#8
0
    # Initialise seeds from the command-line seed; args.cuda is forwarded too.
    init_seeds(args.seed, args.cuda)

    print("loading LM...")
    # The model file is opened in binary mode and handed to language_model.load.
    with open(args.load, 'rb') as f:
        lm = language_model.load(f)
    # Call .cuda() on the model only when CUDA was requested.
    if args.cuda:
        lm.model.cuda()
    print(lm.model)

    print("preparing data...")

    def ivec_ts_from_file(f):
        """Build a ``DomainAdaptationSplitFFMultiTarget`` for ``f``.

        The vocabulary and input length come from the enclosing ``lm``; the
        target sequence length and ``end_portion`` come from ``args``.
        NOTE(review): ``f`` is presumably a file object supplied by
        ``filelist_to_objects`` -- confirm against that helper.
        """
        return DomainAdaptationSplitFFMultiTarget(
            f,
            lm.vocab,
            lm.model.in_len,
            args.target_seq_len,
            end_portion=args.domain_portion,
        )

    # NOTE(review): presumably applies ivec_ts_from_file to every file named in
    # args.file_list -- confirm against filelist_to_objects.
    tss = filelist_to_objects(args.file_list, ivec_ts_from_file)
    # discard_h is the negation of the --concat_articles setting.
    data = BatchBuilder(tss, args.batch_size,
                        discard_h=not args.concat_articles)
    # Wrap the batch source in CudaStream only when CUDA was requested.
    if args.cuda:
        data = CudaStream(data)

    # Evaluate the loaded model on the prepared batches.
    loss = evaluate_(
        lm.model, data,
        use_ivecs=False,
        custom_batches=True,
    )
    # The value printed as "ppl" is exp(loss).
    print('loss {:5.2f} | ppl {:8.2f}'.format(loss, math.exp(loss)))
示例#9
0
        ts, ivec_extractor)

    print("\ttraining...")

    def ivec_ts_from_file(f):
        """Build a ``CheatingIvecAppender`` around a tokenized split of ``f``.

        The split is a ``TokenizedSplitFFBase`` over ``f`` and ``lm.vocab``
        whose third argument is a callable creating ``TemporalSplits`` with
        the model's input length and ``args.target_seq_len``. The appender is
        given the enclosing ``ivec_extractor``.
        """
        def temporal_splits(seq):
            return TemporalSplits(seq, lm.model.in_len, args.target_seq_len)

        return ivec_appenders.CheatingIvecAppender(
            TokenizedSplitFFBase(f, lm.vocab, temporal_splits),
            ivec_extractor,
        )

    # NOTE(review): presumably one ivec_ts_from_file result per file named in
    # args.train_list -- confirm against filelist_to_objects.
    train_data_ivecs = filelist_to_objects(args.train_list, ivec_ts_from_file)

    print("\tvalidation...")
    # Validation objects are built with the same factory as the training ones.
    valid_data_ivecs = filelist_to_objects(args.valid_list, ivec_ts_from_file)
    # discard_h is the negation of the --concat_articles setting.
    valid_data = BatchBuilder(valid_data_ivecs,
                              args.batch_size,
                              discard_h=not args.concat_articles)
    # Wrap the batch source in CudaStream only when CUDA was requested.
    if args.cuda:
        valid_data = CudaStream(valid_data)

    print("training...")
    # Starting learning rate, and a placeholder for the best validation loss.
    lr = args.lr
    best_val_loss = None

    for epoch in range(1, args.epochs + 1):
        random.shuffle(train_data_ivecs)
        train_data = BatchBuilder(train_data_ivecs,
                                  args.batch_size,
                                  discard_h=not args.concat_articles)
        if args.cuda:
            train_data = CudaStream(train_data)
示例#10
0
    print(ivec_extractor)
    # Translator built by the extractor from the LM vocabulary.
    translator = ivec_extractor.build_translator(lm.vocab)

    print("preparing data...")

    def ivec_app_creator(ts):
        """Wrap ``ts`` in a ``HistoryIvecAppender`` using ``ivec_extractor``."""
        # A named function replaces the former lambda assignment (PEP 8);
        # it is called exactly the same way.
        return ivec_appenders.HistoryIvecAppender(ts, ivec_extractor)

    print("\ttraining...")
    train_tss = filelist_to_tokenized_splits(args.train_list, lm.vocab,
                                             args.target_seq_len)

    print("\tvalidation...")
    valid_tss = filelist_to_tokenized_splits(args.valid_list, lm.vocab,
                                             args.target_seq_len)
    # Every validation split is wrapped by ivec_app_creator before batching;
    # discard_h is the negation of the --concat_articles setting.
    valid_data = BatchBuilder([ivec_app_creator(ts) for ts in valid_tss],
                              args.batch_size,
                              discard_h=not args.concat_articles)

    # Wrap the batch source in CudaStream only when CUDA was requested.
    if args.cuda:
        valid_data = CudaStream(valid_data)

    print("training...")
    # Starting learning rate, and a placeholder for the best validation loss.
    lr = args.lr
    best_val_loss = None

    for epoch in range(1, args.epochs + 1):
        random.shuffle(train_tss)
        train_data = BatchBuilder(train_tss,
                                  args.batch_size,
                                  discard_h=not args.concat_articles)
        if args.cuda: