Example #1
import os

import torch

def test(args):
    # bert_tokenizer, bert_detokenizer, GTTPDataset, GTTP, CumulativeTrainer,
    # collate_fn, and save_result come from the surrounding repository.
    tokenizer, vocab2id, id2vocab = bert_tokenizer()
    detokenizer = bert_detokenizer()

    data_path = os.path.join(args.data_path, args.dataset + '/')

    # Initialize both datasets to None so the checks in the loop below never
    # hit an unbound name when a split is skipped for being too small.
    dev_dataset = None
    dev_samples = torch.load(os.path.join(data_path, args.dataset + '.dev.pkl'))
    if len(dev_samples) > 10:
        dev_dataset = GTTPDataset(
            dev_samples, None, None, None, None, None, None, None,
            sample_tensor=torch.load(os.path.join(
                data_path, args.dataset + '.dev.GTTP.dataset.pkl')))

    test_dataset = None
    test_samples = torch.load(os.path.join(data_path, args.dataset + '.test.pkl'))
    if len(test_samples) > 10:
        test_dataset = GTTPDataset(
            test_samples, None, None, None, None, None, None, None,
            sample_tensor=torch.load(os.path.join(
                data_path, args.dataset + '.test.GTTP.dataset.pkl')))

    # Evaluate every saved checkpoint: load the weights for epoch i, decode
    # the dev/test sets with beam width 1, and write out the predictions.
    for i in range(args.epoch):
        print('epoch', i)
        file = os.path.join(args.output_path, 'model', str(i) + '.pkl')

        if os.path.exists(file):
            model = GTTP(args.embedding_size, args.hidden_size, vocab2id,
                         id2vocab, max_dec_len=args.max_target_length,
                         beam_width=1)
            model.load_state_dict(torch.load(file, map_location='cpu'))
            trainer = CumulativeTrainer(model, tokenizer, detokenizer,
                                        args.local_rank, args.num_gpu)
            if dev_dataset is not None:
                predictions = trainer.predict('test', dev_dataset, collate_fn,
                                              args.batch_size)
                save_result(predictions, dev_dataset, model.to_sentence,
                            detokenizer, args.output_path, args.local_rank,
                            i, args.dataset + '_dev')
            if test_dataset is not None:
                predictions = trainer.predict('test', test_dataset, collate_fn,
                                              args.batch_size)
                save_result(predictions, test_dataset, model.to_sentence,
                            detokenizer, args.output_path, args.local_rank,
                            i, args.dataset + '_test')
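
The function reads every setting from `args`. A minimal driver sketch, assuming these flag names match the attributes used above; the default values are hypothetical, since the original repository defines its own argument parser:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='data/')      # hypothetical default
    parser.add_argument('--dataset', default='cast')         # hypothetical default
    parser.add_argument('--output_path', default='output/')  # hypothetical default
    parser.add_argument('--epoch', type=int, default=20)     # checkpoints to scan
    parser.add_argument('--embedding_size', type=int, default=256)
    parser.add_argument('--hidden_size', type=int, default=256)
    parser.add_argument('--max_target_length', type=int, default=64)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--local_rank', type=int, default=0)
    parser.add_argument('--num_gpu', type=int, default=1)
    test(parser.parse_args())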
Example #2
import os

import torch

def test(args):
    # base_output_path, epoch, embedding_size, hidden_size, and
    # max_target_length are module-level configuration in the original code;
    # S2SADataset, S2SA, CumulativeTrainer, collate_fn, save_result, and the
    # tokenizer helpers come from the surrounding repository.
    batch_size = 16

    output_path = base_output_path
    dataset = args.dataset
    # Plain string concatenation: data_path ends with the dataset name itself,
    # so the on-disk layout must match this pattern exactly.
    data_path = args.data_path + dataset + '/' + dataset

    tokenizer, vocab2id, id2vocab = bert_tokenizer()
    detokenizer = bert_detokenizer()

    # The eight positional None arguments are unused dataset-builder inputs;
    # each split is reconstructed directly from its pre-built sample tensor.
    marco_dev_samples = torch.load(data_path + 'marco/marco.dev.pkl')
    marco_dev_dataset = S2SADataset(
        marco_dev_samples, None, None, None, None, None, None, None, None,
        sample_tensor=torch.load(data_path + 'marco/marco.dev.S2SA.dataset.pkl'))

    marco_test_samples = torch.load(data_path + 'marco/marco.test.pkl')
    marco_test_dataset = S2SADataset(
        marco_test_samples, None, None, None, None, None, None, None, None,
        sample_tensor=torch.load(data_path + 'marco/marco.test.S2SA.dataset.pkl'))

    cast_test_samples = torch.load(data_path + '.pkl')
    cast_test_dataset = S2SADataset(
        cast_test_samples, None, None, None, None, None, None, None, None,
        sample_tensor=torch.load(data_path + '.S2SA.dataset.pkl'))

    # Evaluate every saved checkpoint on all three splits.
    for i in range(epoch):
        print('epoch', i)
        file = os.path.join(output_path, 'model', str(i) + '.pkl')

        if os.path.exists(file):
            model = S2SA(embedding_size,
                         hidden_size,
                         vocab2id,
                         id2vocab,
                         max_dec_len=max_target_length,
                         beam_width=1)
            model.load_state_dict(torch.load(file, map_location='cpu'))
            # The trailing 4 is the number of GPUs used for prediction.
            trainer = CumulativeTrainer(model, tokenizer, detokenizer,
                                        args.local_rank, 4)
            predictions = trainer.predict('test', marco_dev_dataset,
                                          collate_fn, batch_size)
            save_result(predictions, marco_dev_dataset, model.to_sentence,
                        detokenizer, output_path, args.local_rank, i,
                        'marco_dev')
            predictions = trainer.predict('test', marco_test_dataset,
                                          collate_fn, batch_size)
            save_result(predictions, marco_test_dataset, model.to_sentence,
                        detokenizer, output_path, args.local_rank, i,
                        'marco_test')
            predictions = trainer.predict('test', cast_test_dataset,
                                          collate_fn, batch_size)
            save_result(predictions, cast_test_dataset, model.to_sentence,
                        detokenizer, output_path, args.local_rank, i,
                        'cast_test')
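
The loop body above repeats the same predict-then-save pattern for three splits. A refactoring sketch only (`evaluate_splits` is a hypothetical helper, not part of the original code; `collate_fn` and `save_result` are the same module-level helpers used above):

def evaluate_splits(trainer, model, splits, detokenizer, output_path,
                    local_rank, epoch_idx, batch_size):
    # splits is a list of (dataset, tag) pairs.
    for dataset, tag in splits:
        predictions = trainer.predict('test', dataset, collate_fn, batch_size)
        save_result(predictions, dataset, model.to_sentence, detokenizer,
                    output_path, local_rank, epoch_idx, tag)

# Usage inside the checkpoint loop:
# evaluate_splits(trainer, model,
#                 [(marco_dev_dataset, 'marco_dev'),
#                  (marco_test_dataset, 'marco_test'),
#                  (cast_test_dataset, 'cast_test')],
#                 detokenizer, output_path, args.local_rank, i, batch_size)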
Example #3
import os

import torch

def test(args):
    # src, task, _keys, cut_data_index, epoches, hidden_size,
    # accumulation_steps, save_data_attributes, output_model_path, and
    # output_result_path are module-level configuration in the original code.
    batch_size = args.test_batch_size
    ratio = args.profile_dropout_ratio
    policy = args.neighbor_policy
    task_dir = '%s/%s-%s' % (src, task, policy)

    # Build a filename suffix naming every dropped profile attribute.
    drop_attr = ''
    if args.keep_attributes is not None:
        for k in _keys:
            if k not in args.keep_attributes:
                drop_attr += '_%s' % k

    # Load the preprocessed knowledge base, candidates, splits, and vocab.
    _, _, _, kb_vocab = torch.load('%s/kbs.pkl' % task_dir)
    candidates = torch.load('%s/candidates.pkl' % task_dir)
    candidate_tensor = torch.load('%s/candidate.ctds.pkl' % task_dir)
    train_samples = torch.load('%s/train.pkl' % task_dir)
    dev_samples = torch.load('%s/dev.pkl' % task_dir)
    test_samples = torch.load('%s/test.pkl' % task_dir)
    meta_data = torch.load('%s/meta.pkl' % task_dir)
    vocab2id, id2vocab = torch.load('%s/vocab.pkl' % task_dir)
    tokenizer = babi_tokenizer
    print('Vocab size', len(vocab2id))
    train_sample_tensor = torch.load('%s/train.ctds-%s%s.pkl' %
                                     (task_dir, ratio, drop_attr))
    dev_sample_tensor = torch.load('%s/dev.ctds-%s%s.pkl' %
                                   (task_dir, ratio, drop_attr))
    test_sample_tensor = torch.load('%s/test.ctds-%s%s.pkl' %
                                    (task_dir, ratio, drop_attr))
    dev_dataset = CTDSDataset(dev_samples[:cut_data_index],
                              candidates,
                              meta_data,
                              tokenizer,
                              vocab2id,
                              id2vocab,
                              sample_tensor=dev_sample_tensor[:cut_data_index],
                              train_sample_tensor=train_sample_tensor)
    test_dataset = CTDSDataset(
        test_samples[:cut_data_index],
        candidates,
        meta_data,
        tokenizer,
        vocab2id,
        id2vocab,
        sample_tensor=test_sample_tensor[:cut_data_index],
        train_sample_tensor=train_sample_tensor)

    # Evaluate the checkpoints saved from args.infer_epoch_start onward.
    for i in range(args.infer_epoch_start, epoches):
        print('epoch', i)
        file = os.path.join(output_model_path, str(i) + '.pkl')

        if os.path.exists(file):
            model = CTDS(hidden_size, vocab2id, id2vocab, candidate_tensor,
                         meta_data)
            model.load_state_dict(torch.load(file, map_location='cpu'))

            # The detokenizer slot is None (unused in this inference mode);
            # the 4 is the GPU count, matching the other examples.
            model_trainer = CumulativeTrainer(
                model,
                tokenizer,
                None,
                args.local_rank,
                4,
                accumulation_steps=accumulation_steps,
                max_grad_norm=args.max_grad_norm,
                save_data_attributes=save_data_attributes)

            # Dev inference.
            dev_list_output = model_trainer.predict('infer', dev_dataset,
                                                    collate_fn, batch_size)
            # Save the raw outputs; each GPU process writes a separate file,
            # suffixed with the epoch index and its local rank.
            torch.save(
                dev_list_output,
                os.path.join(output_result_path,
                             'dev.%s.%s' % (i, args.local_rank)))

            # Test inference; saved the same way as the dev outputs.
            test_list_output = model_trainer.predict('infer', test_dataset,
                                                     collate_fn, batch_size)
            torch.save(
                test_list_output,
                os.path.join(output_result_path,
                             'test.%s.%s' % (i, args.local_rank)))
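
Because every GPU process writes its own dev.<epoch>.<rank> and test.<epoch>.<rank> file, the shards must be combined before scoring. A minimal merge sketch, assuming the per-rank outputs are plain lists saved with torch.save as above (the original repository's merge step is not shown here, and merge_rank_outputs is a hypothetical name):

import glob
import os

import torch

def merge_rank_outputs(result_path, split, epoch):
    # Gather every per-rank shard for this split and epoch, e.g. 'dev.3.0',
    # 'dev.3.1', ..., and concatenate them. The lexicographic sort is fine
    # for single-digit rank counts.
    pattern = os.path.join(result_path, '%s.%s.*' % (split, epoch))
    merged = []
    for shard in sorted(glob.glob(pattern)):
        merged.extend(torch.load(shard, map_location='cpu'))
    return merged

# Usage: merged_dev = merge_rank_outputs(output_result_path, 'dev', 0)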