def test(args):
    """Run inference with every saved GTTP checkpoint on the dev/test splits.

    For each epoch checkpoint found under ``args.output_path + 'model/'``,
    loads the model on CPU, predicts on the dev and test datasets (when they
    exist), and writes results via ``save_result``.

    Args:
        args: Namespace with data_path, dataset, output_path, epoch,
            embedding_size, hidden_size, max_target_length, batch_size,
            local_rank and num_gpu attributes.
    """
    tokenizer, vocab2id, id2vocab = bert_tokenizer()
    detokenizer = bert_detokenizer()
    data_path = os.path.join(args.data_path, args.dataset + '/')

    # Bug fix: both datasets must exist as names before the `if dev_dataset:`
    # / `if test_dataset:` checks below, otherwise a split with <= 10 samples
    # left the variable unassigned and the loop raised NameError.
    dev_dataset = None
    test_dataset = None

    dev_samples = torch.load(os.path.join(data_path, args.dataset + '.dev.pkl'))
    # Splits with 10 or fewer samples are skipped (treated as absent).
    if len(dev_samples) > 10:
        dev_dataset = GTTPDataset(
            dev_samples, None, None, None, None, None, None, None,
            sample_tensor=torch.load(
                os.path.join(data_path, args.dataset + '.dev.GTTP.dataset.pkl')))

    test_samples = torch.load(os.path.join(data_path, args.dataset + '.test.pkl'))
    if len(test_samples) > 10:
        test_dataset = GTTPDataset(
            test_samples, None, None, None, None, None, None, None,
            sample_tensor=torch.load(
                os.path.join(data_path, args.dataset + '.test.GTTP.dataset.pkl')))

    for i in range(args.epoch):
        print('epoch', i)
        file = args.output_path + 'model/' + str(i) + '.pkl'
        if os.path.exists(file):
            model = GTTP(args.embedding_size, args.hidden_size, vocab2id, id2vocab,
                         max_dec_len=args.max_target_length, beam_width=1)
            # Checkpoints are loaded on CPU; the trainer handles device placement.
            model.load_state_dict(torch.load(file, map_location='cpu'))
            trainer = CumulativeTrainer(model, tokenizer, detokenizer,
                                        args.local_rank, args.num_gpu)
            if dev_dataset:
                predictions = trainer.predict('test', dev_dataset, collate_fn,
                                              args.batch_size)
                save_result(predictions, dev_dataset, model.to_sentence, detokenizer,
                            args.output_path, args.local_rank, i,
                            args.dataset + '_dev')
            if test_dataset:
                predictions = trainer.predict('test', test_dataset, collate_fn,
                                              args.batch_size)
                save_result(predictions, test_dataset, model.to_sentence, detokenizer,
                            args.output_path, args.local_rank, i,
                            args.dataset + '_test')
def test(args):
    """Evaluate every saved S2SA checkpoint on MARCO dev/test and CAST test.

    Loads each epoch checkpoint found under ``base_output_path + 'model/'``,
    runs prediction on the three pre-serialized datasets, and saves results
    via ``save_result``.
    """
    eval_batch_size = 16
    out_dir = base_output_path
    # NOTE(review): this prefix ends with the dataset name and no trailing
    # slash, so 'marco/...' is appended directly after it — confirm this
    # matches the on-disk layout.
    prefix = args.data_path + args.dataset + '/' + args.dataset
    tokenizer, vocab2id, id2vocab = bert_tokenizer()
    detokenizer = bert_detokenizer()

    def _load_split(sample_path, tensor_path):
        # Rebuild a dataset from pre-serialized samples + tensors; the
        # remaining constructor slots are unused at inference time.
        samples = torch.load(sample_path)
        return S2SADataset(samples, None, None, None, None, None, None, None,
                           None, sample_tensor=torch.load(tensor_path))

    splits = [
        ('marco_dev',
         _load_split(prefix + 'marco/marco.dev.pkl',
                     prefix + 'marco/marco.dev.S2SA.dataset.pkl')),
        ('marco_test',
         _load_split(prefix + 'marco/marco.test.pkl',
                     prefix + 'marco/marco.test.S2SA.dataset.pkl')),
        ('cast_test',
         _load_split(prefix + '.pkl',
                     prefix + '.S2SA.dataset.pkl')),
    ]

    for epoch_idx in range(epoch):
        print('epoch', epoch_idx)
        ckpt = out_dir + 'model/' + str(epoch_idx) + '.pkl'
        if not os.path.exists(ckpt):
            continue
        model = S2SA(embedding_size, hidden_size, vocab2id, id2vocab,
                     max_dec_len=max_target_length, beam_width=1)
        model.load_state_dict(torch.load(ckpt, map_location='cpu'))
        trainer = CumulativeTrainer(model, tokenizer, detokenizer,
                                    args.local_rank, 4)
        for tag, dataset in splits:
            predictions = trainer.predict('test', dataset, collate_fn,
                                          eval_batch_size)
            save_result(predictions, dataset, model.to_sentence, detokenizer,
                        out_dir, args.local_rank, epoch_idx, tag)
def test(args):
    """Run CTDS inference for each saved checkpoint and persist raw outputs.

    For every epoch in ``[args.infer_epoch_start, epoches)`` whose checkpoint
    exists, predicts on the dev and test datasets and torch-saves the raw
    prediction lists, one shard per process rank.
    """
    batch_size = args.test_batch_size
    dropout_ratio = args.profile_dropout_ratio
    neighbor_policy = args.neighbor_policy
    task_dir = '%s/%s-%s' % (src, task, neighbor_policy)

    # Suffix encoding which KB attributes were dropped when the sample
    # tensors were built (one '_<key>' fragment per dropped attribute).
    drop_attr = ''
    if args.keep_attributes is not None:
        drop_attr = ''.join('_%s' % k for k in _keys
                            if k not in args.keep_attributes)

    # Pre-serialized artifacts.  kb_vocab and train_samples are loaded but
    # never referenced below; kept to preserve the original behavior.
    _, _, _, kb_vocab = torch.load('%s/kbs.pkl' % task_dir)
    candidates = torch.load('%s/candidates.pkl' % task_dir)
    candidate_tensor = torch.load('%s/candidate.ctds.pkl' % task_dir)
    train_samples = torch.load('%s/train.pkl' % task_dir)
    dev_samples = torch.load('%s/dev.pkl' % task_dir)
    test_samples = torch.load('%s/test.pkl' % task_dir)
    meta_data = torch.load('%s/meta.pkl' % task_dir)
    vocab2id, id2vocab = torch.load('%s/vocab.pkl' % task_dir)
    tokenizer = babi_tokenizer
    print('Item size', len(vocab2id))

    train_sample_tensor = torch.load(
        '%s/train.ctds-%s%s.pkl' % (task_dir, dropout_ratio, drop_attr))
    dev_sample_tensor = torch.load(
        '%s/dev.ctds-%s%s.pkl' % (task_dir, dropout_ratio, drop_attr))
    test_sample_tensor = torch.load(
        '%s/test.ctds-%s%s.pkl' % (task_dir, dropout_ratio, drop_attr))

    # cut_data_index truncates both samples and tensors consistently.
    dev_dataset = CTDSDataset(
        dev_samples[:cut_data_index], candidates, meta_data, tokenizer,
        vocab2id, id2vocab,
        sample_tensor=dev_sample_tensor[:cut_data_index],
        train_sample_tensor=train_sample_tensor)
    test_dataset = CTDSDataset(
        test_samples[:cut_data_index], candidates, meta_data, tokenizer,
        vocab2id, id2vocab,
        sample_tensor=test_sample_tensor[:cut_data_index],
        train_sample_tensor=train_sample_tensor)

    for epoch_idx in range(args.infer_epoch_start, epoches):
        print('epoch', epoch_idx)
        ckpt = os.path.join(output_model_path, str(epoch_idx) + '.pkl')
        if not os.path.exists(ckpt):
            continue
        model = CTDS(hidden_size, vocab2id, id2vocab, candidate_tensor,
                     meta_data)
        model.load_state_dict(torch.load(ckpt, map_location='cpu'))
        model_trainer = CumulativeTrainer(
            model, tokenizer, None, args.local_rank, 4,
            accumulation_steps=accumulation_steps,
            max_grad_norm=args.max_grad_norm,
            save_data_attributes=save_data_attributes)
        # Each process rank writes its own shard of the raw outputs.
        for split, dataset in (('dev', dev_dataset), ('test', test_dataset)):
            outputs = model_trainer.predict('infer', dataset, collate_fn,
                                            batch_size)
            torch.save(
                outputs,
                os.path.join(output_result_path,
                             '%s.%s.%s' % (split, epoch_idx, args.local_rank)))