import torch
from tqdm import tqdm

# load_dict, load_documents, index_document_entities, DataLoader, get_model,
# inference and cal_accuracy are project-local helpers; import them from the
# repo's own modules (the exact module paths depend on the project layout), e.g.:
# from data_loader import DataLoader
# from util import load_dict, load_documents, index_document_entities, cal_accuracy
# from model import get_model, inference

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def testLoad(cfg):
    """Smoke-test the data-loading pipeline on the training split."""
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])
    train_documents = load_documents(cfg['data_folder'] + cfg['train_documents'])
    train_document_entity_indices, train_document_texts = index_document_entities(
        train_documents, word2id, entity2id, cfg['max_document_word'])
    train_data = DataLoader(cfg['data_folder'] + cfg['train_data'], train_documents,
                            train_document_entity_indices, train_document_texts,
                            word2id, relation2id, entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'],
                            cfg['use_inverse_relation'])
    return train_data
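
# For reference, each record returned by load_documents is shaped roughly as
# sketched below (reconstructed from the inline notes in this file; the field
# values are illustrative placeholders, not real data):
_EXAMPLE_DOCUMENT = {
    'document': {
        'text': 'barack obama was born in hawaii',   # the passage text
        'entities': [                                # entities in the text + KB ids
            {'text': 'barack obama', 'kb_id': 'm.02mjmr'},
            {'text': 'hawaii', 'kb_id': 'm.03gh4'},
        ],
    },
    'title': {                                       # same structure for the title
        'text': 'barack obama',
        'entities': [{'text': 'barack obama', 'kb_id': 'm.02mjmr'}],
    },
    'tokens': ['barack', 'obama', 'was', 'born', 'in', 'hawaii'],
}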
def test(cfg):
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])
    test_documents = load_documents(cfg['data_folder'] + cfg['test_documents'])
    test_document_entity_indices, test_document_texts = index_document_entities(
        test_documents, word2id, entity2id, cfg['max_document_word'])
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'], test_documents,
                           test_document_entity_indices, test_document_texts,
                           word2id, relation2id, entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'],
                           cfg['use_inverse_relation'])
    # Note: no state_dict is loaded here, so restoring pretrained weights is
    # presumably handled inside get_model (e.g. via a checkpoint path in cfg);
    # otherwise this evaluates an untrained model.
    my_model = get_model(cfg, test_data.num_kb_relation, len(entity2id),
                         len(word2id)).to(device)
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)
    return test_acc
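
# The cfg dict consumed throughout this file must provide at least the keys
# below (collected from usage in this file; the values are illustrative
# placeholders only, not the project's actual defaults):
_EXAMPLE_CFG = {
    'data_folder': 'data/',
    'entity2id': 'entities.txt',
    'word2id': 'vocab.txt',
    'relation2id': 'relations.txt',
    'train_documents': 'train_documents.json',
    'dev_documents': 'train_documents.json',
    'test_documents': 'train_documents.json',
    'train_data': 'train.json',
    'dev_data': 'dev.json',
    'test_data': 'test.json',
    'max_document_word': 100,
    'max_query_word': 10,
    'use_kb': True,
    'use_doc': True,
    'use_inverse_relation': False,
    'learning_rate': 1e-3,
    'num_epoch': 100,
    'batch_size': 32,
    'fact_dropout': 0.0,
    'gradient_clip': 1.0,
    'is_debug': False,
    'to_save_model': True,
    'save_model_file': 'model/best_model.pt',
}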
def train(cfg):
    print("training ...")

    # prepare data
    #
    # entity2id, word2id and relation2id are small dicts built from files such
    # as entities.txt, vocab.txt and relations.txt: each maps an entity, a
    # vocabulary word, or a relation (predicates such as has_tags) to an id.
    # train_documents is much larger: a list of dicts, each with 'document',
    # 'title' and 'tokens' fields. 'document' holds 'text' (the passage) and
    # 'entities' (the entities in the text together with their kb_id); 'title'
    # has the same structure for the title text, and 'tokens' presumably holds
    # the token lists for the document and title.
    # index_document_entities appears to map each document to the positions of
    # its entities and to its padded word-id text (up to max_document_word).
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    train_documents = load_documents(cfg['data_folder'] + cfg['train_documents'])
    train_document_entity_indices, train_document_texts = index_document_entities(
        train_documents, word2id, entity2id, cfg['max_document_word'])
    train_data = DataLoader(cfg['data_folder'] + cfg['train_data'], train_documents,
                            train_document_entity_indices, train_document_texts,
                            word2id, relation2id, entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'],
                            cfg['use_inverse_relation'])

    # Reuse already-loaded documents when dev/test point at the same file.
    if cfg['dev_documents'] != cfg['train_documents']:
        valid_documents = load_documents(cfg['data_folder'] + cfg['dev_documents'])
        valid_document_entity_indices, valid_document_texts = index_document_entities(
            valid_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        valid_documents = train_documents
        valid_document_entity_indices, valid_document_texts = (
            train_document_entity_indices, train_document_texts)
    valid_data = DataLoader(cfg['data_folder'] + cfg['dev_data'], valid_documents,
                            valid_document_entity_indices, valid_document_texts,
                            word2id, relation2id, entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'],
                            cfg['use_inverse_relation'])

    if cfg['test_documents'] != cfg['dev_documents']:
        test_documents = load_documents(cfg['data_folder'] + cfg['test_documents'])
        test_document_entity_indices, test_document_texts = index_document_entities(
            test_documents, word2id, entity2id, cfg['max_document_word'])
    else:
        test_documents = valid_documents
        test_document_entity_indices, test_document_texts = (
            valid_document_entity_indices, valid_document_texts)
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'], test_documents,
                           test_document_entity_indices, test_document_texts,
                           word2id, relation2id, entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'],
                           cfg['use_inverse_relation'])

    # create model & set parameters
    my_model = get_model(cfg, train_data.num_kb_relation, len(entity2id),
                         len(word2id)).to(device)
    trainable_parameters = [p for p in my_model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable_parameters, lr=cfg['learning_rate'])

    best_dev_acc = 0.0
    for epoch in range(cfg['num_epoch']):
        try:
            print('epoch', epoch)
            train_data.reset_batches(is_sequential=cfg['is_debug'])

            # Train
            my_model.train()
            train_loss, train_acc, train_max_acc = [], [], []
            for iteration in tqdm(range(train_data.num_data // cfg['batch_size'])):
                batch = train_data.get_batch(iteration, cfg['batch_size'],
                                             cfg['fact_dropout'])
                loss, pred, _ = my_model(batch)
                pred = pred.data.cpu().numpy()
                acc, max_acc = cal_accuracy(pred, batch[-1])
                train_loss.append(loss.item())
                train_acc.append(acc)
                train_max_acc.append(max_acc)

                # back-propagate
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(my_model.parameters(),
                                               cfg['gradient_clip'])
                optimizer.step()

            print('avg_training_loss', sum(train_loss) / len(train_loss))
            print('max_training_acc', sum(train_max_acc) / len(train_max_acc))
            print('avg_training_acc', sum(train_acc) / len(train_acc))

            print("validating ...")
            eval_acc = inference(my_model, valid_data, entity2id, cfg)
            if eval_acc > best_dev_acc and cfg['to_save_model']:
                print("saving model to", cfg['save_model_file'])
                torch.save(my_model.state_dict(), cfg['save_model_file'])
                best_dev_acc = eval_acc
        except KeyboardInterrupt:
            break

    # Test set evaluation with the best saved checkpoint
    print("evaluating on test")
    print('loading model from ...', cfg['save_model_file'])
    my_model.load_state_dict(torch.load(cfg['save_model_file']))
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)
    return test_acc
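
# Minimal entry point, assuming the config is stored as JSON on disk; the
# original project may well ship its own config loader and CLI, so treat this
# as a sketch rather than the project's actual invocation.
if __name__ == '__main__':
    import json
    import sys

    with open(sys.argv[1]) as f:
        config = json.load(f)
    # e.g. `python train.py config.json` to train, or append `test` to evaluate
    if len(sys.argv) > 2 and sys.argv[2] == 'test':
        test(config)
    else:
        train(config)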