def testLoad(self, cfg):
    """Smoke-test loading of vocabularies, training documents and the
    training DataLoader from the paths configured in ``self.cfg``.

    NOTE(review): the ``cfg`` argument is immediately shadowed by
    ``self.cfg``, so whatever the caller passes in is ignored — confirm
    this is intentional before relying on the parameter.
    """
    cfg = self.cfg

    # id-mapping dictionaries
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    # raw documents plus their per-document entity/word index tensors
    train_documents = load_documents(
        cfg['data_folder'] + cfg['train_documents'])
    train_document_entity_indices, train_document_texts = \
        index_document_entities(train_documents, word2id, entity2id,
                                cfg['max_document_word'])

    # building the DataLoader exercises the full preprocessing pipeline
    train_data = DataLoader(cfg['data_folder'] + cfg['train_data'],
                            train_documents,
                            train_document_entity_indices,
                            train_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])
def test(cfg):
    """Build the test-split DataLoader and a model, run inference on it,
    and return the resulting accuracy.

    Args:
        cfg: configuration dict supplying data paths and model settings.

    Returns:
        The accuracy reported by ``inference`` on the test data.
    """
    # vocabularies shared by documents and queries
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    # test documents and their entity/word index structures
    test_documents = load_documents(
        cfg['data_folder'] + cfg['test_documents'])
    test_document_entity_indices, test_document_texts = \
        index_document_entities(test_documents, word2id, entity2id,
                                cfg['max_document_word'])

    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents,
                           test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    my_model = get_model(cfg, test_data.num_kb_relation, len(entity2id),
                         len(word2id)).to(device)
    return inference(my_model, test_data, entity2id, cfg, log_info=True)
def train(cfg):
    """Train a KAReader model with per-epoch validation.

    Saves the best checkpoint (by validation hits) and a final checkpoint,
    logs metrics to TensorBoard, then evaluates the final model on the
    test split.

    Args:
        cfg: configuration dict (data paths, 'model_id', 'mode',
            'learning_rate', 'lr_schedule', 'num_epoch', 'gradient_clip',
            'eps', 'name', ...).
    """
    tf_logger = SummaryWriter('tf_logs/' + cfg['model_id'])

    # train and test share the same set of documents
    documents = load_documents(cfg['data_folder'] +
                               cfg['{}_documents'.format(cfg['mode'])])

    # train data
    train_data = DataLoader(cfg, documents)
    valid_data = DataLoader(cfg, documents, mode='dev')

    model = KAReader(cfg)
    model = model.to(torch.device('cuda'))

    # BUG FIX: the original built `trainable` with filter(), a one-shot
    # iterator.  torch.optim.Adam consumed it on construction, so the later
    # clip_grad_norm_(trainable, ...) call saw an empty iterable and
    # gradient clipping silently did nothing.  A list can be iterated
    # repeatedly, restoring the intended clipping behavior.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optim = torch.optim.Adam(trainable, lr=cfg['learning_rate'])

    if cfg['lr_schedule']:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optim, [30],
                                                         gamma=0.5)

    model.train()
    best_val_f1 = 0
    best_val_hits = 0
    for epoch in range(cfg['num_epoch']):
        batcher = train_data.batcher(shuffle=True)
        train_loss = []
        for feed in batcher:
            loss, pred, pred_dist = model(feed)
            train_loss.append(loss.item())
            optim.zero_grad()
            loss.backward()
            if cfg['gradient_clip'] != 0:
                torch.nn.utils.clip_grad_norm_(trainable,
                                               cfg['gradient_clip'])
            optim.step()
        tf_logger.add_scalar('avg_batch_loss', np.mean(train_loss), epoch)

        val_f1, val_hits = test(model, valid_data, cfg['eps'])
        if cfg['lr_schedule']:
            scheduler.step()
        tf_logger.add_scalar('eval_f1', val_f1, epoch)
        tf_logger.add_scalar('eval_hits', val_hits, epoch)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
        if val_hits > best_val_hits:
            best_val_hits = val_hits
            # checkpoint is keyed on best hits; best f1 is only tracked
            torch.save(model.state_dict(),
                       'model/{}/{}_best.pt'.format(cfg['name'],
                                                    cfg['model_id']))
        print('evaluation best f1:{} current:{}'.format(best_val_f1, val_f1))
        print('evaluation best hits:{} current:{}'.format(
            best_val_hits, val_hits))

    print('save final model')
    torch.save(model.state_dict(),
               'model/{}/{}_final.pt'.format(cfg['name'], cfg['model_id']))

    # NOTE: the final model (not the best checkpoint) is evaluated here
    print('\n..........Finished training, start testing.......')
    test_data = DataLoader(cfg, documents, mode='test')
    model.eval()
    print('finished training, testing final model...')
    test(model, test_data, cfg['eps'])
# NOTE(review): this physical line fuses two things that whitespace-mangling
# collapsed together: (1) the truncated tail of an evaluation function whose
# `def` header is not visible in this chunk — it prints/returns the means of
# per-sample f1 and hits and restores model.train(); and (2) the script's
# __main__ entry point, which seeds all RNGs from cfg['seed'] and dispatches
# to train(cfg) or to test-mode evaluation of the saved best checkpoint.
# Left byte-identical because the missing function header makes a safe
# reconstruction impossible from this view.
print('how many eval samples......', len(f1s)) print('avg_f1', np.mean(f1s)) print('avg_hits', np.mean(hits)) model.train() return np.mean(f1s), np.mean(hits) if __name__ == "__main__": # config_file = sys.argv[2] cfg = get_config() random.seed(cfg['seed']) np.random.seed(cfg['seed']) torch.manual_seed(cfg['seed']) torch.cuda.manual_seed_all(cfg['seed']) if cfg['mode'] == 'train': train(cfg) elif cfg['mode'] == 'test': documents = load_documents(cfg['data_folder'] + cfg['{}_documents'.format(cfg['mode'])]) test_data = DataLoader(cfg, documents, mode='test') model = KAReader(cfg) model = model.to(torch.device('cuda')) model_save_path = 'model/{}/{}_best.pt'.format(cfg['name'], cfg['model_id']) model.load_state_dict(torch.load(model_save_path)) model.eval() test(model, test_data, cfg['eps']) else: assert False, "--train or --test?"
def train(cfg):
    """Train a model with per-epoch validation and best-checkpoint saving,
    then evaluate the best checkpoint on the test split.

    Dev/test document sets are reused from train/dev when the configured
    paths are identical, to avoid loading the same files twice.

    Args:
        cfg: configuration dict (data paths, vocab files, model and
            training hyper-parameters).

    Returns:
        Test-set accuracy of the best saved model.
    """
    print("training ...")

    # prepare data: id-mapping vocabularies
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    train_documents = load_documents(
        cfg['data_folder'] + cfg['train_documents'])
    train_document_entity_indices, train_document_texts = \
        index_document_entities(train_documents, word2id, entity2id,
                                cfg['max_document_word'])
    train_data = DataLoader(cfg['data_folder'] + cfg['train_data'],
                            train_documents,
                            train_document_entity_indices,
                            train_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    # dev reuses the training documents when paths coincide
    if cfg['dev_documents'] != cfg['train_documents']:
        valid_documents = load_documents(
            cfg['data_folder'] + cfg['dev_documents'])
        valid_document_entity_indices, valid_document_texts = \
            index_document_entities(valid_documents, word2id, entity2id,
                                    cfg['max_document_word'])
    else:
        valid_documents = train_documents
        valid_document_entity_indices, valid_document_texts = \
            train_document_entity_indices, train_document_texts
    valid_data = DataLoader(cfg['data_folder'] + cfg['dev_data'],
                            valid_documents,
                            valid_document_entity_indices,
                            valid_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    # test in turn may reuse the dev documents
    if cfg['test_documents'] != cfg['dev_documents']:
        test_documents = load_documents(
            cfg['data_folder'] + cfg['test_documents'])
        test_document_entity_indices, test_document_texts = \
            index_document_entities(test_documents, word2id, entity2id,
                                    cfg['max_document_word'])
    else:
        test_documents = valid_documents
        test_document_entity_indices, test_document_texts = \
            valid_document_entity_indices, valid_document_texts
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents,
                           test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    # create model & set parameters
    my_model = get_model(cfg, train_data.num_kb_relation, len(entity2id),
                         len(word2id)).to(device)
    trainable_parameters = [
        p for p in my_model.parameters() if p.requires_grad
    ]
    optimizer = torch.optim.Adam(trainable_parameters,
                                 lr=cfg['learning_rate'])

    best_dev_acc = 0.0
    for epoch in range(cfg['num_epoch']):
        try:
            print('epoch', epoch)
            train_data.reset_batches(is_sequential=cfg['is_debug'])
            # Train
            my_model.train()
            train_loss, train_acc, train_max_acc = [], [], []
            for iteration in tqdm(
                    range(train_data.num_data // cfg['batch_size'])):
                batch = train_data.get_batch(iteration, cfg['batch_size'],
                                             cfg['fact_dropout'])
                loss, pred, _ = my_model(batch)
                pred = pred.data.cpu().numpy()
                acc, max_acc = cal_accuracy(pred, batch[-1])
                # FIX: loss.item() instead of float(loss.data); .data
                # bypasses autograd bookkeeping and is discouraged since
                # PyTorch 0.4.
                train_loss.append(loss.item())
                train_acc.append(acc)
                train_max_acc.append(max_acc)
                # back propagate
                my_model.zero_grad()
                optimizer.zero_grad()
                loss.backward()
                # FIX: clip_grad_norm was deprecated in PyTorch 0.4 and has
                # since been removed; clip_grad_norm_ is the supported
                # in-place API.
                torch.nn.utils.clip_grad_norm_(my_model.parameters(),
                                               cfg['gradient_clip'])
                optimizer.step()
            print('avg_training_loss', sum(train_loss) / len(train_loss))
            print('max_training_acc',
                  sum(train_max_acc) / len(train_max_acc))
            print('avg_training_acc', sum(train_acc) / len(train_acc))

            print("validating ...")
            eval_acc = inference(my_model, valid_data, entity2id, cfg)
            if eval_acc > best_dev_acc and cfg['to_save_model']:
                print("saving model to", cfg['save_model_file'])
                torch.save(my_model.state_dict(), cfg['save_model_file'])
                best_dev_acc = eval_acc
        except KeyboardInterrupt:
            break

    # Test set evaluation with the best saved checkpoint
    print("evaluating on test")
    print('loading model from ...', cfg['save_model_file'])
    my_model.load_state_dict(torch.load(cfg['save_model_file']))
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)
    return test_acc
def testLoadDoc(self):
    """Smoke-test document loading by printing the first three parsed
    training documents from the configured data folder."""
    doc_path = self.cfg["data_folder"] + self.cfg['train_documents']
    sample = load_documents(doc_path)[:3]
    print(sample)
def train(cfg):
    """Train a model with per-epoch validation and best-checkpoint saving,
    then evaluate the best checkpoint on the test split.

    Data notes (translated from the original Chinese comments):
        entity2id / word2id / relation2id are dicts mapping each entity,
        word and relation (e.g. predicates such as has_tags) to an id;
        these three files are small.  train_documents is a large list
        whose elements are dicts with 'document', 'title' and 'tokens'
        fields.  'document' contains 'text' and 'entities': the raw text
        plus a list of the entities it mentions with their kb_ids.
        'title' is structured like 'document' but for the title text, and
        'tokens' appears to hold the entity lists of document and title
        (the original note on index_document_entities was left
        unfinished — verify against its implementation).

    Args:
        cfg: configuration dict (data paths, vocab files, model and
            training hyper-parameters).

    Returns:
        Test-set accuracy of the best saved model.
    """
    print("training ...")

    # prepare data: files such as entities.txt, vocab.txt, relations.txt
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    # train_document.json
    train_documents = load_documents(
        cfg['data_folder'] + cfg['train_documents'])
    train_document_entity_indices, train_document_texts = \
        index_document_entities(train_documents, word2id, entity2id,
                                cfg['max_document_word'])
    train_data = DataLoader(cfg['data_folder'] + cfg['train_data'],
                            train_documents,
                            train_document_entity_indices,
                            train_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    # dev reuses the training documents when paths coincide
    if cfg['dev_documents'] != cfg['train_documents']:
        valid_documents = load_documents(
            cfg['data_folder'] + cfg['dev_documents'])
        valid_document_entity_indices, valid_document_texts = \
            index_document_entities(valid_documents, word2id, entity2id,
                                    cfg['max_document_word'])
    else:
        valid_documents = train_documents
        valid_document_entity_indices, valid_document_texts = \
            train_document_entity_indices, train_document_texts
    valid_data = DataLoader(cfg['data_folder'] + cfg['dev_data'],
                            valid_documents,
                            valid_document_entity_indices,
                            valid_document_texts, word2id, relation2id,
                            entity2id, cfg['max_query_word'],
                            cfg['max_document_word'], cfg['use_kb'],
                            cfg['use_doc'], cfg['use_inverse_relation'])

    # test in turn may reuse the dev documents
    if cfg['test_documents'] != cfg['dev_documents']:
        test_documents = load_documents(
            cfg['data_folder'] + cfg['test_documents'])
        test_document_entity_indices, test_document_texts = \
            index_document_entities(test_documents, word2id, entity2id,
                                    cfg['max_document_word'])
    else:
        test_documents = valid_documents
        test_document_entity_indices, test_document_texts = \
            valid_document_entity_indices, valid_document_texts
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents,
                           test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    # create model & set parameters (this variant passes mode="train" and,
    # unlike the sibling version, does not move the model to a device here)
    my_model = get_model(cfg, train_data.num_kb_relation, len(entity2id),
                         len(word2id), "train")
    trainable_parameters = [
        p for p in my_model.parameters() if p.requires_grad
    ]
    optimizer = torch.optim.Adam(trainable_parameters,
                                 lr=cfg['learning_rate'])

    best_dev_acc = 0.0
    for epoch in range(cfg['num_epoch']):
        try:
            print('epoch', epoch)
            train_data.reset_batches(is_sequential=cfg['is_debug'])
            # Train
            my_model.train()
            train_loss, train_acc, train_max_acc = [], [], []
            for iteration in tqdm(
                    range(train_data.num_data // cfg['batch_size'])):
                batch = train_data.get_batch(iteration, cfg['batch_size'],
                                             cfg['fact_dropout'])
                loss, pred, _ = my_model(batch)
                pred = pred.data.cpu().numpy()
                acc, max_acc = cal_accuracy(pred, batch[-1])
                # FIX: append a Python float, not the loss tensor; the
                # original loss.data kept tensors alive and made the
                # averaged loss print as a tensor.
                train_loss.append(loss.item())
                train_acc.append(acc)
                train_max_acc.append(max_acc)
                # back propagate
                my_model.zero_grad()
                optimizer.zero_grad()
                loss.backward()
                # FIX: clip_grad_norm was deprecated in PyTorch 0.4 and has
                # since been removed; clip_grad_norm_ is the supported
                # in-place API.
                torch.nn.utils.clip_grad_norm_(my_model.parameters(),
                                               cfg['gradient_clip'])
                optimizer.step()
            print('avg_training_loss', sum(train_loss) / len(train_loss))
            print('max_training_acc',
                  sum(train_max_acc) / len(train_max_acc))
            print('avg_training_acc', sum(train_acc) / len(train_acc))

            print("validating ...")
            eval_acc = inference(my_model, valid_data, entity2id, cfg)
            if eval_acc > best_dev_acc and cfg['to_save_model']:
                print("saving model to", cfg['save_model_file'])
                torch.save(my_model.state_dict(), cfg['save_model_file'])
                best_dev_acc = eval_acc
        except KeyboardInterrupt:
            break

    # Test set evaluation with the best saved checkpoint
    print("evaluating on test")
    print('loading model from ...', cfg['save_model_file'])
    my_model.load_state_dict(torch.load(cfg['save_model_file']))
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)
    return test_acc
# NOTE(review): this physical line fuses the file's imports, module-level
# setup (documents cache, Feature/Instance namedtuples), and the start of
# features(); the function body is cut off at the end of this chunk (the
# elif chain ends with no return statement visible), so it is left
# byte-identical rather than reformatted — a safe reconstruction is not
# possible from this view.
from collections import namedtuple import sys from sklearn import svm import numpy as np from nltk.corpus import wordnet as wn from util import load_documents from util import load_mention_pairs from kernels import load_labels documents = load_documents() Feature = namedtuple('Feature', ['word', 'pos', 'cpos', 'chunktag', 'hypernym', 'netypes']) Instance = namedtuple('Instance', ['tree', 'sent_index', 'filename']) def features(t, sent_index, filename): """ Get the features of a dep tree features include: word, POS, Collapsed_POS, ChunkTag, WordNet_Hypernym """ word = t.token.lower() parse_tree = documents[filename].parsed_sents[sent_index] pos = parse_tree.pos()[t.index] collapsed = pos[0] treeposition = parse_tree.treeposition_spanning_leaves(t.index, t.index+1)[:-2] chunktag = parse_tree[treeposition].label() if chunktag.startswith("N"): chunktag = 'NP' elif chunktag.startswith("V"): chunktag = "VP"