# Script fragment: build the list of (name, dataset) dev/eval pairs, then
# dispatch on the CLI mode. Relies on module-level names defined elsewhere
# in the file: args, conll, ranker, D, utils, pprint — confirm against the
# full script.
dev_datasets = [('aida-A', conll.testA), ('aida-B', conll.testB),
                ('msnbc', conll.msnbc), ('aquaint', conll.aquaint),
                ('ace2004', conll.ace2004), ('clueweb', conll.clueweb),
                ('wikipedia', conll.wikipedia)]

if args.mode == 'train':
    print('training...')
    config = {'lr': args.learning_rate, 'n_epochs': args.n_epochs}
    pprint(config)
    ranker.train(conll.train, dev_datasets, config)

elif args.mode == 'eval':
    # Keep the raw datasets around: D.eval compares predictions against the
    # original (un-preranked) documents.
    org_dev_datasets = dev_datasets  # + [('aida-train', conll.train)]
    dev_datasets = []
    for dname, data in org_dev_datasets:
        dev_datasets.append(
            (dname, ranker.get_data_items(data, predict=True)))
        print(dname, '#dev docs', len(dev_datasets[-1][1]))

    # NOTE(review): vecs is computed but not used in this fragment —
    # presumably consumed by code outside the visible region; confirm.
    vecs = ranker.model.rel_embs.cpu().data.numpy()

    for di, (dname, data) in enumerate(dev_datasets):
        # Reset cached coherence context vectors between datasets.
        ranker.model._coh_ctx_vecs = []
        # predict each dataset one by one
        predictions = ranker.predict(data)
        print(
            dname,
            utils.tokgreen(
                'micro F1: ' +
                str(D.eval(org_dev_datasets[di][1], predictions))))
def train(self,
          org_train_dataset,
          org_dev_datasets,
          config,
          preranked_train=None,
          preranked_dev=None):
    """Train the ranker, evaluating on the dev sets every few updates.

    Args:
        org_train_dataset: raw training documents; converted with
            ``get_data_items`` unless ``preranked_train`` is supplied.
        org_dev_datasets: list of ``(name, raw_data)`` dev sets. The set
            named ``'aida-A'`` drives early stopping and model saving.
        config: dict with at least ``'lr'``, ``'n_epochs'`` and the boolean
            flags ``'multi_instance'`` and ``'semisup'`` selecting the
            training regime (thresholds / stopping schedule differ).
        preranked_train: optional pre-processed training items.
        preranked_dev: optional pre-processed dev items.

    Side effects: updates ``self.model`` in place, saves the best model to
    ``self.args.model_path``, and prints progress / evaluation output.
    """
    print('extracting training data')
    if preranked_train is None:
        train_dataset = self.get_data_items(org_train_dataset, predict=False)
    else:
        train_dataset = preranked_train
    print('#train docs', len(train_dataset))

    if preranked_dev is None:
        dev_datasets = []
        for dname, data in org_dev_datasets:
            dev_datasets.append(
                (dname, self.get_data_items(data, predict=True)))
            print(dname, '#dev docs', len(dev_datasets[-1][1]))
    else:
        dev_datasets = preranked_dev

    print('creating optimizer')
    optimizer = optim.Adam(
        [p for p in self.model.parameters() if p.requires_grad],
        lr=config['lr'])

    best_f1 = -1
    not_better_count = 0
    is_counting = False         # becomes True once dev F1 clears f1_start_counting
    stop = False
    final_result_str = ''

    print('total training items', len(train_dataset))

    n_updates = 0
    # Per-regime schedule: how often to evaluate, when to give up on a bad
    # initialization, and the F1 levels that gate early stopping.
    if config['multi_instance']:
        n_updates_to_eval = 1000
        n_updates_to_stop = 60000
        f1_threshold = 0.875
        f1_start_counting = 0.87
    elif config['semisup']:
        n_updates_to_eval = 5000
        # FIX: was a typo `n_update_to_stop`, which left n_updates_to_stop
        # undefined in semisup mode and raised NameError at the first eval.
        n_updates_to_stop = 1e10
        f1_threshold = 0.86
        f1_start_counting = 0.86
    else:  # for supervised learning
        n_updates_to_eval = 1000
        n_updates_to_stop = 1000 * self.args.n_epochs
        f1_threshold = 0.95
        f1_start_counting = 0.95

    for e in range(config['n_epochs']):
        shuffle(train_dataset)
        total_loss = 0
        for dc, batch in enumerate(
                train_dataset):  # each document is a minibatch
            self.model.train()
            optimizer.zero_grad()

            # A mention with true_pos >= 0 has its gold entity among the
            # selected candidates; if none does, fall back to the
            # multi-instance (weakly supervised) path.
            tps = [m['selected_cands']['true_pos'] >= 0 for m in batch]
            any_true = np.any(tps)
            if any_true:
                inputs = self.minibatch2input(batch)
            else:
                inputs = self.minibatch2input(batch, topk=2)

            if config['semisup']:
                if any_true:
                    # from supervision (i.e. CoNLL)
                    scores = self.model.forward(
                        inputs,
                        gold=inputs['true_pos'].view(-1, 1),
                        inference='LBP')
                else:
                    scores = self.model.forward(
                        inputs,
                        gold=inputs['true_pos'].view(-1, 1),
                        inference='star')
            else:
                scores = self.model.forward(
                    inputs, gold=inputs['true_pos'].view(-1, 1))

            if any_true:
                loss = self.model.loss(scores, inputs['true_pos'])
            else:
                loss = self.model.multi_instance_loss(scores, inputs)

            loss.backward()
            optimizer.step()

            loss = loss.cpu().data.item()
            total_loss += loss
            if dc % 100 == 0:
                print('epoch', e,
                      "%0.2f%%" % (dc / len(train_dataset) * 100),
                      loss, end='\r')

            n_updates += 1
            if n_updates % n_updates_to_eval == 0:
                # only continue if the best f1 is larger than f1_threshold
                if n_updates >= n_updates_to_stop and best_f1 < f1_threshold:
                    stop = True
                    print(
                        'this initialization is not good. Run another one... STOP'
                    )
                    break

                print('\n--------------------')
                dev_f1 = 0
                results = ''
                for di, (dname, data) in enumerate(dev_datasets):
                    if dname != '':
                        # NOTE(review): n_best is not defined in this method
                        # — presumably a module-level global; confirm.
                        predictions = self.predict(data, n_best=n_best)

                        cats = None
                        # **YD** only ignore .conll for reddit data
                        if 'reddit' not in dname and 'cat' in data[0][0][
                                'raw']['conll_m']:
                            cats = []
                            for doc in data:
                                cats += [
                                    m['raw']['conll_m']['cat'] for m in doc
                                ]

                        # **YD** D.eval returns (f1, formatted prec/rec/f1 string)
                        if cats is None:
                            f1, out_s = D.eval(org_dev_datasets[di][1],
                                               predictions)
                        else:
                            f1, out_s = D.eval(org_dev_datasets[di][1],
                                               predictions, cats)
                        print(dname, utils.tokgreen(out_s))
                        # FIX: was `results = ...`, which discarded all but
                        # the last dataset's line (the pre-**YD** version
                        # used `+=`; `results` is reset each eval round).
                        results += dname + '\t' + utils.tokgreen(
                            out_s) + '\n'
                        if dname == 'aida-A':
                            dev_f1 = f1
                        continue

                    # Legacy path, only reachable for a dataset named ''.
                    if config['multi_instance']:
                        predictions = self.predict(data, n_best=n_best)
                    else:  # including semisup
                        predictions = self.predict(data, n_best=1)
                    cats = None
                    if 'cat' in data[0][0]['raw']['conll_m']:
                        cats = []
                        for doc in data:
                            cats += [
                                m['raw']['conll_m']['cat'] for m in doc
                            ]
                    if cats is None:
                        f1 = D.eval(org_dev_datasets[di][1], predictions)
                    else:
                        f1, tab = D.eval(org_dev_datasets[di][1],
                                         predictions, cats)
                        pprint(tab)
                    print(dname, utils.tokgreen('micro F1: ' + str(f1)))

                # Start the patience counter once aida-A F1 is good enough.
                if dev_f1 >= best_f1 and dev_f1 >= f1_start_counting:
                    is_counting = True
                    not_better_count = 0

                if is_counting:
                    if dev_f1 < best_f1:
                        not_better_count += 1
                        print('not dev f1 inc after', not_better_count)
                    else:
                        final_result_str = results
                        not_better_count = 0
                        best_f1 = dev_f1
                        if self.args.model_path is not None:
                            print('save model to', self.args.model_path)
                            self.model.save(self.args.model_path)

                if not_better_count == self.args.n_not_inc:
                    print('dev f1 not inc after', not_better_count,
                          '... STOP')
                    stop = True
                    break

        print('epoch', e, 'total loss', total_loss,
              total_loss / len(train_dataset))

        if stop:
            print('**********************************************')
            print('best results (f1 on aida-A):')
            print(final_result_str)
            break
def train(self, org_train_dataset, org_dev_datasets, config):
    """Supervised training loop: one document per minibatch, Adam optimizer,
    periodic dev evaluation with lr decay and early stopping keyed on the
    'aida-A' dev set.

    Args:
        org_train_dataset: raw training documents, converted with
            get_data_items.
        org_dev_datasets: list of (name, raw_data) dev sets.
        config: dict with 'lr' and 'n_epochs'; 'lr' is mutated in place
            when the learning rate is lowered.

    Side effects: updates self.model, saves the best model to
    self.args.model_path, prints progress output.
    """
    print('extracting training data')
    train_dataset = self.get_data_items(org_train_dataset, predict=False)
    print('#train docs', len(train_dataset))

    dev_datasets = []
    for dname, data in org_dev_datasets:
        dev_datasets.append((dname, self.get_data_items(data, predict=True)))
        print(dname, '#dev docs', len(dev_datasets[-1][1]))

    print('creating optimizer')
    optimizer = optim.Adam(
        [p for p in self.model.parameters() if p.requires_grad],
        lr=config['lr'])
    best_f1 = -1
    not_better_count = 0
    is_counting = False  # patience counting starts only after the lr drop
    eval_after_n_epochs = self.args.eval_after_n_epochs

    for e in range(config['n_epochs']):
        shuffle(train_dataset)
        total_loss = 0
        for dc, batch in enumerate(
                train_dataset):  # each document is a minibatch
            self.model.train()
            optimizer.zero_grad()

            # convert data items to pytorch inputs
            # Left+right context token ids per mention; fall back to a single
            # unk token when a mention has no context at all.
            token_ids = [
                m['context'][0] + m['context'][1]
                if len(m['context'][0]) + len(m['context'][1]) > 0
                else [self.model.word_voca.unk_id]
                for m in batch
            ]
            # Secondary ("snd") vocabulary views of the context and mention.
            s_ltoken_ids = [m['snd_ctx'][0] for m in batch]
            s_rtoken_ids = [m['snd_ctx'][1] for m in batch]
            s_mtoken_ids = [m['snd_ment'] for m in batch]

            # Candidate entities, gold positions, prior p(e|m) and padding
            # masks, all moved to GPU. (Variable is legacy pre-0.4 PyTorch.)
            entity_ids = Variable(
                torch.LongTensor(
                    [m['selected_cands']['cands'] for m in batch]).cuda())
            true_pos = Variable(
                torch.LongTensor([
                    m['selected_cands']['true_pos'] for m in batch
                ]).cuda())
            p_e_m = Variable(
                torch.FloatTensor(
                    [m['selected_cands']['p_e_m'] for m in batch]).cuda())
            entity_mask = Variable(
                torch.FloatTensor(
                    [m['selected_cands']['mask'] for m in batch]).cuda())

            # Pad variable-length token lists to equal length; left context
            # is padded on the left (to_right=False).
            token_ids, token_mask = utils.make_equal_len(
                token_ids, self.model.word_voca.unk_id)
            s_ltoken_ids, s_ltoken_mask = utils.make_equal_len(
                s_ltoken_ids, self.model.snd_word_voca.unk_id,
                to_right=False)
            s_rtoken_ids, s_rtoken_mask = utils.make_equal_len(
                s_rtoken_ids, self.model.snd_word_voca.unk_id)
            # Right context is reversed so the model reads it inward.
            s_rtoken_ids = [l[::-1] for l in s_rtoken_ids]
            s_rtoken_mask = [l[::-1] for l in s_rtoken_mask]
            s_mtoken_ids, s_mtoken_mask = utils.make_equal_len(
                s_mtoken_ids, self.model.snd_word_voca.unk_id)

            token_ids = Variable(torch.LongTensor(token_ids).cuda())
            token_mask = Variable(torch.FloatTensor(token_mask).cuda())

            # too ugly but too lazy to fix it
            # (the secondary inputs are passed via model attributes rather
            # than forward() arguments — forward() reads these fields)
            self.model.s_ltoken_ids = Variable(
                torch.LongTensor(s_ltoken_ids).cuda())
            self.model.s_ltoken_mask = Variable(
                torch.FloatTensor(s_ltoken_mask).cuda())
            self.model.s_rtoken_ids = Variable(
                torch.LongTensor(s_rtoken_ids).cuda())
            self.model.s_rtoken_mask = Variable(
                torch.FloatTensor(s_rtoken_mask).cuda())
            self.model.s_mtoken_ids = Variable(
                torch.LongTensor(s_mtoken_ids).cuda())
            self.model.s_mtoken_mask = Variable(
                torch.FloatTensor(s_mtoken_mask).cuda())

            scores = self.model.forward(token_ids,
                                        token_mask,
                                        entity_ids,
                                        entity_mask,
                                        p_e_m,
                                        gold=true_pos.view(-1, 1))
            loss = self.model.loss(scores, true_pos)

            loss.backward()
            optimizer.step()
            # self.model.regularize(max_norm=100)

            loss = loss.cpu().data.numpy()
            total_loss += loss
            print('epoch', e,
                  "%0.2f%%" % (dc / len(train_dataset) * 100),
                  loss, end='\r')

        print('epoch', e, 'total loss', total_loss,
              total_loss / len(train_dataset))

        if (e + 1) % eval_after_n_epochs == 0:
            dev_f1 = 0
            for di, (dname, data) in enumerate(dev_datasets):
                predictions = self.predict(data)
                f1 = D.eval(org_dev_datasets[di][1], predictions)
                print(dname, utils.tokgreen('micro F1: ' + str(f1)))
                # aida-A is the validation set that drives lr decay and
                # early stopping.
                if dname == 'aida-A':
                    dev_f1 = f1

            # Once dev F1 is high enough, drop the lr 1e-4 -> 1e-5 and
            # start counting non-improving evals. rel-norm rebuilds the
            # optimizer; ment-norm just lowers the existing lr.
            if config[
                    'lr'] == 1e-4 and dev_f1 >= self.args.dev_f1_change_lr:
                eval_after_n_epochs = 2
                is_counting = True
                best_f1 = dev_f1
                not_better_count = 0

                config['lr'] = 1e-5
                print('change learning rate to', config['lr'])
                if self.args.mulrel_type == 'rel-norm':
                    optimizer = optim.Adam([
                        p for p in self.model.parameters()
                        if p.requires_grad
                    ], lr=config['lr'])
                elif self.args.mulrel_type == 'ment-norm':
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = config['lr']

            if is_counting:
                if dev_f1 < best_f1:
                    not_better_count += 1
                else:
                    not_better_count = 0
                    best_f1 = dev_f1
                    print('save model to', self.args.model_path)
                    self.model.save(self.args.model_path)

            if not_better_count == self.args.n_not_inc:
                break

    self.model.print_weight_norm()
preranked_train = preranked_train[:min(args.n_docs, len(preranked_train))] org_dev_datasets = [(all_datasets[i][0], all_datasets[i][1]) for i in range(1, len(all_datasets))] dev_datasets = [(all_datasets[i][0], all_datasets[i][2]) for i in range(1, len(all_datasets))] dev_datasets = [(all_datasets[i][0], all_datasets[i][1], all_datasets[2]) for i in range(1, len(all_datasets))] for di, (dname, data, preranked) in enumerate(dev_datasets): ranker.model._coh_ctx_vecs = [] predictions = ranker.predict(preranked) print( dname, utils.tokgreen('micro F1: ' + str(D.eval(data, predictions)))) elif args.mode == 'ed': with open(args.filelist, 'r') as flist: for fname in flist: fname = fname.strip() print('load file from', fname) conll_path = fname cands_path = conll_path + '.csv' data = D.CoNLLDataset.load_file(conll_path, cands_path, person_path) data = ranker.get_data_items(data, predict=True) print('#docs', len(data)) continue
# NOTE(review): this chunk opens mid-list — the `dev_datasets = [` header and
# the earlier entries lie before the visible region; confirm against the full
# file.
    ('reddit2020silver', conll.reddit2020silver),
    ('reddit2020g_s', conll.reddit2020g_s),
]

if args.mode == 'train':
    print('training...')
    config = {'lr': args.learning_rate, 'n_epochs': args.n_epochs}
    pprint(config)
    ranker.train(conll.train, dev_datasets, config)

elif args.mode == 'eval':
    org_dev_datasets = dev_datasets  # + [('aida-train', conll.train)]
    dev_datasets = []
    for dname, data in org_dev_datasets:
        dev_datasets.append(
            (dname, ranker.get_data_items(data, predict=True)))
        print(dname, '#dev docs', len(dev_datasets[-1][1]))

    # NOTE(review): vecs appears unused in this fragment — presumably
    # consumed outside the visible region; confirm.
    vecs = ranker.model.rel_embs.cpu().data.numpy()

    for di, (dname, data) in enumerate(dev_datasets):
        # Reset cached coherence context vectors between datasets.
        ranker.model._coh_ctx_vecs = []
        predictions = ranker.predict(data)
        # **YD** change output of D.eval to include prec, rec and f1
        """
        print(dname, utils.tokgreen('micro F1: ' + str(D.eval(org_dev_datasets[di][1], predictions))))
        """
        # NOTE(review): other mode-dispatch variants pass
        # org_dev_datasets[di][1] (the raw data) to D.eval; here the
        # preranked `data` is passed — verify this is intentional.
        f1, out_s = D.eval(data, predictions)
        print(dname, utils.tokgreen(out_s))
# NOTE(review): the first two statements continue an unseen
# "if args.mode == 'train':" branch whose header is outside the visible
# region; confirm against the full file.
    pprint(config)
    ranker.train(conll.train, dev_datasets, config)

elif args.mode == 'eval':
    org_dev_datasets = dev_datasets  # + [('aida-train', conll.train)]
    dev_datasets = []
    for dname, data in org_dev_datasets:
        dev_datasets.append((dname, ranker.get_data_items(data,
                                                          predict=True)))
        print(dname, '#dev docs', len(dev_datasets[-1][1]))
        # Dump the 'test' split (non-predict items) for offline inspection.
        if dname == 'test':
            with open('test_data.pickle', 'wb') as f:
                pickle.dump(ranker.get_data_items(data, predict=False), f,
                            pickle.HIGHEST_PROTOCOL)

    # NOTE(review): vecs appears unused in this fragment — presumably
    # consumed outside the visible region; confirm.
    vecs = ranker.model.rel_embs.cpu().data.numpy()

    for di, (dname, data) in enumerate(dev_datasets):
        # Only the first dev set is evaluated here (debug shortcut?).
        if di == 1:
            break
        # Reset cached coherence context vectors between datasets.
        ranker.model._coh_ctx_vecs = []
        predictions = ranker.predict(data)
        print(dname,
              utils.tokgreen('micro F1: ' +
                             str(D.eval(org_dev_datasets[di][1],
                                        predictions))))

elif args.mode == 'test':
    # Run entity linking over a plain-text input file.
    text = ''
    with open(args.input_file, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            # NOTE(review): readlines() keeps the trailing '\n', so this
            # doubles every line break — verify entity_linking_plain
            # expects that.
            text += line + '\n'
    result = entity_linking_plain(text)
    print(result)