def evaluate_all(my_arg, pr=True):
    """Evaluate the seq2seq NER model (CNN encoder + Bahdanau attention
    decoder) on the English dev set, one sentence at a time.

    Args:
        my_arg: run identifier; only used to suffix the output file name.
        pr: forwarded to the evaluator; presumably toggles per-sentence
            printing -- TODO confirm against BoundaryPerformance.evaluate.

    Returns:
        Whatever ``BoundaryPerformance.get_performance()`` returns
        (presumably P/R/F metrics -- confirm in its definition).
    """
    emb = LoadEmbedding('res/emb.txt')
    print('finish loading embedding')
    encoder = CNNEncoder(emb, dropout_p=0)
    decoder = BahdanauAttnDecoderRNN(config, config['encoder_outputs_size'],
                                     config['hidden_size'],
                                     config['decoder_output_size'],
                                     config['decoder_layers'], dropout_p=0)
    # Restore trained weights. NOTE(review): if the checkpoints were saved
    # from a DataParallel model, keys carry a 'module.' prefix that must be
    # stripped first (an earlier revision did this with k.partition).
    en_dict = torch.load('model/encoder_params.pkl')
    de_dict = torch.load('model/decoder_params.pkl')
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    # Batch size 1 and shuffle=False keep predictions aligned with the
    # i-th gold sentence in the file.
    batch_getter = BatchGetter('data/eng_dev.txt', 1, shuffle=False)
    if config['USE_CUDA']:
        encoder.cuda(config['cuda_num'])
        decoder.cuda(config['cuda_num'])
    ner_tag = Vocab('res/ner_xx', unk_id=config['UNK_token'],
                    pad_id=config['PAD_token'])
    evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()
    out_file = codecs.open('data/eva_result' + str(my_arg), mode='wb',
                           encoding='utf-8')
    for i, this_batch in enumerate(batch_getter):
        top_path = eva_one_sentence(encoder, decoder, this_batch)
        # Drop the leading start-of-sequence tag from the decoded path.
        top_path = top_path[1:]
        evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path,
                           out_file, pr)
        if i % 100 == 0:
            print('{} sentences processed'.format(i))
    evaluator.get_performance()
    return evaluator.get_performance()
        # NOTE(review): fragment -- the enclosing function's header (likely
        # ``evaluate_free``, called from __main__ below) and the opening
        # ``if fg_config['data'] == ...`` branch are not visible in this chunk.
        patch = utils.wiki_short2full_patch()
        short2full.update(patch)
    elif fg_config['data'] == 'bbn':
        # BBN dataset: build the short->full type-name map from BBN types.
        short2full = get_short2full_map(utils.get_bbn_types())
    for iteration, this_batch in enumerate(batch_getter):
        # NOTE(review): ``label`` is returned but unused -- the gold types are
        # taken from this_batch['types_str'] instead (see commented call).
        pred, label = evaluate_one(ex_iterations + iteration,
                                   word_embedding_layer, type_embedding_layer,
                                   ctx_lstm, ctx_att, warp_loss, this_batch)
        # evaluator.evaluate(label, pred, type_lst, short2full)
        evaluator.evaluate(this_batch['types_str'], pred, type_lst, short2full)
        if (iteration+1)*batch_size % 100 == 0:
            print('{} sentences processed'.format((iteration+1)*batch_size))
    evaluator.get_performance()
    return evaluator.get_performance()


if __name__ == '__main__':
    # Zero-shot fine-grained typing evaluation entry point: configure the run
    # and evaluate on the BBN data.
    fg_config['cuda_num'] = 0
    fg_config['batch_size'] = 64
    fg_config['att'] = 'label_att'
    fg_config['zero_shot'] = True
    fg_config['no_zero'] = 'all'
    fg_config['topk'] = 3
    fg_config['data'] = 'bbn'
    fg_config['type_id'] = Vocab(
        'res/{}/zero_type_voc.txt'.format(fg_config['data']),
        unk_id=fg_config['UNK_token'], pad_id=fg_config['PAD_token'])
    torch.cuda.set_device(fg_config['cuda_num'])
    evaluate_free(0)
def evaluate_all(my_arg, pr=True):
    """Evaluate the attention-flow (BiDAF-style) NER model on a dev set.

    Args:
        my_arg: selects both the dev data (0/2: CoNLL-2003 testa; 1:
            'data/dev' *_NAM mention files) and the model directory
            'model<my_arg>'.
        pr: forwarded to the evaluator; presumably toggles per-sentence
            printing -- TODO confirm against ConllBoundaryPerformance.

    Returns:
        The result of ``ConllBoundaryPerformance.get_performance()``
        (presumably P/R/F metrics -- confirm in its definition).
    """
    emb = LoadEmbedding('res/emb.txt')
    print('finish loading embedding')
    batch_getter_lst = []
    # my_arg 0 and 2 built byte-identical getter lists; the duplicated
    # branches are merged (entity order PER, LOC, MISC, ORG is preserved).
    if my_arg in (0, 2):
        for ent_type in ('PER', 'LOC', 'MISC', 'ORG'):
            batch_getter_lst.append(
                ConllBatchGetter('data/conll2003/bio_eng.testa',
                                 ent_type, 1, False))
    if my_arg == 1:
        for ent_type in ('PER_NAM', 'FAC_NAM', 'LOC_NAM',
                         'GPE_NAM', 'ORG_NAM'):
            batch_getter_lst.append(BatchGetter('data/dev', ent_type, 1, False))
    batch_getter = MergeBatchGetter(batch_getter_lst, 1, False)
    print('finish loading dev data')
    embedding_layer = EmbeddingLayer(emb, 0)
    d = embedding_layer.get_out_dim()
    att_layer = AttentionFlowLayer(2 * d)
    # 3 output classes; an earlier revision used 2 classes when my_arg != 2.
    model_out_layer = ModelingOutLayer(8 * d, d, 2, 3, 0)
    model_dir = 'model' + str(my_arg)
    # Restore trained weights for all three sub-modules.
    embedding_layer.load_state_dict(
        torch.load(model_dir + '/embedding_layer.pkl'))
    att_layer.load_state_dict(torch.load(model_dir + '/att_layer.pkl'))
    model_out_layer.load_state_dict(
        torch.load(model_dir + '/model_out_layer.pkl'))
    ner_tag = Vocab('res/ner_xx', unk_id=config['UNK_token'],
                    pad_id=config['PAD_token'])
    evaluator = ConllBoundaryPerformance(ner_tag)
    evaluator.reset()
    if config['USE_CUDA']:
        att_layer.cuda(config['cuda_num'])
        embedding_layer.cuda(config['cuda_num'])
        model_out_layer.cuda(config['cuda_num'])
    # NOTE(review): the optimizers look unused for pure evaluation, but they
    # are part of evaluate_one's signature, so they are kept.
    emb_opt = torch.optim.Adam(embedding_layer.parameters())
    att_opt = torch.optim.Adam(att_layer.parameters())
    model_out_opt = torch.optim.Adam(model_out_layer.parameters())
    out_file = codecs.open('data/eva_result' + str(my_arg), mode='wb',
                           encoding='utf-8')
    ex_iterations = 0
    for iteration, this_batch in enumerate(batch_getter):
        target, rec = evaluate_one(ex_iterations + iteration, embedding_layer,
                                   att_layer, model_out_layer, emb_opt,
                                   att_opt, model_out_opt, this_batch)
        evaluator.evaluate(iteration, target.numpy().tolist(),
                           rec.numpy().tolist(), out_file, pr=pr)
        if iteration % 100 == 0:
            print('{} sentences processed'.format(iteration))
    evaluator.get_performance()
    return evaluator.get_performance()
def free_evaluate_all(my_arg, pr=True):
    """Evaluate the question-conditioned (machine-comprehension style) NER
    model with a CRF output layer.

    Args:
        my_arg: selects the dataset -- 0: CoNLL testa (MISC only),
            1: CoNLL testb (BIO), 2: CoNLL testb (BIOES), 3: OntoNotes test.
        pr: forwarded to the evaluator; presumably toggles per-sentence
            printing -- TODO confirm against ConllBoundaryPerformance.

    Returns:
        The result of ``ConllBoundaryPerformance.get_performance()``
        (presumably P/R/F metrics -- confirm in its definition).
    """
    emb = LoadEmbedding('res/emb.txt')
    # The label/question embedding is only needed when labels are embedded
    # or a standalone question encoder is used.
    if config['label_emb'] or config['question_alone']:
        onto_emb = LoadEmbedding('res/onto_embedding.txt')
    print('finish loading embedding')
    batch_getter_lst = []
    if my_arg == 0:
        # pernam_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa', 'PER', 1, False)
        # batch_getter_lst.append(pernam_batch_getter)
        # loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa', 'LOC', 1, False)
        # batch_getter_lst.append(loc_batch_getter)
        misc_batch_getter = ConllBatchGetter(
            'data/conll2003/bio_eng.testa', 'MISC', 1, False)
        batch_getter_lst.append(misc_batch_getter)
        # org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa', 'ORG', 1, False)
        # batch_getter_lst.append(org_batch_getter)
    if my_arg == 1:
        # pernam_batch_getter = ConllBatchGetter('data/ttt', 'PER', 1, False)
        # batch_getter_lst.append(pernam_batch_getter)
        # pernam_batch_getter = ConllBatchGetter('data/ttt', 'singer', 1, False)
        # batch_getter_lst.append(pernam_batch_getter)
        pernam_batch_getter = ConllBatchGetter(
            'data/conll2003/bio_eng.testb', 'PER', 1, False)
        batch_getter_lst.append(pernam_batch_getter)
        loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testb',
                                            'LOC', 1, False)
        batch_getter_lst.append(loc_batch_getter)
        misc_batch_getter = ConllBatchGetter(
            'data/conll2003/bio_eng.testb', 'MISC', 1, False)
        batch_getter_lst.append(misc_batch_getter)
        org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testb',
                                            'ORG', 1, False)
        batch_getter_lst.append(org_batch_getter)
    if my_arg == 2:
        # pernam_batch_getter = ConllBatchGetter('data/conll2003/bioes_eng.testb', 'food', 1, False)
        # batch_getter_lst.append(pernam_batch_getter)
        pernam_batch_getter = ConllBatchGetter(
            'data/conll2003/bioes_eng.testb', 'PER', 1, False)
        batch_getter_lst.append(pernam_batch_getter)
        loc_batch_getter = ConllBatchGetter(
            'data/conll2003/bioes_eng.testb', 'LOC', 1, False)
        batch_getter_lst.append(loc_batch_getter)
        misc_batch_getter = ConllBatchGetter(
            'data/conll2003/bioes_eng.testb', 'MISC', 1, False)
        batch_getter_lst.append(misc_batch_getter)
        org_batch_getter = ConllBatchGetter(
            'data/conll2003/bioes_eng.testb', 'ORG', 1, False)
        batch_getter_lst.append(org_batch_getter)
    if my_arg == 3:
        # onto_notes = OntoNotesGetter('data/OntoNotes/test.json', '/person', 1, False)
        # batch_getter_lst.append(onto_notes)
        onto_notes_data = OntoNotesGetter('data/OntoNotes/test.json',
                                          utils.get_ontoNotes_type_lst(),
                                          1, False)
        batch_getter_lst.append(onto_notes_data)
    batch_size = 100
    batch_getter = MergeBatchGetter(batch_getter_lst, batch_size, False,
                                    data_name=config['data'])
    print('finish loading dev data')
    # if config['data'] == 'OntoNotes':
    #     emb_onto = True
    # else:
    #     emb_onto = False
    embedding_layer = EmbeddingLayer(emb, 0)
    if config['label_emb']:
        # Frozen label-word embedding table initialised from onto_emb.
        q_word_embedding = nn.Embedding(onto_emb.get_voc_size(),
                                        onto_emb.get_emb_size())
        q_word_embedding.weight.data.copy_(onto_emb.get_embedding_tensor())
        q_word_embedding.weight.requires_grad = False
    else:
        q_word_embedding = None
    d = config['hidden_size']
    if config['question_alone']:
        q_emb_layer = QLabel(onto_emb, 0)
    else:
        q_emb_layer = None
    att_layer = AttentionFlowLayer(2 * d)
    model_layer = ModelingLayer(8 * d, d, 2, 0)
    ner_hw_layer = NerHighway(2 * d, 8 * d, 1)
    ner_out_layer = NerOutLayer(10 * d, len(config['Tags']), 0)
    crf = CRF(config, config['Tags'], len(config['Tags']))
    # Move modules to the GPU before loading weights below.
    if config['USE_CUDA']:
        att_layer.cuda(config['cuda_num'])
        embedding_layer.cuda(config['cuda_num'])
        if config['label_emb']:
            q_word_embedding.cuda(config['cuda_num'])
        model_layer.cuda(config['cuda_num'])
        ner_hw_layer.cuda(config['cuda_num'])
        ner_out_layer.cuda(config['cuda_num'])
        crf.cuda(config['cuda_num'])
        if config['question_alone']:
            q_emb_layer.cuda(config['cuda_num'])
    model_dir = 'ner_model8'
    # map_location keeps CPU-only machines working with GPU-saved checkpoints.
    att_layer.load_state_dict(
        torch.load(model_dir + '/early_att_layer.pkl',
                   map_location=lambda storage, loc: storage))
    model_layer.load_state_dict(
        torch.load(model_dir + '/early_model_layer.pkl',
                   map_location=lambda storage, loc: storage))
    ner_hw_layer.load_state_dict(
        torch.load(model_dir + '/early_ner_hw_layer.pkl',
                   map_location=lambda storage, loc: storage))
    ner_out_layer.load_state_dict(
        torch.load(model_dir + '/early_ner_out_layer.pkl',
                   map_location=lambda storage, loc: storage))
    crf.load_state_dict(
        torch.load(model_dir + '/early_crf.pkl',
                   map_location=lambda storage, loc: storage))
    embedding_layer.load_state_dict(
        torch.load(model_dir + '/early_embedding_layer.pkl',
                   map_location=lambda storage, loc: storage))
    if config['question_alone']:
        q_emb_layer.load_state_dict(
            torch.load(model_dir + '/q_emb_layer.pkl',
                       map_location=lambda storage, loc: storage))
    else:
        # NOTE(review): redundant -- q_emb_layer is already None here.
        q_emb_layer = None
    # Switch everything to inference mode (disables dropout etc.).
    if config['question_alone']:
        q_emb_layer.eval()
    embedding_layer.eval()
    att_layer.eval()
    model_layer.eval()
    ner_hw_layer.eval()
    ner_out_layer.eval()
    crf.eval()
    ner_tag = Vocab('res/ner_xx', unk_id=config['UNK_token'],
                    pad_id=config['PAD_token'])
    if my_arg == 3:
        # OntoNotes needs the getter to map predictions back to its types.
        evaluator = ConllBoundaryPerformance(ner_tag, onto_notes_data)
    else:
        evaluator = ConllBoundaryPerformance(ner_tag)
    evaluator.reset()
    out_file = codecs.open('data/eva_result' + str(my_arg), mode='wb',
                           encoding='utf-8')
    # writer.add_embedding(embedding_layer.word_embedding.weight.data.cpu())
    # return
    all_emb = None
    all_metadata = []
    ex_iterations = 0
    # Debug switch: when True, evaluate_one also accumulates embeddings for a
    # TensorBoard projector dump after the loop.
    summary_emb = False
    for iteration, this_batch in enumerate(batch_getter):
        # if iteration >= 15:
        #     break
        if summary_emb:
            top_path, all_emb, all_metadata, q = evaluate_one(
                ex_iterations + iteration, embedding_layer, q_word_embedding,
                q_emb_layer, att_layer, model_layer, ner_hw_layer,
                ner_out_layer, crf, this_batch, summary_emb, all_emb,
                all_metadata)
        else:
            top_path = evaluate_one(ex_iterations + iteration,
                                    embedding_layer, q_word_embedding,
                                    q_emb_layer, att_layer, model_layer,
                                    ner_hw_layer, ner_out_layer, crf,
                                    this_batch)
        # Score each sentence of the batch against its gold tag sequence
        # (end-of-sequence tag stripped).
        for batch_no, path in enumerate(top_path):
            evaluator.evaluate(
                iteration * batch_size + batch_no,
                remove_end_tag(
                    this_batch[1].numpy()[batch_no, :].tolist()),
                path, out_file, pr)
        if (iteration + 1) * batch_size % 100 == 0:
            print('{} sentences processed'.format(
                (iteration + 1) * batch_size))
    evaluator.get_performance()
    if summary_emb:
        writer.add_embedding(torch.cat([q, all_emb], 0),
                             metadata=['question'] + all_metadata)
    return evaluator.get_performance()
# en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict} # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict} print en_dict.keys() encoder.load_state_dict(en_dict) decoder.load_state_dict(de_dict) decoder_optimizer = torch.optim.Adadelta(decoder.parameters()) encoder_optimizer = torch.optim.Adadelta(encoder.parameters()) decoder_optimizer.zero_grad() encoder_optimizer.zero_grad() batch_getter = BatchGetter('data/dev', 1, shuffle=False) if config['USE_CUDA']: encoder.cuda(config['cuda_num']) decoder.cuda(config['cuda_num']) ner_tag = Vocab('res/ner_xx', unk_id=config['UNK_token'], pad_id=config['PAD_token']) evaluator = BoundaryPerformance(ner_tag) evaluator.reset() out_file = codecs.open('data/eva_result.txt', mode='wb', encoding='utf-8') for i, this_batch in enumerate(batch_getter): top_path = eva_one_sentence(encoder, decoder, this_batch) top_path = top_path[1:] # print [ner_tag.getWord(tag) for tag in top_path] evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path, out_file) if i % 100 == 0: print '{} sentences processed'.format(i) evaluator.get_performance()
    # NOTE(review): fragment -- the enclosing function's header (likely
    # ``cmn_crf_eval_free``, called below) and the setup of encoder,
    # bidencoder, decoder, batch_getter, batch_size and log_dir are not
    # visible in this chunk.
    evaluator.reset()
    out_file = codecs.open(os.path.join(log_dir, 'bio_eva_result'),
                           mode='wb', encoding='utf-8')
    for i, this_batch in enumerate(batch_getter):
        top_path = crf_eval_one_sen(config, encoder, bidencoder, decoder,
                                    this_batch)
        # top_path = top_path[1:]
        # Score each sentence of the batch; gold sequences have their
        # end-of-sequence tag stripped first.
        for batch_no, path in enumerate(top_path):
            evaluator.evaluate(
                i,
                remove_end_tag(this_batch[1].numpy()[batch_no, :].tolist()),
                path, out_file, pr)
        # print [ner_tag.getWord(tag) for tag in top_path]
        # evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path, out_file, pr)
        if (i + 1) * batch_size % 100 == 0:
            print '{} sentences processed'.format((i + 1) * batch_size)
    evaluator.get_performance()
    return evaluator.get_performance()


# Top-level driver: Chinese (cmn) CRF evaluation on the bio test set.
config = get_conf('cmn')
config['dev_data'] = 'data/bio_cmn_test.txt'
config['decoder_output_size'] = 25
config['model_dir'] = 'crf_' + config['model_dir']
config['decoder_layers'] = 1
config['BioOutTags'] = Vocab('res_cmn/crf_ner_bio.txt',
                             unk_id=config['UNK_token'],
                             pad_id=config['PAD_token'])
# Presumably returns F1, precision, recall -- TODO confirm in
# cmn_crf_eval_free's definition.
f, p, r = cmn_crf_eval_free(config, 'test', False)