def test(data, opt):
    corpus_dir = opt.test_file

    if opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [f for f in os.listdir(corpus_dir) if f.find('.xml') != -1]

    model = SeqModel(data, opt)
    if opt.test_in_cpu:
        model.load_state_dict(
            torch.load(os.path.join(opt.output, 'model.pkl'), map_location='cpu'))
    else:
        model.load_state_dict(torch.load(os.path.join(opt.output, 'model.pkl')))

    meddra_dict = load_meddra_dict(data)

    # initialize norm models
    if opt.norm_rule and opt.norm_vsm and opt.norm_neural:  # ensemble
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, meddra_dict, None, True)
        if opt.ensemble == 'learn':
            if opt.test_in_cpu:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'),
                                            map_location='cpu')
            else:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'))
            ensemble_model.eval()
        else:
            if opt.test_in_cpu:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                       map_location='cpu')
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'),
                                          map_location='cpu')
            else:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
            vsm_model.eval()
            neural_model.eval()
    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, meddra_dict)
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")

    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, annotation_file = processOneFile_fda(fileName, corpus_dir, nlp_tool, False,
                                                           opt.types, opt.type_filter, True, False)

            pred_entities = []

            for section in document:
                data.test_texts = []
                data.test_Ids = []
                read_instance_from_one_document(section, data.word_alphabet, data.char_alphabet,
                                                data.label_alphabet, data.test_texts,
                                                data.test_Ids, data)

                _, _, _, _, _, pred_results, _ = evaluate(data, opt, model, 'test', False, opt.nbest)

                entities = translateResultsintoEntities(section.sentences, pred_results)

                # remove entities in the ignore regions and fill in section_id
                section_id = section.name[section.name.rfind('_') + 1:]
                entities = remove_entity_in_the_ignore_region(annotation_file.ignore_regions,
                                                              entities, section_id)

                if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
                    if opt.ensemble == 'learn':
                        ensemble_model.process_one_doc(section, entities, meddra_dict, None, True)
                    else:
                        pred_entities1 = copy.deepcopy(entities)
                        pred_entities2 = copy.deepcopy(entities)
                        pred_entities3 = copy.deepcopy(entities)
                        multi_sieve.runMultiPassSieve(section, pred_entities1, meddra_dict, True)
                        vsm_model.process_one_doc(section, pred_entities2, meddra_dict, None, True)
                        neural_model.process_one_doc(section, pred_entities3, meddra_dict, None, True)
                        # merge pred_entities1, pred_entities2, pred_entities3 into entities
                        ensemble.merge_result(pred_entities1, pred_entities2, pred_entities3,
                                              entities, meddra_dict, True,
                                              vsm_model.dict_alphabet, data)
                elif opt.norm_rule:
                    multi_sieve.runMultiPassSieve(section, entities, meddra_dict, True)
                elif opt.norm_vsm:
                    vsm_model.process_one_doc(section, entities, meddra_dict, None, True)
                elif opt.norm_neural:
                    neural_model.process_one_doc(section, entities, meddra_dict, None, True)

                for entity in entities:
                    if len(entity.norm_ids) != 0:  # if a mention can't be normalized, don't output it
                        pred_entities.append(entity)

            dump_results(fileName, pred_entities, opt, annotation_file)

            end = time.time()
            logging.info("process %s complete with %.2fs" % (fileName, end - start))

            ct_success += 1
        except Exception as e:
            logging.error("process file {} error: {}".format(fileName, e))
            ct_error += 1

    if opt.norm_rule:
        multi_sieve.finalize(True)

    logging.info("test finished, total {}, error {}".format(ct_success + ct_error, ct_error))
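
# Note: the CPU/GPU branching around torch.load above is repeated for every pickled model
# (ensemble.pkl, vsm.pkl, norm_neural.pkl). A minimal sketch of a helper that could factor
# that pattern out; "load_pickled_model" and its parameters are illustrative names only and
# are not part of this repo. Assumes the module-level torch import used above.
def load_pickled_model(path, in_cpu):
    """Load a pickled torch model, mapping tensors to CPU when requested."""
    if in_cpu:
        return torch.load(path, map_location='cpu')
    return torch.load(path)

# Hypothetical usage mirroring the branches above:
#     vsm_model = load_pickled_model(os.path.join(opt.output, 'vsm.pkl'), opt.test_in_cpu)
#     vsm_model.eval()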
def metamap_ner_my_norm(d):
    print("load umls ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))]

    if opt.norm_rule:
        multi_sieve.init(opt, None, d, UMLS_dict, UMLS_dict_reverse, False)
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            join(predict_dir, gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # copy entities from metamap entities
        pred_entities = []
        for gold in predict_document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        if opt.norm_rule:
            multi_sieve.runMultiPassSieve(gold_document, pred_entities, UMLS_dict, False)
        elif opt.norm_neural:
            neural_model.process_one_doc(gold_document, pred_entities, UMLS_dict,
                                         UMLS_dict_reverse, False)
        elif opt.norm_vsm:
            vsm_model.process_one_doc(gold_document, pred_entities, UMLS_dict,
                                      UMLS_dict_reverse, False)
        else:
            raise RuntimeError("wrong configuration")

        p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities, UMLS_dict)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    p = ct_norm_correct * 1.0 / ct_norm_predict
    r = ct_norm_correct * 1.0 / ct_norm_gold
    f1 = 2.0 * p * r / (p + r)
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
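
# The micro P/R/F1 computation above divides by ct_norm_predict and ct_norm_gold directly,
# which raises ZeroDivisionError when nothing is predicted or no gold mentions exist.
# A small sketch of the same arithmetic with guards; "safe_prf1" is an illustrative helper
# name, not an existing function in this repo.
def safe_prf1(correct, predict, gold):
    """Micro precision/recall/F1 from raw counts, returning 0.0 instead of dividing by zero."""
    p = correct * 1.0 / predict if predict > 0 else 0.0
    r = correct * 1.0 / gold if gold > 0 else 0.0
    f1 = 2.0 * p * r / (p + r) if (p + r) > 0 else 0.0
    return p, r, f1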
def train(train_data, dev_data, test_data, d, dictionary, dictionary_reverse, opt, fold_idx,
          isMeddra_dict):
    logging.info("train the ensemble normalization model ...")

    external_train_data = []
    if d.config.get('norm_ext_corpus') is not None:
        for k, v in d.config['norm_ext_corpus'].items():
            if k == 'tac':
                external_train_data.extend(
                    load_data_fda(v['path'], True, v.get('types'), v.get('types'), False, True))
            else:
                raise RuntimeError("external corpus not supported")
    if len(external_train_data) != 0:
        train_data.extend(external_train_data)

    logging.info("build alphabet ...")
    word_alphabet = Alphabet('word')
    norm_utils.build_alphabet_from_dict(word_alphabet, dictionary, isMeddra_dict)
    norm_utils.build_alphabet(word_alphabet, train_data)
    if opt.dev_file:
        norm_utils.build_alphabet(word_alphabet, dev_data)
    if opt.test_file:
        norm_utils.build_alphabet(word_alphabet, test_data)
    norm_utils.fix_alphabet(word_alphabet)

    if d.config.get('norm_emb') is not None:
        logging.info("load pretrained word embedding ...")
        pretrain_word_embedding, word_emb_dim = build_pretrain_embedding(
            d.config.get('norm_emb'), word_alphabet, opt.word_emb_dim, False)
        word_embedding = nn.Embedding(word_alphabet.size(), word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(torch.from_numpy(pretrain_word_embedding))
        embedding_dim = word_emb_dim
    else:
        logging.info("randomly initialize word embedding ...")
        word_embedding = nn.Embedding(word_alphabet.size(), d.word_emb_dim, padding_idx=0)
        word_embedding.weight.data.copy_(
            torch.from_numpy(random_embedding(word_alphabet.size(), d.word_emb_dim)))
        embedding_dim = d.word_emb_dim

    dict_alphabet = Alphabet('dict')
    norm_utils.init_dict_alphabet(dict_alphabet, dictionary)
    norm_utils.fix_alphabet(dict_alphabet)

    # rule
    logging.info("init rule-based normer")
    multi_sieve.init(opt, train_data, d, dictionary, dictionary_reverse, isMeddra_dict)

    if opt.ensemble == 'learn':
        logging.info("init ensemble normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict)
        ensemble_model = Ensemble(word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses)
        if pretrain_neural_model is not None:
            ensemble_model.neural_linear.weight.data.copy_(pretrain_neural_model.linear.weight.data)
        if pretrain_vsm_model is not None:
            ensemble_model.vsm_linear.weight.data.copy_(pretrain_vsm_model.linear.weight.data)

        ensemble_train_X = []
        ensemble_train_Y = []
        for doc in train_data:
            temp_X, temp_Y = generate_instances(doc, word_alphabet, dict_alphabet, dictionary,
                                                dictionary_reverse, isMeddra_dict)
            ensemble_train_X.extend(temp_X)
            ensemble_train_Y.extend(temp_Y)
        ensemble_train_loader = DataLoader(MyDataset(ensemble_train_X, ensemble_train_Y),
                                           opt.batch_size, shuffle=True, collate_fn=my_collate)
        ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(ensemble_model.word_embedding)
    else:
        # vsm
        logging.info("init vsm-based normer")
        poses = vsm.init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict)
        # the alphabet can be shared between vsm and neural since it doesn't change,
        # but word_embedding cannot
        vsm_model = vsm.VsmNormer(word_alphabet, copy.deepcopy(word_embedding), embedding_dim,
                                  dict_alphabet, poses)

        vsm_train_X = []
        vsm_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = vsm.generate_instances(doc.entities, word_alphabet, dict_alphabet)
            else:
                temp_X, temp_Y = vsm.generate_instances_ehr(doc.entities, word_alphabet,
                                                            dict_alphabet, dictionary_reverse)
            vsm_train_X.extend(temp_X)
            vsm_train_Y.extend(temp_Y)
        vsm_train_loader = DataLoader(vsm.MyDataset(vsm_train_X, vsm_train_Y), opt.batch_size,
                                      shuffle=True, collate_fn=vsm.my_collate)
        vsm_optimizer = optim.Adam(vsm_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(vsm_model.word_embedding)

        if d.config['norm_vsm_pretrain'] == '1':
            vsm.dict_pretrain(dictionary, dictionary_reverse, d, True, vsm_optimizer, vsm_model)

        # neural
        logging.info("init neural-based normer")
        neural_model = norm_neural.NeuralNormer(word_alphabet, copy.deepcopy(word_embedding),
                                                embedding_dim, dict_alphabet)

        neural_train_X = []
        neural_train_Y = []
        for doc in train_data:
            if isMeddra_dict:
                temp_X, temp_Y = norm_neural.generate_instances(doc.entities, word_alphabet,
                                                                dict_alphabet)
            else:
                temp_X, temp_Y = norm_neural.generate_instances_ehr(doc.entities, word_alphabet,
                                                                    dict_alphabet,
                                                                    dictionary_reverse)
            neural_train_X.extend(temp_X)
            neural_train_Y.extend(temp_Y)
        neural_train_loader = DataLoader(norm_neural.MyDataset(neural_train_X, neural_train_Y),
                                         opt.batch_size, shuffle=True,
                                         collate_fn=norm_neural.my_collate)
        neural_optimizer = optim.Adam(neural_model.parameters(), lr=opt.lr, weight_decay=opt.l2)
        if opt.tune_wordemb == False:
            freeze_net(neural_model.word_embedding)

        if d.config['norm_neural_pretrain'] == '1':
            neural_model.dict_pretrain(dictionary, dictionary_reverse, d, True, neural_optimizer,
                                       neural_model)

    best_dev_f = -10
    best_dev_p = -10
    best_dev_r = -10

    bad_counter = 0

    logging.info("start training ...")
    for idx in range(opt.iter):
        epoch_start = time.time()

        if opt.ensemble == 'learn':
            ensemble_model.train()
            ensemble_train_iter = iter(ensemble_train_loader)
            ensemble_num_iter = len(ensemble_train_loader)

            for i in range(ensemble_num_iter):
                x, rules, lengths, y = next(ensemble_train_iter)

                y_pred = ensemble_model.forward(x, rules, lengths)
                l = ensemble_model.loss(y_pred, y)
                l.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(ensemble_model.parameters(), opt.gradient_clip)
                ensemble_optimizer.step()
                ensemble_model.zero_grad()
        else:
            vsm_model.train()
            vsm_train_iter = iter(vsm_train_loader)
            vsm_num_iter = len(vsm_train_loader)

            for i in range(vsm_num_iter):
                x, lengths, y = next(vsm_train_iter)

                l, _ = vsm_model.forward_train(x, lengths, y)
                l.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(vsm_model.parameters(), opt.gradient_clip)
                vsm_optimizer.step()
                vsm_model.zero_grad()

            neural_model.train()
            neural_train_iter = iter(neural_train_loader)
            neural_num_iter = len(neural_train_loader)

            for i in range(neural_num_iter):
                x, lengths, y = next(neural_train_iter)

                y_pred = neural_model.forward(x, lengths)
                l = neural_model.loss(y_pred, y)
                l.backward()

                if opt.gradient_clip > 0:
                    torch.nn.utils.clip_grad_norm_(neural_model.parameters(), opt.gradient_clip)
                neural_optimizer.step()
                neural_model.zero_grad()

        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs" % (idx, epoch_finish - epoch_start))

        if opt.dev_file:
            if opt.ensemble == 'learn':
                # logging.info("weight w1: %.4f, w2: %.4f, w3: %.4f" % (ensemble_model.w1.data.item(), ensemble_model.w2.data.item(), ensemble_model.w3.data.item()))
                p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, None, None,
                                              ensemble_model, d, isMeddra_dict)
            else:
                p, r, f = norm_utils.evaluate(dev_data, dictionary, dictionary_reverse, vsm_model,
                                              neural_model, None, d, isMeddra_dict)
            logging.info("Dev: p: %.4f, r: %.4f, f: %.4f" % (p, r, f))
        else:
            f = best_dev_f

        if f > best_dev_f:
            logging.info("Exceed previous best f score on dev: %.4f" % (best_dev_f))

            if opt.ensemble == 'learn':
                if fold_idx is None:
                    torch.save(ensemble_model, os.path.join(opt.output, "ensemble.pkl"))
                else:
                    torch.save(ensemble_model,
                               os.path.join(opt.output, "ensemble_{}.pkl".format(fold_idx + 1)))
            else:
                if fold_idx is None:
                    torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
                    torch.save(neural_model, os.path.join(opt.output, "norm_neural.pkl"))
                else:
                    torch.save(vsm_model,
                               os.path.join(opt.output, "vsm_{}.pkl".format(fold_idx + 1)))
                    torch.save(neural_model,
                               os.path.join(opt.output, "norm_neural_{}.pkl".format(fold_idx + 1)))

            best_dev_f = f
            best_dev_p = p
            best_dev_r = r

            bad_counter = 0
        else:
            bad_counter += 1

        if len(opt.dev_file) != 0 and bad_counter >= opt.patience:
            logging.info('Early Stop!')
            break

    logging.info("train finished")

    if fold_idx is None:
        multi_sieve.finalize(True)
    else:
        if fold_idx == opt.cross_validation - 1:
            multi_sieve.finalize(True)
        else:
            multi_sieve.finalize(False)

    if len(opt.dev_file) == 0:
        if opt.ensemble == 'learn':
            torch.save(ensemble_model, os.path.join(opt.output, "ensemble.pkl"))
        else:
            torch.save(vsm_model, os.path.join(opt.output, "vsm.pkl"))
            torch.save(neural_model, os.path.join(opt.output, "norm_neural.pkl"))

    return best_dev_p, best_dev_r, best_dev_f
def test(data, opt):
    # corpus_dir = join(opt.test_file, 'corpus')
    # corpus_dir = join(opt.test_file, 'txt')
    corpus_dir = opt.test_file

    if opt.nlp_tool == "spacy":
        nlp_tool = spacy.load('en')
    elif opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    elif opt.nlp_tool == "stanford":
        nlp_tool = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [f for f in listdir(corpus_dir) if isfile(join(corpus_dir, f))]

    model = SeqModel(data, opt)
    if opt.test_in_cpu:
        model.load_state_dict(
            torch.load(os.path.join(opt.output, 'model.pkl'), map_location='cpu'))
    else:
        model.load_state_dict(torch.load(os.path.join(opt.output, 'model.pkl')))

    dictionary, dictionary_reverse = umls.load_umls_MRCONSO(data.config['norm_dict'])
    isMeddra_dict = False

    # initialize norm models
    if opt.norm_rule and opt.norm_vsm and opt.norm_neural:  # ensemble
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse, False)
        if opt.ensemble == 'learn':
            if opt.test_in_cpu:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'),
                                            map_location='cpu')
            else:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'))
            ensemble_model.eval()
        else:
            if opt.test_in_cpu:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'),
                                          map_location='cpu')
            else:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
            vsm_model.eval()
            neural_model.eval()
    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse, False)
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")

    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, _, _, _ = processOneFile(fileName, None, corpus_dir, nlp_tool, False,
                                               opt.types, opt.type_filter)

            data.test_texts = []
            data.test_Ids = []
            read_instance_from_one_document(document, data.word_alphabet, data.char_alphabet,
                                            data.label_alphabet, data.test_texts, data.test_Ids,
                                            data)

            _, _, _, _, _, pred_results, _ = evaluate(data, opt, model, 'test', False, opt.nbest)

            entities = translateResultsintoEntities(document.sentences, pred_results)

            if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
                if opt.ensemble == 'learn':
                    ensemble_model.process_one_doc(document, entities, dictionary,
                                                   dictionary_reverse, isMeddra_dict)
                else:
                    pred_entities1 = copy.deepcopy(entities)
                    pred_entities2 = copy.deepcopy(entities)
                    pred_entities3 = copy.deepcopy(entities)
                    multi_sieve.runMultiPassSieve(document, pred_entities1, dictionary,
                                                  isMeddra_dict)
                    vsm_model.process_one_doc(document, pred_entities2, dictionary,
                                              dictionary_reverse, isMeddra_dict)
                    neural_model.process_one_doc(document, pred_entities3, dictionary,
                                                 dictionary_reverse, isMeddra_dict)
                    # merge pred_entities1, pred_entities2, pred_entities3 into entities
                    ensemble.merge_result(pred_entities1, pred_entities2, pred_entities3, entities,
                                          dictionary, isMeddra_dict, vsm_model.dict_alphabet, data)
            elif opt.norm_rule:
                multi_sieve.runMultiPassSieve(document, entities, dictionary, isMeddra_dict)
            elif opt.norm_vsm:
                vsm_model.process_one_doc(document, entities, dictionary, dictionary_reverse,
                                          isMeddra_dict)
            elif opt.norm_neural:
                neural_model.process_one_doc(document, entities, dictionary, dictionary_reverse,
                                             isMeddra_dict)

            dump_results(fileName, entities, opt)

            end = time.time()
            logging.info("process %s complete with %.2fs" % (fileName, end - start))

            ct_success += 1
        except Exception as e:
            logging.error("process file {} error: {}".format(fileName, e))
            ct_error += 1

    logging.info("test finished, total {}, error {}".format(ct_success + ct_error, ct_error))
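
# Note on the error handling in the loop above: logging.error(...) records only the exception
# message, not the traceback. If the full stack trace is wanted while debugging failed files,
# the standard-library alternative is logging.exception, e.g. (illustrative only, not a change
# to this repo's behavior):
#
#     except Exception:
#         logging.exception("process file %s error", fileName)
#         ct_error += 1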
    else:
        train_annotations_dict = None

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [f for f in listdir(annotation_dir)
                        if isfile(join(annotation_dir, f)) and f.find('.xml') != -1]

    logging.info("load dictionary ... ")
    dictionary = load_dictionary("dictionary.txt")
    dictionary_full = load_dictionary("dictionary_full.txt")

    multi_sieve.init(opt, None, d, UMLS_dict, UMLS_dict_reverse, False)

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            join(predict_dir, gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # copy entities from metamap entities