def metamap_ner_re(d):
    print("load umls ...")
    UMLS_dict, _ = umls.load_umls_MRCONSO(d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))]

    ct_ner_predict = 0
    ct_ner_gold = 0
    ct_ner_correct = 0
    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    for gold_file_name in annotation_files:
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            join(predict_dir, gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # NER: count span-level matches between MetaMap predictions and gold entities
        ct_ner_gold += len(gold_document.entities)
        ct_ner_predict += len(predict_document.entities)

        for predict_entity in predict_document.entities:
            for gold_entity in gold_document.entities:
                if predict_entity.equals_span(gold_entity):
                    ct_ner_correct += 1
                    break

        # Normalization: compare predicted concept IDs against gold via the UMLS dictionary
        p1, p2, p3 = evaluate_for_ehr(gold_document.entities, predict_document.entities, UMLS_dict)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    p = ct_ner_correct * 1.0 / ct_ner_predict
    r = ct_ner_correct * 1.0 / ct_ner_gold
    f1 = 2.0 * p * r / (p + r)
    print("NER p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))

    p = ct_norm_correct * 1.0 / ct_norm_predict
    r = ct_norm_correct * 1.0 / ct_norm_gold
    f1 = 2.0 * p * r / (p + r)
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
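# --- Illustrative sketch (not part of the original script) ---
# metamap_ner_re() computes micro-averaged precision/recall/F1 inline and assumes
# the denominators are non-zero. The hypothetical helper below shows the same
# arithmetic with guards against empty prediction or gold sets; the function
# name and guards are assumptions, not code used elsewhere in this repo.
def prf1(ct_correct, ct_predict, ct_gold):
    # precision: fraction of predicted entities that match a gold span
    p = ct_correct * 1.0 / ct_predict if ct_predict > 0 else 0.0
    # recall: fraction of gold entities recovered by the predictions
    r = ct_correct * 1.0 / ct_gold if ct_gold > 0 else 0.0
    # F1: harmonic mean of precision and recall
    f1 = 2.0 * p * r / (p + r) if p + r > 0 else 0.0
    return p, r, f1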
def metamap_ner_my_norm(d):
    print("load umls ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [f for f in os.listdir(annotation_dir) if os.path.isfile(os.path.join(annotation_dir, f))]

    if opt.test_in_cpu:
        model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
    else:
        model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
    model.eval()

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    correct_counter = Counter()
    wrong_counter = Counter()

    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            os.path.join(predict_dir, gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # copy entities from metamap entities
        pred_entities = []
        for gold in predict_document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        model.process_one_doc(gold_document, pred_entities, UMLS_dict, UMLS_dict_reverse)

        p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities, UMLS_dict,
                                      predict_document.entities, correct_counter, wrong_counter)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    sorted_correct_entities = OrderedDict(correct_counter.most_common())
    sorted_correct_entities = json.dumps(sorted_correct_entities, indent=4)
    with codecs.open("sorted_correct_entities.txt", 'w', 'UTF-8') as fp:
        fp.write(sorted_correct_entities)

    sorted_wrong_entities = OrderedDict(wrong_counter.most_common())
    sorted_wrong_entities = json.dumps(sorted_wrong_entities, indent=4)
    with codecs.open("sorted_wrong_entities.txt", 'w', 'UTF-8') as fp:
        fp.write(sorted_wrong_entities)

    p = ct_norm_correct * 1.0 / ct_norm_predict
    r = ct_norm_correct * 1.0 / ct_norm_gold
    f1 = 2.0 * p * r / (p + r)
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
    logging.info(d.config)

    makedir_and_clear(opt.output)

    logging.info("load data ...")
    train_data = data.loadData(opt.train_file, True, opt.types, opt.type_filter)
    dev_data = data.loadData(opt.dev_file, True, opt.types, opt.type_filter)
    if opt.test_file:
        test_data = data.loadData(opt.test_file, False, opt.types, opt.type_filter)
    else:
        test_data = None

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])
    logging.info("dict concept number {}".format(len(UMLS_dict)))

    train(train_data, dev_data, test_data, d, UMLS_dict, UMLS_dict_reverse, opt, None, False)

elif opt.whattodo == 2:
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    from collections import OrderedDict
    import json
def metamap_ner_my_norm(d):
    print("load umls ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])

    predict_dir = "/Users/feili/Desktop/umass/CancerADE_SnoM_30Oct2017_test/metamap"
    annotation_dir = os.path.join(opt.test_file, 'bioc')
    corpus_dir = os.path.join(opt.test_file, 'txt')
    annotation_files = [f for f in listdir(annotation_dir) if isfile(join(annotation_dir, f))]

    if opt.norm_rule:
        multi_sieve.init(opt, None, d, UMLS_dict, UMLS_dict_reverse, False)
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
        else:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    ct_norm_predict = 0
    ct_norm_gold = 0
    ct_norm_correct = 0

    for gold_file_name in annotation_files:
        print("# begin {}".format(gold_file_name))
        gold_document = parse_one_gold_file(annotation_dir, corpus_dir, gold_file_name)

        predict_document = metamap.load_metamap_result_from_file(
            join(predict_dir, gold_file_name[:gold_file_name.find('.')] + ".field.txt"))

        # copy entities from metamap entities
        pred_entities = []
        for gold in predict_document.entities:
            pred = Entity()
            pred.id = gold.id
            pred.type = gold.type
            pred.spans = gold.spans
            pred.section = gold.section
            pred.name = gold.name
            pred_entities.append(pred)

        if opt.norm_rule:
            multi_sieve.runMultiPassSieve(gold_document, pred_entities, UMLS_dict, False)
        elif opt.norm_neural:
            neural_model.process_one_doc(gold_document, pred_entities, UMLS_dict, UMLS_dict_reverse, False)
        elif opt.norm_vsm:
            vsm_model.process_one_doc(gold_document, pred_entities, UMLS_dict, UMLS_dict_reverse, False)
        else:
            raise RuntimeError("wrong configuration")

        p1, p2, p3 = evaluate_for_ehr(gold_document.entities, pred_entities, UMLS_dict)

        ct_norm_gold += p1
        ct_norm_predict += p2
        ct_norm_correct += p3

    p = ct_norm_correct * 1.0 / ct_norm_predict
    r = ct_norm_correct * 1.0 / ct_norm_gold
    f1 = 2.0 * p * r / (p + r)
    print("NORM p: %.4f | r: %.4f | f1: %.4f" % (p, r, f1))
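# --- Illustrative sketch (not part of the original script) ---
# Both metamap_ner_my_norm variants above rebuild Entity objects from the MetaMap
# output before running a normalizer, presumably so the model-assigned concept IDs
# do not overwrite MetaMap's own predictions. A hypothetical helper capturing that
# copy pattern (the function name is an assumption):
def copy_metamap_entity(src):
    pred = Entity()
    pred.id = src.id
    pred.type = src.type
    pred.spans = src.spans        # note: the spans list itself is shared, not copied
    pred.section = src.section
    pred.name = src.name
    return pred

# pred_entities = [copy_metamap_entity(e) for e in predict_document.entities]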
def pretrain(opt):
    samples_per_epoch = []
    pregenerated_data = Path(opt.instance_dir)
    for i in range(opt.iter):
        epoch_file = pregenerated_data / f"epoch_{i}.json"
        metrics_file = pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({opt.iter}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = opt.iter

    if opt.gpu >= 0 and torch.cuda.is_available():
        if opt.multi_gpu:
            device = torch.device("cuda")
            n_gpu = torch.cuda.device_count()
        else:
            device = torch.device('cuda', opt.gpu)
            n_gpu = 1
    else:
        device = torch.device("cpu")
        n_gpu = 0

    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    if opt.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            opt.gradient_accumulation_steps))

    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    makedir_and_clear(opt.save)

    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir, do_lower_case=opt.do_lower_case)

    total_train_examples = 0
    for i in range(opt.iter):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples / opt.batch_size / opt.gradient_accumulation_steps)

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(opt.norm_dict)
    logging.info("dict concept number {}".format(len(UMLS_dict)))
    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, UMLS_dict)
    dict_alphabet.close()

    # Prepare model
    model, _ = BertForPreTraining.from_pretrained(opt.bert_dir, num_norm_labels=get_dict_size(dict_alphabet))
    model.to(device)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=opt.lr,
                         warmup=opt.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", opt.batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(opt.iter):
        epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=pregenerated_data, tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs, dict_alphabet=dict_alphabet)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=opt.batch_size)

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        epoch_start = time.time()
        sum_loss = 0
        sum_orginal_loss = 0
        num_iter = len(train_dataloader)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next, \
                    input_ids_ent, input_mask_ent, norm_label_ids = batch

                loss, original_loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                                            input_ids_ent, input_mask_ent, is_next, norm_label_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    original_loss = original_loss.mean()

                if opt.gradient_accumulation_steps > 1:
                    loss = loss / opt.gradient_accumulation_steps
                    original_loss = original_loss / opt.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * opt.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")

                if (step + 1) % opt.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                sum_loss += loss.item()
                sum_orginal_loss += original_loss.item()

        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs. loss: %.4f, original_loss %.4f" % (
            epoch, epoch_finish - epoch_start, sum_loss / num_iter, sum_orginal_loss / num_iter))

        # Save a trained model
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(opt.save, "pytorch_model_{}.bin".format(str(epoch + 1)))
        torch.save(model_to_save.state_dict(), str(output_model_file))
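# --- Illustrative sketch (not part of the original script) ---
# Minimal gradient-accumulation loop, mirroring the pattern used in pretrain()
# above: each mini-batch loss is scaled by 1/accumulation_steps, gradients are
# summed across several backward() calls, and the optimizer steps once per group.
# The toy model, data, and hyperparameters below are stand-ins (assumptions).
import torch

def accumulation_demo(accumulation_steps=4, total_batches=8):
    model = torch.nn.Linear(10, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    for step in range(total_batches):
        x, y = torch.randn(16, 10), torch.randn(16, 1)
        loss = torch.nn.functional.mse_loss(model(x), y)
        # scale so the accumulated gradient matches one large batch
        (loss / accumulation_steps).backward()
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()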
def test(data, opt):
    # corpus_dir = join(opt.test_file, 'corpus')
    # corpus_dir = join(opt.test_file, 'txt')
    corpus_dir = opt.test_file

    if opt.nlp_tool == "spacy":
        nlp_tool = spacy.load('en')
    elif opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    elif opt.nlp_tool == "stanford":
        nlp_tool = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [f for f in listdir(corpus_dir) if isfile(join(corpus_dir, f))]

    model = SeqModel(data, opt)
    if opt.test_in_cpu:
        model.load_state_dict(torch.load(os.path.join(opt.output, 'model.pkl'), map_location='cpu'))
    else:
        model.load_state_dict(torch.load(os.path.join(opt.output, 'model.pkl')))

    dictionary, dictionary_reverse = umls.load_umls_MRCONSO(data.config['norm_dict'])
    isMeddra_dict = False

    # initialize norm models
    if opt.norm_rule and opt.norm_vsm and opt.norm_neural:  # ensemble
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse, False)
        if opt.ensemble == 'learn':
            if opt.test_in_cpu:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'), map_location='cpu')
            else:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'))
            ensemble_model.eval()
        else:
            if opt.test_in_cpu:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
            else:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
            vsm_model.eval()
            neural_model.eval()
    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse, False)
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
        else:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")

    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, _, _, _ = processOneFile(fileName, None, corpus_dir, nlp_tool, False, opt.types, opt.type_filter)

            data.test_texts = []
            data.test_Ids = []
            read_instance_from_one_document(document, data.word_alphabet, data.char_alphabet, data.label_alphabet,
                                            data.test_texts, data.test_Ids, data)

            _, _, _, _, _, pred_results, _ = evaluate(data, opt, model, 'test', False, opt.nbest)

            entities = translateResultsintoEntities(document.sentences, pred_results)

            if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
                if opt.ensemble == 'learn':
                    ensemble_model.process_one_doc(document, entities, dictionary, dictionary_reverse, isMeddra_dict)
                else:
                    pred_entities1 = copy.deepcopy(entities)
                    pred_entities2 = copy.deepcopy(entities)
                    pred_entities3 = copy.deepcopy(entities)
                    multi_sieve.runMultiPassSieve(document, pred_entities1, dictionary, isMeddra_dict)
                    vsm_model.process_one_doc(document, pred_entities2, dictionary, dictionary_reverse, isMeddra_dict)
                    neural_model.process_one_doc(document, pred_entities3, dictionary, dictionary_reverse, isMeddra_dict)
                    # merge pred_entities1, pred_entities2, pred_entities3 into entities
                    ensemble.merge_result(pred_entities1, pred_entities2, pred_entities3, entities, dictionary,
                                          isMeddra_dict, vsm_model.dict_alphabet, data)
            elif opt.norm_rule:
                multi_sieve.runMultiPassSieve(document, entities, dictionary, isMeddra_dict)
            elif opt.norm_vsm:
                vsm_model.process_one_doc(document, entities, dictionary, dictionary_reverse, isMeddra_dict)
            elif opt.norm_neural:
                neural_model.process_one_doc(document, entities, dictionary, dictionary_reverse, isMeddra_dict)

            dump_results(fileName, entities, opt)

            end = time.time()
            logging.info("process %s complete with %.2fs" % (fileName, end - start))

            ct_success += 1
        except Exception as e:
            logging.error("process file {} error: {}".format(fileName, e))
            ct_error += 1

    logging.info("test finished, total {}, error {}".format(ct_success + ct_error, ct_error))
def make_dictionary(d):
    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])
    logging.info("dict concept number {}".format(len(UMLS_dict)))

    fp = codecs.open("dictionary.txt", 'w', 'UTF-8')
    fp1 = codecs.open("dictionary_full.txt", 'w', 'UTF-8')

    for cui, concept in UMLS_dict.items():
        new_names = set()
        new_names_full = set()
        write_str = cui + '|'
        write_str1 = cui + '|'

        for i, id in enumerate(concept.codes):
            if i == len(concept.codes) - 1:
                write_str += id + '|'
                write_str1 += id + '|'
            else:
                write_str += id + ','
                write_str1 += id + ','

        for name in concept.names:
            # replace qualifiers such as '(finding)' or 'NOS' with whitespace
            name = dict_refine(name)
            # given a name, output its token set
            new_name, new_name_full = preprocess(name, True, False)
            if len(new_name) == 0 or len(new_name_full) == 0:
                raise RuntimeError("empty after preprocess: {}".format(name))
            # merge all synonyms
            new_names = new_names | new_name
            new_names_full = new_names_full | new_name_full

        for i, name in enumerate(new_names):
            if i == len(new_names) - 1:
                write_str += name
            else:
                write_str += name + ','

        for i, name in enumerate(new_names_full):
            if i == len(new_names_full) - 1:
                write_str1 += name
            else:
                write_str1 += name + ','

        fp.write(write_str + "\n")
        fp1.write(write_str1 + "\n")

    fp.close()
    fp1.close()

    fp = codecs.open("dictionary_reverse.txt", 'w', 'UTF-8')

    for code, cui_list in UMLS_dict_reverse.items():
        write_str = code + '|'
        for i, cui in enumerate(cui_list):
            if i == len(cui_list) - 1:
                write_str += cui
            else:
                write_str += cui + ','
        fp.write(write_str + "\n")
    fp.close()
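# --- Illustrative sketch (not part of the original script) ---
# make_dictionary() writes one line per concept of the form
#   CUI|code1,code2,...|token1,token2,...
# to dictionary.txt / dictionary_full.txt, and dictionary_reverse.txt maps
#   code|cui1,cui2,...
# The reader below is a hypothetical parser for that line format; the sample
# values in the usage note are placeholders, not real UMLS entries.
def parse_dictionary_line(line):
    cui, codes, names = line.rstrip("\n").split('|')
    return cui, codes.split(','), names.split(',')

# e.g. parse_dictionary_line("C0000001|12345,67890|headache,head ache")
#      -> ('C0000001', ['12345', '67890'], ['headache', 'head ache'])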