def write_discharge_summaries(MIMIC_3_DIR, output_dir):
    """Export every discharge summary in NOTEEVENTS.csv to its own text file.

    Reads ``<MIMIC_3_DIR>/NOTEEVENTS.csv``, skips the header row, and for
    each row whose column 6 equals ``"Discharge summary"`` writes column 10
    to ``<output_dir>/<col1>_<col2>.txt``. ``output_dir`` is handed to
    ``makedir_and_clear`` before anything is written.

    Args:
        MIMIC_3_DIR: Directory that contains ``NOTEEVENTS.csv``.
        output_dir: Directory receiving one ``.txt`` file per exported note.

    Returns:
        None.
    """
    notes_file = os.path.join(MIMIC_3_DIR, 'NOTEEVENTS.csv')

    makedir_and_clear(output_dir)

    logging.info("processing notes file")
    count_note = 0
    # newline='' is what the csv module requires so that line breaks embedded
    # in quoted fields (the note text) are handed to the parser untouched.
    with open(notes_file, 'r', newline='') as csvfile:
        notereader = csv.reader(csvfile)
        # Skip the header row; the default of None tolerates an empty file.
        next(notereader, None)

        for line in notereader:
            # Column 6 is the note category; only discharge summaries are kept.
            if line[6] != "Discharge summary":
                continue

            # Columns 1 and 2 form the file name (presumably SUBJECT_ID and
            # HADM_ID in the MIMIC-III layout -- TODO confirm).
            # NOTE(review): rows sharing both ids overwrite each other because
            # the file is opened with 'w'; count_note still counts each write.
            file_name = line[1] + '_' + line[2] + ".txt"
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, 'w') as outfile:
                logging.info("writing to %s", file_path)
                outfile.write(line[10])
            count_note += 1

    logging.info("totally write {} notes".format(count_note))
def apply_metamap_to(input_dir, output_dir,
                     metamap_bin='/Users/feili/tools/metamap/public_mm/bin/metamap'):
    """Run MetaMap on every entry of ``input_dir`` and write ``*.field.txt`` files.

    ``output_dir`` is first passed to ``makedir_and_clear``. For each name in
    ``input_dir`` the output name is the input name with everything from its
    last ``.`` onward replaced by ``.field.txt`` (or simply suffixed when the
    name contains no dot).

    MetaMap is invoked with an argument list instead of a shell string, so
    file names containing spaces or shell metacharacters are passed through
    verbatim and cannot alter the command.

    Args:
        input_dir: Directory whose entries are fed to MetaMap one by one.
        output_dir: Directory that receives the MetaMap output files.
        metamap_bin: Path of the MetaMap executable. Defaults to the location
            that was previously hard-coded.

    Returns:
        None. A failing or missing MetaMap binary is logged and the loop
        continues with the next file, as the previous ``os.system`` call did
        not abort either.
    """
    # Function-scope imports keep this block self-contained.
    import logging
    import subprocess

    makedir_and_clear(output_dir)

    # Same options as before, split into argv tokens.
    base_command = [
        metamap_bin,
        '-y', '-I', '-N',
        '--blanklines', '0',
        '-R', 'SNOMEDCT_US,MDR',
        '-J', 'acab,anab,comd,cgab,dsyn,emod,fndg,inpo,mobd,neop,patf,sosy',
    ]

    for input_file_name in listdir(input_dir):
        input_file_path = join(input_dir, input_file_name)

        # Strip from the last '.' onward; keep the whole name when there is none.
        dot_pos = input_file_name.rfind('.')
        stem = input_file_name if dot_pos == -1 else input_file_name[:dot_pos]
        output_file_path = join(output_dir, stem + ".field.txt")

        try:
            completed = subprocess.run(
                base_command + [input_file_path, output_file_path],
                check=False)
        except OSError as err:
            # e.g. the executable does not exist or is not executable.
            logging.error("could not run metamap on %s: %s", input_file_path, err)
            continue

        if completed.returncode != 0:
            logging.warning("metamap exited with code %d for %s",
                            completed.returncode, input_file_path)
def prepare_instance(opt):
    """Pre-generate training instances for every epoch from a merged corpus.

    The merged file holds one JSON object per line; a blank line ends the
    current document. All documents are loaded into a ``DocumentDatabase``,
    then for each of ``opt.iter`` epochs the function writes

    * ``epoch_<n>.json`` -- one JSON-encoded instance per line, produced by
      ``create_instances_from_document`` for every document, and
    * ``epoch_<n>_metrics.json`` -- ``num_training_examples`` and
      ``max_seq_len`` for that epoch

    into ``opt.instance_dir`` (which is passed to ``makedir_and_clear`` first).

    Args:
        opt: Options object. Reads ``bert_dir``, ``do_lower_case``,
            ``merged_file``, ``instance_dir``, ``iter``, ``max_seq_length``,
            ``short_seq_prob``, ``masked_lm_prob`` and
            ``max_predictions_per_seq``.

    Returns:
        None.
    """
    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir,
                                              do_lower_case=opt.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())

    with DocumentDatabase(reduce_memory=False) as docs:
        with open(opt.merged_file, 'r') as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    # Blank line: the current document is complete.
                    docs.add_document(doc)
                    doc = []
                else:
                    doc.append(json.loads(line))
            # A file that does not end with a blank line would otherwise lose
            # its final document.
            if doc:
                docs.add_document(doc)

        makedir_and_clear(opt.instance_dir)

        for epoch in trange(opt.iter, desc="Epoch"):
            epoch_filename = os.path.join(opt.instance_dir,
                                          f"epoch_{epoch}.json")
            num_instances = 0
            with open(epoch_filename, 'w') as epoch_file:
                for doc_idx in trange(len(docs), desc="Document"):
                    doc_instances = create_instances_from_document(
                        docs,
                        doc_idx,
                        max_seq_length=opt.max_seq_length,
                        short_seq_prob=opt.short_seq_prob,
                        masked_lm_prob=opt.masked_lm_prob,
                        max_predictions_per_seq=opt.max_predictions_per_seq,
                        vocab_list=vocab_list,
                        tokenizer=tokenizer)
                    for instance in doc_instances:
                        epoch_file.write(json.dumps(instance) + '\n')
                        num_instances += 1

            # Separate names for the path and the handle; the path used to be
            # rebound to the open file object.
            metrics_path = os.path.join(opt.instance_dir,
                                        f"epoch_{epoch}_metrics.json")
            metrics = {
                "num_training_examples": num_instances,
                "max_seq_len": opt.max_seq_length
            }
            with open(metrics_path, 'w') as metrics_out:
                metrics_out.write(json.dumps(metrics))
def apply_metamap_to(input_dir, output_dir, metamap_dir, metamap_process):
    """Run ``worker`` over the files of ``input_dir`` in parallel processes.

    The entries of ``input_dir`` are split into at most ``metamap_process``
    contiguous groups whose sizes differ by at most one. One
    ``multiprocessing.Process`` running ``worker(group, input_dir,
    output_dir, metamap_dir)`` is started per non-empty group, and the
    function blocks until all of them have finished.

    Args:
        input_dir: Directory whose entries are distributed to the workers.
        output_dir: Output directory; passed to ``makedir_and_clear`` first
            and then forwarded to every worker.
        metamap_dir: Forwarded unchanged to every worker.
        metamap_process: Maximum number of worker processes; must be >= 1.

    Raises:
        ValueError: If ``metamap_process`` is smaller than 1. The check runs
            before ``output_dir`` is cleared.
    """
    if metamap_process < 1:
        raise ValueError(
            "metamap_process must be >= 1, got {}".format(metamap_process))

    makedir_and_clear(output_dir)

    logging.info('read text file ......')
    input_file_names = list(os.listdir(input_dir))

    total_file_num = len(input_file_names)
    logging.info('totally {} files'.format(total_file_num))

    # The first `extra` groups take one additional file so no single worker
    # ends up with the whole remainder.
    base_size, extra = divmod(total_file_num, metamap_process)
    file_index = []
    begin = 0
    for group_idx in range(metamap_process):
        end = begin + base_size + (1 if group_idx < extra else 0)
        if end > begin:  # do not spawn a process that has nothing to do
            file_index.append(input_file_names[begin:end])
        begin = end

    processes = []
    for group in file_index:
        p = multiprocessing.Process(
            target=worker,
            args=(group, input_dir, output_dir, metamap_dir))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()
# Log the parsed options so the run configuration is captured in the log.
logging.info(opt)

# A seed of 0 leaves every random number generator unseeded.
if opt.random_seed != 0:
    random.seed(opt.random_seed)
    np.random.seed(opt.random_seed)
    torch.manual_seed(opt.random_seed)
    torch.cuda.manual_seed_all(opt.random_seed)

d = data.Data(opt)
logging.info(d.config)

makedir_and_clear(opt.output)

logging.info("load data ...")
# Train and dev are loaded with the same flags, in that order.
train_data, dev_data = (
    data.loadData(corpus_path, True, opt.types, opt.type_filter)
    for corpus_path in (opt.train_file, opt.dev_file)
)
# The test set is optional.
test_data = (
    data.loadData(opt.test_file, False, opt.types, opt.type_filter)
    if opt.test_file
    else None
)

logging.info("load dict ...")
UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])
logging.info("dict concept number {}".format(len(UMLS_dict)))

train(
    train_data,
    dev_data,
    test_data,
    d,
    UMLS_dict,
    UMLS_dict_reverse,
    opt,
    None,
    False,
)
# Script entry fragment: configure logging, seed the RNGs and, for the
# 'train' action, load the training / in-domain dev datasets.
# NOTE(review): the fragment ends right after `dev_outdomain_datasets = []`;
# whatever fills and consumes that list is not visible here.

# Root logger level follows opt.verbose: DEBUG when set, INFO otherwise.
logger = logging.getLogger()
if opt.verbose:
    logger.setLevel(logging.DEBUG)
else:
    logger.setLevel(logging.INFO)

logging.info(opt)

# A seed of 0 leaves every random number generator unseeded.
if opt.random_seed != 0:
    random.seed(opt.random_seed)
    np.random.seed(opt.random_seed)
    torch.manual_seed(opt.random_seed)
    torch.cuda.manual_seed_all(opt.random_seed)

if opt.whattodo == 'train':

    makedir_and_clear(opt.save)

    # opt.train is iterated as a collection of files; each one is loaded
    # with the 'train' tag.
    logging.info('loading training data...')
    train_datasets = []
    for train_jsonl_file in opt.train:
        train_dataset = load_data(train_jsonl_file, 'train')
        train_datasets.append(train_dataset)

    # Same for the in-domain dev files, loaded with the 'dev' tag.
    logging.info('loading dev indomain data...')
    dev_indomain_datasets = []
    for dev_jsonl_file in opt.dev_indomain:
        dev_dataset = load_data(dev_jsonl_file, 'dev')
        dev_indomain_datasets.append(dev_dataset)

    logging.info('loading dev outdomain data...')
    dev_outdomain_datasets = []
def test(data, opt):
    """Predict entities for every ``.xml`` file of a corpus and dump the results.

    For each file in ``opt.test_file`` whose name contains ``.xml`` the
    document is split into sections by ``processOneFile_fda``; each section is
    tagged with the sequence model loaded from ``<opt.output>/model.pkl``,
    entities inside the annotation's ignore regions are removed, and the
    remaining entities are normalized according to the ``opt.norm_*`` flags:

    * all three flags set -> ensemble (learned model when
      ``opt.ensemble == 'learn'``, otherwise rule + vsm + neural merged by
      ``ensemble.merge_result``),
    * otherwise the first of rule / vsm / neural that is set,
    * none set -> no normalization.

    Only entities with a non-empty ``norm_ids`` are passed to
    ``dump_results``. A failure while processing one file is logged with its
    traceback and counted; it does not stop the run.

    Args:
        data: Project data object (alphabets, ``test_texts``/``test_Ids``
            buffers that are overwritten per section).
        opt: Options object; reads ``test_file``, ``nlp_tool``, ``output``,
            ``test_in_cpu``, ``norm_rule``, ``norm_vsm``, ``norm_neural``,
            ``ensemble``, ``predict``, ``types``, ``type_filter``, ``nbest``.

    Raises:
        RuntimeError: If ``opt.nlp_tool`` is not ``"nltk"``.
    """
    corpus_dir = opt.test_file

    if opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [f for f in os.listdir(corpus_dir) if f.find('.xml') != -1]

    # map_location=None is torch.load's default, so one call covers both the
    # "force CPU" and the "load as saved" cases.
    map_location = 'cpu' if opt.test_in_cpu else None

    def load_from_output(file_name):
        """Load a serialized artifact stored under ``opt.output``."""
        # NOTE(security): torch.load unpickles the file; only point opt.output
        # at artifacts produced by a trusted training run.
        return torch.load(os.path.join(opt.output, file_name),
                          map_location=map_location)

    model = SeqModel(data, opt)
    model.load_state_dict(load_from_output('model.pkl'))

    meddra_dict = load_meddra_dict(data)

    use_ensemble = opt.norm_rule and opt.norm_vsm and opt.norm_neural

    # initialize norm models
    if use_ensemble:
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, meddra_dict, None, True)
        if opt.ensemble == 'learn':
            ensemble_model = load_from_output('ensemble.pkl')
            ensemble_model.eval()
        else:
            vsm_model = load_from_output('vsm.pkl')
            neural_model = load_from_output('norm_neural.pkl')
            vsm_model.eval()
            neural_model.eval()
    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, meddra_dict)
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        vsm_model = load_from_output('vsm.pkl')
        vsm_model.eval()
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        neural_model = load_from_output('norm_neural.pkl')
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")

    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, annotation_file = processOneFile_fda(
                fileName, corpus_dir, nlp_tool, False, opt.types,
                opt.type_filter, True, False)
            pred_entities = []

            for section in document:
                # The evaluation helpers read the current section from `data`.
                data.test_texts = []
                data.test_Ids = []
                read_instance_from_one_document(
                    section, data.word_alphabet, data.char_alphabet,
                    data.label_alphabet, data.test_texts, data.test_Ids, data)

                _, _, _, _, _, pred_results, _ = evaluate(
                    data, opt, model, 'test', False, opt.nbest)

                entities = translateResultsintoEntities(section.sentences,
                                                        pred_results)

                # remove the entity in the ignore_region and fill section_id
                section_id = section.name[section.name.rfind('_') + 1:]
                entities = remove_entity_in_the_ignore_region(
                    annotation_file.ignore_regions, entities, section_id)

                if use_ensemble:
                    if opt.ensemble == 'learn':
                        ensemble_model.process_one_doc(
                            section, entities, meddra_dict, None, True)
                    else:
                        # Each normalizer works on its own copy so the three
                        # predictions can be merged afterwards.
                        pred_entities1 = copy.deepcopy(entities)
                        pred_entities2 = copy.deepcopy(entities)
                        pred_entities3 = copy.deepcopy(entities)
                        multi_sieve.runMultiPassSieve(
                            section, pred_entities1, meddra_dict, True)
                        vsm_model.process_one_doc(
                            section, pred_entities2, meddra_dict, None, True)
                        neural_model.process_one_doc(
                            section, pred_entities3, meddra_dict, None, True)

                        # merge pred_entities1, pred_entities2, pred_entities3 into entities
                        ensemble.merge_result(
                            pred_entities1, pred_entities2, pred_entities3,
                            entities, meddra_dict, True,
                            vsm_model.dict_alphabet, data)
                elif opt.norm_rule:
                    multi_sieve.runMultiPassSieve(section, entities,
                                                  meddra_dict, True)
                elif opt.norm_vsm:
                    vsm_model.process_one_doc(section, entities, meddra_dict,
                                              None, True)
                elif opt.norm_neural:
                    neural_model.process_one_doc(section, entities,
                                                 meddra_dict, None, True)

                for entity in entities:
                    # if a mention can't be normed, not output it
                    if len(entity.norm_ids) != 0:
                        pred_entities.append(entity)

            dump_results(fileName, pred_entities, opt, annotation_file)

            end = time.time()
            logging.info("process %s complete with %.2fs" % (fileName, end - start))

            ct_success += 1
        except Exception as e:
            # Per-file boundary: keep going, but keep the traceback in the log.
            logging.exception("process file {} error: {}".format(fileName, e))
            ct_error += 1

    if opt.norm_rule:
        multi_sieve.finalize(True)

    logging.info("test finished, total {}, error {}".format(ct_success + ct_error, ct_error))
def main():
    """Train with k-fold cross-validation and log the macro-averaged P/R/F.

    Uses the module-level ``opt``. The documents returned by
    ``load_data_fda`` are cut into ``opt.cross_validation`` contiguous folds
    of ``len(documents) // fold_num`` documents each; the last fold also
    takes whatever remains. Every fold serves once as dev data while the
    rest is the training data passed to ``train``.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG if opt.verbose else logging.INFO)

    logging.info(opt)

    # A seed of 0 leaves every random number generator unseeded.
    seed = opt.random_seed
    if seed != 0:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    d = data.Data(opt)
    logging.info(d.config)

    makedir_and_clear(opt.output)

    documents = load_data_fda(opt.train_file, opt.types, opt.type_filter,
                              True, True)

    logging.info("use {} fold cross validataion".format(opt.cross_validation))

    fold_num = opt.cross_validation
    total_doc_num = len(documents)
    dev_doc_num = total_doc_num // fold_num

    macro_p, macro_r, macro_f = 0.0, 0.0, 0.0

    meddra_dict = load_meddra_dict(d)

    last_fold_idx = fold_num - 1
    for fold_idx in range(fold_num):
        fold_start = fold_idx * dev_doc_num
        if fold_idx == last_fold_idx:
            # The final fold absorbs the documents left over by the division.
            fold_end = total_doc_num
        else:
            fold_end = min(fold_start + dev_doc_num, total_doc_num)

        # Everything outside [fold_start, fold_end) is training data.
        train_data = [*documents[:fold_start], *documents[fold_end:]]
        dev_data = documents[fold_start:fold_end]

        logging.info("begin fold {}".format(fold_idx))
        logging.info("doc start {}, doc end {}".format(fold_start, fold_end))

        p, r, f = train(train_data, dev_data, None, d, meddra_dict, None,
                        opt, fold_idx, True)

        macro_p, macro_r, macro_f = macro_p + p, macro_r + r, macro_f + f

    logging.info("the macro averaged p r f are %.4f, %.4f, %.4f" %
                 (macro_p / fold_num, macro_r / fold_num, macro_f / fold_num))
def pretrain(opt):
    """Pre-train ``BertForPreTraining`` on pregenerated epoch files.

    Expects ``opt.instance_dir`` to contain ``epoch_<i>.json`` and
    ``epoch_<i>_metrics.json`` files. Trains for ``opt.iter`` epochs and
    saves the model's ``state_dict`` to
    ``<opt.save>/pytorch_model_<epoch+1>.bin`` after every epoch.

    Side effects visible in this function:
        * ``opt.batch_size`` is overwritten with
          ``opt.batch_size // opt.gradient_accumulation_steps``.
        * ``opt.save`` is passed to ``makedir_and_clear``.
        * The process exits via ``exit(...)`` when the files for epoch 0 are
          missing.

    Args:
        opt: Options object; reads ``instance_dir``, ``iter``, ``gpu``,
            ``multi_gpu``, ``gradient_accumulation_steps``, ``batch_size``,
            ``save``, ``bert_dir``, ``do_lower_case``, ``norm_dict``, ``lr``
            and ``warmup_proportion``.

    Raises:
        ValueError: If ``opt.gradient_accumulation_steps`` is smaller than 1.
    """

    # Discover how many epochs of pregenerated data exist and how many
    # examples each one holds (taken from its metrics file).
    samples_per_epoch = []
    pregenerated_data = Path(opt.instance_dir)
    for i in range(opt.iter):
        epoch_file = pregenerated_data / f"epoch_{i}.json"
        metrics_file = pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({opt.iter})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        # Loop finished without `break`: data exists for every epoch.
        num_data_epochs = opt.iter

    # Device selection: CUDA only when opt.gpu >= 0 and CUDA is available.
    if opt.gpu >= 0 and torch.cuda.is_available():
        if opt.multi_gpu:
            device = torch.device("cuda")
            n_gpu = torch.cuda.device_count()
        else:
            device = torch.device('cuda', opt.gpu)
            n_gpu = 1
    else:
        device = torch.device("cpu")
        n_gpu = 0

    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    if opt.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(opt.gradient_accumulation_steps))

    # From here on opt.batch_size is the per-forward-pass batch size.
    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    makedir_and_clear(opt.save)

    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir,
                                              do_lower_case=opt.do_lower_case)

    total_train_examples = 0
    for i in range(opt.iter):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    # Used below as BertAdam's t_total.
    num_train_optimization_steps = int(total_train_examples / opt.batch_size /
                                       opt.gradient_accumulation_steps)

    logging.info("load dict ...")
    # UMLS_dict_reverse is not used again in this function.
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(opt.norm_dict)
    logging.info("dict concept number {}".format(len(UMLS_dict)))
    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, UMLS_dict)
    dict_alphabet.close()

    # Prepare model
    model, _ = BertForPreTraining.from_pretrained(
        opt.bert_dir, num_norm_labels=get_dict_size(dict_alphabet))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    # Parameters whose name contains one of the `no_decay` substrings get
    # weight_decay 0.0; all others get 0.01.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=opt.lr,
                         warmup=opt.warmup_proportion,
                         t_total=num_train_optimization_steps)

    # Incremented on every optimizer step; not read elsewhere in this function.
    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f" Num examples = {total_train_examples}")
    logging.info(" Batch size = %d", opt.batch_size)
    logging.info(" Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(opt.iter):
        epoch_dataset = PregeneratedDataset(epoch=epoch,
                                            training_path=pregenerated_data,
                                            tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs,
                                            dict_alphabet=dict_alphabet)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=opt.batch_size)
        tr_loss = 0
        # nb_tr_examples is accumulated but not read again in this function.
        nb_tr_examples, nb_tr_steps = 0, 0

        epoch_start = time.time()
        sum_loss = 0
        sum_orginal_loss = 0
        # NOTE(review): used as a divisor in the epoch log below; an empty
        # dataloader would make that division fail -- confirm it cannot occur.
        num_iter = len(train_dataloader)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next, input_ids_ent, input_mask_ent, norm_label_ids = batch
                loss, original_loss = model(input_ids, segment_ids, input_mask,
                                            lm_label_ids, input_ids_ent,
                                            input_mask_ent, is_next,
                                            norm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    original_loss = original_loss.mean()
                if opt.gradient_accumulation_steps > 1:
                    # Scale so that the accumulated gradient matches one
                    # full-size batch.
                    loss = loss / opt.gradient_accumulation_steps
                    original_loss = original_loss / opt.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                # Multiply back by the accumulation factor so the progress bar
                # shows the unscaled running mean.
                mean_loss = tr_loss * opt.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                # Step the optimizer only once every
                # `gradient_accumulation_steps` batches.
                if (step + 1) % opt.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                # NOTE(review): these sums use the losses after the division
                # above, so the logged epoch averages are scaled by
                # 1 / gradient_accumulation_steps (unlike `mean_loss`).
                sum_loss += loss.item()
                sum_orginal_loss += original_loss.item()

        epoch_finish = time.time()
        logging.info(
            "epoch: %s training finished. Time: %.2fs. loss: %.4f, original_loss %.4f"
            % (epoch, epoch_finish - epoch_start, sum_loss / num_iter,
               sum_orginal_loss / num_iter))

        # Save a trained model
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(
            opt.save, "pytorch_model_{}.bin".format(str(epoch + 1)))
        torch.save(model_to_save.state_dict(), str(output_model_file))
def test(data, opt):
    """Predict and normalize entities for every file in ``opt.test_file``.

    Each regular file of the corpus directory is parsed by ``processOneFile``,
    tagged with the sequence model loaded from ``<opt.output>/model.pkl``,
    normalized against the dictionary loaded from
    ``data.config['norm_dict']`` according to the ``opt.norm_*`` flags, and
    written out through ``dump_results``:

    * all three flags set -> ensemble (learned model when
      ``opt.ensemble == 'learn'``, otherwise rule + vsm + neural merged by
      ``ensemble.merge_result``),
    * otherwise the first of rule / vsm / neural that is set,
    * none set -> no normalization.

    A failure while processing one file is logged with its traceback and
    counted; it does not stop the run.

    Args:
        data: Project data object (alphabets, ``config``, and the
            ``test_texts``/``test_Ids`` buffers that are overwritten per file).
        opt: Options object; reads ``test_file``, ``nlp_tool``, ``output``,
            ``test_in_cpu``, ``norm_rule``, ``norm_vsm``, ``norm_neural``,
            ``ensemble``, ``predict``, ``types``, ``type_filter``, ``nbest``.

    Raises:
        RuntimeError: If ``opt.nlp_tool`` is not one of ``"spacy"``,
            ``"nltk"`` or ``"stanford"``.
    """
    corpus_dir = opt.test_file

    if opt.nlp_tool == "spacy":
        nlp_tool = spacy.load('en')
    elif opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    elif opt.nlp_tool == "stanford":
        nlp_tool = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [
        f for f in listdir(corpus_dir) if isfile(join(corpus_dir, f))
    ]

    # map_location=None is torch.load's default, so one call covers both the
    # "force CPU" and the "load as saved" cases.
    map_location = 'cpu' if opt.test_in_cpu else None

    def load_from_output(file_name):
        """Load a serialized artifact stored under ``opt.output``."""
        # NOTE(security): torch.load unpickles the file; only point opt.output
        # at artifacts produced by a trusted training run.
        return torch.load(os.path.join(opt.output, file_name),
                          map_location=map_location)

    model = SeqModel(data, opt)
    model.load_state_dict(load_from_output('model.pkl'))

    dictionary, dictionary_reverse = umls.load_umls_MRCONSO(
        data.config['norm_dict'])
    isMeddra_dict = False

    use_ensemble = opt.norm_rule and opt.norm_vsm and opt.norm_neural

    # initialize norm models
    if use_ensemble:
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse,
                         False)
        if opt.ensemble == 'learn':
            ensemble_model = load_from_output('ensemble.pkl')
            ensemble_model.eval()
        else:
            vsm_model = load_from_output('vsm.pkl')
            neural_model = load_from_output('norm_neural.pkl')
            vsm_model.eval()
            neural_model.eval()
    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse,
                         False)
    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        vsm_model = load_from_output('vsm.pkl')
        vsm_model.eval()
    elif opt.norm_neural:
        logging.info("use neural-based normer")
        neural_model = load_from_output('norm_neural.pkl')
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")

    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, _, _, _ = processOneFile(fileName, None, corpus_dir,
                                               nlp_tool, False, opt.types,
                                               opt.type_filter)

            # The evaluation helpers read the current document from `data`.
            data.test_texts = []
            data.test_Ids = []
            read_instance_from_one_document(document, data.word_alphabet,
                                            data.char_alphabet,
                                            data.label_alphabet,
                                            data.test_texts, data.test_Ids,
                                            data)

            _, _, _, _, _, pred_results, _ = evaluate(data, opt, model, 'test',
                                                      False, opt.nbest)

            entities = translateResultsintoEntities(document.sentences,
                                                    pred_results)

            if use_ensemble:
                if opt.ensemble == 'learn':
                    ensemble_model.process_one_doc(document, entities,
                                                   dictionary,
                                                   dictionary_reverse,
                                                   isMeddra_dict)
                else:
                    # Each normalizer works on its own copy so the three
                    # predictions can be merged afterwards.
                    pred_entities1 = copy.deepcopy(entities)
                    pred_entities2 = copy.deepcopy(entities)
                    pred_entities3 = copy.deepcopy(entities)
                    multi_sieve.runMultiPassSieve(document, pred_entities1,
                                                  dictionary, isMeddra_dict)
                    vsm_model.process_one_doc(document, pred_entities2,
                                              dictionary, dictionary_reverse,
                                              isMeddra_dict)
                    neural_model.process_one_doc(document, pred_entities3,
                                                 dictionary,
                                                 dictionary_reverse,
                                                 isMeddra_dict)

                    # merge pred_entities1, pred_entities2, pred_entities3 into entities
                    ensemble.merge_result(pred_entities1, pred_entities2,
                                          pred_entities3, entities, dictionary,
                                          isMeddra_dict,
                                          vsm_model.dict_alphabet, data)
            elif opt.norm_rule:
                multi_sieve.runMultiPassSieve(document, entities, dictionary,
                                              isMeddra_dict)
            elif opt.norm_vsm:
                vsm_model.process_one_doc(document, entities, dictionary,
                                          dictionary_reverse, isMeddra_dict)
            elif opt.norm_neural:
                neural_model.process_one_doc(document, entities, dictionary,
                                             dictionary_reverse,
                                             isMeddra_dict)

            dump_results(fileName, entities, opt)

            end = time.time()
            logging.info("process %s complete with %.2fs" %
                         (fileName, end - start))

            ct_success += 1
        except Exception as e:
            # Per-file boundary: keep going, but keep the traceback in the log.
            logging.exception("process file {} error: {}".format(fileName, e))
            ct_error += 1

    logging.info("test finished, total {}, error {}".format(
        ct_success + ct_error, ct_error))
# Script entry fragment: log the options, seed the RNGs and, for the 'train'
# action, load the config, the datasets and the UMLS dictionary.
# NOTE(review): the fragment ends right after the dictionary is loaded; the
# code that consumes these variables is not visible here.
logging.info(opt)

# A seed of 0 leaves every random number generator unseeded.
if opt.random_seed != 0:
    random.seed(opt.random_seed)
    np.random.seed(opt.random_seed)
    torch.manual_seed(opt.random_seed)
    torch.cuda.manual_seed_all(opt.random_seed)

if opt.whattodo == 'train':

    config = data.read_config(opt.config)

    logging.info(config)

    makedir_and_clear(opt.save)

    logging.info("load data ...")
    # Train and dev are loaded with the second argument True, test with False.
    train_data = data.loadData(opt.train_file, True, opt.types,
                               opt.type_filter)
    dev_data = data.loadData(opt.dev_file, True, opt.types, opt.type_filter)
    # The test set is optional.
    if opt.test_file:
        test_data = data.loadData(opt.test_file, False, opt.types,
                                  opt.type_filter)
    else:
        test_data = None

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(
        config['norm_dict'])