corpus_train = sample.NewDataSet('training corpus')
corpus_train.objects = load.load(config['corpus']['training_file'], 'NCBI')
corpus_dev = sample.NewDataSet('dev corpus')
corpus_dev.objects = load.load(config['corpus']['development_file'], 'NCBI')

for corpus in [corpus_train, corpus_dev]:
    mention_ids = []    # list of all ids (gold standard for each mention)
    mention_names = []  # list of all names
    mention_all = []    # list of tuples (mention_text, gold, context, (start, end, docid))

    # something wrong here that sometimes throws an error
    #import pdb; pdb.set_trace()
    for abstract in corpus.objects:
        for section in abstract.sections:  # title and abstract
            for mention in section.mentions:
                nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                mention_ids.append(nor_ids)  # append list of ids, usually of length 1
                mention_names.append(mention.text)
                mention_all.append((mention.text, nor_ids, section.text, (mention.start, mention.end, abstract.docid)))

    # tokenization & vectorization of mentions
    import nltk
    mention_tokenize = [nltk.word_tokenize(name) for name in mention_names]
    mention_vectorize = np.array([[vocabulary.get(text.lower(), 1) for text in mention] for mention in mention_tokenize])
    if config.getint('embedding', 'elmo'):
        mention_elmo = elmo_default([mention_names])

    corpus.ids = mention_ids
    corpus.names = mention_names
    corpus.all = mention_all
    corpus.tokenize = mention_tokenize
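
# Illustrative sketch (toy values, not part of the pipeline above): the
# vectorization step maps each token to its vocabulary index, falling back
# to index 1 for out-of-vocabulary tokens. With a hypothetical vocabulary
# {'fused': 7, 'teeth': 3}:
#   tokens = nltk.word_tokenize('Fused teeth')       # -> ['Fused', 'teeth']
#   [vocabulary.get(t.lower(), 1) for t in tokens]   # -> [7, 3]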

def emb_baseline(emb_path):
    #vector_model, vocabulary, inversed_vocabulary = prepare_embedding_vocab('/home/lenz/disease-normalization/data/embeddings/wvec_200_win-30_chiu-et-al.bin')
    vector_model, vocabulary, inversed_vocabulary = prepare_embedding_vocab(emb_path, binary=True)
    pretrained = load_pretrained_word_embeddings(vocabulary, vector_model)

    # MEDIC dictionary of entries, key = canonical id, value = named tuple of the form
    # MEDIC_ENTRY(DiseaseID='MESH:D005671', DiseaseName='Fused Teeth',
    #             AllDiseaseIDs=('MESH:D005671',), AllNames=('Fused Teeth', 'Teeth, Fused'))
    dictionary = load.Terminology()
    dictionary.loaded = load.load(config['terminology']['dict_file'], 'MEDIC')

    import vectorizer
    dictionary.no_cangen_tokenized = vectorizer.MEDIC_dict_tokenizer_no_cangen(dictionary.loaded, config['methods']['tokenizer'])
    dictionary.no_cangen_vectorized = vectorizer.MEDIC_dict_vectorizer_no_cangen(dictionary.no_cangen_tokenized, vocabulary)

    # concepts
    concept_ids = []      # list of all concept ids
    concept_all_ids = []  # list of (lists of all concept ids, including alternative IDs)
    concept_names = []    # list of all names, same length as concept_ids
    concept_map = {}      # names as keys, lists of concept ids as values

    for k in dictionary.loaded.keys():  # keys should be in congruent order
        c_id = dictionary.loaded[k].DiseaseID
        a_ids = dictionary.loaded[k].AllDiseaseIDs
        if int(config['settings']['all_names']):
            names = dictionary.loaded[k].AllNames
        else:
            # DiseaseName is a single string; wrap it in a list so we do not
            # iterate over its characters
            names = [dictionary.loaded[k].DiseaseName]
        for n in names:
            concept_ids.append(c_id)
            concept_all_ids.append(a_ids)
            concept_names.append(n)
            if n in concept_map:  # one name corresponds to multiple concepts
                concept_map[n].append(c_id)
                # logger.warning('{0} already in the dictionary with id {1}'.format(n, concept_map[n]))
            else:
                concept_map[n] = [c_id]

    # save the stuff to an object
    concept = sample.NewDataSet('concepts')
    concept.ids = concept_ids
    concept.all_ids = concept_all_ids
    concept.names = concept_names
    concept.map = concept_map
    #concept_vectorize = np.array([dictionary.no_cangen_vectorized[k] for k in concept.ids])

    # corpus
    #corpus_train = sample.NewDataSet('training corpus')
    #corpus_train.objects = load.load(config['corpus']['training_file'],'NCBI')
    corpus_dev = sample.NewDataSet('dev corpus')
    corpus_dev.objects = load.load(config['corpus']['development_file'], 'NCBI')
    #corpus_test = sample.NewDataSet('test corpus')
    #corpus_test.objects = load.load('/home/lhchan/disease_normalization/data/NCBItestset_corpus.txt','NCBI')
    #corpus_dev = corpus_test

    for corpus in [corpus_dev]:
        mention_ids = []    # list of all ids (gold standard for each mention)
        mention_names = []  # list of all names
        mention_all = []    # list of tuples (mention_text, gold, context, (start, end, docid))

        # something wrong here that sometimes throws an error
        #import pdb; pdb.set_trace()
        for abstract in corpus.objects:
            for section in abstract.sections:  # title and abstract
                for mention in section.mentions:
                    nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                    mention_ids.append(nor_ids)  # append list of ids, usually of length 1
                    mention_names.append(mention.text)
                    mention_all.append((mention.text, nor_ids, section.text, (mention.start, mention.end, abstract.docid)))
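
        # Illustrative note (values here are hypothetical): each mention_all
        # entry keeps enough context to trace a prediction back to its source,
        # e.g. something like
        #   ('fused teeth', ['MESH:D005671'], '<section text>', (0, 11, '1234567'))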

        # tokenization & vectorization of mentions
        #mention_tokenize = [nltk.word_tokenize(name) for name in mention_names]
        #mention_vectorize = np.array([[vocabulary.get(text,1) for text in mention] for mention in mention_tokenize])
        #mention_elmo = elmo_default([mention_names])

        corpus.ids = mention_ids
        corpus.names = mention_names
        corpus.all = mention_all
        #corpus.tokenize = mention_tokenize
        #corpus.vectorize = mention_vectorize
        #corpus.elmo = mention_elmo

        # vector representations: sum of pretrained word vectors per mention
        import nltk
        mention_embeddings = []
        for mention in corpus.names:
            tokenized = nltk.word_tokenize(mention.lower())
            index = [vocabulary.get(token, 1) for token in tokenized]
            #emb = np.mean(np.array([pretrained[i] for i in index]), axis=0)
            emb = np.sum(np.array([pretrained[i] for i in index]), axis=0)
            mention_embeddings.append(emb)
        mention_embeddings = np.array(mention_embeddings)

    concept_embeddings = []
    for name in concept.names:
        tokenized = nltk.word_tokenize(name.lower())
        index = [vocabulary.get(token, 1) for token in tokenized]
        #emb = np.mean(np.array([pretrained[i] for i in index]), axis=0)
        emb = np.sum(np.array([pretrained[i] for i in index]), axis=0)
        concept_embeddings.append(emb)
    concept_embeddings = np.array(concept_embeddings)

    '''
    # alternative: ELMo representations
    from vectorizer_elmo import elmo_default
    # chunk the concepts down since the list is too big
    concept_chunk = [concept.names[i:i + 5000] for i in range(0, len(concept.names), 5000)]
    concept.elmo = []
    for chunk in concept_chunk:
        [elmo_chunk] = [c for c in elmo_default([chunk])]
        concept.elmo.append(elmo_chunk)
    #with open('gitig_concept_elmo.pickle','wb') as f:
    #    pickle.dump(concept.elmo,f,protocol=4)
    #concept.elmo = pickle.load(open('gitig_concept_elmo.pickle','rb'))
    concept.elmo = np.array([item for sublist in concept.elmo for item in sublist])
    [corpus_dev.elmo] = [chunk for chunk in elmo_default([corpus_dev.names])]
    '''

    concept_emb = concept_embeddings  #concept.elmo
    mention_emb = mention_embeddings  #corpus_dev.elmo

    # cosine-similarity nearest neighbour: normalize, dot product, argmax
    from sklearn.preprocessing import normalize
    nor_concepts = normalize(concept_emb)
    nor_corpus_dev = normalize(mention_emb)
    dot_product_matrix = np.dot(nor_corpus_dev, np.transpose(nor_concepts))
    prediction_indices = np.argmax(dot_product_matrix, axis=1)
    predictions = np.array(concept.ids)[prediction_indices].tolist()

    correct = 0
    #incorrect = 0
    #incorrect_indices = []
    for prediction, mention_gold in zip(predictions, corpus_dev.ids):
        if prediction == mention_gold[0] and len(mention_gold) == 1:
            correct += 1
    print('Accuracy: {0}'.format(correct / len(corpus_dev.names)))
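
# A minimal, self-contained sketch of the scoring step above (toy data only;
# names and values here are illustrative, not from the NCBI/MEDIC pipeline):
# L2-normalising both matrices makes the dot product equal cosine similarity,
# so the argmax over each row picks the nearest concept for each mention.
def _cosine_nearest_demo():
    import numpy as np
    from sklearn.preprocessing import normalize
    mention_vecs = normalize(np.array([[1.0, 2.0], [1.0, 0.0]]))  # 2 toy mentions
    concept_vecs = normalize(np.array([[2.0, 4.0], [3.0, 0.0]]))  # 2 toy concepts
    sims = np.dot(mention_vecs, concept_vecs.T)  # cosine similarity matrix
    return np.argmax(sims, axis=1)  # -> array([0, 1]): nearest concept per mention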