def concept_obj(conf, dictionary, order=None):
    concept_ids = []    # list of all concept ids
    # concept_all_ids = []  # list of (lists of all concept ids with alt IDs)
    concept_names = []  # list of all names, same length as concept_ids
    concept_map = {}    # names as keys, lists of ids as values

    if order:
        use = order
        logger.info('Re-initializing concept object.')
    else:
        # keys are not in a congruent order! To make them congruent:
        # k, v = zip(*dictionary.loaded.items())
        # k = list(k)
        # k.sort()
        use = dictionary.loaded.keys()

    for k in use:
        c_id = dictionary.loaded[k].DiseaseID
        # a_ids = dictionary.loaded[k].AllDiseaseIDs
        if int(conf['settings']['all_names']):
            names = dictionary.loaded[k].AllNames
        else:
            # DiseaseName is a single string; wrap it in a list so we
            # iterate over names rather than characters
            names = [dictionary.loaded[k].DiseaseName]
        for n in names:
            concept_ids.append(c_id)
            # concept_all_ids.append(a_ids)
            concept_names.append(n)
            if n in concept_map:  # one name corresponds to multiple concepts
                concept_map[n].append(c_id)
                # logger.warning('{0} already in the dictionary with id {1}'.format(n, concept_map[n]))
            else:
                concept_map[n] = [c_id]

    # save the collected fields to an object
    concept = sample.NewDataSet('concepts')
    concept.ids = concept_ids
    # concept.all_ids = concept_all_ids
    concept.names = concept_names
    concept.map = concept_map
    concept.tokenize = [nltk.word_tokenize(name) for name in concept_names]
    concept.vectorize = np.array([[vocabulary.get(token.lower(), 1) for token in tokens]
                                  for tokens in concept.tokenize])
    return concept
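# A minimal usage sketch (hypothetical setup: assumes `config` is a parsed
# ConfigParser with a [settings] all_names flag, and that load.load returns
# the MEDIC dictionary of named tuples shown elsewhere in this file;
# 'defaults.cfg' is an illustrative filename, not part of the repo):
#
#   import configparser
#   config = configparser.ConfigParser()
#   config.read('defaults.cfg')
#   dictionary = load.Terminology()
#   dictionary.loaded = load.load(config['terminology']['dict_file'], 'MEDIC')
#   concept = concept_obj(config, dictionary)
#   concept.map['Fused Teeth']  # e.g. ['MESH:D005671']; an ambiguous name maps to several ids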
def concept_obj(conf, dictionary, order=None):
    # concept_ids = []      # list of all concept ids
    # concept_all_ids = []  # list of (lists of all concept ids with alt IDs)
    concept_names = []  # list of all names
    # concept_map = {}      # names as keys, lists of ids as values

    if order:
        use = order
        logger.info('Re-initializing concept object.')
    else:
        # keys are not in a congruent order! To make them congruent:
        # k, v = zip(*dictionary.loaded.items())
        # k = list(k)
        # k.sort()
        use = dictionary.loaded.keys()

    for k in use:
        # c_id = dictionary.loaded[k].DiseaseID
        # a_ids = dictionary.loaded[k].AllDiseaseIDs
        for n in dictionary.loaded[k].AllNames:
            concept_names.append(n)

    # tokenization & vectorization of dictionary terms
    import nltk
    concept_tokenize = [nltk.word_tokenize(name) for name in concept_names]  # list of lists of tokens
    concept_vectorize = np.array([[vocabulary.get(token.lower(), 1) for token in tokens]
                                  for tokens in concept_tokenize])

    # save the collected fields to an object
    concept = sample.NewDataSet('concepts')
    # concept.ids = concept_ids
    # concept.all_ids = concept_all_ids
    concept.names = concept_names
    # concept.map = concept_map
    concept.tokenize = concept_tokenize
    concept.vectorize = concept_vectorize

    for corpus in [concept]:
        logger.info('Padding {0}'.format(corpus.info))
        logger.info('Old shape: {0}'.format(corpus.vectorize.shape))
        corpus.padded = pad_sequences(corpus.vectorize, padding='post',
                                      maxlen=int(conf['embedding']['length']))
        # format of corpus.padded: numpy array, (mentions, padded length)
        logger.info('New shape: {0}'.format(corpus.padded.shape))

    return concept
def concept_obj(conf, dictionary, order=None):
    concept_ids = []      # list of all concept ids
    concept_all_ids = []  # list of (lists of all concept ids with alt IDs)
    concept_names = []    # list of all names, same length as concept_ids
    concept_map = {}      # names as keys, lists of ids as values

    if order:
        use = order
        logger.info('Re-initializing concept object.')
    else:
        # keys are not in a congruent order! To make them congruent:
        # k, v = zip(*dictionary.loaded.items())
        # k = list(k)
        # k.sort()
        use = dictionary.loaded.keys()

    for k in use:
        c_id = dictionary.loaded[k].DiseaseID
        a_ids = dictionary.loaded[k].AllDiseaseIDs
        if int(conf['settings']['all_names']):
            names = dictionary.loaded[k].AllNames
        else:
            # DiseaseName is a single string; wrap it in a list so we
            # iterate over names rather than characters
            names = [dictionary.loaded[k].DiseaseName]
        for n in names:
            concept_ids.append(c_id)
            concept_all_ids.append(a_ids)
            concept_names.append(n)
            if n in concept_map:  # one name corresponds to multiple concepts
                concept_map[n].append(c_id)
                # logger.warning('{0} already in the dictionary with id {1}'.format(n, concept_map[n]))
            else:
                concept_map[n] = [c_id]

    # tokenization & vectorization of dictionary terms
    import nltk
    concept_tokenize = [nltk.word_tokenize(name) for name in concept_names]  # list of lists of tokens
    concept_vectorize = np.array([[vocabulary.get(token.lower(), 1) for token in tokens]
                                  for tokens in concept_tokenize])
    if conf.getint('embedding', 'elmo'):
        from vectorizer_elmo import elmo_default
        concept_elmo = elmo_default([concept_names])

    # save the collected fields to an object
    concept = sample.NewDataSet('concepts')
    concept.ids = concept_ids
    concept.all_ids = concept_all_ids
    concept.names = concept_names
    concept.map = concept_map
    concept.tokenize = concept_tokenize
    concept.vectorize = concept_vectorize
    if conf.getint('embedding', 'elmo'):
        concept.elmo = concept_elmo

    logger.info('Padding {0}'.format(concept.info))
    logger.info('Old shape: {0}'.format(concept.vectorize.shape))
    concept.padded = pad_sequences(concept.vectorize, padding='post',
                                   maxlen=int(conf['embedding']['length']))
    # format of concept.padded: numpy array, (mentions, padded length)
    logger.info('New shape: {0}'.format(concept.padded.shape))

    return concept
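# What the padding step above does, as a small sketch (assuming pad_sequences
# is Keras's keras.preprocessing.sequence.pad_sequences, imported elsewhere):
# shorter token-id sequences are right-padded with zeros and longer ones
# truncated, to the fixed length config['embedding']['length'].
#
#   from keras.preprocessing.sequence import pad_sequences
#   pad_sequences([[4, 8], [15, 16, 23, 42]], padding='post', maxlen=5)
#   # array([[ 4,  8,  0,  0,  0],
#   #        [15, 16, 23, 42,  0]])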
concept = concept_obj(config, dictionary)

# corpus
corpus_train = sample.NewDataSet('training corpus')
corpus_train.objects = load.load(config['corpus']['training_file'], 'NCBI')

corpus_dev = sample.NewDataSet('dev corpus')
corpus_dev.objects = load.load(config['corpus']['development_file'], 'NCBI')

for corpus in [corpus_train, corpus_dev]:
    mention_ids = []    # list of all ids (gold standard for each mention)
    mention_names = []  # list of all names
    mention_all = []    # list of tuples (mention_text, gold, context, (start, end, docid))

    # something wrong here that sometimes throws an error
    # import pdb; pdb.set_trace()
    for abstract in corpus.objects:
        for section in abstract.sections:  # title and abstract
            for mention in section.mentions:
                nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                mention_ids.append(nor_ids)  # append list of ids, usually len(list) == 1
                mention_names.append(mention.text)
                mention_all.append((mention.text, nor_ids, section.text,
                                    (mention.start, mention.end, abstract.docid)))
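# For reference, one mention_all entry has the following shape (the concrete
# values are invented for illustration; the id and name echo the MEDIC
# example entry quoted below):
#
#   ('fused teeth',                     # mention_text
#    ['MESH:D005671'],                  # gold ids, usually length 1
#    'Fused teeth in the proband ...',  # context: full section text
#    (12, 23, '12345678'))              # (start, end, docid)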
def emb_baseline(emb_path):
    # vector_model, vocabulary, inversed_vocabulary = prepare_embedding_vocab(
    #     '/home/lenz/disease-normalization/data/embeddings/wvec_200_win-30_chiu-et-al.bin')
    vector_model, vocabulary, inversed_vocabulary = prepare_embedding_vocab(emb_path, binary=True)
    pretrained = load_pretrained_word_embeddings(vocabulary, vector_model)

    # MEDIC dictionary: entries keyed by canonical id, values are named tuples
    # in the form of
    # MEDIC_ENTRY(DiseaseID='MESH:D005671', DiseaseName='Fused Teeth',
    #             AllDiseaseIDs=('MESH:D005671',), AllNames=('Fused Teeth', 'Teeth, Fused'))
    dictionary = load.Terminology()
    dictionary.loaded = load.load(config['terminology']['dict_file'], 'MEDIC')

    import vectorizer
    dictionary.no_cangen_tokenized = vectorizer.MEDIC_dict_tokenizer_no_cangen(
        dictionary.loaded, config['methods']['tokenizer'])
    dictionary.no_cangen_vectorized = vectorizer.MEDIC_dict_vectorizer_no_cangen(
        dictionary.no_cangen_tokenized, vocabulary)

    # concepts
    concept_ids = []      # list of all concept ids
    concept_all_ids = []  # list of (lists of all concept ids with alt IDs)
    concept_names = []    # list of all names, same length as concept_ids
    concept_map = {}      # names as keys, lists of ids as values

    for k in dictionary.loaded.keys():  # keys should be in congruent order
        c_id = dictionary.loaded[k].DiseaseID
        a_ids = dictionary.loaded[k].AllDiseaseIDs
        if int(config['settings']['all_names']):
            names = dictionary.loaded[k].AllNames
        else:
            # DiseaseName is a single string; wrap it in a list so we
            # iterate over names rather than characters
            names = [dictionary.loaded[k].DiseaseName]
        for n in names:
            concept_ids.append(c_id)
            concept_all_ids.append(a_ids)
            concept_names.append(n)
            if n in concept_map:  # one name corresponds to multiple concepts
                concept_map[n].append(c_id)
                # logger.warning('{0} already in the dictionary with id {1}'.format(n, concept_map[n]))
            else:
                concept_map[n] = [c_id]

    # save the collected fields to an object
    concept = sample.NewDataSet('concepts')
    concept.ids = concept_ids
    concept.all_ids = concept_all_ids
    concept.names = concept_names
    concept.map = concept_map
    # concept_vectorize = np.array([dictionary.no_cangen_vectorized[k] for k in concept.ids])

    # corpus
    # corpus_train = sample.NewDataSet('training corpus')
    # corpus_train.objects = load.load(config['corpus']['training_file'], 'NCBI')
    corpus_dev = sample.NewDataSet('dev corpus')
    corpus_dev.objects = load.load(config['corpus']['development_file'], 'NCBI')
    # corpus_test = sample.NewDataSet('test corpus')
    # corpus_test.objects = load.load('/home/lhchan/disease_normalization/data/NCBItestset_corpus.txt', 'NCBI')
    # corpus_dev = corpus_test

    for corpus in [corpus_dev]:
        mention_ids = []    # list of all ids (gold standard for each mention)
        mention_names = []  # list of all names
        mention_all = []    # list of tuples (mention_text, gold, context, (start, end, docid))

        # something wrong here that sometimes throws an error
        # import pdb; pdb.set_trace()
        for abstract in corpus.objects:
            for section in abstract.sections:  # title and abstract
                for mention in section.mentions:
                    nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                    mention_ids.append(nor_ids)  # append list of ids, usually len(list) == 1
                    mention_names.append(mention.text)
                    mention_all.append((mention.text, nor_ids, section.text,
                                        (mention.start, mention.end, abstract.docid)))

        # tokenization & vectorization of mentions
        # mention_tokenize = [nltk.word_tokenize(name) for name in mention_names]
        # mention_vectorize = np.array([[vocabulary.get(text, 1) for text in mention] for mention in mention_tokenize])
        # mention_elmo = elmo_default([mention_names])

        corpus.ids = mention_ids
        corpus.names = mention_names
        corpus.all = mention_all
        # corpus.tokenize = mention_tokenize
        # corpus.vectorize = mention_vectorize
        # corpus.elmo = mention_elmo

    # vector representations: a mention/concept is the sum of the pretrained
    # embeddings of its (lowercased) tokens
    import nltk

    mention_embeddings = []
    for mention in corpus_dev.names:
        tokenized = nltk.word_tokenize(mention.lower())
        index = [vocabulary.get(token, 1) for token in tokenized]
        # emb = np.mean(np.array([pretrained[i] for i in index]), axis=0)
        emb = np.sum(np.array([pretrained[i] for i in index]), axis=0)
        mention_embeddings.append(emb)
    mention_embeddings = np.array(mention_embeddings)

    concept_embeddings = []
    for name in concept.names:
        tokenized = nltk.word_tokenize(name.lower())
        index = [vocabulary.get(token, 1) for token in tokenized]
        # emb = np.mean(np.array([pretrained[i] for i in index]), axis=0)
        emb = np.sum(np.array([pretrained[i] for i in index]), axis=0)
        concept_embeddings.append(emb)
    concept_embeddings = np.array(concept_embeddings)

    '''
    from vectorizer_elmo import elmo_default
    # chunk the concepts down since the list is too big
    concept_chunk = [concept.names[i:i + 5000] for i in range(0, len(concept.names), 5000)]
    concept.elmo = []
    for chunk in concept_chunk:
        [elmo_chunk] = [c for c in elmo_default([chunk])]
        concept.elmo.append(elmo_chunk)
    # [concept.elmo] = [chunk for chunk in elmo_default([concept_chunk])]
    # with open('gitig_concept_elmo.pickle', 'wb') as f:
    #     pickle.dump(concept.elmo, f, protocol=4)
    # concept.elmo = pickle.load(open('gitig_concept_elmo.pickle', 'rb'))
    concept.elmo = np.array([item for sublist in concept.elmo for item in sublist])
    [corpus_dev.elmo] = [chunk for chunk in elmo_default([corpus_dev.names])]
    '''

    concept_emb = concept_embeddings  # concept.elmo
    mention_emb = mention_embeddings  # corpus_dev.elmo

    # cosine similarity: L2-normalize both matrices, take the dot product,
    # and pick the nearest concept name for each mention
    from sklearn.preprocessing import normalize
    nor_concepts = normalize(concept_emb)
    nor_corpus_dev = normalize(mention_emb)
    dot_product_matrix = np.dot(nor_corpus_dev, np.transpose(nor_concepts))
    prediction_indices = np.argmax(dot_product_matrix, axis=1)
    predictions = np.array(concept.ids)[prediction_indices].tolist()

    correct = 0
    # incorrect = 0
    # incorrect_indices = []
    for prediction, mention_gold in zip(predictions, corpus_dev.ids):
        if prediction == mention_gold[0] and len(mention_gold) == 1:
            correct += 1
    print('Accuracy: {0}'.format(correct / len(corpus_dev.names)))
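# Usage sketch (the path is illustrative; any word2vec-format binary accepted
# by prepare_embedding_vocab should work):
#
#   emb_baseline('data/embeddings/wvec_200_win-30_chiu-et-al.bin')
#
# The normalize-then-dot ranking above is exactly cosine similarity; a small
# self-contained check against sklearn's reference implementation:

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

a = np.array([[1.0, 2.0], [0.0, 3.0]])  # toy "mention" embeddings
b = np.array([[2.0, 4.0], [1.0, 0.0]])  # toy "concept" embeddings
assert np.allclose(np.dot(normalize(a), normalize(b).T), cosine_similarity(a, b))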
real_val_data = pickle.load(open(
    os.path.join(directory, 'gitig_real_val_data_truncated_d50_p5.pickle'), 'rb'))
real_val_data.y = np.array(real_val_data.y)

concept = concept_obj(config, dictionary, order=concept_order)

from sample import prepare_positives, examples
positives_training, positives_dev, positives_dev_truncated = pickle.load(
    open(os.path.join(directory, 'gitig_positive_indices.pickle'), 'rb'))
# positives_dev = prepare_positives(positives_dev, nltk.word_tokenize, vocabulary)
positives_dev_truncated = prepare_positives(positives_dev_truncated,
                                            nltk.word_tokenize, vocabulary)
del positives_dev, positives_training

# corpus
corpus_train = sample.NewDataSet('training corpus')
corpus_train.objects = load.load(
    os.path.normpath(config['corpus']['training_file']), 'NCBI')

for corpus in [corpus_train]:
    corpus.ids = []    # list of all ids (gold standard for each mention)
    corpus.names = []  # list of all names
    corpus.all = []    # list of tuples (mention_text, gold, context, (start, end, docid))

    # something wrong here that sometimes throws an error
    # import pdb; pdb.set_trace()
    for abstract in corpus.objects:
        for section in abstract.sections:  # title and abstract
            for mention in section.mentions:
                nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                corpus.ids.append(nor_ids)  # append list of ids, usually len(list) == 1
                corpus.names.append(mention.text)
                corpus.all.append((mention.text, nor_ids, section.text,
                                   (mention.start, mention.end, abstract.docid)))
for k in dictionary.loaded.keys():  # keys should be in congruent order
    c_id = dictionary.loaded[k].DiseaseID
    a_ids = dictionary.loaded[k].AllDiseaseIDs
    for n in dictionary.loaded[k].AllNames:
        concept_ids.append(c_id)
        concept_all_ids.append(a_ids)
        concept_names.append(n)
        if n in concept_map:  # one name corresponds to multiple concepts
            concept_map[n].append(c_id)
            # logger.warning('{0} already in the dictionary with id {1}'.format(n, concept_map[n]))
        else:
            concept_map[n] = [c_id]

# save the collected fields to an object
concept = sample.NewDataSet('concepts')
concept.ids = concept_ids
concept.all_ids = concept_all_ids
concept.names = concept_names
concept.map = concept_map

# corpus
corpus_test = sample.NewDataSet('test corpus')
corpus_test.objects = load.load(
    '/home/lhchan/disease_normalization/data/NCBItestset_corpus.txt', 'NCBI')
# corpus_train = sample.NewDataSet('training corpus')
# corpus_train.objects = load.load(config['corpus']['training_file'], 'NCBI')

for corpus in [corpus_test]:
    mention_ids = []  # list of all ids (gold standard for each mention)
concept = concept_obj(config, dictionary, order=concept_order)

from sample import prepare_positives, examples
positives_training, positives_dev, positives_dev_truncated = pickle.load(
    open(os.path.join(directory, 'gitig_positive_indices.pickle'), 'rb'))
# positives_dev = prepare_positives(positives_dev, nltk.word_tokenize, vocabulary)
positives_dev_truncated = prepare_positives(positives_dev_truncated,
                                            nltk.word_tokenize, vocabulary)
del positives_dev, positives_training

# corpus
# corpus_train = sample.NewDataSet('training corpus')
# corpus_train.objects = load.load(os.path.normpath(config['corpus']['training_file']), 'NCBI')
corpus_dev = sample.NewDataSet('dev corpus')
corpus_dev.objects = load.load(config['corpus']['development_file'], 'NCBI')

for corpus in [corpus_dev]:
    corpus.ids = []    # list of all ids (gold standard for each mention)
    corpus.names = []  # list of all names
    corpus.all = []    # list of tuples (mention_text, gold, context, (start, end, docid))

    # something wrong here that sometimes throws an error
    # import pdb; pdb.set_trace()
    for abstract in corpus.objects:
        for section in abstract.sections:  # title and abstract
            for mention in section.mentions:
                nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                corpus.ids.append(nor_ids)  # append list of ids, usually len(list) == 1
                corpus.names.append(mention.text)
                corpus.all.append((mention.text, nor_ids, section.text,
                                   (mention.start, mention.end, abstract.docid)))