def concept_obj(conf, dictionary, order=None):
    concept_ids = []  # list of all concept ids
    # concept_all_ids = [] # list of (lists of all concept ids with alt IDs)
    concept_names = []  # list of all names, same length as concept_ids
    concept_map = {}  # names as keys, lists of ids as values

    if order:
        use = order
        logger.info('Re-initializing concept object.')
    else:
        use = dictionary.loaded.keys()

    for k in use:
        # keys not in congruent order! To make them congruent:
        # k,v = zip(*dictionary.loaded.items())
        # k = list(k)
        # k.sort()
        c_id = dictionary.loaded[k].DiseaseID
        # a_ids = dictionary.loaded[k].AllDiseaseIDs

        if int(conf['settings']['all_names']):
            for n in dictionary.loaded[k].AllNames:
                concept_ids.append(c_id)
                # concept_all_ids.append(a_ids)
                concept_names.append(n)
                if n in concept_map:  # one name corresponds to multiple concepts
                    concept_map[n].append(c_id)
                    # logger.warning('{0} already in the dictionary with id {1}'.format(n,concept_map[n]))
                else:
                    concept_map[n] = [c_id]
        else:
            # DiseaseName is a single string; wrap it in a list so we do not
            # iterate over its characters.
            for n in [dictionary.loaded[k].DiseaseName]:
                concept_ids.append(c_id)
                # concept_all_ids.append(a_ids)
                concept_names.append(n)
                if n in concept_map:  # one name corresponds to multiple concepts
                    concept_map[n].append(c_id)
                    # logger.warning('{0} already in the dictionary with id {1}'.format(n,concept_map[n]))
                else:
                    concept_map[n] = [c_id]

    # save the stuff to object
    concept = sample.NewDataSet('concepts')
    concept.ids = concept_ids
    # concept.all_ids = concept_all_ids
    concept.names = concept_names
    concept.map = concept_map
    concept.tokenize = [nltk.word_tokenize(name) for name in concept_names]
    concept.vectorize = np.array(
        [[vocabulary.get(text.lower(), 1) for text in concept]
         for concept in concept.tokenize])

    return concept
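
For illustration, a minimal, self-contained sketch of the name-to-id mapping that concept_obj builds. MEDIC_ENTRY and the single entry below are invented stand-ins; the real entries come from dictionary.loaded.

# Hypothetical stand-in for one loaded MEDIC entry (shape taken from the
# MEDIC_ENTRY comment further down in this listing; not real dictionary data).
from collections import namedtuple

MEDIC_ENTRY = namedtuple('MEDIC_ENTRY',
                         'DiseaseID DiseaseName AllDiseaseIDs AllNames')
loaded = {
    'MESH:D005671': MEDIC_ENTRY('MESH:D005671', 'Fused Teeth',
                                ('MESH:D005671',),
                                ('Fused Teeth', 'Teeth, Fused')),
}

concept_map = {}
for entry in loaded.values():
    for name in entry.AllNames:
        # one surface name can correspond to multiple concept ids
        concept_map.setdefault(name, []).append(entry.DiseaseID)

print(concept_map['Teeth, Fused'])  # ['MESH:D005671']
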
def concept_obj(conf, dictionary, order=None):
    # concept_ids = [] # list of all concept ids
    # concept_all_ids = [] # list of (lists of all concept ids with alt IDs)
    concept_names = []  # list of all names, same length as concept_ids
    # concept_map = {} # names as keys, ids as concepts

    if order:
        use = order
        logger.info('Re-initializing concept object.')
    else:
        use = dictionary.loaded.keys()

    for k in use:
        # keys not in congruent order! To make them congruent:
        # k,v = zip(*dictionary.loaded.items())
        # k = list(k)
        # k.sort()
        # c_id = dictionary.loaded[k].DiseaseID
        # a_ids = dictionary.loaded[k].AllDiseaseIDs
        for n in dictionary.loaded[k].AllNames:
            concept_names.append(n)

    # tokenization & vectorization of dictionary terms
    import nltk
    concept_tokenize = [nltk.word_tokenize(name)
                        for name in concept_names]  # list of lists of tokenized names
    concept_vectorize = np.array(
        [[vocabulary.get(text.lower(), 1) for text in concept]
         for concept in concept_tokenize])

    # save the stuff to object
    concept = sample.NewDataSet('concepts')
    #concept.ids = concept_ids
    #concept.all_ids = concept_all_ids
    concept.names = concept_names
    #concept.map = concept_map
    concept.tokenize = concept_tokenize
    concept.vectorize = concept_vectorize
    for corpus in [concept]:
        logger.info('Padding {0}'.format(corpus.info))
        logger.info('Old shape: {0}'.format(corpus.vectorize.shape))
        corpus.padded = pad_sequences(corpus.vectorize,
                                      padding='post',
                                      maxlen=int(conf['embedding']['length']))
        #format of corpus.padded: numpy, mentions, padded
        logger.info('New shape: {0}'.format(corpus.padded.shape))

    return concept
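
As a rough illustration of the padding step above, a minimal sketch assuming pad_sequences is the Keras utility this code appears to rely on (the toy sequences are invented):

# Toy illustration of post-padding variable-length token-index sequences.
from keras.preprocessing.sequence import pad_sequences  # assumption: standalone Keras

vectorized = [[4, 8, 15], [16, 23]]  # two tokenized-and-vectorized names
padded = pad_sequences(vectorized, padding='post', maxlen=5)
print(padded)        # [[ 4  8 15  0  0]
                     #  [16 23  0  0  0]]
print(padded.shape)  # (2, 5)
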
Example #3
def concept_obj(conf,dictionary,order=None):
    concept_ids = [] # list of all concept ids
    concept_all_ids = [] # list of (lists of all concept ids with alt IDs)
    concept_names = [] # list of all names, same length as concept_ids
    concept_map = {} # names as keys, lists of ids as values

    if order:
        use = order
        logger.info('Re-initializing concept object.')
    else:
        use = dictionary.loaded.keys()

    for k in use:
        # keys not in congruent order! To make them congruent:
        # k,v = zip(*dictionary.loaded.items())
        # k = list(k)
        # k.sort()
        c_id = dictionary.loaded[k].DiseaseID
        a_ids = dictionary.loaded[k].AllDiseaseIDs
        
        if int(conf['settings']['all_names']):
            for n in dictionary.loaded[k].AllNames:
                concept_ids.append(c_id)
                concept_all_ids.append(a_ids)
                concept_names.append(n)
                if n in concept_map: # one name corresponds to multiple concepts
                    concept_map[n].append(c_id)
                    # logger.warning('{0} already in the dictionary with id {1}'.format(n,concept_map[n]))
                else:
                    concept_map[n] = [c_id]
        else:
            # DiseaseName is a single string; wrap it in a list so we do not
            # iterate over its characters.
            for n in [dictionary.loaded[k].DiseaseName]:
                concept_ids.append(c_id)
                concept_all_ids.append(a_ids)
                concept_names.append(n)
                if n in concept_map: # one name corresponds to multiple concepts
                    concept_map[n].append(c_id)
                    # logger.warning('{0} already in the dictionary with id {1}'.format(n,concept_map[n]))
                else:
                    concept_map[n] = [c_id]


    # tokenization & vectorization of dictionary terms
    import nltk
    concept_tokenize = [nltk.word_tokenize(name) for name in concept_names] # list of list of tokenized names
    concept_vectorize = np.array([[vocabulary.get(text.lower(),1) for text in concept] for concept in concept_tokenize])
    if conf.getint('embedding','elmo'):
        from vectorizer_elmo import elmo_default
        concept_elmo = elmo_default([concept_names])

    # save the stuff to object
    concept = sample.NewDataSet('concepts')
    concept.ids = concept_ids
    concept.all_ids = concept_all_ids
    concept.names = concept_names
    concept.map = concept_map
    concept.tokenize = concept_tokenize
    concept.vectorize = concept_vectorize
    if conf.getint('embedding','elmo'):
        concept.elmo = concept_elmo

    logger.info('Padding {0}'.format(concept.info))
    logger.info('Old shape: {0}'.format(concept.vectorize.shape))
    concept.padded = pad_sequences(concept.vectorize, padding='post', maxlen=int(conf['embedding']['length']))
    #format of corpus.padded: numpy, mentions, padded
    logger.info('New shape: {0}'.format(concept.padded.shape))

    return concept
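
The concept_obj variants above read their switches from a configparser-style config. A minimal sketch of the sections and keys they touch (the values here are invented):

# Hypothetical config with the options concept_obj reads; values are made up.
import configparser

conf = configparser.ConfigParser()
conf.read_string("""
[settings]
all_names = 1

[embedding]
elmo = 0
length = 20
""")

print(int(conf['settings']['all_names']))  # 1 -> index every name in AllNames
print(conf.getint('embedding', 'elmo'))    # 0 -> skip the ELMo branch
print(int(conf['embedding']['length']))    # 20 -> maxlen for pad_sequences
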
Example #4
    concept.vectorize = concept_vectorize
    if conf.getint('embedding','elmo'):
        concept.elmo = concept_elmo

    logger.info('Padding {0}'.format(concept.info))
    logger.info('Old shape: {0}'.format(concept.vectorize.shape))
    concept.padded = pad_sequences(concept.vectorize, padding='post', maxlen=int(config['embedding']['length']))
    #format of corpus.padded: numpy, mentions, padded
    logger.info('New shape: {0}'.format(concept.padded.shape))

    return concept
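
The tokenize/vectorize pattern used throughout these examples maps each name to word indices via the embedding vocabulary, with index 1 reserved for out-of-vocabulary tokens. A minimal sketch with an invented vocabulary (nltk.word_tokenize needs the NLTK 'punkt' models):

# Toy vocabulary lookup; the real `vocabulary` comes from the word embeddings.
import nltk

vocabulary = {'fused': 2, 'teeth': 3}  # invented; 1 = unknown-token index
names = ['Fused Teeth', 'Teeth, Fused']

tokenized = [nltk.word_tokenize(name) for name in names]
vectorized = [[vocabulary.get(tok.lower(), 1) for tok in toks] for toks in tokenized]
print(vectorized)  # [[2, 3], [3, 1, 2]] -- the comma maps to the unknown index 1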

concept = concept_obj(config,dictionary)

# corpus
corpus_train = sample.NewDataSet('training corpus')
corpus_train.objects = load.load(config['corpus']['training_file'],'NCBI')

corpus_dev = sample.NewDataSet('dev corpus')
corpus_dev.objects = load.load(config['corpus']['development_file'],'NCBI')

for corpus in [corpus_train, corpus_dev]:
    mention_ids = [] # list of all ids (gold standard for each mention)
    mention_names = [] # list of all names
    mention_all = [] # list of tuples (mention_text,gold,context,(start,end,docid))

    # something wrong here sometimes throws an error
    #import pdb;pdb.set_trace()
    for abstract in corpus.objects:
        for section in abstract.sections: # title and abstract
            for mention in section.mentions:
                nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                mention_ids.append(nor_ids) # append list of ids, usually len(list)=1
                mention_names.append(mention.text)
                mention_all.append((mention.text,nor_ids,section.text,(mention.start,mention.end,abstract.docid)))

    corpus.ids = mention_ids
    corpus.names = mention_names
    corpus.all = mention_all

def emb_baseline(emb_path):
    #vector_model, vocabulary, inversed_vocabulary = prepare_embedding_vocab('/home/lenz/disease-normalization/data/embeddings/wvec_200_win-30_chiu-et-al.bin')
    vector_model, vocabulary, inversed_vocabulary = prepare_embedding_vocab(emb_path, binary = True)
    pretrained = load_pretrained_word_embeddings(vocabulary, vector_model)


    # MEDIC dictionary
    dictionary = load.Terminology()
    # dictionary of entries, key = canonical id, value = named tuple in the form of
    #   MEDIC_ENTRY(DiseaseID='MESH:D005671', DiseaseName='Fused Teeth',
    #   AllDiseaseIDs=('MESH:D005671',), AllNames=('Fused Teeth', 'Teeth, Fused'))
    dictionary.loaded = load.load(config['terminology']['dict_file'],'MEDIC')

    import vectorizer
    dictionary.no_cangen_tokenized = vectorizer.MEDIC_dict_tokenizer_no_cangen(dictionary.loaded,config['methods']['tokenizer'])
    dictionary.no_cangen_vectorized = vectorizer.MEDIC_dict_vectorizer_no_cangen(dictionary.no_cangen_tokenized,vocabulary)


    # concepts
    concept_ids = [] # list of all concept ids
    concept_all_ids = [] # list of (lists of all concept ids with alt IDs)
    concept_names = [] # list of all names, same length as concept_ids
    concept_map = {} # names as keys, lists of ids as values

    for k in dictionary.loaded.keys(): # keys should be in congruent order
        c_id = dictionary.loaded[k].DiseaseID
        a_ids = dictionary.loaded[k].AllDiseaseIDs
        
        if int(config['settings']['all_names']):
            for n in dictionary.loaded[k].AllNames:
                concept_ids.append(c_id)
                concept_all_ids.append(a_ids)
                concept_names.append(n)
                if n in concept_map: # one name corresponds to multiple concepts
                    concept_map[n].append(c_id)
                    # logger.warning('{0} already in the dictionary with id {1}'.format(n,concept_map[n]))
                else:
                    concept_map[n] = [c_id]
        else:
            # DiseaseName is a single string; wrap it in a list so we do not
            # iterate over its characters.
            for n in [dictionary.loaded[k].DiseaseName]:
                concept_ids.append(c_id)
                concept_all_ids.append(a_ids)
                concept_names.append(n)
                if n in concept_map: # one name corresponds to multiple concepts
                    concept_map[n].append(c_id)
                    # logger.warning('{0} already in the dictionary with id {1}'.format(n,concept_map[n]))
                else:
                    concept_map[n] = [c_id]

    # save the stuff to object
    concept = sample.NewDataSet('concepts')
    concept.ids = concept_ids
    concept.all_ids = concept_all_ids
    concept.names = concept_names
    concept.map = concept_map

    #concept_vectorize = np.array([dictionary.no_cangen_vectorized[k] for k in concept.ids])


    # corpus
    #corpus_train = sample.NewDataSet('training corpus')
    #corpus_train.objects = load.load(config['corpus']['training_file'],'NCBI')

    corpus_dev = sample.NewDataSet('dev corpus')
    corpus_dev.objects = load.load(config['corpus']['development_file'],'NCBI')

    #corpus_test = sample.NewDataSet('test corpus')
    #corpus_test.objects = load.load('/home/lhchan/disease_normalization/data/NCBItestset_corpus.txt','NCBI')
    #corpus_dev=corpus_test

    for corpus in [corpus_dev]:
        mention_ids = [] # list of all ids (gold standard for each mention)
        mention_names = [] # list of all names
        mention_all = [] # list of tuples (mention_text,gold,context,(start,end,docid))

        # something wrong here sometimes throws an error
        #import pdb;pdb.set_trace()
        for abstract in corpus.objects:
            for section in abstract.sections: # title and abstract
                for mention in section.mentions:
                    nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                    mention_ids.append(nor_ids) # append list of ids, usually len(list)=1
                    mention_names.append(mention.text)
                    mention_all.append((mention.text,nor_ids,section.text,(mention.start,mention.end,abstract.docid)))

        # tokenization & vectorization of mentions
        #mention_tokenize = [nltk.word_tokenize(name) for name in mention_names]
        #mention_vectorize = np.array([[vocabulary.get(text,1) for text in mention] for mention in mention_tokenize])
        # mention_elmo = elmo_default([mention_names])

        corpus.ids = mention_ids
        corpus.names = mention_names
        corpus.all = mention_all
        # corpus.tokenize = mention_tokenize
        # corpus.vectorize = mention_vectorize
        # corpus.elmo = mention_elmo


    # vector representations
    import nltk
    mention_embeddings = []
    for mention in corpus.names:
        tokenized = nltk.word_tokenize(mention.lower())
        index = [vocabulary.get(token,1) for token in tokenized]
        #emb = np.mean(np.array([pretrained[i] for i in index]), axis=0)
        emb = np.sum(np.array([pretrained[i] for i in index]), axis=0)
        mention_embeddings.append(emb)
    mention_embeddings = np.array(mention_embeddings)

    concept_embeddings = []
    for mention in concept.names:
        tokenized = nltk.word_tokenize(mention.lower())
        index = [vocabulary.get(token,1) for token in tokenized]
        #emb = np.mean(np.array([pretrained[i] for i in index]), axis=0)
        emb = np.sum(np.array([pretrained[i] for i in index]), axis=0)
        concept_embeddings.append(emb)
    concept_embeddings = np.array(concept_embeddings)




    '''
    from vectorizer_elmo import elmo_default
    # chunk the concepts down since the list is too big
    concept_chunk = [concept.names[i:i + 5000] for i in range(0, len(concept.names), 5000)]
    concept.elmo = []
    for chunk in concept_chunk:
        [elmo_chunk] = [c for c in elmo_default([chunk])]
        concept.elmo.append(elmo_chunk)
    [concept.elmo] = [chunk for chunk in elmo_default([concept_chunk])]

    #with open('gitig_concept_elmo.pickle','wb') as f:
    #    pickle.dump(concept.elmo,f,protocol=4)

    #concept.elmo = pickle.load(open('gitig_concept_elmo.pickle','rb'))

    concept.elmo =  np.array([item for sublist in concept.elmo for item in sublist])
    [corpus_dev.elmo] = [chunk for chunk in elmo_default([corpus_dev.names])]
    '''

    concept_emb = concept_embeddings #concept.elmo
    mention_emb = mention_embeddings #corpus_dev.elmo

    from sklearn.preprocessing import normalize
    nor_concepts = normalize(concept_emb)
    nor_corpus_dev = normalize(mention_emb)

    dot_product_matrix = np.dot(nor_corpus_dev,np.transpose(nor_concepts))
    prediction_indices = np.argmax(dot_product_matrix,axis=1)
    predictions = np.array(concept.ids)[prediction_indices].tolist()


    correct = 0
    #incorrect = 0
    #incorrect_indices = []
    for prediction, mention_gold in zip(predictions,corpus_dev.ids):
        if prediction == mention_gold[0] and len(mention_gold)==1:
            correct += 1
    print('Accuracy:{0}'.format(correct/len(corpus_dev.names)))
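
The baseline above reduces to a cosine-similarity nearest-neighbour lookup: L2-normalize mention and concept embeddings, take dot products, and pick the argmax per mention. A minimal sketch with invented toy vectors:

# Toy version of the normalized dot-product ranking used in emb_baseline.
import numpy as np
from sklearn.preprocessing import normalize

concept_emb = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # three toy concepts
mention_emb = np.array([[0.9, 0.8]])                          # one toy mention

scores = np.dot(normalize(mention_emb), np.transpose(normalize(concept_emb)))
prediction_indices = np.argmax(scores, axis=1)
print(prediction_indices)  # [2] -> the third concept is closest by cosine similarity
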
real_val_data = pickle.load(
    open(
        os.path.join(directory, 'gitig_real_val_data_truncated_d50_p5.pickle'),
        'rb'))
real_val_data.y = np.array(real_val_data.y)

concept = concept_obj(config, dictionary, order=concept_order)

from sample import prepare_positives, examples
positives_training, positives_dev, positives_dev_truncated = pickle.load(
    open(os.path.join(directory, 'gitig_positive_indices.pickle'), 'rb'))
# positives_dev = prepare_positives(positives_dev,nltk.word_tokenize,vocabulary)
positives_dev_truncated = prepare_positives(positives_dev_truncated,
                                            nltk.word_tokenize, vocabulary)
del positives_dev, positives_training

# corpus
corpus_train = sample.NewDataSet('training corpus')
corpus_train.objects = load.load(
    os.path.normpath(config['corpus']['training_file']), 'NCBI')

for corpus in [corpus_train]:
    corpus.ids = []  # list of all ids (gold standard for each mention)
    corpus.names = []  # list of all names
    corpus.all = []  # list of tuples (mention_text,gold,context,(start,end,docid))

    # something wrong here sometimes throws an error
    #import pdb;pdb.set_trace()
    for abstract in corpus.objects:
        for section in abstract.sections:  # title and abstract
            for mention in section.mentions:
                nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                corpus.ids.append(nor_ids)  # list of ids, usually len(list)=1
                corpus.names.append(mention.text)
                corpus.all.append((mention.text, nor_ids, section.text,
                                   (mention.start, mention.end, abstract.docid)))

concept_ids = []  # list of all concept ids
concept_all_ids = []  # list of (lists of all concept ids with alt IDs)
concept_names = []  # list of all names, same length as concept_ids
concept_map = {}  # names as keys, lists of ids as values

for k in dictionary.loaded.keys():  # keys should be in congruent order
    c_id = dictionary.loaded[k].DiseaseID
    a_ids = dictionary.loaded[k].AllDiseaseIDs

    for n in dictionary.loaded[k].AllNames:
        concept_ids.append(c_id)
        concept_all_ids.append(a_ids)
        concept_names.append(n)
        if n in concept_map:  # one name corresponds to multiple concepts
            concept_map[n].append(c_id)
            # logger.warning('{0} already in the dictionary with id {1}'.format(n,concept_map[n]))
        else:
            concept_map[n] = [c_id]

# save the stuff to object
concept = sample.NewDataSet('concepts')
concept.ids = concept_ids
concept.all_ids = concept_all_ids
concept.names = concept_names
concept.map = concept_map

# corpus
corpus_test = sample.NewDataSet('test corpus')
corpus_test.objects = load.load(
    '/home/lhchan/disease_normalization/data/NCBItestset_corpus.txt', 'NCBI')

# corpus_train = sample.NewDataSet('training corpus')
# corpus_train.objects = load.load(config['corpus']['training_file'],'NCBI')

for corpus in [corpus_test]:
    mention_ids = []  # list of all ids (gold standard for each mention)
Example #8
concept = concept_obj(config, dictionary, order=concept_order)

from sample import prepare_positives, examples
positives_training, positives_dev, positives_dev_truncated = pickle.load(
    open(os.path.join(directory, 'gitig_positive_indices.pickle'), 'rb'))
# positives_dev = prepare_positives(positives_dev,nltk.word_tokenize,vocabulary)
positives_dev_truncated = prepare_positives(positives_dev_truncated,
                                            nltk.word_tokenize, vocabulary)
del positives_dev, positives_training

# corpus
# corpus_train = sample.NewDataSet('training corpus')
# corpus_train.objects = load.load(os.path.normpath(config['corpus']['training_file']),'NCBI')

corpus_dev = sample.NewDataSet('dev corpus')
corpus_dev.objects = load.load(config['corpus']['development_file'], 'NCBI')

for corpus in [corpus_dev]:
    corpus.ids = []  # list of all ids (gold standard for each mention)
    corpus.names = []  # list of all names
    corpus.all = []  # list of tuples (mention_text,gold,context,(start,end,docid))

    # something wrong here sometimes throws an error
    #import pdb;pdb.set_trace()
    for abstract in corpus.objects:
        for section in abstract.sections:  # title and abstract
            for mention in section.mentions:
                nor_ids = [sample._nor_id(one_id) for one_id in mention.id]
                corpus.ids.append(