Example #1
# sp and bench_data are project-local helper modules, not shown in these examples
def context_sim(new_tokens, trained_model, type_range, metric='maxsimc'):
    moretokens = []

    for new_token in new_tokens:
        word_a = new_token.word1
        word_b = new_token.word2
        # all WordNet synsets for each word
        synsets_a = sp.synset_all(word_a)
        synsets_b = sp.synset_all(word_b)
        # averaged context vector for each word's sentence
        context_a = context_parser(word_a, new_token.sent1, trained_model)
        context_b = context_parser(word_b, new_token.sent2, trained_model)
        # keep only the synsets whose vectors exist in the model
        vec_syna = sp.validate_synsets_model(word_a, synsets_a, trained_model)
        vec_synb = sp.validate_synsets_model(word_b, synsets_b, trained_model)

        if metric == 'maxsimc':
            sim_value = maxSimC(vec_syna, context_a, vec_synb, context_b,
                                type_range)
        elif metric == 'avgsimc':
            sim_value = avgSimC(vec_syna, context_a, vec_synb, context_b,
                                type_range)
        elif metric == 'globalsimc':
            sim_value = globalSimC(context_a, context_b, type_range)
        else:
            raise ValueError('unknown metric: %s' % metric)

        token_prime = bench_data.Token_Data(word_a, word_b, sim_value)
        moretokens.append(token_prime)
    return moretokens
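
A minimal usage sketch for context_sim. The PairToken namedtuple is a hypothetical stand-in that only mimics the word1/word2/sent1/sent2 attributes the function reads; trained_model is assumed to come from the surrounding project, and the type_range placeholder below carries no meaning of its own.

from collections import namedtuple

# hypothetical stand-in for the project's token objects
PairToken = namedtuple('PairToken', 'word1 word2 sent1 sent2')

pairs = [PairToken('bank', 'money',
                   ['he', 'cashed', 'a', 'cheque', 'at', 'the', 'bank'],
                   ['she', 'kept', 'the', 'money', 'in', 'a', 'jar'])]
# type_range is passed through to maxSimC/avgSimC, which are not shown here
scored = context_sim(pairs, trained_model, type_range=None, metric='avgsimc')
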
def build_synset_packages(word, *pos):
    synset_wndata_list = []  # list of WNData objects
    # TODO: handle words that are not in the model
    if not pos:  #for all POS
        synsets = sp.synset_all(word)
    else:  #for specific POS
        synsets = sp.synset_pos(word, pos)

    for sys_element in synsets:
        synset_wndata_list.append(
            bench_data.WNData(sys_element, sys_element.offset(),
                              sys_element.pos(), sys_element.definition()))

    return synset_wndata_list
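
Hypothetical calls, assuming sp wraps an NLTK-style WordNet interface; the POS tag 'n' is forwarded unchanged to sp.synset_pos.

all_senses = build_synset_packages('bank')        # every POS
noun_senses = build_synset_packages('bank', 'n')  # nouns only
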
Example #3
def nocontext_sim(tokens, trained_model, type_range, metric='avgsim'):
    moretokens = []

    for token in tokens:
        word_a = token.word1
        word_b = token.word2
        synsets_a = sp.synset_all(word_a)
        synsets_b = sp.synset_all(word_b)

        # keep only the synsets whose vectors exist in the model
        vec_syna = sp.validate_synsets_model(word_a, synsets_a, trained_model)
        vec_synb = sp.validate_synsets_model(word_b, synsets_b, trained_model)

        if metric == 'maxsim':
            sim_value = maxSim(vec_syna, vec_synb, type_range)
        elif metric == 'avgsim':
            sim_value = avgSim(vec_syna, vec_synb, type_range)
        elif metric == 'globalsim':
            sim_value = globalSim(vec_syna, vec_synb, type_range)
        else:
            raise ValueError('unknown metric: %s' % metric)

        token_prime = bench_data.Token_Data(word_a, word_b, sim_value)
        moretokens.append(token_prime)

    return moretokens
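
maxSim, avgSim, and globalSim are referenced but not defined in these examples. Assuming validate_synsets_model returns a list of sense vectors, a plausible reading follows the usual MaxSim/AvgSim definitions from the multi-sense embedding literature: the maximum or mean cosine similarity over all sense pairs. The sketch below is that assumption, not the project's code, and it omits type_range because its role is not visible here.

import numpy

def cosine(u, v):
    return numpy.dot(u, v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v))

def max_sim_sketch(vecs_a, vecs_b):
    # similarity of the best-matching sense pair
    return max(cosine(a, b) for a in vecs_a for b in vecs_b)

def avg_sim_sketch(vecs_a, vecs_b):
    # mean similarity over all sense pairs
    sims = [cosine(a, b) for a in vecs_a for b in vecs_b]
    return sum(sims) / len(sims)
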
Example #4
import numpy

def context_parser(anchor_word, text_items, trained_model):
    context_vector = []
    for text_item in text_items:
        #if text_item == anchor_word: continue  # optionally skip the anchor word itself to avoid biasing the context
        synsets = sp.synset_all(text_item)
        for synset in synsets:
            key = sp.key_parser(text_item, synset)
            try:
                v1 = trained_model.word_vec(key)
                context_vector.append(v1)  # collect every sense vector; averaged below
            except KeyError:
                pass  #key not in the model

    # note: an empty context_vector (no key found in the model) is not handled here
    return numpy.average(context_vector, axis=0)
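
A short usage sketch, assuming trained_model is a gensim-style KeyedVectors whose vocabulary is indexed by the sense keys that sp.key_parser produces; tokenization is the caller's job.

sentence = ['he', 'sat', 'on', 'the', 'river', 'bank']
ctx = context_parser('bank', sentence, trained_model)
# ctx is the element-wise mean of every sense vector found for the sentence
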
def build_synset_packages_refi(word, embed_model, *pos):
    synset_refidata_list = []

    if not pos:  #for all POS
        synsets = sp.synset_all(word)
    else:  #for specific POS
        synsets = sp.synset_pos(word, pos)

    for synset in synsets:  # package each synset with its offset, POS, and definition
        key = sp.key_parser(word, synset)
        wpack = bench_data.WNData(synset, synset.offset(), synset.pos(),
                                  synset.definition())
        vec = sp.retrieve_synsetvec(key, embed_model)
        wpack.vector = vec
        synset_refidata_list.append(wpack)

    return synset_refidata_list
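
A hypothetical call mirroring the one above, now with a sense vector attached to each package; embed_model is assumed to be the same trained sense-embedding model used throughout.

packages = build_synset_packages_refi('bank', trained_model, 'n')
for pkg in packages:
    print(pkg.vector)  # set from sp.retrieve_synsetvec for each sense key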