Example #1
def context_sim(new_tokens, trained_model, type_range, metric='maxsimc'):
    moretokens = []

    for new_token in new_tokens:
        word_a = new_token.word1
        word_b = new_token.word2
        # all candidate synsets for each word
        synsets_a = sp.synset_all(word_a)
        synsets_b = sp.synset_all(word_b)
        # average vector of the sentence context around each word
        context_a = context_parser(word_a, new_token.sent1, trained_model)
        context_b = context_parser(word_b, new_token.sent2, trained_model)
        # keep only the synsets that actually exist in the model
        vec_syna = sp.validate_synsets_model(word_a, synsets_a, trained_model)
        vec_synb = sp.validate_synsets_model(word_b, synsets_b, trained_model)

        if metric == 'maxsimc':
            sim_value = maxSimC(vec_syna, context_a, vec_synb, context_b,
                                type_range)
        elif metric == 'avgsimc':
            sim_value = avgSimC(vec_syna, context_a, vec_synb, context_b,
                                type_range)
        elif metric == 'globalsimc':
            sim_value = globalSimC(context_a, context_b, type_range)
        else:
            raise ValueError("unknown metric '%s'" % metric)

        token_prime = bench_data.Token_Data(word_a, word_b, sim_value)
        moretokens.append(token_prime)
    return moretokens
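
For orientation, here is a minimal sketch of what a maxSimC-style metric could look like. It assumes each entry of vec_syna/vec_synb is a dense numpy sense vector, context_a/context_b are the averaged context vectors, and type_range is the upper bound of the benchmark's score scale; cosine and maxsimc_sketch are hypothetical helpers, not the project's actual implementation:

import numpy as np

def cosine(u, v):
    # plain cosine similarity between two dense vectors
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def maxsimc_sketch(vec_syna, context_a, vec_synb, context_b, type_range):
    # disambiguate: pick the sense of each word closest to its own context
    best_a = max(vec_syna, key=lambda s: cosine(s, context_a))
    best_b = max(vec_synb, key=lambda s: cosine(s, context_b))
    # score the two chosen senses, rescaled from [-1, 1] to [0, type_range]
    return (cosine(best_a, best_b) + 1) / 2 * type_range

The key design difference from an avgSimC-style metric is that maxSimC commits to a single sense per word before comparing, while avgSimC averages over sense pairs.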
Example #2
def process_yp130(file):
    tokens_list = []
    print('Processing %s' % file)
    with open(file, 'r', encoding='utf-8') as fin:
        for line in fin:
            # space-delimited fields: word1 word2 gold_score
            block = line.split(' ')
            tmp_token = bench_data.Token_Data(block[0], block[1],
                                              float(block[2].strip('\n')))
            tokens_list.append(tmp_token)
    return tokens_list
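
A hypothetical call, assuming a YP-130-style file where each line holds two words and a gold similarity score separated by single spaces (the file name and sample line below are illustrative):

# yp130.txt is assumed to contain lines like: brag boast 9.5
tokens = process_yp130('yp130.txt')
print('%d pairs loaded' % len(tokens))
print(tokens[0].word1, tokens[0].word2)  # first word pair

Note that split(' ') assumes exactly one space between fields; a tab-delimited variant of the file would need split('\t'), or split() to tolerate arbitrary whitespace.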
Example #3
def nocontext_sim(tokens, trained_model, type_range, metric='avgsim'):
    moretokens = []

    for token in tokens:
        word_a = token.word1
        word_b = token.word2
        # all candidate synsets for each word
        synsets_a = sp.synset_all(word_a)
        synsets_b = sp.synset_all(word_b)

        # keep only the synsets that actually exist in the model
        vec_syna = sp.validate_synsets_model(word_a, synsets_a, trained_model)
        vec_synb = sp.validate_synsets_model(word_b, synsets_b, trained_model)

        if metric == 'maxsim':
            sim_value = maxSim(vec_syna, vec_synb, type_range)
        elif metric == 'avgsim':
            sim_value = avgSim(vec_syna, vec_synb, type_range)
        elif metric == 'globalsim':
            sim_value = globalSim(vec_syna, vec_synb, type_range)
        else:
            raise ValueError("unknown metric '%s'" % metric)

        token_prime = bench_data.Token_Data(word_a, word_b, sim_value)
        moretokens.append(token_prime)

    return moretokens
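
For the no-context metrics, a sketch under the same assumptions as above (lists of numpy sense vectors; type_range as the score ceiling; helper names hypothetical): maxSim scores the closest pair of senses across the two words, avgSim the mean over all sense pairs.

import numpy as np
from itertools import product

def cosine(u, v):
    # plain cosine similarity between two dense vectors
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def maxsim_sketch(vec_syna, vec_synb, type_range):
    # similarity of the closest cross-word sense pair
    best = max(cosine(a, b) for a, b in product(vec_syna, vec_synb))
    return (best + 1) / 2 * type_range  # rescale [-1, 1] -> [0, type_range]

def avgsim_sketch(vec_syna, vec_synb, type_range):
    # mean similarity over every cross-word sense pair
    sims = [cosine(a, b) for a, b in product(vec_syna, vec_synb)]
    return (sum(sims) / len(sims) + 1) / 2 * type_range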