def context_sim(new_tokens, trained_model, type_range, metric='maxsimc'):
    """Score each word pair in *new_tokens* with a context-aware similarity metric.

    For every token the synsets of both words are collected, each word's
    sentence context is reduced to an average vector, and the chosen metric
    combines synset vectors with the context vectors.

    Parameters:
        new_tokens: iterable of objects exposing word1/word2 and sent1/sent2.
        trained_model: embedding model used to look up synset/context vectors.
        type_range: passed through to the metric functions.
        metric: one of 'maxsimc', 'avgsimc', 'globalsimc'.

    Returns:
        list of bench_data.Token_Data(word_a, word_b, sim_value).

    Raises:
        ValueError: if *metric* is not a recognized metric name.
        (Previously an unknown metric crashed later with UnboundLocalError.)
    """
    moretokens = []
    for new_token in new_tokens:
        word_a = new_token.word1
        word_b = new_token.word2
        # list of synset pairs for each word
        synsets_a = sp.synset_all(word_a)
        synsets_b = sp.synset_all(word_b)
        # average vector for the context of each word
        context_a = context_parser(word_a, new_token.sent1, trained_model)
        context_b = context_parser(word_b, new_token.sent2, trained_model)
        # keep only synsets that exist in the model
        vec_syna = sp.validate_synsets_model(word_a, synsets_a, trained_model)
        vec_synb = sp.validate_synsets_model(word_b, synsets_b, trained_model)
        if metric == 'maxsimc':
            sim_value = maxSimC(vec_syna, context_a, vec_synb, context_b, type_range)
        elif metric == 'avgsimc':
            sim_value = avgSimC(vec_syna, context_a, vec_synb, context_b, type_range)
        elif metric == 'globalsimc':
            sim_value = globalSimC(context_a, context_b, type_range)
        else:
            # fail fast instead of hitting UnboundLocalError below
            raise ValueError("unknown metric: %r" % metric)
        token_prime = bench_data.Token_Data(word_a, word_b, sim_value)
        moretokens.append(token_prime)
    return moretokens
def process_yp130(file):
    """Parse a YP-130 benchmark file into a list of Token_Data entries.

    Each non-blank line is expected to be space-delimited:
    ``word1 word2 similarity_score``.

    Parameters:
        file: path to the benchmark file (read as UTF-8).

    Returns:
        list of bench_data.Token_Data(word1, word2, score).
    """
    tokens_list = []
    print('Processing %s' % file)
    with open(file, 'r', encoding='utf-8') as fin:
        for line in fin:
            # skip blank lines (e.g. trailing newline at EOF) which would
            # otherwise raise IndexError on block[1]
            if not line.strip():
                continue
            block = line.split(' ')  # delimiter
            # float() tolerates the trailing newline, no strip needed
            tmp_token = bench_data.Token_Data(block[0], block[1], float(block[2]))
            tokens_list.append(tmp_token)
    return tokens_list
def nocontext_sim(tokens, trained_model, type_range, metric='avgsim'):
    """Score each word pair in *tokens* with a context-free similarity metric.

    Collects the synsets of both words, filters them against the model, and
    applies the chosen synset-level metric.

    Parameters:
        tokens: iterable of objects exposing word1/word2.
        trained_model: embedding model used to look up synset vectors.
        type_range: passed through to the metric functions.
        metric: one of 'maxsim', 'avgsim', 'globalsim'.

    Returns:
        list of bench_data.Token_Data(word_a, word_b, sim_value).

    Raises:
        ValueError: if *metric* is not a recognized metric name.
        (Previously an unknown metric crashed later with UnboundLocalError.)
    """
    moretokens = []
    for token in tokens:
        word_a = token.word1
        word_b = token.word2
        synsets_a = sp.synset_all(word_a)
        synsets_b = sp.synset_all(word_b)
        # keep only synsets that exist in the model
        vec_syna = sp.validate_synsets_model(word_a, synsets_a, trained_model)
        vec_synb = sp.validate_synsets_model(word_b, synsets_b, trained_model)
        if metric == 'maxsim':
            sim_value = maxSim(vec_syna, vec_synb, type_range)
        elif metric == 'avgsim':
            sim_value = avgSim(vec_syna, vec_synb, type_range)
        elif metric == 'globalsim':
            sim_value = globalSim(vec_syna, vec_synb, type_range)
        else:
            # fail fast instead of hitting UnboundLocalError below
            raise ValueError("unknown metric: %r" % metric)
        token_prime = bench_data.Token_Data(word_a, word_b, sim_value)
        moretokens.append(token_prime)
    return moretokens