def calculate_sense_similarity(word, wiki_tag_file, keydict_path, similarity_function='jaccard', threshold=0):
    """Print similarity scores between every ordered pair of senses of *word*.

    For each ordered pair (sense1, sense2) of distinct senses, computes the
    Jaccard similarity of their wiki-tag sets and prints a tab-separated line
    ``word<TAB>sense1-sense2<TAB>score`` for pairs scoring above *threshold*.

    Parameters
    ----------
    word : str
        Target word whose senses are compared.
    wiki_tag_file : str
        Path handed to get_wiki_tag_and_link_maps(); supplies per-sense tags.
    keydict_path : str
        Path handed to get_sense_idx_map(); maps senses of *word* to indices.
    similarity_function : str
        Only 'jaccard' is supported; any other value prints nothing
        (the original compared ``None > threshold``, which raises a
        TypeError on Python 3 -- here the pair is skipped explicitly).
    threshold : number
        Pairs scoring at or below this value are not printed.
    """
    tag_map, link_map = get_wiki_tag_and_link_maps(wiki_tag_file)
    sense_idx_map = get_sense_idx_map(keydict_path, word)
    # .items()-free key iteration, range(), and print() keep this runnable on
    # both Python 2 and 3 (the original used iteritems() and the Python-2-only
    # print statement). The sense indices were never used, so iterate keys.
    for sense1 in sense_idx_map:
        tags1 = set(tag_map[sense1])
        for sense2 in sense_idx_map:
            if sense1 == sense2:
                continue
            if similarity_function != 'jaccard':
                # Unknown similarity function: nothing to compute for this
                # pair, and no tag set needs to be built.
                continue
            sim_score = calculate_jaccard_sim(tags1, set(tag_map[sense2]))
            if sim_score > threshold:
                print("{}\t{}-{}\t{}".format(word, sense1, sense2, sim_score))
def run():
    """Command-line entry point: build a per-sense nearest-neighbour table.

    For every target word given on the command line, looks up its senses,
    queries the model for the 10 nearest neighbours of each sense, and writes
    the result to *output_fn* as a tabulate "simple" table.

    Expected argv layout:
        model_id wiki_tag_file keydict_path output_fn word [word ...]
    """
    model_id = sys.argv[1]
    tag_map, link_map = get_wiki_tag_and_link_maps(sys.argv[2])
    keydict_path = sys.argv[3]
    output_fn = sys.argv[4]
    words = sys.argv[5:]
    model = load_model(model_id)

    # Column 0 carries the target-word labels; columns 1..14 carry one sense
    # each. Every sense contributes 12 rows: 10 neighbours + tags + link/avg.
    number_of_field = 15
    table = [[] for _ in range(number_of_field)]

    for word in words:
        sense_idx_map = get_sense_idx_map(keydict_path, word)
        i = 1  # next free sense column for this word
        for sense, idx in sense_idx_map.items():
            # Compute every row value BEFORE touching table[i]: the original
            # wrapped the appends themselves in the try, so a KeyError raised
            # by tag_map[sense] or link_map[sense] after the extend() left a
            # partially filled column and a skipped counter, corrupting the
            # rectangular layout. A KeyError from any lookup still means
            # "skip this sense", as before.
            try:
                similar_words = model.most_similar(positive=[idx], topn=10)
                tag_row = u",".join(tag_map[sense])
                similarities = [t[1] for t in similar_words]
                avg_sim = sum(similarities) / len(similarities)
                link_row = u"%s %f" % (link_map[sense], avg_sim)
            except KeyError:
                continue
            table[i].extend(u"%s %s" % (t[0], t[1]) for t in similar_words)
            table[i].append(tag_row)
            table[i].append(link_row)
            i += 1
        # Pad the unused sense columns so every column stays as long as
        # column 1 (12 rows per filled sense).
        for j in range(i, number_of_field):
            diff = len(table[1]) - len(table[j])
            table[j].extend([""] * diff)
        # Column 0: 12 blank rows with the word label in the middle (row 6).
        # (The original reused `i` as this loop's index, shadowing the sense
        # counter; renamed to avoid the trap.)
        for row in range(12):
            table[0].append(word if row == 6 else "")
        # Blank spacer row between words.
        for column in table:
            column.append(" ")

    # Drop columns that never received any content so tabulate does not emit
    # empty sense columns. NOTE(review): headers always lists all 15 names
    # while the filtered table may have fewer columns -- presumably tabulate
    # tolerates extra headers here; confirm against the tabulate version used.
    table_filtered = [column for column in table if column]
    headers = ["target_word"]
    headers.extend("sense-%d" % n for n in range(1, number_of_field))
    with codecs.open(output_fn, "w", "utf-8") as f:
        t = tabulate.tabulate(zip(*table_filtered), tablefmt="simple",
                              headers=headers)
        f.write(t)
def run():
    """Write a nearest-neighbour table (one column per word sense) for each
    word named on the command line, using tabulate's "simple" format.

    Expected argv: model_id wiki_tag_file keydict_path output_fn word [word ...]

    NOTE(review): this redefines run() -- a nearly identical definition
    appears earlier in this file; at import time the later one wins.
    """
    model_id = sys.argv[1]
    tag_map, link_map = get_wiki_tag_and_link_maps(sys.argv[2])
    keydict_path = sys.argv[3]
    output_fn = sys.argv[4]
    words = sys.argv[5:]
    model = load_model(model_id)

    # 15 columns: column 0 for the word label, columns 1..14 for senses.
    number_of_field = 15
    table = [[] for _ in xrange(number_of_field)]

    for word in words:
        sense_idx_map = get_sense_idx_map(keydict_path, word)
        col = 1  # next sense column to fill for this word
        for sense, idx in sense_idx_map.iteritems():
            try:
                # 10 neighbour rows, then a tag row, then a link/avg row.
                neighbours = model.most_similar(positive=[idx], topn=10)
                table[col].extend([u"%s %s" % (t[0], t[1]) for t in neighbours])
                table[col].append(u",".join(tag_map[sense]))
                scores = [t[1] for t in neighbours]
                table[col].append(u"%s %f" % (link_map[sense], sum(scores) / len(scores)))
                col += 1
            except KeyError:
                # Sense unknown to the model or the tag/link maps -- skip it.
                pass
        # Bring the untouched sense columns up to the length of column 1.
        for j in xrange(col, number_of_field):
            for _ in xrange(len(table[1]) - len(table[j])):
                table[j].append("")
        # Twelve label-column rows per word, with the word itself at row 6.
        for row in xrange(12):
            table[0].append(word if row == 6 else "")
        # Spacer row separating consecutive words.
        for column in table:
            column.append(" ")

    table_filtered = [column for column in table if len(column) != 0]
    headers = ["target_word"] + ["sense-%d" % n for n in xrange(1, number_of_field)]
    with codecs.open(output_fn, 'w', 'utf-8') as f:
        f.write(tabulate.tabulate(zip(*table_filtered), tablefmt="simple",
                                  headers=headers))