Пример #1
0
def calculate_sense_similarity(word, wiki_tag_file, keydict_path, similarity_function='jaccard', threshold=0):
    """Print the pairs of senses of `word` whose tag sets are similar enough.

    For every ordered pair of distinct senses returned by
    `get_sense_idx_map(keydict_path, word)`, the tag collections looked up in
    the tag map from `get_wiki_tag_and_link_maps(wiki_tag_file)` are compared.
    A tab-separated line `word<TAB>sense1-sense2<TAB>score` is printed when the
    score is strictly greater than `threshold`. Each unordered pair is
    therefore reported twice, once per direction.

    Args:
        word: Target word whose senses are compared.
        wiki_tag_file: Argument forwarded to `get_wiki_tag_and_link_maps`.
        keydict_path: Argument forwarded to `get_sense_idx_map`.
        similarity_function: Name of the metric. Only 'jaccard' is
            implemented (delegating to `calculate_jaccard_sim`); any other
            value produces no output.
        threshold: Scores must exceed this value to be printed.

    Raises:
        KeyError: If a sense of `word` has no entry in the tag map.
    """
    # Only the tag map is needed here; the link map is ignored.
    tag_map, _ = get_wiki_tag_and_link_maps(wiki_tag_file)
    sense_idx_map = get_sense_idx_map(keydict_path, word)

    # Resolve the metric once. The original left the score as None for an
    # unknown metric and relied on Python 2's `None > number` being False;
    # that comparison raises TypeError on Python 3, so bail out explicitly.
    if similarity_function != 'jaccard':
        return
    score_pair = calculate_jaccard_sim

    # Build each sense's tag set once instead of once per pair.
    tag_sets = {sense: set(tag_map[sense]) for sense in sense_idx_map}

    for sense1 in sense_idx_map:
        tags1 = tag_sets[sense1]
        for sense2 in sense_idx_map:
            if sense1 == sense2:
                continue
            sim_score = score_pair(tags1, tag_sets[sense2])
            if sim_score > threshold:
                # Single-argument print() behaves the same on Python 2 and 3.
                print("{}\t{}-{}\t{}".format(word, sense1, sense2, sim_score))
Пример #2
0
def run():
    """Write a table of nearest neighbours for every sense of the given words.

    Command-line arguments (read from `sys.argv`):
        1: model identifier passed to `load_model`.
        2: argument passed to `get_wiki_tag_and_link_maps`.
        3: key dictionary path passed to `get_sense_idx_map`.
        4: output file name (written as UTF-8).
        5+: the target words.

    For each word, every sense that can be resolved gets its own column
    holding the `model.most_similar` results (formatted as "first second"),
    a comma-joined tag list, and the sense's link followed by the average of
    the second elements of the results (presumably similarity scores --
    confirm against the model API). Column 0 carries the word itself as a
    label. Senses that raise KeyError are skipped.
    """
    model_id = sys.argv[1]
    tag_map, link_map = get_wiki_tag_and_link_maps(sys.argv[2])
    keydict_path = sys.argv[3]
    output_fn = sys.argv[4]
    words = sys.argv[5:]

    model = load_model(model_id)
    number_of_field = 15
    topn = 10
    # Each word occupies: `topn` neighbour rows + one tag row + one link row.
    rows_per_word = topn + 2
    label_row = 6  # row (within a word's group) that shows the word itself
    table = [[] for _ in range(number_of_field)]

    for word in words:
        sense_idx_map = get_sense_idx_map(keydict_path, word)
        # All columns have the same length at this point; remember it so the
        # whole group can be padded to a common height afterwards.
        base = len(table[0])
        col = 1
        # NOTE(review): more than number_of_field - 1 usable senses will
        # index past the end of `table` (IndexError), as in the original.
        for sense, idx in sense_idx_map.items():
            try:
                similar_words = model.most_similar(positive=[idx], topn=topn)
                cells = [u"%s %s" % (t[0], t[1]) for t in similar_words]
                cells.append(u",".join(tag_map[sense]))
                similarities = [t[1] for t in similar_words]
                avg_sim = sum(similarities) / len(similarities)
                cells.append(u"%s %f" % (link_map[sense], avg_sim))
            except KeyError:
                # Skip senses missing from the model or the maps. Nothing
                # has been written to the table yet, so a failure cannot
                # leave a half-filled column behind.
                continue
            table[col].extend(cells)
            col += 1

        # Pad every sense column to the full group height. This also covers
        # words with no usable sense and senses with fewer than `topn`
        # neighbours, which previously left the columns misaligned.
        target = base + rows_per_word
        for column in table[1:]:
            column.extend([""] * (target - len(column)))

        for row in range(rows_per_word):
            table[0].append(word if row == label_row else "")

        # Separator row between consecutive words.
        for column in table:
            column.append(" ")

    # Columns are only empty when no words were given.
    table_filtered = [column for column in table if column]

    headers = ["target_word"]
    headers.extend(["sense-%d" % n for n in range(1, number_of_field)])

    with codecs.open(output_fn, "w", "utf-8") as f:
        # list() keeps the rows materialised on Python 3, where zip is lazy.
        rows = list(zip(*table_filtered))
        f.write(tabulate.tabulate(rows, tablefmt="simple", headers=headers))
Пример #3
0
def run():
    """Dump, per target word, the nearest neighbours of each of its senses.

    Reads its configuration from `sys.argv`:
        argv[1]: model identifier handed to `load_model`.
        argv[2]: argument handed to `get_wiki_tag_and_link_maps`.
        argv[3]: key dictionary path handed to `get_sense_idx_map`.
        argv[4]: name of the UTF-8 output file.
        argv[5:]: target words.

    The output is a `tabulate` "simple" table. Column 0 labels each group of
    rows with its word; columns 1..14 hold one sense each: the
    `model.most_similar` results, the sense's comma-joined tags, and the
    sense's link with the mean of the results' second elements (presumably
    similarity scores -- confirm against the model API). A sense is skipped
    when any of its lookups raises KeyError.
    """
    model_id = sys.argv[1]
    tag_map, link_map = get_wiki_tag_and_link_maps(sys.argv[2])
    keydict_path = sys.argv[3]
    output_fn = sys.argv[4]
    words = sys.argv[5:]

    model = load_model(model_id)
    number_of_field = 15
    neighbour_count = 10
    # neighbours + tag row + link row
    group_height = neighbour_count + 2
    word_row = 6  # offset inside a group at which the word label is shown
    table = [[] for _ in range(number_of_field)]

    for word in words:
        sense_idx_map = get_sense_idx_map(keydict_path, word)
        # Columns are kept equally long between words; this is that length.
        start = len(table[0])
        i = 1
        # NOTE(review): a word with more than number_of_field - 1 usable
        # senses still overflows `table` (IndexError), as in the original.
        for sense, idx in sense_idx_map.items():
            try:
                similar_words = model.most_similar(positive=[idx],
                                                   topn=neighbour_count)
                pending = [u"%s %s" % (t[0], t[1]) for t in similar_words]
                pending.append(u",".join(tag_map[sense]))
                similarities = [t[1] for t in similar_words]
                avg_sim = sum(similarities) / len(similarities)
                pending.append(u"%s %f" % (link_map[sense], avg_sim))
            except KeyError:
                # Collect cells in `pending` first so that a lookup failing
                # midway does not leave partial data in the shared column.
                continue
            table[i].extend(pending)
            i += 1

        # Bring all sense columns up to the same height. The original padded
        # only the unused columns, relative to column 1, which broke the
        # alignment for words without any usable sense.
        end = start + group_height
        for column in table[1:]:
            column.extend([""] * (end - len(column)))

        for offset in range(group_height):
            table[0].append(word if offset == word_row else "")

        # Blank-looking separator row after each word.
        for column in table:
            column.append(" ")

    # Only relevant when no words were supplied: drop empty columns.
    table_filtered = [column for column in table if column]

    headers = ["target_word"]
    headers.extend(["sense-%d" % n for n in range(1, number_of_field)])

    with codecs.open(output_fn, 'w', 'utf-8') as f:
        # Materialise the transposed rows; zip is lazy on Python 3.
        rows = list(zip(*table_filtered))
        f.write(tabulate.tabulate(rows, tablefmt="simple", headers=headers))
Пример #4
0
def calculate_sense_similarity(word,
                               wiki_tag_file,
                               keydict_path,
                               similarity_function='jaccard',
                               threshold=0):
    """Report sense pairs of `word` whose tag similarity exceeds `threshold`.

    The senses come from `get_sense_idx_map(keydict_path, word)` and their
    tags from the tag map returned by
    `get_wiki_tag_and_link_maps(wiki_tag_file)`. Every ordered pair of
    distinct senses is scored, so each unordered pair can be printed twice
    (once in each direction). Output lines have the form
    `word<TAB>sense1-sense2<TAB>score`.

    Args:
        word: Target word whose senses are compared.
        wiki_tag_file: Argument forwarded to `get_wiki_tag_and_link_maps`.
        keydict_path: Argument forwarded to `get_sense_idx_map`.
        similarity_function: Metric name; only 'jaccard' is implemented
            (via `calculate_jaccard_sim`). Other values print nothing.
        threshold: A pair is printed only if its score is strictly greater.

    Raises:
        KeyError: If a sense of `word` is missing from the tag map.
    """
    # The link map is not used by this function.
    tag_map, _ = get_wiki_tag_and_link_maps(wiki_tag_file)
    sense_idx_map = get_sense_idx_map(keydict_path, word)

    # An unknown metric used to leave the score as None and depend on
    # Python 2 ordering None below every number; make the no-op explicit so
    # the function does not raise TypeError on Python 3.
    if similarity_function != 'jaccard':
        return
    metric = calculate_jaccard_sim

    # Convert each sense's tags to a set once, not once per pair.
    tags_by_sense = {sense: set(tag_map[sense]) for sense in sense_idx_map}

    for sense1 in sense_idx_map:
        for sense2 in sense_idx_map:
            if sense1 == sense2:
                continue
            sim_score = metric(tags_by_sense[sense1], tags_by_sense[sense2])
            if sim_score > threshold:
                # Parenthesised single-argument print is valid on 2 and 3.
                print("{}\t{}-{}\t{}".format(word, sense1, sense2,
                                             sim_score))