Python load_word2vec 예제들, distsim.load_word2vec Python 예제들

예제 #1

0

파일 보기

def main():
##    start = time.time()
    word_to_vec_dict = distsim.load_word2vec("nyt_word2vec.4k")
##    word_to_vec_dict = distsim.load_word2vec("glove.6B.300d.txt")
    line_count = 0
    total,o_b,f_b,t_b = 0,0,0,0
    best_v = []
    with open("word-test.v3.txt") as infile:
        for line in infile:
            line_count += 1
            if line_count == 1: continue
            tmpv = line.strip(' \n\t').split()
            if tmpv[0] != ':':
                best_v = sim_compare(word_to_vec_dict,tmpv[0],tmpv[1],tmpv[2],tmpv[3])
                o_b += best_v[0]
                f_b += best_v[1]
                t_b += best_v[2]
                total += 1
            else:
                if(line_count != 2):
                    print '1-best:',o_b/total
                    print '5-best:',f_b/total
                    print '10-best:',t_b/total
                    print ' '
                total,o_b,f_b,t_b = 0,0,0,0
                best_v = []
                print 'For analogy:',tmpv[1]
    print '1-best:',o_b/total
    print '5-best:',f_b/total
    print '10-best:',t_b/total

예제 #2

0

파일 보기

def table(lines):
    data = []
    for line in lines:
        data.append(line.split(' '))
    data = [data]
    t = PrettyTable(['Class', '1-best', '5-best', '10-best'])
    title = ['adversarial1', 'adversarial2']
    p = [1, 5, 10]
    for num in range(len(data)):
        total = len(data[num])
        count = [0, 0, 0]
        for row in data[num]:
            word_to_vec_dict = distsim.load_word2vec("nyt_word2vec.4k")
            w1 = word_to_vec_dict[row[0]]
            w2 = word_to_vec_dict[row[1]]
            w4 = word_to_vec_dict[row[3]]
            ret = distsim.show_nearest(word_to_vec_dict, w1 - w2 + w4,
                                       set([row[0], row[1], row[3]]),
                                       distsim.cossim_dense)
            true = row[2]
            for i in range(len(p)):
                l = [j[0] for j in ret[:p[i]]]
                if true in l:
                    count[i] += 1
        t.add_row([
            title[num], count[0] / float(total), count[1] / float(total),
            count[2] / float(total)
        ])
    print t

예제 #3

0

파일 보기

파일: q7.py 프로젝트: marshalljacobs12/Natural-Language-Processing

def calculateAccuracies(solutions):
    """Compute per-group analogy hit ratios and one sample miss per group.

    ``solutions`` maps a group key to a sized collection of analogies.  For
    each analogy, items 0, 1 and 3 are looked up in the word-vector
    dictionary and combined as ``w1 - w2 + w4``; item 2 is the expected
    word, which is searched for in the result of ``distsim.show_nearest``.

    Returns ``(accuracies, errors)``:
      * ``accuracies[key]`` is a 3-tuple of ratios over the group size:
        expected word at position 0, at a position below 5, and at any
        position of the returned list (presumably the ten best -- confirm
        against distsim.show_nearest).
      * ``errors[key]`` is ``''`` or, for the first analogy whose expected
        word was not returned at all, the tuple
        ``(item0, item1, item2, item3, first returned word)``.
    """
    word_to_vec_dict = distsim.load_word2vec("nyt_word2vec.4k")
    accuracies = {}
    errors = {}

    for group, analogies in solutions.iteritems():
        top1 = top5 = top_any = 0.0
        errors[group] = ''
        size = len(analogies)
        error_recorded = False

        for analogy in analogies:
            query = (word_to_vec_dict[analogy[0]] -
                     word_to_vec_dict[analogy[1]] +
                     word_to_vec_dict[analogy[3]])
            skipped = set([str(analogy[0]), str(analogy[1]), str(analogy[3])])
            ret = distsim.show_nearest(word_to_vec_dict, query, skipped,
                                       distsim.cossim_dense)

            found = False
            for rank, (candidate, _) in enumerate(ret):
                if candidate == analogy[2]:
                    found = True
                    # A hit at rank r counts for every cutoff above r.
                    top_any += 1.0
                    if rank < 5:
                        top5 += 1.0
                    if rank == 0:
                        top1 += 1.0

            if not (found or error_recorded):
                errors[group] = (analogy[0], analogy[1], analogy[2],
                                 analogy[3], ret[0][0])
                error_recorded = True

        accuracies[group] = (top1 / size, top5 / size, top_any / size)

    return (accuracies, errors)

예제 #4

0

파일 보기

#!/usr/bin/env python
import distsim
word_to_vec_dict = distsim.load_word2vec("nyt_word2vec.4k")
###Provide your answer below

###Answer examples
print "jack", distsim.show_nearest(word_to_vec_dict, word_to_vec_dict['jack'],
                                   set(['jack']), distsim.cossim_dense)
print "london", distsim.show_nearest(word_to_vec_dict,
                                     word_to_vec_dict['london'],
                                     set(['london']), distsim.cossim_dense)
print "month", distsim.show_nearest(word_to_vec_dict,
                                    word_to_vec_dict['month'], set(['month']),
                                    distsim.cossim_dense)
print "attack", distsim.show_nearest(word_to_vec_dict,
                                     word_to_vec_dict['attack'],
                                     set(['attack']), distsim.cossim_dense)
print "happy", distsim.show_nearest(word_to_vec_dict,
                                    word_to_vec_dict['happy'], set(['happy']),
                                    distsim.cossim_dense)
print "jail", distsim.show_nearest(word_to_vec_dict, word_to_vec_dict['jail'],
                                   set(['jail']), distsim.cossim_dense)
print "fantastic", distsim.show_nearest(word_to_vec_dict,
                                        word_to_vec_dict['fantastic'],
                                        set(['fantastic']),
                                        distsim.cossim_dense)

예제 #5

0

파일 보기

파일: q7.py 프로젝트: ZhaoYangbjtu/Natural_Language_Processing

import distsim

#word_to_vec_dict = distsim.load_word2vec("nyt_word2vec.4k")
word_to_vec_dict = distsim.load_word2vec("glove.6B.50d.txt")
#word_to_ccdict = distsim.load_contexts("nytcounts.4k")

# L collects [group number, raw line] for every line of the test file;
# relation collects the text of each ':' header line with the newline and
# colons removed.
L = []
relation = []
group = 0
with open('word-test.v3.txt', "r") as f_in:
    for line in f_in:
        if line.startswith(':'):
            # A header line opens the next group and belongs to it.
            group += 1
            relation.append(line.strip('\n').replace(':', ''))
        L.append([group, line])

Accuracy_Best1, Accuracy_Best5, Accuracy_Best10 = [], [], []

for g in range(1, 9):
    analogy = []
    for i in range(0, len(L)):
        if L[i][0] == g and len(L[i][1].split()) == 4:
            analogy.append(L[i][1].split())

    best1 = best5 = best10 = 0

    for a in range(0, len(analogy)):

예제 #6

0

파일 보기

#!/usr/bin/env python
#from __future__ import division
import distsim
from collections import defaultdict

# Input file, opened for reading; it is iterated line by line below.
# NOTE(review): no close() is visible in this excerpt -- confirm the handle
# is released further down.
file = open('q8.txt', 'r')
# Word-vector dictionary used for every lookup in the loop below.
word_to_vec_dict = distsim.load_word2vec("wiki-news-300d-1M.vec4")
# defaultdict whose missing keys start as empty lists; it is filled past
# the visible part of this excerpt.
match_position = defaultdict(list)
# Category names in the order their ':' header lines are read.
catorder = []

for line in file:
    line = line.strip().split()
    if line[0] == '//':
        continue
    if line[0] == ':':
        cat = line[1]
        catorder.append(cat)
    else:
        word0 = word_to_vec_dict[line[0]]
        word1 = word_to_vec_dict[line[1]]
        word3 = word_to_vec_dict[line[3]]
        ret = distsim.show_nearest(word_to_vec_dict, word0 - word1 + word3,
                                   set([line[0], line[1], line[3]]),
                                   distsim.cossim_dense)
        count = 0
        while (count < len(ret)):
            if ret[count][0] == line[2]:
                break
            else:
                count += 1
        if count != len(ret):

예제 #7

0

파일 보기

#!/usr/bin/env python
import distsim
from collections import defaultdict

# Analogy data file; each line is a '//' comment, a ': <category>' header,
# or four space-separated words.
f = open('q8_data.txt', 'r')
word_to_vec_dict = distsim.load_word2vec("glove.twitter.27B.100d.txt")
category_list = []
category_num_dict = defaultdict(list)
for line in f:
    line = line.strip('\n')
    # startswith() is required here: line[0] is a single character and can
    # never equal the two-character '//', so comment lines were not being
    # skipped.  It also tolerates empty lines, where line[0] raised
    # IndexError.
    if line.startswith('//'):
        continue
    elif line.startswith(':'):
        category = line.split(' ')[1]
        category_list.append(category)
    word = line.strip().split(' ')
    if len(word) == 4:
        word1_dict = word_to_vec_dict[word[0]]
        word2_dict = word_to_vec_dict[word[1]]
        word4_dict = word_to_vec_dict[word[3]]
        ret = distsim.show_nearest(word_to_vec_dict,
                                   word1_dict - word2_dict + word4_dict,
                                   set([word[0], word[1],
                                        word[3]]), distsim.cossim_dense)
        count = 0
        find = False
        # NOTE(review): the excerpt ends inside this loop; the branch that
        # handles a non-matching entry is not visible here.
        while (count < len(ret)):
            if ret[count][0] == word[2]:
                count += 1
                find = True
                break

예제 #8

0

파일 보기

def q7_answer(reasoningData):
    def _get_n_best_count(analogy_returnedVectors, n_best):
        #count of words in n_best range - if they also hold right value
        count = 0
        for correct_retVec_tpl in analogy_returnedVectors:
            if n_best > len(correct_retVec_tpl[1]):
                if correct_retVec_tpl[0] in correct_retVec_tpl[
                        1][:len(correct_retVec_tpl[1])]:
                    count += 1
            elif correct_retVec_tpl[0] in correct_retVec_tpl[1][:n_best]:
                count += 1

        return float(count)

    word_to_vec_dict = distsim.load_word2vec("nyt_word2vec.4k")

    relation_group = {}
    result_comp = {}
    col_len = max([len(x) for x in reasoningData])

    print '\n1 NEGATIVE EXAMPLE FROM EACH GROUP( Element3: Incorrect Prediction / Correct Value):\n'
    for groupName, list_of_analogies in reasoningData.iteritems():
        relation_group[groupName] = []
        result_comp.setdefault(groupName, {'matched': 0, 'unmatched': 0})
        incorrect_pred_example_shown = False

        for analogy in list_of_analogies:

            returned_vectors = distsim.show_nearest(
                word_to_vec_dict,
                word_to_vec_dict[analogy[0]] - word_to_vec_dict[analogy[1]] +
                word_to_vec_dict[analogy[3]],  # <-THE CORE OF RESASONING
                set([analogy[0], analogy[1], analogy[3]]),
                distsim.cossim_dense)
            returned_vectors = [x[0] for x in returned_vectors]
            relation_group[groupName].append((analogy[2], returned_vectors))

            if analogy[2] == returned_vectors[0]:
                result_comp[groupName]['matched'] += 1
            else:
                result_comp[groupName]['unmatched'] += 1
                if not incorrect_pred_example_shown:
                    print groupName.ljust(col_len), ' Predicted / Actual : ',\
                        analogy[0]+" : "+ analogy[1]+ " :: "+ returned_vectors[0]+'/'+analogy[2]+' : '+ analogy[3]
                incorrect_pred_example_shown = True

    del word_to_vec_dict
    ########################### Print analysis

    relation_kind_accuracy = []
    for groupName, match_unmatch_dict in result_comp.iteritems():
        relation_kind_accuracy.append([
            groupName,
            round(
                float(match_unmatch_dict['matched']) /
                (match_unmatch_dict['matched'] +
                 match_unmatch_dict['unmatched']), 3)
        ])

    relation_kind_accuracy = sorted(relation_kind_accuracy,
                                    key=lambda x: x[1],
                                    reverse=True)
    print '\nGROUPS SORTED BY REASONING ACCURACY:'
    for groupName, accuracy in relation_kind_accuracy:
        print groupName.ljust(col_len), ' Accuracy:', accuracy
    print '\n'

    #result_table = []
    print ''.ljust(col_len), 'TOP_1', '\t', 'TOP_5', '\t', 'TOP_10'
    for groupName, analogy_returnedVectors in relation_group.iteritems():
        result_row = [groupName]

        for n_best in [1, 5, 10]:

            top_n = _get_n_best_count(analogy_returnedVectors, n_best)
            result_row.append(round(top_n / len(analogy_returnedVectors), 3))

        #result_table.append(result_row)
        print result_row[0].ljust(
            col_len), result_row[1], '\t', result_row[2], '\t', result_row[3]
    print '\n'

예제 #9

0

파일 보기

파일: extra.py 프로젝트: edison0829/course_codes

        cur.append(line.split(' '))
data.append(cur)
data = data[1:]
t = PrettyTable(['Class', '1-best', '5-best', '10-best'])
#     capi- tal 0, currency 1, city-in-state 2, family 3, adjective-to-adverb 4, comparative 5, superlative 6, and nationality-adjective 7.
title = [
    'capital', 'currency', 'city-in-state', 'family', 'adjective-to-adverb',
    'comparative', 'superlative', 'nationality-adjective'
]
p = [1, 5, 10]
for num in range(len(data)):
    total = len(data[num])
    count = [0, 0, 0]
    e = []
    for row in data[num]:
        word_to_vec_dict = distsim.load_word2vec("deps.words")
        w1 = word_to_vec_dict[row[0]]
        w2 = word_to_vec_dict[row[1]]
        w4 = word_to_vec_dict[row[3]]
        ret = distsim.show_nearest(word_to_vec_dict, w1 - w2 + w4,
                                   set([row[0], row[1], row[3]]),
                                   distsim.cossim_dense)
        true = row[2]
        for i in range(len(p)):
            l = [j[0] for j in ret[:p[i]]]
            if true in l:
                count[i] += 1
            elif p[i] == 1:
                e.append([ret[0][0], true])
    t.add_row([
        title[num], count[0] / float(total), count[1] / float(total),