示例#1
0
class KNearestService(object):
    def __init__(self, actual_vspace, transformed_vspace):

        self.wvspace = WordVecSpaceMem(actual_vspace)
        self.t_vspace = DiskArray(transformed_vspace,
                                  dtype=[('vec', np.float32, 300)])

    def k_nearest(self,
                  word: str,
                  k: int = 10,
                  metric: str = 'angular') -> dict:
        index = self.wvspace.get_word_index(word)

        result = self.wvspace.get_nearest(index, k, metric=metric)
        actual_results = self.wvspace.get_word_at_indices(result)

        vec = self.t_vspace['vec'][index].reshape(1, 300)
        vecs = self.t_vspace['vec']

        if metric == 'angular':
            metric = 'cosine'

        dist = distance.cdist(vec, vecs, metric)

        dist = pd.Series(dist[0])
        res = dist.nsmallest(k).keys()
        trans_results = self.wvspace.get_word_at_indices(list(res))

        recall = len(set(actual_results) & set(trans_results)) / k

        data = dict(vspace_results=actual_results,
                    T_vspace_results=trans_results,
                    recall=recall)
        return data
示例#2
0
def k_nearest(wvspace, disk_f, word):
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])
    index = wv.get_word_index(word)

    result = wv.get_nearest(index, k=10)
    print(wv.get_word_at_indices(result))

    vec = da['vec'][index].reshape(1, 300)
    vecs = da['vec']

    #dist = distance.cdist(vec, vecs, 'cosine')
    dist = distance.cdist(vec, vecs, 'euclidean')
    #dist = np.dot(vec, vecs.T)

    dist = pd.Series(dist[0])
    res = dist.nsmallest(10).keys()
    print('\n')
    print(wv.get_word_at_indices(list(res)))
示例#3
0
def k_nearest(wvspace, disk_f, words, metric, image_name):
    f = open(words)
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])

    vecs = da['vec']

    psame = []
    pnsame = []
    for line in f:
        words = json.loads(line.strip())
        index1 = wv.get_word_index(words[0])
        index2 = wv.get_word_index(words[1])

        if 'clinicaltrials' in words[0] or 'clinicaltrials' in words[1]:
            continue

        vec1 = vecs[index1].reshape(1, 300)
        vec2 = vecs[index2].reshape(1, 300)

        if metric == 'cosine':
           vspace_dist = wv.get_distance(words[0], words[1])
           tvspace_dist = distance.cosine(vec1, vec2)
        else:
            vspace_dist = wv.get_distance(words[0], words[1], metric='euclidean')
            tvspace_dist = distance.euclidean(vec1, vec2)

        if words[2] == 0:
            psame.append(tvspace_dist)
        else:
            pnsame.append(tvspace_dist)

    dm = (np.std(psame) + np.std(pnsame)) / 2
    nm = abs(np.mean(psame) - np.mean(pnsame))

    d = nm / dm
    print('the cohens D distance is', d)

    plt.hist(psame, bins=50, alpha=0.5, label='same points')
    plt.hist(pnsame, bins=50, alpha=0.5, label='not same points')
    plt.legend(loc='upper right')
    plt.savefig(image_name)