class KNearestService:
    """Compare nearest-neighbour lookups between two vector stores.

    ``wvspace`` is a ``WordVecSpaceMem`` built from ``actual_vspace`` and is
    used for word <-> index translation and for the reference neighbour
    query.  ``t_vspace`` is a ``DiskArray`` built from ``transformed_vspace``
    whose records expose a 300-element float32 ``vec`` field.
    """

    def __init__(self, actual_vspace, transformed_vspace):
        """Open both vector stores.

        Args:
            actual_vspace: Forwarded unchanged to ``WordVecSpaceMem``.
            transformed_vspace: Forwarded unchanged to ``DiskArray``.
        """
        record_dtype = [('vec', np.float32, 300)]
        self.wvspace = WordVecSpaceMem(actual_vspace)
        self.t_vspace = DiskArray(transformed_vspace, dtype=record_dtype)

    def k_nearest(self, word: str, k: int = 10, metric: str = 'angular') -> dict:
        """Return the ``k`` nearest words from both stores and their overlap.

        Args:
            word: Query word; resolved to a row index through ``wvspace``.
            k: Number of neighbours requested from each store.
            metric: Metric handed to ``wvspace.get_nearest``.  For the
                ``t_vspace`` search ``'angular'`` is replaced by
                ``'cosine'``; any other value is passed to ``cdist`` as is.

        Returns:
            A dict with ``vspace_results`` (words from ``wvspace``),
            ``T_vspace_results`` (words from the ``t_vspace`` search) and
            ``recall`` (size of the intersection of the two divided by ``k``).
        """
        word_idx = self.wvspace.get_word_index(word)

        # Reference neighbours, as reported by the word-vector space itself.
        neighbour_ids = self.wvspace.get_nearest(word_idx, k, metric=metric)
        actual_results = self.wvspace.get_word_at_indices(neighbour_ids)

        # Brute-force search: distance from the query row to every stored row.
        query = self.t_vspace['vec'][word_idx].reshape(1, 300)
        candidates = self.t_vspace['vec']
        cdist_metric = 'cosine' if metric == 'angular' else metric
        distances = pd.Series(distance.cdist(query, candidates, cdist_metric)[0])
        closest_ids = list(distances.nsmallest(k).keys())
        trans_results = self.wvspace.get_word_at_indices(closest_ids)

        shared = set(actual_results).intersection(set(trans_results))
        return {
            'vspace_results': actual_results,
            'T_vspace_results': trans_results,
            'recall': len(shared) / k,
        }
def k_nearest(wvspace, disk_f, word):
    """Print the ten nearest words to ``word`` from two vector stores.

    The first printed list comes from ``WordVecSpaceMem.get_nearest``.  The
    second comes from a brute-force Euclidean search over the ``vec`` field
    of the ``DiskArray`` opened from ``disk_f``; the resulting row indices
    are translated back to words through the same ``WordVecSpaceMem``.

    Args:
        wvspace: Forwarded unchanged to ``WordVecSpaceMem``.
        disk_f: Forwarded unchanged to ``DiskArray``; records are read as a
            single 300-element float32 ``vec`` field.
        word: Query word.

    Returns:
        None.  Both result lists are written to stdout.
    """
    space = WordVecSpaceMem(wvspace)
    disk_vecs = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])

    word_idx = space.get_word_index(word)
    neighbour_ids = space.get_nearest(word_idx, k=10)
    print(space.get_word_at_indices(neighbour_ids))

    # Distance from the query row to every row stored in the disk array.
    query = disk_vecs['vec'][word_idx].reshape(1, 300)
    candidates = disk_vecs['vec']
    distances = pd.Series(distance.cdist(query, candidates, 'euclidean')[0])
    closest_ids = list(distances.nsmallest(10).keys())

    print('\n')
    print(space.get_word_at_indices(closest_ids))
def k_nearest(wvspace, disk_f, words, metric, image_name):
    """Measure how well stored vectors separate "same" from "not same" pairs.

    Reads labelled word pairs, computes the distance between the two words'
    vectors in the ``DiskArray`` opened from ``disk_f``, prints an effect
    size for the two label groups and saves an overlaid histogram of the
    distances.

    Args:
        wvspace: Forwarded unchanged to ``WordVecSpaceMem``; used only to
            translate words into row indices.
        disk_f: Forwarded unchanged to ``DiskArray``; records are read as a
            single 300-element float32 ``vec`` field.
        words: Path of a text file with one JSON array per line, laid out as
            ``[word1, word2, label]``.  A label equal to ``0`` puts the pair
            in the "same" group; any other label puts it in "not same".
        metric: ``'cosine'`` selects cosine distance; any other value
            selects Euclidean distance.
        image_name: Path the histogram is saved to via ``plt.savefig``.

    Returns:
        None.  The effect size is printed and the figure is written to disk.
    """
    wv = WordVecSpaceMem(wvspace)
    da = DiskArray(disk_f, dtype=[('vec', np.float32, 300)])
    vecs = da['vec']

    pair_distance = distance.cosine if metric == 'cosine' else distance.euclidean

    psame = []
    pnsame = []

    # Context manager so the pairs file is closed even if a row fails.
    with open(words) as pairs_file:
        for line in pairs_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            record = json.loads(line)
            word1, word2, label = record[0], record[1], record[2]

            # Skip excluded pairs before doing any index lookup for them.
            if 'clinicaltrials' in word1 or 'clinicaltrials' in word2:
                continue

            # scipy's pairwise distance functions take 1-D vectors, so the
            # rows are flattened rather than reshaped to (1, 300).
            vec1 = np.ravel(vecs[wv.get_word_index(word1)])
            vec2 = np.ravel(vecs[wv.get_word_index(word2)])
            tvspace_dist = pair_distance(vec1, vec2)

            if label == 0:
                psame.append(tvspace_dist)
            else:
                pnsame.append(tvspace_dist)

    # Effect size: absolute difference of the group means divided by the
    # plain average of the two group standard deviations.
    dm = (np.std(psame) + np.std(pnsame)) / 2
    nm = abs(np.mean(psame) - np.mean(pnsame))
    d = nm / dm
    print('the cohens D distance is', d)

    plt.hist(psame, bins=50, alpha=0.5, label='same points')
    plt.hist(pnsame, bins=50, alpha=0.5, label='not same points')
    plt.legend(loc='upper right')
    plt.savefig(image_name)