def main():
    """Show a dendrogram of the 20 words most similar to 'country'.

    ``sys.argv[1]`` is handed to ``load_w2v`` to obtain the embedding model.
    The neighbours' vectors are clustered with ``linkage(..., method='ward')``
    and drawn with the word labels, oriented to the right.
    """
    embeddings = load_w2v(sys.argv[1])
    neighbours = embeddings.most_similar(positive=['country'], topn=20)
    # Keep only the word from each (word, score) pair.
    words = [entry[0] for entry in neighbours]
    vectors = [embeddings[word] for word in words]
    tree = linkage(vectors, method='ward')
    dendrogram(tree, labels=words, orientation='right')
    plt.show()
def main():
    """Answer every analogy question in a file and print the predictions.

    ``sys.argv[1]`` is handed to ``load_w2v`` to obtain the embedding model;
    ``sys.argv[2]`` is the path of the question file.

    Lines starting with ':' are echoed unchanged (trailing whitespace
    stripped). Blank lines are skipped. Every other line must contain four
    whitespace-separated words ``w1 w2 w3 w4``; the model is queried for the
    single best match of ``w2 + w3 - w1`` and the line is printed followed by
    the predicted word and its score.

    Raises:
        ValueError: if a non-blank, non-':' line does not have exactly four
            fields.
    """
    model = load_w2v(sys.argv[1])
    with open(sys.argv[2]) as f:
        for raw_line in f:
            line = raw_line.rstrip()
            if not line:
                # A blank line would otherwise break the four-way unpack below.
                continue
            if line.startswith(':'):
                print(line)
                continue
            w1, w2, w3, w4 = line.split()
            word, prob = model.most_similar(
                positive=[w2, w3], negative=[w1], topn=1
            )[0]
            print(' '.join([w1, w2, w3, w4, word, str(prob)]))
def main():
    """Scatter-plot a 2-D t-SNE projection of the words nearest to 'country'.

    ``sys.argv[1]`` is handed to ``load_w2v`` to obtain the embedding model.
    The 20 most similar words are embedded into two dimensions with ``TSNE``
    (``random_state=0``) and each point is annotated with its word.
    """
    model = load_w2v(sys.argv[1])
    words = [w for w, _ in model.most_similar(positive=['country'], topn=20)]
    vecs = [model[w] for w in words]
    # t-SNE requires perplexity < number of samples. The library default
    # (30) exceeds the 20 words used here, which recent scikit-learn releases
    # reject with a ValueError, so clamp it below the sample count.
    perplexity = min(30.0, len(vecs) - 1)
    result = TSNE(
        n_components=2, random_state=0, perplexity=perplexity
    ).fit_transform(vecs)
    plt.scatter(result[:, 0], result[:, 1])
    for point, word in zip(result, words):
        plt.annotate(word, point)
    plt.show()
def main(n_clusters=5):
    """Group the words nearest to 'country' with k-means and print the groups.

    ``sys.argv[1]`` is handed to ``load_w2v`` to obtain the embedding model.
    The 20 most similar words are clustered with ``KMeans`` (``random_state=0``)
    and each cluster is printed as a ``class <label>:`` header followed by a
    comma-separated list of its words and a blank line.

    Args:
        n_clusters: number of clusters to form. Defaults to 5, the value that
            was previously hard-coded in three separate places.
    """
    model = load_w2v(sys.argv[1])
    words = [w for w, _ in model.most_similar(positive=['country'], topn=20)]
    vecs = [model[w] for w in words]
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(vecs)
    # One bucket per cluster label; a comprehension keeps the lists distinct.
    result = [[] for _ in range(n_clusters)]
    for word, label in zip(words, labels):
        result[label].append(word)
    for label, members in enumerate(result):
        print('class {}:'.format(label))
        print(', '.join(members) + '\n')
def main():
    """Print the model's answers to the analogy Spain - Madrid + Athens.

    ``sys.argv[1]`` is handed to ``load_w2v`` to obtain the embedding model.
    Each result is written as ``<word><TAB><score>`` on its own line.
    """
    embeddings = load_w2v(sys.argv[1])
    row_template = '{}\t{}'
    candidates = embeddings.most_similar(
        positive=['Spain', 'Athens'],
        negative=['Madrid'],
    )
    for candidate, score in candidates:
        print(row_template.format(candidate, score))
from q63 import analogy


def load_question_words():
    """Load the analogy question file as a DataFrame without section rows.

    Reads ``data/w2v/questions-words.txt`` under ``project_path`` as a
    space-separated table with no header, skipping the first line, and drops
    every row whose first column is exactly ``":"``.

    Returns:
        The filtered DataFrame. The original row index is kept (not reset).
    """
    data = pd.read_csv(
        project_path / "data/w2v/questions-words.txt",
        sep=" ",
        skiprows=1,
        header=None,
    )
    # Rows with ":" in the first column are not questions; filter them out.
    data = data[data.iloc[:, 0] != ":"]
    return data


if __name__ == "__main__":
    print("\rSolving Q64 ... ", end="")
    w2v = load_w2v()
    data = load_question_words()
    # Pass the row count to tqdm instead of materialising every row in a
    # throwaway list just so the progress bar knows its length.
    analogies = [
        analogy(w2v, r[1], r[0], r[2], topn=1)[0]
        for _, r in tqdm(data.iterrows(), total=len(data))
    ]
    # Append the first and second element of each top result as new columns
    # 4 and 5, next to the four question words.
    data.insert(4, 4, [a[0] for a in analogies])
    data.insert(5, 5, [a[1] for a in analogies])
    data.to_csv(project_path / "output/w2v/analogy.csv")
    print("Done.")
def main():
    """Print the ten words most similar to 'United_States'.

    ``sys.argv[1]`` is handed to ``load_w2v`` to obtain the embedding model.
    Each result is written as ``<word><TAB><score>`` on its own line.
    """
    embeddings = load_w2v(sys.argv[1])
    row_template = '{}\t{}'
    for neighbour, score in embeddings.most_similar('United_States', topn=10):
        print(row_template.format(neighbour, score))
def main():
    """Print the model's similarity between 'United_States' and 'U.S.'.

    ``sys.argv[1]`` is handed to ``load_w2v`` to obtain the embedding model.
    """
    embeddings = load_w2v(sys.argv[1])
    print(embeddings.similarity('United_States', 'U.S.'))
def __init__(self, min_freq=1, root=project_path / "data/news/"):
    """Load the embedding model and store the configuration on the instance.

    Args:
        min_freq: stored as ``self.min_freq``; defaults to 1.
        root: stored as ``self.root``; defaults to ``data/news/`` under
            ``project_path``.
    """
    # The model is loaded first, before any other attribute is set.
    self.w2v = load_w2v()
    self.min_freq = min_freq
    self.root = root
    # NOTE(review): presumably the index reserved for padding tokens; confirm
    # against the code that builds the vocabulary.
    self.padding_idx = 0