def main():
    """For each target word, print its 15 nearest neighbors and split them
    into 3 k-means clusters.

    Improvements over the original: the neighbor-embedding matrix is built
    with np.array (the vector dimension is no longer hard-coded to 128),
    and manual index counters are replaced with zip.
    """
    model = FastText('model_text8.bin')
    target_words = [
        'granada', 'python', 'harmony', 'mafia', 'yoga', 'goth', 'cyberpunk',
        'nasa', 'japan', 'boolean',
        'foodball',  # NOTE(review): likely a typo for 'football' — confirm
        'algorithm', 'china', 'usa', 'internet', 'harvard', 'earth', 'horse',
        'angel', 'rock',
    ]
    for t_word in target_words:
        print('Target word:', t_word)
        # find closest words and stack their embeddings row-wise;
        # np.array infers the matrix shape from the model's vector size
        closest_words = model.nearest_neighbors(t_word, k=15)
        nn_word_embedding = np.array(
            [model.get_numpy_vector(word) for word, _similarity in closest_words])
        # k-means into 3 clusters
        cluster_model = KMeans(n_clusters=3, init='k-means++')
        prediction = cluster_model.fit_predict(nn_word_embedding)
        print(prediction)
        for label, (word, _similarity) in zip(prediction, closest_words):
            print('Word:', word, '- Cluster #%d' % (label + 1))
def text():
    """Load the Chinese-wiki fastText model and print the 5 nearest
    neighbors of '桃'.

    The original declared three string locals (s1, s2, s3) that were never
    used; they are removed here.
    """
    model = FastText('wiki.zh.bin')
    print('load over..')
    print(model.nearest_neighbors('桃', k=5))


#text()
def main():
    """Print the embedding of 'dog' and its 15 nearest neighbors."""
    target_word = 'dog'
    model = FastText('model_text8.bin')

    # get embedding
    target_word_embedding = model.get_numpy_vector(target_word)
    print('Target word:', target_word)
    print('Embedding shape:', target_word_embedding.shape)
    print('Embedding:', target_word_embedding[0:10], '...')

    # find closest words and report each with its similarity score
    for neighbor, score in model.nearest_neighbors(target_word, k=15):
        print('Word:', neighbor, 'similarity:', score)
class FastTextEmbedding(Embedding):
    """Embedding implementation backed by a fastText binary model."""

    def __init__(self, binfile, normalize=False):
        self.file = binfile
        self.vdim = -1  # unknown until load() is called
        self.normalize = normalize

    def load(self):
        """Load the fastText model from disk; returns self for chaining."""
        print('Loading fasttext model.')
        self.ftmodel = FastText()
        self.ftmodel.load_model(self.file)
        # probe an arbitrary common word to discover the vector dimension
        self.vdim = len(self.ftmodel['is'])
        print('Finished loading fasttext model.')
        return self

    def getVector(self, word):
        """Return the (optionally normalized) numpy vector for *word*."""
        return self.ftmodel.get_numpy_vector(word, normalized=self.normalize)

    def search(self, q, topk=4):
        raise NotImplementedError()

    def wordForVec(self, v):
        """Return (word, similarity) of the vocabulary entry closest to v."""
        best = self.ftmodel.words_for_vector(v)[0]
        return best[0], best[1]

    def containsWord(self, word):
        # fastText composes vectors from character n-grams, so every word
        # can be embedded
        return True

    def vocabulary(self):
        return self.ftmodel.words

    def nearest_neighbors(self, term, n=1000):
        return self.ftmodel.nearest_neighbors(term, n)

    def all_nearest_neighbors(self, term):
        """Rank the entire vocabulary by similarity to *term*."""
        return self.nearest_neighbors(term, len(self.vocabulary()))

    def dim(self):
        return self.vdim
model.skipgram(input="./clean_corpus", output='model', epoch=100, lr=0.7)
print(model.nwords)
'''
# NOTE(review): the ''' above appears to CLOSE a triple-quoted block opened
# earlier in the file (outside this view), which would make the training
# lines above it dead code — confirm against the full file before editing.

# Map each out-of-vocabulary word to its closest in-vocabulary neighbor.
#model = fasttext.load_model('../data/model.bin')
model = FastText('../data/model.bin')


def get_set(filepath):
    """Read a file and return the set of its stripped lines."""
    result = set()
    with open(filepath, 'r') as file_in:
        for line in file_in:
            result.add(line.strip())
    return result


vocab_in = get_set("../data/vocab_in")
vocab_out = get_set("../data/vocab_out")

in2out_file = open("../data/in2out_map", 'w')
for word in vocab_out:
    # take the first of the 10 nearest neighbors that is in vocab_in
    candi_list = model.nearest_neighbors(word, k=10)
    mark = False
    for temp in candi_list:
        if (temp[0] in vocab_in):
            in2out_file.write("\t".join([word, temp[0]]) + "\n")
            mark = True
            break
    if (not mark):
        # no in-vocabulary candidate found: record an explicit "none"
        in2out_file.write("\t".join([word, "none"]) + "\n")
in2out_file.close()
# NOTE(review): this chunk begins mid-way through NanaLala(); the enclosing
# def and the loops defining i, j and word_lst are outside this view, so the
# indentation below is a best-effort reconstruction — confirm against the
# full file.
        if word_lst[j]=='ළ':
            if len(i)<=1:
                # substitute 'ළ' with 'ල' to build a spelling permutation
                per_word = word_lst.copy()
                per_word[j] = 'ල'
                concat = ''
                for k in per_word:
                    concat+=k
                permutated_words.append(concat)
    return permutated_words


model = FastText('sinhala_all.bin')
#print(model.similarity('බල්ල', 'බල්ලා'))
sin_word = 'බළ්ළාට'
permutated_sin_word = NanaLala(sin_word)
words = model.nearest_neighbors(sin_word, k=10000)

# keep only the neighbor words, dropping similarity scores
suggested_words = []
for i in words:
    suggested_words.append(i[0])

# move any suggestion that is also a generated permutation to the front,
# so permutation matches rank highest among the suggestions
copy_suggested_words = suggested_words.copy()
for i in suggested_words:
    if i in permutated_sin_word:
        copy_suggested_words.remove(i)
        copy_suggested_words.insert(0,i)
print(copy_suggested_words[:5])
def evaluate():
    """Pretty-print the 30 nearest neighbors of 'praxis' in debate2vec."""
    model = FastText("debate2vec.bin")
    neighbors = model.nearest_neighbors('praxis', k=30)
    pprint.pprint(neighbors)
# Predict the '[MASK]' token in each line of testDataMasked.txt by
# intersecting the nearest-neighbor lists of the words to the left and
# right of the mask.
start = time.time()
model = FastText('output.bin')

with open("testDataMasked.txt", encoding="utf8") as f:
    contentData = f.readlines()
contentData = [x.strip() for x in contentData]

lineIndex = 1  # NOTE(review): unused in this chunk — possibly used further down
with open("FastTextResult.txt", 'a', encoding="utf8") as fastTextResultFile:
    for line in contentData:
        tmpArray = line.split()
        vectors = []  # NOTE(review): never populated in this chunk
        maskedIndex = tmpArray.index('[MASK]')
        # 1000 neighbors of the word immediately left of the mask
        similiarsLeft = model.nearest_neighbors(tmpArray[maskedIndex - 1], k=1000)
        if (maskedIndex < len(tmpArray) - 1):
            # mask is not the last token: also fetch right-context neighbors
            similiarsRight = model.nearest_neighbors(tmpArray[maskedIndex + 1], k=1000)
        predictions = []
        if (maskedIndex < len(tmpArray) - 1):
            # keep words appearing in BOTH the left and right neighbor lists
            for simL in similiarsLeft:
                index = 0  # NOTE(review): reset every iteration and never read
                for simR in similiarsRight:
                    if (simL[0] == simR[0]):
                        predictions.append(simL[0])
        else:
            # mask at end of line: fall back to left-context neighbors only
            for simL in similiarsLeft:
                predictions.append(simL[0])
# NOTE(review): chunk ends here — writing predictions to fastTextResultFile
# presumably happens past the end of this view.
# NOTE(review): this chunk begins mid-statement — the line below carries the
# trailing keyword arguments of a skipgram(...) training call whose opening
# (and the construction of skip_gram_model) lies outside this view.
                     output='skip_gram_model', epoch=100, lr=0.7)
print(skip_gram_model['贷款'])
# print(skip_gram_model.get_numpy_vector('贷款'))
# print(skip_gram_model.get_numpy_vector('贷款', normalized=True))

# word-analogy style arithmetic on the embedding vectors
var1 = skip_gram_model.get_numpy_vector('人民币')
var2 = skip_gram_model.get_numpy_vector('贷款')
var3 = skip_gram_model.get_numpy_vector('外币')
skip_gram_model.words_for_vector(var1 + var2 - var3, k=1)

# for word in skip_gram_model.words:
#     print(word, skip_gram_model[word])

print(skip_gram_model.nearest_neighbors('贷款', k=2))

# test data is stored inside a file, use this:
# skip_gram_model.predict_proba_file('./test.txt', k=2)

print("\n")

##########################
# Train with CBOW model  #
##########################
cbow_model = FastText()
cbow_model.cbow(input='./train.txt', output='cbow_model', epoch=100, lr=0.7)
print(cbow_model['贷款'])
# print(cbow_model.get_numpy_vector('贷款'))
# print(cbow_model.get_numpy_vector('贷款', normalized=True))
def lemma(girdi):
    """POST *girdi* to the lemmatizer service and return its lemma.

    Falls back to the original input when the service answers "No_Lemma".
    """
    istek = {"text": girdi, "fields": "lemma"}
    # url / headers / punctuations are defined outside this view
    lemmas = ((requests.post(url, istek, headers)).text)
    sonuc = lemmas.strip(punctuations)
    # strip everything up to (and including) the last double quote
    cikti = re.sub(r'^.*\"', '', sonuc)
    if cikti == "No_Lemma":
        cikti = girdi
    else:
        # NOTE(review): '==' is a comparison, not an assignment — this line
        # is a no-op and cikti keeps the re.sub() result. Probably meant
        # 'cikti = sonuc'; confirm the intended value before fixing.
        cikti == sonuc
    return cikti


# Load model and set query key
# NOTE(review): model and key are defined outside this view
primary = (model.nearest_neighbors(key, k=25))
my_dict = dict(primary)

# Empty list for first cycle
f_cycle = []
f_second_input = []

# Create first cycle data: lemmatize neighbors that are not the key itself
for word, weight in my_dict.items():
    word = word.lower()
    weight = format(weight, '.3f')
    if word != key and not word.startswith(key):
        word = lemma(word)
        f_second_input.append(word)
        # sonuc = (f"{key},{word},{weight}")
        sonuc = key, word, weight
# NOTE(review): chunk appears to end mid-script — sonuc is built but unused
# within this view.
class WordModel:
    """Uniform wrapper over two embedding backends: a pyfasttext FastText
    model or a gensim Word2Vec model, selected by ``model_name``."""

    def __init__(self):
        # model_name: 'fasttext' or 'word2vec'; model: the loaded backend
        # object; model_path: absolute path to the model file on disk
        self.model_name = None
        self.model = None
        self.model_path = None

    def get_vector(self, word):
        """Return the raw embedding of *word*, or None if out of vocabulary."""
        try:
            vector = self.model[word]
        except KeyError:
            vector = None
        return vector

    def get_full_path(self, relative_path):
        """Resolve *relative_path* against this module's directory."""
        pwd = os.path.dirname(os.path.abspath(__file__))
        pwd = os.path.join(pwd, relative_path)
        return pwd

    def path_exists(self, relative_path):
        """True when *relative_path* (relative to this module) exists."""
        full_path = self.get_full_path(relative_path)
        return os.path.exists(full_path)

    def validate_model_path(self, path):
        """Return an absolute model path, falling back to the bundled
        default model of the current backend when *path* is None or does
        not exist."""
        if path is None or not self.path_exists(path):
            if self.model_name == 'fasttext':
                default_path = self.get_full_path(
                    'models/fasttext_skipgram_model.bin')
            else:
                default_path = self.get_full_path(
                    'models/word2vec_skipgram.w2v')
            return default_path
        else:
            path = self.get_full_path(path)
            return path

    def validate_model_name(self, name):
        """Coerce *name* to a supported backend name ('fasttext' default)."""
        if name is not None and name in ['fasttext', 'word2vec']:
            correct_name = name
        else:
            correct_name = 'fasttext'
        return correct_name

    def get_numpy_vector(self, word):
        """Return a numpy embedding for *word* (normalized for fasttext);
        None when the word2vec vocabulary lacks the word."""
        if self.model_name == 'fasttext':
            return self.model.get_numpy_vector(word, normalized=True)
        else:
            try:
                np_vector = self.model.wv[word]
            except KeyError:
                np_vector = None
            return np_vector

    def load_model(self):
        """Load the backend model from ``self.model_path``.

        Raises:
            FileNotFoundError: when the model file does not exist.
        """
        if not os.path.exists(self.model_path):
            raise FileNotFoundError('model file not found!')
        if self.model_name == 'fasttext':
            self.model = FastText(self.model_path)
        else:
            # mmap='r' maps the word2vec weights read-only to save memory
            self.model = gensim.models.Word2Vec.load(self.model_path,
                                                     mmap='r')

    def get_most_similar_words(self, word, k=5):
        """Return the k nearest neighbors of *word* as (word, score) pairs;
        empty list on word2vec KeyError."""
        if self.model_name == 'fasttext':
            return self.model.nearest_neighbors(word, k=k)
        else:
            try:
                similar_words = self.model.similar_by_word(word, topn=k)
            except KeyError:
                similar_words = []
            return similar_words

    def get_words_for_vector(self, vector, k=3):
        """Return the k vocabulary words closest to an arbitrary *vector*."""
        if self.model_name == 'fasttext':
            return self.model.words_for_vector(vector, k)
        else:
            try:
                similar_words = self.model.similar_by_vector(vector, topn=k)
            except KeyError:
                similar_words = []
            return similar_words

    def similarity(self, word_1, word_2):
        """Similarity between two words (0 on word2vec KeyError)."""
        if self.model_name == 'fasttext':
            return self.model.similarity(word_1, word_2)
        else:
            try:
                sim = self.model.wv.similarity(word_1, word_2)
            except KeyError:
                sim = 0
            return sim

    def word_analogies(self, words, k=3):
        """Solve 'a is to b as c is to ?' for a 3-word string *words*.

        Returns None implicitly when *words* does not split into exactly
        3 tokens.
        """
        word_list = words.split()
        if len(word_list) == 3:
            word_vec_1 = self.get_numpy_vector(word_list[0])
            word_vec_2 = self.get_numpy_vector(word_list[1])
            word_vec_3 = self.get_numpy_vector(word_list[2])
            # analogy arithmetic: c - a + b
            word_vec_4 = word_vec_3 - word_vec_1 + word_vec_2
            return self.get_words_for_vector(word_vec_4, k)

    def odd_one_out(self, words):
        """Given 4 space-separated words, report the least-similar one and
        each word's RMS similarity to the other three.

        Returns None implicitly when *words* does not split into exactly
        4 tokens.
        """
        word_list = words.split()
        if len(word_list) == 4:
            scores = [0.0, 0.0, 0.0, 0.0]
            for index in range(len(word_list)):
                for another_word in word_list:
                    if another_word != word_list[index]:
                        scores[index] += self.similarity(
                            word_list[index], another_word)**2
                # root-mean-square over the 3 other words
                scores[index] = sqrt(scores[index] / 3.0)
            min_index = scores.index(min(scores))
            result = f'{word_list[min_index]}\n'
            result += '\n'
            # sort words by ascending score for the report
            scores, word_list = zip(*sorted(zip(scores, word_list)))
            for word, score in zip(word_list, scores):
                result += f'{word}: {score}\t'
            return result
def main():
    """For each target word, print its 15 nearest neighbors and split them
    into 3 k-means clusters.

    Fixes the original's ``terget_word = "zero"`` typo — which left
    ``target_word`` set to 'earthly', so the "zero" section silently
    re-clustered the previous word — and collapses twenty copy-pasted
    blocks into a single loop over a word list.
    """
    model = FastText('model_text8.bin')
    target_words = [
        'deep', 'president', 'self', 'insult', 'general', 'inclined',
        'property', 'international', 'many', 'imprisoned', 'branches',
        'communist', 'france', 'strict', 'earthly', 'zero', 'feminism',
        'ideas', 'theory', 'writings',
    ]
    for index, target_word in enumerate(target_words):
        # the original printed the embedding's shape/values for the first
        # word only; preserve that with the verbose flag
        _cluster_neighbors(model, target_word, verbose=(index == 0))


def _cluster_neighbors(model, target_word, verbose=False):
    """Print target_word's 15 nearest neighbors and their 3-way k-means
    grouping.

    When *verbose* is true, also print the target embedding's shape and a
    prefix of its values.
    """
    # get embedding
    target_word_embedding = model.get_numpy_vector(target_word)
    print('Target word:', target_word)
    if verbose:
        print('Embedding shape:', target_word_embedding.shape)
        print('Embedding:', target_word_embedding[0:15], '...')

    # find closest words and collect their embeddings
    closest_words = model.nearest_neighbors(target_word, k=15)
    closest_word_embeddings = []
    for word, similarity in closest_words:
        print('Word:', word, 'similarity:', similarity)
        closest_word_embeddings.append(model.get_numpy_vector(word))

    kmeans = cluster.KMeans(n_clusters=3)
    kmeans.fit(closest_word_embeddings)
    labels = kmeans.labels_
    print('Cluster id labels for inputted data')
    print(labels)

    # partition the neighbor words by their cluster id
    clusters = [[], [], []]
    for label, (word, _similarity) in zip(labels, closest_words):
        clusters[label].append(word)
    print("cluster #1 : ", clusters[0])
    print("cluster #2 : ", clusters[1])
    print("cluster #3 : ", clusters[2])