import codecs
from gensim.models import word2vec

def train(self):
    print("Training... (go grab a coffee ^0^)")
    # Load the pre-segmented corpus
    sentence = word2vec.Text8Corpus("D:\\word2vec\\trash\\segmentation.txt")
    # Set the hyperparameters and train the model (skip-gram, 100-dim vectors)
    model = word2vec.Word2Vec(sentence, size=100, window=5, min_count=5, sg=1, workers=8)
    # Save the trained vectors in binary word2vec format
    model.wv.save_word2vec_format(u"D:\\word2vec\\model\\zhfn100w5m5sg1.model.bin", binary=True)
    print("model1 saved")
    return model

def wordsimilarity(word, model):
    """Print the 10 words most similar to `word`."""
    semi = []
    try:
        semi = model.most_similar(word, topn=10)
    except KeyError:
        print('The word is not in the vocabulary!')
    for term in semi:
        print('%s,%s' % (term[0], term[1]))

def LineSentence(path):
    """Convert the text at the given path into an iterable of iterables."""
    sentences = []
    i = 0
    with codecs.open(path, 'r', encoding="UTF-8") as raw_texts:
        for line in raw_texts.readlines():
            line = line.strip()
            sent_list = line.split()
            i += 1
            print("sent %d" % i)
            sentences.append(sent_list)
    print("read sentences done!")
    return sentences

config = Config()
# word2vec here appears to be a project-specific wrapper class built around a Config object
model = word2vec(config, saved=True)
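# A minimal usage sketch (not part of the original file): feed the sentences returned by
# LineSentence() into gensim's Word2Vec (imported above) and inspect neighbours with
# wordsimilarity(). The query word and hyperparameters below are illustrative assumptions.
sentences = LineSentence("D:\\word2vec\\trash\\segmentation.txt")
w2v = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
wordsimilarity(u"word", w2v)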
# Feature 1
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False, ngram_range=(2, 2))

# Feature 2
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer()

# Feature 3
from sklearn.feature_extraction.text import HashingVectorizer
hash_vect = HashingVectorizer()

# Feature 4
# NOTE: gensim's Word2Vec does not implement the scikit-learn fit/transform API, so it
# cannot be dropped into a FeatureUnion as-is; a wrapper transformer is sketched below.
from gensim.models import word2vec
w2vec = word2vec.Word2Vec()

from sklearn.pipeline import FeatureUnion
combined_features = FeatureUnion([("tfidf_vect", tfidf_vect),
                                  ("bow", bow),
                                  ("hash", hash_vect),
                                  ("word2vec", w2vec)])
X_combined_features = combined_features.fit_transform(df['content'].values)
y = df['label'].values
print(X_combined_features.toarray())
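# A minimal sketch (assumption, not part of the original) of a scikit-learn compatible
# wrapper that makes word2vec usable inside the FeatureUnion above: fit() trains a
# Word2Vec model on whitespace-tokenized documents and transform() represents each
# document as the mean of its word vectors. The class name Word2VecVectorizer is
# hypothetical.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec


class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, size=100, min_count=1):
        self.size = size
        self.min_count = min_count

    def fit(self, raw_documents, y=None):
        tokenized = [doc.split() for doc in raw_documents]
        self.model_ = Word2Vec(tokenized, size=self.size, min_count=self.min_count)
        return self

    def transform(self, raw_documents):
        vectors = []
        for doc in raw_documents:
            words = [w for w in doc.split() if w in self.model_.wv]
            if words:
                vectors.append(np.mean([self.model_.wv[w] for w in words], axis=0))
            else:
                vectors.append(np.zeros(self.size))
        return np.array(vectors)

# Usage: replace the fourth FeatureUnion entry with ("word2vec", Word2VecVectorizer()).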
# train word2vec model; shuffle the data every epoch
for i in range(n_epoch):
    random.shuffle(data)
    model.train(data, total_examples=len(data), epochs=1)

# save model
model.save('word2vec_model/CBOW.wv.syn0.npy')

# main()
sg = 0
vec_size = 256
min_count_of_each_word = 5
window_size = 5
n_epoch = 5
word2vec(sg, vec_size, min_count_of_each_word, window_size, n_epoch)

############### Application of word2vec ###############
# load word2vec model (same relative path that the model was saved to above)
model = word2vec.Word2Vec.load('word2vec_model/CBOW.wv.syn0.npy')

# get the words most similar to a given word
model.wv.most_similar('nvidia')
# Print >>>
# [('GPU', 0.5550138354301453),
#  ('TPU', 0.5424560308456421),
#  ('Pro', 0.5173478126525879),
#  ('intel', 0.5163905620574951),
#  ('NVIDIA', 0.5157663226127625),
#  ('Intel', 0.5154422521591187),
#  ('PSV', 0.4950483441352844),
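# Sketch (assumption, not in the original): if only the trained vectors are needed at
# query time, the KeyedVectors can be saved and loaded on their own instead of pickling
# the whole Word2Vec model. The file name 'word2vec_model/CBOW.kv' is illustrative.
from gensim.models import KeyedVectors

model.wv.save('word2vec_model/CBOW.kv')            # save just the word vectors
wv = KeyedVectors.load('word2vec_model/CBOW.kv')   # load them without the training state
print(wv.most_similar('nvidia'))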
        for j in range(length_of_walk):
            # pick the next node to move to
            next_node = random.choice(list(G.neighbors(now_node)))
            # append the reached node to the walk
            path.append(str(next_node))
            # make it the current node
            now_node = next_node
        # append this random walk to the list of walks
        paths.append(path)
    # return all visited node sequences
    return paths

G = sample4()
walking = make_random_walks(G, num_walk=512, length_of_walk=1000)
model = word2vec.Word2Vec(walking, min_count=1, size=2, window=10, workers=1)

x = []
y = []
node_list = []
colors = []
fig, ax = plt.subplots()
for node in G.nodes:
    vector = model.wv[str(node)]
    x.append(vector[0])
    y.append(vector[1])
    ax.annotate(str(node), (vector[0], vector[1]))
    if 0 <= node <= 2:
        colors.append("r")
    else:
        colors.append("b")
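# Sketch (not in the original fragment): draw the 2-D embedding that the loop above
# prepares, assuming matplotlib.pyplot has already been imported as plt.
ax.scatter(x, y, c=colors)
plt.show()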
f = open(pg_file_path, mode="r")
reader = csv.reader(f, delimiter="\t")
# Convert rows into nodes that NetworkX's Graph.add_node() accepts
nodes = [(r[0], {"label": label(r[2])}) for r in reader if r[1] == ":page_id"]
f.seek(0)
# Convert rows into attributed edges that NetworkX can load
edges = [(r[0], r[2], {"property": r[3]}) for r in reader if r[1] == "->"]
labels = [{"id": x[0], "label": x[1]["label"]} for x in nodes]

G = nx.DiGraph()
# Add the nodes
G.add_nodes_from(nodes)
# Add the edges
G.add_edges_from(edges)

# These parameters still need tuning!!!!
walks = make_random_walks(G, 20, 20)
model = word2vec.Word2Vec(walks, min_count=0, size=2, window=5, workers=1)

wiki_page_id = convert_artist2id(wiki_page_label)
vector = model.wv[wiki_page_id]
ranking = model.wv.most_similar([vector], [], int(sample_size))
for e in ranking:
    page_id = e[0]
    artist_name = convert_id2artist(page_id)
    print(artist_name)
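# Sketch (assumption, not the author's helper): a make_random_walks variant that is safe
# on the DiGraph built above, ending a walk early when a node has no outgoing edges,
# since random.choice() on an empty neighbour list raises IndexError.
import random

def make_random_walks_safe(G, num_walk, length_of_walk):
    paths = []
    for start_node in G.nodes:
        for _ in range(num_walk):
            path = [str(start_node)]
            now_node = start_node
            for _ in range(length_of_walk):
                neighbors = list(G.neighbors(now_node))  # successors for a DiGraph
                if not neighbors:  # dead end: stop this walk early
                    break
                now_node = random.choice(neighbors)
                path.append(str(now_node))
            paths.append(path)
    return paths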
    sentences.append(cut)
    for elem in cut:
        new_data[elem] = new_data.get(elem, 0) + 1

from gensim.models import word2vec
model = word2vec.Word2Vec(sentences, size=128, min_count=1)

token_list = ['PAD_token', 'UNK_token', 'END_token']
emb_vectors = []
emb_vectors.append(np.zeros(128))                  # PAD_token -> zero vector
emb_vectors.append(np.random.rand(128) / 1000.0)   # UNK_token -> small random vector
emb_vectors.append(np.random.rand(128) / 1000.0)   # END_token -> small random vector

for k, v in new_data.items():
    #print k.encode('utf-8')
    #print model.wv[k]
    #print
    token_list.append(k)
    emb_vectors.append(np.array(model.wv[k]))

emb_vectors = np.array(emb_vectors)
np.save("../data/emb.npy", emb_vectors)
with open("../data/token_list.json", 'w') as f:
    json.dump(token_list, f)

load_emb = np.load('../data/emb.npy')
pdb.set_trace()
print('ok')
#for k, v in new_data.items():
#    print k.encode('utf-8'), v

word2vec()  # presumably invokes a word2vec() routine defined elsewhere in the original script
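# Sketch (not in the original): how the saved artifacts fit together: token_list gives
# each token its row index in emb.npy, so a token can be mapped back to its embedding.
# Out-of-vocabulary tokens fall back to the UNK_token row.
import json
import numpy as np

emb = np.load("../data/emb.npy")
with open("../data/token_list.json") as fin:
    tokens = json.load(fin)
token2id = {tok: idx for idx, tok in enumerate(tokens)}

def embed(token):
    return emb[token2id.get(token, token2id['UNK_token'])]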
for item in full_sentences:
    sentence = item.split('\t')
    sentences.append(sentence)
    weighted_feats += sentence

weighted_feats = set(weighted_feats)
topn = len(weighted_feats)

#######################################################################################
print("Starting training...")
tic = time()
model = word2vec.Word2Vec(sentences, min_count=min_count, size=size, iter=epochs, window=window)
print("\tTime: ", time() - tic)

#######################################################################################
print("Starting prediction...")
file = open(users_sentences, "r")
users = file.read().splitlines()
file.close()

dicto = dict()
for count, user in enumerate(users):
    dicto[user] = count
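# Sketch (assumption: the prediction step is cut off in the original): one plausible
# continuation is to query the trained model for the features most similar to each
# user's tokens, skipping tokens that did not make it into the vocabulary.
predictions = {}
for user in users:
    tokens = [t for t in user.split('\t') if t in model.wv]
    if tokens:
        predictions[user] = model.wv.most_similar(positive=tokens, topn=topn)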
import gensim
from gensim import models, similarities
from gensim.models import word2vec
import numpy as np
from sklearn.model_selection import train_test_split

with open('twitter_data/pos_tweets.txt', 'r') as infile:
    pos_tweets = infile.readlines()

with open('twitter_data/neg_tweets.txt', 'r') as infile:
    neg_tweets = infile.readlines()

# use 1 for positive sentiment, 0 for negative
y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets))))

x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets, neg_tweets)),
                                                    y, test_size=0.2)

# Do some very minor text preprocessing
def cleanText(corpus):
    corpus = [z.lower().replace('\n', '').split() for z in corpus]
    return corpus

x_train = cleanText(x_train)
x_test = cleanText(x_test)

n_dim = 300
# Initialize model and build vocab
imdb_w2v = word2vec.Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)

# Train the model over the training tweets (this may take several minutes);
# gensim >= 1.0 requires total_examples and epochs to be passed explicitly
imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.iter)
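# Sketch (assumption, not part of the original excerpt): a common way to turn the trained
# word vectors into per-tweet features for a downstream sentiment classifier is to average
# the vectors of the words in each tweet; words filtered out by min_count are skipped.
def buildTweetVector(tokens, model, n_dim):
    vec = np.zeros(n_dim)
    count = 0
    for word in tokens:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    return vec / count if count else vec

train_vecs = np.array([buildTweetVector(t, imdb_w2v, n_dim) for t in x_train])
test_vecs = np.array([buildTweetVector(t, imdb_w2v, n_dim) for t in x_test])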