def generate(q1, q2, answer, model_google, options):
    sentences = []
    for i in options:
        sentences.append(q1 + answer[i] + q2)
    sentences = Word2Vec.cleanText(sentences)
    n_dim = 300
    vectors = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in sentences]
    dataset = []
    for a in vectors:
        sentence = np.zeros((49, 300))
        m = len(a)
        start = int((49 - m) / 2)
        sentence[start:start + m] = a
        dataset.append(np.array(sentence))
    return dataset
def main(): print("数据预处理阶段") DataPretreat.prepare_data(windows_size=3) vocabulary_size = len(DataPretreat.vocabulary_list) SkipGram = Word2Vec.SkipGram(vocabulary_size) print("创建SkipGram神经网络") SkipGram.build_network() print("训练SkipGram神经网络") SkipGram.train() print("可视化SkipGram训练效果") SkipGram.visualize()
def yes():
    try:
        global bVectorSpace, rVectorSpace, up_to_date
        if up_to_date:
            return
        bVectorSpace, rVectorSpace = wv.start()
    except Exception as e:
        mw.MessageWindow("Error", "Error occurred while generating vector spaces: " + str(e))
        return
    mw.MessageWindow("Reload", "Vector spaces are up-to-date.")
    up_to_date = True
    del gf.bWords[:], gf.rWords[:]
def run(self, flag):
    nx_graphs, _ = Reader.multi_readG(self.path)
    if flag == "LN":
        r_t = Reader.true_cluster(self.path).tolist()
        print(clustering(r_t))
        cluster_true = [r[0] - 1 for r in r_t]
        k_list = [k for k in range(2, 11)]
    else:
        cluster_true = []
        k_list = [2, 3, 6, 8]
        for i in range(29):
            if i < 12:
                cluster_true.append(0)
            else:
                cluster_true.append(1)
    w_dict = Reader.weight(self.path)
    print(nx_graphs[0])
    MK_G = Node2Vec_LayerSelect.Graph(nx_graphs, self.p, self.q, 0.5)
    MK_G.preprocess_transition_probs(w_dict, 2)
    MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
    MK_words = []
    for walk in MK_walks:
        MK_words.extend([str(step) for step in walk])
    M_L = Word2Vec.Learn(MK_words)
    M_matrix, M_mapping = M_L.train()
    result = {}
    for k in k_list:
        cluster_trained = KMeans(n_clusters=k, random_state=0).fit_predict(M_matrix).tolist()
        length = min(len(cluster_true), len(cluster_trained))
        r = normalized_mutual_info_score(cluster_true[0:length], cluster_trained[0:length])
        f = f1_score(cluster_true[0:length], cluster_trained[0:length], average='micro')
        print(cluster_trained)
        print(cluster_true)
        result[k] = (r, f)
        # pickle.dump(cluster_trained, open(self.path + str(k) + '.pickle', '+wb'))
    print(result)
def predict(sentence, num_words, model):
    # Predict whether this sentence is a key sentence: return 1 if it is, 0 otherwise
    data = np.empty((1, num_words, size), dtype="float64")
    sentence = u.remove_useless(sentence)
    word_list = u.seg2words_long(sentence)
    word_list = word_list[:num_words]  # truncate to the length the model was trained with
    num = 0
    vector_model = wv.load_model()
    for i in range(len(word_list)):
        word = word_list[i].encode('utf-8')
        vector = wv.get_vector(vector_model, word)
        if vector == []:
            continue
        # write found vectors consecutively so the padding below starts right after them
        data[0, num, :] = vector
        num += 1
    for j in range(num, num_words):
        data[0, j, :] = -1
    prediction = model.predict(data)
    # print("%.2f%%" % (float(prediction[0][0]) * 100) + " " + "%.2f%%" % (float(prediction[0][1]) * 100))
    return np.argmax(prediction)
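# --- Usage sketch (illustrative, not part of the original source): calling predict()
# on every line of a document. The model path, the document path, and num_words=50 are
# placeholders; `model` is assumed to be this project's trained Keras classifier, and the
# global `size` (vector dimension) and the helper modules u/wv must already be set up,
# exactly as the snippet above assumes.
from keras.models import load_model

model = load_model("model.h5")
with open("document.txt", "r") as doc:
    for line in doc:
        if predict(line, num_words=50, model=model) == 1:
            print("key sentence: " + line.strip())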
def generate(q1, q2, answer, model_google, options):
    # Full sentences: question stem + answer option + question tail
    sentences = []
    for i in options:
        sentences.append(q1 + answer[i] + q2)
    sentences = Word2Vec.cleanText(sentences)
    n_dim = 300
    vectors = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in sentences]
    dataset = []
    for a in vectors:
        sentence = np.zeros((49, 300))
        m = len(a)
        start = int((49 - m) / 2)
        sentence[start:start + m] = a
        dataset.append(np.array(sentence))

    # Question only (stem + tail), one copy per option
    question = []
    for i in options:
        question.append(q1 + q2)
    question = Word2Vec.cleanText(question)
    q = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in question]
    q_set = []
    for a in q:
        sentence = np.zeros((49, 300))
        m = len(a)
        start = int((49 - m) / 2)
        sentence[start:start + m] = a
        q_set.append(np.array(sentence))

    # Answer options only
    option = []
    for i in options:
        option.append(answer[i])
    option = Word2Vec.cleanText(option)
    option_vectors = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in option]
    a_set = []
    for a in option_vectors:
        sentence = np.zeros((4, 300))
        m = len(a)
        if m != 0:
            start = int((4 - m) / 2)
            sentence[start:start + m] = a
        a_set.append(np.array(sentence))
    return dataset, q_set, a_set
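# --- Usage sketch (illustrative, not part of the original source): driving generate()
# with a fill-in-the-blank question. The q1/q2 split, the answer dict, and the path to
# the pretrained GoogleNews vectors are assumptions made for this example only.
import gensim
import numpy as np

model_google = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)
q1 = "The committee found the report "
q2 = " and asked for a revision."
answer = {'A': 'unconvincing', 'B': 'flawless', 'C': 'tedious', 'D': 'redundant', 'E': 'concise'}
dataset, q_set, a_set = generate(q1, q2, answer, model_google, list(answer.keys()))
print(np.array(dataset).shape)  # expected: (5, 49, 300), one zero-padded sentence per option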
def run(self):
    path = self.path
    nx_graphs, total_edges = Reader.multi_readG(path)
    r_list, nx_graphs_sampled = Sampler.multi_sampling(path, self.s_p)
    print('%d edges sampled, graph length is %d' % (len(r_list), len(nx_graphs_sampled)))
    MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q)
    MK_G.preprocess_transition_probs()
    MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
    MK_words = []
    for walk in MK_walks:
        MK_words.extend([str(step) for step in walk])
    M_L = Word2Vec.Learn(MK_words)
    M_matrix, M_mapping = M_L.train()
    r_set = set([node for edge in r_list for node in edge])
    eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
    M_precision = eval_p.eval()
    print("*** Merged graph precision: ", M_precision)
def run(self):
    path = self.path

    #### Step 1: reading and sampling graphs
    nx_graphs, airport_mapping, airport_dst = Reader.read_airline(path)
    print(nx_graphs[0].nodes())
    r_set = set()

    if self.flag == 0 or self.flag == 4:
        w_dict = {}
        MK_G = Node2Vec_LayerSelect.Graph(nx_graphs, self.p, self.q, self.r)
        MK_G.preprocess_transition_probs(w_dict, 1)
        MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
        MK_words = []
        for walk in MK_walks:
            MK_words.extend([str(step) for step in walk])
        M_L = Word2Vec.Learn(MK_words)
        M_matrix, M_mapping = M_L.train()
        eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
        precision, recall, F = eval_p.edge_list_eval(airport_dst, airport_mapping)
        print("*** MKII Random: precision %f, accuracy %f, F %f" % (precision, recall, F))
        '''
        eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
        M_auc = eval_a.eval_auc(1)
        print("@@@ MKII Random AUC:", M_auc)
        '''
        print("-----------------------DONE--------------------------------")
def main():
    filename = "../Data/Final_Dataset_Word2Vec_Emoji2Vec.csv"
    print("1. Train with Word2Vec, 2. Train with Emoji2Vec, 3. Both")
    print("Enter choice (1/2/3):")
    ch = int(input())
    if ch == 1:
        word_vec = w.main(filename)
        return word_vec
    elif ch == 2:
        Emoji_vec = ex.main(filename)
        return Emoji_vec
    elif ch == 3:
        print("Concatenating...")
        Concatenated_Vector = c.main()
        return Concatenated_Vector
    else:
        print("Invalid")
def run(self):
    path = self.path

    #### Step 1: reading and sampling graphs
    m_graph, nx_graphs, total_edges = Reader.multi_readG_with_Merg(path)
    print("%d total nodes" % len(m_graph.nodes()))
    r_list, m_graph_sampled, nx_graphs_sampled = Sampler.multi_sampling_with_Merg(path, self.s_p)
    print("%d edges before sampling, %d edges after sampling. sampled %d " %
          (len(m_graph.edges()), len(m_graph_sampled.edges()), len(r_list)))
    r_set = set([node for edge in r_list for node in edge])

    if self.flag == 0 or self.flag == 1:
        #### Step 2: Aggregated graph
        # for i in range(2):
        M_G = Node2Vec.Graph(m_graph_sampled, self.p, self.q)
        M_G.preprocess_transition_probs()
        M_walks = M_G.simulate_walks(self.num_walks, self.walk_length)
        M_words = []
        for walk in M_walks:
            M_words.extend([str(step) for step in walk])
        M_L = Word2Vec.Learn(M_words)
        M_matrix, M_mapping = M_L.train()
        eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, m_graph, r_set, self.e_p)
        precision, recall, F = eval_p.eval()
        print("*** Aggregated graph: precision %f, accuracy %f, F %f " % (precision, recall, F))
        eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, m_graph, m_graph_sampled)
        M_auc = eval_a.eval_auc(1)
        print("@@@ Merged graph AUC:", M_auc)
        print("-----------------------DONE--------------------------------")

    #### Step 3: Aggregated result
    if self.flag == 0 or self.flag == 2:
        T_matrix = {}
        T_mapping = {}
        for g in nx_graphs_sampled:
            # print(g.edges())
            G = Node2Vec.Graph(g, self.p, self.q)
            G.preprocess_transition_probs()
            walks = G.simulate_walks(self.num_walks, self.walk_length)
            words = []
            for walk in walks:
                words.extend([str(step) for step in walk])
            L = Word2Vec.Learn(words)
            matrix, mapping = L.train()
            T_matrix[g] = matrix
            T_mapping[g] = mapping
        eval_p_s = Evaluator.combining_Precision_Eval(T_matrix, T_mapping, nx_graphs, r_set, self.e_p)
        precision, recall, F = eval_p_s.eval()
        print("*** Aggregated result: precision %f, accuracy %f, F %f" % (precision, recall, F))
        eval_a = Evaluator.combining_AUC_Eval(T_matrix, T_mapping, nx_graphs, nx_graphs_sampled)
        S_auc = eval_a.eval_auc(1)
        print('@@@ Separated graph AUC:', S_auc)
        print("-----------------------DONE--------------------------------")

    #### Step 4: MKII verification
    if self.flag == 0 or self.flag == 3:
        graph_list_sampled = []
        graph_list_sampled.append(m_graph_sampled)
        graph_list = []
        graph_list.append(m_graph)
        w_dict = {}
        MK_G = Node2Vec_LayerSelect.Graph(graph_list, self.p, self.q, self.r)
        MK_G.preprocess_transition_probs(w_dict, 1)
        MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
        MK_words = []
        for walk in MK_walks:
            MK_words.extend([str(step) for step in walk])
        M_L = Word2Vec.Learn(MK_words)
        M_matrix, M_mapping = M_L.train()
        eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, graph_list[0], r_set, self.e_p)
        precision, recall, F = eval_p.eval()
        print("*** MKII verification: precision %f, accuracy %f, F %f" % (precision, recall, F))
        eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, m_graph, m_graph_sampled)
        M_auc = eval_a.eval_auc(1)
        print("@@@ Merged graph AUC:", M_auc)
        print("-----------------------DONE--------------------------------")

    #### Step 5: MKII Random
    if self.flag == 0 or self.flag == 4:
        w_dict = Reader.weight(self.path)
        # print(w_dict)
        MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, self.r)
        MK_G.preprocess_transition_probs(w_dict, 1)
        MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
        MK_words = []
        for walk in MK_walks:
            MK_words.extend([str(step) for step in walk])
        M_L = Word2Vec.Learn(MK_words)
        M_matrix, M_mapping = M_L.train()
        eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
        precision, recall, F = eval_p.eval()
        print("*** MKII Random: precision %f, accuracy %f, F %f" % (precision, recall, F))
        eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
        M_auc = eval_a.eval_auc(1)
        print("@@@ MKII Random AUC:", M_auc)
        print("-----------------------DONE--------------------------------")

    #### Step 6: MKII Weighted
    if self.flag == 0 or self.flag == 4:
        w_dict = Reader.weight(self.path)
        # print(w_dict)
        MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, self.r)
        MK_G.preprocess_transition_probs(w_dict, 2)
        MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
        MK_words = []
        for walk in MK_walks:
            MK_words.extend([str(step) for step in walk])
        M_L = Word2Vec.Learn(MK_words)
        M_matrix, M_mapping = M_L.train()
        eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
        precision, recall, F = eval_p.eval()
        print("*** MKII Weighted: precision %f, accuracy %f, F %f" % (precision, recall, F))
        eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
        M_auc = eval_a.eval_auc(1)
        print("@@@ MKII Weighted AUC:", M_auc)
        print("-----------------------DONE--------------------------------")

    #### Step 7: MKII Biased
    if self.flag == 0 or self.flag == 4:
        w_dict = Reader.weight(self.path)
        # print(w_dict)
        MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, self.r)
        MK_G.preprocess_transition_probs(w_dict, 0)
        MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
        MK_words = []
        for walk in MK_walks:
            MK_words.extend([str(step) for step in walk])
        M_L = Word2Vec.Learn(MK_words)
        M_matrix, M_mapping = M_L.train()
        eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
        precision, recall, F = eval_p.eval()
        print("*** MKII Biased: precision %f, accuracy %f, F %f" % (precision, recall, F))
        eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
        M_auc = eval_a.eval_auc(1)
        print("@@@ MKII Biased AUC:", M_auc)
        print("-----------------------DONE--------------------------------")

    #### Step 8: MKII Biased_ii
    if self.flag == 0 or self.flag == 4:
        w_dict = Reader.weight(self.path)
        # print(w_dict)
        MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, self.r)
        MK_G.preprocess_transition_probs(w_dict, 3)
        MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
        MK_words = []
        for walk in MK_walks:
            MK_words.extend([str(step) for step in walk])
        M_L = Word2Vec.Learn(MK_words)
        M_matrix, M_mapping = M_L.train()
        eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
        precision, recall, F = eval_p.eval()
        print("*** MKII Biased_ii: precision %f, accuracy %f, F %f" % (precision, recall, F))
        eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
        M_auc = eval_a.eval_auc(1)
        print("@@@ MKII Biased_ii AUC:", M_auc)
        print("-----------------------DONE--------------------------------")

    if self.flag == 4:
        for r in range(11):
            r_t = r / 10.0
            if r_t == 0:
                w_dict = Reader.weight(self.path)
                # print(w_dict)
                MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, 0.1)
                MK_G.preprocess_transition_probs(w_dict, 1)
                MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
                MK_words = []
                for walk in MK_walks:
                    MK_words.extend([str(step) for step in walk])
                M_L = Word2Vec.Learn(MK_words)
                M_matrix, M_mapping = M_L.train()
                eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
                precision, recall, F = eval_p.eval()
                print("*** MKII Random: precision %f, accuracy %f, F %f" % (precision, recall, F))
                eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
                M_auc = eval_a.eval_auc(1)
                print("@@@ MKII Random AUC:", M_auc)
                print("-----------------------DONE--------------------------------")
            else:
                w_dict = Reader.weight(self.path)
                # print(w_dict)
                MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, r_t)
                MK_G.preprocess_transition_probs(w_dict, 3)
                MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
                MK_words = []
                for walk in MK_walks:
                    MK_words.extend([str(step) for step in walk])
                M_L = Word2Vec.Learn(MK_words)
                M_matrix, M_mapping = M_L.train()
                eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
                precision, recall, F = eval_p.eval()
                print("*** MKII Biased_ii with %f: precision %f, accuracy %f, F %f" %
                      (r_t, precision, recall, F))
                eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
                M_auc = eval_a.eval_auc(1)
                print("@@@ MKII Biased_ii AUC:", M_auc)

    #### Step 9: CommonNeighbors and Jaccard
    if self.flag == 0 or self.flag == 5:
        p = link_pred.Prediction()
        v_set = p.create_vertex(m_graph.edges())
        matrix_perm = p.create_adjmatrix([edge for edge in itertools.combinations(r_set, 2)], v_set)
        matrix_ori = p.create_adjmatrix(m_graph.edges(), v_set)
        matrix_samp = p.create_adjmatrix(m_graph_sampled.edges(), v_set)

        cn = link_pred.CommonNeighbors()
        score_cn = cn.fit(matrix_ori)
        C_precision, C_recall, C_F = p.acc(score_cn, matrix_ori, matrix_perm, self.e_p)
        print("*** CommonNeighbors: precision %f, accuracy %f, F %f" % (C_precision, C_recall, C_F))
        C_auc = p.auc_score(score_cn, matrix_ori, matrix_samp, "cc")
        print("@@@ CommonNeighbors: AUC %f" % C_auc)

        ja = link_pred.Jaccard()
        score_ja = ja.fit(matrix_ori)
        J_precision, J_recall, J_F = p.acc(score_ja, matrix_ori, matrix_perm, self.e_p)
        print("*** Jaccard: precision %f, accuracy %f, F %f" % (J_precision, J_recall, J_F))
        J_auc = p.auc_score(score_ja, matrix_ori, matrix_samp, "cc")
        print("@@@ Jaccard: AUC %f" % J_auc)
        print("-----------------------DONE--------------------------------")
def featuresExtraction(dataSet):
    # calculate feature 1 - TFIDF
    textDataSet = []
    for line in dataSet:
        textDataSet.append(line[0])
        textDataSet.append(line[1])
    # add synonyms
    newDataSet = Tep.addSynonyms(textDataSet)
    finalDataSet = []
    # stemming
    for line in newDataSet:
        text = mnlp.stemming(line)
        finalDataSet.append(mnlp.convertText(text))
    # get the TF-IDF vector of each sentence
    vector = Tfidf.calculateTFIDF(finalDataSet)
    similarities = []
    """Here we compute the cosine distance between sentence 1 and sentence 2, i.e. between
    each pair of sentences. `vector` holds the TF-IDF vector of every sentence: position 0
    holds sentence 1, position 1 holds sentence 2, position 2 holds sentence 3, and so on.
    So to compute the similarity between sentence 1 and sentence 2 of our dataset we take
    the cosine distance between vector[0] and vector[1].
    That is why the loop below advances in steps of 2 --> range(0, len(vector), 2)
    """
    for i in range(0, len(vector), 2):
        distance = spatial.distance.cosine(vector[i], vector[i + 1])
        similarities.append(1 - distance)

    # compute the other features
    # initialize the word2vec model
    word_vectors, model = Word2Vec.startModel()
    features = []
    # for each row of the csv, compute the similarity using the following methods:
    for x in range(len(dataSet)):
        featuresLine = []
        """Feature 2, computed between column 0 and column 1 of the csv.
        This method comes from a paper in the literature."""
        feature2 = Word2Vec.wordOrderSimilarity(word_vectors, model, dataSet[x][0], dataSet[x][1])
        """Feature 3 is the cosine distance between the vectors of the two sentences, where
        the vector of a sentence is the sum of the embedding vectors of its words."""
        sim2 = Word2Vec.embeddingsSimilarity(model, dataSet[x][0], dataSet[x][1])
        if math.isnan(sim2):
            feature3 = 1.0
        else:
            feature3 = sim2
        """Feature 4 uses a similarity matrix of size
        (number of words in sentence 1) x (number of words in sentence 2).
        This is the method used in my dissertation: word2vec gives the similarity between
        words, and the sentence similarity is obtained from this matrix."""
        sim3 = Word2Vec.calculateSimilarity(word_vectors, model, dataSet[x][0], dataSet[x][1])
        if math.isnan(sim3):
            feature4 = 1.0
        else:
            feature4 = sim3
        """Feature 5 uses the same matrix as the feature above, but instead of word2vec
        similarities between words it uses a binary score: 1 if the words are equal,
        0 otherwise."""
        feature5 = Word2Vec.binarySimilarity(dataSet[x][0], dataSet[x][1])
        """Feature 6 is the length of the shorter sentence divided by the length of the
        longer one."""
        size1 = len(mnlp.tokenize(dataSet[x][0]))
        size2 = len(mnlp.tokenize(dataSet[x][1]))
        if size1 > size2:
            feature6 = size2 / size1
        else:
            feature6 = size1 / size2
        # save a file with the extracted features and the class they belong to
        featuresLine.append(similarities[x])
        featuresLine.append(feature2)
        featuresLine.append(feature3)
        featuresLine.append(feature4)
        featuresLine.append(feature5)
        featuresLine.append(feature6)
        featuresLine.append(dataSet[x][2])  # similarity class
        # print(featuresLine)
        features.append(featuresLine)
        # print the similarity value obtained by combining the features
        similaridade = (0.3 * similarities[x]) + (0.1 * feature2) + (0.2 * feature3) + \
                       (0.2 * feature4) + (0.1 * feature5) + (0.1 * feature6)
        print(similaridade)
def run(self):
    path = self.path
    #### Step 1: reading and sampling graphs
    '''
    m_graph, nx_graphs, total_edges = Reader.multi_readG_with_Merg(path)
    print("%d total nodes" % len(m_graph.nodes()))
    r_list, m_graph_sampled, nx_graphs_sampled = Sampler.multi_sampling_with_Merg(path, self.s_p)
    print("%d edges before sampling, %d edges after sampling. sampled %d " % (len(m_graph.edges()), len(m_graph_sampled.edges()), len(r_list)))
    r_set = set([node for edge in r_list for node in edge])
    '''
    nx_graphs_sampled, _ = Reader.multi_readG(self.path)
    cluster_true = []
    for i in range(29):
        if i < 12:
            cluster_true.append(0)
        else:
            cluster_true.append(1)

    for r in range(11):
        r_t = r / 10.0
        if r_t == 0:
            w_dict = Reader.weight(self.path)
            # print(w_dict)
            MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, 0.1)
            MK_G.preprocess_transition_probs(w_dict, 1)
            MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
            MK_words = []
            for walk in MK_walks:
                MK_words.extend([str(step) for step in walk])
            M_L = Word2Vec.Learn(MK_words)
            M_matrix, M_mapping = M_L.train()
            '''
            eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
            precision, recall, F = eval_p.eval()
            print("*** MKII Biased: precision %f, accuracy %f, F %f" % (precision, recall, F))
            eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
            M_auc = eval_a.eval_auc(1)
            print("@@@ MKII Biased AUC:", M_auc)
            '''
        else:
            w_dict = Reader.weight(self.path)
            # print(w_dict)
            MK_G = Node2Vec_LayerSelect.Graph(nx_graphs_sampled, self.p, self.q, r_t)
            MK_G.preprocess_transition_probs(w_dict, 3)
            MK_walks = MK_G.simulate_walks(self.num_walks, self.walk_length)
            MK_words = []
            for walk in MK_walks:
                MK_words.extend([str(step) for step in walk])
            M_L = Word2Vec.Learn(MK_words)
            M_matrix, M_mapping = M_L.train()
            '''
            eval_p = Evaluator.Precision_Eval(M_matrix, M_mapping, nx_graphs, r_set, self.e_p)
            precision, recall, F = eval_p.eval()
            print("*** MKII Biased_ii with %f: precision %f, accuracy %f, F %f" % (r_t, precision, recall, F))
            eval_a = Evaluator.AUC_Eval(M_matrix, M_mapping, nx_graphs, nx_graphs_sampled)
            M_auc = eval_a.eval_auc(1)
            print("@@@ MKII Biased_ii AUC:", M_auc)
            '''
        cluster_trained = KMeans(n_clusters=2, random_state=0).fit_predict(M_matrix).tolist()
        length = min(len(cluster_true), len(cluster_trained))
        r = normalized_mutual_info_score(cluster_true[0:length], cluster_trained[0:length])
        mi_f = f1_score(cluster_true[0:length], cluster_trained[0:length], average='micro')
        ma_f = f1_score(cluster_true[0:length], cluster_trained[0:length], average='macro')
        print("r is %f: nmi %f, micro_f %f, macro_f %f" % (r_t, r, mi_f, ma_f))
        print("-----------------------DONE--------------------------------")
# Node2Vec
# generate all the walks needed for learning
p = 0.5
q = 0.5
num_walks = 10
walk_length = 80
G = Node2Vec.Graph(New_BFSlist, New_Edgelist, p, q)
G.preprocess_transition_probs()
walks = G.simulate_walks(num_walks, walk_length)
print('walk list size', len(walks))
words = []
for walk in walks:
    words.extend([str(step) for step in walk])
L = Word2Vec.Learn(words)
matrix, mapping = L.train()
percentage, AUC = T.run_test(Removelist, matrix, mapping, BFSlist)
print("the percentage of prediction is %f" % percentage)
print("the AUC of prediction is %f" % AUC)
results_file.write(str(AUC) + '\t' + str(percentage) + '\t')
print("Total time consumed %fs" % (time.time() - start))
# Node2Vec END
# --------------------------------------------------------------------------------- #

# --------------------------------------------------------------------------------- #
# Embedding
# 1. Generate the inputs that MultiNetsEmbedding needs
os.system("python3 generate_facts.py %s" % filename)
# 2. MultiEmbedding
os.system("./Embedding -network_name %s -generate_flag 0" % filename)
def run(self):
    path = self.path
    online_dir = path + "online/"
    online_graphs, _ = Reader.multi_readG(online_dir)
    offline_dir = path + "offline/"
    offline_graphs, _ = Reader.multi_readG(offline_dir)

    ### Step 1: learning with N2V MKII
    if self.flag == 0 or self.flag == 1:
        off_G = Node2Vec_LayerSelect.Graph(offline_graphs, self.p, self.q)
        off_G.preprocess_transition_probs()
        off_walks = off_G.simulate_walks(self.num_walks, self.walk_length)
        off_words = []
        for walk in off_walks:
            off_words.extend([str(step) for step in walk])
        off_L = Word2Vec.Learn(off_words)
        off_matrix, off_mapping = off_L.train()

        on_G = Node2Vec_LayerSelect.Graph(online_graphs, self.p, self.q)
        on_G.preprocess_transition_probs()
        on_walks = on_G.simulate_walks(self.num_walks, self.walk_length)
        on_words = []
        for walk in on_walks:
            on_words.extend([str(step) for step in walk])
        on_L = Word2Vec.Learn(on_words)
        on_matrix, on_mapping = on_L.train()

        off_perm_list = common_nodes(off_mapping, online_graphs)
        off_eval = Evaluator.Precision_Eval(off_matrix, off_mapping, online_graphs, off_perm_list, self.e_p)
        off_precision = off_eval.eval()
        print("*** Off to on MKII precision: ", off_precision)
        off_eval_a = Evaluator.AUC_Eval(off_matrix, off_mapping, online_graphs, offline_graphs)
        off_auc = off_eval_a.eval_auc(0)
        print("@@@ Off to on MKII AUC:", off_auc)

        on_perm_list = common_nodes(on_mapping, offline_graphs)
        on_eval = Evaluator.Precision_Eval(on_matrix, on_mapping, offline_graphs, on_perm_list, self.e_p)
        on_precision = on_eval.eval()
        print("*** On to off MKII precision: ", on_precision)
        on_eval_a = Evaluator.AUC_Eval(on_matrix, on_mapping, offline_graphs, online_graphs)
        on_auc = on_eval_a.eval_auc(0)
        print("@@@ On to off MKII AUC:", on_auc)

    if self.flag == 0 or self.flag == 2:
        on_matrix = {}
        on_mapping = {}
        on_perm_list = []
        for g in online_graphs:
            G = Node2Vec.Graph(g, self.p, self.q)
            G.preprocess_transition_probs()
            walks = G.simulate_walks(self.num_walks, self.walk_length)
            words = []
            for walk in walks:
                words.extend([str(step) for step in walk])
            L = Word2Vec.Learn(words)
            matrix, mapping = L.train()
            on_matrix[g] = matrix
            on_mapping[g] = mapping
            on_perm_list.extend(common_nodes(mapping, offline_graphs))
        on_perm_list = set([node for node in on_perm_list])
        # print(on_perm_list)
        # print(on_mapping)
        eval_p_on = Evaluator.combining_Precision_Eval(on_matrix, on_mapping, offline_graphs, on_perm_list, self.e_p)
        on_precision = eval_p_on.eval()
        print("*** on to off precision: ", on_precision)
        on_eval_a = Evaluator.combining_AUC_Eval(on_matrix, on_mapping, offline_graphs, online_graphs)
        on_auc = on_eval_a.eval_auc(0)
        print("@@@ On to off AUC:", on_auc)

        off_matrix = {}
        off_mapping = {}
        off_perm_list = []
        for g in offline_graphs:
            G = Node2Vec.Graph(g, self.p, self.q)
            G.preprocess_transition_probs()
            walks = G.simulate_walks(self.num_walks, self.walk_length)
            words = []
            for walk in walks:
                words.extend([str(step) for step in walk])
            L = Word2Vec.Learn(words)
            matrix, mapping = L.train()
            off_matrix[g] = matrix
            off_mapping[g] = mapping
            off_perm_list.extend(common_nodes(mapping, online_graphs))
        off_perm_list = set([node for node in off_perm_list])
        eval_p_off = Evaluator.combining_Precision_Eval(off_matrix, off_mapping, online_graphs, off_perm_list, self.e_p)
        off_precision = eval_p_off.eval()
        print("*** off to on precision: ", off_precision)
        off_eval_a = Evaluator.combining_AUC_Eval(off_matrix, off_mapping, online_graphs, offline_graphs)
        off_auc = off_eval_a.eval_auc(0)
        print("@@@ Off to on AUC:", off_auc)
def MainProcedure(sentence):
    tokens = token_String.tokenizer(sentence)
    count = 0
    finalMovieList = {}
    for token in tokens:
        count = count + 1
        if token[1] == "n":
            initialVec = MySqlConn.returnMovieIdFromTag(token[0])
            # print(initialVec)
        else:
            tagsMovieIds = MySqlConn.returnMovieIdFromTag(token[0])
            # print(tagsMovieIds)
            genresMovieIds = MySqlConn.returnMovieIdFromGenre(token[0])
            # print(genresMovieIds)
            MovieIdsTagsAndGenres = mergeArrays(tagsMovieIds, genresMovieIds)
            # print(MovieIdsTagsAndGenres)
            word2vecSynonims = Word2Vec.give_Word2VecSinonims(token[0])
            wordNetSynonims = wordnet.wordNet(token[0])
            synonims = mergeArrays(word2vecSynonims, wordNetSynonims)
            movieIdList = []
            for synonim in synonims:
                movieIdList.extend(MySqlConn.returnMovieIdFromTag(synonim))
            movieIdList = numpy.unique(movieIdList)
            movieIds = mergeArrays(list(movieIdList), list(MovieIdsTagsAndGenres))
            # print(tagsMovieIds)
            if count == 1:
                for movie in movieIds:
                    finalMovieList[movie] = 1
                # print(len(finalMovieList))
            else:
                for movie in movieIds:
                    if movie in finalMovieList:
                        finalMovieList[movie] = finalMovieList[movie] + 1
                    else:
                        finalMovieList[movie] = 1
    dicDeFrec = defaultdict(list)
    for movie in finalMovieList:
        rating = MySqlConn.returnRatingForMovieId(movie)
        dicDeFrec[finalMovieList[movie]].append({movie: rating})
    max_films = 1
    movielist = []
    frec = len(tokens)
    for i in range(len(tokens), 0, -1):
        if dicDeFrec[i] != []:
            for j in dicDeFrec[i]:
                if max_films != 0:
                    movielist.append(j)
                    max_films = max_films - 1
                else:
                    break
    # frec = len(tokens)
    # max_films = 5
    # for movie in finalMovieList:
    #     if max_films != 0:
    #         if finalMovieList[movie] == frec:
    #             rating = MySqlConn.returnRatingForMovieId(movie)
    #             dicDeFrec[finalMovieList[movie]].append({movie: rating})
    #             max_films = max_films - 1
    #     else:
    #         pass
    # print(movielist)
    return movielist
def save_data(line_list, data_path, ignore):
    num_lines = 0
    largest_num = 0
    vector_model = wv.load_model()
    for i in range(len(line_list)):
        if (i + 1) % 50 == 0:
            print("Line " + str(i + 1) + " (" + str(i + 1) + "/" + str(len(line_list)) + ")")
        # Remove irrelevant information
        line_list[i] = u.remove_useless(line_list[i])
        # Handle the label
        label = 0
        if "|" in line_list[i]:
            label = 1
            line_list[i] = line_list[i].replace("|", "")
        else:
            if ignore:
                if random.randint(0, 9) < 3:
                    label = 0
                else:
                    continue
            else:
                label = 0
        # Convert to word vectors
        total_vector = []
        word_list = u.seg2words_long(line_list[i])
        for word in word_list:
            word = word.encode('utf-8')
            vector = wv.get_vector(vector_model, word)  # the model uses utf-8
            if vector == []:
                continue
            total_vector.append(vector)
        # Track the largest number of vectors in any line
        if len(total_vector) > largest_num:
            largest_num = len(total_vector)
        # Skip empty lines
        if total_vector == []:
            continue
        num_lines += 1
        # Write the data
        f = open(data_path, "a")
        f.write(str(label) + "\n")
        for vector in total_vector:
            for num in vector:
                f.write(str(num) + " ")
            f.write("\n")
        f.write("%\n")
        f.close()
    # Prepend the data dimensions as the first two lines, so they can be read back
    # when initializing training
    f = open(data_path, 'r+')
    content = f.read()
    f.seek(0, 0)
    f.write(str(num_lines) + "\n")
    f.write(str(largest_num) + "\n")
    f.write(content)
    f.close()
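# --- Hedged sketch (not part of the original source): reading back the file that
# save_data() writes. The format is inferred from the writer above: two header lines
# (sample count and longest vector sequence), then for each sample a label line, one
# whitespace-separated vector per line, and a '%' terminator. load_data is a name
# chosen for this example only.
def load_data(data_path):
    with open(data_path) as f:
        num_lines = int(f.readline())
        largest_num = int(f.readline())
        labels, samples = [], []
        for _ in range(num_lines):
            labels.append(int(f.readline()))
            vectors = []
            while True:
                line = f.readline().strip()
                if line == "%" or line == "":  # end-of-sample marker written by save_data()
                    break
                vectors.append([float(x) for x in line.split()])
            samples.append(vectors)
    return labels, samples, largest_num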
def main():
    word_vec = w.main()
    Emoji_vec = ex.main()
    print("Concatenating...")
    Concatenated_Vector = JoinVectors(word_vec, Emoji_vec, len(word_vec))
    return Concatenated_Vector
def precision(y, yhat):
    """ Precision for classifier """
    assert (y.shape == yhat.shape)
    return np.sum(y == yhat) * 100.0 / y.size


def softmax_wrapper(features, labels, weights, regularization=0.0):
    cost, grad, _ = softmaxRegression(features, labels, weights, regularization)
    return cost, grad


# Gradient check always comes first
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

dimVectors = 10
C = 5

_, wordVectors0, _ = Word2Vec.load_saved_params()
wordVectors = (wordVectors0[:nWords, :] + wordVectors0[nWords:, :])

# dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
# dummy_features = np.zeros((10, dimVectors))
# dummy_labels = np.zeros((10,), dtype=np.int32)
# for i in xrange(10):
#     words, dummy_labels[i] = dataset.getRandomTrainSentence()
#     dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)
# print "==== Gradient check for softmax regression ===="
# gradcheck_naive(lambda weights: softmaxRegression(dummy_features, dummy_labels, weights, 1.0, nopredictions=True), dummy_weights)
#
# print "\n=== For autograder ==="
# print softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0)

# Try different regularizations and pick the best!
# -*- coding: utf-8 -*-
import sys
sys.path.insert(0, '/Users/davichiar/Documents/ADDAVICHI/Python/Sentimental-Analysis-master/Bidirectional_LSTM')
import os
import tensorflow as tf
import Bi_LSTM
import Word2Vec
import gensim
import numpy as np

W2V = Word2Vec.Word2Vec()

Batch_size = 1
Vector_size = 300
Maxseq_length = 2600
learning_rate = 0.001
lstm_units = 128
num_class = 2
keep_prob = 1.0

X = tf.placeholder(tf.float32, shape=[None, Maxseq_length, Vector_size], name='X')
Y = tf.placeholder(tf.float32, shape=[None, num_class], name='Y')
seq_len = tf.placeholder(tf.int32, shape=[None])
import Helper
import Word2Vec
import CNN

positive, negative = Helper.Partition_Pos_Neg_Data()
sentences = Helper.Get_Sentences(positive, negative)
model = Word2Vec.Do_Word2Vec(sentences)
# print(model.similarity('মেসি', 'নেইমার'))

CNN.Do_CNN(positive, negative, model)

# words = Helper.Sentence2Word(positive[0])
# for w in words:
#     print(w)
import Word2Vec
import gensim
import numpy as np
import pymysql.cursors

# ===========================================
# load data
connection = pymysql.connect(user='******', password='******', database='GRE')
cursor = connection.cursor()
commit = "select * from GRES"
cursor.execute(commit)
Sentences = [each[1] for each in cursor.fetchall()]
Sentences = Word2Vec.cleanText(Sentences)

# ===========================================
# Load model
model_google = gensim.models.Word2Vec.load_word2vec_format(
    '../model/GoogleNews-vectors-negative300.bin', binary=True)
# Word2Vec.Train_Wrod2VEc(Sentences, model_google)

# ===========================================
# Generalize words
n_dim = 300
train_vectors = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in Sentences]
Word2Vec.storeVecs(train_vectors, '../vectors/google_vecs.txt')
# -*- coding: utf-8 -*-
import Word2Vec

# load = ["6CM00079.txt", "6CM00080.txt", "6CM00082.txt", "6CM00083.txt", "6CM00088.txt",
#         "6CM00090.txt", "6CM00092.txt", "6CM00093.txt", "6CM00094.txt", "6CM00095.txt"]
load = ["6CM00080.txt"]

# Instantiate the model and set the vector size
vector_size = 10
# word2vec = Word2Vec.Word2Vec(pos, vector_size)
word2vec = Word2Vec.Word2Vec(vector_size, load)
final_embeddings, datas, count, dictionary, reverse_dictionary = word2vec.output()

# Retrieve similar words
# print(dictionary)
result = word2vec.similarity("군대", 100)
print(result)

# 1. Given a keyword, extract similar words
# 2. Show them among the main keywords
import gensim
import pymysql.cursors
import Word2Vec

# ===========================================
# load data
connection = pymysql.connect(user='******', password='******', database='GRE')
cursor = connection.cursor()
commit = "select * from GRES2"
cursor.execute(commit)
Sentences = [each[1] for each in cursor.fetchall()]
Sentences = Word2Vec.cleanText(Sentences)

# ===========================================
# Load model
model_google = gensim.models.KeyedVectors.load_word2vec_format(
    '../GoogleModel/GoogleNews-vectors-negative300.bin', binary=True)
# Word2Vec.Train_Wrod2VEc(Sentences, model_google)

# ===========================================
# Generalize words
n_dim = 300
train_vectors = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in Sentences]
Word2Vec.storeVecs(train_vectors, '../data for input1/q_vecs.pkl')

commit = "select * from GRES2"
cursor.execute(commit)
Sentences = [each[2] for each in cursor.fetchall()]
Sentences = Word2Vec.cleanText(Sentences)

# Generalize words
train_vectors = [Word2Vec.buildWordVector(model_google, z, n_dim) for z in Sentences]
from Word2Vec import *
import pymongo

db = pymongo.MongoClient().travel.articles


class texts:
    def __iter__(self):
        for t in db.find().limit(30000):
            yield t['words']


# Build and train the model
wv = Word2Vec(texts(), model='cbow', nb_negative=16, shared_softmax=True, epochs=2)
wv.save_model('myvec')  # save to the myvec folder under the current directory

# After training, the model can be reloaded like this
wv = Word2Vec()  # create an empty model
wv.load_model('myvec')  # load the model from the myvec folder under the current directory
def precision(y, yhat):
    """ Precision for classifier """
    assert (y.shape == yhat.shape)
    return np.sum(y == yhat) * 100.0 / y.size


def softmax_wrapper(features, labels, weights, regularization=0.0):
    cost, grad, _ = softmaxRegression(features, labels, weights, regularization)
    return cost, grad


# Gradient check always comes first
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

dimVectors = 10
C = 5

_, wordVectors0, _ = Word2Vec.load_saved_params()
wordVectors = (wordVectors0[:nWords, :] + wordVectors0[nWords:, :])

# dummy_weights = 0.1 * np.random.randn(dimVectors, 5)
# dummy_features = np.zeros((10, dimVectors))
# dummy_labels = np.zeros((10,), dtype=np.int32)
# for i in xrange(10):
#     words, dummy_labels[i] = dataset.getRandomTrainSentence()
#     dummy_features[i, :] = getSentenceFeature(tokens, wordVectors, words)
# print "==== Gradient check for softmax regression ===="
# gradcheck_naive(lambda weights: softmaxRegression(dummy_features, dummy_labels, weights, 1.0, nopredictions=True), dummy_weights)
#
# print "\n=== For autograder ==="
# print softmaxRegression(dummy_features, dummy_labels, dummy_weights, 1.0)

# Try different regularizations and pick the best!
import Word2Vec

#### Testing method
words1 = "Foi um ótimo dia!"
words2 = "Hoje está um dia lindo!"

# initialize the embeddings model
word_vectors, model = Word2Vec.startModel()

similaridadeMatrizWord2vec = Word2Vec.calculateSimilarity(word_vectors, model, words1, words2)
similaridadeVetoresEmbeddings = Word2Vec.embeddingsSimilarity(model, words1, words2)
similaridadeWordOrder = Word2Vec.wordOrderSimilarity(word_vectors, model, words1, words2)
similaridadeBinaria = Word2Vec.binarySimilarity(words1, words2)

print("similaridade Matriz Word2vec = ", similaridadeMatrizWord2vec)
print("similaridade Vetores Embeddings = ", similaridadeVetoresEmbeddings)
print("similaridade Word Order = ", similaridadeWordOrder)
print("similaridade binaria = ", similaridadeBinaria)

"""TESTING WITH THE DATASET"""
dados = open('DadosProcessados.csv', 'r', encoding='utf-8', errors='ignore').read().split('\n')
dataset = []
# store the rows of the dataset in `dataset`
for line in dados:
import gensim
import pymysql.cursors
import Word2Vec
import Doc2Vec

# ===========================================
# Load dictionary
connection = pymysql.connect(user='******', password='******', database='GRE')
cursor = connection.cursor()
commit = "select * from GRES"
cursor.execute(commit)
Sentences = [each[1] for each in cursor.fetchall()]
Dictionary1 = Word2Vec.cleanText(Sentences)
Dictionary2 = Doc2Vec.Preprocessing(Sentences)

# ===========================================
# instantiate our DM and DBOW models
size = 400
model_dm = gensim.models.Doc2Vec(min_count=0, window=10, size=size, sample=1e-3,
                                 negative=5, workers=3)
model_dbow = gensim.models.Doc2Vec(min_count=0, window=10, size=size, sample=1e-3,
                                   negative=5, dm=0, workers=3)
def featuresExtraction(dataSet):
    # calculate feature 1 - TFIDF
    textDataSet = []
    for line in dataSet:
        textDataSet.append(line[3])
        textDataSet.append(line[4])
    # add synonyms
    newDataSet = Tep.addSynonyms(textDataSet)
    finalDataSet = []
    # stemming
    for line in newDataSet:
        text = mnlp.stemming(line)
        finalDataSet.append(mnlp.convertText(text))
    print(dataSet)
    print(newDataSet)
    vector = Tfidf.calculateTFIDF(finalDataSet)
    similarities = []
    for i in range(0, len(vector), 2):
        distance = spatial.distance.cosine(vector[i], vector[i + 1])
        similarities.append(1 - distance)

    # calculate the other features
    word_vectors, model = Word2Vec.startModel()
    features = []
    for x in range(len(dataSet)):
        featuresLine = []
        # calculate feature 2
        feature2 = Word2Vec.wordOrderSimilarity(word_vectors, model, dataSet[x][3], dataSet[x][4])
        # calculate feature 3
        sim2 = Word2Vec.embeddingsSimilarity(model, dataSet[x][3], dataSet[x][4])
        if math.isnan(sim2):
            feature3 = 1.0
        else:
            feature3 = sim2
        # calculate feature 4
        sim3 = Word2Vec.calculateSimilarity(word_vectors, model, dataSet[x][3], dataSet[x][4])
        if math.isnan(sim3):
            feature4 = 1.0
        else:
            feature4 = sim3
        # calculate feature 5
        feature5 = Word2Vec.binarySimilarity(dataSet[x][3], dataSet[x][4])
        # calculate feature 6
        size1 = len(mnlp.tokenize(dataSet[x][3]))
        size2 = len(mnlp.tokenize(dataSet[x][4]))
        if size1 > size2:
            feature6 = size2 / size1
        else:
            feature6 = size1 / size2
        featuresLine.append(similarities[x])
        featuresLine.append(feature2)
        featuresLine.append(feature3)
        featuresLine.append(feature4)
        featuresLine.append(feature5)
        featuresLine.append(feature6)
        featuresLine.append(dataSet[x][2])  # similarity class
        print(featuresLine)
        features.append(featuresLine)
    header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv(
    "/home/vivek/Desktop/Kaggle/Sentiment Analysis/unlabeledTrainData.tsv",
    header=0, delimiter="\t", quoting=3)

model = models = gensim.models.Word2Vec.load('300features_40minwords_10context')

clean_train_reviews = []
num_features = 300
for review in train["review"]:
    clean_train_reviews.append(Word2Vec.review_to_wordlist(review, remove_stopwords=True))
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

print "Creating average feature vecs for test reviews"
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(Word2Vec.review_to_wordlist(review, remove_stopwords=True))
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

# Fit a random forest to the training data, using 100 trees
forest = RandomForestClassifier(n_estimators=100)
print "Fitting a random forest to labeled training data..."
parameters.init()

# Prepare data for training the seq2seq
prepare = DataPreparation()
text = prepare.make_disintegration
sent = prepare.get_sentences(text)
dicc = prepare.get_dictionary(text, stopwords, vocab_size)
data = prepare.get_word_list(sent, stopwords, window_size=Word2Vec_window_size)

print('Corpus properties: \n')
print('\tDictionary with %d words' % (len(dicc['w2i'])))

word_to_vec = Word2Vec(vocab_size, Word2Vec_embedding_dim, Word2Vec_optimizer_step)
x_train, y_train = word_to_vec.training_data(data)
W1, b1 = word_to_vec.train(x_train, y_train)
vocab_vectors = W1 + b1

conversations = []
for i in range(len(sent) - 2):
    if len(sent[i + 1]) != 0 and len(sent[i + 2]) != 0:  # to avoid empty sentences
        conversations.append([sent[i + 1], sent[i + 2]])

# TRAIN THE MODEL
# Initialize all the variables
session = tf.Session()
init_variables = tf.global_variables_initializer()
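# --- Hedged sketch (not part of the original source): inspecting the embeddings
# learned above. It assumes vocab_vectors has one row per vocabulary index and that
# dicc['w2i'] maps each word to its row, which is how the snippet above appears to
# use them; nearest_words is a name chosen for this example only.
import numpy as np

def nearest_words(word, top_n=5):
    w2i = dicc['w2i']
    i2w = {i: w for w, i in w2i.items()}
    vecs = np.asarray(vocab_vectors)
    v = vecs[w2i[word]]
    # cosine similarity of `word` against every row of the embedding matrix
    sims = vecs @ v / (np.linalg.norm(vecs, axis=1) * np.linalg.norm(v) + 1e-8)
    best = np.argsort(-sims)[1:top_n + 1]  # skip the word itself
    return [(i2w[i], float(sims[i])) for i in best]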
import Word2Vec
import gensim
import numpy as np
import pymysql.cursors

# ===========================================
# load data
connection = pymysql.connect(user='******', password='******', database='GRE')
cursor = connection.cursor()
commit = "select * from GRES"
cursor.execute(commit)
Sentences = [each[1] for each in cursor.fetchall()]
Sentences = Word2Vec.cleanText(Sentences)

# ===========================================
# Train model
model_w2v = gensim.models.Word2Vec.load('../model/model_w2v')
Word2Vec.Train_Wrod2VEc(Sentences, model_w2v)

# ===========================================
# Generalize words
n_dim = 300
train_vectors = [Word2Vec.buildWordVector(model_w2v, z, n_dim) for z in Sentences]
Word2Vec.storeVecs(train_vectors, '../model/w2v_vecs.txt')