def vector_training(self, dimension):
    """Train the ``e2v_w2v_sg`` skip-gram model on the segmented corpus.

    Appends every article's two segmented text fields and every movie's
    single field to ``segmentation.txt``, trains, then prints a few
    spot-checks of the resulting embeddings.

    Args:
        dimension: embedding size forwarded to the w2v hyperparameters.
    """
    # presumably w2v wraps a gensim-style word2vec trainer — TODO confirm
    model = w2v()
    model.hyperparameter(dimension=dimension)
    model.train_file_setting("segmentation.txt", "e2v_w2v_sg")

    # Articles contribute two segmented fields each.
    for article in self.articles:
        print(f"article_id:{article[0]}")
        for field in (article[1], article[2]):
            model.write_file(field, append=True)

    # Movies contribute one segmented field each.
    for movie in self.movies:
        print(f"movie_id:{movie[0]}")
        model.write_file(movie[1], append=True)

    model.train()
    model.load_model()

    # Sanity checks on the trained vectors.
    print(model.term_ranking_in_corpus("教師節", 50))
    print(model.term_to_vector("爸爸"))
    print(model.terms_similarity("母親", "母親節"))
    print(1 - model.vectors_similarity(model.term_to_vector("在一起"),
                                       model.term_to_vector("過甜蜜")))
def vector_training(self):
    """Segment article and movie text with jieba and train ``sum_w2v_w2v_sg``.

    The original inlined two byte-identical segmentation loops (articles
    and movies); that logic now lives in :meth:`_write_segments`.
    """
    t = w2v()
    t.train_file_setting("segmentation.txt", "sum_w2v_w2v_sg")

    for article in self.articles:
        print(f"article_id:{article[0]}")
        self._write_segments(t, article[1])

    for movie in self.movies:
        print(f"movie_id:{movie[0]}")
        self._write_segments(t, movie[1])

    t.train()
    t.load_model()

    # Sanity checks on the trained vectors.
    print(t.term_ranking_in_corpus("教師節", 50))
    print(t.term_to_vector("爸爸"))
    print(t.terms_similarity("母親", "母親節"))
    print(1 - t.vectors_similarity(t.term_to_vector("母親"),
                                   t.term_to_vector("母親節")))

def _write_segments(self, t, text):
    """Split *text* on CJK punctuation, segment each sentence with jieba,
    and append every non-stopword token (with a trailing space) to the
    w2v training file; tokens are echoed to stdout as they are written."""
    sentences = re.sub(r'\、|\,|★|\。|\?|\?|\;|\;|\:|\~|\:|\⋯', '\n', text)
    for sentence in sentences.split("\n"):
        if sentence != '':
            for seg in jieba.cut(sentence, cut_all=False):
                if seg not in self.stopwordset and seg != ' ':
                    print(seg, end=' ')
                    t.write_file(seg + " ", append=True)
            print('')
def w2v_algorithm(self, dimension, start, end, rank):
    """Store top-``rank`` similar terms for the most frequent emotion and
    event entities into ``experiment_entity2vec`` rows ``start+1 .. end``.

    Args:
        dimension: embedding size for the loaded model.
        start: id counter base (rows are numbered from ``start + 1``).
        end: id at which each entity pass stops.
        rank: how many similar terms to fetch per entity.
    """
    t = w2v()
    t.hyperparameter(dimension=dimension)
    t.train_file_setting("segmentation.txt", "e2v_w2v_sg")
    t.load_model()

    count = start
    print(len(self.emotion_dic))
    sql = "INSERT INTO experiment_entity2vec (id, emotion_entity, emotion_similarity) VALUES (%s, %s, %s)"
    # Entities are processed in descending frequency order.
    for emotion in sorted(self.emotion_dic.items(), key=lambda x: x[1], reverse=True):
        print(emotion)
        emotion_entity = emotion[0]
        emotion_similarity = ""
        for similarity in t.term_ranking_in_corpus(emotion_entity, rank):
            # NOTE(review): only the term is kept; the similarity score is
            # never appended between ":" and " " — confirm this is intended.
            emotion_similarity += similarity[0] + ":" + " "
        count += 1
        val = (count, emotion_entity, emotion_similarity)
        try:
            self.cursor.execute(sql, val)
        except Exception as err:
            # Was a bare except; keep best-effort behavior but surface the
            # cause instead of swallowing it silently.
            print("Emotion Term Insert Error")
            print(err)
        if count == end:
            print("emotion finish...", end="\n\n")
            count = start  # restart numbering for the event pass
            break

    print(len(self.event_dic))
    sql = "UPDATE experiment_entity2vec SET event_entity=%s, event_similarity=%s WHERE id=%s and event_entity = ''"
    for event in sorted(self.event_dic.items(), key=lambda x: x[1], reverse=True):
        print(event)
        event_entity = event[0]
        event_similarity = ""
        for similarity in t.term_ranking_in_corpus(event_entity, rank):
            event_similarity += similarity[0] + ":" + " "
        count += 1
        val = (event_entity, event_similarity, count)
        self.cursor.execute(sql, val)
        if count == end:
            print("event finish...", end="\n\n")
            break
    self.db.commit()
parser.add_argument( '-iv', help='Index to Vector numpy array mapping integer to vector', default='index_to_vector.npy') args = parser.parse_args() word_embedding_filename = args.iv word_to_embedding_index_filename = args.wi try: word_embedding = np.load(word_embedding_filename) word_to_embedding_index = np.load(word_to_embedding_index_filename).item() except FileNotFoundError: print('Word embedding not found, running word2vec') word2vec.w2v(corpus_filename='./corpus/imdb_train_corpus.txt') embedding_norm = np.linalg.norm(word_embedding, axis=1) embedding_norm.shape = (10000, 1) normalized_word_embedding = word_embedding / embedding_norm m = word_to_embedding_index # Reverse dictionary to look up words from indices embedding_index_to_word = dict(zip(m.values(), m.keys())) root = './aclImdb/test/posneg/' for filename in os.listdir('./ggs_results/diffs/'): rv = rp.review(root + filename[0:-4] + '.txt') diff = np.load('./ggs_results/diffs/' + filename) prob = np.load('./ggs_results/probs/' + filename) print('Filename: ', filename, 'Initial Probability: ', prob[0][0]) if rv.sentiment == 'pos':
def save_vector(self):
    """Insert the summed word2vec vector of every article and movie into
    ``articles_vector`` / ``movies_vector`` (column ``sum_w2v_w2v_sg``).

    The original duplicated the whole segment-and-sum loop for articles
    and movies; the shared logic now lives in
    :meth:`_sum_segment_vectors`.
    """
    t = w2v()
    t.train_file_setting("segmentation.txt", "sum_w2v_w2v_sg")
    t.load_model()
    dimension = t.size

    sql = "INSERT INTO articles_vector (id, sum_w2v_w2v_sg) VALUES (%s, %s)"
    for article in self.articles:
        print(f"article_id:{article[0]}")
        total = self._sum_segment_vectors(t, article[1], dimension)
        val = (article[0], str(list(total)))
        print("sum")
        print(total[:5], end="\n\n")
        self.cursor.execute(sql, val)
        self.db.commit()

    sql = "INSERT INTO movies_vector (id, sum_w2v_w2v_sg) VALUES (%s, %s)"
    for movie in self.movies:
        print(f"movie_id:{movie[0]}")
        total = self._sum_segment_vectors(t, movie[1], dimension)
        val = (movie[0], str(list(total)))
        print("sum")
        print(total[:5], end="\n\n")
        self.cursor.execute(sql, val)
        self.db.commit()

def _sum_segment_vectors(self, t, text, dimension):
    """Return the element-wise sum of word vectors for every non-stopword
    jieba token in *text*; tokens absent from the model are skipped."""
    total = np.zeros(dimension)
    sentences = re.sub(r'\、|\,|★|\。|\?|\?|\;|\;|\:|\~|\:|\⋯', '\n', text)
    for sentence in sentences.split("\n"):
        if sentence != '':
            for seg in jieba.cut(sentence, cut_all=False):
                if seg not in self.stopwordset and seg != ' ':
                    try:
                        seg_vector = t.term_to_vector(seg)
                    except Exception:
                        # Out-of-vocabulary term — skip (was a bare except).
                        continue
                    print(seg)
                    print(seg_vector[:5], end='\n\n')
                    total += seg_vector
            print('')
    return total
def save_vector(self): # w2v setting t = w2v() t.train_file_setting("segmentation.txt", "e2v_w2v_sg") t.load_model() dimension = t.size # Access Articles NER 221269 self.cursor.execute( "SELECT id, emotion, event, person_object, time, location FROM articles_ner Where id >= 1 and id <= 221269" ) articles_ner = self.cursor.fetchall() for article_ner in articles_ner: article_id = article_ner[0] emotion = article_ner[1] event = article_ner[2] person_object = article_ner[3] time = article_ner[4] location = article_ner[5] print("article_id:", end='') print(article_id) relationship_e2v_w2v_sg = [] person_object_count = 0 person_object_add = np.zeros(dimension) for po in person_object.split(" "): if po != "": try: person_object_add += t.term_to_vector(po) person_object_count += 1 except: continue if person_object_count == 0: person_object_count = 1 relationship_e2v_w2v_sg = np.append( relationship_e2v_w2v_sg, person_object_add / person_object_count) scenario_e2v_w2v_sg = [] emotion_count = 0 emotion_add = np.zeros(dimension) for e in emotion.split(" "): if e != "": try: emotion_add += t.term_to_vector(e) emotion_count += 1 except: continue if emotion_count == 0: emotion_count = 1 relationship_e2v_w2v_sg = np.append(relationship_e2v_w2v_sg, emotion_add / emotion_count) scenario_e2v_w2v_sg = np.append(scenario_e2v_w2v_sg, emotion_add / emotion_count) event_count = 0 event_add = np.zeros(dimension) for e in event.split(" "): if e != "": try: event_add += t.term_to_vector(e) event_count += 1 except: continue if event_count == 0: event_count = 1 relationship_e2v_w2v_sg = np.append(relationship_e2v_w2v_sg, event_add / event_count) scenario_e2v_w2v_sg = np.append(scenario_e2v_w2v_sg, event_add / event_count) location_count = 0 location_add = np.zeros(dimension) for l in location.split(" "): if l != "": try: location_add += t.term_to_vector(l) location_count += 1 except: continue if location_count == 0: location_count = 1 relationship_e2v_w2v_sg = 
np.append(relationship_e2v_w2v_sg, location_add / location_count) time_count = 0 time_add = np.zeros(dimension) for ti in time.split(" "): if ti != "": try: time_add += t.term_to_vector(ti) time_count += 1 except: continue if time_count == 0: time_count = 1 relationship_e2v_w2v_sg = np.append(relationship_e2v_w2v_sg, time_add / time_count) sql = "UPDATE articles_vector SET relationship_e2v_w2v_sg=%s, scenario_e2v_w2v_sg=%s WHERE id=%s" val = (str(list(relationship_e2v_w2v_sg)), str(list(scenario_e2v_w2v_sg)), article_id) self.cursor.execute(sql, val) self.db.commit() # Access Movies NER 3722 self.cursor.execute( "SELECT id, emotion, event FROM movies_ner Where id >= 1 and id <= 3722" ) movies_ner = self.cursor.fetchall() for movie_ner in movies_ner: movie_id = movie_ner[0] emotion = movie_ner[1] event = movie_ner[2] print("movie_id:", end='') print(movie_id) scenario_e2v_w2v_sg = [] emotion_count = 0 emotion_add = np.zeros(dimension) for e in emotion.split(" "): if e != "": try: emotion_add += t.term_to_vector(e) emotion_count += 1 except: continue if emotion_count == 0: emotion_count = 1 scenario_e2v_w2v_sg = np.append(scenario_e2v_w2v_sg, emotion_add / emotion_count) event_count = 0 event_add = np.zeros(dimension) for e in event.split(" "): if e != "": try: event_add += t.term_to_vector(e) event_count += 1 except: continue if event_count == 0: event_count = 1 scenario_e2v_w2v_sg = np.append(scenario_e2v_w2v_sg, event_add / event_count) sql = "UPDATE movies_vector SET scenario_e2v_w2v_sg=%s WHERE id=%s" val = (str(list(scenario_e2v_w2v_sg)), movie_id) self.cursor.execute(sql, val) self.db.commit()
def __init__(self):
    """Bootstrap the chat session: load data, set up the topic model and
    database connection, generate a chat id, then greet the user."""
    self.importData()  # presumably loads corpora/config from disk — TODO confirm
    self.topicChooser = w2v()  # word2vec model used to pick conversation topics
    self.conDB()  # open the database connection
    self.chatIDGen()  # assign an identifier for this chat session
    self.cusPrint("hi, what can I help you with?")
"""Train and evaluate a w2v-based tweet sentiment classifier."""
from preprocessing import tokenize, read_preprocessed
import numpy as np
from word2vec import w2v
from sklearn.model_selection import train_test_split

# Toggle between loading cached tokens and re-tokenizing the raw CSVs.
use_preprocessed = True

if use_preprocessed:
    print('Loading preprocessed data...')
    good_tweets = read_preprocessed('good_tweets')
    bad_tweets = read_preprocessed('bad_tweets')
else:
    print('Processing the data...')
    data_path = '../Data/'
    good_tweets = tokenize(data_path + 'good_tweets.csv')
    bad_tweets = tokenize(data_path + 'bad_tweets.csv')

print('Creation of x and y vectors...')
# Good tweets are labeled 0.0, bad tweets 1.0.
x_vector = good_tweets + bad_tweets
y_vector = np.zeros(len(good_tweets)).tolist() + np.ones(len(bad_tweets)).tolist()
x_train, x_test, y_train, y_test = train_test_split(x_vector, y_vector, shuffle=True)

print('Creating the model...')
model = w2v(x_train, y_train)

print('Evaluation of the model...')
model.evaluate(x_test, y_test)