def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
    """
    Load the np2vec model.

    Args:
        np2vec_model_file (str): the file containing the np2vec model to load
        binary (bool): boolean indicating whether the np2vec model to load is in binary format
        word_ngrams (int {1,0}): If 1, the np2vec model to load uses word vectors with subword
            (ngrams) information.

    Returns:
        np2vec model to load
    """
    if word_ngrams == 0:
        return KeyedVectors.load_word2vec_format(
            np2vec_model_file, binary=binary)
    elif word_ngrams == 1:
        return FastText.load(np2vec_model_file)
    else:
        logger.error('invalid value for \'word_ngrams\'')
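# A minimal usage sketch of the two load paths above (not from the original source);
# the file names and the query word are illustrative assumptions.
from gensim.models import FastText, KeyedVectors

# word_ngrams == 0: plain word2vec-format vectors, no subword information.
kv = KeyedVectors.load_word2vec_format('np2vec_word2vec.txt', binary=False)
# word_ngrams == 1: full FastText model with subword (ngram) information.
ft = FastText.load('np2vec_fasttext.model')
print(kv.most_similar('price'))
print(ft.wv.most_similar('price'))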
def get_model():
    model = FastText.load(EMBEDDINGS_FASTTEXT_MODEL_FILE)
    return model
def reload_movie_embedding(train_percent=MOVIEP.train_percent,
                           valid=False,
                           file_path=MOVIEP.movie_data_path,
                           seq_num=MOVIEP.seq_num,
                           embedding_type=EMBEDP.embedding_type,
                           veclen=EMBEDP.veclen,
                           window=EMBEDP.window):
    instance_data = read_pickle(file_path + 'movie_review_sequence_data.pkl', 'r')
    instance_result = read_pickle(
        file_path + 'movie_review_sequence_result.pkl', 'r')
    # word2index = read_pickle(file_path + 'new_word2index.pkl', 'r')
    feature_tensor = np.zeros((len(instance_data), seq_num, veclen))
    if embedding_type == 'embedding':
        model = Word2Vec.load(file_path + 'movie_review_word2vec_' +
                              str(veclen) + '_window' + str(window) + '.model')
    elif embedding_type == 'embedding_skipgram':
        model = Word2Vec.load(file_path + 'movie_review_word2vec__skipgram' +
                              str(veclen) + '_window' + str(window) + '.model')
    elif embedding_type == 'fasttext':
        model = FastText.load(file_path + 'movie_review_fasttext_' +
                              str(veclen) + '_window' + str(window) + '.model')
    elif embedding_type == 'fasttext_skipgram':
        model = FastText.load(file_path + 'movie_review_fasttext__skipgram' +
                              str(veclen) + '_window' + str(window) + '.model')
    elif embedding_type == 'glove':
        # glove2word2vec only converts the GloVe text file and returns counts;
        # the converted vectors still need to be loaded before they can be
        # indexed like the other models.
        glove_converted = (file_path + 'glove' + str(veclen) + '_window' +
                           str(window) + '.model')
        glove2word2vec(
            file_path + 'movie_vectors_w' + str(window) + '_l' + str(veclen) + '.txt',
            glove_converted)
        model = KeyedVectors.load_word2vec_format(glove_converted)
    elif embedding_type in ('lda_sgns', 'sg_add_sgns', 'sg_cancat_sgns'):
        model = get_sgns_embedding('MovieReview')

    for instance_iter, instance in enumerate(instance_data):
        start_index = seq_num - len(instance)
        for seq_iter, seq_data in enumerate(instance):
            word_vec = model[seq_data]
            feature_tensor[instance_iter][seq_iter + start_index] += word_vec

    result_matrix = np.array(instance_result).reshape(
        (len(instance_result), -1))
    train_size = int(feature_tensor.shape[0] * train_percent)
    train_x = feature_tensor[:train_size]
    train_y = result_matrix[:train_size]
    test_x = feature_tensor[train_size:]
    test_y = result_matrix[train_size:]
    if valid:
        # Slice the validation split off the training data *before* truncating
        # it; slicing afterwards would read past the shortened arrays and
        # return empty validation sets.
        new_train_size = int(train_size * train_percent)
        test_x = train_x[new_train_size:]
        test_y = train_y[new_train_size:]
        train_x = train_x[:new_train_size]
        train_y = train_y[:new_train_size]

    if embedding_type in ('sg_add_sgns', 'sg_cancat_sgns'):
        train_x_sg, train_y_sg, test_x_sg, test_y_sg = reload_movie_embedding(
            train_percent=train_percent,
            valid=valid,
            file_path=file_path,
            seq_num=seq_num,
            embedding_type="embedding_skipgram",
            veclen=veclen,
            window=window)
        if embedding_type == 'sg_add_sgns':
            train_x = train_x + train_x_sg
            test_x = test_x + test_x_sg
        if embedding_type == 'sg_cancat_sgns':
            train_x = np.concatenate((train_x, train_x_sg), axis=2)
            test_x = np.concatenate((test_x, test_x_sg), axis=2)
    return train_x, train_y, test_x, test_y
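# A minimal usage sketch of the loader above (not part of the original module);
# the parameter values are illustrative assumptions.
train_x, train_y, test_x, test_y = reload_movie_embedding(
    train_percent=0.8,
    valid=False,
    embedding_type='fasttext',
    veclen=100,
    window=5)
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)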
model_type = sys.argv[3]
model_name = str(num_features) + "features_" + str(
    min_word_count) + "minwords_" + str(context) + "context_len2alldata"
assert model_type in ["word2vec", "fasttext"]
if model_type == "word2vec":
    # Load the trained Word2Vec model.
    model = Word2Vec.load(model_name)
    # Get word vectors for all words in the vocabulary.
    word_vectors = model.wv.vectors
    index2word = model.wv.index2word
elif model_type == "fasttext":
    # Load the trained FastText model.
    model = FastText.load(model_name)
    # Get word vectors for all words in the vocabulary.
    word_vectors = model.wv.vectors
    index2word = model.wv.index2word

all = pd.read_pickle('all.pkl')

# Set number of clusters.
num_clusters = int(sys.argv[2])
idx, idx_proba = cluster_GMM(num_clusters, word_vectors)

# Uncomment the lines below to load saved cluster assignments and cluster
# assignment probabilities instead of re-clustering.
# idx_name = "gmm_latestclusmodel_len2alldata.pkl"
# idx_proba_name = "gmm_prob_latestclusmodel_len2alldata.pkl"
# idx, idx_proba = read_GMM(idx_name, idx_proba_name)
from numpy.linalg import norm
import json
from gensim.models import FastText
import logging
import sys
import torch
import encoder
from torch.autograd import Variable
from sklearn.metrics.pairwise import cosine_similarity

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO

punctuation = '!"#$%&\'()*+,.:;<=>?@[\\]^`{|}~'
table = str.maketrans('', '', punctuation)
dictionary = {}

model = FastText.load('model/entity_fasttext_n100')
wv = model.wv
del model


def load_dictionary(dictionary_file):
    """
    Load the dictionary with article titles mapped to their respective
    abstracts containing annotated text.

    Argument
    --------
    dictionary_file: Input file
    """
    global dictionary
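# A minimal sketch (not from the original file) of querying the retained
# KeyedVectors after the full model has been deleted; the query term is an
# illustrative assumption.
vector = wv['protein']                      # look up a single entity vector
neighbours = wv.most_similar('protein', topn=5)
print(neighbours)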
import os
from gensim.models import Word2Vec
from gensim.models import FastText

# EXP_HOME = "F:/MyWorks/Thesis Works/Crowdsource_Knowledge_Base/DeepGenQR/experiment"
EXP_HOME = "C:/My MSc/ThesisWorks/BigData_Code_Search/DeepGenQR/experiment"
model_file = EXP_HOME + '/pymodel/tomcat-fasttext-model'
model = FastText.load(model_file)

word_file = EXP_HOME + '/w2vec-data/words.txt'
vec_file = EXP_HOME + '/w2vec-data/tomcat-vector.txt'

vec_lines = list()
words = open(word_file, 'r')
for word in words:
    try:
        if model.wv.__contains__(word.strip()):
            vector = model.wv[word.strip()]
            line = word.strip() + " " + ' '.join(str(x) for x in vector)
            vec_lines.append(line)
    except IOError:
        print("Could not find " + word)
        pass

output_file = open(vec_file, 'w')
for content in vec_lines:
    output_file.write("%s\n" % content)
output_file.close()
import json
import pickle
import random

from tqdm import tqdm
from gensim.models import FastText

# Required data and models
val_data = json.load(open("data/val.json", "rb"))  # the data to predict
with open("tag_name.list", "rb") as f:
    tag_list = pickle.load(f)  # list of tags
with open("music_tag.dic", "rb") as f:
    music_tag = pickle.load(f)  # tags grouped by track
with open("tag_music_freq.dic", "rb") as f:
    tag_music_freq = pickle.load(f)
fasttext = FastText.load("FastText.model")

# List that holds the predictions. Meeting the 10-tag / 100-track requirement
# is handled separately later; for now we just fill it in.
results = []


# def type1_presager(data, ...):
def type1_vote(frequency):
    for tag, freq in tags.items():
        if freq == frequency:
            answer_tags.append(tag)
        else:
            pass
import pandas as pd
import pickle
import re
import string
import nltk
import pymysql
from flask import Flask, render_template
from CobaVectorizer import MeanEmbeddingVectorizer
import gensim
from gensim.models import FastText

app = Flask(__name__, static_folder='static',
            template_folder='templates')  # Initialize the Flask app
model = pickle.load(
    open('model_rf_byu200_02TS_Normal.pkl', 'rb'))  # You can replace this with your own Random Forest model or similar
loc = "FastTextModels/saved_model_gensim200SG_BYU.bin"  # You can replace this with your own pre-trained model
model_ft = FastText.load(loc)
connection = pymysql.connect(host='localhost',
                             user='******',
                             password='',
                             database='sentimen')
count = 0


@app.route('/')
def home():
    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    '''
tweet_words = preprocess(bipolar_data['text'])
del df

# ===========================================================================
# getting fasttext vectors
# model = FastText(tweet_words, size=100, window=3, min_count=1)
model = FastText(size=100, window=3, min_count=1)
# First vocabulary scan for a fresh model, so update must stay False;
# update=True would fail here because there is no existing vocabulary to extend.
model.build_vocab(tweet_words)
model.train(tweet_words, total_examples=model.corpus_count, epochs=10)

# ===========================================================================
# save and load model
from gensim.test.utils import get_tmpfile

fname = get_tmpfile("fasttext.model")
model.save(fname)
model = FastText.load(fname)

# ===========================================================================
# calculate the document vector as the average of all the word vectors
index2word_set = set(model.wv.index2word)


def avg_feature_vector(words, model, num_features, word_set):
    '''calculates the average vector'''
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[word])
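# A hedged sketch (not from the original script) of how averaged document
# vectors like the one above are typically compared; the token lists are
# illustrative assumptions and cosine similarity is computed directly with numpy.
import numpy as np


def doc_vector(tokens, wv, num_features=100):
    vec = np.zeros((num_features,), dtype='float32')
    n = 0
    for tok in tokens:
        if tok in wv.vocab:
            vec += wv[tok]
            n += 1
    return vec / max(n, 1)


vec_a = doc_vector(['feeling', 'happy', 'today'], model.wv)
vec_b = doc_vector(['feeling', 'sad', 'today'], model.wv)
sim = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-9)
print(sim)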
        tokens = tokenizer.tokenize(text)
        # if row.sentiment.lower() == 'neutral':
        #     pred_selected_text = row.selected_text
        jaccards.append(jaccard(row.selected_text, pred_selected_text))
        pred_selected_texts.append(pred_selected_text)
        text_tokens.append(tokens)

    df['jaccard'] = jaccards
    df['pred_selected_text'] = pred_selected_texts
    df['text_tokens'] = text_tokens
    if pred_file is not None:
        df.to_csv(Path(f'{Config.pred_dir}/{pred_file}'), index=False)
    return float(np.mean(jaccards))


__roberta_tokenizer = ByteLevelBPETokenizer(
    vocab_file=str(Config.Roberta.vocab_file),
    merges_file=str(Config.Roberta.merges_file),
    add_prefix_space=True,
    lowercase=True)
__bert_tokenizer = BertWordPieceTokenizer(
    vocab_file=str(Config.Bert.vocab_file))
__xlnet_tokenizer = XLNetTokenizer(vocab_file=str(Config.XLNet.vocab_file),
                                   do_lower_case=False)
__albert_tokenizer = AlbertTokenizer(vocab_file=str(Config.Albert.vocab_file),
                                     do_lower_case=True)
__ft_embeddings = FastText.load(str(Config.ft_embeddings_path))
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 29 21:46:40 2018

@author: tianyu
"""
import os
import numpy as np
import pandas as pd

# os.chdir('/home/tiw15008/cleanfiles/fasttextmodel/')
from gensim.models import FastText

model = FastText.load('fasttext0928')
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
model = FastText(sentences, min_count=1)
wv = model.wv

W = np.memmap("fastembed.dat", dtype=np.double, mode="r", shape=(424107, 300))
f = open("fastembed_clean.vocab", encoding='utf-8')
vocab_list = map(lambda x: eval(x.strip()), f.readlines())
vocab_dict = {w: k for k, w in enumerate(vocab_list)}

data = pd.read_table("data_gename.txt", index_col=0, delim_whitespace=True)
gene = data.index.values.tolist()
common = [word for word in gene if word.lower() in vocab_dict]  # words in the dict
print("=" * 10) print(summay_text) print("=" * 10) nn_word_list = ut.kakao_postagger_nn_finder(summay_text) print(nn_word_list) fasttext_data.append(nn_word_list) fastText_model = ut.fastText(fasttext_data) print('fastText_model similar list') # similar = model.most_similar(positive=['윤호', '하이킥'], topn=10) # [('순재', 0.9943705797195435), ('거침없이', 0.9900286197662354), ('그에게', 0.9879124164581299), ('중매역활', 0.9861310720443726), ('하거나', 0.9786599278450012), ('자이젠', 0.9707398414611816), ('민정은', 0.9691370725631714), ('프리실라', 0.9552605152130127), ('시온', 0.954103946685791), ('타바사', 0.9522706866264343)] # similar = model.most_similar(positive=['카파도키아', '아르메니아', '기원전'], topn=10) # [('에우메네스', 0.9945849180221558), ('페르디카스로부터', 0.9932612180709839), ('공격하', 0.9814687967300415), ('받아', 0.9809004068374634), ('알케타스', 0.9726078510284424), ('321년', 0.97102952003479), ('마족', 0.9679989814758301), ('영웅전', 0.9672538638114929), ('것이다', 0.9660188555717468), ('에린이', 0.9653569459915161)] similar = fastText_model.most_similar(positive=['삼성'], negative=['제로페이'], topn=10) print(similar) ut.plt_show(fastText_model, img_name='fasttext.png') """ Just run `python w2v_visualizer.py word2vec.model visualize_result` """ word2vec_model = FastText.load("./fastText.model") ut.visualize(word2vec_model, "./fastText_log")
from os import path, removedirs, remove, mkdir
from gensim.models import Word2Vec, FastText
from time import time

directory = "/home/zack/Desktop/Hons Project/program/models/results/"
topn = 10
start = time()
words = []

w2v = Word2Vec.load(
    "/home/zack/Desktop/Hons Project/program/models/w2v/w2v_twitteronly.model")
ft = FastText.load(
    "/home/zack/Desktop/Hons Project/program/models/fasttext/ft_twitteronly.model"
)
print("models loaded successfully!")

if path.exists(directory + "w2v_results.txt"):
    remove(directory + "w2v_results.txt")
w2v_outfile = open(directory + "w2v_results.txt", "a+")
if path.exists(directory + "ft_results.txt"):
    remove(directory + "ft_results.txt")
ft_outfile = open(directory + "ft_results.txt", "a+")

with open("/home/zack/Desktop/Hons Project/program/models/testwords.txt",
          "r") as f:
    for line in f:
        words.append(line.strip())
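# A hedged sketch (not in the original excerpt) of the comparison step this
# script appears to build towards: querying both models for each test word and
# writing the neighbours to the result files opened above.
for word in words:
    try:
        w2v_outfile.write("%s: %s\n" % (word, w2v.wv.most_similar(word, topn=topn)))
    except KeyError:
        w2v_outfile.write("%s: not in vocabulary\n" % word)
    ft_outfile.write("%s: %s\n" % (word, ft.wv.most_similar(word, topn=topn)))

w2v_outfile.close()
ft_outfile.close()
print("finished in", time() - start, "seconds")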
print("appending line " + str(i)) i += 1 print('Data input complete.') # 训练数据 model = FastText(lines, size=dim, min_count=3, iter=5) model.save('testModel.model') # 保存为model格式 model.wv.save_word2vec_format('testModelVec.vector', binary=False) # 保存为vector '''训练模型(如模型已存在无需重复加载训练)''' #remove_char(write_file='clean_data.txt', read_file='data_train.txt') #train('clean_data.txt', 200) '''模型加载与测试''' # 载入模型 model = FastText.load('testModel.model') word1 = '电影' word2 = '电视剧' word3 = '跑步' # 获取词向量 print(word1 + ' 的词向量为:') print(model.wv[word1]) # 求最相似词语 print('和 ' + word1 + ' 最相似的词语为:') print(model.most_similar(word1)) # 求相似度 print(word1 + ' 和 ' + word2 + ' 的相似度为:') print(model.wv.similarity(word1, word2)) print(word3 + ' 和 ' + word2 + ' 的相似度为:') print(model.wv.similarity(word3, word2))
class sentence2vec(object):
    # Make matplotlib render Chinese characters correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    REAL = np.float32
    # Pre-computed normalization constant Z
    Z = 0.
    app = Flask(__name__)
    path = os.path.join(app.static_folder, Config.WordsModelFile)
    if Config.ModelMethod == 'Word2Vec':
        model = Word2Vec.load(path)
    elif Config.ModelMethod == 'FastText':
        model = FastText.load(path)
    else:
        model = Word2Vec.load(path)  # load the model

    @staticmethod
    def init():
        # Pre-compute the normalization constant Z
        sentence2vec.Z = sentence2vec.normalization_constant_Z()
        pass

    @staticmethod
    def timefn(fn):
        """Decorator that reports how long summary generation took."""
        @wraps(fn)
        def measure_time(*args, **kwargs):
            t1 = time.time()
            result = fn(*args, **kwargs)
            t2 = time.time()
            print("@timefn:" + fn.__name__ + " 生成摘要时间: " +
                  str('%.2f' % (np.float32(t2 - t1))) + " 秒")
            return result
        return measure_time

    @staticmethod
    def normalization_constant_Z():
        '''Compute the normalization constant Z.'''
        vlookup = sentence2vec.model.wv.vocab
        Z = 0
        for k in vlookup:
            Z += vlookup[k].count
        return Z

    @staticmethod
    def sif_embeddings(sentences, model, alpha=1e-3):
        """Compute SIF (smooth inverse frequency) sentence embeddings.

        Parameters
        ----------
        sentences : list
            Sentences or documents to embed.
        model : Word2Vec or FastText model
            A gensim model containing word vectors and a vocabulary.
        alpha : float, optional
            Parameter used to weight each word by its probability p(w).

        Returns
        -------
        numpy.ndarray
            SIF sentence embedding matrix, len(sentences) * dimension.
        """
        vlookup = model.wv.vocab  # vocabulary lookup
        vectors = model.wv  # access to the word vectors
        size = model.vector_size  # word vector dimensionality
        output = []
        # Iterate over all sentences
        for s in sentences:
            count = 0
            v = np.zeros(size, dtype=sentence2vec.REAL)  # sentence vector
            # Iterate over all words
            for w in s:
                # The word must be in the vocabulary
                if w in vlookup:
                    for i in range(size):
                        # Smooth inverse frequency, SIF
                        v[i] += (alpha / (alpha +
                                          (vlookup[w].count / sentence2vec.Z))
                                 ) * vectors[w][i]
                    count += 1
            if count > 0:
                for i in range(size):
                    v[i] *= 1 / count
            output.append(v)
        return np.vstack(output).astype(sentence2vec.REAL)

    @staticmethod
    def cut(text):
        '''Tokenization helper.'''
        return ' '.join(jieba.cut(text))

    @staticmethod
    def split_sentences(text):
        '''Sentence splitting helper.'''
        sents = []
        text = re.sub(r'\n+', '。', text)  # turn newlines into full stops (headline paragraphs without punctuation)
        text = re.sub('([。!?\?])([^’”])', r'\1\n\2', text)  # ordinary sentence-ending punctuation not followed by a quote
        text = re.sub('(\.{6})([^’”])', r'\1\n\2', text)  # English ellipsis not followed by a quote
        text = re.sub('(\…{2})([^’”])', r'\1\n\2', text)  # Chinese ellipsis not followed by a quote
        text = re.sub('([.。!?\?\.{6}\…{2}][’”])([^’”])', r'\1\n\2', text)  # sentence-ending punctuation plus quote, not followed by another quote
        text = text.replace(u'。。', u'。')  # remove redundant full stops
        text = text.replace(u'?。', u'。')
        text = text.replace(u'!。', u'。')  # remove redundant full stops
        text = text.replace(u'\n', u'').replace(u'\r', u'')  # remove stray \r\n
        text = text.replace(u'\u3000', u'')
        text = text.replace(u'\\n', u'')
        text = text.replace(u'点击图片', u'')
        text = text.replace(u'进入下一页', u'')
        # sentences = re.split(r'。|!|?|】|;', text)  # split into sentences
        sentences = re.split('。|!|\!|\.|?|\?', text)  # split into sentences
        # sentences = re.split(r'[。,?!:]', text)  # split into sentences
        sentences = sentences[:-1]  # drop the empty sentence after the final full stop
        for sent in sentences:
            len_sent = len(sent)
            if len_sent < 4:  # drop line breaks, single characters, etc.
                continue
            # sent = sent.decode('utf8')
            sent = sent.strip(' ')
            sent = sent.lstrip('【')
            sent = sent.lstrip('】')
            sents.append(sent)
        return sents

    @staticmethod
    def knn_smooth(arr):
        '''KNN smoothing of the cosine distances.'''
        result = []
        if len(arr) > 3:
            result = []
            for i in range(len(arr)):
                a = 0
                # First sentence: average the distances of the first and second sentences
                if i < 1:
                    a = ((arr[i] + arr[i + 1]) / 2)
                    result.append(a)
                # Middle sentences: average the previous, current and next distances
                elif i < len(arr) - 1:
                    a = ((arr[i] + arr[i - 1] + arr[i + 1]) / 3)
                    result.append(a)
                # Last sentence: average the last and previous distances
                else:
                    a = ((arr[i] + arr[i - 1]) / 2)
                    result.append(a)
        else:
            result = arr
        return result

    @staticmethod
    def get_plot(x1, x2, top_n):
        plt.figure(figsize=(12, 8))
        plt.plot(x1[:top_n],
                 linestyle='-.', marker='o', color='r', alpha=0.5, label='平滑前')
        plt.plot(x2[:top_n],
                 linestyle='-.', marker='o', color='g', alpha=0.5, label='平滑后')
        plt.title('K N N连续句子相关性的平滑')
        plt.xlabel('句子编号')
        plt.ylabel('余弦距离(数值越小,句子越重要)')
        plt.grid(linestyle='-.', alpha=0.7)
        plt.legend()
        for i, j in zip(np.arange(len(x1[:top_n])), x1[:top_n]):
            plt.text(i, j + 0.002, '%.3f' % j, color='r', alpha=0.7)
        for i, j in zip(np.arange(len(x2[:top_n])), x2[:top_n]):
            plt.text(i, j + 0.002, '%.3f' % j, color='g', alpha=0.7)

    @staticmethod
    def get_sen_doc_cosine(text, title, top_n=10, plot=True):
        '''Cosine distances between each sentence vector and the document vector.'''
        # Handle list input
        if isinstance(text, list):
            text = ' '.join(text)
        # Split the document into sentences
        split_sens = sentence2vec.split_sentences(text)
        # Vectorize the document
        doc_vec = sentence2vec.sif_embeddings([text], sentence2vec.model,
                                              alpha=1e-3)
        # Empty dict for sentence/document cosine distances
        sen_doc_cosine = {}
        # Compute each sentence vector and store its cosine distance
        for sen in split_sens:
            sen_vec = sentence2vec.sif_embeddings([sen], sentence2vec.model,
                                                  alpha=1e-3)
            # Cosine distance between sentence and document vectors
            sen_doc_cosine[sen] = cosine(sen_vec, doc_vec)
        # Key/value lists for the sentence/document cosine dict
        sen_doc_cosine_keys, sen_doc_cosine_values = [], []
        # Collect the sentences and their cosine distances in order
        for i, j in sen_doc_cosine.items():
            sen_doc_cosine_keys.append(i)
            sen_doc_cosine_values.append(j)
        # Before smoothing: convert the sentence/document list to an array
        knn_before_cosine_values = np.array(sen_doc_cosine_values)
        # Apply knn_smooth to obtain the smoothed cosine distances
        knn_after_cosine_values = np.array(
            sentence2vec.knn_smooth(sen_doc_cosine_values))
        # Dict for the smoothed cosine distances
        knn_cosine_score = {}
        # Pair each sentence with its smoothed cosine distance
        knn_cosine_score = dict(
            zip(sen_doc_cosine_keys, knn_after_cosine_values))
        # Plot the cosine distances before and after smoothing
        if plot:
            sentence2vec.get_plot(knn_before_cosine_values,
                                  knn_after_cosine_values, top_n)
        # Return the (sentence, smoothed cosine distance) pairs sorted by distance, ascending
        return sorted(knn_cosine_score.items(), key=lambda x: x[1],
                      reverse=False)

    # When a title is provided
    @staticmethod
    def get_sen_doc_title_cosine(text, title, weight=0.5, top_n=10, plot=True):
        '''Cosine distances combining sentence/document and sentence/title vectors.'''
        # Handle list input
        if isinstance(text, list):
            text = ' '.join(text)
        # Split the document into sentences
        split_sens = sentence2vec.split_sentences(text)
        # Vectorize the document and the title
        doc_vec = sentence2vec.sif_embeddings([text], sentence2vec.model,
                                              alpha=1e-3)
        title_vec = sentence2vec.sif_embeddings([title], sentence2vec.model,
                                                alpha=1e-3)
        # Empty dicts for sentence/document and sentence/title cosine distances
        sen_doc_cosine, sen_title_cosine = {}, {}
        # Compute each sentence vector and store its cosine distances
        for sen in split_sens:
            sen_vec = sentence2vec.sif_embeddings([sen], sentence2vec.model,
                                                  alpha=1e-3)
            # Cosine distance between sentence and document vectors
            sen_doc_cosine[sen] = cosine(sen_vec, doc_vec)
            # Cosine distance between sentence and title vectors
            sen_title_cosine[sen] = cosine(sen_vec, title_vec)
        # Key/value lists for the sentence/document cosine dict
        sen_doc_cosine_keys, sen_doc_cosine_values = [], []
        # Collect the sentences and their sentence/document distances in order
        for i, j in sen_doc_cosine.items():
            sen_doc_cosine_keys.append(i)
            sen_doc_cosine_values.append(j)
        # Key/value lists for the sentence/title cosine dict
        sen_title_cosine_keys, sen_title_cosine_values = [], []
        # Collect the sentences and their sentence/title distances in order
        for i, j in sen_title_cosine.items():
            sen_title_cosine_keys.append(i)
            sen_title_cosine_values.append(j)
        # Before smoothing: weight * (sentence/document) + (1 - weight) * (sentence/title)
        knn_before_cosine_values = np.array(
            sen_doc_cosine_values) * weight + np.array(
                sen_title_cosine_values) * (1 - weight)
        # Apply knn_smooth to obtain the smoothed cosine distances
        knn_after_cosine_values = np.array(
            sentence2vec.knn_smooth(sen_doc_cosine_values)
        ) * weight + np.array(
            sentence2vec.knn_smooth(sen_title_cosine_values)) * (1 - weight)
        # Dict for the smoothed cosine distances
        knn_cosine_score = {}
        # Pair each sentence with its smoothed cosine distance
        knn_cosine_score = dict(
            zip(sen_doc_cosine_keys, knn_after_cosine_values))
        # Plot the cosine distances before and after smoothing
        if plot:
            sentence2vec.get_plot(knn_before_cosine_values,
                                  knn_after_cosine_values, top_n)
        # Return the (sentence, smoothed cosine distance) pairs sorted by distance, ascending
        return sorted(knn_cosine_score.items(), key=lambda x: x[1],
                      reverse=False)

    @staticmethod
    def get_summarize(text, title, weight=0.5, top_n=10, plot=False):
        '''Generate a summary; by default keep the top 10 sentences.'''
        # Split into sentences
        split_sens = sentence2vec.split_sentences(text)
        # Ranked pairs: key is the sentence, value is its cosine distance to the document vector
        if title == '':
            ranking_sentences = sentence2vec.get_sen_doc_cosine(text, title,
                                                                top_n=top_n,
                                                                plot=plot)
        else:
            ranking_sentences = sentence2vec.get_sen_doc_title_cosine(
                text, title, weight=weight, top_n=top_n, plot=plot)
        # Set of selected sentences
        selected_sen = set()
        if len(split_sens) > top_n:
            # Take the top_n sentences
            for sen, _ in ranking_sentences[:top_n]:
                selected_sen.add(sen)
        else:
            for sen, _ in ranking_sentences:
                selected_sen.add(sen)
        # List holding the summary
        summarize = []
        # Keep the selected sentences in their original order
        for sen in split_sens:
            if sen in selected_sen:
                summarize.append(sen + '。')
        summarize = ' '.join(summarize)
        return summarize
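# A minimal usage sketch for the class above (not part of the original module);
# the sample text and title are illustrative assumptions and the word model is
# whatever Config points at.
if __name__ == '__main__':
    sentence2vec.init()  # pre-compute the normalization constant Z once
    sample_text = sentence2vec.cut('今天上午,某公司发布了新的产品。该产品主要面向海外市场。分析人士认为,这一发布将带来新的增长点。')
    sample_title = sentence2vec.cut('某公司发布新产品')
    print(sentence2vec.get_summarize(sample_text, sample_title,
                                     weight=0.5, top_n=2, plot=False))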
for company in range(len(com_list)):
    for model_use in model_list:
        for per in percentage:
            per = int(per)
            article = pd.read_excel(
                f'../{fo}/All_File/Final_Clean_Article.xlsx')
            article_center = pd.read_excel(
                f'../{fo}/All_File/{com_list[company]}_intro.xlsx')
            all_article = article['內容'].tolist()
            news_time = article['時間'].tolist()
            all_article_center = article_center['Com_intro'].tolist()

            if model_use == 'fastText_stock':
                model = FastText.load(
                    '../Word_Embedding_model/{}.model'.format(model_use))
            else:
                model = Word2Vec.load(
                    '../Word_Embedding_model/{}.model'.format(model_use))
            print('Using Model : ',
                  '../Word_Embedding_model/{}.model'.format(model_use))

            score = []  # for cosine scores
            Article_vector = []  # for article vectors
            Article_extract = []  # for target articles
            Article_vector_extract = []  # for target article vectors
            Article_time_extract = []  # for target article times
            for y in all_article_center:
                y = y.split(' ')
                tmp_storage_y = []
def fasttext():
    path = 'models/fasttext/fasttext.bin'
    model = FastText.load(path)
    return 'FastText', model
# Reference: https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne/code
font_name = matplotlib.font_manager.FontProperties(
    fname="C:/Windows/Fonts/.ttf"  # put the path to a Korean font here
).get_name()
matplotlib.rc('font', family=font_name)

modelPath = "/Users/lemon/Desktop/multi-class-text-classification-cnn-master_combine/trained_model_1526302044/fastText.vec"
# C:\Users\lemon\Desktop\multi-class-text-classification-cnn-master_combine\trained_model_1526302044
# C:\Users\lemon\Desktop\multi-class-text-classification-cnn-master_combine\trained_model_1526302044/word2Vec.vec
# C:/Users/lemon/Desktop/multi-class-text-classification-cnn-master_combine/trained_model_1526302044/word2Vec.vec
# /Users/lemon/Desktop/multi-class-text-classification-cnn-master_combine/trained_model_1526302044/word2Vec.vec

# model = g.Doc2Vec.load(modelPath)
model = FastText.load(modelPath)
vocab = list(model.wv.vocab)
X = model[vocab]
# X = model[model.wv.vocab]

tsne = TSNE(n_components=2)
# X_tsne = tsne.fit_transform(X[:1000, :])
X_tsne = tsne.fit_transform(X)

df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])
for word, pos in df.iterrows():
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from gensim.models import FastText
from get_feature import *

# List of all feature vectors
total_feature_list = list()
# List of all labels
label_list = list()

with open("label.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
    for line in lines:
        label = line.strip()
        label_list.append(label)

model = FastText.load("AImed.model")

with open("further corpus.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()
    for line in lines:
        sentence2list = line.strip().split()
        temp = list()
        initial_vector = np.zeros(20)
        # initial_pos_vector = np.zeros(33)
        # Vector for entity e1
        v1 = model.wv['entityone']
        for item in v1:
            temp.append(item)
        # Sentence vector
        for word in sentence2list:
            vector = model.wv[word]
            initial_vector += vector
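# A hedged sketch (not in the original excerpt) of how the collected feature
# vectors and labels are presumably consumed, using the train_test_split and
# classification_report imports at the top of the file; the choice of
# RandomForestClassifier is an assumption.
from sklearn.ensemble import RandomForestClassifier

X = np.array(total_feature_list)
y = np.array(label_list)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))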
# characters
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print("Number of Characters: ", n_chars)

tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}

# Char Key:char -> Value:token_index
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0

words_fast = FastText.load('model_fast30/model_fast.model')  # load pretrained word embedding

embedding_matrix = np.ones((len(word2idx), 100), dtype='float32')
embedding_matrix[0] = np.zeros(100, dtype='float32')

# with open('wiki-news-300d-1M.vec') as f:
for i in range(2, len(idx2word) - 2):
    embedding_matrix[i] = words_fast.wv[idx2word[i]]
    # ordered_words_ft.append(s[0])
print('Found %s word vectors.' % len(embedding_matrix))

# for word, i in word2idx.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector
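# A hedged sketch (not from the original notebook) of how a matrix like
# embedding_matrix is typically plugged into a Keras Embedding layer; the
# sequence length and trainable flag are illustrative assumptions.
from keras.layers import Embedding

word_embedding_layer = Embedding(
    input_dim=len(word2idx),     # vocabulary size, matching embedding_matrix rows
    output_dim=100,              # FastText vector size used above
    weights=[embedding_matrix],  # initialize with the pretrained vectors
    input_length=50,             # assumed maximum sentence length
    trainable=False,             # keep the pretrained vectors fixed
    mask_zero=True)              # index 0 is the PAD token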
def load_fast_text(file):
    return FastText.load(file)
default="cosinus") parser.add_argument("-s", "--savePath", help="Path where to save model to", required=True) args = parser.parse_args() print("Load data..") # data = readFile(args.filename, columns=args.filenameColumns, sep=args.filenameDelimiter) data = pd.read_parquet(args.filename, columns=["id", "text", "user_name"], engine="pyarrow") print("Load word embeddings..") model_ft = FastText.load(args.wordEmbedding) print("Preprocess data..") data["text_vec"] = data[args.dataColumnName].map( lambda tweet: tweet_vectorizer(preprocess_tweet(tweet), model_ft)) data["prep"] = data[args.dataColumnName].map( lambda tweet: preprocess_tweet(tweet)) Nclusters = [10, 20, 30] scores = [] for N in Nclusters: res = kmeans(data, N, args.maxIterations, distance=args.distance, vectorColumn="text_vec")
def trained_metric(exp_id=0, n_jobs=1, freqs=(80, 100), window=3,
                   emb_model='ft'):
    train_docs, test_docs = split_wiki9_articles(exp_id)
    save_dir = FLAGS.save_dir
    model_name = 'wiki9_{}_{}.model'.format(emb_model, FLAGS.exp_id)
    model_path = os.path.join(save_dir, model_name)

    if emb_model == 'ft':
        model = FastText.load(model_path)
    elif emb_model == 'w2v':
        model = Word2Vec.load(model_path)
    elif emb_model == 'glove':
        model = load_glove_model(model_path)
    elif emb_model == 'tfw2v':
        model = load_tf_embedding(FLAGS.exp_id, save_dir=save_dir,
                                  epoch=FLAGS.epoch,
                                  noise_multiplier=FLAGS.noise_multiplier,
                                  l2_norm_clip=FLAGS.l2_norm_clip,
                                  microbatches=FLAGS.microbatches)
    else:
        raise ValueError('No such embedding model: {}'.format(emb_model))

    word_vectors = model.wv.vectors
    word_emb = tf.convert_to_tensor(word_vectors)

    metric_model = LinearMetricModel(word_vectors.shape[1])
    optimizer = tf.train.AdamOptimizer(5e-4)

    inputs_a = tf.placeholder(tf.int64, (None, ), name="inputs_a")
    inputs_b = tf.placeholder(tf.int64, (None, ), name="inputs_b")
    labels = tf.placeholder(tf.float32, (None, ), name="labels")

    embs_a = tf.nn.embedding_lookup(word_emb, inputs_a)
    embs_b = tf.nn.embedding_lookup(word_emb, inputs_b)
    logits = metric_model.forward(embs_a, embs_b)

    if FLAGS.metric == 'cosine':
        embs_a = tf.nn.l2_normalize(embs_a, axis=1)
        embs_b = tf.nn.l2_normalize(embs_b, axis=1)
    dot = tf.reduce_sum(tf.multiply(embs_a, embs_b), axis=1)

    # loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
    loss = tf.keras.losses.hinge(labels, logits)
    loss = tf.reduce_mean(loss)

    t_vars = tf.trainable_variables()
    grads_and_vars = optimizer.compute_gradients(loss, t_vars)
    train_ops = optimizer.apply_gradients(
        grads_and_vars, global_step=tf.train.get_or_create_global_step())

    vocab_size = len(model.wv.vocab)
    thresh = (int(vocab_size * freqs[0] / 100),
              int(vocab_size * freqs[1] / 100))

    print("Loading contexts for membership inference")
    if n_jobs > 1:
        member_job_ctxs = Parallel(n_jobs)(
            delayed(get_all_contexts)(ds, model, thresh, window)
            for ds in split_docs(train_docs, n_jobs))
        nonmember_job_ctxs = Parallel(n_jobs)(
            delayed(get_all_contexts)(ds, model, thresh, window)
            for ds in split_docs(test_docs, n_jobs))
        member_ctxs = [
            ctxs for job_ctxs in member_job_ctxs for ctxs in job_ctxs
        ]
        nonmember_ctxs = [
            ctxs for job_ctxs in nonmember_job_ctxs for ctxs in job_ctxs
        ]
    else:
        member_ctxs = get_all_contexts(train_docs, model, thresh, window)
        nonmember_ctxs = get_all_contexts(test_docs, model, thresh, window)

    print("Loaded {} member and {} nonmember".format(len(member_ctxs),
                                                     len(nonmember_ctxs)))
    membership_labels = np.concatenate(
        [np.ones(len(member_ctxs)), np.zeros(len(nonmember_ctxs))])
    train_ctxs, test_ctxs, train_labels, test_labels = train_test_split(
        member_ctxs + nonmember_ctxs, membership_labels,
        random_state=12345, train_size=FLAGS.train_size,
        stratify=membership_labels)

    def flatten_ctxs(ctxs, labels):
        flat_ctxs, flat_labels = [], []
        for doc_ctx, doc_label in zip(ctxs, labels):
            flat_ctxs += doc_ctx
            flat_labels.append(np.ones(len(doc_ctx)) * doc_label)
        return flat_ctxs, np.concatenate(flat_labels)

    train_ctxs, train_labels = flatten_ctxs(train_ctxs, train_labels)
    test_ctxs, test_labels = flatten_ctxs(test_ctxs, test_labels)

    train_y = []
    for ctxs, label in zip(train_ctxs, train_labels):
        train_y.append(np.ones(len(ctxs)) * label)
    train_y = np.concatenate(train_y).astype(np.float32)
    train_x = np.vstack(train_ctxs)

    def collect_scores(ctxs, labels, sess, baseline=False):
        stacked_ctxs = np.vstack(ctxs)
        stacked_scores = []
        for batch_idx in iterate_minibatches_indices(len(stacked_ctxs),
                                                     batch_size=1024,
                                                     shuffle=False):
            feed = {
                inputs_a: stacked_ctxs[batch_idx][:, 0],
                inputs_b: stacked_ctxs[batch_idx][:, 1]
            }
            scores = sess.run(dot if baseline else logits, feed_dict=feed)
            stacked_scores.append(scores)
        stacked_scores = np.concatenate(stacked_scores)

        member_metrics, nonmember_metrics = [], []
        start_idx = 0
        for ctx, label in zip(ctxs, labels):
            scores = stacked_scores[start_idx:start_idx + len(ctx)]
            start_idx += len(ctx)
            if label == 1:
                member_metrics.append(scores)
            else:
                nonmember_metrics.append(scores)
        return member_metrics, nonmember_metrics

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())
        test_member_metrics, test_nonmember_metrics = collect_scores(
            test_ctxs, test_labels, sess, True)
        compute_adversarial_advantage(
            [np.mean(m) for m in test_member_metrics],
            [np.mean(m) for m in test_nonmember_metrics])

        print('Training attack model with {} data...'.format(len(train_x)))
        for epoch in range(30):
            iterations = 0
            train_loss = 0
            for batch_idx in iterate_minibatches_indices(len(train_y),
                                                         batch_size=512,
                                                         shuffle=True):
                feed = {
                    inputs_a: train_x[batch_idx][:, 0],
                    inputs_b: train_x[batch_idx][:, 1],
                    labels: train_y[batch_idx]
                }
                err, _ = sess.run([loss, train_ops], feed_dict=feed)
                train_loss += err
                iterations += 1
            print("Epoch: {}, Loss: {:.4f}".format(epoch,
                                                   train_loss / iterations))

        test_member_metrics, test_nonmember_metrics = collect_scores(
            test_ctxs, test_labels, sess)
        compute_adversarial_advantage(
            [np.mean(m) for m in test_member_metrics],
            [np.mean(m) for m in test_nonmember_metrics])
def load_FastText_yelp(path):
    print('loading FastText yelp...')
    ft_model = FastText.load(path)
    return ft_model
    y_embed = [f'{i}_y' for i in range(100)]
    embed = embed_df[x_embed].values + embed_df[y_embed].values
    embed = l2norm(embed)

    # save embedding vector
    with open(f'{args.savedir}/{pre_embedname}.pickle', 'wb') as f:
        pickle.dump(embed, f)

else:
    print('[{0:15s}] Evaluation'.format('STATE'))
    # configuration
    # - feature selection
    show_features = ['category', 'brand', 'nb_reviews', 'vol_price', 'product']
    # - load embed model
    model = FastText.load(f'{args.savedir}/{modelname}.bin')
    # - filtering class
    filtering = Filtering(show_features)

    # 1. load data
    data, products, info = load(reviewpath, productpath, infopath)

    # 2. preprocess new sentence
    # test_text = GP.fit([args.search], args.wordpath, args.pospath)
    test_text = list(map(GP.stopword, [args.search]))
    test_text = GP.spacefix(test_text)
    print('[{0:15s}] result : {1:}'.format('PREPROCESSING', test_text))
    test_sent_vec = GP.sent2vec(test_text, model)
    test_sent_vec = l2norm(test_sent_vec)

    # 3. calculate similarity: cosine distance
# In[1]:
import pickle
from keras.models import load_model
import keras

# In[2]:
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from nltk import sent_tokenize, word_tokenize
from keras import backend as k

# word vector load
from gensim.models import FastText

model = FastText.load('fasttext_model')
fasttext = model.wv


# In[3]:
def pred1(model, sentence):
    _dtype = k.floatx()
    sentence_token = []
    sentence_token += word_tokenize(sentence)
    sentence_vec = []
    sentence_vec.append([fasttext[v] for v in sentence_token])
    padd = sequence.pad_sequences(sentence_vec, maxlen=45, dtype=_dtype)
    intent = ans1(model.predict(padd)[0])
TIME_ZONE = 'Asia/Seoul'  # 'UTC'

USE_I18N = True
USE_L10N = True
USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/3.1/howto/static-files/
STATIC_URL = '/static/'

# MODEL = FastText.load(r'C:\Users\NA\Desktop\Workspace\GJAI_WarmingUpProject\AIJOA_Project\wiki.ko\wiki_ko_v3.model')
MODEL = FastText.load(
    r'C:\Users\HAN\Desktop\WarmingUpProject\AIJOA_Project\wiki.ko\wiki_ko_v3.model'
)
MENULIST = {
    '폴더버거 핫치킨': [
        '골드버거 치킨', '오늘도 봐봐 치킨', '오늘도 보고 와 치킨', '불도 먹었어 치킨', '골드버거 핫치킨',
        '골드버거 치킨', '월드 보고 아침에', '오늘도 보고 와 치킨', '폴더 버거 킹', '홀더 버거 치킨',
        '뭘 더 먹어 치킨', '너 먹어 치킨', '뭐 먹어 치킨'
    ],
    '폴더버거 비프': [
        '골드버그 비프', '올더 버거 비프', '폴더 버거 비프', '골드버그 비프 세트', '올더 버거 비프 세트',
        '어디서 먹어 핑크색', '물 더 먹어 비트 세트', '골드버그 비프 세트', '올 더 버거 비틀 세트',
        '홀더 버거 비프', '뭘 더 먹어 비프', '너 먹어 피프 세트', '뭐 먹어 비프'
    ],
    '리아미라클버거': ['리아미라클버거', '미아 미라클버거', '리아미라클버거 세트', '미라클버거 세트', '리아 미라클 버거 세트'],