          encoding='utf_8') as t:
    with open(path, 'r', encoding='utf_8') as f:
        lines = csv.reader(f, delimiter='\t')
        for line in lines:
            target = line[2]
            content = line[1]
            t.write(content + '\t' + '_label_' + target + '\n')

# Train the model
classifier = ff.train_supervised(
    r'data\Chinese\Chinese fasttext data\seg_train', label='_label_')
# Save the model
classifier.save_model(
    r'data\Chinese\Chinese fasttext data\fastText_model1')

# Load the model
classifier = ff.load_model(
    r'data\Chinese\Chinese fasttext data\fastText_model1')

# Test the model
correct = 0
total_count = 0
with open(r'data\Chinese\Chinese fasttext data\seg_test', 'r',
          encoding='utf_8') as t:
    lines = t.readlines()
    total_count = len(lines)
    print(total_count)
    for line in lines:
        txt = line.split('\t')[0]  # Split the line on the dataset's field separator
        txt = txt.strip('\n')      # Strip the trailing newline
        predict = classifier.predict(txt)
        if predict[0][0] == line.split('\t')[1].strip('\n'):
            correct += 1
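# A hedged alternative to the manual accuracy loop above: fastText's built-in
# test() reports precision/recall at k directly. It expects a file in fastText
# format (label prefix and text on each line), which the tab-separated seg_test
# file above is not, so the path below is an assumption for illustration.
n, p_at_1, r_at_1 = classifier.test(
    r'data\Chinese\Chinese fasttext data\fasttext_format_test')
print('N=%d P@1=%.3f R@1=%.3f' % (n, p_at_1, r_at_1))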
                                  thread=1, loss="softmax")
print_results(*model.test(test_data))
model.save_model(r"C:\Users\RY\Desktop\wikizhfasttext.bin")

model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
print_results(*model.test(valid_data))
model.save_model(r"C:\Users\RY\Desktop\wikizhfasttext.ftz")

# Load the trained model
from fastText import FastText

model_path = r"C:\Users\RY\Desktop\wikizhfasttext.bin"
model = FastText.load_model(model_path)

# help(model)
# 1. model.get_dimension()      Get the dimension (size) of a lookup vector (hidden layer).
# 2. model.get_input_matrix()   Get a copy of the full input matrix of a Model. This only
#                               works if the model is not quantized.
# 3. model.get_input_vector()   Given an index, get the corresponding vector of the Input Matrix.
# 4. model.get_labels()         Get the entire list of labels of the dictionary, optionally
#                               including the frequency of the individual labels. Unsupervised
#                               models use words as labels, which is why get_labels
#                               will call and return get_words for this type of model.
# 5. model.get_line()           Split a line of text into words and labels. Labels must start with
#                               the prefix used to create the model (__label__ by default).
# 6. model.get_output_matrix()  Get a copy of the full output matrix of a Model. This only
#                               works if the model is not quantized.
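# A small sketch exercising a few of the getters documented above on the
# loaded model (the sample line is illustrative, not from the dataset):
print(model.get_dimension())     # size of the hidden/lookup vectors
print(model.get_labels()[:5])    # first few labels of the dictionary
words, labels = model.get_line("__label__positive 这 部 电影 很 好")
print(words, labels)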
print("You entered:", str_data)
outstr = ''

import jieba
import re

jieba.load_userdict(os.path.join(filename, "dictionary.txt"))  # Load the user dictionary
stopwords_file = os.path.join(filename, "stop_words.txt")
stop_words = []
with open(stopwords_file, "r", encoding='utf-8') as stop_f:
    for line in stop_f:
        line = line.strip()
        if not line:
            continue
        stop_words.append(line)

str_data = re.sub(r'[A-Za-z0-9]+', '', str_data)  # Remove English letters and digits
str_data = jieba.cut(str_data, cut_all=False, HMM=True)
for word in str_data:
    if word not in stop_words and word != '\t':
        outstr += word
        outstr += " "

classifier = ff.load_model(os.path.join(filename, 'model/model1.model'))
test = classifier.predict(outstr, k=1, threshold=0.5)
print("Predicted label for the product:", str(test[0]).replace("__label__", ''))
print("Probability of that label:", test[1])
def run(self, save_directory=None):
    """
    :param save_directory: filepath where we save the text embedding
    :type save_directory: str
    :return: A 3d-array matrix storing the text embedding. The matrix is of shape
        (nb_sentences, max_sentence_length, embedding_size)
    """
    if self.verbose:
        print("Loading FastText model...")
    model = FastText.load_model(MODEL_PATH)
    drug_embedding_path = utils.get_drug_embedding_path()
    sentences_list = []
    drug_embeddings = None  # stays None if no drug embedding file is used
    if self.drug_description_embedding:
        try:
            with open(drug_embedding_path, 'rb') as f:
                drug_embeddings = pickle.load(f)
        except FileNotFoundError:
            self.drug_description_embedding = False
            print('Drugs will be embedded as "médicament".')

    empty_indices = []  # indices of sentences whose embedding came out empty
    for i, sentence in tqdm(enumerate(self.sentences),
                            desc='Embedding words for each sentence...',
                            disable=not self.verbose,
                            total=len(self.sentences)):
        sentence_embedding = []
        sentence = sentence.lower()
        splits = FastText.tokenize(sentence)
        for word in splits:
            # Skipping non-words
            if not re.match(r'\w+', word) or word in self.stop_words:
                continue
            # Getting rid of the apostrophe and taking the following word
            apos_split = word.split("'")
            if len(apos_split) == 2:
                _, word = apos_split
                if not word:
                    continue
            # Dealing with the drug name
            if unidecode(word) in self.drug_names_set:
                # TODO: try something more complex
                if self.drug_description_embedding and drug_embeddings:
                    try:
                        emb_w = drug_embeddings[word]
                        sentence_embedding.append(emb_w)
                        continue
                    except KeyError:
                        word = DRUG_REPLACEMENT
                else:
                    word = DRUG_REPLACEMENT
            # Correcting words
            if self.do_correction and not params.FR_DICT.check(word):
                suggestions = params.FR_DICT.suggest(word)
                if suggestions:
                    word = suggestions[0]
            # Embedding
            sentence_embedding.append(model.get_word_vector(word))
        if sentence_embedding:
            sentences_list.append(sentence_embedding)
        else:
            print("Warning: Found an empty sentence embedding. Ignoring.")
            empty_indices.append(i)

    # Delete labels of empty sentences in reverse order so earlier deletions
    # do not shift the indices of the remaining ones
    for i in reversed(empty_indices):
        del self.y[i]

    # # Updating max_sentence_length
    # sentence_length = len(sentence_embedding)
    # if sentence_length > max_sentence_length:
    #     max_sentence_length = sentence_length
    #
    # # Padding sentence matrices with 0 vectors
    # text_embedding = []
    # for sentence_embedding in sentences_list:
    #     sentence_length = len(sentence_embedding)
    #     sentence_embedding.extend([np.zeros((utils.get_embedding_dim(),))] \
    #         * (max_sentence_length - sentence_length))
    #     text_embedding.append(sentence_embedding)
    #
    # # Deleting list of sentences
    # del sentences_list

    embeddings = np.array(sentences_list)
    print("\nSaving text embedding of shape %s" % str(embeddings.shape))
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, self.y, test_size=0.2, random_state=42)
    if save_directory:
        np.save(utils.get_X_train_path(save_directory), X_train)
        np.save(utils.get_X_test_path(save_directory), X_test)
        np.save(utils.get_y_train_path(save_directory), y_train)
        np.save(utils.get_y_test_path(save_directory), y_test)
    return embeddings
import numpy as np
import torch
import os

from textCNN_chinese.textcnn import get_wordlists
import fastText.FastText as ff
from textCNN_chinese.textcnn.test import parse_net_result
from textCNN_chinese.textcnn.model import textCNN
from textCNN_chinese.textcnn import sen2inds

classifier = ff.load_model(r"fasttext_save\fastText_model")
testCsvFile = r"data\Chinese\Chinese raw data\re_seg_test.csv"
testFile = r'textCNN_chinese\model_save\test.txt'
testDataVecFile = r'textCNN_chinese\model_save\testdata_vec.txt'

word2ind, ind2word = get_wordlists.get_worddict()
label_w2n, label_n2w = sen2inds.read_labelFile(r'textCNN_chinese\model_save\label.txt')

textCNN_param = {
    'vocab_size': len(word2ind),
    'embed_dim': 50,
    'class_num': len(label_w2n),
    'kernel_num': 20,
    'kernel_size': [3, 4, 5],
    'dropout': 0.5,
}

datas = open(testFile, 'r', encoding='utf_8').read().split('\n')
datas = list(filter(None, datas))

net = textCNN(textCNN_param)
weightFile = r'textCNN_chinese\model_save\19071915_model_iter_99_loss_3.03.pkl'
d = []
sum_test = 0  # Count the number of lines in the test set
with open(test_path, 'r', encoding="utf-8") as f_test:
    for line_test in f_test:
        sum_test += 1
for d_count in range(0, sum_test):
    d.append({})

# Load the models and predict labels for the test file
for count in range(0, 1):  # Adjust according to the classifiers trained in the previous step
    prepare_list = globals()
    if __name__ == '__main__':
        for i in range(1, 6):  # Load five models at a time
            prepare_list['classisier' + str(i)] = ff.load_model(
                "data/bootstarp/model/model" + str(count * 5 + i) + ".model")
        line_nu = 0
        with open(test_product, "r", encoding="utf-8") as f_test:
            for line in f_test:
                line = line.strip()
                label_scores = d[line_nu]
                for i in range(1, 6):  # Query all five loaded models
                    str_test = prepare_list['classisier' + str(i)].predict(line, k=5, threshold=0.0)
                    for j in range(0, 5):
                        # print(str_test[0][j])
                        label_scores.setdefault(str_test[0][j], 0)
                        label_scores[str_test[0][j]] += str_test[1][j]
                d[line_nu] = label_scores
                line_nu += 1

for count_update in d:  # Combine all models' scores to pick the final label
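# Assumed completion of the truncated loop above (not the original code):
# one plausible rule is that the label with the highest summed probability
# across all models wins.
    best_label = max(count_update, key=count_update.get) if count_update else None
    print(best_label)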
                    required=True,
                    help='root directory of output')
parser.add_argument('--no_cuda', action='store_true',
                    help='do not use cuda')
parser.add_argument('--fusing_method', type=str, default='',
                    help='fusing_method')
args = parser.parse_args()

if not args.no_cuda and not torch.cuda.is_available():
    print('Warning: cuda is not available on this machine.')
    args.no_cuda = True

if __name__ == '__main__':
    print('Loading a pretrained fastText model...')
    word_embedding = FastText.load_model(args.fasttext_model)

    print('Loading a pretrained model...')
    txt_encoder = VisualSemanticEmbedding(args.embed_ndim)
    txt_encoder.load_state_dict(torch.load(args.text_embedding_model))
    txt_encoder = txt_encoder.txt_encoder

    G = Generator(use_vgg=args.use_vgg, fusing=args.fusing_method)
    G.load_state_dict(torch.load(args.generator_model))
    G.train(False)

    if not args.no_cuda:
        txt_encoder.cuda()
        G.cuda()
from torch.autograd import Variable
from dmn_atis.DmnModel import DMN
from dmn_atis.DmnLoader import (label_to_index, word_to_index, flatten,
                                ATIS_data_load, prepare_sequence,
                                pad_to_fact, getBatch, pad_to_batch)
import random
import numpy as np

random.seed(1024)
np.random.seed(1024)
torch.manual_seed(1024)

HIDDEN_SIZE = 80
BATCH_SIZE = 64
NUM_EPISODE = 3

classifier = ff.load_model(r"D:\VScode\BERTProject-master\fasttext_save\fastText_re_cut word_cross_validation_model0")
testCsvFile = r"D:\VScode\BERTProject-master\data\cross validation\re_cut word_cross_validation_test0.csv"
testFile = r"textCNN_chinese\model_save\re_cut word_cross_validation_test0.txt"
testDataVecFile = r'textCNN_chinese\model_save\re_cut word_cross_validation_testvec0.txt'

train_data = ATIS_data_load(
    r"D:\VScode\BERTProject-master\data\cross validation\re_cut word_cross_validation_train0.csv",
    encoding="utf-8")
fact, q, a = list(zip(*train_data))

vocab = ['在', '哪', '里', '办', '理']
for lis in fact:
    for seq in lis:
        for word in seq:
            if word not in vocab:
                vocab.append(word)

word2index, index2word = word_to_index(vocab)
labels = []
def load_fasttext(model_path):
    classifier = ff.load_model(model_path)
    return classifier
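# Usage sketch for load_fasttext() above; the model path is a placeholder,
# not a file shipped with this code:
clf = load_fasttext("path/to/fastText_model.bin")
print(clf.predict("pre-segmented input text", k=3))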
device_id = cfg["device_id"]
torch.cuda.set_device(device_id)

if cfg["dataset"] == 'conala':
    save_path = save_path + "_conala"
elif cfg["dataset"] == 'codesearchnet':
    pass
else:
    print("Wrong Dataset Entered")
    exit()

print(f"Model = {cfg['model']}")

# Loading word embeddings
if use_bin:
    import fastText.FastText as ft
    ft_anno_vec = ft.load_model('conala/ft_models/anno_model.bin')
    ft_code_vec = ft.load_model('conala/ft_models/code_model.bin')
else:
    from keras.preprocessing import text, sequence


def prepare_sequence(seq, seq_len, to_ix):
    idxs_list = []
    for seq_elem in seq:
        idxs = []
        for w in seq_elem.split():
            try:
                idxs.append(to_ix[w])
            except KeyError:
                continue
def fast_text_predict(model_file, txt):
    classifier = ff.load_model(model_file)  # Load the trained model
    pre = classifier.predict(txt, k=10)     # Top-10 predictions for the given text
    return pre
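# Usage sketch for the helper above (the model path is illustrative):
labels, probs = fast_text_predict("models/fastText_model.bin", "i like apple")
print(labels[0], probs[0])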
    # Remove stopwords and stem
    ps = PorterStemmer()
    filtered_words = [word for word in words
                      if word not in stopwords.words('english')]
    stemmed_words = [ps.stem(word) for word in filtered_words]
    lower_words = [word.lower() for word in stemmed_words]
    text = ' '.join(stemmed_words)
    return text


# Load the model
print("Loading the model")
model = fasttext.load_model(trained_model)
print("Model loaded")

title_row = ['rectype', 'sha', 'ins_del_count', 'issueid', 'actor', 'date',
             'text', 'similarity']


# Gets the aggregated word vector representation of the text
def GetTextVector(text):
    final_vector = np.zeros((300,))
    words = text.split(' ')
    for word in words:
        if len(word) > CHAR_THRESHOLD:
            continue
        word_vector = model.get_word_vector(word)
def predict(text):
    model = fasttext.load_model('models/fastText/fasttext_train.model.bin')
    text = extractWords(text)
    word_list = " ".join(jieba.cut(text))
    label_predict = model.predict(word_list)
    return list(label_predict[0])  # predict returns a tuple; return the labels as a list
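# Usage sketch for predict() above; the input string and the printed label
# are illustrative only:
print(predict("这个手机外观很好看"))  # e.g. ['__label__positive']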
def load_model_to_test():
    """
    Load the trained models and evaluate them on the test set
    :return:
    """
    # Load the test data
    correct_labels = []
    texts = []
    with open("fasttext.test", "r", encoding="utf-8") as ft_test:
        for line in ft_test:
            correct_labels.append(line.strip().split(" , ")[0])
            texts.append(line.strip().split(" , ")[1])
    # Load the classification models
    for w in range(1, 2):
        all_marco_precision = []
        all_marco_recall = []
        all_marco_f1 = []
        all_micro_precision = []
        all_micro_recall = []
        all_micro_f1 = []
        for epoch in range(5, 51):
            classifier = ff.load_model("Model/model_w" + str(w) + "_e" + str(epoch))
            print("Model/model_w" + str(w) + "_e" + str(epoch))
            # predict() on a list of texts returns a list of label lists;
            # unwrap the top label for each text
            predict_labels = [p[0] for p in classifier.predict(texts)[0]]
            # Evaluate the predictions
            evaluation_parameters = []
            label_to_name = load_label_name_map()[0]
            for label, name in label_to_name.items():
                # Reset the counters for each label so counts do not leak
                # across classes
                true_positive = 0
                false_positive = 0
                false_negative = 0
                evaluate_p = {}
                evaluate_p["name"] = name
                evaluate_p["nexample"] = len(texts)
                for k in range(len(texts)):
                    # Predicted in this class, actually in this class
                    if predict_labels[k] == label and correct_labels[k] == label:
                        true_positive += 1
                    # Predicted in this class, actually not in this class
                    elif predict_labels[k] == label and correct_labels[k] != label:
                        false_positive += 1
                    # Not predicted in this class, actually in this class
                    elif predict_labels[k] != label and correct_labels[k] == label:
                        false_negative += 1
                evaluate_p["true_positive"] = true_positive
                evaluate_p["false_positive"] = false_positive
                evaluate_p["false_negative"] = false_negative
                # Compute precision, recall and F1
                precision = true_positive / (true_positive + false_positive)
                evaluate_p["precision"] = precision
                recall = true_positive / (true_positive + false_negative)
                evaluate_p["recall"] = recall
                f1 = 2 * precision * recall / (precision + recall)
                evaluate_p["f1"] = f1
                evaluation_parameters.append(evaluate_p)
                # print("Results for label %s:" % name)
                # print("test size: %d\tprecision: %f\trecall: %f\tF1: %f" % (len(texts), precision, recall, f1))
            # Compute the macro and micro averages
            sum_precision = 0
            sum_recall = 0
            sum_true_positive = 0
            sum_false_positive = 0
            sum_false_negative = 0
            for p in evaluation_parameters:
                sum_precision += p["precision"]
                sum_recall += p["recall"]
                sum_true_positive += p["true_positive"]
                sum_false_positive += p["false_positive"]
                sum_false_negative += p["false_negative"]
            n = len(evaluation_parameters)
            marco_precision = sum_precision / n
            all_marco_precision.append(marco_precision)
            marco_recall = sum_recall / n
            all_marco_recall.append(marco_recall)
            marco_f1 = 2 * marco_precision * marco_recall / (marco_precision + marco_recall)
            all_marco_f1.append(marco_f1)
            print("Macro average -- test size: %d\tprecision: %f\trecall: %f\tF1: %f"
                  % (len(texts), marco_precision, marco_recall, marco_f1))
            micro_true_positive = sum_true_positive / n
            micro_false_positive = sum_false_positive / n
            micro_false_negative = sum_false_negative / n
            micro_precision = micro_true_positive / (micro_true_positive + micro_false_positive)
            all_micro_precision.append(micro_precision)
            micro_recall = micro_true_positive / (micro_true_positive + micro_false_negative)
            all_micro_recall.append(micro_recall)
            micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
            all_micro_f1.append(micro_f1)
            print("Micro average -- test size: %d\tprecision: %f\trecall: %f\tF1: %f"
                  % (len(texts), micro_precision, micro_recall, micro_f1))

        names = list(range(5, 51))
        ax1 = plt.subplot(311)
        plt.plot(names, all_marco_precision, label='macro-P')
        plt.plot(names, all_micro_precision, label='micro-P')
        plt.legend(loc='upper left')
        ax2 = plt.subplot(312, sharey=ax1)
        plt.plot(names, all_marco_recall, label='macro-R')
        plt.plot(names, all_micro_recall, label='micro-R')
        plt.legend(loc='upper left')
        plt.subplot(313, sharey=ax1)
        plt.plot(names, all_marco_f1, label='macro-F1')
        plt.plot(names, all_micro_f1, label='micro-F1')
        plt.legend(loc='upper left')
        plt.xlabel(u"training epochs (ngram=" + str(w) + ")")
        plt.savefig("./ngram" + str(w) + ".png")
        plt.show()
import re
from fastText import FastText

classifier = FastText.load_model("./fastText")

with open("./tmp_test_content", "r") as f:
    lines = f.readlines()

lines = list(map(lambda x: x.rstrip("\n"), lines))
labels, probs = classifier.predict(lines)
labels = list(map(lambda x: re.sub("__label__", "", x[0]), labels))
print(labels)
probs = list(map(lambda x: str(x[0]), list(probs)))
res = list(zip(labels, probs))
res = list(map(lambda x: "\t".join(x), res))
res = list(zip(lines, res))
res = list(map(lambda x: "\t ".join(x), res))
res = "\n".join(res)

with open("res3", "w") as f:
    f.write(res)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(lambda: max_idf)
        for w, i in tfidf.vocabulary_.items():
            self.word2weight[w] = tfidf.idf_[i]

    def transform(self, X):
        return np.array([
            np.mean([self.fasttext_model.get_word_vector(w) * self.word2weight[w]
                     for w in words] or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


if __name__ == '__main__':
    descriptions_df = pnd.read_csv(utils.get_drugs_indication_path(), encoding=params.UTF_8)
    fasttext_model = FastText.load_model(MODEL_PATH)
    tfidf_emb = TfidfEmbeddingVectorizer(fasttext_model, utils.get_embedding_dim())
    tfidf_emb.fit(descriptions_df.descriptions)
    embeddings = tfidf_emb.transform(descriptions_df.descriptions)
    embeddings_dict = {}
    for i in range(descriptions_df.shape[0]):
        embeddings_dict[descriptions_df.loc[i, 'drug_names']] = embeddings[i]
    with open(utils.get_drug_embedding_path(), 'wb') as f:
        pickle.dump(embeddings_dict, f)
from fastText import FastText

ft = FastText.load_model("lid.176.ftz")
text = 'OpenAI has moved most of its staff to a for-profit LLC. RIP "open-source". RIP "non-profit".'
labels = ft.predict(text)
print(labels)
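# The lid.176.ftz language-identification model can also return several
# candidate languages at once; a small sketch using the model loaded above:
labels, probs = ft.predict(text, k=3)  # top-3 language guesses
for label, prob in zip(labels, probs):
    print(label.replace("__label__", ""), round(float(prob), 3))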
from fastText import FastText
from flask import Flask, request
import re
from loadFromJSONString import loadFromJSONString
from beautifyMessage import beautifyMessage

app = Flask(__name__)
classifier = FastText.load_model('slack_model.bin')

channelToWorkMap = {
    '__label__-00_announcements': True,
    '__label__-01_general': True,
    '__label__-07_community': False,
    '__label__-12_random-fun': False,
    '__label__unrestricted-chat': False
}

logFileName = 'NOTify.log'
isWorkHours = True

with open(logFileName, 'w') as logFile:
    logFile.write('')


def log(msg):
    print(msg)
    with open(logFileName, 'a') as log_file:
        log_file.write(msg + '\n')


freeTimeQueue = []
def load_model(model_path):
    # Load the model (Windows build of the fastText bindings)
    import fastText.FastText as ff
    classifier = ff.load_model(model_path)
    return classifier
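# Usage sketch for load_model() above; the .model path is a placeholder:
classifier = load_model("data/bootstarp/model/model1.model")
print(classifier.predict("分词 之后 的 文本", k=1))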
def __init__(self, model=MODEL_FILE):
    self.model = FastText.load_model(model)
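# Hypothetical usage of the wrapper this __init__ belongs to, assuming the
# enclosing class is named Classifier and MODEL_FILE names a trained
# supervised model (both assumptions for illustration):
clf = Classifier()
labels, probs = clf.model.predict("some input text", k=2)
print(list(zip(labels, probs)))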
sum_test = 0  # Count the number of lines in the segmented prediction set
with open(os.path.join(filename, "trainF.tsv"), 'r', encoding="utf-8") as f_test:
    for line_test in f_test:
        sum_test += 1
for d_count in range(0, sum_test):
    d.append({})

# Load the models and predict labels for the data file
for count in range(0, 80):  # Adjust according to the classifiers trained in the previous step
    prepare_list = globals()
    if __name__ == '__main__':
        for i in range(1, 6):  # Load five models at a time
            prepare_list['classisier' + str(i)] = ff.load_model(
                "data/bootstarp/model/model" + str(count * 5 + i) + ".model")
        line_nu = 0
        with open(os.path.join(filename, "trainF.tsv"), "r", encoding="utf-8") as f_test:
            for line in f_test:
                line = line.strip()
                label_scores = d[line_nu]
                for i in range(1, 6):  # Query all five loaded models
                    str_test = prepare_list['classisier' + str(i)].predict(
                        line, k=5, threshold=0.0)
                    for j in range(0, 5):
                        # print(str_test[0][j])
                        label_scores.setdefault(str_test[0][j], 0)
                        label_scores[str_test[0][j]] += str_test[1][j]
                d[line_nu] = label_scores
                line_nu += 1
# _*_coding:utf-8 _*_
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
import fastText.FastText as fasttext

# Load the model
model = fasttext.load_model('models/fasttext_train.model.bin')

labels_right = []
texts = []
labels_predict = []
with open("data/test/fastText_test.txt", encoding="utf-8") as fr:
    for line in fr:
        line = line.rstrip()
        label_right = line.split("\t")[1]
        labels_right.append(label_right)
        text = line.split("\t")[0]
        texts.append(text)
        label_predict = model.predict(text)
        labels_predict.append(label_predict[0])
        print("Text:")
        print(line)
        print("True label:")
        print(label_right)
        print("Predicted label:")
        print(label_predict[0])

predict_labels = []
for predict_label in labels_predict:
    predict_labels.append(predict_label[0])
# Author: 肖劲宇, 张红轩
# Purpose: intent analysis
# Date: 2019.6.26
import fastText.FastText as ff
import jieba

classifier = ff.load_model('data/try.model')

labels_right = []
texts = []
with open("data/test.txt", encoding="utf-8") as fr:
    for line in fr:
        line = line.rstrip()
        labels_right.append(line.split("\t")[1].replace("__label__", ""))
        texts.append(line.split("\t")[0])

# predict() on a list returns (label lists, probabilities); unwrap the top
# label of each text and strip the prefix so it matches labels_right
labels_predict = [e[0].replace("__label__", "") for e in classifier.predict(texts)[0]]
# print(labels_predict)

text_labels = list(set(labels_right))
text_predict_labels = list(set(labels_predict))
print(text_predict_labels)
print(text_labels)

A = dict.fromkeys(text_labels, 0)          # Correctly predicted count per class
B = dict.fromkeys(text_labels, 0)          # Count of each class in the test set
C = dict.fromkeys(text_predict_labels, 0)  # Count of each class in the predictions
for i in range(0, len(labels_right)):
    B[labels_right[i]] += 1
    C[labels_predict[i]] += 1
    if labels_right[i] == labels_predict[i]:
        A[labels_right[i]] += 1
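# The per-class counting above can be cross-checked with scikit-learn; a
# minimal sketch (scikit-learn is an extra dependency, not otherwise used
# in this script):
from sklearn.metrics import classification_report
print(classification_report(labels_right, labels_predict))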
#!/usr/bin/python
from fastText import FastText

model = FastText.load_model('./model/final_ns.bin')


def validate_keyword(key):
    # get_subwords returns a (subwords, indices) pair per word; collect them all
    tup = ()
    words = key.split(' ')
    for word in words:
        tup = tup + model.get_subwords(word)
    # Keep only the non-empty subword lists (removing items from a list while
    # iterating over it skips elements, so filter instead)
    subword_lists = [l for l in tup if isinstance(l, list) and len(l) > 0]
    keyword = ''
    for l in subword_lists:
        keyword = keyword + ' ' + str(l[0])
    return len(keyword) != 0


def predict_label(key):