def train():
    """Train a supervised fastText classifier, save it, and print test metrics.

    Relies on module-level ``train_path``, ``model_path`` and ``test_path``
    being defined elsewhere in the file.
    """
    classifier = ft.train_supervised(train_path)
    # save_model returns None; the original bound it to an unused `model` local.
    classifier.save_model(model_path)
    test = classifier.test(test_path)
    # NOTE(review): `.precision` / `.recall` attribute access implies an older
    # binding; the official fastText API returns an (N, p, r) tuple — confirm.
    print("准确率:", test.precision)
    # Bug fix: the label said "回归率" (regression rate); the value printed is
    # the recall, i.e. "召回率".
    print("召回率:", test.recall)
    classifier.get_labels()
def train_classifier(self):
    """Train the level-2 fastText classifier, evaluate it, and log the results.

    Reads training data from self.data_path, the test set from
    self.test_save_path, and appends metrics + timing to a result file.
    """
    t0 = time.time()
    # Hyper-parameters found by experiment: hierarchical softmax, bigrams.
    # (epoch=20 gave 0.91, epoch=50 gave 0.93 in earlier runs.)
    model = ff.train_supervised(
        self.data_path, lr=0.1, loss='hs', wordNgrams=2, epoch=300)
    model.save_model(
        self.model_save_path + 'level_2_fasttext_classifier_big_big.model')
    model.get_labels()  # inventory of trained labels

    outcome = model.test(self.test_save_path + 'test_big.txt')
    result_str = 'test precision:{}\n'.format(outcome)
    print(result_str)

    elapsed = round(time.time() - t0, 3)
    train_time_str = 'train and test model time %fs' % elapsed
    print(train_time_str)
    save_file(self.result_save_path + 'fasttext_result_big.txt',
              result_str + train_time_str + '\n', 'a')
def fastText_classifier(train_data,model_save_path):
    """Train the book sub-level-1 fastText classifier and save it.

    train_data: directory prefix for the training file (expected to end with
        a path separator, since it is concatenated directly).
    model_save_path: directory prefix for the saved model file.
    """
    classifier=ff.train_supervised(train_data+'book_sub_level_1_train.txt',lr=0.1,loss='hs',wordNgrams=2,epoch=300)
    model = classifier.save_model(model_save_path+'book_sub_classifier.model') # save the model (save_model returns None)
    classifier.get_labels() # list the trained labels
    # NOTE(review): this evaluates on the same file it trained on, so the
    # printed metrics measure training fit, not generalisation.
    result = classifier.test(train_data+'book_sub_level_1_train.txt')
    print(result)
# NOTE(review): stray triple-quote below appears to open a commented-out block
# that continues beyond this snippet; kept byte-for-byte.
'''
def trainFT(path: str, n=1):
    """Train a supervised fastText model on the labelled file at *path*.

    n: maximum word n-gram length (default 1 = unigrams only).
    Returns the trained classifier.
    """
    return FT.train_supervised(
        path,
        epoch=100,
        dim=100,
        wordNgrams=n,
        label='__label__',
        loss='softmax',
    )
def train_fasttext_win(inputPath='news_fasttext/news_fasttext_train.txt',
                       savePath='model.m', label='__label__'):
    """Return a fastText classifier, training it only if no saved model exists.

    inputPath: labelled training file.
    savePath: where the model is cached on disk.
    label: label prefix used in the training file.
    """
    if not os.path.exists(savePath):
        print('train model...')
        classifier = ff.train_supervised(inputPath, label=label)
        classifier.save_model(savePath)  # cache for later runs
    else:
        # Bug fix: the original loaded the hard-coded 'model.m' here, ignoring
        # savePath — any caller using a different path got the wrong model.
        classifier = ff.load_model(savePath)
        print('loaded model...')
    return classifier
def fast_text_model(X_test):
    """Classify text with fastText.

    Trains on 'train.txt', predicts labels and probabilities for X_test, and
    evaluates on 'test.txt'.

    Returns (label, prob, result) where result is the evaluation output
    (sample count, precision, recall).
    """
    clf = ff.train_supervised('train.txt', label='__label__')
    # Prediction returns the label(s) and the associated probability.
    label, prob = clf.predict(X_test)
    # Held-out evaluation.
    result = clf.test('test.txt')
    return label, prob, result
def test_fasttext(train_path, test_path, model_save_path):
    """Train on A_train.txt, persist the model, and print metrics for A_test.txt.

    All three arguments are directory prefixes (concatenated directly with the
    file names, so they should end with a path separator).
    """
    clf = ff.train_supervised(train_path + 'A_train.txt',
                              lr=0.1, loss='hs', wordNgrams=2, epoch=50)
    clf.save_model(model_save_path + 'A_train_classifier2.model')  # persist model
    clf.get_labels()  # inventory of trained labels
    print(clf.test(test_path + 'A_test.txt'))
def train_fasttext(inputPath='train.txt', savePath='./model.m', label='__label__'):
    """Return a fastText classifier, loading it from savePath when cached.

    Trains from inputPath and saves to savePath on the first run; subsequent
    runs just reload the saved model.
    """
    if os.path.exists(savePath):
        classfication = ft.load_model(savePath)
        print('load model...')
        return classfication
    print('train...')
    classfication = ft.train_supervised(inputPath, label=label)
    classfication.save_model(savePath)
    return classfication
def fasttext_model_train():
    """Grid-train fastText models over epoch=5..50 and wordNgrams=1..2.

    Each trained model is saved as Model/model_w<ngram>_e<epoch>.
    :return:
    """
    for epoch in range(5, 51):
        for ngram in range(1, 3):
            t0 = time.time()
            clf = ff.train_supervised("fasttext.train", epoch=epoch, lr=0.5,
                                      wordNgrams=ngram)
            print("ngram=%d,训练第%d轮,用时%s" % (ngram, epoch, time.time() - t0))
            clf.save_model("Model/model_w" + str(ngram) + "_e" + str(epoch))
def fastText_classifier(train_data, model_save_path, result_save_path):
    """Train the level-3 fastText classifier, save it, and record its score.

    All three arguments are directory prefixes (concatenated directly, so
    they are expected to end with a path separator).
    """
    classifier = ff.train_supervised(train_data + 'level_3_train.txt',
                                     lr=0.1,
                                     loss='hs',
                                     wordNgrams=2,
                                     epoch=150)
    model = classifier.save_model(model_save_path + 'level_3_classifier.model')  # save the model (save_model returns None)
    classifier.get_labels()  # list the trained labels
    # NOTE(review): this evaluates on the training file itself, so the score
    # reflects training fit rather than held-out performance.
    result = classifier.test(train_data + 'level_3_train.txt')
    print(result)
    with open(result_save_path + 'train3_results.txt', 'w') as fp:
        fp.write(str(result))
        fp.write('\n')
# NOTE(review): stray triple-quote below appears to open a commented-out block
# continuing beyond this snippet; kept byte-for-byte.
'''
def fastText_classifier(train_data,test_data,model_save_path,result_save_path): files = [] results = [] if not os.path.exists(train_data): os.makedirs(train_data) if not os.path.exists(model_save_path): os.makedirs(model_save_path) if not os.path.exists(result_save_path): os.makedirs(result_save_path) for level_one,test_l1 in zip(os.listdir(train_data),os.listdir(test_data)): print(level_one+'-->'+test_l1) if '.txt' in level_one and '.txt' in test_l1: classifier=ff.train_supervised(train_data+level_one,、lr=0.1,loss='hs',wordNgrams=2,epoch=100) #classifier = load_model.load_model(model_save_path+'train_level_1_classifier.model','nt') model = classifier.save_model(model_save_path+level_one+'_classifier.model') # 保存模型 classifier.get_labels() # 输出标签 result = classifier.test(test_data+test_l1) files.append(level_one) results.append(result) print(result) else: data_list = os.listdir(train_data+level_one+'/') test_list = os.listdir(test_data+test_l1+'/') if not len(data_list) or not len(test_data): continue classifier=ff.train_supervised(train_data+level_one+'/'+data_list[0],lr=0.1,loss='hs',wordNgrams=2,epoch=50) #classifier = load_model.load_model(model_save_path+'train_level_1_classifier.model','nt') model = classifier.save_model(model_save_path+level_one+'_classifier.model') # 保存模型 classifier.get_labels() # 输出标签 result = classifier.test(test_data+test_l1+'/'+test_list[0]) files.append(data_list[0]) results.append(result) print(result) print(files) print(results) with open(result_save_path+'train_results.txt','w') as fp: for i,j in zip(files,results): fp.write(str(i)+'-->'+str(j)) fp.write('\n') '''
def fasttext_train(self):
    """Train (or reload) a fastText model and log its test metrics.

    Expects self.home_data to be the data directory containing
    fasttext_train.txt / fasttext_test.txt; the trained model is cached there
    as fasttext.model.
    """
    # Bug fix: the original imported fastText only inside the "train" branch,
    # so the "model exists" branch referenced `ft` before any import; hoisted.
    import fastText.FastText as ft
    model_file = os.path.join(self.home_data, 'fasttext.model')
    if os.path.exists(model_file):
        print("Fasttext模型已经存在,直接载入")
        classifier = ft.load_model(model_file)
    else:
        print("Fasttext模型不存在,训练")
        # Bug fix: the original mixed os.path.join (for the existence check)
        # with bare string concatenation (for load/save/test), so the checked
        # path and the used path diverge when home_data lacks a trailing
        # separator.  Use os.path.join consistently.
        classifier = ft.train_supervised(
            os.path.join(self.home_data, "fasttext_train.txt"))  # train the model
        classifier.save_model(model_file)  # save the model
        classifier = ft.load_model(model_file)  # reload to verify the artifact
    result = classifier.test(os.path.join(self.home_data, "fasttext_test.txt"))  # test results
    labels = classifier.get_labels()  # trained labels
    print("测试实例数", result[0])  # number of examples
    print("准确率", result[1])  # overall precision
    print("召回率", result[2])  # recall
    logging.info('测试实例数 %s' % str(result[0]))
    logging.info('准确率 %s' % str(result[1]))
    logging.info('召回率 %s' % str(result[2]))
def train(self, vecSize, winSize, epochs, minCount, lossFunction,
          sampleThreshold, learnRate, ngrams, wordGrams, bucket):
    """Fit the supervised fastText model on self.trainingFile.

    lossFunction encodes both the loss and the negative-sample count:
    negative -> negative sampling with |lossFunction| negatives,
    zero -> plain softmax, positive -> hierarchical softmax.
    """
    if lossFunction < 0:
        loss_name, neg_count = "ns", -lossFunction
    elif lossFunction == 0:
        loss_name, neg_count = "softmax", 0
    else:
        loss_name, neg_count = "hs", 0
    self.model = ft.train_supervised(
        input=self.trainingFile,
        lr=learnRate,
        dim=vecSize,
        ws=winSize,
        epoch=epochs,
        minCount=minCount,
        loss=loss_name,
        neg=neg_count,
        t=sampleThreshold,
        minn=ngrams // 2,  # char n-grams from ngrams//2 up to ngrams
        maxn=ngrams,
        wordNgrams=wordGrams,
        bucket=bucket,
        verbose=0,
        thread=4,
    )
class TCFastText(object):
    # Class-body script: builds "__label__<y> <text>" training lines, runs the
    # first fold of an 8-fold split with fastText, prints the test result, and
    # stops after that single fold (note the trailing break).
    texts, labels = read("./data/cut_data.txt")
    labels = ["__label__" + str(y) for y in labels]
    data = np.array([" ".join(pair) for pair in zip(texts, labels)])

    kf = KFold(n_splits=8)
    for train_index, test_index in kf.split(data):
        print("Train:", train_index, "Test:", test_index)
        train = data[train_index]
        test = data[test_index]
        # Materialise the fold as fastText input files.
        with open("data/ft_train", "w") as f:
            f.write("\n".join(train))
        with open("data/ft_test", "w") as f:
            f.write("\n".join(test))
        # Hyper-parameters kept from the original's chosen configuration.
        ft = FastText.train_supervised("data/ft_train", dim=128, epoch=60,
                                       minCount=5, wordNgrams=3,
                                       label="__label__")
        result = ft.test("data/ft_test")
        ft.save_model("fastText")
        print(result)
        break  # only the first fold is evaluated
#十折交叉验证法检验最终模型的准确率 precision=[] for i in range(0,len(result)): f_train = open(os.path.join(filename, "original/train.tsv"), 'w', encoding="utf-8")#将训练集分成训练文件与测试文件 f_test = open(os.path.join(filename, "original/test.tsv"), 'w', encoding="utf-8")#此处自动生成 a=result[i] list_train=random.sample(list(set(resultList)-set(a)),(len(resultList)-len(a))) for x in list_train: f_train.write(linecache.getline(train_new,x)) for y in a: f_test.write(linecache.getline(train_new,y)) f_train.close() f_test.close() start=time.time() classifier = ff.train_supervised(os.path.join(filename, 'original/train.tsv'), dim=64, lr=0.7, wordNgrams=2, minCount=2,bucket=10000000,label = '__label__',thread = 20,epoch=7)#训练代码 model=classifier.save_model(os.path.join(filename,'original/model/model'+str(i+1)+'.model')) # 保存模型 test = classifier.test(os.path.join(filename, 'original/test.tsv'), k=1)#测试 end=time.time() precision.append((test[1],end-start)) print('模型预测准确率:', test[1]) print("训练时间为:",end-start) sum_precision=0 sum_time=0 for i,t in precision: sum_precision+=i sum_time+=t #用于删除本程序中生成的训练文件,测试文件,这两文件每次运行均会生成,最终删除 os.remove(os.path.join(filename,"original/train.tsv"))
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the word-segmented corpus and keep only the feature and target columns.
data = pd.read_csv("D:\\new_data\\new_data\\train_set.csv")
data = data.loc[:, ['word_seg', 'class']]
# Build the fastText target string, e.g. class 3 -> "label3"
# (x[1] is the 'class' column, accessed by position).
data['label'] = data.apply(lambda x: 'label' + str(x[1]), axis=1)
# 70/30 train/test split with a fixed seed for reproducibility.
x_train, x_test = train_test_split(data, test_size=0.3, random_state=42)
x_train.loc[:, ['word_seg', 'label']].to_csv(
    "D:\\new_data\\new_data\\train_set1.txt", index=False, header=None, sep='\t')
x_test.loc[:, ['word_seg', 'label']].to_csv(
    "D:\\new_data\\new_data\\test_set1.txt", index=False, header=None, sep='\t')
import fastText.FastText as ff
# Train with the matching "label" prefix and evaluate on the held-out 30%.
classifier = ff.train_supervised('D:\\new_data\\new_data\\train_set1.txt',
                                 label="label")
result = classifier.test("D:\\new_data\\new_data\\test_set1.txt")
# _*_coding:utf-8 _*_
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
import fastText.FastText as fasttext

# Train the supervised model on "<text>\t<label>" lines.
classifier = fasttext.train_supervised("data/train/fastText_train.txt",
                                       label="__label__")
# Persist the trained model.
classifier.save_model('models/fasttext_train.model.bin')

labels_right = []
texts = []
labels_predict = []
# Bug fix: the original called line.decode("utf-8") on a str (a Python 2
# idiom) which raises AttributeError under Python 3; open the file with an
# explicit encoding instead and drop the decode.
with open("data/train/fastText_train.txt", encoding="utf-8") as fr:
    for line in fr:
        line = line.rstrip()
        label_right = line.split("\t")[1]   # gold label (column 2)
        labels_right.append(label_right)
        text = line.split("\t")[0]          # raw text (column 1)
        texts.append(text)
        label_predict = classifier.predict(text)
        labels_predict.append(label_predict[0])
        print("文本")
        print(line)
        print("真实label")
        print(label_right)
        print("预测label")
        print(label_predict[0])
maxn # max length of char ngram [0] neg # number of negatives sampled [5] wordNgrams # max length of word ngram [1] loss # 损失函数 {ns, hs, softmax, ova} [softmax] bucket # number of buckets [2000000] thread # 线程数 [number of cpus] lrUpdateRate # 学习率更新速率 [100] t # sampling threshold [0.0001] label # 标签前缀 ['__label__'] verbose # verbose [2] pretrainedVectors # pretrained word vectors (.vec file) for supervised learning [] """ model = ft.train_supervised("train.txt", lr=1, dim=300, epoch=5, wordNgrams=4, loss='hs') model.save_model("model_file.bin") def print_results(N, p, r): print("N\t" + str(N)) # 预测错的例子 # P: 准确率 R: 召回率 print("P@{}\t{:.3f}".format(1, p)) print("R@{}\t{:.3f}".format(1, r)) print_results(*model.test('test.txt'))
import csv

# Convert the segmented Chinese TSV test set into fastText's
# "<text>\t_label_<target>" line format.
path = r"data\Chinese\Chinese raw data\seg_test.tsv"
with open(r'data\Chinese\Chinese fasttext data\seg_test', 'w', encoding='utf_8') as t, \
        open(path, 'r', encoding='utf_8') as f:
    for line in csv.reader(f, delimiter='\t'):
        target = line[2]
        content = line[1]
        t.write(content + '\t' + '_label_' + target + '\n')

# Train the classifier (note the single-underscore label prefix used above).
classifier = ff.train_supervised(
    r'data\Chinese\Chinese fasttext data\seg_train', label='_label_')
# Save, then reload the model to make sure the saved artifact is usable.
classifier.save_model(
    r'data\Chinese\Chinese fasttext data\fastText_model1')
classifier = ff.load_model(
    r'data\Chinese\Chinese fasttext data\fastText_model1')

# Evaluation setup: count the test lines.
correct = 0
total_count = 0
with open(r'data\Chinese\Chinese fasttext data\seg_test', 'r',
          encoding='utf_8') as t:
    lines = t.readlines()
    total_count = len(lines)
    print(total_count)
def fast_text_train(data_file, model_file, test_file):
    """Train a fastText classifier, save it, and return the test result.

    data_file: labelled training file.
    model_file: path where the trained model is saved.
    test_file: held-out evaluation file.
    Returns the result of classifier.test(test_file, 1).
    """
    classifier = ff.train_supervised(data_file)
    classifier.save_model(model_file)  # persist the trained model
    test = classifier.test(test_file, 1)  # evaluate at k=1
    # Cleanup: the original stashed test.precision into an unused local `b`
    # and left a debugger leftover (`a = 1`); return the result instead.
    # NOTE(review): `.precision` implies a pyfasttext-style result object —
    # the official fastText bindings return an (N, precision, recall) tuple;
    # confirm which binding `ff` is before relying on attribute access.
    return test
def train_model():
    """Train fastText sentiment models for epoch=5..50, compute per-label
    precision/recall/F1 plus macro/micro averages, and plot the curves.

    Reads 'fastText/train_data' and 'fastText/test_data' ("label , text"
    lines) and writes the plot to ./ngram1.png.
    """
    all_marco_precision = []
    all_marco_recall = []
    all_marco_f1 = []
    all_micro_precision = []
    all_micro_recall = []
    all_micro_f1 = []
    for epoch in range(5, 51):
        # Bug fix: start_time was set once before the loop, so the printed
        # "build time" was cumulative; reset per run so it is per model.
        start_time = time.time()
        classifier = ff.train_supervised("fastText/train_data", epoch=epoch, lr=0.5)
        classifier.save_model("fastText/model/train")
        print("模型构建时间:%s s" % str(time.time() - start_time))
        # fastText's built-in test() reports multi-label precision/recall,
        # which collapses to one number for single-label data — so the
        # per-class metrics are computed manually below.
        # Bug fix: read the test file once with a context manager (the
        # original opened it twice and leaked both handles).
        with open('fastText/test_data', "r", encoding="utf-8") as f:
            pairs = [line.strip().split(" , ") for line in f]
        correct_labels = [p[0] for p in pairs]
        texts = [p[1] for p in pairs]
        predict_labels = classifier.predict(texts)[0]
        evaluation_parameters = []
        labels = {"__label__-1": "消极", "__label__0": "中立", "__label__1": "积极"}
        for label, name in labels.items():
            # Bug fix: the TP/FP/FN counters were initialised once outside
            # this loop, so counts accumulated across labels and every label
            # after the first got wrong precision/recall.
            true_positive = 0
            false_positive = 0
            false_negative = 0
            evaluate_p = {}
            print("%s标签测试结果:" % name)
            evaluate_p["name"] = name
            evaluate_p["nexample"] = len(texts)
            # Renamed the inner index from `i` to avoid shadowing the epoch.
            for idx in range(len(texts)):
                if predict_labels[idx] == label and correct_labels[idx] == label:
                    true_positive += 1    # predicted this class, and correct
                elif predict_labels[idx] == label and correct_labels[idx] != label:
                    false_positive += 1   # predicted this class, but wrong
                elif predict_labels[idx] != label and correct_labels[idx] == label:
                    false_negative += 1   # missed an instance of this class
            evaluate_p["true_positive"] = true_positive
            evaluate_p["false_positive"] = false_positive
            evaluate_p["false_negative"] = false_negative
            # Per-label precision, recall, F1.
            precision = true_positive / (true_positive + false_positive)
            evaluate_p["precision"] = precision
            recall = true_positive / (true_positive + false_negative)
            evaluate_p["recall"] = recall
            f1 = 2 * precision * recall / (precision + recall)
            evaluate_p["f1"] = f1
            evaluation_parameters.append(evaluate_p)
            print("测试集大小:%d\t精确率:%f\t召回率:%f\tF_1:%f" %
                  (len(texts), precision, recall, f1))
        # Macro and micro averages over the three labels.
        sum_precision = 0
        sum_recall = 0
        sum_true_positive = 0
        sum_false_positive = 0
        sum_false_negative = 0
        for p in evaluation_parameters:
            sum_precision += p["precision"]
            sum_recall += p["recall"]
            sum_true_positive += p["true_positive"]
            sum_false_positive += p["false_positive"]
            sum_false_negative += p["false_negative"]
        n = len(evaluation_parameters)
        marco_precision = sum_precision / n
        all_marco_precision.append(marco_precision)
        marco_recall = sum_recall / n
        all_marco_recall.append(marco_recall)
        marco_f1 = 2 * marco_precision * marco_recall / (marco_precision + marco_recall)
        all_marco_f1.append(marco_f1)
        print("宏平均----测试集大小:%d\t精确率:%f\t召回率:%f\tF_1:%f" %
              (len(texts), marco_precision, marco_recall, marco_f1))
        # (Dividing the summed counts by n cancels in the ratios below; kept
        # for parity with the original computation.)
        micro_true_positive = sum_true_positive / n
        micro_false_positive = sum_false_positive / n
        micro_false_negative = sum_false_negative / n
        micro_precision = micro_true_positive / (micro_true_positive + micro_false_positive)
        all_micro_precision.append(micro_precision)
        micro_recall = micro_true_positive / (micro_true_positive + micro_false_negative)
        all_micro_recall.append(micro_recall)
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
        all_micro_f1.append(micro_f1)
        print("微平均----测试集大小:%d\t精确率:%f\t召回率:%f\tF_1:%f" %
              (len(texts), micro_precision, micro_recall, micro_f1))
    # Plot macro/micro P, R and F1 against the epoch count.
    names = [i for i in range(5, 51)]
    ax1 = plt.subplot(311)
    plt.plot(names, all_marco_precision, label='marco-P')
    plt.plot(names, all_micro_precision, label='micro-P')
    plt.legend(loc='upper left')
    ax2 = plt.subplot(312, sharey=ax1)
    # Bug fix: the recall curve was mislabelled 'marco-P' in the legend.
    plt.plot(names, all_marco_recall, label='marco-R')
    plt.plot(names, all_micro_recall, label='micro-R')
    plt.legend(loc='upper left')
    plt.subplot(313, sharey=ax1)
    plt.plot(names, all_marco_f1, label='marco-F1')
    plt.plot(names, all_micro_f1, label='micro-F1')
    plt.legend(loc='upper left')
    plt.xlabel(u"训练轮数(ngram=1)")
    plt.savefig('./ngram1.png')
    plt.show()
# NOTE(review): this chunk starts mid-function — the enclosing def / loop /
# `try:` are above the visible region, so the indentation below is
# reconstructed from the visible except clause.
        # Per-key recall, precision and F1 from the gold-count (B),
        # predicted-count (C) and correct-overlap (A) tallies.
        r = float(A[key]) / float(B[key])
        p = float(A[key]) / float(C[key])
        f = p * r * 2 / (p + r)
        logging.info("%s:\t p:%f\t r:%f\t f:%f" % (key, p, r, f))
    except:
        # NOTE(review): bare except hides ZeroDivisionError and KeyError
        # alike; also logging.error("error:", key, ...) treats the extra
        # args as %-format args for a string with no placeholders, so this
        # call mis-formats — consider narrowing and fixing the format.
        logging.error("error:", key, "right:", A.get(key, 0), "real:",
                      B.get(key, 0), "predict:", C.get(key, 0))


if __name__ == "__main__":
    # Train a metaphor-recognition classifier and evaluate on the validation
    # split; paths come from the project's data_path module.
    base_dir = data_path.metaphor_data_base_dir
    filename_train = 'metaphor_recognition.fasttext.train'
    filename_validation = 'metaphor_recognition.fasttext.validation'
    # base_dir = r'/home/liyuncong/program/fasttext/data/'
    # filename_train = 'news_fasttext_train.txt'
    # filename_validation = 'news_fasttext_test.txt'
    filename_model = 'metaphor_recognition.fasttext'
    train_data = os.path.join(base_dir, filename_train)
    valid_data = os.path.join(base_dir, filename_validation)
    # train_supervised uses the same arguments and defaults as the fastText cli
    model = ff.train_supervised(train_data, epoch=25, lr=1.0, wordNgrams=2)
    test_result = model.test(valid_data)
    print_results(*test_result)
    __predict(model, valid_data)
import fastText.FastText as ff
import jieba

# Supervised training on the labelled training file.
classifier = ff.train_supervised("data/train.txt")
# Persist the model to disk (save_model returns None, so `model` is None).
model = classifier.save_model('data/try.model')