def lrTrain(lb=None, n=1000, fe='one-hot', descend=None):
    """Train a logistic-regression classifier on a sample drawn via `fE`.

    Args:
        lb: mapping from raw category letter ('b'/'t'/'e'/'m') to the
            numeric class label; defaults to {'b': 1, 't': 1, 'e': 1, 'm': -1}.
        n: number of training samples requested from fE.sample.
        fe: feature extraction method — 'one-hot', 'tf-idf' or 'word2vec'.
        descend: optional dimensionality reduction — 'pca', 'lda' or None.

    Side effects: dumps the fitted model to "lrTrainModel.m" and appends a
    training summary to log.txt. Returns None.
    """
    # Avoid the mutable-default-argument pitfall: build the dict per call.
    if lb is None:
        lb = {'b': 1, 't': 1, 'e': 1, 'm': -1}
    Y, X = fE.sample(n)
    y = [lb[s] for s in Y]
    # Feature extraction and optional reduction are mutually exclusive choices.
    if fe == 'one-hot':
        x = fE.oneHotGet(X)
    elif fe == 'tf-idf':
        x = fE.tfIdfGet(X)
    elif fe == 'word2vec':
        x = fE.word2vec(X)
    if descend == 'pca':
        x = fE.pcaGet(x)
    elif descend == 'lda':
        x = fE.ldaGet(x)
    # 'l1' penalty requires a compatible solver ('saga' supports it).
    clf = LogisticRegression(C=1, penalty='l1', tol=0.01,
                             class_weight='balanced', solver='saga')
    clf.fit(x, y)
    joblib.dump(clf, "lrTrainModel.m")
    print('lr训练, label转换为:', lb, '\n训练集规模:', n, '\n特征提取为:', fe, '\n降维方法:', descend)
    # `with` guarantees the log is closed even if a write raises.
    with open('log.txt', 'a', encoding='utf-8') as log:
        log.write('lr训练, label转换为: ' + str(lb) + '\n训练集规模: ' + str(n)
                  + '\n特征提取为: ' + str(fe) + '\n降维方法: ' + str(descend) + '\n')
        log.write('模型保存在: lrTrainModel.m中\n---------------------\n')
    return
def svmTrain(lb=None, n=1000, W=1, fe='one-hot', descend=None):
    """Train an SVM classifier on a sample drawn via `fE`.

    Args:
        lb: mapping from raw category letter ('b'/'t'/'e'/'m') to the
            numeric class label; defaults to {'b': 1, 't': 1, 'e': 1, 'm': -1}.
        n: number of training samples requested from fE.sample.
        W: weight value that is only printed/logged here.
           NOTE(review): W is never passed to svm.SVC — confirm whether it
           was meant to feed class_weight; left as-is to preserve behavior.
        fe: feature extraction method — 'one-hot', 'tf-idf' or 'word2vec'.
        descend: optional dimensionality reduction — 'pca', 'lda' or None.

    Side effects: dumps the fitted model to "svmTrainModel.m" and appends a
    training summary to log.txt. Returns None.
    """
    # Avoid the mutable-default-argument pitfall: build the dict per call.
    if lb is None:
        lb = {'b': 1, 't': 1, 'e': 1, 'm': -1}
    Y, X = fE.sample(n)
    y = [lb[s] for s in Y]
    # Feature extraction and optional reduction are mutually exclusive choices.
    if fe == 'one-hot':
        x = fE.oneHotGet(X)
    elif fe == 'tf-idf':
        x = fE.tfIdfGet(X)
    elif fe == 'word2vec':
        x = fE.word2vec(X)
    if descend == 'pca':
        x = fE.pcaGet(x)
    elif descend == 'lda':
        x = fE.ldaGet(x)
    clf = svm.SVC(class_weight='balanced')
    clf.fit(x, y)
    joblib.dump(clf, "svmTrainModel.m")
    print('svm训练, label转换为:', lb, '\n训练集规模:', n, '\n权重:', W, '\n特征提取为:', fe, '\n降维方法:', descend)
    # `with` guarantees the log is closed even if a write raises.
    with open('log.txt', 'a', encoding='utf-8') as log:
        log.write('svm训练, label转换为: ' + str(lb) + '\n训练集规模: ' + str(n)
                  + '\n权重: ' + str(W) + '\n特征提取为: ' + str(fe)
                  + '\n降维方法: ' + str(descend) + '\n')
        log.write('模型保存在: svmTrainModel.m中\n---------------------\n')
    return
def knnTrain(lb=None, n=1000, fe='one-hot', descend=None):
    """Train a nearest-centroid ("knn") classifier on a sample drawn via `fE`.

    BUG FIX: this function was also named `lrTrain`, silently shadowing the
    real logistic-regression trainer defined above. Every string inside it
    ('knn训练', "knnTrainModel.m") shows the intended name was `knnTrain`.

    Args:
        lb: mapping from raw category letter ('b'/'t'/'e'/'m') to the
            numeric class label; defaults to {'b': 1, 't': 1, 'e': 1, 'm': -1}.
        n: number of training samples requested from fE.sample.
        fe: feature extraction method — 'one-hot', 'tf-idf' or 'word2vec'.
        descend: optional dimensionality reduction — 'pca', 'lda' or None.

    Side effects: dumps the fitted model to "knnTrainModel.m" and appends a
    training summary to log.txt. Returns None.
    """
    # Avoid the mutable-default-argument pitfall: build the dict per call.
    if lb is None:
        lb = {'b': 1, 't': 1, 'e': 1, 'm': -1}
    Y, X = fE.sample(n)
    y = [lb[s] for s in Y]
    # Feature extraction and optional reduction are mutually exclusive choices.
    if fe == 'one-hot':
        x = fE.oneHotGet(X)
    elif fe == 'tf-idf':
        x = fE.tfIdfGet(X)
    elif fe == 'word2vec':
        x = fE.word2vec(X)
    if descend == 'pca':
        x = fE.pcaGet(x)
    elif descend == 'lda':
        x = fE.ldaGet(x)
    clf = NearestCentroid()
    clf.fit(x, y)
    joblib.dump(clf, "knnTrainModel.m")
    print('knn训练, label转换为:', lb, '\n训练集规模:', n, '\n特征提取为:', fe, '\n降维方法:', descend)
    # `with` guarantees the log is closed even if a write raises.
    with open('log.txt', 'a', encoding='utf-8') as log:
        log.write('knn训练, label转换为: ' + str(lb) + '\n训练集规模: ' + str(n)
                  + '\n特征提取为: ' + str(fe) + '\n降维方法: ' + str(descend) + '\n')
        log.write('模型保存在: knnTrainModel.m中\n---------------------\n')
    return
def accuracy(lb=None, fe='one-hot', descend=None, count=3000):
    """Evaluate the saved knn model on a shuffled slice of the test file.

    Args:
        lb: mapping from raw category letter to numeric label; defaults to
            {'b': 1, 't': 1, 'e': 1, 'm': -1}.
        fe: feature extraction method — 'one-hot', 'tf-idf' or 'word2vec'.
        descend: optional dimensionality reduction — 'pca', 'lda' or None.
        count: number of test samples to evaluate (was hard-coded to 3000;
            now a backward-compatible parameter, capped at the dataset size).

    BUG FIXES vs. the original:
      * TP/TN/FP/FN started at 1 each, inflating every reported metric.
        They now start at 0, with explicit zero-division guards.
      * F1 was computed as 2*TP/(count + TP - TN), which equals the true
        F1 = 2*TP/(2*TP + FP + FN) only when the four counters sum to
        `count` — the off-by-one initialization broke that. The canonical
        formula is used directly now.

    Side effects: prints and appends the metrics to log.txt. Returns None.
    """
    if lb is None:
        lb = {'b': 1, 't': 1, 'e': 1, 'm': -1}
    clf = joblib.load("knnTrainModel.m")
    with open('label_segmentation_test.txt', 'r', encoding='utf-8') as f:
        dataSet = [line.split(',') for line in f]
    random.shuffle(dataSet)
    count = min(count, len(dataSet))  # robustness: don't index past the data
    X = []
    Y = []
    for i in range(count):
        Y.append(dataSet[i][0])
        X.append(dataSet[i][1].strip('\n'))
    y = [lb[s] for s in Y]
    # Feature extraction and optional reduction are mutually exclusive choices.
    if fe == 'one-hot':
        x = fE.oneHotGet(X)
    elif fe == 'tf-idf':
        x = fE.tfIdfGet(X)
    elif fe == 'word2vec':
        x = fE.word2vec(X)
    if descend == 'pca':
        x = fE.pcaGet(x)
    elif descend == 'lda':
        x = fE.ldaGet(x)
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for i in range(count):
        # word2vec output is assumed already array-like — TODO confirm in fE.
        if fe != 'word2vec':
            x[i] = np.array(x[i])
        res = clf.predict(x[i].reshape(1, -1))
        if res == y[i] and y[i] == 1:
            TP += 1
        if res == y[i] and y[i] == -1:
            TN += 1
        if res != y[i] and y[i] == 1:
            FN += 1
        if res != y[i] and y[i] == -1:
            FP += 1
        if i % 500 == 0:
            print(i)  # progress indicator for the slow per-sample loop
    # Guard every ratio against an empty denominator (e.g. no positives seen).
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * TP / (2 * TP + FP + FN) if (2 * TP + FP + FN) else 0.0
    print('knn测试\nlabel转换为:', lb, '\n特征提取为:', fe, '\n降维方法:', descend)
    print('TP:', TP, 'TN:', TN, 'FN:', FN, 'FP:', FP)
    print('总计:', count)
    print('精度:', TP + TN, ', ', (TP + TN) / count)
    print('查准率:', precision)
    print('查全率:', recall)
    print('F1:', F1)
    with open('log.txt', 'a', encoding='utf-8') as log:
        log.write('knn测试\nlabel转换为: ' + str(lb) + '\n特征提取为: ' + str(fe)
                  + '\n降维方法: ' + str(descend) + '\n')
        log.write('TP: ' + str(TP) + ', TN: ' + str(TN) + ', FN:' + str(FN)
                  + ', FP:' + str(FP) + '\n')
        log.write('总计: ' + str(count) + '\n')
        log.write('精度: ' + str(TP + TN) + ', ' + str((TP + TN) / count) + '\n')
        log.write('查准率: ' + str(precision) + '\n')
        log.write('查全率: ' + str(recall) + '\n')
        log.write('F1: ' + str(F1) + '\n-------------------\n')
    return