def test(clfs=None):
    """Evaluate per-category classifiers on the test set and print the f-score.

    Args:
        clfs: Sequence of fitted classifiers, one per category column, each
            exposing ``predict_proba``. When ``None`` the function returns
            immediately without doing anything.

    Returns:
        None. Side effects: writes the prediction XML through
        ``rst.generate_xml`` and prints the f-score.
    """
    # Identity check: ``== None`` would be evaluated element-wise (and then
    # fail in ``if``) when an array-like is passed.
    if clfs is None:
        return
    print('begin test')

    # Raw test documents and their raw labels, e.g.
    # <Opinion target="food" category="FOOD#QUALITY" polarity="negative" from="4" to="8"/>
    load_test = rst.Load(testdata_path)
    test_docs = load_test.datas
    test_docs_labels = load_test.labels

    # The feature extractor is built from the training data.
    load_train = rst.Load(traindata_path)
    feature_loader = fp.LoadFeature(load_train.datas, load_train.labels)
    testX = feature_loader.get_all_feature(test_docs)
    # Raw test labels converted to matrix form.
    testY = raw_label_process(test_docs_labels)

    # Column ``col`` holds classifier ``col``'s probability of the positive class.
    predict_prob = np.zeros((len(test_docs), category_num))
    for col, clf in enumerate(clfs):
        predict_prob[:, col] = clf.predict_proba(testX)[:, 1]

    # A category is predicted for a document when its probability reaches the
    # threshold.
    above_threshold = predict_prob >= paras['threshold']

    # String labels per document; ``None`` when no category reaches the threshold.
    labels = []
    for row_mask in above_threshold:
        category_ids = np.where(row_mask)[0]
        if len(category_ids) == 0:
            labels.append(None)
            continue
        labels.append([categorys[category_id] for category_id in category_ids])

    # Write the predictions as XML so the external evaluation tool (A.jar,
    # per the original comment) can score them.
    rst.generate_xml(labels, input_path=testdata_path, output_path=output_path)

    # Also score with the in-house metric on the 0/1 prediction matrix.
    predicted = above_threshold.astype(int)
    fscore = get_fscore(testY, predicted)
    print('fscore:' + str(fscore))
def train():
    """Fit one MLP classifier per category on the training data.

    Returns:
        list: The fitted classifiers, in the order ``getdata`` yields the
        categories.
    """
    print('load train datas and labels')
    loader = rst.Load(traindata_path)
    docs = loader.datas
    doc_labels = loader.labels
    feature_loader = fp.LoadFeature(docs, doc_labels)

    print('begin train')
    feature_matrix = feature_loader.get_all_feature(docs)
    label_matrix = raw_label_process(doc_labels)

    classifiers = []
    for category, cat_features, cat_targets in getdata(feature_matrix, label_matrix):
        print('train the classfier:' + category)
        # Hyper-parameters are read from the module-level ``paras`` mapping.
        mlp_kwargs = {
            'solver': paras['solver'],
            'hidden_layer_sizes': paras['hidden_layer_sizes'],
            'alpha': paras['alpha'],
            'learning_rate_init': paras['learning_rate_init'],
            'random_state': paras['random_state'],
            'verbose': False,
        }
        model = MLPClassifier(**mlp_kwargs)
        model.fit(cat_features, cat_targets)
        classifiers.append(model)
    return classifiers
def model_selection2():
    """Grid-search MLP hyper-parameters and the decision threshold.

    Runs five hold-out rounds (80/20 split). In each round, for every
    parameter tuple ``(learning_rate_init, hidden_layer_sizes, solver, alpha,
    threshold)`` yielded by ``paras_generate``, one classifier per category is
    trained and scored with ``get_fscore`` on the held-out part. The mean
    f-score per parameter tuple is printed in ascending order.

    Returns:
        None. Results are printed.
    """
    load = rst.Load(traindata_path)
    docs = load.datas
    docs_labels = load.labels

    n_rounds = 5
    # key: comma-joined parameter tuple, value: mean f-score over rounds so far
    paras_fscore_map = {}

    for i in range(n_rounds):
        # Seed with the round index: a constant seed would reproduce the very
        # same split in every round and make the averaging meaningless.
        train_docs, eval_docs, train_docs_labels, eval_docs_labels = train_test_split(
            docs, docs_labels, test_size=0.2, random_state=i)

        # NOTE(review): other call sites pass the labels as a second argument
        # to LoadFeature; confirm the one-argument form is intended here.
        feature_loader = fp.LoadFeature(train_docs)
        trainX = feature_loader.get_all_feature(train_docs)
        trainY = raw_label_process(train_docs_labels)
        evalX = feature_loader.get_all_feature(eval_docs)
        evalY = raw_label_process(eval_docs_labels)

        for i1, i2, i3, i4, i5 in paras_generate():
            paras_str = str(i1) + ',' + str(i2) + ',' + str(i3) + ',' + str(i4) + "," + str(i5)
            print('begin' + paras_str)

            # One classifier per category. The loop uses its own names so the
            # full ``trainX`` is not overwritten for the next parameter tuple.
            clfs = []
            for category, category_X, y in getdata(trainX, trainY):
                clf = MLPClassifier(solver=i3,
                                    hidden_layer_sizes=i2,
                                    alpha=i4,
                                    learning_rate_init=i1,
                                    random_state=paras['random_state'])
                clf.fit(category_X, y)
                clfs.append(clf)

            predict_prob = np.zeros((len(eval_docs), category_num))
            for col, clf in enumerate(clfs):
                predict_prob[:, col] = clf.predict_proba(evalX)[:, 1]

            # Threshold the probabilities into a 0/1 prediction matrix.
            predicted = (predict_prob >= i5).astype(int)
            fscore = get_fscore(evalY, predicted)

            # Incremental mean over the i rounds already seen plus this one.
            previous_mean = paras_fscore_map.get(paras_str, 0.0)
            paras_fscore_map[paras_str] = (previous_mean * i + fscore) / (i + 1)

    # Ascending by mean f-score, so the best parameter set is printed last.
    for item in sorted(paras_fscore_map.items(), key=lambda kv: kv[1]):
        print(item)
def qselect(A, k):
    """Return the ``k`` items of ``A`` with the largest ``item[1]`` (quickselect).

    Args:
        A: Sequence of indexable items; ``item[1]`` is the comparison key.
        k: Number of items to select.

    Returns:
        ``A`` itself when it holds fewer than ``k`` items, an empty list when
        ``k <= 0``, otherwise a new list with exactly ``k`` items. The order
        of the returned items is not sorted. When several items tie on the
        key at the cut-off, which of them are kept is arbitrary.
    """
    if len(A) < k:
        return A
    if k <= 0:
        return []

    # Items already known to belong to the result (larger than everything
    # still under consideration).
    selected_tail = []
    candidates = A
    while True:
        pivot = candidates[-1]
        pivot_key = pivot[1]
        rest = candidates[:-1]
        right = [pivot] + [x for x in rest if x[1] >= pivot_key]
        rlen = len(right)

        if rlen == k:
            return right + selected_tail
        if rlen > k:
            # If nothing is strictly above the pivot, every item in ``right``
            # ties on the key; partitioning again can never shrink it, so any
            # ``k`` of them are a valid answer.
            if all(x[1] == pivot_key for x in right):
                return right[:k] + selected_tail
            candidates = right
        else:
            # All of ``right`` is selected; keep looking for the remaining
            # items among those strictly below the pivot.
            selected_tail = right + selected_tail
            candidates = [x for x in rest if x[1] < pivot_key]
            k -= rlen


if __name__ == '__main__':
    # docs = [
    #     'Judging from previous posts this used to be a good place, but not any longer.',
    #     'The food was lousy - too sweet or too salty and the portions tiny.',
    # ]
    # <Opinion target="place" category="RESTAURANT#GENERAL" polarity="negative" from="51" to="56"/>
    # <Opinion target="food" category="FOOD#QUALITY" polarity="negative" from="4" to="8"/>
    # raw_sent_labels = [
    #     [{'Opinion target':'good place'}],
    #     [{'Opinion target':'good food'}]
    # ]
    traindata_path = './restaurant2015/ABSA-15_Restaurants_Train_Final.xml'
    load = rst.Load(traindata_path)
    get_namelist(load.datas, load.labels)
def cross_validating():
    """Grid-search the two feature-loader parameters over five hold-out rounds.

    In each round the data is split 80/20. For every ``(i1, i2)`` pair yielded
    by ``gridsearch_threshold`` a feature loader is built with that pair, one
    MLP classifier per category is trained with the hyper-parameters in
    ``paras``, and the held-out part is scored with ``tn.get_fscore``. The
    mean f-score per pair is printed in ascending order.

    Returns:
        None. Results are printed.
    """
    load = rst.Load(traindata_path)
    docs = load.datas
    docs_labels = load.labels

    n_rounds = 5
    # key: "i1,i2" string, value: mean f-score over rounds so far
    paras_fscore_map = {}

    for i in range(n_rounds):
        # Seed with the round index: a constant seed would reproduce the very
        # same split in every round and make the averaging meaningless.
        train_docs, eval_docs, train_docs_labels, eval_docs_labels = train_test_split(
            docs, docs_labels, test_size=0.2, random_state=i)
        trainY = tn.raw_label_process(train_docs_labels)
        evalY = tn.raw_label_process(eval_docs_labels)

        for i1, i2 in gridsearch_threshold():
            # NOTE(review): i1 and i2 are passed through to LoadFeature as its
            # third and fourth arguments; their meaning is defined there.
            feature_loader = fp.LoadFeature(train_docs, train_docs_labels, i1, i2)
            trainX = feature_loader.get_all_feature(train_docs)
            print(trainX.shape)
            evalX = feature_loader.get_all_feature(eval_docs)

            paras_str = str(i1) + ',' + str(i2)
            print('begin' + paras_str)

            # One classifier per category; the loop uses its own names so the
            # full ``trainX`` is not rebound while iterating.
            clfs = []
            for category, category_X, y in tn.getdata(trainX, trainY):
                clf = MLPClassifier(
                    solver=paras['solver'],
                    hidden_layer_sizes=paras['hidden_layer_sizes'],
                    alpha=paras['alpha'],
                    learning_rate_init=paras['learning_rate_init'],
                    random_state=paras['random_state'],
                    verbose=False)
                clf.fit(category_X, y)
                clfs.append(clf)

            predict_prob = np.zeros((len(eval_docs), tn.category_num))
            for col, clf in enumerate(clfs):
                predict_prob[:, col] = clf.predict_proba(evalX)[:, 1]

            # Threshold the probabilities into a 0/1 prediction matrix.
            predicted = (predict_prob >= paras['threshold']).astype(int)
            fscore = tn.get_fscore(evalY, predicted)

            # Incremental mean over the i rounds already seen plus this one.
            previous_mean = paras_fscore_map.get(paras_str, 0.0)
            paras_fscore_map[paras_str] = (previous_mean * i + fscore) / (i + 1)

    # ``sorted`` returns a list of (key, value) pairs; iterate it directly
    # instead of rebinding the dict and calling ``.items()`` on a list.
    for item in sorted(paras_fscore_map.items(), key=lambda kv: kv[1]):
        print(item)