def evaluate(tokenizer, terror_cat, texts, cats, true_label):
    # true_label: name of the target category looked up in doc.cats
    # (it must be passed in; it is not defined anywhere in this scope).
    tokens_list = (tokenizer(text) for text in texts)
    labels, scores = list(), list()
    for i, doc in enumerate(terror_cat.pipe(tokens_list)):
        label = cats[i]['cats'][true_label]
        score = doc.cats[true_label]
        labels.append(label)
        scores.append(score)
    au.precision_recall_threshold(labels, scores)
    print('\n')

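# au.precision_recall_threshold is called throughout these snippets but never
# defined in them. Below is a minimal sketch of what it presumably does,
# judging from the call sites: sweep the given thresholds and report
# precision/recall at each, optionally writing a CSV. The default threshold
# range and the CSV column layout are assumptions, not the real utility.
def precision_recall_threshold(labels, scores, file=None, thres_range=None):
    from sklearn.metrics import precision_score, recall_score
    if thres_range is None:
        thres_range = [i / 10 for i in range(1, 10)]
    rows = []
    for thres in thres_range:
        # Binarize the scores at the current threshold.
        pred = [1 if s >= thres else 0 for s in scores]
        p = precision_score(labels, pred, zero_division=0)
        r = recall_score(labels, pred, zero_division=0)
        rows.append((thres, p, r))
        print('thres={:.2f} precision={:.4f} recall={:.4f}'.format(thres, p, r))
    if file is not None:
        with open(file, 'w') as fp:
            fp.write('threshold,precision,recall\n')
            for thres, p, r in rows:
                fp.write('{},{},{}\n'.format(thres, p, r))
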
def test(self, test_file):
    textarr, labelarr = file2label_text_array(test_file)
    featurearr = self.textarr2featurearr(textarr)
    probarr = self.predict_proba(featurearr)
    au.precision_recall_threshold(
        labelarr, probarr, file="performance.csv",
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
    fu.dump_array("result.json", (labelarr, probarr))

def test(self, test_file):
    textarr, labelarr = file2label_text_array(test_file)
    # docarr = su.textarr_nlp(textarr, self.get_nlp())
    # featurearr = self.textarr2featurearr(textarr, docarr)
    featurearr = self.textarr2featurearr_no_gpe(textarr)
    probarr = self.predict_proba(featurearr)
    au.precision_recall_threshold(
        labelarr, probarr,
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
    fu.dump_array("result.json", (labelarr, probarr))

def test(self, test_file): """ 给定带标记和文本的文件,读取其中的文本-标记对,调用向量化接口以及分类器,评估分类器在测试集上的性能 :param test_file: str,测试用文本文件的路径 :return: """ textarr, labelarr = file2label_text_array(test_file) """""" # docarr = su.textarr_nlp(textarr, self.get_nlp()) # featurearr = self.textarr2featurearr(textarr, docarr) featurearr = self.textarr2featurearr_no_gpe(textarr) """""" probarr = self.predict_proba(featurearr) au.precision_recall_threshold( labelarr, probarr, thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)]) fu.dump_array("result.json", (labelarr, probarr))
def train_keyword_similarity(exec_train=True):
    np.random.seed(134590)
    data = np.load('./data/data_sim.npy')
    data_len = len(data)
    sep_idx = int(data_len * 0.8)
    # train, test = data[:sep_idx], data[sep_idx:]
    # Shuffle row indices so the 80/20 split is random rather than positional.
    rand_idx = au.shuffle([i for i in range(data_len)])
    train, test = data[rand_idx[:sep_idx]], data[rand_idx[sep_idx:]]
    train_x, train_y = train[:, :-1], train[:, -1]
    test_x, test_y = test[:, :-1], test[:, -1]
    print(data.shape, train.shape, test.shape)
    if exec_train:
        clf = LogisticRegression()
        clf.fit(train_x, train_y)
        joblib.dump(clf, './data/judge_merge_model')
    else:
        clf = joblib.load('./data/judge_merge_model')
    # print("coef:", clf.coef_.tolist())
    pred = clf.predict_proba(test_x)[:, 1]
    au.precision_recall_threshold(test_y, pred, thres_range=[i / 10 for i in range(1, 10)])

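# Usage sketch for the model persisted above, assuming the same feature
# layout as data_sim.npy (every column but the last is a feature). The helper
# name is an assumption; only the model path comes from the training code.
def load_and_score(sample_x):
    clf = joblib.load('./data/judge_merge_model')
    # Probability of the positive ("merge") class for each sample row.
    return clf.predict_proba(sample_x)[:, 1]
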
def performance_analysis():
    label, proba = fu.load_array('label_proba')
    print(len(label), len(proba))
    au.precision_recall_threshold(label, proba)

# post_twarr = list()
# for idx in range(len(probarr)):
#     if probarr[idx] >= 0.35:
#         post_twarr.append(twarr[idx])
#     else:
#         print(twarr[idx][tk.key_text])
# post_twarr = [tw for idx, tw in enumerate(twarr) if probarr[idx] >= 0.4]
# post_total_len += len(post_twarr)
# print(len(post_twarr) / len(twarr), '\n\n\n')
tmu.check_time()
# Positive samples first, then negatives, matching the probability order.
lblarr = [1 for _ in range(len(pos_probarr))] + [0 for _ in range(len(neg_probarr))]
prbarr = pos_probarr + neg_probarr
fu.dump_array("prb_lbl_arr.txt", (lblarr, prbarr))
lblarr, prbarr = fu.load_array("prb_lbl_arr.txt")
au.precision_recall_threshold(lblarr, prbarr)
# print('total portion = {} / {} = {}'.format(post_total_len, pre_total_len, post_total_len / pre_total_len))
tmu.check_time()
exit()

sub_files = fi.listchildren('/home/nfs/cdong/tw/origin/', fi.TYPE_FILE, concat=True)[18:19]
twarr = au.merge_array([fu.load_array(file) for file in sub_files])
print(len(twarr))
tmu.check_time(print_func=None)
for idx, tw in enumerate(twarr[14000:15000]):
    if (idx + 1) % 1000 == 0:
        print(idx)
    try:
        my_filter.get_features(tw)
    except Exception:
        # print(tw[tk.key_text])
        pass

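# The commented-out block at the top of the snippet above filters tweets by a
# probability threshold; a minimal helper capturing that pattern. The function
# name is an assumption, and the 0.4 default comes from the original comment.
def filter_by_threshold(twarr, probarr, thres=0.4):
    # Keep only tweets whose classifier probability reaches the threshold.
    return [tw for idx, tw in enumerate(twarr) if probarr[idx] >= thres]
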
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
import utils.array_utils as au
import utils.function_utils as fu


def figure(X, Y, fig_name):
    # Draw a ROC curve and save it to fig_name as a PNG.
    # plt.figure(figsize=(13, 7))
    plt.plot(X, Y, color="blue", linewidth=1, label="ROC")
    plt.xlim([-0.03, 1.03])
    plt.ylim([-0.03, 1.03])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title("roc curve")
    plt.legend(loc='lower right')
    plt.savefig(fig_name, format='png')


if __name__ == '__main__':
    labelarr, probarr = fu.load_array(
        "/home/nfs/cdong/tw/src/preprocess/filter/prb_lbl_arr.txt")
    fpr, tpr, thresholds = roc_curve(labelarr, probarr)
    figure(fpr, tpr, "/home/nfs/cdong/tw/src/preprocess/filter/roc_curve.png")
    au.precision_recall_threshold(
        labelarr, probarr,
        file="/home/nfs/cdong/tw/src/preprocess/filter/performance.csv",
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
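
# A small companion sketch: summarize the ROC curve above with a single AUC
# number via sklearn's roc_auc_score. Calling it at the end of the __main__
# block is an assumption about where it would fit; the helper name is made up.
def report_auc(labelarr, probarr):
    from sklearn.metrics import roc_auc_score
    # Area under the ROC curve drawn by figure(); 1.0 means perfect ranking.
    print('AUC = {:.4f}'.format(roc_auc_score(labelarr, probarr)))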