Example #1
import utils.array_utils as au

def evaluate(tokenizer, terror_cat, texts, cats, true_label):
    # true_label is the key of the target category; it was an undefined name
    # in the original snippet, so it is made an explicit parameter here.
    tokens_list = (tokenizer(text) for text in texts)
    labels, scores = list(), list()
    for i, doc in enumerate(terror_cat.pipe(tokens_list)):
        label = cats[i]['cats'][true_label]  # gold annotation for this text
        score = doc.cats[true_label]         # predicted score for the category
        labels.append(label)
        scores.append(score)
    au.precision_recall_threshold(labels, scores)
    print('\n')
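Every example in this listing funnels gold labels and predicted scores into au.precision_recall_threshold from the project's utils.array_utils module. Its implementation is not shown here; the following is only a hypothetical sketch, inferred from the call sites (labels, scores, an optional thres_range of cut-offs, and an optional file that receives a CSV report):

import csv

def precision_recall_threshold(labels, probs, file=None, thres_range=None):
    # Hypothetical reconstruction; the real utils.array_utils version may differ.
    if thres_range is None:
        thres_range = [i / 10 for i in range(1, 10)]
    rows = []
    for t in thres_range:
        pred = [int(p >= t) for p in probs]  # binarize scores at threshold t
        tp = sum(1 for y, yp in zip(labels, pred) if y == 1 and yp == 1)
        fp = sum(1 for y, yp in zip(labels, pred) if y == 0 and yp == 1)
        fn = sum(1 for y, yp in zip(labels, pred) if y == 1 and yp == 0)
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        rows.append((t, precision, recall))
        print('thres={:.2f}  precision={:.4f}  recall={:.4f}'.format(t, precision, recall))
    if file is not None:
        with open(file, 'w', newline='') as f:
            w = csv.writer(f)
            w.writerow(('threshold', 'precision', 'recall'))
            w.writerows(rows)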
Example #2
def test(self, test_file):
    # Read text-label pairs, vectorize, classify, then report precision/recall
    # over a range of thresholds and dump the raw results.
    textarr, labelarr = file2label_text_array(test_file)
    featurearr = self.textarr2featurearr(textarr)
    probarr = self.predict_proba(featurearr)
    au.precision_recall_threshold(
        labelarr, probarr, file="performance.csv",
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
    fu.dump_array("result.json", (labelarr, probarr))
Example #3
def test(self, test_file):
    textarr, labelarr = file2label_text_array(test_file)
    # Alternative featurization that goes through spaCy docs:
    # docarr = su.textarr_nlp(textarr, self.get_nlp())
    # featurearr = self.textarr2featurearr(textarr, docarr)
    featurearr = self.textarr2featurearr_no_gpe(textarr)
    probarr = self.predict_proba(featurearr)
    au.precision_recall_threshold(
        labelarr, probarr,
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
    fu.dump_array("result.json", (labelarr, probarr))
Example #4
def test(self, test_file):
    """
    Given a file containing labels and texts, read the text-label pairs,
    invoke the vectorization interface and the classifier, and evaluate
    the classifier's performance on the test set.
    :param test_file: str, path to the text file used for testing
    :return:
    """
    textarr, labelarr = file2label_text_array(test_file)
    # Alternative featurization that goes through spaCy docs:
    # docarr = su.textarr_nlp(textarr, self.get_nlp())
    # featurearr = self.textarr2featurearr(textarr, docarr)
    featurearr = self.textarr2featurearr_no_gpe(textarr)
    probarr = self.predict_proba(featurearr)
    au.precision_recall_threshold(
        labelarr, probarr,
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
    fu.dump_array("result.json", (labelarr, probarr))
Example #5
import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression

import utils.array_utils as au
import utils.function_utils as fu


def train_keyword_similarity(exec_train=True):
    np.random.seed(134590)
    data = np.load('./data/data_sim.npy')
    data_len = len(data)
    sep_idx = int(data_len * 0.8)
    # train, test = data[:sep_idx], data[sep_idx:]  # unshuffled split, superseded below
    rand_idx = au.shuffle([i for i in range(data_len)])
    train, test = data[rand_idx[:sep_idx]], data[rand_idx[sep_idx:]]
    train_x, train_y = train[:, :-1], train[:, -1]
    test_x, test_y = test[:, :-1], test[:, -1]
    print(data.shape, train.shape, test.shape)

    if exec_train:
        clf = LogisticRegression()
        clf.fit(train_x, train_y)
        joblib.dump(clf, './data/judge_merge_model')
    else:
        clf = joblib.load('./data/judge_merge_model')

    # print("coef:", clf.coef_.tolist())
    pred = clf.predict_proba(test_x)[:, 1]  # probability of the positive class
    au.precision_recall_threshold(test_y, pred, thres_range=[i / 10 for i in range(1, 10)])


def performance_analysis():
    label, proba = fu.load_array('label_proba')
    print(len(label), len(proba))
    au.precision_recall_threshold(label, proba)
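Example #5 hand-rolls its shuffled 80/20 split via au.shuffle. Assuming au.shuffle simply returns a randomly permuted copy of the index list, a minimal equivalent using scikit-learn would be (a fixed random_state keeps the split reproducible):

from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, random_state=134590)
train_x, train_y = train[:, :-1], train[:, -1]
test_x, test_y = test[:, :-1], test[:, -1]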
Example #6

# post_twarr = list()
# for idx in range(len(probarr)):
#     if probarr[idx] >= 0.35:
#         post_twarr.append(twarr[idx])
#     else:
#         print(twarr[idx][tk.key_text])
# post_twarr = [tw for idx, tw in enumerate(twarr) if probarr[idx] >= 0.4]
# post_total_len += len(post_twarr)
# print(len(post_twarr) / len(twarr), '\n\n\n')
tmu.check_time()
# pos_probarr / neg_probarr hold scores for the positive / negative samples
# produced earlier in the original script (elided from this snippet).
lblarr = [1 for _ in range(len(pos_probarr))] + [0 for _ in range(len(neg_probarr))]
prbarr = pos_probarr + neg_probarr
fu.dump_array("prb_lbl_arr.txt", (lblarr, prbarr))
lblarr, prbarr = fu.load_array("prb_lbl_arr.txt")
au.precision_recall_threshold(lblarr, prbarr)
# print('total portion = {} / {} = {}'.format(post_total_len, pre_total_len, post_total_len / pre_total_len))
tmu.check_time()
exit()
 
Example #7

sub_files = fi.listchildren('/home/nfs/cdong/tw/origin/', fi.TYPE_FILE, concat=True)[18:19]
twarr = au.merge_array([fu.load_array(file) for file in sub_files])
print(len(twarr))
tmu.check_time(print_func=None)
for idx, tw in enumerate(twarr[14000:15000]):
    if (idx + 1) % 1000 == 0:
        print(idx)
    try:
        my_filter.get_features(tw)
    except Exception:
        # print(tw[tk.key_text])
        pass  # skip tweets whose feature extraction fails
Example #8
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

import utils.array_utils as au
import utils.function_utils as fu


def figure(X, Y, fig_name):
    # plt.figure(figsize=(13, 7))
    plt.plot(X, Y, color="blue", linewidth=1, label="ROC")  # label so the legend is not empty
    plt.xlim([-0.03, 1.03])
    plt.ylim([-0.03, 1.03])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title("ROC curve")
    plt.legend(loc='lower right')
    plt.savefig(fig_name, format='png')


if __name__ == '__main__':
    labelarr, probarr = fu.load_array(
        "/home/nfs/cdong/tw/src/preprocess/filter/prb_lbl_arr.txt")
    fpr, tpr, thresholds = roc_curve(labelarr, probarr)
    figure(fpr, tpr, "/home/nfs/cdong/tw/src/preprocess/filter/roc_curve.png")
    au.precision_recall_threshold(
        labelarr, probarr,
        file="/home/nfs/cdong/tw/src/preprocess/filter/performance.csv",
        thres_range=[i / 100 for i in range(1, 10)] + [i / 20 for i in range(2, 20)])
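For comparison, scikit-learn ships precision_recall_curve, which sweeps every distinct score as a threshold rather than a hand-picked thres_range; a minimal equivalent for the arrays loaded above:

from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(labelarr, probarr)
for t, p, r in zip(thresholds, precision, recall):
    print('thres={:.2f}  precision={:.4f}  recall={:.4f}'.format(t, p, r))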