Example No. 1
def run(client, clf, print_prf=False, print_main_proportion=False):
    attr_dict, label_dict, pullinfo_list_dict = load_data(
        ayonel_numerical_attr=ayonel_numerical_attr,
        ayonel_boolean_attr=ayonel_boolean_attr,
        ayonel_categorical_attr_handler=ayonel_categorical_attr_handler)
    ACC = 0.0
    for org, repo in org_list:
        input_X = attr_dict[org]
        input_y = label_dict[org]
        seg_point = int(len(input_X) * SEG_PROPORTION)

        train_X = np.array(input_X[:seg_point])
        train_y = np.array(input_y[:seg_point])
        # X_sparse = coo_matrix(train_X)
        #
        # train_X, X_sparse, train_y = shuffle(train_X, X_sparse, train_y, random_state=0)
        # train_X, train_y = AS().fit_sample(train_X, train_y)
        test_X = np.array(input_X[seg_point:])
        test_y = np.array(input_y[seg_point:])

        train(clf, train_X, train_y)
        accuracy = clf.score(test_X, test_y)
        ACC += accuracy
        predict_result = clf.predict(test_X).tolist()
        actual_result = test_y.tolist()
        precision, recall, F1 = precision_recall_f1(predict_result, actual_result)
        print(accuracy, end='')
        if print_prf:
            print(",%f,%f,%f" % (precision, recall, F1), end='')
        if print_main_proportion:
            main_proportion = predict_result.count(1) / len(predict_result)
            print(',%f' % (main_proportion if main_proportion > 0.5 else 1 - main_proportion), end='')
        print()
    print(ACC/len(org_list))
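All of these examples delegate fitting to a `train` helper that is not shown in the listing. A minimal sketch of what it presumably does, assuming a standard scikit-learn estimator (the project's actual helper may do more):

def train(clf, train_X, train_y):
    # Assumed implementation: fit the scikit-learn estimator on the
    # training features and labels; every example calls this before scoring.
    clf.fit(train_X, train_y)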
Example No. 2
def run(clf, print_prf=False, print_main_proportion=False):
    attr_dict, label_dict = load_data(gousios_attr_list=gousios_attr_list)
    ACC = 0.0
    for org, repo in org_list:
        input_X = attr_dict[org]
        input_y = label_dict[org]
        # print(sorted(Counter(input_y).items()))

        seg_point = int(len(input_X) * SEG_PROPORTION)

        train_X = np.array(input_X[:seg_point])
        train_y = np.array(input_y[:seg_point])

        train_X, train_y = SMOTE().fit_sample(train_X, train_y)

        test_X = np.array(input_X[seg_point:])
        test_y = np.array(input_y[seg_point:])

        train(clf, train_X, train_y)
        actual_result = test_y.tolist()
        predict_result = clf.predict(test_X).tolist()

        accuracy = clf.score(test_X, test_y)

        ACC += accuracy

        print(accuracy, end='')
        precision, recall, F1 = precision_recall_f1(predict_result, actual_result)
        if print_prf:
            print(",%f,%f,%f" % (precision, recall, F1), end='')

        if print_main_proportion:
            main_proportion = predict_result.count(1) / len(predict_result)
            print(',%f' % (main_proportion if main_proportion > 0.5 else 1 - main_proportion), end='')
        print()
    print(ACC / len(org_list))
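Note that `SMOTE().fit_sample(...)` is the older imbalanced-learn API; recent releases renamed the method to `fit_resample`, and later versions drop `fit_sample` entirely. A hedged equivalent for current versions of the library:

from imblearn.over_sampling import SMOTE

# fit_resample replaces fit_sample in newer imbalanced-learn releases;
# random_state is fixed here only to make the example reproducible.
train_X, train_y = SMOTE(random_state=0).fit_resample(train_X, train_y)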
Example No. 3
def run_monthly(clf, print_time=False, print_acc=False, print_prf=False, print_prf_each=False, print_main_proportion=False, print_AUC=False, MonthGAP=1):
    data_dict = load_data_monthly(gousios_attr_list=gousios_attr_list, MonthGAP=MonthGAP)
    for org, repo in org_list:
        outfile = open("../anova/" + repo + '.csv', "a", newline='')
        writer = csv.writer(outfile)

        train_cost_time = 0
        test_cost_time = 0
        total_start_time = time.time()
        print(repo, end='')
        batch_iter = data_dict[org]
        train_batch = batch_iter.__next__()
        train_X = np.array(train_batch[0])
        train_y = np.array(train_batch[1])
        predict_result = []
        actual_result = []
        predict_result_prob = []
        samples = 0
        for batch in batch_iter:
            if len(batch[0]) == 0:  # this test batch is empty; move on to the next one
                continue
            test_X = np.array(batch[0])
            test_y = np.array(batch[1])
            train_start_time = time.time()
            train(clf, train_X, train_y)
            train_cost_time += time.time() - train_start_time

            actual_result += test_y.tolist()  # actual labels
            test_start_time = time.time()
            predict_result += clf.predict(test_X).tolist()  # predicted labels
            test_cost_time += time.time() - test_start_time
            predict_result_prob += [x[0] for x in clf.predict_proba(test_X).tolist()]  # probability of the first class (label 0)
            samples += test_X.size


            train_X = np.concatenate((train_X, test_X))
            train_y = np.concatenate((train_y, test_y))
        total_cost_time = time.time() - total_start_time
        acc_num = 0
        for i in range(len(actual_result)):
            if actual_result[i] == predict_result[i]:
                acc_num += 1
                writer.writerow([0, 1])
            else:
                writer.writerow([0, 0])
        if print_acc:
            print(',%f' % (acc_num/len(actual_result)), end='')
        if print_AUC:
            y = np.array(actual_result)
            pred = np.array(predict_result_prob)
            fpr, tpr, thresholds = roc_curve(y, pred, pos_label=1)
            AUC = auc(fpr, tpr)
            print(',%f' % (AUC if AUC > 0.5 else 1-AUC), end='')
        precision, recall, F1 = precision_recall_f1(predict_result, actual_result)
        if print_prf:
            print(",%f,%f,%f" % (precision, recall, F1), end='')

        if print_prf_each:
            merged_precision, merged_recall, merged_F1 = precision_recall_f1(predict_result, actual_result, POSITIVE=0)
            rejected_precision, rejected_recall, rejected_F1 = precision_recall_f1(predict_result, actual_result, POSITIVE=1)
            print(',%f,%f,%f,%f,%f,%f' % (merged_F1, merged_precision, merged_recall, rejected_F1, rejected_precision, rejected_recall), end='')

        if print_main_proportion:
            main_proportion = predict_result.count(1) / len(predict_result)
            print(',%f' % (main_proportion if main_proportion > 0.5 else 1 - main_proportion), end='')

        if print_time:
            print(',%f,%f,%f' % (train_cost_time, test_cost_time, total_cost_time), end='')
        print()
        outfile.close()
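`precision_recall_f1` is another project helper the listing does not include. A hypothetical reconstruction of its behavior, with the `POSITIVE` keyword selecting which label is treated as the positive class (Example 5 suggests the project's version can raise `ValueError` on degenerate inputs; this sketch simply returns zeros instead):

def precision_recall_f1(predict_result, actual_result, POSITIVE=1):
    # Hypothetical reconstruction: precision, recall and F1 for the label
    # given by POSITIVE, computed over two parallel 0/1 lists.
    tp = sum(1 for p, a in zip(predict_result, actual_result) if p == POSITIVE and a == POSITIVE)
    fp = sum(1 for p, a in zip(predict_result, actual_result) if p == POSITIVE and a != POSITIVE)
    fn = sum(1 for p, a in zip(predict_result, actual_result) if p != POSITIVE and a == POSITIVE)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1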
Example No. 4
File: clf.py  Project: zjtjames/DLPP
def run_monthly(client,
                clf,
                print_time=True,
                over_sample=False,
                print_acc=False,
                print_prf_each=False,
                print_main_proportion=False,
                print_AUC=False,
                MonthGAP=1,
                persistence=False):
    data_dict, pullinfo_list_dict = load_data_monthly(
        ayonel_numerical_attr=ayonel_numerical_attr,
        ayonel_boolean_attr=ayonel_boolean_attr,
        ayonel_categorical_attr_handler=ayonel_categorical_attr_handler,
        MonthGAP=MonthGAP)

    for org, repo in org_list:

        train_cost_time = 0  # cumulative training time
        test_cost_time = 0  # cumulative prediction time
        total_start_time = time.time()  # start time for this project
        print(org, end='')

        pullinfo_list = pullinfo_list_dict[org]

        batch_iter = data_dict[org]  # iterator over monthly data batches

        train_batch = batch_iter.__next__()  # the first batch seeds the training set

        train_X = np.array(train_batch[0])
        train_y = np.array(train_batch[1])

        cursor = train_y.size  # cursor: index of the first PR to be predicted

        predict_result = []  # predicted labels (0/1)
        predict_result_prob = []  # predicted probabilities
        actual_result = []  # actual labels (0/1)
        mean_accuracy = 0  # running sum of per-batch accuracy
        round = 1  # batch counter

        for batch in batch_iter:
            if len(batch[0]) == 0:  # this test batch is empty; move on to the next one
                continue
            test_X = np.array(batch[0])
            test_y = np.array(batch[1])

            # over-sampling branch
            if over_sample:
                if train_y.tolist().count(0) <= 6 or train_y.tolist().count(
                        1) <= 6:
                    train(clf, train_X, train_y)
                else:
                    resample_train_X, resample_train_y = SMOTE(
                        ratio='auto',
                        random_state=RANDOM_SEED).fit_sample(train_X, train_y)
                    train(clf, resample_train_X, resample_train_y)
            else:  # train without over-sampling
                if train_y.sum() != 0 and train_y.sum() != train_y.size:
                    train_start_time = time.time()
                    train(clf, train_X, train_y)
                    train_cost_time += time.time() - train_start_time  # accumulate training time
                else:
                    train_X = np.concatenate((train_X, test_X))  # fold the test batch into the training set
                    train_y = np.concatenate((train_y, test_y))  # fold the test batch into the training set
                    continue

            actual_result += test_y.tolist()  # actual labels
            test_start_time = time.time()
            predict_result += clf.predict(test_X).tolist()  # predicted labels
            test_cost_time += time.time() - test_start_time  # accumulate prediction time

            predict_result_prob += [
                x[0] for x in clf.predict_proba(test_X).tolist()
            ]

            mean_accuracy += clf.score(test_X, test_y)

            train_X = np.concatenate((train_X, test_X))  # fold the test batch into the training set
            train_y = np.concatenate((train_y, test_y))  # fold the test batch into the training set
            round += 1

        total_cost_time = time.time() - total_start_time

        acc_num = 0  # number of correctly predicted PRs
        for i in range(len(actual_result)):
            if actual_result[i] == predict_result[i]:
                acc_num += 1
        # persist predictions to the database if requested
        if persistence:
            for i in range(len(predict_result)):
                number = int(pullinfo_list[cursor + i]['number'])
                data = {
                    'number': number,
                    'org': org,
                    'repo': repo,
                    'created_at':
                    float(pullinfo_list[cursor + i]['created_at']),
                    'closed_at': float(pullinfo_list[cursor + i]['closed_at']),
                    'title': pullinfo_list[cursor + i]['title'],
                    'submitted_by': pullinfo_list[cursor + i]['author'],
                    'merged': pullinfo_list[cursor + i]['merged'],
                    'predict_merged': True if predict_result[i] == 0 else False
                }

                client[persistence_db][persistence_col].insert(
                    data)  # this collection is consumed by the prototype system

        if print_acc:  # print accuracy if requested
            print(',%f' % (acc_num / len(actual_result)), end='')

        if print_AUC:  # print AUC if requested
            y = np.array(actual_result)
            pred = np.array(predict_result_prob)
            fpr, tpr, thresholds = roc_curve(y, pred)
            AUC = auc(fpr, tpr)
            print(',%f' % (AUC if AUC > 0.5 else 1 - AUC), end='')

        if print_prf_each:  # print precision/recall/F1 separately for the merged and rejected classes
            merged_precision, merged_recall, merged_F1 = precision_recall_f1(
                predict_result, actual_result, POSITIVE=0)
            rejected_precision, rejected_recall, rejected_F1 = precision_recall_f1(
                predict_result, actual_result, POSITIVE=1)
            print(',%f,%f,%f,%f,%f,%f' %
                  (merged_F1, merged_precision, merged_recall, rejected_F1,
                   rejected_precision, rejected_recall),
                  end='')

        if print_main_proportion:  # print the majority-class proportion if requested
            main_proportion = predict_result.count(1) / len(predict_result)
            print(',%f' % (main_proportion if main_proportion > 0.5 else 1 -
                           main_proportion),
                  end='')

        if print_time:  # print timing statistics if requested
            print(',%f,%f,%f' %
                  (train_cost_time, test_cost_time, total_cost_time),
                  end='')
        print()
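A possible driver for Example No. 4, assuming the module-level configuration used by the DLPP project (org_list, persistence_db, persistence_col, the ayonel_* attribute lists) is already defined; the classifier and connection details below are illustrative only:

from pymongo import MongoClient
from sklearn.ensemble import RandomForestClassifier

client = MongoClient('localhost', 27017)  # only used when persistence=True
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# Train and evaluate month by month with a 3-month gap between batches,
# printing accuracy, per-class P/R/F1 and timing for every repository.
run_monthly(client, clf,
            print_time=True,
            print_acc=True,
            print_prf_each=True,
            MonthGAP=3)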
Example No. 5
def run_monthly(client, clf, print_prf=False, print_prf_each=False, print_main_proportion=False, print_AUC=False, MonthGAP=1, persistence=False, ayonel_numerical_attr=None, ayonel_boolean_attr=None):

    data_dict, pullinfo_list_dict = load_data_monthly(
        ayonel_numerical_attr=ayonel_numerical_attr,
        ayonel_boolean_attr=ayonel_boolean_attr,
        ayonel_categorical_attr_handler=ayonel_categorical_attr_handler,
        MonthGAP=MonthGAP)
    accuracy = 0
    AUC = 0
    mean_merged_precision, mean_merged_recall, mean_merged_F1, \
    mean_rejected_precision, mean_rejected_recall, mean_rejected_F1 = 0, 0, 0, 0, 0, 0

    for org, repo in org_list:
        # print(org+",", end='')
        pullinfo_list = pullinfo_list_dict[org]
        batch_iter = data_dict[org]
        train_batch = batch_iter.__next__()
        train_X = np.array(train_batch[0])
        train_y = np.array(train_batch[1])
        cursor = train_y.size  # cursor: index of the first PR to be predicted
        predict_result = []
        predict_result_prob = []
        actual_result = []
        mean_accuracy = 0

        for batch in batch_iter:
            if len(batch[0]) == 0:  # this test batch is empty; move on to the next one
                continue
            test_X = np.array(batch[0])
            test_y = np.array(batch[1])
            # train without over-sampling
            train(clf, train_X, train_y)

            actual_result += test_y.tolist()  # actual labels
            predict_result += clf.predict(test_X).tolist()  # predicted labels
            predict_result_prob += [x[0] for x in clf.predict_proba(test_X).tolist()]  # probability of the first class (label 0)
            mean_accuracy += clf.score(test_X, test_y)
            train_X = np.concatenate((train_X, test_X))
            train_y = np.concatenate((train_y, test_y))

        acc_num = 0
        for i in range(len(actual_result)):
            if actual_result[i] == predict_result[i]:
                acc_num += 1
        # persist predictions to the database if requested
        if persistence:
            for i in range(len(predict_result)):
                number = int(pullinfo_list[cursor + i]['number'])
                data = {
                    'number':           number,
                    'org':              org,
                    'repo':             repo,
                    'created_at':       float(pullinfo_list[cursor + i]['created_at']),
                    'closed_at':        float(pullinfo_list[cursor + i]['closed_at']),
                    'title':            pullinfo_list[cursor + i]['title'],
                    'submitted_by':     pullinfo_list[cursor + i]['author'],
                    'merged':           pullinfo_list[cursor + i]['merged'],
                    'predict_merged':   True if predict_result[i] == 0 else False
                }

                client[persistence_db][persistence_col].insert(data)

        accuracy += acc_num / len(actual_result)

        if print_prf_each:
            try:
                merged_precision, merged_recall, merged_F1 = precision_recall_f1(predict_result, actual_result, POSITIVE=0)
                if mean_merged_precision == -1 or mean_merged_recall == -1 or mean_merged_F1 == -1:
                    pass
                else:
                    mean_merged_precision += merged_precision
                    mean_merged_recall += merged_recall
                    mean_merged_F1 += merged_F1
            except ValueError:
                mean_merged_precision, mean_merged_recall, mean_merged_F1 = -1, -1, -1


            try:
                rejected_precision, rejected_recall, rejected_F1 = precision_recall_f1(predict_result, actual_result, POSITIVE=1)
                if mean_rejected_precision == -1 or mean_rejected_recall == -1 or mean_rejected_F1 == -1:
                    pass
                else:
                    mean_rejected_precision += rejected_precision
                    mean_rejected_recall += rejected_recall
                    mean_rejected_F1 += rejected_F1
            except ValueError:
                mean_rejected_precision, mean_rejected_recall, mean_rejected_F1 = -1, -1, -1

            # print(',%f,%f,%f,%f,%f,%f' % (merged_F1, merged_precision, merged_recall, rejected_F1,rejected_precision, rejected_recall ), end='')

        if print_main_proportion:
            main_proportion = predict_result.count(1) / len(predict_result)
            print(',%f' % (main_proportion if main_proportion > 0.5 else 1 - main_proportion), end='')

        if print_AUC:
            y = np.array(actual_result)
            pred = np.array(predict_result_prob)
            fpr, tpr, thresholds = roc_curve(y, pred)
            this_AUC = auc(fpr, tpr)
            AUC += this_AUC if this_AUC > 0.5 else 1 - this_AUC
            # print(',%f' % (AUC if AUC > 0.5 else 1 - AUC), end='')
        # print()
    if mean_merged_precision == -1 or mean_merged_recall == -1 or mean_merged_F1 == -1 \
        or mean_rejected_precision == -1 or mean_rejected_recall == -1 or mean_rejected_F1 == -1:
        return accuracy/len(org_list), AUC/len(org_list), -1, -1, -1, -1, -1, -1
    else:
        return accuracy / len(org_list), AUC / len(org_list), \
               mean_merged_precision/len(org_list), \
               mean_merged_recall/len(org_list), \
               mean_merged_F1/len(org_list), \
               mean_rejected_precision/len(org_list), \
               mean_rejected_recall/len(org_list), \
               mean_rejected_F1/len(org_list)
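One detail worth noting in Examples No. 3 to 5: predict_result_prob collects x[0], i.e. the probability of the first entry in clf.classes_ (label 0), while roc_curve treats 1 as the positive label, so the raw AUC tends to fall below 0.5 and the code reports max(AUC, 1 - AUC). A hedged adjustment (not the project's code) that scores the positive class directly and avoids the flip:

# inside the batch loop: take the probability of label 1
# (assumes clf.classes_ == [0, 1], the usual sorted order)
predict_result_prob += [x[1] for x in clf.predict_proba(test_X).tolist()]

# after the loop: evaluate against label 1 as the positive class
fpr, tpr, thresholds = roc_curve(np.array(actual_result), np.array(predict_result_prob), pos_label=1)
AUC = auc(fpr, tpr)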