def run(client, clf, print_prf=False, print_main_proportion=False):
    """Train/evaluate *clf* on a fixed train/test split per project (ayonel attrs).

    For each (org, repo) in the module-level ``org_list``, splits that
    project's samples at ``SEG_PROPORTION``, trains ``clf`` on the first
    part and scores it on the rest, printing one CSV-ish line per project:
    accuracy[, precision, recall, F1][, main-class proportion].

    Args:
        client: unused here; kept for signature parity with the other runners.
        clf: a scikit-learn style estimator (``fit``/``predict``/``score``).
        print_prf: also print precision, recall and F1 per project.
        print_main_proportion: also print the majority predicted-class share.

    Finally prints the mean accuracy over all projects.
    """
    # NOTE: pullinfo_list_dict is unused in this runner but part of
    # load_data's return contract.
    attr_dict, label_dict, pullinfo_list_dict = load_data(
        ayonel_numerical_attr=ayonel_numerical_attr,
        ayonel_boolean_attr=ayonel_boolean_attr,
        ayonel_categorical_attr_handler=ayonel_categorical_attr_handler)
    ACC = 0.0
    for org, repo in org_list:
        input_X = attr_dict[org]
        input_y = label_dict[org]
        # Chronological split: first SEG_PROPORTION of samples train, rest test.
        seg_point = int(len(input_X) * SEG_PROPORTION)
        train_X = np.array(input_X[:seg_point])
        train_y = np.array(input_y[:seg_point])
        test_X = np.array(input_X[seg_point:])
        test_y = np.array(input_y[seg_point:])
        train(clf, train_X, train_y)
        accuracy = clf.score(test_X, test_y)
        ACC += accuracy
        predict_result = clf.predict(test_X).tolist()
        actual_result = test_y.tolist()
        precision, recall, F1 = precision_recall_f1(predict_result, actual_result)
        print(accuracy, end='')
        if print_prf:
            print(",%f,%f,%f" % (precision, recall, F1), end='')
        if print_main_proportion:
            # Share of the majority predicted class (always reported as >= 0.5).
            main_proportion = predict_result.count(1) / len(predict_result)
            print(',%f' % (main_proportion if main_proportion > 0.5
                           else 1 - main_proportion), end='')
        print()
    # Mean accuracy across all projects.
    print(ACC / len(org_list))
def run(clf, print_prf=False, print_main_proportion=False):
    """Per-project fixed-split evaluation on the gousios attribute set.

    Splits each project's data at ``SEG_PROPORTION``, SMOTE-oversamples the
    training part, fits ``clf``, and prints per-project accuracy (optionally
    precision/recall/F1 and the majority predicted-class share), then the
    mean accuracy over ``org_list``.
    """
    attr_dict, label_dict = load_data(gousios_attr_list=gousios_attr_list)
    ACC = 0.0
    for org, repo in org_list:
        features = attr_dict[org]
        labels = label_dict[org]
        split_at = int(len(features) * SEG_PROPORTION)
        fit_X, fit_y = np.array(features[:split_at]), np.array(labels[:split_at])
        # Balance the classes in the training slice before fitting.
        fit_X, fit_y = SMOTE().fit_sample(fit_X, fit_y)
        eval_X, eval_y = np.array(features[split_at:]), np.array(labels[split_at:])
        train(clf, fit_X, fit_y)
        truth = eval_y.tolist()
        guesses = clf.predict(eval_X).tolist()
        score = clf.score(eval_X, eval_y)
        ACC += score
        print(score, end='')
        precision, recall, F1 = precision_recall_f1(guesses, truth)
        if print_prf:
            print(",%f,%f,%f" % (precision, recall, F1), end='')
        if print_main_proportion:
            main_proportion = guesses.count(1) / len(guesses)
            print(',%f' % (main_proportion if main_proportion > 0.5
                           else 1 - main_proportion), end='')
        print()
    print(str(ACC / len(org_list)))
def run_monthly(clf, print_time=False, print_acc=False, print_prf=False,
                print_prf_each=False, print_main_proportion=False,
                print_AUC=False, MonthGAP=1):
    """Sliding-window (monthly-batch) evaluation on the gousios attribute set.

    For each project, consumes batches from ``load_data_monthly``: the first
    batch seeds the training set; each later batch is predicted, then folded
    into the training set (train-on-past, test-on-next scheme). Per-sample
    hit/miss rows are appended to ``../anova/<repo>.csv`` and the requested
    metrics are printed on one line per project.

    Args:
        clf: scikit-learn style estimator.
        print_time: print train/test/total wall-clock seconds.
        print_acc: print overall accuracy.
        print_prf: print precision, recall, F1 (positive class = default).
        print_prf_each: print P/R/F1 for both classes (merged=0, rejected=1).
        print_main_proportion: print majority predicted-class share.
        print_AUC: print ROC AUC (folded above 0.5).
        MonthGAP: batch width in months passed to the loader.
    """
    data_dict = load_data_monthly(gousios_attr_list=gousios_attr_list,
                                  MonthGAP=MonthGAP)
    for org, repo in org_list:
        # BUGFIX: the CSV handle was opened and never closed (leaked one
        # descriptor per project); a context manager guarantees closure.
        with open("../anova/" + repo + '.csv', "a", newline='') as outfile:
            writer = csv.writer(outfile)
            train_cost_time = 0
            test_cost_time = 0
            total_start_time = time.time()
            print(repo, end='')
            batch_iter = data_dict[org]
            train_batch = next(batch_iter)
            train_X = np.array(train_batch[0])
            train_y = np.array(train_batch[1])
            predict_result = []
            actual_result = []
            predict_result_prob = []
            samples = 0
            for batch in batch_iter:
                if len(batch[0]) == 0:  # empty test window: move to next batch
                    continue
                test_X = np.array(batch[0])
                test_y = np.array(batch[1])
                train_start_time = time.time()
                train(clf, train_X, train_y)
                train_cost_time += time.time() - train_start_time
                actual_result += test_y.tolist()  # ground truth
                test_start_time = time.time()
                predict_result += clf.predict(test_X).tolist()  # predictions
                test_cost_time += time.time() - test_start_time
                # Probability of class 0 for each test sample (used for AUC).
                predict_result_prob += [x[0] for x in
                                        clf.predict_proba(test_X).tolist()]
                samples += test_X.size
                # Fold the just-tested window into the training set.
                train_X = np.concatenate((train_X, test_X))
                train_y = np.concatenate((train_y, test_y))
            total_cost_time = time.time() - total_start_time
            acc_num = 0
            for i in range(len(actual_result)):
                if actual_result[i] == predict_result[i]:
                    acc_num += 1
                    writer.writerow([0, 1])
                else:
                    writer.writerow([0, 0])
        if print_acc:
            print(',%f' % (acc_num / len(actual_result)), end='')
        if print_AUC:
            y = np.array(actual_result)
            pred = np.array(predict_result_prob)
            fpr, tpr, thresholds = roc_curve(y, pred, pos_label=1)
            AUC = auc(fpr, tpr)
            # Fold AUC above 0.5 (scores are probabilities of class 0).
            print(',%f' % (AUC if AUC > 0.5 else 1 - AUC), end='')
        precision, recall, F1 = precision_recall_f1(predict_result, actual_result)
        if print_prf:
            print(",%f,%f,%f" % (precision, recall, F1), end='')
        if print_prf_each:
            merged_precision, merged_recall, merged_F1 = precision_recall_f1(
                predict_result, actual_result, POSITIVE=0)
            rejected_precision, rejected_recall, rejected_F1 = precision_recall_f1(
                predict_result, actual_result, POSITIVE=1)
            print(',%f,%f,%f,%f,%f,%f' % (merged_F1, merged_precision,
                                          merged_recall, rejected_F1,
                                          rejected_precision, rejected_recall),
                  end='')
        if print_main_proportion:
            main_proportion = predict_result.count(1) / len(predict_result)
            print(',%f' % (main_proportion if main_proportion > 0.5
                           else 1 - main_proportion), end='')
        if print_time:
            print(',%f,%f,%f' % (train_cost_time, test_cost_time,
                                 total_cost_time), end='')
        print()
def run_monthly(client, clf, print_time=True, over_sample=False,
                print_acc=False, print_prf_each=False,
                print_main_proportion=False, print_AUC=False, MonthGAP=1,
                persistence=False):
    """Sliding-window evaluation on the ayonel attribute set.

    The first monthly batch seeds the training set; each later batch is
    predicted and then merged into the training set. Optionally SMOTE
    oversampling is applied before each fit, predictions can be persisted to
    MongoDB via ``client``, and the requested per-project metrics are printed.
    """
    data_dict, pullinfo_list_dict = load_data_monthly(
        ayonel_numerical_attr=ayonel_numerical_attr,
        ayonel_boolean_attr=ayonel_boolean_attr,
        ayonel_categorical_attr_handler=ayonel_categorical_attr_handler,
        MonthGAP=MonthGAP)
    for org, repo in org_list:
        train_cost_time = 0   # cumulative training seconds
        test_cost_time = 0    # cumulative prediction seconds
        total_start_time = time.time()  # project wall-clock start
        print(org, end='')
        pullinfo_list = pullinfo_list_dict[org]
        batch_iter = data_dict[org]           # batch iterator for the project
        seed_batch = next(batch_iter)         # first batch = initial train set
        train_X = np.array(seed_batch[0])
        train_y = np.array(seed_batch[1])
        cursor = train_y.size  # index of the first PR that gets predicted
        predict_result = []        # 0/1 predictions
        predict_result_prob = []   # class-0 probabilities
        actual_result = []         # 0/1 ground truth
        mean_accuracy = 0          # accumulated per-batch accuracy
        batch_round = 1            # batch counter (renamed: was shadowing round())
        for batch in batch_iter:
            if len(batch[0]) == 0:
                # Empty test window: skip straight to the next batch.
                continue
            test_X = np.array(batch[0])
            test_y = np.array(batch[1])
            if over_sample:
                # SMOTE needs enough minority samples; fall back to a plain
                # fit when either class has too few.
                counts = train_y.tolist()
                if counts.count(0) <= 6 or counts.count(1) <= 6:
                    train(clf, train_X, train_y)
                else:
                    resample_train_X, resample_train_y = SMOTE(
                        ratio='auto',
                        random_state=RANDOM_SEED).fit_sample(train_X, train_y)
                    train(clf, resample_train_X, resample_train_y)
            else:
                total_pos = train_y.sum()
                if total_pos != 0 and total_pos != train_y.size:
                    train_start_time = time.time()
                    train(clf, train_X, train_y)
                    train_cost_time += time.time() - train_start_time
                else:
                    # Single-class training set: cannot fit — absorb this
                    # window into the training data and continue.
                    train_X = np.concatenate((train_X, test_X))
                    train_y = np.concatenate((train_y, test_y))
                    continue
            actual_result += test_y.tolist()
            test_start_time = time.time()
            predict_result += clf.predict(test_X).tolist()
            test_cost_time += time.time() - test_start_time
            predict_result_prob += [row[0] for row in
                                    clf.predict_proba(test_X).tolist()]
            mean_accuracy += clf.score(test_X, test_y)
            # Grow the training set with the window just evaluated.
            train_X = np.concatenate((train_X, test_X))
            train_y = np.concatenate((train_y, test_y))
            batch_round += 1
        total_cost_time = time.time() - total_start_time
        # Count correct predictions.
        acc_num = sum(1 for truth, guess in zip(actual_result, predict_result)
                      if truth == guess)
        if persistence:
            # Push each prediction into the prototype-system collection.
            for offset, guess in enumerate(predict_result):
                info = pullinfo_list[cursor + offset]
                data = {
                    'number': int(info['number']),
                    'org': org,
                    'repo': repo,
                    'created_at': float(info['created_at']),
                    'closed_at': float(info['closed_at']),
                    'title': info['title'],
                    'submitted_by': info['author'],
                    'merged': info['merged'],
                    'predict_merged': True if guess == 0 else False
                }
                client[persistence_db][persistence_col].insert(data)
        if print_acc:
            print(',%f' % (acc_num / len(actual_result)), end='')
        if print_AUC:
            fpr, tpr, thresholds = roc_curve(np.array(actual_result),
                                             np.array(predict_result_prob))
            AUC = auc(fpr, tpr)
            print(',%f' % (AUC if AUC > 0.5 else 1 - AUC), end='')
        if print_prf_each:
            # Per-class precision / recall / F1 (merged=0, rejected=1).
            merged_precision, merged_recall, merged_F1 = precision_recall_f1(
                predict_result, actual_result, POSITIVE=0)
            rejected_precision, rejected_recall, rejected_F1 = precision_recall_f1(
                predict_result, actual_result, POSITIVE=1)
            print(',%f,%f,%f,%f,%f,%f' % (merged_F1, merged_precision,
                                          merged_recall, rejected_F1,
                                          rejected_precision, rejected_recall),
                  end='')
        if print_main_proportion:
            main_proportion = predict_result.count(1) / len(predict_result)
            print(',%f' % (main_proportion if main_proportion > 0.5
                           else 1 - main_proportion), end='')
        if print_time:
            print(',%f,%f,%f' % (train_cost_time, test_cost_time,
                                 total_cost_time), end='')
        print()
def run_monthly(client, clf, print_prf=False, print_prf_each=False,
                print_main_proportion=False, print_AUC=False, MonthGAP=1,
                persistence=False, ayonel_numerical_attr=None,
                ayonel_boolean_attr=None):
    """Sliding-window evaluation returning mean metrics across all projects.

    Same train-on-past / test-on-next scheme as the other monthly runners,
    but instead of printing per-project lines it aggregates and returns:

        (mean accuracy, mean AUC,
         merged precision, merged recall, merged F1,
         rejected precision, rejected recall, rejected F1)

    each averaged over ``org_list``. If ``precision_recall_f1`` raised
    ``ValueError`` for any project, the six P/R/F1 slots are all -1.

    Args:
        client: MongoDB-style client used only when ``persistence`` is True.
        clf: scikit-learn style estimator.
        print_prf: unused here; kept for signature parity with other runners.
        print_prf_each: accumulate per-class P/R/F1 means.
        print_main_proportion: print majority predicted-class share.
        print_AUC: accumulate folded ROC AUC.
        MonthGAP: batch width in months passed to the loader.
        persistence: insert each prediction into the prototype collection.
        ayonel_numerical_attr / ayonel_boolean_attr: forwarded to the loader.
    """
    data_dict, pullinfo_list_dict = load_data_monthly(
        ayonel_numerical_attr=ayonel_numerical_attr,
        ayonel_boolean_attr=ayonel_boolean_attr,
        ayonel_categorical_attr_handler=ayonel_categorical_attr_handler,
        MonthGAP=MonthGAP)
    accuracy = 0
    AUC = 0
    mean_merged_precision, mean_merged_recall, mean_merged_F1, \
        mean_rejected_precision, mean_rejected_recall, mean_rejected_F1 = \
        0, 0, 0, 0, 0, 0
    for org, repo in org_list:
        pullinfo_list = pullinfo_list_dict[org]
        batch_iter = data_dict[org]
        train_batch = next(batch_iter)
        train_X = np.array(train_batch[0])
        train_y = np.array(train_batch[1])
        cursor = train_y.size  # index of the first PR that gets predicted
        predict_result = []
        predict_result_prob = []
        actual_result = []
        mean_accuracy = 0
        for batch in batch_iter:
            if len(batch[0]) == 0:  # empty test window: skip to next batch
                continue
            test_X = np.array(batch[0])
            test_y = np.array(batch[1])
            train(clf, train_X, train_y)
            actual_result += test_y.tolist()                  # ground truth
            predict_result += clf.predict(test_X).tolist()    # predictions
            predict_result_prob += [x[0] for x in
                                    clf.predict_proba(test_X).tolist()]
            mean_accuracy += clf.score(test_X, test_y)
            # Fold the tested window into the training set.
            train_X = np.concatenate((train_X, test_X))
            train_y = np.concatenate((train_y, test_y))
        acc_num = 0
        for i in range(len(actual_result)):
            if actual_result[i] == predict_result[i]:
                acc_num += 1
        if persistence:
            # Insert each prediction into the prototype-system collection.
            for i in range(len(predict_result)):
                number = int(pullinfo_list[cursor + i]['number'])
                data = {
                    'number': number,
                    'org': org,
                    'repo': repo,
                    'created_at': float(pullinfo_list[cursor + i]['created_at']),
                    'closed_at': float(pullinfo_list[cursor + i]['closed_at']),
                    'title': pullinfo_list[cursor + i]['title'],
                    'submitted_by': pullinfo_list[cursor + i]['author'],
                    'merged': pullinfo_list[cursor + i]['merged'],
                    'predict_merged': True if predict_result[i] == 0 else False
                }
                client[persistence_db][persistence_col].insert(data)
        accuracy += acc_num / len(actual_result)
        if print_prf_each:
            # -1 is a sticky sentinel: once any project fails, the mean for
            # that class is reported as -1.
            try:
                merged_precision, merged_recall, merged_F1 = precision_recall_f1(
                    predict_result, actual_result, POSITIVE=0)
                if mean_merged_precision == -1 or mean_merged_recall == -1 \
                        or mean_merged_F1 == -1:
                    pass
                else:
                    mean_merged_precision += merged_precision
                    mean_merged_recall += merged_recall
                    mean_merged_F1 += merged_F1
            except ValueError:
                mean_merged_precision, mean_merged_recall, mean_merged_F1 = \
                    -1, -1, -1
            try:
                rejected_precision, rejected_recall, rejected_F1 = \
                    precision_recall_f1(predict_result, actual_result, POSITIVE=1)
                if mean_rejected_precision == -1 or mean_rejected_recall == -1 \
                        or mean_rejected_F1 == -1:
                    pass
                else:
                    mean_rejected_precision += rejected_precision
                    mean_rejected_recall += rejected_recall
                    mean_rejected_F1 += rejected_F1
            except ValueError:
                mean_rejected_precision, mean_rejected_recall, mean_rejected_F1 = \
                    -1, -1, -1
        if print_main_proportion:
            main_proportion = predict_result.count(1) / len(predict_result)
            print(',%f' % (main_proportion if main_proportion > 0.5
                           else 1 - main_proportion), end='')
        if print_AUC:
            y = np.array(actual_result)
            pred = np.array(predict_result_prob)
            fpr, tpr, thresholds = roc_curve(y, pred)
            this_AUC = auc(fpr, tpr)
            # Fold below-chance AUC (scores are class-0 probabilities).
            AUC += this_AUC if this_AUC > 0.5 else 1 - this_AUC
    if mean_merged_precision == -1 or mean_merged_recall == -1 \
            or mean_merged_F1 == -1 or mean_rejected_precision == -1 \
            or mean_rejected_recall == -1 or mean_rejected_F1 == -1:
        return accuracy / len(org_list), AUC / len(org_list), \
            -1, -1, -1, -1, -1, -1
    else:
        # BUGFIX: the last element used to repeat mean_rejected_recall;
        # it must be mean_rejected_F1 (mirrors the merged_* triple).
        return accuracy / len(org_list), AUC / len(org_list), \
            mean_merged_precision / len(org_list), \
            mean_merged_recall / len(org_list), \
            mean_merged_F1 / len(org_list), \
            mean_rejected_precision / len(org_list), \
            mean_rejected_recall / len(org_list), \
            mean_rejected_F1 / len(org_list)