def testRF_AAlgorithms(projects, dates, filter_train=False, filter_test=False, error_analysis=True):
    """Run the RF_A recommender per project and date and export the metrics to Excel.

    The RF algorithm's features and inputs are not compatible with the ML
    pipeline, so it is driven from its own entry point.

    Args:
        projects: iterable of project names; one workbook is written per project.
        dates: iterable of date tuples handed to ``RF_ATrain.algorithmBody``;
            ``date[2]`` and ``date[3]`` are used as year and month when building
            the change-trigger file name.
        filter_train: forwarded to ``RF_ATrain.algorithmBody``.
        filter_test: forwarded to ``RF_ATrain.algorithmBody``.
        error_analysis: when truthy, ``DataProcessUtils.errorAnalysis`` is run
            for every date against the change-trigger answer list.
    """

    def _append_row(book, row):
        # Every row goes through a fresh ExcelHelper with the normal cell style.
        ExcelHelper().appendExcelRow(book, sheet, row, style=ExcelHelper.getNormalStyle())

    began_at = datetime.now()
    top_n = 5  # number of reviewers to recommend
    sheet = 'result'

    for project in projects:
        workbook = f'outputRF_A_{project}_{filter_train}_{filter_test}_{error_analysis}.xlsx'

        # Initialise the workbook.
        ExcelHelper().initExcelFile(fileName=workbook, sheetName=sheet, excel_key_list=['训练集', '测试集'])

        # Project header row.
        # NOTE(review): the header is appended twice, exactly as before --
        # confirm whether the second row is intentional.
        header = ["项目名称:", project]
        for _ in range(2):
            _append_row(workbook, header)

        # Cumulative metrics, one list per metric, in the order
        # topk, mrr, precision@k, recall@k, f-measure@k.
        metric_history = ([], [], [], [], [])

        # Cumulative error-analysis ratios, in the order returned by errorAnalysis:
        #   [0] PRs with a successfully recommended candidate
        #   [1] PRs whose recommended candidate hit but was filtered out
        #   [2] PRs with a wrongly recommended candidate
        #   [3] PRs where it is unknown whether the candidate is correct
        ratio_history = ([], [], [], [])
        error_analysis_datas = None

        for date in dates:
            recommendList, answerList, prList, convertDict, _train_size = RF_ATrain.algorithmBody(
                date, project, top_n, filter_train=filter_train, filter_test=filter_test)

            # Score the recommendation list.
            topk, mrr, precisionk, recallk, fmeasurek = DataProcessUtils.judgeRecommend(
                recommendList, answerList, top_n)
            for bucket, value in zip(metric_history, (topk, mrr, precisionk, recallk, fmeasurek)):
                bucket.append(value)

            per_date_ratios = None
            if error_analysis:
                year, month = date[2], date[3]
                trigger_file = projectConfig.getRF_ADataPath() + os.sep \
                    + f'RF_A_ALL_{project}_data_change_trigger_{year}_{month}_to_{year}_{month}.tsv'
                filtered_answers = DataProcessUtils.getAnswerListFromChangeTriggerData(
                    project, date, prList, convertDict, trigger_file, 'review_user_login', 'pr_number')

                pos_hit, neg_hit, pos_miss, neg_miss = DataProcessUtils.errorAnalysis(
                    recommendList, answerList, filtered_answers, top_n)
                per_date_ratios = [pos_hit, neg_hit, pos_miss, neg_miss]
                for bucket, ratio in zip(ratio_history, per_date_ratios):
                    bucket.append(ratio)

                # The four-element list is always truthy, so the cumulative
                # view is refreshed whenever error analysis ran.
                error_analysis_datas = list(ratio_history)

            # Write this date's result row.
            DataProcessUtils.saveResult(workbook, sheet, topk, mrr, precisionk, recallk, fmeasurek,
                                        date, per_date_ratios)

            # Separator rows between dates.
            _append_row(workbook, [''])
            _append_row(workbook, ['训练集', '测试集'])

        print("cost time:", datetime.now() - began_at)

        # Visualise the recommendation errors.
        DataProcessUtils.recommendErrorAnalyzer2(error_analysis_datas, project,
                                                 f'RF_{filter_train}_{filter_test}')

        # Write the cumulative summary over all dates.
        DataProcessUtils.saveFinallyResult(workbook, sheet, *metric_history, error_analysis_datas)
def testAlgorithm(project, dates, filter_train=False, filter_test=False, error_analysis=False,
                  test_type=StringKeyUtils.STR_TEST_TYPE_SLIDE):
    """Run the information-retrieval (IR_AC) recommender for each date and export the metrics.

    Args:
        project: project name; used in the workbook name and data file names.
        dates: iterable of date tuples, one per test case. ``date[0]``/``date[1]``
            are read as the start year/month and ``date[2]``/``date[3]`` as the
            end year/month of the span. Per the original note, the last month
            of the span is the one used for testing.
        filter_train: forwarded to ``IR_ACTrain.algorithmBody``.
        filter_test: forwarded to ``IR_ACTrain.algorithmBody``.
        error_analysis: when truthy, ``DataProcessUtils.errorAnalysis`` is run
            for every date against the change-trigger answer list.
        test_type: ``StringKeyUtils.STR_TEST_TYPE_SLIDE`` reads the single
            change-trigger file of the end month;
            ``StringKeyUtils.STR_TEST_TYPE_INCREMENT`` reads one file per month
            of the whole span.
    """

    def _append_row(row):
        # Every row goes through a fresh ExcelHelper with the normal cell style.
        ExcelHelper().appendExcelRow(excelName, sheetName, row, style=ExcelHelper.getNormalStyle())

    def _trigger_file(data_dir, year, month):
        # Path of the one-month change-trigger dump for ``year``/``month``.
        return data_dir + os.sep + f'IR_AC_ALL_{project}_data_change_trigger_{year}_{month}_to_{year}_{month}.tsv'

    recommendNum = 5  # number of reviewers to recommend
    excelName = f'outputIR_AC_{project}_{filter_train}_{filter_test}_{error_analysis}.xlsx'
    sheetName = 'result'

    # Cumulative metrics over all dates.
    topks, mrrs, precisionks, recallks, fmeasureks = [], [], [], [], []

    # Cumulative error-analysis ratios, in the order returned by errorAnalysis:
    #   [0] PRs with a successfully recommended candidate
    #   [1] PRs whose recommended candidate hit but was filtered out
    #   [2] PRs with a wrongly recommended candidate
    #   [3] PRs where it is unknown whether the candidate is correct
    ratio_history = [[], [], [], []]
    error_analysis_datas = None

    # Initialise the workbook.
    ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])

    for date in dates:
        startTime = datetime.now()

        # Score the recommendation list.
        recommendList, answerList, prList, convertDict, _train_size = IR_ACTrain.algorithmBody(
            date, project, recommendNum,
            filter_train=filter_train, filter_test=filter_test, test_type=test_type)
        topk, mrr, precisionk, recallk, fmeasurek = DataProcessUtils.judgeRecommend(
            recommendList, answerList, recommendNum)

        topks.append(topk)
        mrrs.append(mrr)
        precisionks.append(precisionk)
        recallks.append(recallk)
        fmeasureks.append(fmeasurek)

        error_analysis_data = None
        if error_analysis:
            # NOTE(review): for any other test_type this stays None and is
            # passed to errorAnalysis unchanged, as before -- confirm intent.
            filter_answer_list = None
            data_dir = projectConfig.getIR_ACDataPath()

            if test_type == StringKeyUtils.STR_TEST_TYPE_SLIDE:
                filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerData(
                    project, date, prList, convertDict,
                    _trigger_file(data_dir, date[2], date[3]),
                    'review_user_login', 'pr_number')
            elif test_type == StringKeyUtils.STR_TEST_TYPE_INCREMENT:
                # The data is split per month, so stitch together one file per
                # month of the span. Months are encoded as year * 12 + month;
                # divmod on (index - 1) maps December to month 12 of the same
                # year without going through float division.
                first = date[0] * 12 + date[1]
                last = date[2] * 12 + date[3]
                fileList = []
                for index in range(first, last + 1):
                    year, month_zero_based = divmod(index - 1, 12)
                    fileList.append(_trigger_file(data_dir, year, month_zero_based + 1))
                filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerDataByIncrement(
                    project, prList, convertDict, fileList, 'review_user_login', 'pr_number')

            pos_hit, neg_hit, pos_miss, neg_miss = DataProcessUtils.errorAnalysis(
                recommendList, answerList, filter_answer_list, recommendNum)
            error_analysis_data = [pos_hit, neg_hit, pos_miss, neg_miss]
            for bucket, ratio in zip(ratio_history, error_analysis_data):
                bucket.append(ratio)
            error_analysis_datas = list(ratio_history)

        # Write this date's result row. The per-date error-analysis ratios are
        # passed along so the row matches what the other algorithm drivers
        # write (previously they were computed and then dropped here).
        DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek,
                                    date, error_analysis_data)

        # Separator rows between dates.
        _append_row([''])
        _append_row(['训练集', '测试集'])

        print("cost time:", datetime.now() - startTime)

    # Visualise the recommendation errors.
    DataProcessUtils.recommendErrorAnalyzer2(error_analysis_datas, project,
                                             f'IR_AC_{test_type}_{filter_train}_{filter_test}')

    # Write the cumulative summary over all dates.
    DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks,
                                       fmeasureks, error_analysis_datas)
def testCNAlgorithm(project, dates, filter_train=False, filter_test=False, is_split=False, error_analysis=False):
    """Run the CN recommender for each date and export per-community and overall metrics.

    ``filter_train``, ``filter_test`` and ``error_analysis`` were added on
    2020-08-07.

    Args:
        project: project name; used in the workbook name and data file names.
        dates: iterable of date tuples handed to ``CNTrain.algorithmBody``;
            ``date[2]`` and ``date[3]`` are used as year and month when building
            the change-trigger file name.
        filter_train: whether to train on change-trigger-filtered data
            (forwarded to ``CNTrain.algorithmBody``).
        filter_test: whether to validate on change-trigger-filtered data
            (forwarded to ``CNTrain.algorithmBody``).
        is_split: forwarded to ``CNTrain.algorithmBody``.
        error_analysis: whether to collect error statistics against the
            change-trigger-filtered answers.
    """

    def _append_row(row):
        # Every row goes through a fresh ExcelHelper with the normal cell style.
        ExcelHelper().appendExcelRow(excelName, sheetName, row, style=ExcelHelper.getNormalStyle())

    recommendNum = 5  # number of reviewers to recommend
    excelName = f'outputCN_{project}_{filter_train}_{filter_test}_{error_analysis}.xlsx'
    sheetName = 'result'

    # Cumulative metrics over all dates (taken from the 'whole' entry).
    topks, mrrs, precisionks, recallks, fmeasureks = [], [], [], [], []

    # Cumulative error-analysis ratios, in the order returned by errorAnalysis:
    #   [0] PRs with a successfully recommended candidate
    #   [1] PRs whose recommended candidate hit but was filtered out
    #   [2] PRs with a wrongly recommended candidate
    #   [3] PRs where it is unknown whether the candidate is correct
    ratio_history = [[], [], [], []]
    error_analysis_datas = None

    # Initialise the workbook.
    ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])

    for date in dates:
        CNTrain.clean()
        startTime = datetime.now()
        prList, convertDict, _train_size, communities_data = CNTrain.algorithmBody(
            date, project, recommendNum,
            filter_train=filter_train, filter_test=filter_test, is_split=is_split)

        # Score every community's recommendation list and store the metrics
        # back on the community entry.
        for _cid, c_data in sorted(communities_data.items(), key=lambda item: item[0]):
            topk, mrr, precisionk, recallk, fmeasurek = DataProcessUtils.judgeRecommend(
                c_data['recommend_list'], c_data['answer_list'], recommendNum)
            c_data['topk'] = topk
            c_data['mrr'] = mrr
            c_data['precisionk'] = precisionk
            c_data['recallk'] = recallk
            c_data['fmeasurek'] = fmeasurek

        whole = communities_data['whole']
        print("project: {0}, modularity: {1}, entropy: {2}, avg_variance: {3}".format(
            project, whole['modularity'], whole['entropy'], whole['avg_variance']))

        # Error analysis runs exactly once per date. It used to be executed
        # twice, which appended every ratio twice to the cumulative lists and
        # parsed the change-trigger file twice.
        error_analysis_data = None
        if error_analysis:
            year, month = date[2], date[3]
            filename = projectConfig.getCNDataPath() + os.sep \
                + f'CN_{project}_data_change_trigger_{year}_{month}_to_{year}_{month}.tsv'
            filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerData(
                project, date, prList, convertDict, filename, 'reviewer', 'pull_number')

            pos_hit, neg_hit, pos_miss, neg_miss = DataProcessUtils.errorAnalysis(
                whole['recommend_list'], whole['answer_list'], filter_answer_list, recommendNum)
            error_analysis_data = [pos_hit, neg_hit, pos_miss, neg_miss]
            for bucket, ratio in zip(ratio_history, error_analysis_data):
                bucket.append(ratio)
            error_analysis_datas = list(ratio_history)

        topks.append(whole['topk'])
        mrrs.append(whole['mrr'])
        precisionks.append(whole['precisionk'])
        recallks.append(whole['recallk'])
        fmeasureks.append(whole['fmeasurek'])

        # Write the per-community rows.
        DataProcessUtils.saveResult_Community_Version(excelName, sheetName, communities_data, date)

        # Write the overall row together with the error-analysis ratios. The
        # metrics are read from the 'whole' entry explicitly; previously this
        # call used whatever values the community loop happened to leave
        # behind, i.e. those of the last community in sort order.
        DataProcessUtils.saveResult(excelName, sheetName, whole['topk'], whole['mrr'],
                                    whole['precisionk'], whole['recallk'], whole['fmeasurek'],
                                    date, error_analysis_data)

        # Separator rows between dates.
        _append_row([''])
        _append_row(['训练集', '测试集'])

        print("cost time:", datetime.now() - startTime)

    # Visualise the recommendation errors.
    DataProcessUtils.recommendErrorAnalyzer2(error_analysis_datas, project,
                                             f'CN_{filter_train}_{filter_test}')

    # Write the cumulative summary over all dates.
    DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks,
                                       fmeasureks, error_analysis_datas)