Example #1
    def testMLAlgorithms(project, dates, algorithm):
        """
           测试算法接口,把流程相似的算法统一
           algorithm : svm, dt, rf
        """

        recommendNum = 5  # 推荐数量
        excelName = f'output{algorithm}.xlsx'
        sheetName = 'result'

        """初始化excel文件"""
        ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])

        for date in dates:
            startTime = datetime.now()

            """直接读取不带路径的信息"""
            filename = projectConfig.getRootPath() + os.sep + 'data' + os.sep + 'train' + os.sep + \
                       f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)
            print("raw df:", df.shape)

            # """读取带路径的文件信息"""
            # filename = projectConfig.getRootPath() + os.sep + r'data' + os.sep + 'train' + os.sep + \
            #            f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}_include_filepath.csv'
            # df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD,
            #                               sep=StringKeyUtils.STR_SPLIT_SEP_CSV)

            """df做预处理"""
            train_data, train_data_y, test_data, test_data_y = MLTrain.preProcessForSingleLabel(df, date, project,
                                                                                                isNOR=True)
            recommendList = None
            answerList = None
            """根据算法获得推荐列表"""
            if algorithm == StringKeyUtils.STR_ALGORITHM_SVM:  # 支持向量机
                recommendList, answerList = MLTrain.RecommendBySVM(train_data, train_data_y, test_data,
                                                                   test_data_y, recommendNum=recommendNum)
            elif algorithm == StringKeyUtils.STR_ALGORITHM_DT:  # decision tree
                recommendList, answerList = MLTrain.RecommendByDecisionTree(train_data, train_data_y, test_data,
                                                                            test_data_y, recommendNum=recommendNum)
            elif algorithm == StringKeyUtils.STR_ALGORITHM_RF:  # random forest
                recommendList, answerList = MLTrain.RecommendByRandomForest(train_data, train_data_y, test_data,
                                                                            test_data_y, recommendNum=recommendNum)

            """根据推荐列表做评价"""
            topk, mrr = DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

            """结果写入excel"""
            DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, date)

            """文件分割"""
            content = ['']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            content = ['训练集', '测试集']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())

            print("cost time:", datetime.now() - startTime)
Example #2
 def getCommentsForProject(self, owner, repo, limit=-1):
     
     ExcelHelper().initExcelFile(projectConfig.getTestoutputExcelPath(), projectConfig.TEST_OUT_PUT_SHEET_NAME)
     helper = ApiHelper(owner=owner, repo=repo)
     helper.setAuthorization(True)
     # get the number of pull requests of the project
     #requestNumber = helper.getTotalPullRequestNumberForProject()
     requestNumber = helper.getMaxSolvedPullRequestNumberForProject()
     
     print("total pull request number:",requestNumber)
     
     language = helper.getLanguageForProject()
     print("project language:",language)
     
     resNumber = requestNumber
     rr = 0
     usefulRequestNumber = 0
     commentNumber = 0
     while resNumber > 0:
         print("pull request:", resNumber, " now:", rr)
         comments = helper.getCommentsForPullRequest(resNumber)
         if len(comments) > 0:
             usefulRequestNumber = usefulRequestNumber + 1

         for comment in comments:
             self.writeRawCommmentIntoExcel(comment, repo, language)
             commentNumber = commentNumber + 1

         resNumber = resNumber - 1
         rr = rr + 1
         if limit > 0 and rr > limit:
             break
     
     self.commentDesensitization()
     print("useful pull request:",usefulRequestNumber,"  total comment:", commentNumber)
Example #3
    def testEARECAlgorithm(project, dates, filter_train=False, filter_test=False, a=0.5):
        """整合 训练数据"""
        recommendNum = 5  # 推荐数量
        excelName = f'outputEAREC_{project}_{filter_train}_{filter_test}.xls'
        sheetName = 'result'

        """计算累积数据"""
        topks = []
        mrrs = []
        precisionks = []
        recallks = []
        fmeasureks = []

        """初始化excel文件"""
        ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])
        for date in dates:
            startTime = datetime.now()
            recommendList, answerList, prList, convertDict, trainSize = EARECTrain.algorithmBody(date, project,
                                                                                                 recommendNum,
                                                                                                 filter_train=filter_train,
                                                                                                 filter_test=filter_test,
                                                                                                 a=a)
            """根据推荐列表做评价"""
            topk, mrr, precisionk, recallk, fmeasurek = \
                DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

            topks.append(topk)
            mrrs.append(mrr)
            precisionks.append(precisionk)
            recallks.append(recallk)
            fmeasureks.append(fmeasurek)

            """结果写入excel"""
            DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek, date)

            """文件分割"""
            content = ['']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            content = ['训练集', '测试集']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())

            print("cost time:", datetime.now() - startTime)

        """计算历史累积数据"""
        DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks,
                                           fmeasureks)
Example #4
    def testBayesAlgorithms(project, dates):  # input: test dates and the corresponding file sequence; output: performance of the whole algorithm

        recommendNum = 5  # number of recommendations
        excelName = 'outputNB.xlsx'
        sheetName = 'result'

        """初始化excel文件"""
        ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])

        for i in range(1, 4):  # Bayes has three model variants
            for date in dates:
                filename = projectConfig.getRootPath() + os.sep + 'data' + os.sep + 'train' + os.sep + \
                           f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
                df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)
                """df做预处理"""
                isNOR = True
                if i == 1 or i == 3:
                    isNOR = False  # no normalization for Bernoulli
                train_data, train_data_y, test_data, test_data_y = MLTrain.preProcessForSingleLabel(df, date, project,
                                                                                                    isNOR=isNOR)

                """根据算法获得推荐列表"""
                recommendList, answerList = MLTrain.RecommendByNativeBayes(train_data, train_data_y, test_data,
                                                                           test_data_y, recommendNum, i)

                """根据推荐列表做评价"""
                topk, mrr = DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

                """结果写入excel"""
                DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, date)

            """文件分割"""
            content = ['']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            content = ['训练集', '测试集']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
Example #5
 def writeRawCommmentIntoExcel(self, comment, repo, language):
     dataList = []
     dataList.append(repo)
     dataList.append(comment.path)
     dataList.append(language)
     dataList.append(comment.body)
     dataList.append(comment.userId)
     dataList.append(datetime.strptime(comment.createTime,ExcelHelper.STR_STYLE_DATA_DATE))
     dataList.append(comment.line)
     
     styleList = [ExcelHelper.getNormalStyle(), ExcelHelper.getNormalStyle(),ExcelHelper.getNormalStyle(),
                  ExcelHelper.getNormalStyle(), ExcelHelper.getNormalStyle(),ExcelHelper.getDateStyle(),
                  ExcelHelper.getNormalStyle()]
     
     ExcelHelper().appendExcelRowWithDiffStyle(projectConfig.getTestoutputExcelPath(), projectConfig.TEST_OUT_PUT_SHEET_NAME
                                               , dataList, styleList)
     print(dataList)
Example #6
    def testCAAlgorithm(project, dates):  # multiple cases; each tuple is the overall time span, with the last month used for testing
        """
           algorithm : based on user clustering
        """

        recommendNum = 5  # number of recommendations
        excelName = 'outputCA.xlsx'
        sheetName = 'result'

        """初始化excel文件"""
        ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])
        for date in dates:
            startTime = datetime.now()
            """根据推荐列表做评价"""

            CATrain.algorithmBody(date, project, recommendNum)
Example #7
    def testAlgorithm(project, dates, filter_train=False, filter_test=False, error_analysis=False,
                      test_type=StringKeyUtils.STR_TEST_TYPE_SLIDE):  # multiple cases; each tuple is the overall time span, with the last month used for testing
        """
           algorithm : information-retrieval based
        """

        recommendNum = 5  # number of recommendations
        excelName = f'outputIR_AC_{project}_{filter_train}_{filter_test}_{error_analysis}.xlsx'
        sheetName = 'result'

        """计算累积数据"""
        topks = []
        mrrs = []
        precisionks = []
        recallks = []
        fmeasureks = []
        recommend_positive_success_pr_ratios = []  # proportion of PRs with a successfully recommended reviewer
        recommend_positive_success_time_ratios = []  # proportion of recommendation slots (PR * candidates) with a successfully recommended reviewer
        recommend_negative_success_pr_ratios = []  # proportion of PRs where a recommended reviewer hits the answer but was filtered out
        recommend_negative_success_time_ratios = []  # proportion of recommendation slots where a recommended reviewer hits but was filtered out
        recommend_positive_fail_pr_ratios = []  # proportion of PRs with a wrongly recommended reviewer
        recommend_positive_fail_time_ratios = []  # proportion of recommendation slots with a wrong recommendation
        recommend_negative_fail_pr_ratios = []  # proportion of PRs where the correctness of a recommendation is unknown
        recommend_negative_fail_time_ratios = []  # proportion of recommendation slots where correctness is unknown
        error_analysis_datas = None

        """初始化excel文件"""
        ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])
        for date in dates:
            startTime = datetime.now()
            """根据推荐列表做评价"""

            recommendList, answerList, prList, convertDict, trainSize = IR_ACTrain.algorithmBody(date, project,
                                                                                                 recommendNum,
                                                                                                 filter_train=filter_train,
                                                                                                 filter_test=filter_test,
                                                                                                 test_type=test_type)

            topk, mrr, precisionk, recallk, fmeasurek = \
                DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

            topks.append(topk)
            mrrs.append(mrr)
            precisionks.append(precisionk)
            recallks.append(recallk)
            fmeasureks.append(fmeasurek)

            error_analysis_data = None
            filter_answer_list = None
            if error_analysis:
                if test_type == StringKeyUtils.STR_TEST_TYPE_SLIDE:
                    y = date[2]
                    m = date[3]
                    filename = projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                    filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerData(project, date, prList,
                                                                                             convertDict, filename,
                                                                                             'review_user_login',
                                                                                             'pr_number')
                elif test_type == StringKeyUtils.STR_TEST_TYPE_INCREMENT:
                    fileList = []
                    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the split data files together
                        y = int((i - i % 12) / 12)
                        m = i % 12
                        if m == 0:
                            m = 12
                            y = y - 1
                        fileList.append(
                            projectConfig.getIR_ACDataPath() + os.sep + f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv')

                    filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerDataByIncrement(project, prList,
                                                                                                        convertDict,
                                                                                                        fileList,
                                                                                                        'review_user_login',
                                                                                                        'pr_number')

                # recommend_positive_success_pr_ratio, recommend_positive_success_time_ratio, recommend_negative_success_pr_ratio, \
                # recommend_negative_success_time_ratio, recommend_positive_fail_pr_ratio, recommend_positive_fail_time_ratio, \
                # recommend_negative_fail_pr_ratio, recommend_negative_fail_time_ratio = DataProcessUtils.errorAnalysis(
                #     recommendList, answerList, filter_answer_list, recommendNum)
                # error_analysis_data = [recommend_positive_success_pr_ratio, recommend_positive_success_time_ratio,
                #                        recommend_negative_success_pr_ratio, recommend_negative_success_time_ratio,
                #                        recommend_positive_fail_pr_ratio, recommend_positive_fail_time_ratio,
                #                        recommend_negative_fail_pr_ratio, recommend_negative_fail_time_ratio]

                recommend_positive_success_pr_ratio, recommend_negative_success_pr_ratio, recommend_positive_fail_pr_ratio, \
                recommend_negative_fail_pr_ratio = DataProcessUtils.errorAnalysis(
                    recommendList, answerList, filter_answer_list, recommendNum)
                error_analysis_data = [recommend_positive_success_pr_ratio,
                                       recommend_negative_success_pr_ratio,
                                       recommend_positive_fail_pr_ratio,
                                       recommend_negative_fail_pr_ratio]

                # recommend_positive_success_pr_ratios.append(recommend_positive_success_pr_ratio)
                # recommend_positive_success_time_ratios.append(recommend_positive_success_time_ratio)
                # recommend_negative_success_pr_ratios.append(recommend_negative_success_pr_ratio)
                # recommend_negative_success_time_ratios.append(recommend_negative_success_time_ratio)
                # recommend_positive_fail_pr_ratios.append(recommend_positive_fail_pr_ratio)
                # recommend_positive_fail_time_ratios.append(recommend_positive_fail_time_ratio)
                # recommend_negative_fail_pr_ratios.append(recommend_negative_fail_pr_ratio)
                # recommend_negative_fail_time_ratios.append(recommend_negative_fail_time_ratio)

                recommend_positive_success_pr_ratios.append(recommend_positive_success_pr_ratio)
                recommend_negative_success_pr_ratios.append(recommend_negative_success_pr_ratio)
                recommend_positive_fail_pr_ratios.append(recommend_positive_fail_pr_ratio)
                recommend_negative_fail_pr_ratios.append(recommend_negative_fail_pr_ratio)

            if error_analysis_data:
                # error_analysis_datas = [recommend_positive_success_pr_ratios, recommend_positive_success_time_ratios,
                #                         recommend_negative_success_pr_ratios, recommend_negative_success_time_ratios,
                #                         recommend_positive_fail_pr_ratios, recommend_positive_fail_time_ratios,
                #                         recommend_negative_fail_pr_ratios, recommend_negative_fail_time_ratios]
                error_analysis_datas = [recommend_positive_success_pr_ratios,
                                        recommend_negative_success_pr_ratios,
                                        recommend_positive_fail_pr_ratios,
                                        recommend_negative_fail_pr_ratios]

            """结果写入excel"""
            DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek, date)

            """文件分割"""
            content = ['']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            content = ['训练集', '测试集']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            print("cost time:", datetime.now() - startTime)

        """推荐错误可视化"""
        DataProcessUtils.recommendErrorAnalyzer2(error_analysis_datas, project,
                                                 f'IR_AC_{test_type}_{filter_train}_{filter_test}')

        """计算历史累积数据"""
        DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks,
                                           fmeasureks, error_analysis_datas)
Example #8
    def testRF_AAlgorithms(projects, dates, filter_train=False, filter_test=False, error_analysis=True):
        """
           RF 算法由于特征和输入无法和ML兼容,单独开一个文件
        """
        startTime = datetime.now()

        for project in projects:
            excelName = f'outputRF_A_{project}_{filter_train}_{filter_test}_{error_analysis}.xlsx'
            recommendNum = 5  # number of recommendations
            sheetName = 'result'
            """初始化excel文件"""
            ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])
            """初始化项目抬头"""
            content = ["项目名称:", project]
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())

            """计算累积数据"""
            topks = []
            mrrs = []
            precisionks = []
            recallks = []
            fmeasureks = []
            recommend_positive_success_pr_ratios = []  # proportion of PRs with a successfully recommended reviewer
            recommend_positive_success_time_ratios = []  # proportion of recommendation slots (PR * candidates) with a successfully recommended reviewer
            recommend_negative_success_pr_ratios = []  # proportion of PRs where a recommended reviewer hits the answer but was filtered out
            recommend_negative_success_time_ratios = []  # proportion of recommendation slots where a recommended reviewer hits but was filtered out
            recommend_positive_fail_pr_ratios = []  # proportion of PRs with a wrongly recommended reviewer
            recommend_positive_fail_time_ratios = []  # proportion of recommendation slots with a wrong recommendation
            recommend_negative_fail_pr_ratios = []  # proportion of PRs where the correctness of a recommendation is unknown
            recommend_negative_fail_time_ratios = []  # proportion of recommendation slots where correctness is unknown
            error_analysis_datas = None

            for date in dates:
                recommendList, answerList, prList, convertDict, trainSize = RF_ATrain.algorithmBody(date, project,
                                                                                                   recommendNum,
                                                                                                   filter_train=filter_train,
                                                                                                   filter_test=filter_test)
                """根据推荐列表做评价"""
                topk, mrr, precisionk, recallk, fmeasurek = \
                    DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

                topks.append(topk)
                mrrs.append(mrr)
                precisionks.append(precisionk)
                recallks.append(recallk)
                fmeasureks.append(fmeasurek)

                error_analysis_data = None
                if error_analysis:
                    y = date[2]
                    m = date[3]
                    filename = projectConfig.getRF_ADataPath() + os.sep + f'RF_A_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                    filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerData(project, date,
                                                                                             prList,
                                                                                             convertDict, filename,
                                                                                             'review_user_login',
                                                                                             'pr_number')
                    # recommend_positive_success_pr_ratio, recommend_positive_success_time_ratio, recommend_negative_success_pr_ratio, \
                    # recommend_negative_success_time_ratio, recommend_positive_fail_pr_ratio, recommend_positive_fail_time_ratio, \
                    # recommend_negative_fail_pr_ratio, recommend_negative_fail_time_ratio = DataProcessUtils.errorAnalysis(
                    #     recommendList, answerList, filter_answer_list, recommendNum)
                    # error_analysis_data = [recommend_positive_success_pr_ratio, recommend_positive_success_time_ratio,
                    #                        recommend_negative_success_pr_ratio, recommend_negative_success_time_ratio,
                    #                        recommend_positive_fail_pr_ratio, recommend_positive_fail_time_ratio,
                    #                        recommend_negative_fail_pr_ratio, recommend_negative_fail_time_ratio]

                    recommend_positive_success_pr_ratio, recommend_negative_success_pr_ratio, recommend_positive_fail_pr_ratio, \
                    recommend_negative_fail_pr_ratio = DataProcessUtils.errorAnalysis(
                        recommendList, answerList, filter_answer_list, recommendNum)
                    error_analysis_data = [recommend_positive_success_pr_ratio,
                                           recommend_negative_success_pr_ratio,
                                           recommend_positive_fail_pr_ratio,
                                           recommend_negative_fail_pr_ratio]

                    # recommend_positive_success_pr_ratios.append(recommend_positive_success_pr_ratio)
                    # recommend_positive_success_time_ratios.append(recommend_positive_success_time_ratio)
                    # recommend_negative_success_pr_ratios.append(recommend_negative_success_pr_ratio)
                    # recommend_negative_success_time_ratios.append(recommend_negative_success_time_ratio)
                    # recommend_positive_fail_pr_ratios.append(recommend_positive_fail_pr_ratio)
                    # recommend_positive_fail_time_ratios.append(recommend_positive_fail_time_ratio)
                    # recommend_negative_fail_pr_ratios.append(recommend_negative_fail_pr_ratio)
                    # recommend_negative_fail_time_ratios.append(recommend_negative_fail_time_ratio)

                    recommend_positive_success_pr_ratios.append(recommend_positive_success_pr_ratio)
                    recommend_negative_success_pr_ratios.append(recommend_negative_success_pr_ratio)
                    recommend_positive_fail_pr_ratios.append(recommend_positive_fail_pr_ratio)
                    recommend_negative_fail_pr_ratios.append(recommend_negative_fail_pr_ratio)

                if error_analysis_data:
                    # error_analysis_datas = [recommend_positive_success_pr_ratios, recommend_positive_success_time_ratios,
                    #                         recommend_negative_success_pr_ratios, recommend_negative_success_time_ratios,
                    #                         recommend_positive_fail_pr_ratios, recommend_positive_fail_time_ratios,
                    #                         recommend_negative_fail_pr_ratios, recommend_negative_fail_time_ratios]
                    error_analysis_datas = [recommend_positive_success_pr_ratios,
                                            recommend_negative_success_pr_ratios,
                                            recommend_positive_fail_pr_ratios,
                                            recommend_negative_fail_pr_ratios]

                """结果写入excel"""
                DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek, date, error_analysis_data)

                """文件分割"""
                content = ['']
                ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
                content = ['训练集', '测试集']
                ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())

                print("cost time:", datetime.now() - startTime)
                """推荐错误可视化"""
                DataProcessUtils.recommendErrorAnalyzer2(error_analysis_datas, project, f'RF_{filter_train}_{filter_test}')

                """计算历史累积数据"""
                DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks, fmeasureks,
                                                   error_analysis_datas)
Example #9
def readOutPutFile(project, keys, cid='whole', filter_train=True, filter_test=True):
    """
    key表示具体要获取的数据
    """
    # 拼接output文件
    returns = []
    filename = f"outputCN_{project}_{filter_train}_{filter_test}.xls"
    for key in keys:
        if key == MODULARITY:
            modularitys = []
            row_idx = 2
            row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
            while row is not None and modularitys.__len__() < 12:
                if key in row:
                    next_row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx + 1)
                    if next_row[1] == cid:
                        modularitys.append(next_row[4])
                row_idx += 18
                row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
            returns.append(modularitys)
        if key == ENTROPY:
            entropys = []
            row_idx = 2
            row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
            while row is not None and entropys.__len__() < 12:
                if key in row:
                    next_row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx + 1)
                    if next_row[1] == cid:
                        entropys.append(next_row[5])
                row_idx += 18
                row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
            returns.append(entropys)
        if key == SIZE:
            sizes = []
            row_idx = 2
            row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
            while row is not None and sizes.__len__() < 12:
                if key in row:
                    next_row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx + 1)
                    if next_row[1] == cid:
                        sizes.append(next_row[2])
                row_idx += 18
                row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
            returns.append(sizes)
        if key == TOPK_ACC:
            topks = [[], [], []]
            row_idx = 4
            row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
            while row is not None and topks[0].__len__() < 12:
                if key in row:
                    next_row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx + 2)
                    topks[0].append(next_row[2])
                    topks[1].append(next_row[4])
                    topks[2].append(next_row[6])
                row_idx += 18
                row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
            returns.append(topks)
    return returns
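The four scan loops above share one structure: walk the result sheet in fixed 18-row blocks, find the block containing the key, then read one column of the row holding the values. A possible generalization, sketched under the assumption that the sheet keeps exactly that layout (this helper is not part of the original code):

    def readMetricColumn(filename, key, col_idx, cid='whole', start_row=2, stride=18, max_points=12, value_offset=1):
        """Sketch: collect one column per 18-row block, taking the value row that matches cid
        (pass cid=None to skip the community-id check)."""
        values = []
        row_idx = start_row
        row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
        while row is not None and len(values) < max_points:
            if key in row:
                value_row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx + value_offset)
                if cid is None or value_row[1] == cid:
                    values.append(value_row[col_idx])
            row_idx += stride
            row = ExcelHelper().readExcelRow(filename, "result", startRow=row_idx)
        return values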
Example #10
    def testCBAlgorithmsByMultipleLabels(projects, dates, algorithms):
        """
             algorithm : 混合算法,提供算法的排列组合
             项目 -> 日期 -> 算法排列组合
             每一个项目占一个文件位置  每一个算法组合占一页
          """
        recommendNum = 5  # 推荐数量
        for project in projects:
            excelName = f'outputCB_{project}.xlsx'
            sheetName = 'result'

            """初始化excel文件"""
            ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])

            """对不同时间做一个综合统计
               组合的int -> [[],[]....]
            """
            topks = {}
            mrrs = {}
            precisionks = {}
            recallks = {}
            fmeasureks = {}
            """初始化"""
            for i in range(1, 2 ** algorithms.__len__()):
                topks[i] = []
                mrrs[i] = []
                precisionks[i] = []
                recallks[i] = []
                fmeasureks[i] = []

            for date in dates:
                """获得不同算法的推荐列表,答案和pr列表"""
                """不同算法预处理可能会筛去一些pr  pr列表用于做统一"""
                prs = []
                recommendLists = []
                answerLists = []

                """计算不同人之前在训练集review的次数 作为后面综合统计的第二依据"""
                reviewerFreq = DataProcessUtils.getReviewerFrequencyDict(project, date)

                for algorithm in algorithms:
                    print(f"project:{project},  date:{date}, algorithm:{algorithm}")
                    """根据算法获得推荐列表"""
                    recommendList, answerList, prList, convertDict, trainSize = CBTrain.algorithmBody(date, project, algorithm,
                                                                                           recommendNum)
                    # print(recommendList)
                    print("trainSize:", trainSize)

                    """人名还原"""
                    recommendList, answerList = CBTrain.recoverName(recommendList, answerList, convertDict)

                    # print(recommendList)

                    prs.append(prList)
                    recommendLists.append(recommendList)
                    answerLists.append(answerList)

                """不同算法按照共有的pr 顺序调整"""
                prs, recommendLists, answerLists = CBTrain.normList(prs, recommendLists, answerLists)

                """貌似推荐是人名也可以做效果评估 暂时不转化"""
                # CBTrain.convertNameToNumber(recommendLists, answerLists)

                """对不同算法做排列组合"""
                for i in range(1, 2 ** algorithms.__len__()):
                    tempRecommendList = []
                    """不同算法测试的 answer列表相同,取一个即可"""
                    answer = answerLists[0]

                    involve = [0] * algorithms.__len__()
                    k = i
                    for j in range(0, algorithms.__len__()):
                        involve[algorithms.__len__() - j - 1] = k % 2
                        k = floor(k / 2)
                    """组合算法label为excel sheetName"""
                    label = ''
                    for j in range(0, algorithms.__len__()):
                        if involve[j] == 1:
                            if label != '':
                                label = label + '_'
                            label = label + algorithms[j]
                            tempRecommendList.append(recommendLists[j])
                    sheetName = label
                    ExcelHelper().addSheet(filename=excelName, sheetName=sheetName)
                    """波达计数 结合不同投票选出最终名单"""
                    finalRecommendList = []
                    for j in range(0, answer.__len__()):
                        recommendList = SortAlgorithmUtils.BordaCountSortWithFreq([x[j] for x in tempRecommendList],
                                                                                  reviewerFreq)
                        finalRecommendList.append(recommendList)

                    """评价指标"""
                    topk, mrr, precisionk, recallk, fmeasurek = \
                        DataProcessUtils.judgeRecommend(finalRecommendList, answer, recommendNum)

                    """结果写入excel"""
                    DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek, date)

                    """累积评价指标"""
                    topks[i].append(topk)
                    mrrs[i].append(mrr)
                    precisionks[i].append(precisionk)
                    recallks[i].append(recallk)
                    fmeasureks[i].append(fmeasurek)

            """对指标做综合评判"""
            for i in range(1, 2 ** algorithms.__len__()):
                involve = [0] * algorithms.__len__()
                k = i
                for j in range(0, algorithms.__len__()):
                    involve[algorithms.__len__() - j - 1] = k % 2
                    k = floor(k / 2)
                """组合算法label为excel sheetName"""
                label = ''
                for j in range(0, algorithms.__len__()):
                    if involve[j] == 1:
                        if label != '':
                            label = label + '_'
                        label = label + algorithms[j]
                sheetName = label
                DataProcessUtils.saveFinallyResult(excelName, sheetName, topks[i], mrrs[i], precisionks[i], recallks[i],
                                                   fmeasureks[i])
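The involve[] loop above is a bit-mask decoding: combination index i (1 .. 2**len(algorithms) - 1) selects the subset of algorithms whose bit is set. A small standalone sketch of that decoding step (not from the original source; the algorithm names in the comment are placeholders):

    from math import floor

    def subsetFromIndex(i, algorithms):
        """Decode a combination index into the algorithms it involves, mirroring the involve[] loop above."""
        involve = [0] * len(algorithms)
        k = i
        for j in range(0, len(algorithms)):
            involve[len(algorithms) - j - 1] = k % 2
            k = floor(k / 2)
        return [a for a, used in zip(algorithms, involve) if used == 1]

    # e.g. with algorithms = ['IR', 'CN', 'RF'], i = 5 (binary 101) selects ['IR', 'RF']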
Example #11
    def testCNAlgorithm(project, dates, filter_train=False, filter_test=False, is_split=False, error_analysis=False):
        """整合 训练数据"""
        """2020.8.7 新增参数 filter_data 和 error_analysis
           filter_train 判断是否使用 changetrigger过滤的训练数据
           filter_test 判断是否使用 changetrigger过滤的验证数据
           error_analysis 表示是否开启chang_trigger过滤答案的错误统计机制
        """
        recommendNum = 5  # 推荐数量
        excelName = f'outputCN_{project}_{filter_train}_{filter_test}_{error_analysis}.xlsx'
        sheetName = 'result'

        """计算累积数据"""
        topks = []
        mrrs = []
        precisionks = []
        recallks = []
        fmeasureks = []
        recommend_positive_success_pr_ratios = []  # proportion of PRs with a successfully recommended reviewer
        recommend_positive_success_time_ratios = []  # proportion of recommendation slots (PR * candidates) with a successfully recommended reviewer
        recommend_negative_success_pr_ratios = []  # proportion of PRs where a recommended reviewer hits the answer but was filtered out
        recommend_negative_success_time_ratios = []  # proportion of recommendation slots where a recommended reviewer hits but was filtered out
        recommend_positive_fail_pr_ratios = []  # proportion of PRs with a wrongly recommended reviewer
        recommend_positive_fail_time_ratios = []  # proportion of recommendation slots with a wrong recommendation
        recommend_negative_fail_pr_ratios = []  # proportion of PRs where the correctness of a recommendation is unknown
        recommend_negative_fail_time_ratios = []  # proportion of recommendation slots where correctness is unknown
        error_analysis_datas = None

        """初始化excel文件"""
        ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['训练集', '测试集'])
        for date in dates:
            CNTrain.clean()
            startTime = datetime.now()
            prList, convertDict, trainSize, communities_data = CNTrain.algorithmBody(date, project,
                                                                                              recommendNum,
                                                                                              filter_train=filter_train,
                                                                                              filter_test=filter_test,
                                                                                              is_split=is_split)

            communitiesTuple = sorted(communities_data.items(), key=lambda x: x[0])
            for cid, c_data in communitiesTuple:
                """根据推荐列表做评价"""
                topk, mrr, precisionk, recallk, fmeasurek = \
                    DataProcessUtils.judgeRecommend(c_data['recommend_list'], c_data['answer_list'], recommendNum)
                communities_data[cid]['topk'] = topk
                communities_data[cid]['mrr'] = mrr
                communities_data[cid]['precisionk'] = precisionk
                communities_data[cid]['recallk'] = recallk
                communities_data[cid]['fmeasurek'] = fmeasurek

            print("project: {0}, modularity: {1}, entropy: {2}, avg_variance: {3}".format(project,
                                                                       communities_data['whole']['modularity'],
                                                                       communities_data['whole']['entropy'],
                                                                       communities_data['whole']['avg_variance']))

            error_analysis_data = None
            if error_analysis:
                y = date[2]
                m = date[3]
                filename = projectConfig.getCNDataPath() + os.sep + f'CN_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerData(project, date, prList,
                                                                                         convertDict, filename,
                                                                                         'reviewer', 'pull_number')
                # recommend_positive_success_pr_ratio, recommend_positive_success_time_ratio, recommend_negative_success_pr_ratio, \
                # recommend_negative_success_time_ratio, recommend_positive_fail_pr_ratio, recommend_positive_fail_time_ratio, \
                # recommend_negative_fail_pr_ratio, recommend_negative_fail_time_ratio = DataProcessUtils.errorAnalysis(
                #     recommendList, answerList, filter_answer_list, recommendNum)
                # error_analysis_data = [recommend_positive_success_pr_ratio, recommend_positive_success_time_ratio,
                #                        recommend_negative_success_pr_ratio, recommend_negative_success_time_ratio,
                #                        recommend_positive_fail_pr_ratio, recommend_positive_fail_time_ratio,
                #                        recommend_negative_fail_pr_ratio, recommend_negative_fail_time_ratio]

                recommend_positive_success_pr_ratio, recommend_negative_success_pr_ratio, recommend_positive_fail_pr_ratio, \
                recommend_negative_fail_pr_ratio = DataProcessUtils.errorAnalysis(
                    communities_data['whole']['recommend_list'], communities_data['whole']['answer_list'], filter_answer_list, recommendNum)
                error_analysis_data = [recommend_positive_success_pr_ratio,
                                       recommend_negative_success_pr_ratio,
                                       recommend_positive_fail_pr_ratio,
                                       recommend_negative_fail_pr_ratio]

                # recommend_positive_success_pr_ratios.append(recommend_positive_success_pr_ratio)
                # recommend_positive_success_time_ratios.append(recommend_positive_success_time_ratio)
                # recommend_negative_success_pr_ratios.append(recommend_negative_success_pr_ratio)
                # recommend_negative_success_time_ratios.append(recommend_negative_success_time_ratio)
                # recommend_positive_fail_pr_ratios.append(recommend_positive_fail_pr_ratio)
                # recommend_positive_fail_time_ratios.append(recommend_positive_fail_time_ratio)
                # recommend_negative_fail_pr_ratios.append(recommend_negative_fail_pr_ratio)
                # recommend_negative_fail_time_ratios.append(recommend_negative_fail_time_ratio)

                recommend_positive_success_pr_ratios.append(recommend_positive_success_pr_ratio)
                recommend_negative_success_pr_ratios.append(recommend_negative_success_pr_ratio)
                recommend_positive_fail_pr_ratios.append(recommend_positive_fail_pr_ratio)
                recommend_negative_fail_pr_ratios.append(recommend_negative_fail_pr_ratio)

            if error_analysis_data:
                # error_analysis_datas = [recommend_positive_success_pr_ratios, recommend_positive_success_time_ratios,
                #                         recommend_negative_success_pr_ratios, recommend_negative_success_time_ratios,
                #                         recommend_positive_fail_pr_ratios, recommend_positive_fail_time_ratios,
                #                         recommend_negative_fail_pr_ratios, recommend_negative_fail_time_ratios]
                error_analysis_datas = [recommend_positive_success_pr_ratios,
                                        recommend_negative_success_pr_ratios,
                                        recommend_positive_fail_pr_ratios,
                                        recommend_negative_fail_pr_ratios]

            topks.append(communities_data['whole']['topk'])
            mrrs.append(communities_data['whole']['mrr'])
            precisionks.append(communities_data['whole']['precisionk'])
            recallks.append(communities_data['whole']['recallk'])
            fmeasureks.append(communities_data['whole']['fmeasurek'])

            """结果写入excel"""
            DataProcessUtils.saveResult_Community_Version(excelName, sheetName, communities_data, date)

            """结果写入excel"""
            DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek, date, error_analysis_data)

            """文件分割"""
            content = ['']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            content = ['训练集', '测试集']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())

            print("cost time:", datetime.now() - startTime)

        """推荐错误可视化"""
        DataProcessUtils.recommendErrorAnalyzer2(error_analysis_datas, project, f'CN_{filter_train}_{filter_test}')

        """计算历史累积数据"""
        DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks, fmeasureks, error_analysis_datas)
Example #12
            print(reviewer, temp_df.shape[0])
            if reviewer not in reviewer_list:
                tempDict = {"reviewer": reviewer}
                for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # stitch the split data files together
                    y = int((i - i % 12) / 12)
                    m = i % 12
                    if m == 0:
                        m = 12
                        y = y - 1

                    df = temp_df.loc[(temp_df['label_y'] == y) & (temp_df['label_m'] == m)].copy(deep=True)
                    sum = df.shape[0]
                    if sum == 0:
                        pass
                        # tempDict[f'{y}年{m}月'] = 0
                    else:
                        valid = df.loc[df['change_trigger'] >= 0].shape[0]
                        tempDict[f'{y}年{m}月'] = valid / sum
                ratio_df = ratio_df.append(tempDict, ignore_index=True)

        print(ratio_df.shape)
        # ratio_df.to_excel("q5_change_trigger_ratio.xls")


if __name__ == "__main__":
    df = commentAcceptRate.commentAcceptRatioByProject(["tezos"], (2020, 7, 2020, 9))
    """计算的df写入xlsx"""
    fileName = "project_index.xlsx"
    sheetName = "commentAcceptRatio"
    ExcelHelper().writeDataFrameToExcel(fileName, sheetName, df)
Example #13
 def commentDesensitization(self):
     for col in self.DesensitizeKeyList:
         ExcelHelper().desensitizationExcelCol(projectConfig.getTestoutputExcelPath(), projectConfig.TEST_OUT_PUT_SHEET_NAME\
                                               , col, 1)