Пример #1
0
    def preProcess(df, date, project, isSTD=False, isNOR=False):
        """Build author-history features and multi-label reviewer targets.

        NOTE: ``df`` is modified in place (NA rows are dropped, a ``label``
        column is added, the index is reset and the login columns are
        re-encoded by ``DataProcessUtils.changeStringToNumber``) before a
        trimmed copy is taken for feature construction.

        Args:
            df: review records. The code reads the columns ``pr_created_at``,
                ``pr_number``, ``review_user_login``, ``author_user_login``,
                ``author_association``, ``commits``, ``deletions``,
                ``additions``, ``changed_files`` and ``merged``.
            date: indexable; ``date[2]`` / ``date[3]`` are the year and month
                whose pull requests form the test set. All other rows are
                training data.
            project: repository name, compared with the part after ``/`` of
                ``repo_full_name`` to decide whether a user watches the project.
            isSTD: scale the features with ``StandardScaler`` (fit on train).
            isNOR: scale the features with ``MinMaxScaler``; only consulted
                when ``isSTD`` is false.

        Returns:
            ``(train_x, train_y, test_x, test_y, convertDict, prList)`` where
            ``train_y`` / ``test_y`` come from
            ``DataProcessUtils.convertLabelListToDataFrame`` /
            ``convertLabelListToListArray``, ``convertDict`` is the mapping
            returned by ``changeStringToNumber`` and ``prList`` lists the test
            pull-request numbers in feature-row order.
        """
        print("start df shape:", df.shape)
        # Drop every row that contains a missing value.
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        def _is_test_month(created_at):
            # Parse once per row instead of once per compared field.
            parsed = time.strptime(created_at, "%Y-%m-%d %H:%M:%S")
            return parsed.tm_year == date[2] and parsed.tm_mon == date[3]

        # label is True for test rows and False for training rows.
        df['label'] = df['pr_created_at'].apply(_is_test_month)
        df.reset_index(drop=True, inplace=True)

        # Encode logins as numbers. 'review_user_login' must stay first in the
        # list, otherwise candicateNum below would be affected (per the
        # original author's note). Low-frequency reviewers are presumably
        # filtered upstream so the ids are contiguous -- TODO confirm.
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'author_user_login'])

        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        # Keep only the columns of interest.
        df = df[['pr_number', 'review_user_login', 'author_user_login', 'author_association', 'commits',
                 'deletions', 'additions', 'changed_files', 'label', 'merged']].copy(deep=True)

        print("before filter:", df.shape)
        df.drop_duplicates(inplace=True)
        print("after filter:", df.shape)

        # Author relation: True when the author is a project MEMBER.
        df['author_association'] = df['author_association'].apply(lambda x: x == 'MEMBER')

        # Cumulative history of each author over lower-numbered pull requests.
        # NOTE(review): the frame still has one row per (PR, reviewer), so
        # these are row counts rather than distinct-PR counts.
        request_number_prs = []  # rows previously submitted by the author
        request_number_merged_prs = []  # ... of which merged
        request_number_rejected_prs = []  # ... of which not merged
        request_accept_rate = []  # merged share of the history
        request_reject_rate = []  # non-merged share of the history

        for row in df.itertuples():
            history = df.loc[(df['pr_number'] < row.pr_number) &
                             (df['author_user_login'] == row.author_user_login)]
            total = history.shape[0]
            accept_times = history.loc[history['merged'] == 1].shape[0]
            request_number_prs.append(total)
            request_number_merged_prs.append(accept_times)
            request_number_rejected_prs.append(total - accept_times)
            if total > 0:
                request_accept_rate.append(accept_times / total)
                request_reject_rate.append(1 - accept_times / total)
            else:
                # No history: both rates default to 0.
                request_accept_rate.append(0)
                request_reject_rate.append(0)

        df['request_number_prs'] = request_number_prs
        df['request_number_merged_prs'] = request_number_merged_prs
        df['request_number_rejected_prs'] = request_number_rejected_prs
        df['request_accept_rate'] = request_accept_rate
        df['request_reject_rate'] = request_reject_rate

        # Does the author watch the project?
        user_watch_repo_relation_path = projectConfig.getUserWatchRepoRelation()
        userWatchRepoRelation = pandasHelper.readTSVFile(
            os.path.join(user_watch_repo_relation_path, 'userWatchRepoRelation.tsv'),
            pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
        )
        watchRepoMap = {}
        for login, user_id in convertDict.items():
            watched_repos = set(userWatchRepoRelation.loc[userWatchRepoRelation['login'] == login]['repo_full_name'])
            isFollow = False
            for repo in watched_repos:
                # repo_full_name is expected to be exactly "owner/name".
                _owner, name = repo.split('/')
                if name == project:
                    isFollow = True
            watchRepoMap[user_id] = isFollow

        df['request_watches'] = [watchRepoMap[author] for author in df['author_user_login']]

        # Follower count, following count, and whether the author follows a
        # core (MEMBER) author.
        user_follow_relation_path = projectConfig.getUserFollowRelation()
        userFollowRelation = pandasHelper.readTSVFile(
            os.path.join(user_follow_relation_path, 'userFollowRelation.tsv'),
            pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
        )

        followerCountMap = {}
        followingCountMap = {}
        followCoreMemberMap = {}

        # Core members are stored as the numeric ids held in the frame.
        coreMemberSet = set(df.loc[df['author_association'] == 1]['author_user_login'])

        for login, user_id in convertDict.items():
            following = set(userFollowRelation.loc[userFollowRelation['login'] == login]['following_login'])
            followingCountMap[user_id] = len(following)
            # 'following' holds raw login strings while coreMemberSet holds
            # numeric ids, so each login is translated through convertDict
            # before the membership test. Comparing the raw strings directly
            # could never match.
            followCoreMemberMap[user_id] = any(
                convertDict.get(followed) in coreMemberSet for followed in following)

            followers = set(userFollowRelation.loc[userFollowRelation['following_login'] == login]['login'])
            followerCountMap[user_id] = len(followers)

        df['request_number_following'] = [followingCountMap[a] for a in df['author_user_login']]
        df['request_number_follows'] = [followerCountMap[a] for a in df['author_user_login']]
        df['request_follow_ct'] = [followCoreMemberMap[a] for a in df['author_user_login']]

        # Group rows per pull request up front to collect the true reviewers.
        tagDict = dict(list(df.groupby('pr_number')))

        train_data = df.loc[df['label'] == 0].copy(deep=True)
        test_data = df.loc[df['label'] == 1].copy(deep=True)

        # Multi-label targets: {pull_number: [reviewer ids]}.
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        train_data_y = {
            pull_number: list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            for pull_number in pull_number_list
        }

        train_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        train_data.drop_duplicates(subset=['pr_number'], inplace=True)
        train_data.drop(columns=['pr_number'], inplace=True)
        # Convert the training targets into the common multi-label layout.
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        test_data_y = {
            pull_number: list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            for pull_number in pull_number_list
        }

        test_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
        test_data.drop_duplicates(subset=['pr_number'], inplace=True)

        # Test pull-request numbers, in the same order as the feature rows.
        prList = list(test_data['pr_number'])
        test_data.drop(columns=['pr_number'], inplace=True)

        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        # Optional feature scaling; the scaler is always fit on train only.
        if isSTD:
            scaler = StandardScaler()
        elif isNOR:
            scaler = MinMaxScaler()
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList

        train_data_std = scaler.fit_transform(train_data)
        test_data_std = scaler.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
Пример #2
0
    def preProcess(df, date, project, isSTD=False, isNOR=False):
        """Build TF-IDF text features and multi-label reviewer targets.

        NOTE: ``df`` is modified in place (NA rows are dropped, a ``label``
        column is added, the index is reset and ``review_user_login`` is
        re-encoded) before a trimmed copy is taken.

        Args:
            df: review records. The code reads the columns ``pr_created_at``,
                ``pr_number``, ``pr_title``, ``pr_body`` and
                ``review_user_login``.
            date: indexable; ``date[2]`` / ``date[3]`` are the year and month
                whose pull requests form the test set.
            project: unused here; kept for signature compatibility.
            isSTD: scale the features with ``StandardScaler`` (fit on train).
            isNOR: scale the features with ``MinMaxScaler``; only consulted
                when ``isSTD`` is false.

        Returns:
            ``(train_x, train_y, test_x, test_y, convertDict, prList)``;
            ``prList`` lists the test pull-request numbers in order of first
            appearance.
        """
        print("start df shape:", df.shape)
        # Drop every row that contains a missing value.
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        def _is_test_month(created_at):
            # Parse once per row instead of once per compared field.
            parsed = time.strptime(created_at, "%Y-%m-%d %H:%M:%S")
            return parsed.tm_year == date[2] and parsed.tm_mon == date[3]

        # label is True for test rows and False for training rows.
        df['label'] = df['pr_created_at'].apply(_is_test_month)
        df.reset_index(drop=True, inplace=True)

        # Encode reviewer logins as numbers. Low-frequency reviewers are
        # presumably filtered upstream so the ids are contiguous -- TODO confirm.
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        # Keep only the columns of interest.
        df = df[['pr_number', 'pr_title', 'pr_body', 'review_user_login', 'label']].copy(deep=True)

        print("before filter:", df.shape)
        df.drop_duplicates(inplace=True)
        print("after filter:", df.shape)

        # Group rows per pull request up front to collect the true reviewers.
        tagDict = dict(list(df.groupby('pr_number')))

        train_rows = df.loc[df['label'] == 0]
        test_rows = df.loc[df['label'] == 1]

        # Multi-label targets: {pull_number: [reviewer ids]}.
        pull_number_list = train_rows.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        train_data_y = {
            pull_number: list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            for pull_number in pull_number_list
        }
        # Convert the training targets into the common multi-label layout.
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        pull_number_list = test_rows.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        test_data_y = {
            pull_number: list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            for pull_number in pull_number_list
        }
        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        # Test pull-request numbers in order of first appearance. This is the
        # same list the de-duplicated test frame used to provide.
        prList = list(pull_number_list)

        # One text document per pull request (title and body together).
        df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
        df.drop_duplicates(inplace=True)
        # Targets and prList hold exactly one entry per pr_number, so the
        # feature rows must too. Full-row de-duplication alone leaves several
        # rows for a PR whose title/body differ between records.
        df.drop_duplicates(subset=['pr_number'], inplace=True)
        df.reset_index(drop=True, inplace=True)

        # Common English stop words, removed before stemming.
        stopwords = SplitWordHelper().getEnglishStopList()

        def _stemmed_words(text):
            words = [w for w in FleshReadableUtils.word_list(text) if w not in stopwords]
            # The original author noted that stemming initially reduced accuracy.
            return nltkFunction.stemList(words)

        textList = []
        for row in df.itertuples(index=False, name='Pandas'):
            tokens = []
            tokens.extend(_stemmed_words(row.pr_title))
            tokens.extend(_stemmed_words(row.pr_body))
            textList.append(tokens)

        print(len(textList))
        # Build the vocabulary and take its size as the feature count.
        dictionary = corpora.Dictionary(textList)
        print('词典:', dictionary)

        feature_cnt = len(dictionary.token2id)
        print("词典特征数:", feature_cnt)

        # Bag-of-words corpus and the TF-IDF model trained on it.
        # NOTE(review): the model is fit on train and test documents together.
        corpus = [dictionary.doc2bow(text) for text in textList]
        tfidf = models.TfidfModel(corpus)

        # Sparse TF-IDF vectors as {token_id: weight}; the bag-of-words built
        # above is reused instead of being recomputed per document.
        wordVectors = [dict(tfidf[bow]) for bow in corpus]

        # Split the vectors by label. The index is positional after the
        # reset_index above.
        trainData_index = df.loc[df['label'] == False].index
        testData_index = df.loc[df['label'] == True].index

        train_data = [wordVectors[x] for x in trainData_index]
        test_data = [wordVectors[x] for x in testData_index]
        # Expand the sparse dicts into dense feature frames.
        train_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
        test_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)

        # Optional feature scaling; the scaler is always fit on train only.
        if isSTD:
            scaler = StandardScaler()
        elif isNOR:
            scaler = MinMaxScaler()
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList

        train_data_std = scaler.fit_transform(train_data)
        test_data_std = scaler.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
Пример #3
0
    def preProcess(df, date, project, featureType, isSTD=False, isNOR=False):
        """Build PR-attribute features and multi-label reviewer targets.

        NOTE: ``df`` is modified in place (NA rows are dropped, a ``label``
        column is added, rows are sorted by ``pr_number`` and the index is
        reset) before the optional feature-append helpers run.

        Args:
            df: review records. The code reads the columns ``pr_created_at``,
                ``pr_number``, ``review_user_login``, ``pr_user_login``,
                ``pr_base_label`` and ``pr_head_label``; every remaining
                column is passed through as a feature.
            date: indexable; ``date[2]`` / ``date[3]`` are the year and month
                whose pull requests form the test set.
            project: forwarded to the feature-append helpers.
            featureType: 1 or 3 appends file-path features, 2 or 3 appends
                title/body text features; any other value appends nothing.
            isSTD: scale the features with ``StandardScaler`` (fit on train).
            isNOR: scale the features with ``MinMaxScaler``; only consulted
                when ``isSTD`` is false.

        Returns:
            ``(train_x, train_y, test_x, test_y, convertDict, prList)``;
            ``prList`` lists the test pull-request numbers in feature-row order.
        """
        print("start df shape:", df.shape)
        # Drop every row that contains a missing value.
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        time_format = "%Y-%m-%d %H:%M:%S"

        def _is_test_month(created_at):
            # Parse once per row instead of once per compared field.
            parsed = time.strptime(created_at, time_format)
            return parsed.tm_year == date[2] and parsed.tm_mon == date[3]

        # label is True for test rows and False for training rows.
        df['label'] = df['pr_created_at'].apply(_is_test_month)

        df.sort_values(by='pr_number', ascending=True, inplace=True)
        df.reset_index(drop=True, inplace=True)

        # Original author's note: PCA must not be fit on train and test
        # together (that would leak later information), so the append helpers
        # must run after the label column exists.
        if featureType == 1 or featureType == 3:
            # Append file-path features.
            df = appendFilePathFeatureVector(df, project, date, 'pr_number')
        if featureType == 2 or featureType == 3:
            # Append PR title/body text features.
            df = appendTextualFeatureVector(df, project, date, 'pr_number')

        # Encode logins as numbers. 'review_user_login' must stay first in the
        # list, otherwise candicateNum below would be affected (per the
        # original author's note). Low-frequency reviewers are presumably
        # filtered upstream so the ids are contiguous -- TODO confirm.
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'pr_user_login'])
        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        # Branch handling: drop the base label and keep only the part of the
        # head label before ':', encoded as a number. The part after ':' was
        # previously computed and discarded; its only effect was an IndexError
        # on labels without ':'.
        df.drop(columns=['pr_base_label'], inplace=True)
        df['pr_head_label'] = df['pr_head_label'].apply(lambda x: x.split(':')[0])
        DataProcessUtils.changeStringToNumber(df, ['pr_head_label'])

        # Creation time as an integer timestamp (local time, via time.mktime).
        df['pr_created_at'] = df['pr_created_at'].apply(
            lambda x: int(time.mktime(time.strptime(x, time_format))))

        # Group rows per pull request to collect the true reviewers.
        tagDict = dict(list(df.groupby('pr_number')))

        # Split features by label, then drop the label column.
        train_data = df.loc[df['label'] == False].copy(deep=True)
        test_data = df.loc[df['label']].copy(deep=True)

        train_data.drop(columns=['label'], inplace=True)
        test_data.drop(columns=['label'], inplace=True)

        # Multi-label targets: {pull_number: [reviewer ids]}.
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        train_data_y = {
            pull_number: list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            for pull_number in pull_number_list
        }

        train_data.drop(columns=['review_user_login'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        # Keep exactly one feature row per pull request so the rows line up
        # with the per-PR targets, as the sibling preProcess variants do.
        train_data.drop_duplicates(subset=['pr_number'], inplace=True)
        # Convert the training targets into the common multi-label layout.
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        test_data_y = {
            pull_number: list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            for pull_number in pull_number_list
        }

        test_data.drop(columns=['review_user_login'], inplace=True)
        test_data.drop_duplicates(inplace=True)
        test_data.drop_duplicates(subset=['pr_number'], inplace=True)
        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        # Test pull-request numbers, in the same order as the feature rows.
        prList = list(test_data['pr_number'])

        # pr_number is an identifier, not a feature.
        test_data.drop(columns=['pr_number'], inplace=True)
        train_data.drop(columns=['pr_number'], inplace=True)

        # Optional feature scaling; the scaler is always fit on train only.
        if isSTD:
            scaler = StandardScaler()
        elif isNOR:
            scaler = MinMaxScaler()
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList

        train_data_std = scaler.fit_transform(train_data)
        test_data_std = scaler.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
Пример #4
0
    def preProcess(df, date, project, isSTD=False, isNOR=False, m=3):
        """Build relationship/activeness features and multi-label targets.

        For every row and every training-set reviewer the function derives
        relationship features (prior/recent evaluations of the author, follow
        relations) and activeness features (total pulls, evaluated pulls,
        recent pulls, first/last review age, mean response time). Training
        rows are computed from lower-numbered pull requests, test rows from
        the whole training set.

        NOTE: ``df`` is modified in place (NA rows are dropped, ``label``,
        ``label_y`` and ``label_m`` are added, the index is reset) before
        ``appendFilePathFeatureVector`` returns the frame used afterwards.

        Args:
            df: review records. The code reads the columns ``pr_created_at``,
                ``comment_at``, ``pr_number``, ``review_user_login`` and
                ``pr_user_login``.
            date: indexable ``(start_year, start_month, test_year,
                test_month)``; ``date[2]`` / ``date[3]`` select the test rows
                and ``date[0]`` / ``date[1]`` bound the default time spans.
            project: forwarded to ``appendFilePathFeatureVector``.
            isSTD: scale the features with ``StandardScaler`` (fit on train).
            isNOR: scale the features with ``MinMaxScaler``; only consulted
                when ``isSTD`` is false.
            m: window length in months for the "recent" features.

        Returns:
            ``(train_x, train_y, test_x, test_y, convertDict, prList)``;
            ``prList`` lists the test pull-request numbers in feature-row order.
        """
        print("start df shape:", df.shape)
        # Drop every row that contains a missing value.
        df.dropna(axis=0, how='any', inplace=True)
        print("after fliter na:", df.shape)

        def _parse(stamp):
            return time.strptime(stamp, "%Y-%m-%d %H:%M:%S")

        def _is_test_month(stamp):
            parsed = _parse(stamp)
            return parsed.tm_year == date[2] and parsed.tm_mon == date[3]

        def _in_window(frame, ref_year, ref_month):
            # Boolean mask of rows created no more than m months before the
            # reference month. Vectorised form of the former row-wise check
            # ``label_y * 12 + label_m >= ref_year * 12 + ref_month - m``.
            return frame['label_y'] * 12 + frame['label_m'] >= ref_year * 12 + ref_month - m

        # label is True for test rows; label_y / label_m keep the creation
        # year and month for the recency window.
        df['label'] = df['pr_created_at'].apply(_is_test_month)
        df['label_y'] = df['pr_created_at'].apply(lambda x: _parse(x).tm_year)
        df['label_m'] = df['pr_created_at'].apply(lambda x: _parse(x).tm_mon)
        df.reset_index(drop=True, inplace=True)

        # Original author's note: PCA must not be fit on train and test
        # together (that would leak later information), so the append helper
        # must run after the label column exists.
        df = appendFilePathFeatureVector(df, project, date, 'pr_number')

        # Load the user-follow relation table.
        user_follow_relation_path = projectConfig.getUserFollowRelation()
        userFollowRelation = pandasHelper.readTSVFile(
            os.path.join(user_follow_relation_path, 'userFollowRelation.tsv'),
            pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
        )

        # Encode logins as numbers. 'review_user_login' must stay first in the
        # list, otherwise candicateNum below would be affected (per the
        # original author's note). Low-frequency reviewers are presumably
        # filtered upstream so the ids are contiguous -- TODO confirm.
        convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'pr_user_login'])

        print(df.shape)
        candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
        print("candicate Num:", candicateNum)

        # Candidate reviewers: everyone who reviewed in the training set.
        reviewer_list = list(set(df.loc[df['label'] == 0]['review_user_login']))
        reviewer_set = set(reviewer_list)

        # Convert pr_created_at and comment_at to timestamps (local time).
        df['pr_created_at'] = df['pr_created_at'].apply(lambda x: time.mktime(_parse(x)))
        df['comment_at'] = df['comment_at'].apply(lambda x: time.mktime(_parse(x)))
        df['response_time'] = df['comment_at'] - df['pr_created_at']

        # Relationship features, one list per candidate reviewer:
        #   prior evaluation  - earlier reviews of this author by the reviewer
        #   recent evaluation - the same, limited to the last m months
        #   follower / following relation - looked up in followMap as
        #       (author, reviewer) and (reviewer, author)
        startTime = datetime.now()
        prior_evaluation = {}
        recent_evaluation = {}
        follower_relation = {}
        following_relation = {}

        # followMap[(a, b)] == 1 when user a follows user b (numeric ids).
        followMap = {}
        for login, user_id in convertDict.items():
            followed_logins = set(userFollowRelation.loc[userFollowRelation['login'] == login]['following_login'])
            for followed in followed_logins:
                if followed in convertDict:
                    followMap[(user_id, convertDict[followed])] = 1

        for reviewer in reviewer_list:
            prior_evaluation[reviewer] = []
            recent_evaluation[reviewer] = []
            follower_relation[reviewer] = []
            following_relation[reviewer] = []

        # Positional access works for both namedtuple and plain-tuple rows.
        # The positions stay valid below because new columns are only ever
        # appended at the end.
        cols = list(df.columns)
        index_pr_number = cols.index("pr_number")
        index_pr_author = cols.index("pr_user_login")
        index_pr_label = cols.index("label")
        index_pr_label_m = cols.index("label_m")
        index_pr_label_y = cols.index("label_y")

        for data in df.itertuples(index=False, name='Pandas'):
            pullNumber = data[index_pr_number]
            author = data[index_pr_author]
            label = data[index_pr_label]
            label_m = data[index_pr_label_m]
            label_y = data[index_pr_label_y]

            if label == 0:
                # Training row: history is every lower-numbered pull request.
                history = df.loc[df['pr_number'] < pullNumber]
                ref_year, ref_month = label_y, label_m
            else:
                # Test row: history is the whole training set.
                history = df.loc[df['label'] == 0]
                ref_year, ref_month = date[2], date[3]
            # Filter with the subset's own column rather than a mask built on
            # the full frame.
            history = history.loc[history['pr_user_login'] == author]

            prior_counts = dict(history['review_user_login'].value_counts())
            recent = history.loc[_in_window(history, ref_year, ref_month)]
            recent_counts = dict(recent['review_user_login'].value_counts())

            for r in reviewer_list:
                prior_evaluation[r].append(prior_counts.get(r, 0))
                recent_evaluation[r].append(recent_counts.get(r, 0))
                follower_relation[r].append(followMap.get((author, r), 0))
                following_relation[r].append(followMap.get((r, author), 0))

        # Attach the relationship features to the frame.
        for r in reviewer_list:
            df[f'prior_evaluation_{r}'] = prior_evaluation[r]
            df[f'recent_evaluation_{r}'] = recent_evaluation[r]
            df[f'follower_relation_{r}'] = follower_relation[r]
            df[f'following_relation_{r}'] = following_relation[r]

        print("prior cost time:", datetime.now() - startTime)
        startTime = datetime.now()

        # start_time: one day before the first day of the starting month.
        start_time = time.strptime(f"{date[0]}-{date[1]}-01 00:00:00", "%Y-%m-%d %H:%M:%S")
        start_time = int(time.mktime(start_time) - 86400)
        # end_time: the last second before the test month begins.
        end_time = time.strptime(f"{date[2]}-{date[3]}-01 00:00:00", "%Y-%m-%d %H:%M:%S")
        end_time = int(time.mktime(end_time) - 1)
        # Default span used for reviewers without any history.
        max_gap = end_time - start_time

        # Activeness features, one list per candidate reviewer.
        total_pulls = {}  # distinct pull requests in the history
        evaluate_pulls = {}  # history rows reviewed by the reviewer
        recent_pulls = {}  # ... within the last m months
        evaluate_time = {}  # mean response time
        last_time = {}  # end_time minus the latest reviewed PR's creation time
        first_time = {}  # end_time minus the earliest reviewed PR's creation time
        for reviewer in reviewer_list:
            total_pulls[reviewer] = []
            evaluate_pulls[reviewer] = []
            recent_pulls[reviewer] = []
            evaluate_time[reviewer] = []
            last_time[reviewer] = []
            first_time[reviewer] = []

        for count, data in enumerate(df.itertuples(index=False)):
            print("count for active:", count)
            pullNumber = data[index_pr_number]
            label = data[index_pr_label]
            label_m = data[index_pr_label_m]
            label_y = data[index_pr_label_y]

            # The history is only read, never mutated, so no copy is needed.
            if label == 0:
                history = df.loc[df['pr_number'] < pullNumber]
                ref_year, ref_month = label_y, label_m
            else:
                history = df.loc[df['label'] == 0]
                ref_year, ref_month = date[2], date[3]

            total_pull_number = len(set(history['pr_number']))

            # Reviewers that appear in the history (groups are never empty).
            seen = set()
            for r, reviewer_rows in history.groupby('review_user_login'):
                if r not in reviewer_set:
                    # A reviewer outside the candidate set can show up when a
                    # training row's history contains test rows. There is no
                    # feature column for it, so skip it instead of failing.
                    continue
                seen.add(r)
                row_count = reviewer_rows.shape[0]
                created_times = list(reviewer_rows['pr_created_at'])
                total_pulls[r].append(total_pull_number)
                first_time[r].append(end_time - min(created_times))
                last_time[r].append(end_time - max(created_times))
                evaluate_pulls[r].append(row_count)
                evaluate_time[r].append(sum(reviewer_rows['response_time']) / row_count)

            # Reviewers without history: age 0 and the maximum gap elsewhere.
            for r in reviewer_list:
                if r in seen:
                    continue
                total_pulls[r].append(total_pull_number)
                evaluate_pulls[r].append(0)
                first_time[r].append(0)
                last_time[r].append(max_gap)
                evaluate_time[r].append(max_gap)

            # Recount within the last m months.
            recent = history.loc[_in_window(history, ref_year, ref_month)]
            recent_counts = dict(recent['review_user_login'].value_counts())
            for r in reviewer_list:
                recent_pulls[r].append(int(recent_counts.get(r, 0)))

        # Attach the activeness features to the frame.
        for r in reviewer_list:
            df[f'total_pulls_{r}'] = total_pulls[r]
            df[f'evaluate_pulls_{r}'] = evaluate_pulls[r]
            df[f'recent_pulls_{r}'] = recent_pulls[r]
            df[f'first_time_{r}'] = first_time[r]
            df[f'last_time_{r}'] = last_time[r]
            df[f'evaluate_time_{r}'] = evaluate_time[r]

        print("active cost time:", datetime.now() - startTime)

        # Group rows per pull request to collect the true reviewers.
        tagDict = dict(list(df.groupby('pr_number')))

        # Split features by label, then drop the label column.
        train_data = df.loc[df['label'] == False].copy(deep=True)
        test_data = df.loc[df['label']].copy(deep=True)

        train_data.drop(columns=['label'], inplace=True)
        test_data.drop(columns=['label'], inplace=True)

        # Multi-label targets: {pull_number: [reviewer ids]}.
        pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        train_data_y = {
            pull_number: list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            for pull_number in pull_number_list
        }

        train_data.drop(columns=['review_user_login'], inplace=True)
        train_data.drop_duplicates(inplace=True)
        train_data.drop_duplicates(subset=['pr_number'], inplace=True)
        # Convert the training targets into the common multi-label layout.
        train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

        pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
        test_data_y = {
            pull_number: list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
            for pull_number in pull_number_list
        }

        test_data.drop(columns=['review_user_login'], inplace=True)
        test_data.drop_duplicates(inplace=True)
        test_data.drop_duplicates(subset=['pr_number'], inplace=True)
        test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

        # Test pull-request numbers, in the same order as the feature rows.
        prList = list(test_data['pr_number'])

        # pr_number is an identifier, not a feature.
        test_data.drop(columns=['pr_number'], inplace=True)
        train_data.drop(columns=['pr_number'], inplace=True)

        # Drop the helper columns that only served feature construction.
        helper_columns = ['pr_created_at', 'pr_user_login',
                          'comment_at', 'label_y', 'label_m', 'response_time']
        test_data.drop(columns=helper_columns, inplace=True)
        train_data.drop(columns=helper_columns, inplace=True)

        # Optional feature scaling; the scaler is always fit on train only.
        if isSTD:
            scaler = StandardScaler()
        elif isNOR:
            scaler = MinMaxScaler()
        else:
            return train_data, train_data_y, test_data, test_data_y, convertDict, prList

        train_data_std = scaler.fit_transform(train_data)
        test_data_std = scaler.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList