def preProcess(df, dates): """参数说明 df:读取的dataframe对象 dates:四元组,后两位作为测试的年月 (,,year,month) """ """注意: 输入文件中已经带有列名了""" t1 = datetime.now() """处理NAN""" df.dropna(how='any', inplace=True) df.reset_index(drop=True, inplace=True) df.fillna(value='', inplace=True) """对df添加一列标识训练集和测试集""" df['label'] = df['pr_created_at'].apply( lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3])) """对reviewer名字数字化处理 存储人名映射字典做返回""" convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login']) """先对tag做拆分""" tagDict = dict(list(df.groupby('pr_number'))) """先尝试所有信息团在一起""" df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True) df.drop_duplicates(inplace=True) df.reset_index(drop=True, inplace=True) """用于收集所有文本向量分词""" stopwords = SplitWordHelper().getEnglishStopList() # 获取通用英语停用词 textList = [] """由于特殊性 PB算法的训练集不是dataFrame { p1:set1, p2:set2, ... } """ train_data = {} test_data = {} for row in df.itertuples(index=False, name='Pandas'): tempList = [] """获取pull request的number""" pr_num = getattr(row, 'pr_number') label = getattr(row, 'label') """获取pull request的标题""" pr_title = getattr(row, 'pr_title') pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords] """初步尝试提取词干效果反而下降了 。。。。""" """对单词做提取词干""" pr_title_word_list = nltkFunction.stemList(pr_title_word_list) tempList.extend(pr_title_word_list) """pull request的body""" pr_body = getattr(row, 'pr_body') pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords] """对单词做提取词干""" pr_body_word_list = nltkFunction.stemList(pr_body_word_list) tempList.extend(pr_body_word_list) wordSet = MultisetHelper.WordMultiset() wordSet.add(tempList) if label == 0: train_data[pr_num] = wordSet else: test_data[pr_num] = wordSet print("train size:", train_data.items().__len__()) print("test size:", test_data.items().__len__()) """问题转化为多标签问题 train_data_y [{pull_number:[(r1, s1), (r2, s2), ...]}, ... 
,{}] r 代表reviewer s 代表集合 """ train_data_y = {} for pull_number in df.loc[df['label'] == False]['pr_number']: reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login']) tempDf = tagDict[pull_number][['review_user_login', 'comment_body']].copy(deep=True) commentDict = dict(list(tempDf.groupby('review_user_login'))) reviewerList = [] for reviewer in reviewers: commentDf = commentDict[reviewer] wordSet = MultisetHelper.WordMultiset() for row in commentDf.itertuples(index=False, name='Pandas'): comment = getattr(row, 'comment_body') comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment) if x not in stopwords] """对单词做提取词干""" comment_body_word_list = nltkFunction.stemList(comment_body_word_list) wordSet.add(comment_body_word_list) reviewerList.append((reviewer, wordSet)) train_data_y[pull_number] = reviewerList test_data_y = {} for pull_number in df.loc[df['label'] == True]['pr_number']: reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login']) tempDf = tagDict[pull_number][['review_user_login', 'comment_body']].copy(deep=True) commentDict = dict(list(tempDf.groupby('review_user_login'))) reviewerList = [] for reviewer in reviewers: commentDf = commentDict[reviewer] wordSet = MultisetHelper.WordMultiset() for row in commentDf.itertuples(index=False, name='Pandas'): comment = getattr(row, 'comment_body') comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment) if x not in stopwords] """对单词做提取词干""" comment_body_word_list = nltkFunction.stemList(comment_body_word_list) wordSet.add(comment_body_word_list) reviewerList.append((reviewer, wordSet)) test_data_y[pull_number] = reviewerList print("preprocess cost time:", datetime.now() - t1) return train_data, train_data_y, test_data, test_data_y, convertDict
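# --- Illustrative sketch, not part of the original module. It assumes that
# MultisetHelper.WordMultiset behaves like a word bag (multiset); here
# collections.Counter stands in for it so the snippet runs on its own. ---
def _demo_word_multiset_overlap():
    from collections import Counter

    pr_words = Counter(["fix", "bug", "parser", "fix"])   # tokens of a pr title + body
    reviewer_words = Counter(["fix", "parser", "test"])   # tokens of one reviewer's comments
    # A PB-style score can compare the two multisets, e.g. by their overlap size:
    overlap = sum((pr_words & reviewer_words).values())   # min count per shared word
    print("multiset overlap:", overlap)                   # -> 2 ("fix" x1, "parser" x1)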
def preProcessBySlide(df, dates):
    """Parameters
        df: the dataframe loaded from file
        dates: a 4-tuple whose last two elements are the test year and month
    """
    """Note: the input file already carries column names"""
    """Handle NaN"""
    df.dropna(how='any', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """Add a label column to mark training vs. test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

    """Convert the creation time into a timestamp, expressed in days"""
    df['pr_created_at'] = df['pr_created_at'].apply(lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
    df['pr_created_at'] = df['pr_created_at'] / (24 * 3600)

    """Trim the input data first, keeping only the columns of interest"""
    df = df[['pr_number', 'pr_title', 'review_user_login', 'label', 'pr_created_at']].copy(deep=True)
    print("before filter:", df.shape)
    df.drop_duplicates(inplace=True)
    print("after filter:", df.shape)

    """Map reviewer names to numbers"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])

    """Split the data by tag (pr_number) first"""
    tagDict = dict(list(df.groupby('pr_number')))

    """First attempt: lump all the textual information together"""
    df = df[['pr_number', 'pr_title', 'label', 'pr_created_at']].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    """Collect the tokens of all text vectors"""
    stopwords = SplitWordHelper().getEnglishStopList()  # common English stop words
    textList = []
    for row in df.itertuples(index=False, name='Pandas'):
        tempList = []
        """Get the title of the pull request"""
        pr_title = getattr(row, 'pr_title')
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

        """A first attempt at stemming actually made the results worse ..."""
        """Stem the words"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)
        textList.append(tempList)

    print(len(textList))

    """Build a dictionary from the token lists and count the features"""
    dictionary = corpora.Dictionary(textList)
    print('dictionary:', dictionary)
    feature_cnt = len(dictionary.token2id)
    print("dictionary feature count:", feature_cnt)

    """Build the corpus from the dictionary"""
    corpus = [dictionary.doc2bow(text) for text in textList]
    # print('corpus:', corpus)
    """Train the TF-IDF model on the corpus"""
    tfidf = models.TfidfModel(corpus)

    """Walk the data again to build the vectors, stored as sparse dicts"""
    wordVectors = []
    for i in range(0, df.shape[0]):
        wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

    """Split the text feature vectors and labels into training set and test set"""
    trainData_index = df.loc[df['label'] == False].index
    testData_index = df.loc[df['label'] == True].index

    """Training set"""
    train_data = [wordVectors[x] for x in trainData_index]
    """Test set"""
    test_data = [wordVectors[x] for x in testData_index]

    """Densify into full vectors"""
    train_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
    test_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)
    train_data['pr_number'] = list(df.loc[df['label'] == False]['pr_number'])
    test_data['pr_number'] = list(df.loc[df['label'] == True]['pr_number'])
    train_data['pr_created_at'] = list(df.loc[df['label'] == False]['pr_created_at'])
    test_data['pr_created_at'] = list(df.loc[df['label'] == True]['pr_created_at'])

    """The task is converted into a multi-label problem
       train_data_y   {pull_number: [r1, r2, ...], ...}
    """
    train_data_y = {}
    for pull_number in df.loc[df['label'] == False]['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    test_data_y = {}
    for pull_number in df.loc[df['label'] == True]['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    """The last columns of train_data and test_data are pr_number; test_data_y is a dict"""
    return train_data, train_data_y, test_data, test_data_y, convertDict
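# --- Minimal, self-contained sketch of the TF-IDF step above (toy data, not part
# of the original module): doc2bow + TfidfModel yields the sparse
# {feature_id: weight} dicts that preProcessBySlide stores in wordVectors. ---
def _demo_tfidf_sparse_vectors():
    from gensim import corpora, models

    texts = [["fix", "parser", "bug"], ["add", "parser", "test"], ["update", "doc"]]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    tfidf = models.TfidfModel(corpus)
    sparse_vectors = [dict(tfidf[bow]) for bow in corpus]
    print(len(dictionary.token2id), sparse_vectors[0])  # feature count + one sparse dict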
def preProcess(df, dates): """参数说明 df:读取的dataframe对象 dates:四元组,后两位作为测试的年月 (,,year,month) """ """注意: 输入文件中已经带有列名了""" """空comment的review包含na信息,但作为结果集是有用的,所以只对训练集去掉na""" # """处理NAN""" # df.dropna(how='any', inplace=True) # df.reset_index(drop=True, inplace=True) df['pr_title'].fillna(value='', inplace=True) df['pr_body'].fillna(value='', inplace=True) """对df添加一列标识训练集和测试集""" df['label'] = df['pr_created_at'].apply( lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3])) """对reviewer名字数字化处理 存储人名映射字典做返回""" convertDict = DataProcessUtils.changeStringToNumber(df, ['pr_author', 'reviewer']) """用于收集所有文本向量分词""" stopwords = SplitWordHelper().getEnglishStopList() # 获取通用英语停用词 """问题:lsi的过程不能在整个数据集上面做,不然会导致pr的文本重复问题""" df_pr = df.copy(deep=True) df_pr.drop_duplicates(subset=['pull_number'], keep='first', inplace=True) df_pr.reset_index(drop=True, inplace=True) # 用于记录pr中文字的数量,对于pr少于10个word的pr.直接去掉 df_pr_word_count = [] textList = [] for row in df_pr.itertuples(index=False, name='Pandas'): tempList = [] """获取pull request的标题""" pr_title = getattr(row, 'pr_title') pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords] """初步尝试提取词干效果反而下降了 。。。。""" """对单词做提取词干""" pr_title_word_list = nltkFunction.stemList(pr_title_word_list) tempList.extend(pr_title_word_list) """pull request的body""" pr_body = getattr(row, 'pr_body') pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords] """对单词做提取词干""" pr_body_word_list = nltkFunction.stemList(pr_body_word_list) tempList.extend(pr_body_word_list) if tempList.__len__() >= 10 or getattr(row, 'label'): textList.append(tempList) if getattr(row, 'label'): df_pr_word_count.append(10) # 以便过后面的过滤 else: df_pr_word_count.append(tempList.__len__()) """去除无用的训练pr""" df_pr['count'] = df_pr_word_count df_pr = df_pr.loc[df_pr['count'] >= 10].copy(deep=True) df_pr.reset_index(drop=True, inplace=True) df_pr.drop(['count'], inplace=True, axis=1) """保存只有pr的列表""" prList = list(df_pr['pull_number']) """对已经有的本文特征向量和标签做训练集和测试集的拆分""" trainData_index = df_pr.loc[df_pr['label'] == False].index testData_index = df_pr.loc[df_pr['label'] == True].index trainDataTextList = [textList[x] for x in trainData_index] testDataTextList = [textList[x] for x in testData_index] print(textList.__len__()) """对分词列表建立字典 并提取特征数""" dictionary = corpora.Dictionary(trainDataTextList) print('词典:', dictionary) """感觉有问题,tfidf模型不应该是在全数据集上面计算,而是在训练集上面计算,而测试集的向量就是 单纯的带入模型的计算结果""" """根据词典建立语料库""" corpus = [dictionary.doc2bow(text) for text in trainDataTextList] # print('语料库:', corpus) """语料库训练TF-IDF模型""" tfidf = models.TfidfModel(corpus) corpus_tfidf = tfidf[corpus] topic_num = 10 lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_num) topic_list = lsi.print_topics() print("{0}个主题的单词分布为:\n".format(topic_num)) for topic in topic_list: print(topic) """再次遍历数据,形成向量,向量是稀疏矩阵的形式""" wordVectors = [] for i in range(0, trainDataTextList.__len__()): wordVectors.append(dict(lsi[dictionary.doc2bow(trainDataTextList[i])])) for i in range(0, testDataTextList.__len__()): wordVectors.append(dict(lsi[dictionary.doc2bow(testDataTextList[i])])) """训练集""" train_data = [wordVectors[x] for x in trainData_index] """测试集""" test_data = [wordVectors[x] for x in testData_index] """填充为向量""" train_v_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=topic_num) test_v_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=topic_num) lsi_data = 
pandas.concat([train_v_data, test_v_data], axis=0) # 0 轴合并 lsi_data['pull_number'] = prList lsi_data.reset_index(inplace=True, drop=True) train_data = df.loc[df['label'] == False] train_data.reset_index(drop=True, inplace=True) test_data = df.loc[df['label'] == True] test_data.reset_index(drop=True, inplace=True) train_data = train_data.merge(lsi_data, on="pull_number") train_data.drop(columns=['label'], inplace=True) test_data = test_data.merge(lsi_data, on="pull_number") test_data.drop(columns=['label'], inplace=True) """8ii处理NAN""" train_data.dropna(how='any', inplace=True) train_data.reset_index(drop=True, inplace=True) train_data.fillna(value='', inplace=True) """先对tag做拆分""" trainDict = dict(list(train_data.groupby('pull_number'))) testDict = dict(list(test_data.groupby('pull_number'))) test_data_y = {} for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']: reviewers = list(testDict[pull_number].drop_duplicates(['reviewer'])['reviewer']) test_data_y[pull_number] = reviewers train_data_y = {} for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']: reviewers = list(trainDict[pull_number].drop_duplicates(['reviewer'])['reviewer']) train_data_y[pull_number] = reviewers return train_data, train_data_y, test_data, test_data_y, convertDict
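# --- Sketch of the train-only LSI folding used above (toy data, not part of the
# original module): the model is fit on training documents only, and unseen test
# documents are folded in via lsi[bow], which is what the loops above rely on. ---
def _demo_lsi_fold_in():
    from gensim import corpora, models

    train_texts = [["fix", "parser", "bug"], ["add", "parser", "test"]]
    dictionary = corpora.Dictionary(train_texts)
    corpus = [dictionary.doc2bow(t) for t in train_texts]
    tfidf = models.TfidfModel(corpus)
    lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)
    test_bow = dictionary.doc2bow(["parser", "doc"])  # words unseen in training are ignored
    print(dict(lsi[test_bow]))                        # sparse {topic_id: weight}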
def preProcess(df, date, project, isSTD=False, isNOR=False):
    """Parameters
        df: the dataframe loaded from file
        date: tuple whose last two elements are the test year and month (year, month)
        project: the project name
        isSTD: whether to standardize the data
        isNOR: whether to normalize the data
    """
    print("start df shape:", df.shape)
    """Filter NaN data"""
    df.dropna(axis=0, how='any', inplace=True)
    print("after filter na:", df.shape)

    """Add a label column to mark training vs. test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
    df.reset_index(drop=True, inplace=True)

    """Map names to numbers"""
    """Reviewers with too low a frequency were already filtered out before numbering,
       so discontinuous class ids are not a concern"""
    """review_user_login must come first, otherwise the later use of candicateNum breaks"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])
    print(df.shape)
    candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
    print("candicate Num:", candicateNum)

    """Trim the input data first, keeping only the columns of interest"""
    df = df[['pr_number', 'pr_title', 'pr_body', 'review_user_login', 'label']].copy(deep=True)
    print("before filter:", df.shape)
    df.drop_duplicates(inplace=True)
    print("after filter:", df.shape)

    """Collect the ground-truth answers up front"""
    tagDict = dict(list(df.groupby('pr_number')))

    train_data = df.loc[df['label'] == 0].copy(deep=True)
    test_data = df.loc[df['label'] == 1].copy(deep=True)

    """The task is converted into a multi-label problem
       train_data_y   {pull_number: [r1, r2, ...], ...}
    """
    train_data_y = {}
    pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in pull_number_list:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    train_data.drop(columns=['review_user_login'], inplace=True)
    train_data.drop_duplicates(inplace=True)
    train_data.drop_duplicates(subset=['pr_number'], inplace=True)
    """Cast the training labels into the common multi-label classification layout"""
    train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

    test_data_y = {}
    pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    test_data.drop(columns=['review_user_login'], inplace=True)
    test_data.drop_duplicates(inplace=True)
    """pr_number deduplicated"""
    test_data.drop_duplicates(subset=['pr_number'], inplace=True)
    # test_data_y = DataProcessUtils.convertLabelListToDataFrame(test_data_y, pull_number_list, candicateNum)
    test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

    """Get the pr list"""
    prList = list(test_data['pr_number'])

    """First attempt: lump all the textual information together"""
    df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    """Collect the tokens of all text vectors"""
    stopwords = SplitWordHelper().getEnglishStopList()  # common English stop words
    textList = []
    for row in df.itertuples(index=False, name='Pandas'):
        tempList = []
        """Get the title of the pull request"""
        pr_title = getattr(row, 'pr_title')
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

        """A first attempt at stemming actually made the results worse ..."""
        """Stem the words"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)

        """Body of the pull request"""
        pr_body = getattr(row, 'pr_body')
        pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
        """Stem the words"""
        pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
        tempList.extend(pr_body_word_list)
        textList.append(tempList)

    print(len(textList))

    """Build a dictionary from the token lists and count the features"""
    dictionary = corpora.Dictionary(textList)
    print('dictionary:', dictionary)
    feature_cnt = len(dictionary.token2id)
    print("dictionary feature count:", feature_cnt)

    """Build the corpus from the dictionary"""
    corpus = [dictionary.doc2bow(text) for text in textList]
    # print('corpus:', corpus)
    """Train the TF-IDF model on the corpus"""
    tfidf = models.TfidfModel(corpus)

    """Walk the data again to build the vectors, stored as sparse dicts"""
    wordVectors = []
    for i in range(0, df.shape[0]):
        wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

    """Split the text feature vectors and labels into training set and test set"""
    trainData_index = df.loc[df['label'] == False].index
    testData_index = df.loc[df['label'] == True].index

    """Training set"""
    train_data = [wordVectors[x] for x in trainData_index]
    """Test set"""
    test_data = [wordVectors[x] for x in testData_index]

    """Densify into full vectors"""
    train_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
    test_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)

    """Feature scaling"""
    if isSTD:
        stdsc = StandardScaler()
        train_data_std = stdsc.fit_transform(train_data)
        test_data_std = stdsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    elif isNOR:
        maxminsc = MinMaxScaler()
        train_data_std = maxminsc.fit_transform(train_data)
        test_data_std = maxminsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    else:
        return train_data, train_data_y, test_data, test_data_y, convertDict, prList
def preProcess(df, dates): """参数说明 df:读取的dataframe对象 dates:四元组,后两位作为测试的年月 (,,year,month) """ """注意: 输入文件中已经带有列名了""" """空comment的review包含na信息,但作为结果集是有用的,所以只对训练集去掉na""" # """处理NAN""" # df.dropna(how='any', inplace=True) # df.reset_index(drop=True, inplace=True) df['pr_title'].fillna(value='', inplace=True) df['pr_body'].fillna(value='', inplace=True) """对df添加一列标识训练集和测试集""" df['label'] = df['pr_created_at'].apply( lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3])) """对reviewer名字数字化处理 存储人名映射字典做返回""" convertDict = DataProcessUtils.changeStringToNumber(df, ['pr_author', 'reviewer']) """用于收集所有文本向量分词""" stopwords = SplitWordHelper().getEnglishStopList() # 获取通用英语停用词 textList = [] for row in df.itertuples(index=False, name='Pandas'): tempList = [] """获取pull request的标题""" pr_title = getattr(row, 'pr_title') pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords] """初步尝试提取词干效果反而下降了 。。。。""" """对单词做提取词干""" pr_title_word_list = nltkFunction.stemList(pr_title_word_list) tempList.extend(pr_title_word_list) """pull request的body""" pr_body = getattr(row, 'pr_body') pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords] """对单词做提取词干""" pr_body_word_list = nltkFunction.stemList(pr_body_word_list) tempList.extend(pr_body_word_list) textList.append(tempList) print(textList.__len__()) """对分词列表建立字典 并提取特征数""" dictionary = corpora.Dictionary(textList) print('词典:', dictionary) feature_cnt = len(dictionary.token2id) print("词典特征数:", feature_cnt) """根据词典建立语料库""" corpus = [dictionary.doc2bow(text) for text in textList] # print('语料库:', corpus) """语料库训练TF-IDF模型""" tfidf = models.TfidfModel(corpus) """再次遍历数据,形成向量,向量是稀疏矩阵的形式""" wordVectors = [] for i in range(0, df.shape[0]): wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])])) """对已经有的本文特征向量和标签做训练集和测试集的拆分""" trainData_index = df.loc[df['label'] == False].index testData_index = df.loc[df['label'] == True].index """训练集""" train_data = [wordVectors[x] for x in trainData_index] """测试集""" test_data = [wordVectors[x] for x in testData_index] """填充为向量""" train_v_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt) test_v_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt) train_data = df.loc[df['label'] == False] train_data.reset_index(drop=True, inplace=True) test_data = df.loc[df['label'] == True] test_data.reset_index(drop=True, inplace=True) train_data = train_data.join(train_v_data) train_data.drop(columns=['label'], inplace=True) test_data = test_data.join(test_v_data) test_data.drop(columns=['label'], inplace=True) """8ii处理NAN""" train_data.dropna(how='any', inplace=True) train_data.reset_index(drop=True, inplace=True) train_data.fillna(value='', inplace=True) """先对tag做拆分""" trainDict = dict(list(train_data.groupby('pull_number'))) testDict = dict(list(test_data.groupby('pull_number'))) """过滤掉评论时间在数据集时间范围内之后的数据""" end_time = str(dates[2]) + "-" + str(dates[3]) + "-" + "01 00:00:00" train_data = train_data[train_data['commented_at'] < end_time] train_data.reset_index(drop=True, inplace=True) test_data_y = {} for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']: reviewers = list(testDict[pull_number].drop_duplicates(['reviewer'])['reviewer']) test_data_y[pull_number] = reviewers train_data_y = {} for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']: reviewers = 
list(trainDict[pull_number].drop_duplicates(['reviewer'])['reviewer']) train_data_y[pull_number] = reviewers return train_data, train_data_y, test_data, test_data_y, convertDict
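# --- Hypothetical re-implementation, not part of the original module: a guess at
# what DataProcessUtils.convertFeatureDictToDataFrame does, shown on toy data --
# densifying a list of sparse {feature_id: weight} dicts into a dataframe that is
# featureNum columns wide. ---
def _demo_convert_feature_dict(sparse_vectors, featureNum):
    import pandas

    frame = pandas.DataFrame(0.0, index=range(len(sparse_vectors)),
                             columns=range(featureNum))
    for i, vec in enumerate(sparse_vectors):
        for feature, weight in vec.items():
            frame.at[i, feature] = weight  # absent features stay 0.0
    return frame

# Example: _demo_convert_feature_dict([{0: 0.7, 2: 0.3}, {1: 1.0}], featureNum=4)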
def appendTextualFeatureVector(inputDf, projectName, date, pull_number_name):
    """
    Compute a tf-idf model over the titles and bodies of all prs; the pr information
    is read directly from the pull request data file.
    @description: appends a tf-idf feature vector built from the pr text to the given dataframe
    @notice: the dataframe must contain a pull number column (duplicates allowed)
    @param inputDf: the dataframe read beforehand
    @param projectName: the project name
    @param date: a 4-tuple (start year, start month, end year, end month)
    @param pull_number_name: name of the pull number column in inputDf
    @return: df: the dataframe with the text features appended, ready for machine learning
    """
    """Check that the input df carries a label column"""
    if 'label' not in inputDf.columns:
        raise Exception("label not in input dataframe!")
    print("input shape:", inputDf.shape)
    print(date)

    df = inputDf[[pull_number_name]].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.columns = ['pr_number']

    """Read the pull request data file"""
    pull_request_path = projectConfig.getPullRequestPath()
    pullRequestData = pandasHelper.readTSVFile(
        os.path.join(pull_request_path, f'ALL_{projectName}_data_pullrequest.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )

    """Join pull_number with the pr review commit relation"""
    df = pandas.merge(df, pullRequestData, left_on='pr_number', right_on='number')
    df = df[['pr_number', 'title', 'body']].copy(deep=True)
    df.columns = ['pr_number', 'pr_title', 'pr_body']
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """Collect the tokens of all text vectors"""
    stopwords = SplitWordHelper().getEnglishStopList()  # common English stop words
    textList = []
    for row in df.itertuples(index=False, name='Pandas'):
        tempList = []
        """Get the title of the pull request"""
        pr_title = row[list(df.columns).index('pr_title')]
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]

        """A first attempt at stemming actually made the results worse ..."""
        """Stem the words"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)

        """Body of the pull request"""
        pr_body = row[list(df.columns).index('pr_body')]
        pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
        """Stem the words"""
        pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
        tempList.extend(pr_body_word_list)
        textList.append(tempList)

    print(len(textList))

    """Build a dictionary from the token lists and count the features"""
    dictionary = corpora.Dictionary(textList)
    print('dictionary:', dictionary)
    feature_cnt = len(dictionary.token2id)
    print("dictionary feature count:", feature_cnt)

    """Build the corpus from the dictionary"""
    corpus = [dictionary.doc2bow(text) for text in textList]
    # print('corpus:', corpus)
    """Train the TF-IDF model on the corpus"""
    tfidf = models.TfidfModel(corpus)

    """Walk the data again to build the vectors, stored as sparse dicts"""
    wordVectors = []
    for i in range(0, df.shape[0]):
        wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

    """Densify into full vectors"""
    wordVectors = DataProcessUtils.convertFeatureDictToDataFrame(wordVectors, featureNum=feature_cnt)

    """Before the PCA reduction the weight dataframe must be split so that the
       training set and the test set are handled separately"""
    tempData = wordVectors.copy(deep=True)
    tempData['pr_number'] = df['pr_number']
    labelData = inputDf[['pr_number', 'label']].drop_duplicates().copy(deep=True)
    tempData = pandas.merge(tempData, labelData, on='pr_number')
    tempData_train = tempData.loc[tempData['label'] == 0].copy(deep=True)
    tempData_test = tempData.loc[tempData['label'] == 1].copy(deep=True)
    tempData_train.drop(columns=['pr_number', 'label'], inplace=True)
    tempData_test.drop(columns=['pr_number', 'label'], inplace=True)

    """PCA reduction"""
    pca = PCA(n_components=0.95)
    tempData_train = pca.fit_transform(tempData_train)
    print("after pca :", tempData_train.shape)
    print(pca.explained_variance_ratio_)
    tempData_train = pandas.DataFrame(tempData_train)
    tempData_test = pca.transform(tempData_test)
    print("after pca :", tempData_test.shape)
    tempData_test = pandas.DataFrame(tempData_test)

    tempData = pandas.concat([tempData_train, tempData_test], axis=0)  # concatenate along axis 0
    tempData.reset_index(drop=True, inplace=True)
    tempData['pr_number_t'] = df['pr_number'].copy(deep=True)

    """Join with the original features"""
    inputDf = pandas.merge(inputDf, tempData, left_on=pull_number_name, right_on='pr_number_t')
    inputDf.drop(columns=['pr_number_t'], inplace=True)
    return inputDf
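# --- Sketch of the PCA reduction above (random toy data, not part of the
# original module): with n_components=0.95 sklearn keeps as many components as
# needed to explain 95% of the variance, and the test rows reuse the projection
# fit on the training rows. ---
def _demo_pca_reduction():
    import numpy
    from sklearn.decomposition import PCA

    rng = numpy.random.RandomState(0)
    X_train, X_test = rng.rand(100, 50), rng.rand(10, 50)
    pca = PCA(n_components=0.95)
    X_train_r = pca.fit_transform(X_train)  # fit on train only
    X_test_r = pca.transform(X_test)        # project test with the same axes
    print(X_train_r.shape, X_test_r.shape, pca.explained_variance_ratio_.sum())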
def preProcess(df, dates): """参数说明 df:读取的dataframe对象 dates:四元组,后两位作为测试的年月 (,,year,month) """ """注意: 输入文件中已经带有列名了""" t1 = datetime.now() """处理NAN""" df.dropna(how='any', inplace=True) df.reset_index(drop=True, inplace=True) df.fillna(value='', inplace=True) """对df添加一列标识训练集和测试集""" df['label'] = df['pr_created_at'].apply( lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3])) """对reviewer名字数字化处理 存储人名映射字典做返回""" convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login']) """先对tag做拆分""" tagDict = dict(list(df.groupby('pr_number'))) commentDf = df[['pr_number', 'review_user_login', 'comment_body', 'label']].copy(deep=True) """用于收集所有文本向量分词""" stopwords = SplitWordHelper().getEnglishStopList() # 获取通用英语停用词 """先尝试所有信息团在一起""" df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True) df.drop_duplicates(inplace=True) df.reset_index(drop=True, inplace=True) """训练和测试做分割""" df_train = df.loc[df['label'] == 0].copy(deep=True) df_test = df.loc[df['label'] == 1].copy(deep=True) df_test.reset_index(drop=True, inplace=True) """收集训练集中的pr的文本作为 文档做LDA提取主题""" trainTextList = [] testTextList = [] for row in df.itertuples(index=False, name='Pandas'): tempList = [] """获取pull request的number""" pr_num = getattr(row, 'pr_number') label = getattr(row, 'label') """获取pull request的标题""" pr_title = getattr(row, 'pr_title') pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords] """对单词做提取词干""" pr_title_word_list = nltkFunction.stemList(pr_title_word_list) tempList.extend(pr_title_word_list) """pull request的body""" pr_body = getattr(row, 'pr_body') pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords] """对单词做提取词干""" pr_body_word_list = nltkFunction.stemList(pr_body_word_list) tempList.extend(pr_body_word_list) if label == 0: trainTextList.append(tempList) elif label == 1: testTextList.append(tempList) """收集 训练集中的comment""" trainCommentList = [] review_comment_map = {} # pr -> [(reviewer, [w1, w2, w3]), .....] 
for row in commentDf.itertuples(index=False, name='Pandas'): tempList = [] """获取pull request的number""" pr_num = getattr(row, 'pr_number') label = getattr(row, 'label') reviewer = getattr(row, 'review_user_login') """获取pull request的标题""" comment_body = getattr(row, 'comment_body') comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment_body) if x not in stopwords] """对单词做提取词干""" comment_body_word_list = nltkFunction.stemList(comment_body_word_list) tempList.extend(comment_body_word_list) if review_comment_map.get(pr_num, None) is None: review_comment_map[pr_num] = [] if label == 0: review_comment_map[pr_num].append((reviewer, tempList.copy())) trainCommentList.append(tempList) """建立LDA模型提取数据""" # 接下来就是模型构建的步骤了,首先构建词频矩阵 allTextList = [] allTextList.extend(trainTextList) allTextList.extend(trainCommentList) dictionary = corpora.Dictionary(trainTextList) corpus = [dictionary.doc2bow(text) for text in trainTextList] lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20) topic_list = lda.print_topics(20) print("20个主题的单词分布为:\n") for topic in topic_list: print(topic) """建立训练集和测试集所需的主题分布 pr_num -> {[(t1, p1), (t2, p2), .....]} """ train_data = {} test_data = {} for index, d in enumerate(lda.get_document_topics([dictionary.doc2bow(text) for text in trainTextList])): train_data[df_train['pr_number'][index]] = d for index, d in enumerate(lda.get_document_topics([dictionary.doc2bow(text) for text in testTextList])): test_data[df_test['pr_number'][index]] = d train_data_y = {} # pr -> [(reviewer, [(comment1), (comment2) ...])] for pull_number in df.loc[df['label'] == False]['pr_number']: reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login']) reviewerList = [] for reviewer in reviewers: commentTopicList = [] for r, words in review_comment_map[pull_number]: if r == reviewer: commentTopicList.append(words) commentTopicList = lda.get_document_topics([dictionary.doc2bow(text) for text in commentTopicList]) reviewerList.append((reviewer, [x for x in commentTopicList])) train_data_y[pull_number] = reviewerList test_data_y = {} for pull_number in df.loc[df['label'] == True]['pr_number']: reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login']) reviewerList = [] for reviewer in reviewers: commentTopicList = [] for r, words in review_comment_map[pull_number]: if r == reviewer: commentTopicList.append(words) commentTopicList = lda.get_document_topics([dictionary.doc2bow(text) for text in commentTopicList]) reviewerList.append((reviewer, commentTopicList)) test_data_y[pull_number] = reviewerList print("preprocess cost time:", datetime.now() - t1) return train_data, train_data_y, test_data, test_data_y, convertDict
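# --- Sketch of the LDA topic-distribution step above (toy data, not part of the
# original module): the model is trained on the training texts, and
# get_document_topics yields the [(topic_id, probability), ...] pairs that
# preProcess stores in train_data / test_data. ---
def _demo_lda_document_topics():
    from gensim import corpora, models

    train_texts = [["fix", "parser", "bug"], ["add", "ui", "button"], ["parser", "test"]]
    dictionary = corpora.Dictionary(train_texts)
    corpus = [dictionary.doc2bow(t) for t in train_texts]
    lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
    new_bow = dictionary.doc2bow(["parser", "fix"])
    print(lda.get_document_topics(new_bow))  # e.g. [(0, 0.7), (1, 0.3)]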