def RecommendByDT(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label classification: decision tree with grid search"""
    grid_parameters = [{'min_samples_leaf': [2, 4, 8, 16, 32, 64],
                        'max_depth': [2, 4, 6, 8]}]  # parameters to tune
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import GridSearchCV

    clf = DecisionTreeClassifier()
    clf = GridSearchCV(clf, param_grid=grid_parameters, n_jobs=-1)
    clf.fit(train_data, train_data_y)

    predictions = clf.predict_proba(test_data)
    print(clf.best_params_)

    """Convert the per-label prediction result into a single data array"""
    predictions = DataProcessUtils.convertMultilabelProbaToDataArray(predictions)
    print(predictions)

    recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                         recommendNum)
    answerList = test_data_y
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
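# Illustration only (not part of the original module): with an indicator-matrix
# y, sklearn tree models return predict_proba as a *list* of per-label arrays,
# which is why convertMultilabelProbaToDataArray is needed above. A minimal
# standalone sketch of that collapse, on hypothetical toy data:
def _demoMultilabelProba():
    import numpy as np
    from sklearn.tree import DecisionTreeClassifier
    np.random.seed(0)
    X = np.random.rand(20, 4)
    y = (np.random.rand(20, 3) > 0.5).astype(int)  # 3 binary reviewer labels
    y[0] = 0
    y[1] = 1  # make sure both classes appear for every label
    clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
    proba = clf.predict_proba(X)                        # list of 3 arrays, each (20, 2)
    dense = np.column_stack([p[:, -1] for p in proba])  # keep positive-class probability
    print(len(proba), dense.shape)                      # 3 (20, 3)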
def RecommendByRF(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label classification: random forest"""
    clf = RandomForestClassifier(n_estimators=50, max_depth=5, n_jobs=-1)

    """Tune the number of weak learners"""
    # param_test1 = {'n_estimators': range(200, 250, 10)}
    # clf = GridSearchCV(estimator=clf, param_grid=param_test1)
    # print(clf.best_params_, clf.best_score_)

    """Tune the decision-tree parameters"""
    # param_test2 = {'max_depth': range(6, 8, 1), 'min_samples_split': range(18, 22, 1)}
    # clf = GridSearchCV(estimator=clf, param_grid=param_test2, cv=5, n_jobs=5)

    clf.fit(train_data, train_data_y)
    predictions = clf.predict_proba(test_data)
    # print(clf.best_params_)
    # print(clf.best_score_)
    # print(clf.cv_results_)

    """Convert the per-label prediction result into a single data array"""
    predictions = DataProcessUtils.convertMultilabelProbaToDataArray(predictions)
    print(predictions)

    recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                         recommendNum)
    answerList = test_data_y
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False, is_split=False):
    """Given a single date tuple and a project name, return the recommendation list
       and the answer list. This interface can also be called by hybrid algorithms."""
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # print(y, m)
        if i < date[2] * 12 + date[3]:
            if filter_train:
                filename = projectConfig.getCNDataPath() + os.sep + \
                           f'CN_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getCNDataPath() + os.sep + \
                           f'CN_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        else:
            if filter_test:
                filename = projectConfig.getCNDataPath() + os.sep + \
                           f'CN_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getCNDataPath() + os.sep + \
                           f'CN_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # merge
    df.reset_index(inplace=True, drop=True)

    """Preprocess df; also returns the name-mapping dictionary"""
    train_data, train_data_y, test_data, test_data_y, convertDict = CNTrain.preProcess(df, date)
    if not is_split:
        prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
        prList.sort()
        prList, communities_data = CNTrain.RecommendByCN(project, date, train_data, train_data_y, test_data,
                                                         test_data_y, convertDict, recommendNum=recommendNum)
    else:
        prList, communities_data = CNTrain.RecommendByCNSplit(project, date, train_data, train_data_y, test_data,
                                                              test_data_y, convertDict, recommendNum=recommendNum)

    """Save the recommendation result locally"""
    DataProcessUtils.saveRecommendList(prList, communities_data['whole']['recommend_list'],
                                       communities_data['whole']['answer_list'], convertDict,
                                       communities_data['whole']['author_list'],
                                       key=project + str(date) + str(filter_train) + str(filter_test))

    """Also return the train/test set sizes for statistics"""
    # from source.scikit.combine.CBTrain import CBTrain
    # recommendList, answerList = CBTrain.recoverName(recommendList, answerList, convertDict)
    trainSize = (train_data.shape, test_data.shape)
    print(trainSize)
    return prList, convertDict, trainSize, communities_data
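# Illustration only (not part of the original module): the loops above walk a
# linear month index i = year * 12 + month and decode it back to (y, m); the
# m == 0 branch is what maps December back correctly.
def _demoMonthIndex():
    for i in range(2018 * 12 + 11, 2019 * 12 + 2 + 1):
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        print(i, (y, m))  # (2018, 11), (2018, 12), (2019, 1), (2019, 2)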
def testMLAlgorithms(project, dates, algorithm):
    """Test-harness interface that unifies algorithms with a similar workflow.
       algorithm : svm, dt, rf
    """
    recommendNum = 5  # number of recommendations
    excelName = f'output{algorithm}.xlsx'
    sheetName = 'result'

    """Initialise the excel file"""
    ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['training set', 'test set'])

    for date in dates:
        startTime = datetime.now()
        """Read the file variant without file paths"""
        filename = projectConfig.getRootPath() + os.sep + 'data' + os.sep + 'train' + os.sep + \
                   f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
        df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)
        print("raw df:", df.shape)

        # """Read the file variant that includes file paths"""
        # filename = projectConfig.getRootPath() + os.sep + r'data' + os.sep + 'train' + os.sep + \
        #            f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}_include_filepath.csv'
        # df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD,
        #                               sep=StringKeyUtils.STR_SPLIT_SEP_CSV)

        """Preprocess df"""
        train_data, train_data_y, test_data, test_data_y = MLTrain.preProcessForSingleLabel(df, date, project,
                                                                                            isNOR=True)
        recommendList = None
        answerList = None
        """Get the recommendation list for the chosen algorithm"""
        if algorithm == StringKeyUtils.STR_ALGORITHM_SVM:  # support vector machine
            recommendList, answerList = MLTrain.RecommendBySVM(train_data, train_data_y, test_data,
                                                               test_data_y, recommendNum=recommendNum)
        elif algorithm == StringKeyUtils.STR_ALGORITHM_DT:  # decision tree
            recommendList, answerList = MLTrain.RecommendByDecisionTree(train_data, train_data_y, test_data,
                                                                        test_data_y, recommendNum=recommendNum)
        elif algorithm == StringKeyUtils.STR_ALGORITHM_RF:  # random forest
            recommendList, answerList = MLTrain.RecommendByRandomForest(train_data, train_data_y, test_data,
                                                                        test_data_y, recommendNum=recommendNum)

        """Evaluate the recommendation list"""
        topk, mrr = DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

        """Write the result to excel"""
        DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, date)

        """Row separator"""
        content = ['']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        content = ['training set', 'test set']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        print("cost time:", datetime.now() - startTime)
def algorithmBody(date, project, recommendNum=5, filter_train=False, filter_test=False, a=0.5):
    """Given a single date tuple and a project name, return the recommendation list
       and the answer list. This interface can also be called by hybrid algorithms."""
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # print(y, m)
        if i < date[2] * 12 + date[3]:
            if filter_train:
                filename = projectConfig.getEARECDataPath() + os.sep + \
                           f'EAREC_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getEARECDataPath() + os.sep + \
                           f'EAREC_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        else:
            if filter_test:
                filename = projectConfig.getEARECDataPath() + os.sep + \
                           f'EAREC_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getEARECDataPath() + os.sep + \
                           f'EAREC_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # merge
    df.reset_index(inplace=True, drop=True)

    """Preprocess df; also returns the name-mapping dictionary"""
    train_data, train_data_y, test_data, test_data_y, convertDict = EARECTrain.preProcess(df, date)

    prList = list(test_data.drop_duplicates(['pull_number'])['pull_number'])
    # prList.sort()

    recommendList, answerList = EARECTrain.RecommendByEAREC(train_data, train_data_y, test_data, test_data_y,
                                                            convertDict, recommendNum=recommendNum, a=a)

    """Save the recommendation result locally"""
    DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date))

    """Also return the train/test set sizes"""
    trainSize = (train_data.shape, test_data.shape)
    print(trainSize)
    return recommendList, answerList, prList, convertDict, trainSize
def RecommendByNativeBayes(train_data, train_data_y, test_data, test_data_y, recommendNum=5, bayesType=1):
    """Naive Bayes
       recommendNum : number of recommendations
       bayesType : 1 Bernoulli  2 Gaussian  3 Multinomial
    """
    from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

    clf = None
    if bayesType == 2:
        clf = GaussianNB()
    elif bayesType == 3:
        clf = MultinomialNB()
        param = {"alpha": [0.2 * x for x in range(0, 10)], "fit_prior": [False, True]}
        clf = GridSearchCV(clf, param_grid=param)
    elif bayesType == 1:
        clf = BernoulliNB()

    clf.fit(X=train_data, y=train_data_y)
    if bayesType == 3:
        print(clf.best_params_, clf.best_score_)

    """Inspect the learning curve of the algorithm"""
    MLGraphHelper.plot_learning_curve(clf, 'Bayes', train_data, train_data_y).show()

    pre = clf.predict_proba(test_data)
    pre_class = clf.classes_
    # print(clf.classes_)
    recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
    # print(recommendList)
    answer = [[x] for x in test_data_y]
    # print(answer)
    return [recommendList, answer]
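# Illustration only (not part of the original module): GridSearchCV refits the
# best estimator and delegates classes_ / predict_proba to it, which is what
# the bayesType == 3 branch above relies on. A minimal sketch on toy counts:
def _demoNBGridSearch():
    import numpy as np
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import GridSearchCV
    rng = np.random.RandomState(0)
    X = rng.randint(0, 5, size=(60, 8))  # non-negative count features
    y = np.arange(60) % 3                # 3 balanced reviewer classes
    clf = GridSearchCV(MultinomialNB(), param_grid={'alpha': [0.2, 0.4, 0.8]}, cv=3)
    clf.fit(X, y)
    print(clf.best_params_, clf.classes_)  # attributes delegated to best_estimator_
    print(clf.predict_proba(X[:2]).shape)  # (2, 3)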
def preProcessByIncrement(df, dates):
    """Parameters
       df: the dataframe that was read in
       dates: a 4-tuple whose last two entries are the test year and month (,,year,month)
    """
    """Note: the input file already carries column names"""
    """Handle NaN"""
    df.dropna(how='any', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """Digitise the reviewer names and keep the name-mapping dictionary for the return value"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])

    """Split off the tags first"""
    tagDict = dict(list(df.groupby('pr_number')))

    print("before drop:", df.shape)
    df = df.copy(deep=True)
    df.drop(columns=['review_user_login'], inplace=True)
    df.drop_duplicates(['pr_number'], inplace=True)
    print("after drop:", df.shape)

    test_data = df

    """Cast the problem as multi-label:
       test_data_y  {pull_number: [r1, r2, ...], ...}
    """
    test_data_y = {}
    for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    return test_data, test_data_y, convertDict
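# Illustration only (not part of the original module): the groupby/dict pattern
# above keeps one sub-frame per PR, so the reviewer lists survive even after
# the duplicate rows are dropped from the feature frame.
def _demoTagDict():
    import pandas
    df = pandas.DataFrame({'pr_number': [1, 1, 2],
                           'review_user_login': [10, 11, 10]})
    tagDict = dict(list(df.groupby('pr_number')))
    data_y = {pr: list(g.drop_duplicates(['review_user_login'])['review_user_login'])
              for pr, g in tagDict.items()}
    print(data_y)  # {1: [10, 11], 2: [10]}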
def RecommendBySVM(train_data, train_data_y, test_data, test_data_y, recommendNum=5, CoreType='rbf', C=1,
                   gamma='auto', decisionShip='ovo'):
    """SVM
       recommendNum : number of recommendations
       CoreType : 'linear' linear kernel, 'rbf' Gaussian kernel
       C : penalty coefficient
       gamma : kernel coefficient
       decisionShip : decision-function strategy
    """
    """Split the training set 70/30 into training and validation parts"""
    """Use a custom validation set instead of cross-validation"""
    """Whether cross-validation or a custom validation set is better still needs study (3.31)"""
    test_fold = numpy.zeros(train_data.shape[0])
    test_fold[:ceil(train_data.shape[0] * 0.7)] = -1
    ps = PredefinedSplit(test_fold=test_fold)

    grid_parameters = [
        {'kernel': ['rbf'], 'gamma': [0.0005, 0.00075, 0.0001],
         'C': [100, 105, 108, 110], 'decision_function_shape': ['ovr']}]
    # {'kernel': ['linear'], 'C': [90, 95, 100],
    #  'decision_function_shape': ['ovr', 'ovo'],
    #  'class_weight': ['balanced', None]}]  # parameters to tune

    from sklearn import svm
    from sklearn.model_selection import GridSearchCV

    clf = svm.SVC(C=C, kernel=CoreType, probability=True, gamma=gamma, decision_function_shape=decisionShip)
    """Some REVIEW features are time-dependent, so strictly speaking n-fold CV must not
       be used; a custom validation set is needed instead: GridSearchCV(cv=ps)"""
    # clf = GridSearchCV(clf, param_grid=grid_parameters, cv=ps)  # grid search
    clf.fit(X=train_data, y=train_data_y)
    # print(clf.best_params_)
    # clf = svm.SVC(C=100, kernel='linear', probability=True)
    # clf.fit(train_data, train_data_y)

    pre = clf.predict_proba(test_data)
    pre_class = clf.classes_
    # print(pre)
    # print(pre_class)

    """Inspect the learning curve of the algorithm"""
    MLGraphHelper.plot_learning_curve(clf, 'SVM', train_data, train_data_y).show()

    recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
    # print(recommendList.__len__())
    answer = [[x] for x in test_data_y]
    # print(answer.__len__())
    return [recommendList, answer]
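# Illustration only (not part of the original module): in PredefinedSplit a fold
# index of -1 keeps a sample permanently in the training part, so marking the
# first 70% with -1 yields one chronological train/validation split rather than
# a shuffled k-fold.
def _demoPredefinedSplit():
    import numpy
    from math import ceil
    from sklearn.model_selection import PredefinedSplit
    test_fold = numpy.zeros(10)
    test_fold[:ceil(10 * 0.7)] = -1
    ps = PredefinedSplit(test_fold=test_fold)
    for train_idx, val_idx in ps.split():
        print(train_idx, val_idx)  # [0 1 2 3 4 5 6] [7 8 9]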
def RecommendByKN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label classification: k-nearest neighbours"""
    clf = KNeighborsClassifier()
    clf.fit(train_data, train_data_y)
    predictions = clf.predict_proba(test_data)

    """Convert the per-label prediction result into a single data array"""
    predictions = DataProcessUtils.convertMultilabelProbaToDataArray(predictions)
    print(predictions)

    recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                         recommendNum)
    answerList = test_data_y
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
def algorithmBody(date, project, algorithmType, recommendNum=5, featureType=3, filter_train=False,
                  filter_test=False):
    df = None
    """Merge the required monthly files"""
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        if i < date[2] * 12 + date[3]:
            if filter_train:
                filename = projectConfig.getMLDataPath() + os.sep + \
                           f'ML_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getMLDataPath() + os.sep + \
                           f'ML_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        else:
            if filter_test:
                filename = projectConfig.getMLDataPath() + os.sep + \
                           f'ML_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
            else:
                filename = projectConfig.getMLDataPath() + os.sep + \
                           f'ML_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # merge
    df.reset_index(inplace=True, drop=True)

    """Preprocess df and get the list of test pull numbers"""
    train_data, train_data_y, test_data, test_data_y, convertDict, prList = MLTrain.preProcess(df, date, project,
                                                                                               featureType,
                                                                                               isNOR=True)
    print("train data:", train_data.shape)
    print("test data:", test_data.shape)

    recommendList, answerList = MultipleLabelAlgorithm. \
        RecommendByAlgorithm(train_data, train_data_y, test_data, test_data_y, algorithmType)
    trainSize = (train_data.shape[0], test_data.shape[0])

    """Save the recommendation result locally"""
    DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date))

    return recommendList, answerList, prList, convertDict, trainSize
def testEARECAlgorithm(project, dates, filter_train=False, filter_test=False, a=0.5):
    """Integrate the training data"""
    recommendNum = 5  # number of recommendations
    excelName = f'outputEAREC_{project}_{filter_train}_{filter_test}.xls'
    sheetName = 'result'

    """Accumulated statistics"""
    topks = []
    mrrs = []
    precisionks = []
    recallks = []
    fmeasureks = []

    """Initialise the excel file"""
    ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['training set', 'test set'])
    for date in dates:
        startTime = datetime.now()
        recommendList, answerList, prList, convertDict, trainSize = EARECTrain.algorithmBody(date, project,
                                                                                             recommendNum,
                                                                                             filter_train=filter_train,
                                                                                             filter_test=filter_test,
                                                                                             a=a)
        """Evaluate the recommendation list"""
        topk, mrr, precisionk, recallk, fmeasurek = \
            DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

        topks.append(topk)
        mrrs.append(mrr)
        precisionks.append(precisionk)
        recallks.append(recallk)
        fmeasureks.append(fmeasurek)

        """Write the result to excel"""
        DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek, date)

        """Row separator"""
        content = ['']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        content = ['training set', 'test set']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        print("cost time:", datetime.now() - startTime)

    """Compute the accumulated historical statistics"""
    DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks, fmeasureks)
def algorithmBody(date, project, recommendNum=5, alpha=0.98, K=20, c=1):
    """Given a single date tuple and a project name, return the recommendation list
       and the answer list. This interface can also be called by hybrid algorithms."""
    print(date)
    df = None
    for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # concatenate the monthly data slices
        y = int((i - i % 12) / 12)
        m = i % 12
        if m == 0:
            m = 12
            y = y - 1
        # print(y, m)
        filename = projectConfig.getHGDataPath() + os.sep + f'HG_ALL_{project}_data_{y}_{m}_to_{y}_{m}.tsv'
        """The data files carry their own header"""
        if df is None:
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
        else:
            temp = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITH_HEAD)
            df = df.append(temp)  # merge
    df.reset_index(inplace=True, drop=True)

    """Preprocess df; also returns the name-mapping dictionary"""
    train_data, train_data_y, test_data, test_data_y, convertDict = HGTrain.preProcess(df, date)

    prList = list(set(test_data['pr_number']))
    prList.sort()

    recommendList, answerList, authorList = HGTrain.RecommendByHG(train_data, train_data_y, test_data, test_data_y,
                                                                  date, project, convertDict,
                                                                  recommendNum=recommendNum,
                                                                  alpha=alpha, K=K, c=c, useLocalPrDis=False)

    """Save the recommendation result for statistics"""
    DataProcessUtils.saveRecommendList(prList, recommendList, answerList, convertDict, key=project + str(date),
                                       authorList=authorList)

    """Also return the train/test set sizes"""
    trainSize = (train_data.shape[0], test_data.shape[0])
    print(trainSize)
    return recommendList, answerList, prList, convertDict, trainSize
def preProcessBySlide(df, dates):
    """Parameters
       df: the dataframe that was read in
       dates: a 4-tuple whose last two entries are the test year and month (,,year,month)
    """
    """Note: the input file already carries column names"""
    """Handle NaN"""
    df.dropna(how='any', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """Add a label column that separates training and test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

    """Digitise the reviewer names and keep the name-mapping dictionary for the return value"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])

    """Convert the creation time to a timestamp"""
    df['test'] = df['pr_created_at']
    df['pr_created_at'] = df['pr_created_at'].apply(
        lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))

    """Split off the tags first"""
    tagDict = dict(list(df.groupby('pull_number')))

    print("before drop:", df.shape)
    df = df.copy(deep=True)
    df.drop(columns=['review_user_login', 'repo_full_name'], inplace=True)
    df.drop_duplicates(['pull_number', 'commit_sha', 'file_filename'], inplace=True)
    print("after drop:", df.shape)

    """Split the existing feature vectors and labels into training and test sets"""
    train_data = df.loc[df['label'] == False].copy(deep=True)
    test_data = df.loc[df['label']].copy(deep=True)

    train_data.drop(columns=['label'], inplace=True)
    test_data.drop(columns=['label'], inplace=True)

    """Cast the problem as multi-label:
       train_data_y  {pull_number: [r1, r2, ...], ...}
    """
    train_data_y = {}
    for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    test_data_y = {}
    for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    return train_data, train_data_y, test_data, test_data_y, convertDict
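# Illustration only (not part of the original module): the label column above
# marks a row as test data exactly when its creation date falls in the test
# year/month of the dates tuple; timestamps are then converted with mktime.
def _demoSlideLabel():
    import time
    dates = (2019, 1, 2019, 4)  # hypothetical span; last two entries: test year/month
    t = time.strptime("2019-04-02 10:30:00", "%Y-%m-%d %H:%M:%S")
    label = (t.tm_year == dates[2] and t.tm_mon == dates[3])
    days = time.mktime(t) / (24 * 3600)
    print(label, days)  # True, creation time expressed in days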
def preProcess(df, dates):
    """Parameters
       df: the dataframe that was read in
       dates: a 4-tuple whose last two entries are the test year and month (,,year,month)
    """
    """Note: the input file already carries column names"""
    """Reviews with an empty comment contain NaN but are still useful as answers,
       so NaN rows are dropped from the training set only"""
    # """Handle NaN"""
    # df.dropna(how='any', inplace=True)
    # df.reset_index(drop=True, inplace=True)
    # df.fillna(value='', inplace=True)

    """Add a label column that separates training and test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

    """Digitise the reviewer names and keep the name-mapping dictionary for the return value"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['pr_author', 'reviewer'])

    """Split off the tags first"""
    tagDict = dict(list(df.groupby('pull_number')))

    """Split the existing feature vectors and labels into training and test sets"""
    train_data = df.loc[df['label'] == False].copy(deep=True)
    test_data = df.loc[df['label']].copy(deep=True)

    train_data.drop(columns=['label'], inplace=True)
    test_data.drop(columns=['label'], inplace=True)

    """Handle NaN (training set only)"""
    train_data.dropna(how='any', inplace=True)
    train_data.reset_index(drop=True, inplace=True)
    train_data.fillna(value='', inplace=True)

    """Filter out comments made after the dataset's time range"""
    # end time: the latest pr creation time in the dataset
    pr_created_time_data = train_data['pr_created_at']
    end_time = max(pr_created_time_data.to_list())
    train_data = train_data[train_data['comment_at'] <= end_time]
    train_data.reset_index(drop=True, inplace=True)

    test_data_y = {}
    for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
        test_data_y[pull_number] = reviewers

    train_data_y = {}
    for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
        train_data_y[pull_number] = reviewers

    return train_data, train_data_y, test_data, test_data_y, convertDict
def RecommendByETS(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label classification: extra-trees with grid search"""
    clf = ExtraTreesClassifier(n_jobs=3, n_estimators=250)
    param_test2 = {'max_depth': range(10, 40, 10), 'min_samples_split': range(15, 30, 5)}
    clf = GridSearchCV(estimator=clf, param_grid=param_test2, iid=False, cv=10, n_jobs=2)
    clf.fit(train_data, train_data_y)
    predictions = clf.predict_proba(test_data)

    """Convert the per-label prediction result into a single data array"""
    predictions = DataProcessUtils.convertMultilabelProbaToDataArray(predictions)
    print(predictions)

    recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                         recommendNum)
    answerList = test_data_y
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
def RecommendByRandomForest(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Random forest
       n_estimators : maximum number of weak learners
       recommendNum : number of recommendations
       max_depth : maximum tree depth
       min_samples_split : minimum number of samples required to split an internal node
       min_samples_leaf : minimum number of samples at a leaf node
       class_weight : class weights
    """
    """Use a custom validation set instead of cross-validation"""
    test_fold = numpy.zeros(train_data.shape[0])
    test_fold[:ceil(train_data.shape[0] * 0.7)] = -1
    ps = PredefinedSplit(test_fold=test_fold)

    """Import the model"""
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    clf = RandomForestClassifier(min_samples_split=100, min_samples_leaf=20, max_depth=8,
                                 max_features='sqrt', random_state=10)
    # clf = GridSearchCV(clf, param_grid=grid_parameters, cv=ps, n_jobs=-1)
    # clf.fit(train_data, train_data_y)
    # print("OOB SCORE:", clf.oob_score_)

    """Tune the number of weak learners"""
    # param_test1 = {'n_estimators': range(10, 200, 10)}
    # clf = GridSearchCV(estimator=clf, param_grid=param_test1)
    # clf.fit(train_data, train_data_y)
    # print(clf.best_params_, clf.best_score_)

    """Tune the decision-tree parameters"""
    param_test2 = {'max_depth': range(3, 14, 2), 'min_samples_split': range(50, 201, 20)}
    clf = GridSearchCV(estimator=clf, param_grid=param_test2, iid=False, cv=5)
    clf.fit(train_data, train_data_y)
    # gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

    """Inspect the learning curve of the algorithm"""
    MLGraphHelper.plot_learning_curve(clf, 'RF', train_data, train_data_y).show()

    pre = clf.predict_proba(test_data)
    pre_class = clf.classes_
    # print(pre)
    # print(pre_class)

    recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
    # print(recommendList)
    answer = [[x] for x in test_data_y]
    # print(answer)
    return [recommendList, answer]
def testBayesAlgorithms(project, dates):
    """Given the test dates and matching file sequence, output the performance of the whole algorithm"""
    recommendNum = 5  # number of recommendations
    excelName = 'outputNB.xlsx'
    sheetName = 'result'

    """Initialise the excel file"""
    ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['training set', 'test set'])

    for i in range(1, 4):  # Bayes has three model variants
        for date in dates:
            filename = projectConfig.getRootPath() + r'\data\train' + r'\\' \
                       + f'ML_{project}_data_{date[0]}_{date[1]}_to_{date[2]}_{date[3]}.tsv'
            df = pandasHelper.readTSVFile(filename, pandasHelper.INT_READ_FILE_WITHOUT_HEAD)

            """Preprocess df"""
            isNOR = True
            if i == 1 or i == 3:
                isNOR = False  # no normalisation for Bernoulli and Multinomial
            train_data, train_data_y, test_data, test_data_y = MLTrain.preProcessForSingleLabel(df, date, project,
                                                                                                isNOR=isNOR)

            """Get the recommendation list"""
            recommendList, answerList = MLTrain.RecommendByNativeBayes(train_data, train_data_y, test_data,
                                                                       test_data_y, recommendNum, i)

            """Evaluate the recommendation list"""
            topk, mrr = DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

            """Write the result to excel"""
            DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, date)

            """Row separator"""
            content = ['']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            content = ['training set', 'test set']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
def RecommendBySVM(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """SVM, one-vs-rest"""
    classifier = SVC(kernel='linear', probability=True, class_weight='balanced', C=70)
    clf = OneVsRestClassifier(classifier)
    clf.fit(train_data, train_data_y)
    predictions = clf.predict_proba(test_data)

    recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                         recommendNum)
    answerList = test_data_y
    # print(predictions)
    # print(test_data_y)
    # print(recommendList)
    # print(answerList)
    return [recommendList, answerList]
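# Illustration only (not part of the original module): unlike the raw tree
# classifiers above, OneVsRestClassifier accepts the indicator matrix directly
# and already returns predict_proba as one (n_samples, n_labels) array, so no
# conversion helper is needed. Sketch with a lighter base estimator:
def _demoOneVsRest():
    import numpy as np
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)
    X = rng.rand(30, 4)
    y = (rng.rand(30, 3) > 0.5).astype(int)
    y[0] = 0
    y[1] = 1  # make sure both classes appear for every label
    clf = OneVsRestClassifier(LogisticRegression()).fit(X, y)
    print(clf.predict_proba(X[:2]).shape)  # (2, 3)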
def contactCAData(projectName):
    """Assemble the dataset used by FPS from the SEAA data"""
    filename = os.path.join(projectConfig.getSEAADataPath(), f'{projectName}.json')
    print(filename)
    file = open(filename, 'rb')
    dataJson = json.load(file)

    """Walk the dataset and build the rows as dictionaries"""
    dictList = []
    for data in dataJson:
        new_row = {'repo_full_name': projectName + '/' + projectName,
                   'pull_number': data['changeNumber'],
                   'pr_created_at': convertTimeStampToTime(data['timestamp']),
                   'commit_sha': None}
        for reviewer in data['reviewers']:
            reviewer = reviewer['name']
            for path in data['filePaths']:
                path = path['location']
                new_row_c = new_row.copy()
                new_row_c['review_user_login'] = reviewer
                new_row_c['file_filename'] = path
                dictList.append(new_row_c)

    df = pandas.DataFrame(dictList, columns=SEAADataLoader.SEAA_DATAFRAME_COL)
    print(df.shape)

    """Split the data into monthly slices"""
    DataProcessUtils.splitDataByMonth(filename=None, targetPath=projectConfig.getCADataPath(),
                                      targetFileName=f'CA_SEAA_{projectName}_data', dateCol='pr_created_at',
                                      dataFrame=df)
def RecommendByClassifierChain(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Classifier chain"""
    classifier = ClassifierChain(RandomForestClassifier(oob_score=True, max_depth=10, min_samples_split=20))
    classifier.fit(train_data, train_data_y)
    predictions = classifier.predict_proba(test_data)
    predictions = predictions.todense().getA()

    recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                         recommendNum)
    answerList = test_data_y
    print(predictions)
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
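# Illustration only (not part of the original module): skmultilearn models
# return scipy sparse matrices, and the .todense().getA() call above unwraps
# the numpy matrix into an ndarray, equivalent to calling .toarray() directly.
def _demoSparseToArray():
    import numpy
    from scipy.sparse import csr_matrix
    m = csr_matrix([[0.1, 0.9], [0.7, 0.3]])
    a1 = m.todense().getA()
    a2 = m.toarray()
    print(numpy.array_equal(a1, a2), type(a1))  # True <class 'numpy.ndarray'>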
def RecommendByMLKNN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """ML-kNN"""
    classifier = MLkNN(k=train_data_y.shape[1])
    classifier.fit(train_data, train_data_y)
    predictions = classifier.predict_proba(test_data).todense()

    """Convert the prediction result to a data array"""
    predictions = numpy.asarray(predictions)

    recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                         recommendNum)
    answerList = test_data_y
    print(predictions)
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]
def RecommendByDecisionTree(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Decision tree
       recommendNum : number of recommendations
       max_depth : maximum tree depth
       min_samples_split : minimum number of samples required to split an internal node
       min_samples_leaf : minimum number of samples at a leaf node
       class_weight : class weights
    """
    """Split the training set 70/30 into training and validation parts"""
    """Use a custom validation set instead of cross-validation"""
    test_fold = numpy.zeros(train_data.shape[0])
    test_fold[:ceil(train_data.shape[0] * 0.7)] = -1
    ps = PredefinedSplit(test_fold=test_fold)

    grid_parameters = [{'min_samples_leaf': [2, 4, 8, 16, 32, 64],
                        'max_depth': [2, 4, 6, 8],
                        'class_weight': [None]}]  # parameters to tune
    # scores = ['precision', 'recall']  # selection criteria

    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import GridSearchCV

    clf = DecisionTreeClassifier()
    clf = GridSearchCV(clf, param_grid=grid_parameters, cv=ps, n_jobs=-1)
    clf.fit(train_data, train_data_y)
    print(clf.best_params_)

    # dot_data = export_graphviz(clf, out_file=None)
    # graph = graphviz.Source(dot_data)
    # graph.render("DTree")

    pre = clf.predict_proba(test_data)
    pre_class = clf.classes_
    # print(pre)
    # print(pre_class)

    recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
    # print(recommendList)
    answer = [[x] for x in test_data_y]
    # print(answer)
    return [recommendList, answer]
def preProcessByIncrement(df, dates):
    """Parameters
       df: the dataframe that was read in
       dates: a 4-tuple; the whole time span is treated as the test set, so it has no effect here
    """
    """Note: the input file already carries column names"""
    """Handle NaN"""
    df.dropna(how='any', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """Digitise the reviewer names and keep the name-mapping dictionary for the return value"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])

    """Split off the tags first"""
    tagDict = dict(list(df.groupby('pull_number')))

    """Convert the creation time to a timestamp"""
    df['pr_created_at'] = df['pr_created_at'].apply(
        lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))

    print("before drop:", df.shape)
    df = df.copy(deep=True)
    df.drop(columns=['review_user_login', 'repo_full_name'], inplace=True)
    df.drop_duplicates(['pull_number', 'commit_sha', 'file_filename'], inplace=True)
    print("after drop:", df.shape)

    test_data = df

    """Cast the problem as multi-label:
       test_data_y  {pull_number: [r1, r2, ...], ...}
    """
    test_data_y = {}
    for pull_number in df.drop_duplicates(['pull_number'])['pull_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    return test_data, test_data_y, convertDict
def preProcess(df, dates):
    """Parameters
       df: the dataframe that was read in
       dates: a 4-tuple whose last two entries are the test year and month (,,year,month)
    """
    """Note: the input file already carries column names"""
    """Reviews with an empty comment contain NaN but are still useful as answers,
       so NaN rows are dropped from the training set only"""
    # """Handle NaN"""
    # df.dropna(how='any', inplace=True)
    # df.reset_index(drop=True, inplace=True)
    df['pr_title'].fillna(value='', inplace=True)
    df['pr_body'].fillna(value='', inplace=True)

    """Add a label column that separates training and test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

    """Digitise the names and keep the name-mapping dictionary for the return value"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['pr_author', 'reviewer'])

    """Collect the tokenised text of every document"""
    stopwords = SplitWordHelper().getEnglishStopList()  # common English stop words

    """Caveat: the LSI step must not run on the whole dataset, otherwise PR texts would be duplicated"""
    df_pr = df.copy(deep=True)
    df_pr.drop_duplicates(subset=['pull_number'], keep='first', inplace=True)
    df_pr.reset_index(drop=True, inplace=True)

    # record the word count per pr; training prs with fewer than 10 words are dropped
    df_pr_word_count = []
    textList = []
    for row in df_pr.itertuples(index=False, name='Pandas'):
        tempList = []
        """Pull request title"""
        pr_title = getattr(row, 'pr_title')
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]
        """A first attempt at stemming actually lowered the results..."""
        """Stem the words"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)

        """Pull request body"""
        pr_body = getattr(row, 'pr_body')
        pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
        """Stem the words"""
        pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
        tempList.extend(pr_body_word_list)

        if tempList.__len__() >= 10 or getattr(row, 'label'):
            textList.append(tempList)
        if getattr(row, 'label'):
            df_pr_word_count.append(10)  # so that test prs pass the filter below
        else:
            df_pr_word_count.append(tempList.__len__())

    """Drop the useless training prs"""
    df_pr['count'] = df_pr_word_count
    df_pr = df_pr.loc[df_pr['count'] >= 10].copy(deep=True)
    df_pr.reset_index(drop=True, inplace=True)
    df_pr.drop(['count'], inplace=True, axis=1)

    """Keep the list of remaining prs"""
    prList = list(df_pr['pull_number'])

    """Split the text feature vectors and labels into training and test sets"""
    trainData_index = df_pr.loc[df_pr['label'] == False].index
    testData_index = df_pr.loc[df_pr['label'] == True].index
    trainDataTextList = [textList[x] for x in trainData_index]
    testDataTextList = [textList[x] for x in testData_index]
    print(textList.__len__())

    """Build a dictionary from the tokenised texts and count the features"""
    dictionary = corpora.Dictionary(trainDataTextList)
    print('dictionary:', dictionary)

    """The TF-IDF model should be computed on the training set only; the test-set
       vectors are simply the result of applying that model"""
    """Build the corpus from the dictionary"""
    corpus = [dictionary.doc2bow(text) for text in trainDataTextList]
    # print('corpus:', corpus)

    """Train the TF-IDF model on the corpus"""
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    topic_num = 10
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topic_num)
    topic_list = lsi.print_topics()
    print("word distribution of the {0} topics:\n".format(topic_num))
    for topic in topic_list:
        print(topic)

    """Walk the data again to build the vectors (in sparse-dict form)"""
    """(assumes training prs precede test prs in df_pr, so these indices line up)"""
    wordVectors = []
    for i in range(0, trainDataTextList.__len__()):
        wordVectors.append(dict(lsi[dictionary.doc2bow(trainDataTextList[i])]))
    for i in range(0, testDataTextList.__len__()):
        wordVectors.append(dict(lsi[dictionary.doc2bow(testDataTextList[i])]))

    """Training set"""
    train_data = [wordVectors[x] for x in trainData_index]
    """Test set"""
    test_data = [wordVectors[x] for x in testData_index]

    """Pad into dense vectors"""
    train_v_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=topic_num)
    test_v_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=topic_num)

    lsi_data = pandas.concat([train_v_data, test_v_data], axis=0)  # merge along axis 0
    lsi_data['pull_number'] = prList
    lsi_data.reset_index(inplace=True, drop=True)

    train_data = df.loc[df['label'] == False]
    train_data.reset_index(drop=True, inplace=True)
    test_data = df.loc[df['label'] == True]
    test_data.reset_index(drop=True, inplace=True)

    train_data = train_data.merge(lsi_data, on="pull_number")
    train_data.drop(columns=['label'], inplace=True)
    test_data = test_data.merge(lsi_data, on="pull_number")
    test_data.drop(columns=['label'], inplace=True)

    """Handle NaN (training set only)"""
    train_data.dropna(how='any', inplace=True)
    train_data.reset_index(drop=True, inplace=True)
    train_data.fillna(value='', inplace=True)

    """Split off the tags first"""
    trainDict = dict(list(train_data.groupby('pull_number')))
    testDict = dict(list(test_data.groupby('pull_number')))

    test_data_y = {}
    for pull_number in test_data.drop_duplicates(['pull_number'])['pull_number']:
        reviewers = list(testDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
        test_data_y[pull_number] = reviewers

    train_data_y = {}
    for pull_number in train_data.drop_duplicates(['pull_number'])['pull_number']:
        reviewers = list(trainDict[pull_number].drop_duplicates(['reviewer'])['reviewer'])
        train_data_y[pull_number] = reviewers

    return train_data, train_data_y, test_data, test_data_y, convertDict
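# Illustration only (not part of the original module): the LSI pipeline above
# fits the dictionary, TF-IDF and LSI model on training texts only, then folds
# unseen test documents into the same topic space via doc2bow. Minimal sketch:
def _demoLsiFoldIn():
    from gensim import corpora, models
    train_texts = [['fix', 'bug', 'parser'], ['add', 'parser', 'test'], ['update', 'doc']]
    dictionary = corpora.Dictionary(train_texts)
    corpus = [dictionary.doc2bow(t) for t in train_texts]
    tfidf = models.TfidfModel(corpus)
    lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)
    unseen = ['fix', 'parser', 'crash']          # 'crash' is out-of-vocabulary and ignored
    vec = dict(lsi[dictionary.doc2bow(unseen)])  # sparse {topic_id: weight} dict
    print(vec)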
def testAlgorithm(project, dates, filter_train=False, filter_test=False, error_analysis=False,
                  test_type=StringKeyUtils.STR_TEST_TYPE_SLIDE):
    """Several cases; each tuple covers the whole time span and the last month is used for testing.
       algorithm : information-retrieval based
    """
    recommendNum = 5  # number of recommendations
    excelName = f'outputIR_AC_{project}_{filter_train}_{filter_test}_{error_analysis}.xlsx'
    sheetName = 'result'

    """Accumulated statistics"""
    topks = []
    mrrs = []
    precisionks = []
    recallks = []
    fmeasureks = []
    recommend_positive_success_pr_ratios = []    # proportion of PRs with a successfully recommended reviewer
    recommend_positive_success_time_ratios = []  # proportion, over PR x reviewer slots, of successful recommendations
    recommend_negative_success_pr_ratios = []    # proportion of PRs where a recommended reviewer hit but was filtered out
    recommend_negative_success_time_ratios = []  # proportion, over PR x reviewer slots, of hits that were filtered out
    recommend_positive_fail_pr_ratios = []       # proportion of PRs with a wrongly recommended reviewer
    recommend_positive_fail_time_ratios = []     # proportion, over PR x reviewer slots, of wrong recommendations
    recommend_negative_fail_pr_ratios = []       # proportion of PRs where correctness of the recommendation is unknown
    recommend_negative_fail_time_ratios = []     # proportion, over PR x reviewer slots, of unknown-correctness cases
    error_analysis_datas = None

    """Initialise the excel file"""
    ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName, excel_key_list=['training set', 'test set'])
    for date in dates:
        startTime = datetime.now()
        recommendList, answerList, prList, convertDict, trainSize = IR_ACTrain.algorithmBody(date, project,
                                                                                             recommendNum,
                                                                                             filter_train=filter_train,
                                                                                             filter_test=filter_test,
                                                                                             test_type=test_type)
        """Evaluate the recommendation list"""
        topk, mrr, precisionk, recallk, fmeasurek = \
            DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

        topks.append(topk)
        mrrs.append(mrr)
        precisionks.append(precisionk)
        recallks.append(recallk)
        fmeasureks.append(fmeasurek)

        error_analysis_data = None
        filter_answer_list = None
        if error_analysis:
            if test_type == StringKeyUtils.STR_TEST_TYPE_SLIDE:
                y = date[2]
                m = date[3]
                filename = projectConfig.getIR_ACDataPath() + os.sep + \
                           f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerData(project, date, prList,
                                                                                         convertDict, filename,
                                                                                         'review_user_login',
                                                                                         'pr_number')
            elif test_type == StringKeyUtils.STR_TEST_TYPE_INCREMENT:
                fileList = []
                for i in range(date[0] * 12 + date[1], date[2] * 12 + date[3] + 1):  # concatenate the monthly data slices
                    y = int((i - i % 12) / 12)
                    m = i % 12
                    if m == 0:
                        m = 12
                        y = y - 1
                    fileList.append(projectConfig.getIR_ACDataPath() + os.sep +
                                    f'IR_AC_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv')
                filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerDataByIncrement(project, prList,
                                                                                                    convertDict,
                                                                                                    fileList,
                                                                                                    'review_user_login',
                                                                                                    'pr_number')

            # (the eight-value *_time_ratio variant of errorAnalysis was here; only the
            #  four pr-ratio values are used now)
            recommend_positive_success_pr_ratio, recommend_negative_success_pr_ratio, recommend_positive_fail_pr_ratio, \
            recommend_negative_fail_pr_ratio = DataProcessUtils.errorAnalysis(
                recommendList, answerList, filter_answer_list, recommendNum)
            error_analysis_data = [recommend_positive_success_pr_ratio, recommend_negative_success_pr_ratio,
                                   recommend_positive_fail_pr_ratio, recommend_negative_fail_pr_ratio]

            recommend_positive_success_pr_ratios.append(recommend_positive_success_pr_ratio)
            recommend_negative_success_pr_ratios.append(recommend_negative_success_pr_ratio)
            recommend_positive_fail_pr_ratios.append(recommend_positive_fail_pr_ratio)
            recommend_negative_fail_pr_ratios.append(recommend_negative_fail_pr_ratio)

        if error_analysis_data:
            error_analysis_datas = [recommend_positive_success_pr_ratios, recommend_negative_success_pr_ratios,
                                    recommend_positive_fail_pr_ratios, recommend_negative_fail_pr_ratios]

        """Write the result to excel"""
        DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek, date)

        """Row separator"""
        content = ['']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        content = ['training set', 'test set']
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        print("cost time:", datetime.now() - startTime)

    """Visualise the recommendation errors"""
    DataProcessUtils.recommendErrorAnalyzer2(error_analysis_datas, project,
                                             f'IR_AC_{test_type}_{filter_train}_{filter_test}')

    """Compute the accumulated historical statistics"""
    DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks, fmeasureks,
                                       error_analysis_datas)
def preProcess(df, dates):
    """Parameters
       df: the dataframe that was read in
       dates: a 4-tuple giving the test year and month
    """
    """Note: the input file already carries column names"""
    """Covers both issue comments and review comments"""
    """Handle NaN"""
    # df.dropna(how='any', inplace=True)
    # df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """Add a label column that separates training and test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

    """Digitise the names"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'author_user_login'])

    df['pr_created_at'] = df['pr_created_at'].apply(lambda x: time.strptime(x, "%Y-%m-%d %H:%M:%S"))

    """Add a marker for the concrete day, derived from the creation time"""
    df['day'] = df['pr_created_at'].apply(lambda x: 10000 * x.tm_year + 100 * x.tm_mon + x.tm_mday)  # e.g. 20200821

    """Split off the tags first"""
    temp_df = df.copy(deep=True)
    temp_df.drop(columns=['filename'], inplace=True)
    temp_df.drop_duplicates(inplace=True)
    tagDict = dict(list(temp_df.groupby('pr_number')))

    """First try lumping all the information together"""
    df = df[['pr_number', 'filename', 'label']].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    """Split the existing feature vectors and labels into training and test sets"""
    train_data = df.loc[df['label'] == False].copy(deep=True)
    test_data = df.loc[df['label']].copy(deep=True)

    train_data.drop(columns=['label'], inplace=True)
    test_data.drop(columns=['label'], inplace=True)

    """Cast the problem as multi-label:
       train_data_y  {pull_number: [(r1, d1), (r2, d2), ...], ...}
    """
    """The training labels store the author; the test labels store the reviewers"""
    train_data_y = {}
    for pull_number in df.loc[df['label'] == False]['pr_number']:
        tempDf = tagDict[pull_number]
        author = []
        for row in tempDf.itertuples(index=False, name='Pandas'):
            a = getattr(row, 'author_user_login')
            day = getattr(row, 'day')
            author.append((a, None, day))
            break
        train_data_y[pull_number] = author

    test_data_y = {}
    for pull_number in df.loc[df['label'] == True]['pr_number']:
        tempDf = tagDict[pull_number]
        reviewers = []
        for row in tempDf.itertuples(index=False, name='Pandas'):
            r = getattr(row, 'review_user_login')
            comment_node_id = getattr(row, 'comment_node_id')
            day = getattr(row, 'day')
            reviewers.append((r, comment_node_id, day))
        test_data_y[pull_number] = reviewers

    """The last column of train_data and test_data is the pr number; test_data_y is a dict"""
    return train_data, train_data_y, test_data, test_data_y, convertDict
def preProcess(df, dates):
    """Parameters
       df: the dataframe that was read in
       dates: a 4-tuple whose last two entries are the test year and month (,,year,month)
    """
    """Note: the input file already carries column names"""
    t1 = datetime.now()

    """Handle NaN"""
    df.dropna(how='any', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """Add a label column that separates training and test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

    """Digitise the reviewer names and keep the name-mapping dictionary for the return value"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])

    """Split off the tags first"""
    tagDict = dict(list(df.groupby('pr_number')))

    """First try lumping all the information together"""
    df = df[['pr_number', 'pr_title', 'pr_body', 'label']].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    """Collect the tokenised text of every document"""
    stopwords = SplitWordHelper().getEnglishStopList()  # common English stop words

    """Because of the algorithm's particularities, the PB training set is not a
       dataframe but a dict {p1: set1, p2: set2, ...}"""
    train_data = {}
    test_data = {}
    for row in df.itertuples(index=False, name='Pandas'):
        tempList = []
        """Pull request number"""
        pr_num = getattr(row, 'pr_number')
        label = getattr(row, 'label')

        """Pull request title"""
        pr_title = getattr(row, 'pr_title')
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]
        """A first attempt at stemming actually lowered the results..."""
        """Stem the words"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)

        """Pull request body"""
        pr_body = getattr(row, 'pr_body')
        pr_body_word_list = [x for x in FleshReadableUtils.word_list(pr_body) if x not in stopwords]
        """Stem the words"""
        pr_body_word_list = nltkFunction.stemList(pr_body_word_list)
        tempList.extend(pr_body_word_list)

        wordSet = MultisetHelper.WordMultiset()
        wordSet.add(tempList)

        if label == 0:
            train_data[pr_num] = wordSet
        else:
            test_data[pr_num] = wordSet

    print("train size:", train_data.items().__len__())
    print("test size:", test_data.items().__len__())

    """Cast the problem as multi-label:
       train_data_y  {pull_number: [(r1, s1), (r2, s2), ...], ...}
       where r is the reviewer and s is the word multiset of that reviewer's comments
    """
    train_data_y = {}
    for pull_number in df.loc[df['label'] == False]['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        tempDf = tagDict[pull_number][['review_user_login', 'comment_body']].copy(deep=True)
        commentDict = dict(list(tempDf.groupby('review_user_login')))
        reviewerList = []
        for reviewer in reviewers:
            commentDf = commentDict[reviewer]
            wordSet = MultisetHelper.WordMultiset()
            for row in commentDf.itertuples(index=False, name='Pandas'):
                comment = getattr(row, 'comment_body')
                comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment) if x not in stopwords]
                """Stem the words"""
                comment_body_word_list = nltkFunction.stemList(comment_body_word_list)
                wordSet.add(comment_body_word_list)
            reviewerList.append((reviewer, wordSet))
        train_data_y[pull_number] = reviewerList

    test_data_y = {}
    for pull_number in df.loc[df['label'] == True]['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        tempDf = tagDict[pull_number][['review_user_login', 'comment_body']].copy(deep=True)
        commentDict = dict(list(tempDf.groupby('review_user_login')))
        reviewerList = []
        for reviewer in reviewers:
            commentDf = commentDict[reviewer]
            wordSet = MultisetHelper.WordMultiset()
            for row in commentDf.itertuples(index=False, name='Pandas'):
                comment = getattr(row, 'comment_body')
                comment_body_word_list = [x for x in FleshReadableUtils.word_list(comment) if x not in stopwords]
                """Stem the words"""
                comment_body_word_list = nltkFunction.stemList(comment_body_word_list)
                wordSet.add(comment_body_word_list)
            reviewerList.append((reviewer, wordSet))
        test_data_y[pull_number] = reviewerList

    print("preprocess cost time:", datetime.now() - t1)
    return train_data, train_data_y, test_data, test_data_y, convertDict
def testRF_AAlgorithms(projects, dates, filter_train=False, filter_test=False, error_analysis=True):
    """The RF algorithm gets its own file because its features and inputs are not compatible with ML"""
    startTime = datetime.now()

    for project in projects:
        excelName = f'outputRF_A_{project}_{filter_train}_{filter_test}_{error_analysis}.xlsx'
        recommendNum = 5  # number of recommendations
        sheetName = 'result'

        """Initialise the excel file"""
        ExcelHelper().initExcelFile(fileName=excelName, sheetName=sheetName,
                                    excel_key_list=['training set', 'test set'])
        """Project header"""
        content = ["project name:", project]
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
        ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())

        """Accumulated statistics"""
        topks = []
        mrrs = []
        precisionks = []
        recallks = []
        fmeasureks = []
        recommend_positive_success_pr_ratios = []    # proportion of PRs with a successfully recommended reviewer
        recommend_positive_success_time_ratios = []  # proportion, over PR x reviewer slots, of successful recommendations
        recommend_negative_success_pr_ratios = []    # proportion of PRs where a recommended reviewer hit but was filtered out
        recommend_negative_success_time_ratios = []  # proportion, over PR x reviewer slots, of hits that were filtered out
        recommend_positive_fail_pr_ratios = []       # proportion of PRs with a wrongly recommended reviewer
        recommend_positive_fail_time_ratios = []     # proportion, over PR x reviewer slots, of wrong recommendations
        recommend_negative_fail_pr_ratios = []       # proportion of PRs where correctness of the recommendation is unknown
        recommend_negative_fail_time_ratios = []     # proportion, over PR x reviewer slots, of unknown-correctness cases
        error_analysis_datas = None

        for date in dates:
            recommendList, answerList, prList, convertDict, trainSize = RF_ATrain.algorithmBody(date, project,
                                                                                                recommendNum,
                                                                                                filter_train=filter_train,
                                                                                                filter_test=filter_test)
            """Evaluate the recommendation list"""
            topk, mrr, precisionk, recallk, fmeasurek = \
                DataProcessUtils.judgeRecommend(recommendList, answerList, recommendNum)

            topks.append(topk)
            mrrs.append(mrr)
            precisionks.append(precisionk)
            recallks.append(recallk)
            fmeasureks.append(fmeasurek)

            error_analysis_data = None
            if error_analysis:
                y = date[2]
                m = date[3]
                filename = projectConfig.getRF_ADataPath() + os.sep + \
                           f'RF_A_ALL_{project}_data_change_trigger_{y}_{m}_to_{y}_{m}.tsv'
                filter_answer_list = DataProcessUtils.getAnswerListFromChangeTriggerData(project, date, prList,
                                                                                         convertDict, filename,
                                                                                         'review_user_login',
                                                                                         'pr_number')
                # (the eight-value *_time_ratio variant of errorAnalysis was here; only the
                #  four pr-ratio values are used now)
                recommend_positive_success_pr_ratio, recommend_negative_success_pr_ratio, recommend_positive_fail_pr_ratio, \
                recommend_negative_fail_pr_ratio = DataProcessUtils.errorAnalysis(
                    recommendList, answerList, filter_answer_list, recommendNum)
                error_analysis_data = [recommend_positive_success_pr_ratio, recommend_negative_success_pr_ratio,
                                       recommend_positive_fail_pr_ratio, recommend_negative_fail_pr_ratio]

                recommend_positive_success_pr_ratios.append(recommend_positive_success_pr_ratio)
                recommend_negative_success_pr_ratios.append(recommend_negative_success_pr_ratio)
                recommend_positive_fail_pr_ratios.append(recommend_positive_fail_pr_ratio)
                recommend_negative_fail_pr_ratios.append(recommend_negative_fail_pr_ratio)

            if error_analysis_data:
                error_analysis_datas = [recommend_positive_success_pr_ratios, recommend_negative_success_pr_ratios,
                                        recommend_positive_fail_pr_ratios, recommend_negative_fail_pr_ratios]

            """Write the result to excel"""
            DataProcessUtils.saveResult(excelName, sheetName, topk, mrr, precisionk, recallk, fmeasurek, date,
                                        error_analysis_data)

            """Row separator"""
            content = ['']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            content = ['training set', 'test set']
            ExcelHelper().appendExcelRow(excelName, sheetName, content, style=ExcelHelper.getNormalStyle())
            print("cost time:", datetime.now() - startTime)

        """Visualise the recommendation errors"""
        DataProcessUtils.recommendErrorAnalyzer2(error_analysis_datas, project, f'RF_{filter_train}_{filter_test}')

        """Compute the accumulated historical statistics"""
        DataProcessUtils.saveFinallyResult(excelName, sheetName, topks, mrrs, precisionks, recallks, fmeasureks,
                                           error_analysis_datas)
def preProcessBySlide(df, dates):
    """Parameters
       df: the dataframe that was read in
       dates: a 4-tuple giving the test year and month
    """
    """Note: the input file already carries column names"""
    """Handle NaN"""
    df.dropna(how='any', inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.fillna(value='', inplace=True)

    """Add a label column that separates training and test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == dates[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == dates[3]))

    """Convert the creation time to a timestamp, measured in days"""
    df['pr_created_at'] = df['pr_created_at'].apply(lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S")))
    df['pr_created_at'] = df['pr_created_at'] / (24 * 3600)

    """Trim the input data down to the columns of interest"""
    df = df[['pr_number', 'pr_title', 'review_user_login', 'label', 'pr_created_at']].copy(deep=True)
    print("before filter:", df.shape)
    df.drop_duplicates(inplace=True)
    print("after filter:", df.shape)

    """Digitise the names"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login'])

    """Split off the tags first"""
    tagDict = dict(list(df.groupby('pr_number')))

    """First try lumping all the information together"""
    df = df[['pr_number', 'pr_title', 'label', 'pr_created_at']].copy(deep=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)

    """Collect the tokenised text of every document"""
    stopwords = SplitWordHelper().getEnglishStopList()  # common English stop words
    textList = []
    for row in df.itertuples(index=False, name='Pandas'):
        tempList = []
        """Pull request title"""
        pr_title = getattr(row, 'pr_title')
        pr_title_word_list = [x for x in FleshReadableUtils.word_list(pr_title) if x not in stopwords]
        """A first attempt at stemming actually lowered the results..."""
        """Stem the words"""
        pr_title_word_list = nltkFunction.stemList(pr_title_word_list)
        tempList.extend(pr_title_word_list)
        textList.append(tempList)

    print(textList.__len__())

    """Build a dictionary from the tokenised texts and count the features"""
    dictionary = corpora.Dictionary(textList)
    print('dictionary:', dictionary)
    feature_cnt = len(dictionary.token2id)
    print("dictionary feature count:", feature_cnt)

    """Build the corpus from the dictionary"""
    corpus = [dictionary.doc2bow(text) for text in textList]
    # print('corpus:', corpus)

    """Train the TF-IDF model on the corpus"""
    tfidf = models.TfidfModel(corpus)

    """Walk the data again to build the vectors (in sparse-dict form)"""
    wordVectors = []
    for i in range(0, df.shape[0]):
        wordVectors.append(dict(tfidf[dictionary.doc2bow(textList[i])]))

    """Split the text feature vectors and labels into training and test sets"""
    trainData_index = df.loc[df['label'] == False].index
    testData_index = df.loc[df['label'] == True].index

    """Training set"""
    train_data = [wordVectors[x] for x in trainData_index]
    """Test set"""
    test_data = [wordVectors[x] for x in testData_index]

    """Pad into dense vectors"""
    train_data = DataProcessUtils.convertFeatureDictToDataFrame(train_data, featureNum=feature_cnt)
    test_data = DataProcessUtils.convertFeatureDictToDataFrame(test_data, featureNum=feature_cnt)
    train_data['pr_number'] = list(df.loc[df['label'] == False]['pr_number'])
    test_data['pr_number'] = list(df.loc[df['label'] == True]['pr_number'])
    train_data['pr_created_at'] = list(df.loc[df['label'] == False]['pr_created_at'])
    test_data['pr_created_at'] = list(df.loc[df['label'] == True]['pr_created_at'])

    """Cast the problem as multi-label:
       train_data_y  {pull_number: [r1, r2, ...], ...}
    """
    train_data_y = {}
    for pull_number in df.loc[df['label'] == False]['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    test_data_y = {}
    for pull_number in df.loc[df['label'] == True]['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    """The last column of train_data and test_data is the pr number; test_data_y is a dict"""
    return train_data, train_data_y, test_data, test_data_y, convertDict
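# Illustration only (not part of the original module; the padding step only
# mirrors what convertFeatureDictToDataFrame presumably does): TfidfModel turns
# each bag-of-words into a sparse {token_id: weight} dict, and padding those
# dicts to the dictionary's feature count gives fixed-width dense rows.
def _demoTfidfVectors():
    from gensim import corpora, models
    texts = [['fix', 'bug'], ['add', 'test', 'bug'], ['update', 'doc']]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    tfidf = models.TfidfModel(corpus)
    vectors = [dict(tfidf[bow]) for bow in corpus]  # sparse {token_id: weight} dicts
    feature_cnt = len(dictionary.token2id)
    dense = [[v.get(i, 0.0) for i in range(feature_cnt)] for v in vectors]
    print(vectors[0], dense[0])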
def preProcess(df, date, project, isSTD=False, isNOR=False):
    """Parameters
       df: the dataframe that was read in
       date: a 4-tuple whose last two entries are the test year and month
       project: project name
       isSTD: whether to standardise the data
       isNOR: whether to normalise the data
    """
    print("start df shape:", df.shape)
    """Filter NaN rows"""
    df.dropna(axis=0, how='any', inplace=True)
    print("after filter na:", df.shape)

    """Add a label column that separates training and test rows"""
    df['label'] = df['pr_created_at'].apply(
        lambda x: (time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_year == date[2] and
                   time.strptime(x, "%Y-%m-%d %H:%M:%S").tm_mon == date[3]))
    df.reset_index(drop=True, inplace=True)

    """Digitise the names"""
    """Reviewers with too low a frequency were already filtered out before numbering,
       so the class labels are contiguous"""
    """review_user_login must come first here, otherwise the candicateNum variable
       referenced below would be affected"""
    convertDict = DataProcessUtils.changeStringToNumber(df, ['review_user_login', 'author_user_login'])
    recoverDict = {v: k for k, v in convertDict.items()}
    print(df.shape)
    candicateNum = max(df.loc[df['label'] == 0]['review_user_login'])
    print("candicate Num:", candicateNum)

    """Trim the input data down to the columns of interest"""
    df = df[['pr_number', 'review_user_login', 'author_user_login', 'author_association', 'commits',
             'deletions', 'additions', 'changed_files', 'label', 'merged']].copy(deep=True)
    print("before filter:", df.shape)
    df.drop_duplicates(inplace=True)
    print("after filter:", df.shape)

    """Compute the author's relation to the project"""
    df['author_association'] = df['author_association'].apply(lambda x: x == 'MEMBER')

    """Compute the accumulated historical statistics"""
    request_number_prs = []           # number of prs the author issued before
    request_number_merged_prs = []    # number of the author's prs that were accepted
    request_number_rejected_prs = []  # number of the author's prs that were rejected
    request_accept_rate = []          # acceptance rate of the author's prs
    request_reject_rate = []          # rejection rate of the author's prs
    for row in df.itertuples():
        pr_num = getattr(row, 'pr_number')
        author = getattr(row, 'author_user_login')
        """Filter the historical prs"""
        temp_df = df.loc[(df['pr_number'] < pr_num) & (df['author_user_login'] == author)]
        request_number_prs.append(temp_df.shape[0])
        accept_times = temp_df.loc[temp_df['merged'] == 1].shape[0]
        request_number_merged_prs.append(accept_times)
        request_number_rejected_prs.append(temp_df.shape[0] - accept_times)
        if temp_df.shape[0] > 0:
            request_accept_rate.append(accept_times / temp_df.shape[0])
            request_reject_rate.append(1 - accept_times / temp_df.shape[0])
        else:
            request_accept_rate.append(0)
            request_reject_rate.append(0)
    df['request_number_prs'] = request_number_prs
    df['request_number_merged_prs'] = request_number_merged_prs
    df['request_number_rejected_prs'] = request_number_rejected_prs
    df['request_accept_rate'] = request_accept_rate
    df['request_reject_rate'] = request_reject_rate

    """Add whether the author watches the project"""
    user_watch_repo_relation_path = projectConfig.getUserWatchRepoRelation()
    userWatchRepoRelation = pandasHelper.readTSVFile(
        os.path.join(user_watch_repo_relation_path, f'userWatchRepoRelation.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )
    watchRepoMap = {}
    for k in convertDict.keys():
        """Get the watch list of the user"""
        following_list = list(set(userWatchRepoRelation.loc[userWatchRepoRelation['login'] == k]['repo_full_name']))
        isFollow = False
        for repo in following_list:
            owner, name = repo.split('/')
            if name == project:
                isFollow = True
        watchRepoMap[convertDict[k]] = isFollow

    request_watches = []
    for row in df.itertuples():
        author = getattr(row, 'author_user_login')
        request_watches.append(watchRepoMap[author])
    df['request_watches'] = request_watches

    """Add the author's follower count, following count, and whether the author follows a core member"""
    user_follow_relation_path = projectConfig.getUserFollowRelation()
    userFollowRelation = pandasHelper.readTSVFile(
        os.path.join(user_follow_relation_path, f'userFollowRelation.tsv'),
        pandasHelper.INT_READ_FILE_WITH_HEAD, low_memory=False
    )
    followMap = {}
    followerCountMap = {}
    followingCountMap = {}
    followCoreMemberMap = {}

    """Collect the list of core members"""
    coreMemberList = list(set(df.loc[df['author_association'] == 1]['author_user_login']))
    for k in convertDict.keys():
        """Get the follow list of the user"""
        following_list = list(set(userFollowRelation.loc[userFollowRelation['login'] == k]['following_login']))
        followingCountMap[convertDict[k]] = following_list.__len__()
        isFollowCoreMember = False
        for f in following_list:
            if f in convertDict.keys():
                followMap[(convertDict[k], convertDict[f])] = 1
            if f in coreMemberList:
                isFollowCoreMember = True
        followCoreMemberMap[convertDict[k]] = isFollowCoreMember

        follower_list = list(set(userFollowRelation.loc[userFollowRelation['following_login'] == k]['login']))
        followerCountMap[convertDict[k]] = follower_list.__len__()
        # for f in follower_list:
        #     if f in convertDict.keys():
        #         followMap[(convertDict[f], convertDict[k])] = 1

    request_number_follows = []
    request_number_following = []
    request_follow_ct = []
    for row in df.itertuples():
        author = getattr(row, 'author_user_login')
        request_number_following.append(followingCountMap[author])
        request_number_follows.append(followerCountMap[author])
        request_follow_ct.append(followCoreMemberMap[author])
    df['request_number_following'] = request_number_following
    df['request_number_follows'] = request_number_follows
    df['request_follow_ct'] = request_follow_ct

    """Collect the correct answers in advance"""
    tagDict = dict(list(df.groupby('pr_number')))

    train_data = df.loc[df['label'] == 0].copy(deep=True)
    test_data = df.loc[df['label'] == 1].copy(deep=True)

    """Cast the problem as multi-label:
       train_data_y  {pull_number: [r1, r2, ...], ...}
    """
    train_data_y = {}
    pull_number_list = train_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in pull_number_list:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        train_data_y[pull_number] = reviewers

    train_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
    train_data.drop_duplicates(inplace=True)
    train_data.drop_duplicates(subset=['pr_number'], inplace=True)
    train_data.drop(columns=['pr_number'], inplace=True)
    """Convert the training labels into the usual multi-label format"""
    train_data_y = DataProcessUtils.convertLabelListToDataFrame(train_data_y, pull_number_list, candicateNum)

    test_data_y = {}
    pull_number_list = test_data.drop_duplicates(['pr_number']).copy(deep=True)['pr_number']
    for pull_number in test_data.drop_duplicates(['pr_number'])['pr_number']:
        reviewers = list(tagDict[pull_number].drop_duplicates(['review_user_login'])['review_user_login'])
        test_data_y[pull_number] = reviewers

    test_data.drop(columns=['review_user_login', 'author_user_login', 'label', 'merged'], inplace=True)
    test_data.drop_duplicates(subset=['pr_number'], inplace=True)
    """Get the pr list"""
    prList = list(test_data['pr_number'])
    test_data.drop(columns=['pr_number'], inplace=True)
    test_data_y = DataProcessUtils.convertLabelListToListArray(test_data_y, pull_number_list)

    """Scale the features"""
    if isSTD:
        stdsc = StandardScaler()
        train_data_std = stdsc.fit_transform(train_data)
        test_data_std = stdsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    elif isNOR:
        maxminsc = MinMaxScaler()
        train_data_std = maxminsc.fit_transform(train_data)
        test_data_std = maxminsc.transform(test_data)
        return train_data_std, train_data_y, test_data_std, test_data_y, convertDict, prList
    else:
        return train_data, train_data_y, test_data, test_data_y, convertDict, prList
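# Illustration only (not part of the original module): the scalers above are
# fit on the training rows alone and merely applied to the test rows, which
# keeps test-set statistics from leaking into the normalisation.
def _demoScalers():
    import numpy as np
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    train = np.array([[1.0, 10.0], [3.0, 30.0]])
    test = np.array([[2.0, 50.0]])
    for sc in (StandardScaler(), MinMaxScaler()):
        sc.fit(train)
        print(sc.transform(test))  # test scaled with training statistics only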