def RecommendByNativeBayes(train_data, train_data_y, test_data, test_data_y, recommendNum=5, bayesType=1):
    """Recommend candidates with a Naive Bayes classifier.

    Args:
        train_data: training features, passed to ``fit`` as ``X``.
        train_data_y: training labels, passed to ``fit`` as ``y``.
        test_data: features scored with ``predict_proba``.
        test_data_y: iterable of true labels for ``test_data``.
        recommendNum: number of recommendations requested per sample.
        bayesType: model variant -- 1 Bernoulli, 2 Gaussian,
            3 Multinomial (wrapped in a grid search over ``alpha`` and
            ``fit_prior``).

    Returns:
        ``[recommendList, answer]``. ``recommendList`` is whatever
        ``DataProcessUtils.getListFromProbable`` builds from the class
        probabilities (presumably the top ``recommendNum`` classes per
        sample -- confirm in DataProcessUtils); ``answer`` holds every true
        label wrapped in a one-element list.

    Raises:
        ValueError: if ``bayesType`` is not 1, 2 or 3.
    """
    # Validate before doing anything else: an unknown type used to leave the
    # classifier as None and only failed later with an AttributeError on .fit.
    if bayesType not in (1, 2, 3):
        raise ValueError(
            "bayesType must be 1 (Bernoulli), 2 (Gaussian) or 3 (Multinomial), got %r"
            % (bayesType,)
        )

    from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

    if bayesType == 2:
        clf = GaussianNB()
    elif bayesType == 3:
        # NOTE(review): the alpha grid starts at 0.0, i.e. no smoothing.
        param = {"alpha": [0.2 * x for x in range(0, 10)], "fit_prior": [False, True]}
        clf = GridSearchCV(MultinomialNB(), param_grid=param)
    else:
        clf = BernoulliNB()

    clf.fit(X=train_data, y=train_data_y)
    if bayesType == 3:
        # Only the grid-searched variant exposes best_params_ / best_score_.
        print(clf.best_params_, clf.best_score_)

    # Show the learning curve of the fitted model.
    MLGraphHelper.plot_learning_curve(clf, 'Bayes', train_data, train_data_y).show()

    pre = clf.predict_proba(test_data)
    pre_class = clf.classes_

    recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
    answer = [[x] for x in test_data_y]
    return [recommendList, answer]
def RecommendByRF(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label recommendation with a random forest.

    Fits a ``RandomForestClassifier`` (50 trees, depth 5, all cores) on the
    training data, converts its ``predict_proba`` output with
    ``DataProcessUtils.convertMultilabelProbaToDataArray`` and builds the
    recommendation list from it.

    Args:
        train_data: training features.
        train_data_y: training label matrix; its second dimension gives the
            number of labels.
        test_data: features to predict on.
        test_data_y: true labels for ``test_data``, returned unchanged.
        recommendNum: number of recommendations requested per sample.

    Returns:
        ``[recommendList, answerList]`` where ``answerList`` is
        ``test_data_y``.
    """
    forest = RandomForestClassifier(n_estimators=50, max_depth=5, n_jobs=-1)
    # Earlier experiments wrapped the forest in GridSearchCV to tune
    # n_estimators, max_depth and min_samples_split; fixed values are used now.
    forest.fit(train_data, train_data_y)
    raw_proba = forest.predict_proba(test_data)

    # Convert the prediction result into a data array.
    proba = DataProcessUtils.convertMultilabelProbaToDataArray(raw_proba)
    print(proba)

    # Label ids are 1-based, one per column of the training label matrix.
    label_ids = range(1, train_data_y.shape[1] + 1)
    recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
    answerList = test_data_y

    # Debug output, kept in the original order.
    for item in (proba, test_data_y, recommendList, answerList):
        print(item)
    return [recommendList, answerList]
def RecommendByDT(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label recommendation with a grid-searched decision tree.

    Args:
        train_data: training features.
        train_data_y: training label matrix; its second dimension gives the
            number of labels.
        test_data: features to predict on.
        test_data_y: true labels for ``test_data``, returned unchanged.
        recommendNum: number of recommendations requested per sample.

    Returns:
        ``[recommendList, answerList]`` where ``answerList`` is
        ``test_data_y``.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import GridSearchCV

    # Hyper-parameters explored by the grid search.
    search_space = [
        {'min_samples_leaf': [2, 4, 8, 16, 32, 64], 'max_depth': [2, 4, 6, 8]},
    ]
    search = GridSearchCV(DecisionTreeClassifier(), param_grid=search_space, n_jobs=-1)
    search.fit(train_data, train_data_y)

    raw_proba = search.predict_proba(test_data)
    print(search.best_params_)

    # Convert the prediction result into a data array.
    proba = DataProcessUtils.convertMultilabelProbaToDataArray(raw_proba)
    print(proba)

    # Label ids are 1-based, one per column of the training label matrix.
    label_ids = range(1, train_data_y.shape[1] + 1)
    recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
    answerList = test_data_y

    # Debug output, kept in the original order.
    for item in (proba, test_data_y, recommendList, answerList):
        print(item)
    return [recommendList, answerList]
def RecommendBySVM(train_data, train_data_y, test_data, test_data_y, recommendNum=5, CoreType='rbf', C=1,
                   gamma='auto', decisionShip='ovo'):
    """Recommend candidates with a single-label SVM.

    Args:
        train_data: training features; must expose ``shape``.
        train_data_y: training labels (one per sample).
        test_data: features scored with ``predict_proba``.
        test_data_y: iterable of true labels for ``test_data``.
        recommendNum: number of recommendations requested per sample.
        CoreType: SVC kernel, e.g. ``'linear'`` or ``'rbf'`` (Gaussian).
        C: penalty coefficient.
        gamma: kernel coefficient.
        decisionShip: value passed as ``decision_function_shape``
            (classification strategy).

    Returns:
        ``[recommendList, answer]`` where ``answer`` holds every true label
        wrapped in a one-element list.

    NOTE(review): another ``RecommendBySVM`` with a shorter signature is
    defined further down in this file; if both live in the same namespace the
    later definition replaces this one -- confirm.
    """
    from sklearn import svm
    from sklearn.model_selection import GridSearchCV

    # Hold-out split instead of k-fold: the first 70% of the rows are always
    # training rows (-1) and the remaining rows form validation fold 0. The
    # original author notes that some REVIEW features are time dependent, so
    # plain n-fold cross-validation is not appropriate, and that the choice
    # between the two still needed study (3.31).
    n_samples = train_data.shape[0]
    fold_ids = numpy.zeros(n_samples)
    fold_ids[:ceil(n_samples * 0.7)] = -1
    holdout_split = PredefinedSplit(test_fold=fold_ids)

    # Tuning grid for the (currently disabled) grid search.
    grid_parameters = [
        {
            'kernel': ['rbf'],
            'gamma': [0.0005, 0.00075, 0.0001],
            'C': [100, 105, 108, 110],
            'decision_function_shape': ['ovr'],
        },
    ]

    model = svm.SVC(C=C, kernel=CoreType, probability=True, gamma=gamma,
                    decision_function_shape=decisionShip)
    # The grid search is parked; to re-enable it wrap the model as
    # GridSearchCV(model, param_grid=grid_parameters, cv=holdout_split).
    model.fit(X=train_data, y=train_data_y)

    class_proba = model.predict_proba(test_data)
    class_labels = model.classes_

    # Show the learning curve of the model.
    MLGraphHelper.plot_learning_curve(model, 'SVM', train_data, train_data_y).show()

    recommendList = DataProcessUtils.getListFromProbable(class_proba, class_labels, recommendNum)
    answer = [[label] for label in test_data_y]
    return [recommendList, answer]
def RecommendByRandomForest(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Recommend candidates with a grid-searched random forest.

    The forest starts from ``min_samples_split=100``, ``min_samples_leaf=20``,
    ``max_depth=8``, ``max_features='sqrt'`` and ``random_state=10``; a 5-fold
    grid search then tunes ``max_depth`` and ``min_samples_split``.

    Args:
        train_data: training features; must expose ``shape``.
        train_data_y: training labels (one per sample).
        test_data: features scored with ``predict_proba``.
        test_data_y: iterable of true labels for ``test_data``.
        recommendNum: number of recommendations requested per sample.

    Returns:
        ``[recommendList, answer]`` where ``answer`` holds every true label
        wrapped in a one-element list.
    """
    # Hold-out split (first 70% training, rest validation fold 0). It is only
    # needed by the parked ``cv=ps`` variant of the grid search and is unused
    # by the 5-fold search below.
    test_fold = numpy.zeros(train_data.shape[0])
    test_fold[:ceil(train_data.shape[0] * 0.7)] = -1
    ps = PredefinedSplit(test_fold=test_fold)

    # Import the model.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    clf = RandomForestClassifier(min_samples_split=100,
                                 min_samples_leaf=20, max_depth=8, max_features='sqrt', random_state=10)

    # Tune the tree parameters. (A previous experiment tuned n_estimators
    # over range(10, 200, 10) in the same way.)
    param_test2 = {'max_depth': range(3, 14, 2), 'min_samples_split': range(50, 201, 20)}
    # ``iid`` was deprecated in scikit-learn 0.22 and removed in 0.24, so
    # passing ``iid=False`` raises TypeError on current releases. From 0.22 on
    # the default already matches ``iid=False``, so the keyword is dropped.
    clf = GridSearchCV(estimator=clf, param_grid=param_test2, cv=5)
    clf.fit(train_data, train_data_y)

    # Show the learning curve of the model.
    MLGraphHelper.plot_learning_curve(clf, 'RF', train_data, train_data_y).show()

    pre = clf.predict_proba(test_data)
    pre_class = clf.classes_

    recommendList = DataProcessUtils.getListFromProbable(pre, pre_class, recommendNum)
    answer = [[x] for x in test_data_y]
    return [recommendList, answer]
def RecommendBySVM(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label recommendation with a one-vs-rest linear SVM.

    Args:
        train_data: training features.
        train_data_y: training label matrix; its second dimension gives the
            number of labels.
        test_data: features to predict on.
        test_data_y: true labels for ``test_data``, returned unchanged.
        recommendNum: number of recommendations requested per sample.

    Returns:
        ``[recommendList, answerList]`` where ``answerList`` is
        ``test_data_y``.
    """
    base_svc = SVC(kernel='linear', probability=True, class_weight='balanced', C=70)
    one_vs_rest = OneVsRestClassifier(base_svc)
    one_vs_rest.fit(train_data, train_data_y)
    proba = one_vs_rest.predict_proba(test_data)

    # Label ids are 1-based, one per column of the training label matrix.
    label_ids = range(1, train_data_y.shape[1] + 1)
    recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
    return [recommendList, test_data_y]
def RecommendByClassifierChain(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label recommendation with a classifier chain of random forests.

    Args:
        train_data: training features.
        train_data_y: training label matrix; its second dimension gives the
            number of labels.
        test_data: features to predict on.
        test_data_y: true labels for ``test_data``, returned unchanged.
        recommendNum: number of recommendations requested per sample.

    Returns:
        ``[recommendList, answerList]`` where ``answerList`` is
        ``test_data_y``.
    """
    base_forest = RandomForestClassifier(oob_score=True, max_depth=10, min_samples_split=20)
    chain = ClassifierChain(base_forest)
    chain.fit(train_data, train_data_y)

    # predict_proba's result is densified and then unwrapped into an array
    # (presumably it is a scipy sparse matrix -- confirm).
    raw_proba = chain.predict_proba(test_data)
    dense_proba = raw_proba.todense()
    proba = dense_proba.getA()

    # Label ids are 1-based, one per column of the training label matrix.
    label_ids = range(1, train_data_y.shape[1] + 1)
    recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
    answerList = test_data_y

    # Debug output, kept in the original order.
    for item in (proba, test_data_y, recommendList, answerList):
        print(item)
    return [recommendList, answerList]
def RecommendByMLKNN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label recommendation with the ML-kNN algorithm.

    ``k`` is set to the size of the second dimension of ``train_data_y``.

    Args:
        train_data: training features.
        train_data_y: training label matrix; its second dimension gives the
            number of labels.
        test_data: features to predict on.
        test_data_y: true labels for ``test_data``, returned unchanged.
        recommendNum: number of recommendations requested per sample.

    Returns:
        ``[recommendList, answerList]`` where ``answerList`` is
        ``test_data_y``.
    """
    knn = MLkNN(k=train_data_y.shape[1])
    knn.fit(train_data, train_data_y)

    dense_proba = knn.predict_proba(test_data).todense()
    # Convert the prediction result into a plain data array.
    proba = numpy.asarray(dense_proba)

    # Label ids are 1-based, one per column of the training label matrix.
    label_ids = range(1, train_data_y.shape[1] + 1)
    recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
    answerList = test_data_y

    # Debug output, kept in the original order.
    for item in (proba, test_data_y, recommendList, answerList):
        print(item)
    return [recommendList, answerList]
def RecommendByKN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label recommendation with a default ``KNeighborsClassifier``.

    Args:
        train_data: training features.
        train_data_y: training label matrix; its second dimension gives the
            number of labels.
        test_data: features to predict on.
        test_data_y: true labels for ``test_data``, returned unchanged.
        recommendNum: number of recommendations requested per sample.

    Returns:
        ``[recommendList, answerList]`` where ``answerList`` is
        ``test_data_y``.
    """
    neighbours = KNeighborsClassifier()
    neighbours.fit(train_data, train_data_y)
    raw_proba = neighbours.predict_proba(test_data)

    # Convert the prediction result into a data array.
    proba = DataProcessUtils.convertMultilabelProbaToDataArray(raw_proba)
    print(proba)

    # Label ids are 1-based, one per column of the training label matrix.
    label_ids = range(1, train_data_y.shape[1] + 1)
    recommendList = DataProcessUtils.getListFromProbable(proba, label_ids, recommendNum)
    answerList = test_data_y

    # Debug output, kept in the original order.
    for item in (proba, test_data_y, recommendList, answerList):
        print(item)
    return [recommendList, answerList]
def RecommendByDecisionTree(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Recommend candidates with a grid-searched decision tree.

    The grid search tunes ``min_samples_leaf`` and ``max_depth`` and is
    validated on a fixed hold-out split rather than k-fold cross-validation.

    Args:
        train_data: training features; must expose ``shape``.
        train_data_y: training labels (one per sample).
        test_data: features scored with ``predict_proba``.
        test_data_y: iterable of true labels for ``test_data``.
        recommendNum: number of recommendations requested per sample.

    Returns:
        ``[recommendList, answer]`` where ``answer`` holds every true label
        wrapped in a one-element list.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import GridSearchCV

    # Hold-out split: the first 70% of the rows are always training rows (-1),
    # the remaining rows form the single validation fold 0.
    n_samples = train_data.shape[0]
    fold_ids = numpy.zeros(n_samples)
    fold_ids[:ceil(n_samples * 0.7)] = -1
    holdout_split = PredefinedSplit(test_fold=fold_ids)

    # Hyper-parameters explored by the grid search.
    search_space = [
        {
            'min_samples_leaf': [2, 4, 8, 16, 32, 64],
            'max_depth': [2, 4, 6, 8],
            'class_weight': [None],
        },
    ]

    search = GridSearchCV(DecisionTreeClassifier(), param_grid=search_space, cv=holdout_split, n_jobs=-1)
    search.fit(train_data, train_data_y)
    print(search.best_params_)

    class_proba = search.predict_proba(test_data)
    class_labels = search.classes_

    recommendList = DataProcessUtils.getListFromProbable(class_proba, class_labels, recommendNum)
    answer = [[label] for label in test_data_y]
    return [recommendList, answer]
def RecommendByETS(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Multi-label recommendation with grid-searched extra trees.

    An ``ExtraTreesClassifier`` with 250 trees is tuned over ``max_depth``
    and ``min_samples_split`` with a 10-fold grid search.

    Args:
        train_data: training features.
        train_data_y: training label matrix; its second dimension gives the
            number of labels.
        test_data: features to predict on.
        test_data_y: true labels for ``test_data``, returned unchanged.
        recommendNum: number of recommendations requested per sample.

    Returns:
        ``[recommendList, answerList]`` where ``answerList`` is
        ``test_data_y``.
    """
    clf = ExtraTreesClassifier(n_jobs=3, n_estimators=250)

    param_test2 = {'max_depth': range(10, 40, 10), 'min_samples_split': range(15, 30, 5)}
    # ``iid`` was deprecated in scikit-learn 0.22 and removed in 0.24, so
    # passing ``iid=False`` raises TypeError on current releases. From 0.22 on
    # the default already matches ``iid=False``, so the keyword is dropped.
    clf = GridSearchCV(estimator=clf, param_grid=param_test2, cv=10, n_jobs=2)

    clf.fit(train_data, train_data_y)
    predictions = clf.predict_proba(test_data)

    # Convert the prediction result into a data array.
    predictions = DataProcessUtils.convertMultilabelProbaToDataArray(predictions)
    print(predictions)

    # Label ids are 1-based, one per column of the training label matrix.
    recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                         recommendNum)
    answerList = test_data_y

    # Debug output, unchanged from the original.
    print(predictions)
    print(test_data_y)
    print(recommendList)
    print(answerList)
    return [recommendList, answerList]