# Baseline: a random forest built with the library's default hyper-parameters.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print(clf.predict(X_test))
print(clf.predict_proba(np.array([[5, 3]])))
print(clf.score(X_test, y_test))

# Parameter tuning
#   n_estimators = 10   number of weak learners (trees). The original note lists
#                       10 as the default.
#                       NOTE(review): newer scikit-learn releases document a
#                       different default -- confirm for the installed version.
#   bootstrap = True    whether to sample with replacement (default True)
#   oob_score = False   whether to evaluate on out-of-bag samples, i.e. the
#                       samples that were never drawn
#   criterion = 'gini'  split-quality measure of the CART trees: Gini impurity
#                       ('gini') or information gain ('entropy')
#   The remaining decision-tree parameters are accepted as well.
clf = RandomForestClassifier(max_depth=8, bootstrap=True, n_estimators=10)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
classfy_plt_3d(clf, X_test, y_test)

# Section: best-parameter search (grid search over the forest settings).
''' 最佳参数搜索 并行调参 '''
# Legacy import path kept from the original for reference:
# from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Candidate values: 5, 10, ..., 45 trees and depths 5, 10, ..., 25.
param_test = dict(n_estimators=range(5, 50, 5), max_depth=range(5, 30, 5))
gsearch = GridSearchCV(
    estimator=RandomForestClassifier(bootstrap=True),
    param_grid=param_test,
    cv=5,
)
gsearch.fit(X_train, y_train)
print('最佳参数:', gsearch.best_params_, gsearch.best_score_)

# Section: random-forest regression.
''' RF 回归 '''
import Tdata
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
# class_weight -> dict, list of dicts, 'balanced' or None (optional, default=None):
#     per-class weights, given as {class_label: weight}.
# max_leaf_nodes: maximum number of leaf nodes. Capping it can prevent
#     over-fitting; the default is None, i.e. no limit on the number of leaves.
tree_model = tree.DecisionTreeClassifier(criterion='entropy').fit(X, y)
result_proba = tree_model.predict_proba([[3, 1]], check_input=True)
print('分类概率:', result_proba.tolist())
print('分类结果:', tree_model.predict([[3, 2]]))

# 3D chart showing the classification result
iris = datasets.load_iris()  # use the bundled iris data set
X = iris.data[:, [0, 2]]  # keep only feature columns 0 and 2
y = iris.target
clf = tree.DecisionTreeClassifier(max_depth=4)  # limit the tree depth to 4
clf.fit(X, y)  # fit the model
classfy_plt_3d(clf, X, y)

# Decision-tree regression
# fit(features, targets)
train, test = sin_data()
# The first two columns are x1, x2 and the third column is y;
# the training y carries random noise.
x_train = train[:, :2]
y_train = train[:, 2]
# Same layout as above, except that the test y has no noise.
x_test = test[:, :2]
y_test = test[:, 2]
# Layout of the train data:
# [[ 0.         -10.          2.69876376]
#  [ 0.1002004   -9.95991984  2.36347624]
#  ...,
#  [ 50.         10.          7.29325787]]

# Method dispatch function
print(estimator.fit_predict(data))  # fit and print the resulting labels in one step

# Two features + cluster label: 2-D view of the clustering result
import matplotlib.pyplot as plt

data = np.random.rand(100, 2)
estimator = KMeans(n_clusters=3)
estimator.fit(data)
plt.figure(figsize=(8, 10))  # set the figure size
colors = ['blue', 'yellow', 'red']
markers = ['o', 's', 'D']  # marker shapes, one per cluster
# Draw every sample with the colour/marker of the cluster it was assigned to.
for (px, py), label in zip(data, estimator.labels_):
    plt.plot(px, py, color=colors[label], marker=markers[label], ls='None')
plt.show()

# 3D view of the clustering result
classfy_plt_3d(estimator, data, estimator.labels_)

# Text application
# Text clustering
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer


def jieba_tokenize(text):
    """Tokenize *text* with ``jieba.lcut`` and return the tokens.

    The tokens are also printed (debug output kept from the original).
    The text is segmented only once; the original called ``jieba.lcut``
    twice per document, once for printing and once for the return value.
    """
    tokens = jieba.lcut(text)
    print(tokens)
    return tokens


# scikit-learn's built-in TF-IDF support; it relies on an external tokenizer.
tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize, lowercase=False)
#     tol=0.001, verbose=False)
# SVC parameters explained
# C: penalty coefficient of the objective function; balances the classification
#    margin against misclassified samples. Default C = 1.0.
# kernel: kernel function. Choices are RBF (Gaussian kernel, one of the kernels
#    commonly used for non-linearly-separable SVMs), Linear, Poly (polynomial)
#    and Sigmoid; the default is "RBF".
# degree: highest power of the polynomial when the Poly kernel is used.
# gamma: kernel coefficient (for 'Poly', 'RBF' and 'Sigmoid').
#    The original note gives the default as gamma = 1 / n_features.
#    NOTE(review): the default differs between scikit-learn versions -- confirm.
# coef0: independent term of the kernel function. The original note says it is
#    effective for 'RBF' and 'Poly'.
#    NOTE(review): scikit-learn documents it for 'poly' and 'sigmoid' -- confirm.
# class_weight: weight of each class. Mainly meant to keep the decision from
#    leaning towards classes that are over-represented in the training set.
#    Weights can be given explicitly, or "balanced" can be used, in which case
#    the algorithm computes the weights itself and classes with fewer samples
#    get a higher weight. If the class distribution is not noticeably skewed
#    the parameter can be left at its default, "None".
# probability: whether probability estimates are enabled (true or false), i.e.
#    whether predict_proba is available. Default False.
# max_iter: maximum number of iterations; max_iter = -1 means no limit.
#    NOTE(review): the original note gives the default as 1, while scikit-learn
#    documents -1 as the default -- confirm.
# decision_function_shape: 'ovo' (one-vs-one), 'ovr' or None. The original note
#    describes 'ovr' as many-vs-many and gives the default as None; it also
#    remarks that ovo tends to be comparatively more accurate.
#    NOTE(review): scikit-learn describes 'ovr' as one-vs-rest -- confirm the
#    meaning and the default for the installed version.
# random_state: seed of the pseudo-random number generator used when shuffling
#    the data for probability estimation.
print(clf.predict(test))
print(clf.predict_proba(test))
classfy_plt_3d(clf, X, Y)

# Section: LinearSVC.
''' LinearSVC '''
from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(X, Y)
# Distance of each sample from the separating hyperplane.
dec = clf.decision_function(test)
print(dec)
# Prediction
print(clf.predict(test))

# Section: SVM regression with SVR.
''' SVM之 SVR回归'''
from sklearn.svm import SVR, LinearSVR
#
# @method   : scikit-learn LogisticRegression (logistic regression)
# @Time     : 2018/4/2
# @Author   : wooght
# @File     : w_LogisticRegression.py
# Logistic regression is used for classification, most commonly binary
# classification. The features need not be linear; the dependent variable
# is binary.

from Tdata import gender_sample
from sklearn.linear_model import LogisticRegression
import numpy as np
from common.classfy_plt_3d import classfy_plt_3d

# Gender classification data. Only weight and height are used as features so
# that the result can be shown in 3D. The feature data are discrete.
x, y = gender_sample()

# Train on rows [0, 50) and [150, end); hold out rows [50, 150) for testing.
# The labels must be taken from the SAME rows as the features: the original
# used y[50:150] (the test labels) as y_train, pairing every training sample
# with the label of an unrelated sample.
x_train = np.vstack([x[:50, :2], x[150:, :2]])
y_train = np.concatenate([y[:50], y[150:]])
x_test, y_test = x[50:150, :2], y[50:150]

logisticR = LogisticRegression()
logisticR.fit(x_train, y_train)  # y_train must hold class labels
result = logisticR.predict(x_test)
print(result)
classfy_plt_3d(logisticR, x_train, y_train)
print(logisticR.score(x_train, y_train))  # accuracy on the training split

# class_weight assigns a weight to each CLASS: the dict keys are class labels
# (here 0 and 1), not feature indices.
logisticR = LogisticRegression(class_weight={0: 0.6, 1: 0.4})
logisticR.fit(x_train, y_train)
result = logisticR.predict(x_test)
print(logisticR.coef_, logisticR.intercept_)
classfy_plt_3d(logisticR, x_train, y_train)
print(logisticR.score(x_train, y_train))  # accuracy on the training split