import pandas as pd

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
df.iloc[:, :] = sc.fit_transform(df.iloc[:, :])

# Feature selection: univariate selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, chi2

best_features = SelectKBest(score_func=f_regression, k='all')
best_features.fit(df.iloc[:, 1:], df.iloc[:, 0])
feature_scores = pd.DataFrame(best_features.scores_, index=df.iloc[:, 1:].columns)
feature_scores.plot(kind='barh')

# Feature selection: tree-based importance
from sklearn.tree import ExtraTreeRegressor

regressor = ExtraTreeRegressor()
regressor.fit(df.iloc[:, 1:], df.iloc[:, 0])
importance_score = pd.Series(regressor.feature_importances_, index=df.iloc[:, 1:].columns)
importance_score.plot(kind='barh')

# Segregating feature & target columns
x = df.iloc[:, 1:]
y = df.iloc[:, 0]

# Modelling
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Ridge regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
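The snippet stops right after importing Ridge and GridSearchCV. A minimal sketch of how that search might continue, reusing the x_train/x_test split above; the alpha grid and scoring choice are assumptions, not from the original:

# Hedged sketch: grid-search Ridge's alpha on the training split (grid values assumed).
params = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
ridge_cv = GridSearchCV(Ridge(), params, cv=5, scoring='neg_mean_squared_error')
ridge_cv.fit(x_train, y_train)
print(ridge_cv.best_params_, ridge_cv.score(x_test, y_test))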
from pandas import read_csv
from sklearn.tree import ExtraTreeRegressor

# load data
dataframe = read_csv('useformodel.csv')
array = dataframe.values
X = array[:, 0:26]
Y = array[:, 26]

# feature extraction
model = ExtraTreeRegressor(random_state=0)
model.fit(X, Y)
print(model.feature_importances_)
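The raw feature_importances_ array is hard to read on its own. A small sketch that pairs each score with its column name, assuming the first 26 columns of useformodel.csv are the features:

import pandas as pd

# Hedged sketch: label and sort the importances (the column slice is an assumption).
importances = pd.Series(model.feature_importances_, index=dataframe.columns[:26])
print(importances.sort_values(ascending=False))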
    et_m_Material_one_hot_encoded, et_m_Making_Co_one_hot_encoded), axis=1)
et_m_Outputdata = et_MakingLT[['MakingLT']]

# Convert the data to arrays (vectors) for building the training model
et_X1 = et_m_Inputdata.values
et_Y1 = et_m_Outputdata.values

# Split into training and test data
et_X1_train, et_X1_test, et_Y1_train, et_Y1_test = train_test_split(
    et_X1, et_Y1, test_size=0.33, random_state=42)

########################################################################################################################
# Build the ExtraTree model
making_extratree_model = ExtraTreeRegressor(max_depth=10, random_state=42)
making_extratree_model.fit(et_X1_train, et_Y1_train)

et_m_predicted = making_extratree_model.predict(et_X1_test)
et_m_predicted[et_m_predicted < 0] = 0

# Reshape the predictions from shape (n,) to (n, 1)
et_length_x1test = len(et_X1_test)
et_m_predicted = et_m_predicted.reshape(et_length_x1test, 1)

# Evaluate model performance
et_m_mae = abs(et_m_predicted - et_Y1_test).mean(axis=0)
et_m_mape = (np.abs((et_m_predicted - et_Y1_test) / et_Y1_test).mean(axis=0))
et_m_rmse = np.sqrt(((et_m_predicted - et_Y1_test)**2).mean(axis=0))
et_m_rmsle = np.sqrt(
def pass_arguments(self, kwargs):
    super().__init__(ExtraTreeRegressor(**kwargs))
tree = DecisionTreeClassifier()
tree = tree.fit(X_train, Y_train)
Y1 = tree.predict(X_train)
Y2 = tree.predict(X_test)
print(accuracy_score(Y_train, Y1))  # 1.0
print(accuracy_score(Y_test, Y2))   # 0.8791208791208791

# Note: accuracy_score on a regressor's output only works here because these
# fully grown trees predict exact class-label values on this data; a regression
# metric such as r2_score is the more appropriate measure.
tree = DecisionTreeRegressor()
tree = tree.fit(X_train, Y_train)
Y1 = tree.predict(X_train)
Y2 = tree.predict(X_test)
print(accuracy_score(Y_train, Y1))  # 1.0
print(accuracy_score(Y_test, Y2))   # 0.8571428571428571

tree = ExtraTreeClassifier()
tree = tree.fit(X_train, Y_train)
Y1 = tree.predict(X_train)
Y2 = tree.predict(X_test)
print(accuracy_score(Y_train, Y1))  # 1.0
print(accuracy_score(Y_test, Y2))   # 0.7472527472527473

tree = ExtraTreeRegressor()
tree = tree.fit(X_train, Y_train)
Y1 = tree.predict(X_train)
Y2 = tree.predict(X_test)
print(accuracy_score(Y_train, Y1))  # 1.0
print(accuracy_score(Y_test, Y2))   # 0.7912087912087912
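A minimal sketch of the same comparison under a proper regression metric, assuming the X_train/X_test/Y_train/Y_test split used above:

from sklearn.metrics import r2_score

# Hedged sketch: score the tree regressors with R^2 instead of accuracy.
for model in (DecisionTreeRegressor(), ExtraTreeRegressor()):
    model.fit(X_train, Y_train)
    print(type(model).__name__, r2_score(Y_test, model.predict(X_test)))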
def getExtraTreeModel(x, y):
    et = ExtraTreeRegressor()
    et.fit(x, y)
    return et
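A short usage sketch on synthetic data, assuming only the helper above; make_regression is used purely for illustration:

from sklearn.datasets import make_regression

# Hedged sketch: fit the helper on toy data and predict.
x, y = make_regression(n_samples=100, n_features=5, random_state=0)
et = getExtraTreeModel(x, y)
print(et.predict(x[:3]))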
def get_models(methods: list):
    models = dict()
    # linear models
    if 'LinearRegression' in methods:
        models['lr'] = LinearRegression()
    alpha = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    if 'Lasso' in methods:
        for a in alpha:
            models['lasso-' + str(a)] = Lasso(alpha=a)
    if 'Ridge' in methods:
        for a in alpha:
            models['ridge-' + str(a)] = Ridge(alpha=a)
    if 'ElasticNet' in methods:
        for a1 in alpha:
            for a2 in alpha:
                name = 'en-' + str(a1) + '-' + str(a2)
                models[name] = ElasticNet(a1, a2)
    if 'HuberRegressor' in methods:
        models['huber'] = HuberRegressor()
    if 'Lars' in methods:
        models['lars'] = Lars()
    if 'LassoLars' in methods:
        models['llars'] = LassoLars()
    if 'PassiveAggressiveRegressor' in methods:
        models['pa'] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3)
    if 'RANSACRegressor' in methods:
        models['ranscac'] = RANSACRegressor()
    if 'SGDRegressor' in methods:
        models['sgd'] = SGDRegressor(max_iter=1000, tol=1e-3)
    if 'TheilSenRegressor' in methods:
        models['theil'] = TheilSenRegressor()
    # non-linear models
    if 'KNeighborsRegressor' in methods:
        n_neighbors = range(1, 21)
        for k in n_neighbors:
            models['knn-' + str(k)] = KNeighborsRegressor(n_neighbors=k)
    if 'DecisionTreeRegressor' in methods:
        models['cart'] = DecisionTreeRegressor()
    if 'ExtraTreeRegressor' in methods:
        models['extra'] = ExtraTreeRegressor()
    if 'SVR' in methods:
        models['svml'] = SVR(kernel='linear')
        models['svmp'] = SVR(kernel='poly')
        c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        for c in c_values:
            models['svmr' + str(c)] = SVR(C=c)
    # ensemble models
    n_trees = 100
    if 'AdaBoostRegressor' in methods:
        models['ada'] = AdaBoostRegressor(n_estimators=n_trees)
    if 'BaggingRegressor' in methods:
        models['bag'] = BaggingRegressor(n_estimators=n_trees)
    if 'RandomForestRegressor' in methods:
        models['rf'] = RandomForestRegressor(n_estimators=n_trees)
    if 'ExtraTreesRegressor' in methods:
        models['et'] = ExtraTreesRegressor(n_estimators=n_trees)
    if 'GradientBoostingRegressor' in methods:
        models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    print('Defined %d models' % len(models))
    return models
    ('RANSACRegressor', lambda: RANSACRegressor()),
    ('SGDRegressor', lambda: SGDRegressor()),
    # Way too slow.
    #('TheilSenRegressor', lambda: TheilSenRegressor()),
    # Neighbors.
    ('KNeighborsRegressor', lambda: KNeighborsRegressor()),
    # Predicts NaN, infinity or too large a value.
    #('RadiusNeighborsRegressor', lambda: RadiusNeighborsRegressor()),
    # Neural network.
    # Increase max_iter to avoid a warning about non-convergence within max_iter.
    ('MLPRegressor', lambda: MLPRegressor(max_iter=1000)),
    # Support vector machine.
    ('SVR', lambda: SVR()),
    ('LinearSVR', lambda: LinearSVR()),
    ('NuSVR', lambda: NuSVR()),
    # Tree.
    ('DecisionTreeRegressor', lambda: DecisionTreeRegressor()),
    ('ExtraTreeRegressor', lambda: ExtraTreeRegressor()),
])

# Regressors that do not support the sample_weight optional fit() argument.
REGRESSORS_NOT_SUPPORTING_SAMPLE_WEIGHT = set([
    'PLSRegression', 'GaussianProcessRegressor', 'PassiveAggressiveRegressor',
    'RandomizedLogisticRegression', 'SGDRegressor', 'TheilSenRegressor',
    'KNeighborsRegressor', 'MLPRegressor'
])
y_train = train_value.iloc[:, -1]
test_value = df[df['date'] >= '2020-09-01']
x_test = test_value.iloc[:, 1:-1]
y_test = test_value.iloc[:, -1]
# print(x_train.shape, y_train.shape)  # (321035, 6) (321035,)
# print(x_test.shape, y_test.shape)    # (16707, 6) (16707,)

kfold = KFold(n_splits=5, shuffle=True)

# Training loop
scalers = np.array([MinMaxScaler(), StandardScaler()])
models = np.array([DecisionTreeRegressor(), RandomForestRegressor(), BaggingRegressor(),
                   ExtraTreeRegressor(), ExtraTreesRegressor()])  # , KNeighborsRegressor()
# BaggingRegressor, DecisionTreeRegressor, ExtraTreeRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, RandomForestRegressor

result_list = []
for i in models:
    print(i, ' :')
    # 2. Build the model
    model = i
    scores = cross_val_score(model, x_train, y_train, cv=kfold)
    print('scores : ', scores)
    # 3. Train
    model.fit(x_train, y_train)
print(Lscore.mean())
plt.plot(Lscore)

#%% SVM
svc = svm.SVC(C=1, kernel='linear')
SVCscore = cross_val_score(svc, X, Y_label, cv=k_fold, scoring='accuracy')
print(SVCscore.mean())
plt.plot(SVCscore)

#%% Regression Task
x_train, x_test, y_train, y_test = train_test_split(X, Y_value, test_size=0.33, shuffle=False)
# Note: the "RF" label is misleading; the model is a single ExtraTreeRegressor.
models = [("RF", ExtraTreeRegressor(random_state=0)),
          ("LR", LinearRegression(n_jobs=-1))]
for m in models:
    m[1].fit(x_train, y_train)
    # Make an array of predictions on the test set
    pred = m[1].predict(x_test)
    # Output the test-set score (R^2) for each model
    print("%s:\n%0.6f" % (m[0], m[1].score(x_test, y_test)))
    result = pd.DataFrame(index=y_test.index)
    result['y_pred'] = pred
    result['y_test'] = y_test
#Linscore=cross_val_score(LinRe,X,Y_value, cv=k_fold, scoring= 'r2')
# KNN regression
from sklearn import neighbors
model_knn = neighbors.KNeighborsRegressor()

# Random forest regression
from sklearn import ensemble
model_random_forest = ensemble.RandomForestRegressor(n_estimators=20)

# AdaBoost regression
from sklearn import ensemble
model_adaboost = ensemble.AdaBoostRegressor(n_estimators=50)

# GBRT (gradient-boosted regression tree) regression
from sklearn import ensemble
model_gradient_boost = ensemble.GradientBoostingRegressor(n_estimators=100)

# Bagging regression
from sklearn.ensemble import BaggingRegressor
model_bagging = BaggingRegressor()

# ExtraTree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
model_extratree = ExtraTreeRegressor()

# Ridge regression
model_ridge = linear_model.Ridge(alpha=0.01)

# Plot the regression curve
plot_regression(model_svr, x_data, y_data)
#plot_decision(model_decisiontree_regression, x_data, y_data)
def regression(X, Y, method='svm'):
    '''Train a regressor of the chosen type.'''
    print("======= Start training the regressor ======")
    print('Selected method:', method)
    # Method selection
    # 3. SVM regression
    if method == 'svm':
        clf = svm.SVR(gamma='auto')
    # 1. Decision tree regression
    if method == 'tree':
        from sklearn import tree
        clf = tree.DecisionTreeRegressor()
    # 2. Linear regression
    if method == 'linear':
        from sklearn.linear_model import LinearRegression
        clf = LinearRegression()
    # 4. kNN regression
    if method == 'knn':
        from sklearn import neighbors
        clf = neighbors.KNeighborsRegressor()
    # 5. Random forest regression
    if method == 'RFR':
        from sklearn import ensemble
        clf = ensemble.RandomForestRegressor(n_estimators=20)  # 20 trees
    # 6. Adaboost regression
    if method == 'Adaboost':
        from sklearn import ensemble
        clf = ensemble.AdaBoostRegressor(n_estimators=50)  # 50 trees
    # 7. GBRT regression
    if method == 'GBR':
        from sklearn import ensemble
        clf = ensemble.GradientBoostingRegressor(n_estimators=100)  # 100 trees
    # 8. Bagging regression
    if method == 'Bag':
        from sklearn import ensemble
        clf = ensemble.BaggingRegressor()
    # 9. ExtraTree (extremely randomized tree) regression
    if method == 'ETR':
        from sklearn.tree import ExtraTreeRegressor
        clf = ExtraTreeRegressor()
    if method == 'MLP':
        from sklearn.neural_network import MLPRegressor
        clf = MLPRegressor(solver='adam', alpha=1e-5,
                           hidden_layer_sizes=(100, 4), random_state=1)
    clf.fit(X, Y)
    print("========== Training finished =========")
    return clf
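A brief usage sketch, assuming only the regression() helper above; the toy data from make_regression is illustrative and not part of the original:

from sklearn.datasets import make_regression

# Hedged sketch: train the extremely randomized tree variant on toy data.
X, Y = make_regression(n_samples=200, n_features=4, noise=0.1, random_state=0)
clf = regression(X, Y, method='ETR')
print(clf.predict(X[:5]))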
# 6. Adaboost regression
from sklearn import ensemble
model_adaboost_regressor = ensemble.AdaBoostRegressor(n_estimators=50)  # 50 trees

# 7. GBRT regression
from sklearn import ensemble
model_gradient_boosting_regressor = ensemble.GradientBoostingRegressor(n_estimators=100)  # 100 trees

# 8. Bagging regression
from sklearn import ensemble
model_bagging_regressor = ensemble.BaggingRegressor()

# 9. ExtraTree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
model_extra_tree_regressor = ExtraTreeRegressor()

# 10. Polynomial regression
model_Polynomial = make_pipeline(PolynomialFeatures(3), Ridge())

# 11. Gaussian processes
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
kernel = DotProduct() + WhiteKernel()
model_GaussianProcessRegressor = GaussianProcessRegressor(kernel=kernel, random_state=0)

# Predict the price for different conversion prices, based on the premium rate
def pricingcb_different_delta(model, convert_value, pb, convertdelta):
    data_inconvert = data[(data['convertdelta'] == 0)]
    data_outconvert = data[(data['convertdelta'] > 0)]
def get_algorithm(self):
    '''
    Inputs:
        algorithm (string) - Name of the regressor to run. Follows sklearn
        naming conventions.

        Available keys:
        ARDRegression | AdaBoostRegressor | BaggingRegressor | BayesianRidge |
        CCA | DecisionTreeRegressor | ElasticNet | ExtraTreeRegressor |
        ExtraTreesRegressor | GaussianProcessRegressor |
        GradientBoostingRegressor | HuberRegressor | KNeighborsRegressor |
        KernelRidge | Lars | Lasso | LassoLars | LinearRegression | LinearSVR |
        MLPRegressor | NuSVR | OrthogonalMatchingPursuit | PLSCanonical |
        PLSRegression | PassiveAggressiveRegressor | RANSACRegressor |
        RandomForestRegressor | Ridge | SGDRegressor | SVR |
        TheilSenRegressor | TransformedTargetRegressor

        Currently not supporting:
        ElasticNetCV | LarsCV | LassoCV | LassoLarsCV | LassoLarsIC |
        MultiTaskElasticNet | MultiTaskElasticNetCV | MultiTaskLasso |
        MultiTaskLassoCV | OrthogonalMatchingPursuitCV | RidgeCV |
        RadiusNeighborsRegressor

    Outputs:
        The instantiated regressor, or None if the name is not recognized.

    Notes:
        Scoring metrics:
        https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    '''
    # A name-to-class map replaces the long if/elif chain; the selected class
    # is instantiated with default parameters, exactly as before.
    algorithms = {
        "ARDRegression": ARDRegression, "AdaBoostRegressor": AdaBoostRegressor,
        "BaggingRegressor": BaggingRegressor, "BayesianRidge": BayesianRidge,
        "CCA": CCA, "DecisionTreeRegressor": DecisionTreeRegressor,
        "ElasticNet": ElasticNet, "ExtraTreeRegressor": ExtraTreeRegressor,
        "ExtraTreesRegressor": ExtraTreesRegressor,
        "GaussianProcessRegressor": GaussianProcessRegressor,
        "GradientBoostingRegressor": GradientBoostingRegressor,
        "HuberRegressor": HuberRegressor,
        "KNeighborsRegressor": KNeighborsRegressor,
        "KernelRidge": KernelRidge, "Lars": Lars, "Lasso": Lasso,
        "LassoLars": LassoLars, "LinearRegression": LinearRegression,
        "LinearSVR": LinearSVR, "MLPRegressor": MLPRegressor, "NuSVR": NuSVR,
        "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit,
        "PLSCanonical": PLSCanonical, "PLSRegression": PLSRegression,
        "PassiveAggressiveRegressor": PassiveAggressiveRegressor,
        "RANSACRegressor": RANSACRegressor,
        "RandomForestRegressor": RandomForestRegressor, "Ridge": Ridge,
        "SGDRegressor": SGDRegressor, "SVR": SVR,
        "TheilSenRegressor": TheilSenRegressor,
        "TransformedTargetRegressor": TransformedTargetRegressor,
    }
    cls = algorithms.get(self.algorithmName)
    return cls() if cls is not None else None
def models_ML(cluster):
    models = dict()
    n_trees = 100
    # random_state=42

    # Parameters for RandomizedSearchCV. Note: these grids target older
    # scikit-learn releases (e.g. `normalize` and criterion='mse' have since
    # been removed or renamed).
    lr_param = {
        "fit_intercept": [True, False],
        "normalize": [False],
        "copy_X": [True, False]
    }
    knn_param = {
        "n_neighbors": [2, 3, 4, 5, 6, 7, 8],
        "metric": ["euclidean", "cityblock"]
    }
    dtree_param = {
        "max_depth": [3, None],
        "min_samples_leaf": sp_randint(1, 11),
        "criterion": ["mse"],
        "splitter": ["best", "random"],
        "max_features": ["auto", "sqrt", None]
    }
    lasso_param = {
        "alpha": [0.02, 0.024, 0.025, 0.026, 0.03],
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "selection": ["random"]
    }
    ridge_param = {
        "alpha": [200, 230, 250, 265, 270, 275, 290, 300, 500],
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "solver": ["auto"]
    }
    elas_param = {
        "alpha": list(np.logspace(-5, 2, 8)),
        "l1_ratio": [.2, .4, .6, .8],
        "fit_intercept": [True, False],
        "normalize": [True, False],
        "precompute": [True, False]
    }
    # g = [pow(2, k) for k in range(-15, 4)]
    # c = [pow(2, k) for k in range(-5, 16)]
    # svr_param = {"C": c, "gamma": g, "kernel": ["rbf", "sigmoid"]}
    # gb_param = {"n_estimators": [1, 2, 4, 8, 16, 32, 64, 100, 200], "max_depths": list(np.linspace(1, 32, 32, endpoint=True)), "min_samples_splits": list(np.linspace(0.1, 1.0, 10, endpoint=True)), "min_samples_leaf": list(np.linspace(0.1, 0.5, 5, endpoint=True)), "max_features": list(range(1, 5))}

    if cluster in [1, 4, 7, 10, 13, 16, 19, 22, 25]:
        # Highly sparse data: tree-based algorithms
        models['ada'] = AdaBoostRegressor(n_estimators=n_trees, random_state=42)
        models['bag'] = BaggingRegressor(n_estimators=n_trees)
        models['rf'] = RandomForestRegressor(n_estimators=n_trees, random_state=42)
        models['et'] = ExtraTreesRegressor(n_estimators=n_trees, random_state=42)
        models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    elif cluster == 2:
        models['llars'] = LassoLars()
        models['knn'] = KNeighborsRegressor(n_neighbors=7)
        models['et'] = ExtraTreesRegressor(n_estimators=n_trees, random_state=42)
        models['rf'] = RandomForestRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 3:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['cart'] = RandomizedSearchCV(DecisionTreeRegressor(), dtree_param,
                                            n_jobs=1, n_iter=100, random_state=42)
        models['rf'] = RandomForestRegressor(n_estimators=n_trees, random_state=42)
        models['et'] = ExtraTreesRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 5:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['huber'] = HuberRegressor()
        models['pa'] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3,
                                                  random_state=42)
        models['extra'] = ExtraTreeRegressor(random_state=42)
        models['svmr'] = SVR()
    elif cluster == 6:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['huber'] = HuberRegressor()
        models['svmr'] = SVR()
        models['rf'] = RandomForestRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 8:
        models['llars'] = LassoLars()
        models['svmr'] = SVR()
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['huber'] = HuberRegressor()
        models['et'] = ExtraTreesRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 9:
        models['cart'] = RandomizedSearchCV(DecisionTreeRegressor(), dtree_param,
                                            n_jobs=1, n_iter=100)
        models['bag'] = BaggingRegressor(n_estimators=n_trees)
        models['rf'] = RandomForestRegressor(n_estimators=n_trees)
        models['et'] = ExtraTreesRegressor(n_estimators=n_trees)
    elif cluster == 11:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['en'] = RandomizedSearchCV(ElasticNet(), elas_param,
                                          scoring='neg_mean_squared_error',
                                          n_jobs=1, n_iter=100, cv=10,
                                          random_state=42)
        models['extra'] = ExtraTreeRegressor(random_state=42)
        models['ada'] = AdaBoostRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 12:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['ada'] = AdaBoostRegressor(n_estimators=n_trees, random_state=42)
        models['knn'] = KNeighborsRegressor(n_neighbors=3)
        models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    elif cluster == 14:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['lasso'] = RandomizedSearchCV(Lasso(), lasso_param,
                                             n_jobs=1, n_iter=100)
        models['cart'] = RandomizedSearchCV(DecisionTreeRegressor(), dtree_param,
                                            n_jobs=1, n_iter=100)
        models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    elif cluster == 15:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['en'] = RandomizedSearchCV(ElasticNet(), elas_param,
                                          scoring='neg_mean_squared_error',
                                          n_jobs=1, n_iter=100, cv=10,
                                          random_state=42)
        models['huber'] = HuberRegressor()
        models['extra'] = ExtraTreeRegressor(random_state=42)
        models['ada'] = AdaBoostRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 17:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['ada'] = AdaBoostRegressor(n_estimators=n_trees, random_state=42)
        models['extra'] = ExtraTreeRegressor(random_state=42)
        models['bag'] = BaggingRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 18:
        models['lasso'] = RandomizedSearchCV(Lasso(), lasso_param,
                                             n_jobs=1, n_iter=100, random_state=42)
        models['ridge'] = RandomizedSearchCV(Ridge(), ridge_param,
                                             n_jobs=1, n_iter=100, random_state=42)
        models['cart'] = RandomizedSearchCV(DecisionTreeRegressor(), dtree_param,
                                            n_jobs=1, n_iter=100, random_state=42)
        models['extra'] = ExtraTreeRegressor(random_state=42)
    elif cluster == 20:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['huber'] = HuberRegressor()
        models['cart'] = RandomizedSearchCV(DecisionTreeRegressor(), dtree_param,
                                            n_jobs=1, n_iter=100, random_state=42)
        models['bag'] = BaggingRegressor(n_estimators=n_trees)
        models['ada'] = AdaBoostRegressor(n_estimators=n_trees, random_state=42)
        models['et'] = ExtraTreesRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 21:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['lasso'] = RandomizedSearchCV(Lasso(), lasso_param,
                                             n_jobs=1, n_iter=100, random_state=42)
        models['svmr'] = SVR()
        models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    elif cluster == 23:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['huber'] = HuberRegressor()
        models['bag'] = BaggingRegressor(n_estimators=n_trees)
        models['svmr'] = SVR()
        models['ada'] = AdaBoostRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 24:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['lasso'] = RandomizedSearchCV(Lasso(), lasso_param,
                                             n_jobs=1, n_iter=100, random_state=42)
        models['pa'] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3,
                                                  random_state=42)
        models['extra'] = ExtraTreeRegressor(random_state=42)
        models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    elif cluster == 26:
        models['lr'] = RandomizedSearchCV(LinearRegression(), lr_param,
                                          n_jobs=1, random_state=42)
        models['lasso'] = RandomizedSearchCV(Lasso(), lasso_param,
                                             n_jobs=1, n_iter=100, random_state=42)
        models['en'] = RandomizedSearchCV(ElasticNet(), elas_param,
                                          scoring='neg_mean_squared_error',
                                          n_jobs=1, n_iter=100, cv=10,
                                          random_state=42)
        models['extra'] = ExtraTreeRegressor(random_state=42)
        models['ada'] = AdaBoostRegressor(n_estimators=n_trees, random_state=42)
    elif cluster == 27:
        models['svmr'] = SVR()
        models['knn'] = KNeighborsRegressor(n_neighbors=3)
        models['bag'] = BaggingRegressor(n_estimators=n_trees)
        models['cart'] = RandomizedSearchCV(DecisionTreeRegressor(), dtree_param,
                                            n_jobs=1, n_iter=100, random_state=42)
        models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    return models
try_different_method(model)

## SVR regression
from sklearn import svm
model = svm.SVR()
try_different_method(model)

## KNN regression
from sklearn import neighbors
model = neighbors.KNeighborsRegressor()
try_different_method(model)

## Adaboost regression
from sklearn import ensemble
model = ensemble.AdaBoostRegressor(n_estimators=50)
try_different_method(model)

## GBRT regression
from sklearn import ensemble
model = ensemble.GradientBoostingRegressor(n_estimators=100)
try_different_method(model)

## ExtraTree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
model = ExtraTreeRegressor()
try_different_method(model)
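try_different_method is not defined in this excerpt. A minimal sketch of what such a helper typically does (it mirrors the fit/score/plot pattern used elsewhere in these examples), assuming x_train/x_test/y_train/y_test globals; this is a guess at the original helper, not its actual code:

import numpy as np
import matplotlib.pyplot as plt

def try_different_method(model):
    # Hypothetical helper: fit, score, and plot predictions against truth.
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    pred = model.predict(x_test)
    plt.plot(np.arange(len(pred)), y_test, 'go-', label='true value')
    plt.plot(np.arange(len(pred)), pred, 'ro-', label='predicted value')
    plt.title('score: %f' % score)
    plt.legend()
    plt.show()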
def loadData():
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    #id=train['Id']
    #train['Id']
    #print(train.head(3))

    # Map the target so it is approximately normally distributed
    train["SalePrice"] = np.log1p(train["SalePrice"])
    label = train['SalePrice']
    del train['SalePrice']

    # Merge the training and test sets
    train = pd.concat([train, test])

    # Numeric features (the index holds column names, the values hold dtypes)
    numeric_feats = train.dtypes[train.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]  # transform features with skewness > 0.75
    skewed_feats = skewed_feats.index
    train[skewed_feats] = np.log1p(train[skewed_feats])

    # Dummy-encode the categorical features
    train = pd.get_dummies(train)
    # Fill missing values with the mean
    train = train.fillna(train.mean())

    test = train[train['Id'] >= 1461]
    train = train[train['Id'] < 1461]
    del train['Id']
    sub = test[['Id']]
    del test['Id']

    # Model selection
    X_train, X_test, Y_train, Y_test = train_test_split(train, label, test_size=0.33)
    regs = [
        ['LassoCV', LassoCV(alphas=[1, 0.1, 0.001, 0.0005])],
        ['LinearRegression', LinearRegression()],
        ['Ridge', Ridge()],
        ['ElasticNet', ElasticNet()],
        ['RANSACRegressor', RANSACRegressor()],
        ['HuberRegressor', HuberRegressor()],
        ['DecisionTreeRegressor', DecisionTreeRegressor()],
        ['ExtraTreeRegressor', ExtraTreeRegressor()],
        ['AdaBoostRegressor', AdaBoostRegressor(n_estimators=150)],
        ['ExtraTreesRegressor', ExtraTreesRegressor(n_estimators=150)],
        ['GradientBoostingRegressor', GradientBoostingRegressor(n_estimators=150)],
        ['RandomForestRegressor', RandomForestRegressor(n_estimators=150)],
        ['XGBRegressor', XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)],
    ]
    preds = []
    for reg_name, reg in regs:
        print(reg_name)
        reg.fit(X_train, Y_train)
        y_pred = reg.predict(X_test)
        if np.sum(y_pred < 0) > 0:
            print('y_pred has ' + str(np.sum(y_pred < 0)) +
                  ' negative values; we replace them with the median of y_pred')
            y_pred[y_pred < 0] = np.median(y_pred)
        score = np.sqrt(mean_squared_error(np.log(y_pred), np.log(Y_test)))
        print()
        preds.append([reg_name, y_pred])

    final_results = []
    for comb_len in range(1, len(regs) + 1):
        print("Model num:" + str(comb_len))
        results = []
        # Take one combination of models; e.g. for comb_len=2,
        # comb is (['Lasso', y_pred], ['Ridge', y_pred])
        for comb in itertools.combinations(preds, comb_len):
            pred_sum = 0
            model_name = []
            for reg_name, pre in comb:
                pred_sum += pre
                model_name.append(reg_name)
            pred_sum /= comb_len
            model_name = '+'.join(model_name)
            score = np.sqrt(mean_squared_error(np.log(np.expm1(pred_sum)),
                                               np.log(np.expm1(Y_test))))
            results.append([model_name, score])
        # Sort the blended models by score
        results = sorted(results, key=lambda x: x[1])
        for model_name, score in results:
            print(model_name + ":" + str(score))
        print()
        final_results.append(results[0])

    print("best set of models")
    print()
    for i in final_results:
        print(i)

    # Choose the final models
    result = 0
    choose_model = [
        LassoCV(alphas=[1, 0.1, 0.001, 0.0005]),
        GradientBoostingRegressor(n_estimators=150),
        XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)
    ]
    for model in choose_model:
        reg = model.fit(train, label)
        pre = reg.predict(test)
        result += pre
    result /= 3

    # Write the submission file ('w' with newline='' replaces Python 2's 'wb')
    result = np.expm1(result)
    sub['SalePrice'] = result
    rows = [[int(x[0]), x[1]] for x in sub.values]
    with open("submission.csv", 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Id', 'SalePrice'])
        for i in range(len(rows)):
            writer.writerow(rows[i])
    test_size=0.50, random_state=42)

"""##Model Selection"""

from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
r2_score(y_test, rfr.predict(X_test))
mean_squared_error(y_test, rfr.predict(X_test))

# Fit the tree before inspecting its importances
forest = ExtraTreeRegressor()
forest.fit(X_train, y_train)
X_train.columns.shape
forest.feature_importances_.shape

importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the impurity-based feature importances of the tree
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]),
print('Done formatting')

# 14-day period tests
X, y = formatData(dailyOpen, dailyClose, dailyHigh, dailyLow, dailyVolume)
testX, testy = dropOut(X, y)
print("done")

# Note: X is scaled here and then scaled a second time into X_scale below.
X = preprocessing.scale(np.asarray(X))
X_scale = preprocessing.scale(X)
y = np.asarray(y)
testX = preprocessing.scale(np.asarray(testX))
testy = np.asarray(testy)

clf = DecisionTreeRegressor(max_depth=None, min_samples_split=2, random_state=0).fit(X, y)
clfE = ExtraTreeRegressor(max_depth=None, min_samples_split=2, random_state=0).fit(X, y)

scores = cross_val_score(clf, X, y, cv=5)
scoresE = cross_val_score(clfE, X, y, cv=5)
print('Training Decision', scores.mean())
print('Training Extra', scoresE.mean())

unseen = cross_val_score(clf, testX, testy, cv=5)
unseenE = cross_val_score(clfE, testX, testy, cv=5)
print('New Data Decision', unseen.mean())
print('New Data Extra', unseenE.mean())

defaultPrdict = clf.predict(testX)
#defaultPrdictLog = clf.predict_proba(testX)
extraPrdict = clfE.predict(testX)
#extraPrdictLog = clfE.predict_proba(testX)
def extra_tree_regressor(self):
    x_train, x_test, y_train, y_test = self.preprocessing()
    model = ExtraTreeRegressor()
    y_pred = model.fit(x_train, y_train).predict(x_test)
    self.printing(y_test, y_pred, 'Extra Tree')
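The printing helper is not shown in this excerpt. A plausible sketch of such a method, assuming it reports standard regression metrics; this is a guess, not the original implementation:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def printing(self, y_test, y_pred, name):
    # Hypothetical reporting helper: print common regression metrics.
    print(name)
    print('  MAE:', mean_absolute_error(y_test, y_pred))
    print('  MSE:', mean_squared_error(y_test, y_pred))
    print('  R^2:', r2_score(y_test, y_pred))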
def SVR_train(*data):
    X, Y = data

    #### 3.1 Decision tree regression ####
    from sklearn import tree
    model_DecisionTreeRegressor = tree.DecisionTreeRegressor()
    #### 3.2 Linear regression ####
    from sklearn import linear_model
    model_LinearRegression = linear_model.LinearRegression()
    #### 3.3 SVM regression ####
    from sklearn import svm
    model_SVR = svm.SVR()
    model_SVR2 = svm.SVR(kernel='rbf', C=100, gamma=0.1)
    #### 3.4 KNN regression ####
    from sklearn import neighbors
    model_KNeighborsRegressor = neighbors.KNeighborsRegressor()
    #### 3.5 Random forest regression ####
    from sklearn import ensemble
    model_RandomForestRegressor = ensemble.RandomForestRegressor(n_estimators=20)  # 20 trees
    #### 3.6 Adaboost regression ####
    from sklearn import ensemble
    model_AdaBoostRegressor = ensemble.AdaBoostRegressor(n_estimators=50)  # 50 trees
    #### 3.7 GBRT regression ####
    from sklearn import ensemble
    model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(n_estimators=100)  # 100 trees
    #### 3.8 Bagging regression ####
    from sklearn.ensemble import BaggingRegressor
    model_BaggingRegressor = BaggingRegressor()
    #### 3.9 ExtraTree (extremely randomized tree) regression ####
    from sklearn.tree import ExtraTreeRegressor
    model_ExtraTreeRegressor = ExtraTreeRegressor()

    # Create the (parametrised) models
    # print("Hit Rates/Confusion Matrices:\n")
    models = [
        ("model_DecisionTreeRegressor", model_DecisionTreeRegressor),
        ("model_LinearRegression", model_LinearRegression),
        ("model_SVR", model_SVR2),  # model_SVR
        ("model_KNeighborsRegressor", model_KNeighborsRegressor),
        ("model_RandomForestRegressor", model_RandomForestRegressor),
        ("model_AdaBoostRegressor", model_AdaBoostRegressor),
        ("model_GradientBoostingRegressor", model_GradientBoostingRegressor),
        ("model_BaggingRegressor", model_BaggingRegressor),
        ("model_ExtraTreeRegressor", model_ExtraTreeRegressor)
    ]
    for m in models:
        #X = X.reset_index(drop=True)
        #print(X)
        #y = y.reset_index(drop=True)
        #print(y)
        from sklearn.model_selection import KFold
        kf = KFold(n_splits=2, shuffle=False)
        for train_index, test_index in kf.split(X):
            # print(train_index, test_index)
            # print(X.loc[[0,1,2]])
            # X_train, y_train are the training set of this fold;
            # X_test, y_test serve as the validation set
            X_train, X_test, y_train, y_test = (X[train_index], X[test_index],
                                                Y[train_index], Y[test_index])
            #print(X_test, y_test)
            #print(X_train, y_train)
            print('======================================')
            import datetime
            starttime = datetime.datetime.now()
            print("Training the %s model:" % m[0])
            m[1].fit(X_train, y_train)
            # Make an array of predictions on the test set
            pred = m[1].predict(X_test)
            # Output the test-set score for each model
            score = m[1].score(X_test, y_test)
            print("%s:\n%0.3f" % (m[0], score))
            # print("%s\n" % confusion_matrix(y_test, pred, labels=[-1.0, 1.0]))
            from sklearn.metrics import r2_score
            r2 = r2_score(y_test, pred)
            print('r2: ', r2)
            endtime = datetime.datetime.now()
            print('%s training and prediction time, in seconds:' % m[0],
                  (endtime - starttime).seconds)
            #result = m[1].predict(X_test)
            import matplotlib.pyplot as plt
            plt.figure()
            plt.plot(np.arange(len(pred)), y_test, 'go-', label='true value')
            plt.plot(np.arange(len(pred)), pred, 'ro-', label='predict value')
            plt.title('score: %f' % score)
            plt.legend()
            plt.show()
def getModel(x, y):
    et = ExtraTreeRegressor()
    et.fit(x, y)
    #joblib.dump(et, './model/et')  # save the model
    return et
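The commented line hints at persisting the model with joblib. A minimal sketch of the save/load round trip, assuming the getModel helper above; the toy data and path are illustrative:

import joblib
from sklearn.datasets import make_regression

# Hedged sketch: persist the fitted tree with joblib and reload it.
x, y = make_regression(n_samples=100, n_features=5, random_state=0)
et = getModel(x, y)
joblib.dump(et, './model/et')  # assumes the ./model directory exists
restored = joblib.load('./model/et')
print(restored.predict(x[:3]))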
print('mean_absolute_error', mean_absolute_error(y_test, DTR_prediction))
print('mean_squared_error', mean_squared_error(y_test, DTR_prediction))

# # HOMEWORK

from sklearn.tree import ExtraTreeRegressor

ETR = ExtraTreeRegressor()
ETR  # in the original notebook this cell displayed the estimator's repr

ETR.fit(x, y)
build_auto(GradientBoostingRegressor(n_estimators=31, random_state=13), "GradientBoostingAuto")
build_auto(IsolationForest(n_estimators=31, random_state=13), "IsolationForestAuto")
build_auto(LGBMRegressor(objective="regression", n_estimators=31, random_state=13), "LightGBMAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(RandomForestRegressor(n_estimators=17, random_state=13), "RandomForestAuto", compact=False, flat=False)
build_auto(
    VotingRegressor(estimators=[
        ("major", DecisionTreeRegressor(max_depth=8, random_state=13)),
        ("minor", ExtraTreeRegressor(max_depth=5, random_state=13))
    ], weights=[0.7, 0.3]), "VotingEnsembleAuto")
build_auto(XGBRegressor(objective="reg:squarederror", n_estimators=31, random_state=13), "XGBoostAuto")
sparsify("Auto")
auto_X, auto_y = load_auto("AutoNA")
if ("Auto" in datasets) or ("AutoNA" in datasets):
    build_auto(LGBMRegressor(objective="regression", n_estimators=31, random_state=13), "LightGBMAutoNA")
mse_t = []
rmse_t = []
mae_t = []
mdae_t = []
evs_t = []
r2_t = []
for tr_i, ts_i in rkf.split(data):
    print(i, j, k, c)
    train, test = data.iloc[tr_i], data.iloc[ts_i]
    train_x = train.drop(columns=['Rainfall'])
    train_y = train['Rainfall']
    test_x = test.drop(columns=['Rainfall'])
    test_y = test['Rainfall']
    # criterion='mse' was renamed to 'squared_error' in newer scikit-learn releases
    model = ExtraTreeRegressor(criterion='mse', splitter='best',
                               max_depth=i, min_samples_leaf=j,
                               min_samples_split=k)
    model.fit(train_x, train_y)
    ts_p = model.predict(test_x)
    mse_t.append(mse(test_y, ts_p))
    rmse_t.append(rmse(test_y, ts_p))
    mae_t.append(mae(test_y, ts_p))
    mdae_t.append(mdae(test_y, ts_p))
    evs_t.append(evs(test_y, ts_p))
    r2_t.append(r2(test_y, ts_p))
    c += 1
dep_f.append(i)
saml_f.append(j)
sams_f.append(k)
mse_f.append(np.mean(mse_t))
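The metric helpers mse, rmse, mae, mdae, evs and r2 are not defined in this excerpt. A plausible set of aliases built on sklearn.metrics, assuming that is what the original used; they would need to be defined before the loop above:

import numpy as np
from sklearn.metrics import (mean_squared_error, mean_absolute_error,
                             median_absolute_error, explained_variance_score,
                             r2_score)

# Hypothetical aliases matching the names used in the loop above.
mse = mean_squared_error
rmse = lambda y, p: np.sqrt(mean_squared_error(y, p))
mae = mean_absolute_error
mdae = median_absolute_error
evs = explained_variance_score
r2 = r2_score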
from sklearn import ensemble

# 6
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor()
model_heads.append("Gradient Boosting Regression\t")
models.append(model_GradientBoostingRegressor)

from sklearn.ensemble import BaggingRegressor

# 7
model_BaggingRegressor = BaggingRegressor()
model_heads.append("Bagging Regression\t\t\t\t")
models.append(model_BaggingRegressor)

from sklearn.tree import ExtraTreeRegressor

# 8
model_ExtraTreeRegressor = ExtraTreeRegressor()
model_heads.append("ExtraTree Regression\t\t\t")
models.append(model_ExtraTreeRegressor)

import xgboost as xgb

# 9
model_XGBoostRegressor = xgb.XGBRegressor()
model_heads.append("XGBoost Regression\t\t\t\t")
models.append(model_XGBoostRegressor)
########## Model Adding Ends ###########


def load_data(x_path='./X_train.csv', y_path='./y_train.csv', x_test_path='./X_test.csv'):
    """
def get_models(self, list_chosen):
    """Generate a library of base learners (Prophet works only if the data
    have the target in a pandas column named 'y' and a feature column with
    the time data named 'ds').

    :param list_chosen: list with the names of the models to load
    :return: models, a dictionary indexed by model name, holding the models"""
    linreg = LinearRegression(normalize=True, fit_intercept=True)
    dtr = DecisionTreeRegressor(random_state=self.SEED, min_samples_split=0.018,
                                min_samples_leaf=0.007, max_depth=25)
    svrr = SVR(kernel='linear', epsilon=5)
    br = BaggingRegressor(n_estimators=350, max_samples=0.9, max_features=0.7,
                          bootstrap=False, random_state=self.SEED)
    ada = AdaBoostRegressor(n_estimators=7, loss='exponential',
                            learning_rate=0.01, random_state=self.SEED)
    rf = RandomForestRegressor(n_estimators=1000, max_depth=30,
                               max_leaf_nodes=1000, random_state=self.SEED)
    gbr = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,
                                    random_state=self.SEED)
    xgbr1 = xgb.XGBRegressor(random_state=self.SEED)
    mdl = LGBMRegressor(n_estimators=1000, learning_rate=0.01)
    las = Lasso()
    rid = Ridge()
    en = ElasticNet()
    huber = HuberRegressor(max_iter=2000)
    lasl = LassoLars(max_iter=2000, eps=1, alpha=0.5, normalize=False)
    pa = PassiveAggressiveRegressor(C=1, max_iter=4000, random_state=self.SEED)
    sgd = SGDRegressor(max_iter=2000, tol=1e-3)
    knn = KNeighborsRegressor(n_neighbors=20)
    ex = ExtraTreeRegressor()
    exs = ExtraTreesRegressor(n_estimators=1000)
    pro = Prophet(changepoint_prior_scale=0.01)
    models_temp = {
        'BaggingRegressor': br,
        'RandomForestRegressor': rf,
        'GradientBoostingRegressor': gbr,
        'XGBRegressor': xgbr1,
        'LGBMRegressor': mdl,
        'ExtraTreesRegressor': exs,
        'LinearRegression': linreg,
        'SVR': svrr,
        'AdaBoostRegressor': ada,
        'LassoLars': lasl,
        'PassiveAggressiveRegressor': pa,
        'SGDRegressor': sgd,
        'DecisionTreeRegressor': dtr,
        'lasso': las,
        'ridge': rid,
        'ElasticNet': en,
        'HuberRegressor': huber,
        'KNeighborsRegressor': knn,
        'ExtraTreeRegressor': ex,
        'Prophet': pro
    }
    models = dict()
    for model in list_chosen:
        if model in models_temp:
            models[model] = models_temp[model]
    return models
])
BL_LT_prepared = full_pipeline.fit_transform(BL_LT)  # run preprocessing

BL_LT_prepared_train, \
BL_LT_prepared_test, \
BL_LT_labels_train, \
BL_LT_labels_test = train_test_split(
    BL_LT_prepared, BL_LT_labels,
    test_size=0.10, random_state=42)  # split train:test = 9:1

from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.svm import SVR

ada_et_tree_reg = AdaBoostRegressor(
    ExtraTreeRegressor(max_depth=200, random_state=42),
    n_estimators=60, learning_rate=0.5, random_state=42)
# train the AdaBoostRegressor(ExtraTreeRegressor) model
ada_et_tree_reg.fit(BL_LT_prepared_train, BL_LT_labels_train)
# test the AdaBoostRegressor(ExtraTreeRegressor) model
BL_LT_predicted = ada_et_tree_reg.predict(BL_LT_prepared_test)
# compute MAPE for AdaBoostRegressor(ExtraTreeRegressor)
ada_et_tree_mape_sub = (np.abs((BL_LT_predicted - BL_LT_labels_test)
                               / BL_LT_labels_test).mean(axis=0))

ada_svr_reg = AdaBoostRegressor(
    SVR(C=1000, degree=3, kernel='rbf'),
    n_estimators=30, learning_rate=0.1, random_state=42)
# train the AdaBoostRegressor(SVR) model
ada_svr_reg.fit(BL_LT_prepared_train, BL_LT_labels_train)
# test the AdaBoostRegressor(SVR) model
BL_LT_predicted = ada_svr_reg.predict(BL_LT_prepared_test)
# compute MAPE for AdaBoostRegressor(SVR)
ada_svr_mape_sub = (np.abs((BL_LT_predicted - BL_LT_labels_test)
                           / BL_LT_labels_test).mean(axis=0))

from sklearn.svm import SVR

svm_rbf_reg = SVR(C=10, cache_size=200, coef0=0.0, degree=3,
def build_voting_tree_regressor(X, y, max_features, max_depth, min_samples_split):
    clf = ExtraTreeRegressor(max_features=max_features, max_depth=max_depth,
                             min_samples_split=min_samples_split)
    clf = clf.fit(X, y)
    return clf
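A short usage sketch, assuming only the builder above; the data and hyperparameter values are illustrative, not from the original:

from sklearn.datasets import make_regression

# Hedged sketch: build one randomized tree for a voting-style ensemble.
X, y = make_regression(n_samples=300, n_features=8, random_state=1)
tree = build_voting_tree_regressor(X, y, max_features='sqrt',
                                   max_depth=10, min_samples_split=4)
print(tree.score(X, y))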
def test_extra_tree_regressor(self):
    model = ExtraTreeRegressor()
    dump_single_regression(model)
    dump_multiple_regression(model)