hour_list.append(date.hour)  # last statement of the datetime-splitting loop; its header lies above this excerpt

# Attach the calendar features pulled out of the timestamp, then drop the raw column.
rental_data["month"] = np.array(month_list)
rental_data["day"] = np.array(day_list)
rental_data["hour"] = np.array(hour_list)
del rental_data["datetime"]

# Shuffle the rows before splitting.
rental_data = rental_data.iloc[np.random.permutation(len(rental_data))]

rental_counts = rental_data["count"].values
# The target column must not be part of the feature matrix: the original passed
# rental_data.values (which still contains "count") as the features, so every
# model could simply read the answer off its input.
# NOTE(review): any other column derived from the target that is still present
# here would leak in the same way -- confirm against the loading code.
rental_features = rental_data.drop("count", axis=1).values

train_data, test_data, train_counts, test_counts = cross_validation.train_test_split(
    rental_features, rental_counts, test_size=0.2)

rf = RandomForestRegressor(n_estimators=101)
ada = AdaBoostRegressor(n_estimators=101)
grad = GradientBoostingRegressor(n_estimators=101)
bagging = BaggingRegressor(n_estimators=101)
svr = SVR()

regressors = [rf, ada, grad, bagging, svr]
regressor_names = ["Random Forests", "Adaboost Regressor", "Gradient Boost Regressor",
                   "Bagging Regressor", "Support Vector Regressor"]

# (label, metric function) pairs reported for every regressor, in print order.
report_metrics = [
    ("Mean Absolute Error", metrics.mean_absolute_error),
    ("Median Absolute Error", metrics.median_absolute_error),
    ("Mean Squared Error", metrics.mean_squared_error),
    ("R2 Score", metrics.r2_score),
]

for regressor, regressor_name in zip(regressors, regressor_names):
    regressor.fit(train_data, train_counts)
    predicted_counts = regressor.predict(test_data)
    # Single-argument print(...) behaves the same under Python 2 and 3; the
    # format string reproduces the spacing of the former print statements.
    print("-----------------------------------------\n")
    for metric_label, metric_func in report_metrics:
        print("%s for  %s  :  %s" % (
            metric_label, regressor_name, metric_func(test_counts, predicted_counts)))
reducer__svd_solver=['auto'])  # closes a dict(...) call opened above this excerpt

# Parameter grids for the feature reducers, keyed by reducer class name.
# NOTE(review): the empty lists (reducer__param, reducer__n_features_to_select)
# are presumably filled in elsewhere before a search runs -- confirm.
reducers_cfg[GenericUnivariateSelect.__name__] = dict(
    reducer__score_func=[f_regression],
    reducer__mode=['k_best'],
    reducer__param=[])
reducers_cfg[RFE.__name__] = dict(reducer__n_features_to_select=[],
                                  reducer__step=[0.1])

#########################
####### Models ##########
#########################
# Earlier, broader candidate lists kept for reference:
#models = [LinearSVC(),MLPClassifier(),GradientBoostingClassifier(),RandomForestClassifier(),LogisticRegression()]
#models = [AdaBoostClassifier(),BaggingClassifier(),ExtraTreesClassifier(),GradientBoostingClassifier(),RandomForestClassifier(),PassiveAggressiveClassifier(),LogisticRegression(),RidgeClassifier(),SGDClassifier(),GaussianNB(),MultinomialNB(),KNeighborsClassifier(),RadiusNeighborsClassifier(),NearestCentroid(),MLPClassifier(),SVC(),LinearSVC(),NuSVC(),DecisionTreeClassifier(),ExtraTreeClassifier()]
#models_reg = [AdaBoostRegressor(),BaggingRegressor(),ExtraTreesRegressor(),GradientBoostingRegressor(),RandomForestRegressor(),ElasticNet(),HuberRegressor(),Lasso(),LassoLars(),LinearRegression(),PassiveAggressiveRegressor(),Ridge(),SGDRegressor(),OrthogonalMatchingPursuit(),RANSACRegressor(),KNeighborsRegressor(),RadiusNeighborsRegressor(),MLPRegressor(),SVR(),LinearSVR(),NuSVR(),DecisionTreeRegressor(),ExtraTreeRegressor()]

# Regression and classification models currently in use.
models_reg = [
    BaggingRegressor(),
    ExtraTreesRegressor(),
    GradientBoostingRegressor()
]
models_class = [GradientBoostingClassifier()]

# Per-model parameter grids; both start empty here.
models_class_cfg = {}
models_cfg = {}

# Full parameter grids -- disabled (reported as not working) by wrapping them in
# a string literal that continues past the end of this excerpt.
#full params - dont work
'''
models_cfg[BaggingClassifier.__name__] = dict(
    model__n_estimators = [10, 50, 100, 130],
    model__bootstrap = [True, False],
    model__bootstrap_features = [True, False],
)
def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers = 5, verbose = True):
    """Fit the first ``no_classifiers`` entries of ``dict_classifiers`` and score them.

    ``dict_classifiers`` is a module-level mapping of name -> estimator that is
    defined outside this function.

    Parameters
    ----------
    X_train, Y_train : training features and labels.
    X_test, Y_test : held-out features and labels.
    no_classifiers : how many entries of ``dict_classifiers`` to use.
    verbose : when true, print the training time of each classifier.

    Returns
    -------
    dict mapping classifier name to a dict with the keys ``'model'``,
    ``'train_score'``, ``'test_score'`` and ``'train_time'`` (seconds).
    """
    dict_models = {}
    for classifier_name, classifier in list(dict_classifiers.items())[:no_classifiers]:
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        t_start = time.perf_counter()
        classifier.fit(X_train, Y_train)
        t_diff = time.perf_counter() - t_start

        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)

        dict_models[classifier_name] = {'model': classifier,
                                        'train_score': train_score,
                                        'test_score': test_score,
                                        'train_time': t_diff}
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=classifier_name, f=t_diff))
    return dict_models


def display_dict_models(dict_models, sort_by='test_score'):
    """Display the results of ``batch_classify`` as a table sorted by ``sort_by``.

    The table has the columns ``classifier``, ``train_score``, ``test_score``
    and ``train_time`` and is shown, highest ``sort_by`` first, through the
    ``display`` helper available in the calling environment.
    """
    cls = list(dict_models)
    # Build the frame directly from the collected columns. The previous version
    # created an all-zero float frame and then wrote strings into it cell by
    # cell, which relies on implicit dtype changes.
    df_ = pd.DataFrame(
        {
            'classifier': cls,
            'train_score': [dict_models[key]['train_score'] for key in cls],
            'test_score': [dict_models[key]['test_score'] for key in cls],
            'train_time': [dict_models[key]['train_time'] for key in cls],
        },
        columns=['classifier', 'train_score', 'test_score', 'train_time'])
    display(df_.sort_values(by=sort_by, ascending=False))


# classification trees
def classification_tree(X_train, y_train):
    """Fit a depth-2 decision tree and export it to ``classification_tree.dot``.

    Returns the fitted tree (the earlier version returned nothing).
    """
    tree_clf = DecisionTreeClassifier(max_depth = 2)
    # Fixed: the body referenced the undefined name ``Y_train`` instead of the
    # ``y_train`` parameter.
    tree_clf.fit(X_train, y_train)

    from sklearn.tree import export_graphviz
    export_graphviz(tree_clf, out_file = "classification_tree.dot", max_depth = None,
                    feature_names = None, rounded = True, filled = True)
    return tree_clf


# ensemble learners
def ensemble_learner(x, y):
    """Fit and return a soft-voting ensemble of three classifiers.

    The earlier body combined a LogisticRegression with a RandomForestRegressor
    and a LinearSVR inside a soft-voting VotingClassifier, never fitted it and
    returned nothing. Soft voting averages ``predict_proba`` outputs, which
    regressors do not provide, so the members are now classifiers (the SVC with
    ``probability=True``), and the ensemble is fitted on ``(x, y)`` and returned.
    """
    # Local imports: these names are not known to be imported at file level.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC

    log_clf = LogisticRegression()
    rnd_clf = RandomForestClassifier()
    svm_clf = SVC(probability=True)

    voting_clf = VotingClassifier(
        estimators = [('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
        voting = 'soft')
    voting_clf.fit(x, y)
    return voting_clf


#bootstrap aggregation
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score


def bagging_regressor(x, y, n_estimators = 10, max_samples = 50):
    """Fit a bagged ensemble of decision-tree regressors and predict on ``x``.

    The earlier body was not valid Python (``for max_number_samples = len(x):``)
    and read an undefined ``n_estimators``. Both settings are now parameters
    with defaults, so existing ``bagging_regressor(x, y)`` calls keep working.

    Parameters
    ----------
    x, y : training features and targets.
    n_estimators : number of trees in the ensemble.
    max_samples : rows drawn (with replacement) for each tree; capped at
        ``len(x)`` so that small inputs do not make the fit fail.

    Returns
    -------
    The ensemble's predictions for ``x``.
    """
    max_samples = min(max_samples, len(x))
    # n_jobs = -1 uses all available processors.
    bag_reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators = n_estimators,
                               max_samples = max_samples, bootstrap = True, n_jobs = -1)
    bag_reg.fit(x, y)
    y_pred = bag_reg.predict(x)
    return y_pred
nJOBS = 8  # worker count handed to every estimator that accepts n_jobs
cnt = 0    # index into `labels`; NOTE(review): presumably advanced further down the loop body, which continues past this excerpt
for attrib in attribs:
    workARR = attrib
    label = labels[cnt]
    print(label)

    # One fresh, unfitted estimator of each kind per attribute set.
    randForrC = RandomForestClassifier(n_jobs=nJOBS, n_estimators=500)
    randForrR = RandomForestRegressor(n_jobs=nJOBS, n_estimators=500)
    adaBoostC = AdaBoostClassifier(n_estimators=250)
    adaBoostR = AdaBoostRegressor(n_estimators=250)
    bagCoobN = BaggingClassifier(n_estimators=500, n_jobs=nJOBS)
    bagRoobN = BaggingRegressor(n_estimators=500, n_jobs=nJOBS)
    bagCoobY = BaggingClassifier(n_estimators=250, oob_score=True, n_jobs=nJOBS)
    bagRoobY = BaggingRegressor(n_estimators=250, oob_score=True, n_jobs=nJOBS)
    bernNB = BernoulliNB()
    # max_iter is an iteration count and must be an integer; the float literal
    # 1e9 is rejected by scikit-learn releases that validate parameter types.
    gausRidge = linear_model.Ridge(max_iter=int(1e9), tol=1e-6)
    gradBoostC = GradientBoostingClassifier(n_estimators=500, max_depth=2500)
    gradBoostR = GradientBoostingRegressor(n_estimators=500, max_depth=2500)
    sdgC = linear_model.SGDClassifier(n_jobs=nJOBS)
    sdgR = linear_model.SGDRegressor()
def _rmse_line(predicted, actual):
    """Return the ``RMSE: <value>`` report line for one set of predictions."""
    return "RMSE: {}".format(sqrt(mean_squared_error(predicted, actual)))


# Compare the averaged bagging predictions with the target variable of the
# test data to evaluate performance.
print(_rmse_line(bagging_predict, test_y))  # RMSE

## Fit a linear regression on the training data, then validate on the evaluation data (scikit-learn)
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()  # linear regression model
linear_model1 = regression_model.fit(train_x, train_y)  # fit on the training data
predict1 = linear_model1.predict(test_x)  # predict the evaluation data
print(_rmse_line(predict1, test_y))  # RMSE result

## Bagging over the linear regression model (5 base estimators), then evaluate
from sklearn.ensemble import BaggingRegressor

bagging_model = BaggingRegressor(
    base_estimator = regression_model,  # linear regression as base model
    n_estimators = 5,                   # 5 base estimators
    verbose = 1,                        # show training progress
)
linear_model2 = bagging_model.fit(train_x, train_y)  # run the training
predict2 = linear_model2.predict(test_x)  # predict the evaluation data with the bagged model
print(_rmse_line(predict2, test_y))  # RMSE result

## Now try a larger number of base estimators
bagging_model2 = BaggingRegressor(
    base_estimator = regression_model,  # linear regression as base model
    n_estimators = 30,                  # 30 base estimators
    verbose = 1,                        # show training progress
)
linear_model3 = bagging_model2.fit(train_x, train_y)  # run the training
predict3 = linear_model3.predict(test_x)  # predict the evaluation data with the bagged model
print(_rmse_line(predict3, test_y))  # RMSE result

## Fit a decision tree model on the training data, then validate on the evaluation data
def __init__(self):
    '''
    Define the estimators to be used for predicting returns.

    Every estimator is wrapped in a Pipeline whose first step ('normalizer')
    is a StandardScaler and whose last step ('clf') is the estimator; the
    ``*_PCA`` variants insert a PCA step ('PCA') with ``N_COMPONENTS``
    components in between. The ``*_param`` / ``*_params`` attributes are
    parameter grids in the list-of-dicts form used by GridSearchCV, with
    keys addressed to the 'clf' step.
    '''
    self.N_NEIGBORS = 10
    self.KERNELS = ['linear', 'rbf']
    self.GAMMA = [0.0001, 0.001, 0.01, 1]
    self.CRITERION = ['gini', 'entropy']
    self.MAX_DEPTH = 5
    self.MAX_FEATURES = ['auto', 'sqrt', 'log2']
    self.N_VALIDATION = 2
    self.N_COMPONENTS = 2
    self.BEST_ACCURACY = 0.0
    self.BEST_CLASSIFIER = 0
    self.BEST_GRIDSEARCH = ''

    def _pipeline(estimator_cls, with_pca=False):
        # Build scaler -> [PCA] -> estimator with fresh instances on each call.
        steps = [('normalizer', StandardScaler())]
        if with_pca:
            steps.append(('PCA', PCA(n_components=self.N_COMPONENTS)))
        steps.append(('clf', estimator_cls()))
        return Pipeline(steps)

    #Support vector regressor
    self.pipe_SVR = _pipeline(SVR)
    self.pipe_SVR_PCA = _pipeline(SVR, with_pca=True)
    #Adaboost regressor
    self.pipe_AdaBoostRegressor = _pipeline(AdaBoostRegressor)
    self.pipe_AdaBoostRegressor_PCA = _pipeline(AdaBoostRegressor, with_pca=True)
    #RandomForest Regressor
    self.pipe_RandomForestRegressor = _pipeline(RandomForestRegressor)
    self.pipe_RandomForestRegressor_PCA = _pipeline(RandomForestRegressor, with_pca=True)
    #Gradient boosting regressor
    self.pipe_GradientBoostingRegressor = _pipeline(GradientBoostingRegressor)
    self.pipe_GradientBoostingRegressor_PCA = _pipeline(GradientBoostingRegressor, with_pca=True)
    #Bagging regressor
    self.pipe_BaggingRegressor = _pipeline(BaggingRegressor)
    self.pipe_BaggingRegressor_PCA = _pipeline(BaggingRegressor, with_pca=True)
    #Extratrees regressor
    self.pipe_ExtraTreesRegressor = _pipeline(ExtraTreesRegressor)
    self.pipe_ExtraTreesRegressor_PCA = _pipeline(ExtraTreesRegressor, with_pca=True)
    #DecisionTreeRegressor
    self.pipe_DecisionTreeRegressor = _pipeline(DecisionTreeRegressor)
    self.pipe_DecisionTreeRegressor_PCA = _pipeline(DecisionTreeRegressor, with_pca=True)
    #KNeighborsRegressor
    self.pipe_KNeighborsRegressor = _pipeline(KNeighborsRegressor)
    self.pipe_KNeighborsRegressor_PCA = _pipeline(KNeighborsRegressor, with_pca=True)
    #RadiusNeighborsRegressor
    self.pipe_RadiusNeighborsRegressor = _pipeline(RadiusNeighborsRegressor)
    self.pipe_RadiusNeighborsRegressor_PCA = _pipeline(RadiusNeighborsRegressor, with_pca=True)
    #LinearRegression
    self.pipe_LinearRegression = _pipeline(LinearRegression)
    self.pipe_LinearRegression_PCA = _pipeline(LinearRegression, with_pca=True)
    #LogisticRegression
    self.pipe_LogisticRegression = _pipeline(LogisticRegression)
    self.pipe_LogisticRegression_PCA = _pipeline(LogisticRegression, with_pca=True)
    #RANSACRegressor
    self.pipe_RANSACRegressor = _pipeline(RANSACRegressor)
    self.pipe_RANSACRegressor_PCA = _pipeline(RANSACRegressor, with_pca=True)
    #Ridge
    self.pipe_Ridge = _pipeline(Ridge)
    self.pipe_Ridge_PCA = _pipeline(Ridge, with_pca=True)
    #Lasso
    self.pipe_Lasso = _pipeline(Lasso)
    # Fixed: this pipeline used to be assigned to pipe_Ridge_PCA, silently
    # replacing the Ridge+PCA pipeline with a Lasso one and leaving no
    # pipe_Lasso_PCA attribute at all.
    self.pipe_Lasso_PCA = _pipeline(Lasso, with_pca=True)

    self.pipe_KNN_param = [{
        'clf__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'clf__leaf_size': [1, 2, 3, 5],
        'clf__weights': ['uniform', 'distance'],
        # Fixed: the key lacked the 'clf__' step prefix (so it addressed the
        # Pipeline itself, which has no such parameter) and the k-d tree
        # option was spelled 'kdtree'; the accepted value is 'kd_tree'.
        'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }]

    self.pipe_SVR_params = [{
        'clf__kernel': self.KERNELS,
        'clf__C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'clf__gamma': self.GAMMA
    }]

    self.pipe_AdaBoostRegressor_param = [{
        'clf__n_estimators': np.arange(1, 50),
        'clf__learning_rate': [0.01, 0.05, 0.1, 0.3, 1],
        'clf__loss': ['linear', 'square', 'exponential']
    }]

    # NOTE(review): CRITERION holds classification criteria ('gini',
    # 'entropy'); a RandomForestRegressor expects regression criteria, so
    # this grid presumably fails when searched -- confirm and pick the
    # values valid for the installed scikit-learn version.
    self.pipe_RandomForestRegressor_params = [{
        'clf__criterion': self.CRITERION,
        'clf__max_depth': np.arange(2, 10),
        'clf__min_samples_split': np.arange(2, 10, 1),
        'clf__min_samples_leaf': np.arange(2, 10, 1),
        'clf__max_leaf_nodes': np.arange(2, 10, 1)
    }]

    self.pipe_GradientBoostingRegressor_param = [{
        'clf__max_depth': np.arange(2, 10)
    }]

    self.pipe_GaussianNB_params = [{'clf__priors': [None]}]

    self.pipe_GaussianProcessClassifier_params = [{
        'clf__kernel': [1**2 * RBF(1.0)]
    }]

    self.pipe_LogisticRegression_params = [{
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 0.5, 0.1],
        'clf__solver': ['liblinear']
    }]

    self.QuadraticDiscriminantAnalysis_params = [{'clf__priors': [None]}]
def sklearnBagging(params):
    """Wrap a default BaggingRegressor in the project's MachinLearningModel.

    ``params`` must provide the keys ``'X_train'``, ``'y_train'`` and
    ``'name'``; they are forwarded together with a fresh, unfitted
    ``BaggingRegressor()`` and ``modelType="Linear"``.
    """
    features = params['X_train']
    targets = params['y_train']
    label = params['name']
    return MachinLearningModel(
        BaggingRegressor(),
        features,
        targets,
        modelType="Linear",
        name=label,
    )
# Draw independent training and test samples from the same generator.
X_train, y_train = generate(n_samples=n_train, noise=noise)
X_test, y_test = generate(n_samples=n_test, noise=noise)

# One decision tree regressor
dtree = DecisionTreeRegressor().fit(X_train, y_train)
d_predict = dtree.predict(X_test)

plt.figure(figsize=(10, 8))
plt.plot(X_test, f(X_test), 'b')             # reference curve f
plt.scatter(X_train, y_train, c='b', s=20)   # training points
plt.plot(X_test, d_predict, 'g', lw=2)       # single-tree prediction
plt.xlim([-5, 5])
# Fixed: the title is labelled "MSE" but the value was np.sum(...), i.e. the
# sum of squared errors, which grows with the test-set size. Use the mean.
plt.title("Decision tree, MSE = %.2f" % np.mean((y_test - d_predict)**2))

# Bagging decision tree regressor
bdt = BaggingRegressor(DecisionTreeRegressor()).fit(X_train, y_train)
bdt_predict = bdt.predict(X_test)

plt.figure(figsize=(10, 8))
plt.plot(X_test, f(X_test), 'b')
plt.scatter(X_train, y_train, c='b', s=20)
plt.plot(X_test, bdt_predict, 'y', lw=2)     # bagged-trees prediction
plt.xlim([-5, 5])
plt.title("Bagging decision tree, MSE = %.2f" % np.mean(
    (y_test - bdt_predict)**2))

# Random forest
rf = RandomForestRegressor(n_estimators=10).fit(X_train, y_train)
rf_predict = rf.predict(X_test)

plt.figure(figsize=(10, 8))
# Drop the large intermediates from the module namespace before training.
for _stale in ('profiles', 'profilesLSo', 'profilesLS', 'row', 'tmpLS',
               'tmpAGE', 'profsTOlikes', 'i', 'tmpIND'):
    del globals()[_stale]
del _stale

# Seed NumPy's global generator. np.random.seed returns None, so myRand only
# records that the call happened.
seed = 7
myRand = np.random.seed(seed)

# Hold out 1500 rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    likesMAT, consARR, test_size=1500)

# Worker and estimator counts come from the command line.
nJOBS = int(sys.argv[1])
nEST = int(sys.argv[2])

bagOUT = BaggingRegressor(oob_score=True, n_estimators=nEST, n_jobs=nJOBS)
#bagOUT.fit(likesMAT, consARR)
bagOUT.fit(X_train, y_train)
y_pred = bagOUT.predict(X_test)

import math

# Root-mean-squared error on the held-out rows.
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("cons, bagOUT: ", str(nEST), " ", myRMSE)

# joblib.dump(bagOUT, "/Users/jamster/bagOUT-A-cons.xz", compress=9)
# impbagOUT = joblib.load("/Users/jamster/bagOUT-A-cons.xz")
def fit(self, X, y, sample_weight=None):
    """Fit the model according to the given training data.

    Bagged decision-tree classifiers (fitted on ``y``) and regressors
    (fitted on a target derived from ``y`` and ``sample_weight``) are grown
    for every configured ``max_depth``; rules are extracted from each tree,
    scored on the samples the tree was not trained on, filtered by
    ``precision_min`` / ``recall_min``, optionally deduplicated and stored
    in ``rules_``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention 0 for
        normal data, 1 for anomalies.

    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to build the target
        of the regression trees producing further rules to be tested.
        If not provided, the regression trees are fitted on ``y`` itself.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If ``y`` holds a single class, if ``max_depth_duplication`` is
        neither an integer nor None, or if ``max_samples`` is a string or
        a float outside (0, 1].
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]

    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    if n_classes < 2:
        raise ValueError("This method needs samples of at least 2 classes"
                         " in the data, but the data contains only one"
                         " class: %r" % self.classes_[0])

    if not isinstance(self.max_depth_duplication, int) \
            and self.max_depth_duplication is not None:
        raise ValueError("max_depth_duplication should be an integer"
                         )
    if not set(self.classes_) == set([0, 1]):
        warn("Found labels %s. This method assumes target class to be"
             " labeled as 1 and normal data to be labeled as 0. Any label"
             " different from 0 will be considered as being from the"
             " target class."
             % set(self.classes_))
        y = (y > 0)

    # ensure that max_samples is in [1, n_samples]:
    n_samples = X.shape[0]

    if isinstance(self.max_samples, six.string_types):
        # Fixed message: the fragments were concatenated without spaces
        # ("...supported.Valid ... int orfloat") and listed "auto" as valid
        # although every string value ends up in this branch.
        raise ValueError('max_samples (%s) is not supported. '
                         'Valid choices are: int or float'
                         % self.max_samples)

    elif isinstance(self.max_samples, INTEGER_TYPES):
        if self.max_samples > n_samples:
            warn("max_samples (%s) is greater than the "
                 "total number of samples (%s). max_samples "
                 "will be set to n_samples for estimation."
                 % (self.max_samples, n_samples))
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float
        if not (0. < self.max_samples <= 1.):
            raise ValueError("max_samples must be in (0, 1], got %r"
                             % self.max_samples)
        max_samples = int(self.max_samples * X.shape[0])

    self.max_samples_ = max_samples

    self.rules_ = {}
    self.estimators_ = []
    self.estimators_samples_ = []
    self.estimators_features_ = []

    # default columns names :
    feature_names_ = [BASE_FEATURE_NAME + x for x in
                      np.arange(X.shape[1]).astype(str)]
    # Map every generic column name to the name reported in the final rules.
    if self.feature_names is not None:
        self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                              for i, feat in enumerate(self.feature_names)}
    else:
        self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                              for i, feat in enumerate(feature_names_)}
    self.feature_names_ = feature_names_

    clfs = []
    regs = []

    # Accept either a single depth or an iterable of depths.
    self._max_depths = self.max_depth \
        if isinstance(self.max_depth, Iterable) else [self.max_depth]

    # One bagged classifier and one bagged regressor per requested depth.
    for max_depth in self._max_depths:
        bagging_clf = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(
                max_depth=max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        bagging_reg = BaggingRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=max_depth,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split),
            n_estimators=self.n_estimators,
            max_samples=self.max_samples_,
            max_features=self.max_samples_features,
            bootstrap=self.bootstrap,
            bootstrap_features=self.bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose)

        clfs.append(bagging_clf)
        regs.append(bagging_reg)

    # define regression target:
    if sample_weight is not None:
        # (a second, identical `is not None` test nested here was redundant)
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weights = sample_weight - sample_weight.min()
        contamination = float(sum(y)) / len(y)
        y_reg = (
            pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
            pow((weights).mean(), 0.5) * (y == 0))
        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
    else:
        y_reg = y  # same as an other classification bagging

    # Fit every ensemble and pool its trees with their sample/feature draws.
    for clf in clfs:
        clf.fit(X, y)
        self.estimators_ += clf.estimators_
        self.estimators_samples_ += clf.estimators_samples_
        self.estimators_features_ += clf.estimators_features_

    for reg in regs:
        reg.fit(X, y_reg)
        self.estimators_ += reg.estimators_
        self.estimators_samples_ += reg.estimators_samples_
        self.estimators_features_ += reg.estimators_features_

    rules_ = []
    for estimator, samples, features in zip(self.estimators_,
                                            self.estimators_samples_,
                                            self.estimators_features_):

        # Create mask for OOB samples
        # NOTE(review): `~samples` assumes each entry of estimators_samples_
        # is a boolean mask over the rows of X -- confirm for the
        # scikit-learn version in use.
        mask = ~samples
        if sum(mask) == 0:
            warn("OOB evaluation not possible: doing it in-bag."
                 " Performance evaluation is likely to be wrong"
                 " (overfitting) and selected rules are likely to"
                 " not perform well! Please use max_samples < 1.")
            mask = samples
        rules_from_tree = self._tree_to_rules(
            estimator, np.array(self.feature_names_)[features])

        # XXX todo: idem without dataframe
        X_oob = pandas.DataFrame((X[mask, :])[:, features],
                                 columns=np.array(
                                     self.feature_names_)[features])

        if X_oob.shape[1] > 1:  # otherwise pandas bug (cf. issue #16363)
            y_oob = y[mask]
            y_oob = np.array((y_oob != 0))

            # Add OOB performances to rules:
            rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                               for r in set(rules_from_tree)]
            rules_ += rules_from_tree

    # Factorize rules before semantic tree filtering
    rules_ = [
        tuple(rule)
        for rule in
        [Rule(r, args=args) for r, args in rules_]]

    # keep only rules verifying precision_min and recall_min:
    for rule, score in rules_:
        if score[0] >= self.precision_min and score[1] >= self.recall_min:
            if rule in self.rules_:
                # update the score to the new mean
                # (a, b are running means of score[0] and score[1];
                #  c counts how many times the rule has been seen)
                c = self.rules_[rule][2] + 1
                b = self.rules_[rule][1] + 1. / c * (
                    score[1] - self.rules_[rule][1])
                a = self.rules_[rule][0] + 1. / c * (
                    score[0] - self.rules_[rule][0])

                self.rules_[rule] = (a, b, c)
            else:
                self.rules_[rule] = (score[0], score[1], 1)

    # From here on rules_ is a list of (rule, (a, b, c)) pairs, best first.
    self.rules_ = sorted(self.rules_.items(),
                         key=lambda x: (x[1][0], x[1][1]), reverse=True)

    # Deduplicate the rule using semantic tree
    if self.max_depth_duplication is not None:
        self.rules_ = self.deduplicate(self.rules_)

    self.rules_ = sorted(self.rules_, key=lambda x: - self.f1_score(x))
    self.rules_without_feature_names_ = self.rules_

    # Replace generic feature names by real feature names
    self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                   for rule, perf in self.rules_]

    return self
# Finish the anomaly-detection panel: raw series plus the flagged points.
plt.plot(t, x, 'r-', lw=1, label=u'原始数据')
plt.plot(abnormal, x[abnormal], 'go', markeredgecolor='g', ms=3, label=u'异常值')
plt.legend(loc='upper right')
plt.title(u'异常检测', fontsize=18)
# The switch is passed positionally: the keyword was renamed from `b` to
# `visible` in newer Matplotlib releases, the position is unchanged.
plt.grid(True)

# Prediction: replace the flagged points with values predicted from the rest.
plt.subplot(133)
select = np.ones(N, dtype=bool)  # np.bool was removed from NumPy; the builtin is equivalent
select[abnormal] = False         # True marks the samples considered normal
t = np.arange(N)
# criterion='mse' is the default and that spelling was removed in newer
# scikit-learn releases; omitting it selects the same criterion everywhere.
dtr = DecisionTreeRegressor(max_depth=10)
br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
br.fit(t[select].reshape(-1, 1), x[select])   # learn index -> value from normal samples only
y = br.predict(np.arange(N).reshape(-1, 1))
y[select] = x[select]                          # keep the original value wherever it was normal
plt.plot(x, 'g--', lw=1, label=u'原始值')  # original values
plt.plot(y, 'r-', lw=1, label=u'校正值')  # corrected values
plt.legend(loc='upper right')
plt.title(u'异常值校正', fontsize=18)
plt.grid(True)

# `pad` must be given by keyword in newer Matplotlib releases.
plt.tight_layout(pad=1.5, rect=(0, 0, 1, 0.95))
plt.suptitle(u'排污数据的异常值检测与校正', fontsize=22)
plt.show()
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

#Step 1:Loading data
X, y = load_boston(return_X_y=True)

#Step 2:Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

#step3:Training--BaggingRegressor
regression = BaggingRegressor(random_state=40)

# Fixed: a trailing comma after the closing brace turned param_grid into a
# 1-tuple holding the dict; GridSearchCV documents a dict or a list of dicts.
param_grid = {
    # criterion='mse' and splitter='best' are the defaults, and the 'mse'
    # spelling was removed from newer scikit-learn releases, so the default
    # constructor is used instead.
    'base_estimator': [DecisionTreeRegressor()],
    'n_estimators': list(range(10, 101, 30)),   # 10, 40, 70, 100
    'max_samples': [0.3, 0.7, 1.0],
    'max_features': [3, 6, 9, 13],
    'bootstrap_features': [True, False]
}

# 5-fold search over the grid, refitting the best configuration on all of
# X_train; n_jobs=-1 uses every available processor.
search = GridSearchCV(estimator=regression, param_grid=param_grid, cv=5,
                      refit=True, verbose=1, n_jobs=-1)
search.fit(X_train, y_train)
def test_sparse_regression():
    """Check regression for various parameter settings on sparse input.

    For each sparse format and parameter set, a BaggingRegressor trained on
    sparse data must predict exactly what one trained on the dense data
    predicts, and every base estimator must have been handed data of the
    sparse type.
    """
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set"""

        def fit(self, X, y):
            super(CustomSVR, self).fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {
            "max_samples": 0.5,
            "max_features": 2,
            "bootstrap": True,
            "bootstrap_features": True
        },
        {
            "max_samples": 1.0,
            "max_features": 4,
            "bootstrap": True,
            "bootstrap_features": True
        },
        {
            "max_features": 2,
            "bootstrap": False,
            "bootstrap_features": True
        },
        {
            "max_samples": 0.5,
            "bootstrap": True,
            "bootstrap_features": False
        },
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:

            # Trained on sparse format
            sparse_classifier = BaggingRegressor(base_estimator=CustomSVR(),
                                                 random_state=1,
                                                 **params).fit(
                                                     X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingRegressor(base_estimator=CustomSVR(),
                                             random_state=1,
                                             **params).fit(
                                                 X_train, y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            # The same equality check used to be repeated after the type
            # check; once is enough.
            assert_array_equal(sparse_results, dense_results)
            assert all(t == sparse_type for t in types)
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import operator
import copy

x, y = datahelper.get_xy('data/', num_hours=3, error_minutes=15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Sweep the ensemble size and keep every fitted model with its test score.
first_n = 2
scores = {}   # n_estimators -> score on the test split
models = []   # fitted estimators; models[k] was built with n_estimators = first_n + k
for n in range(first_n, 20):
    # A new estimator is constructed on every pass, so it can be stored
    # directly; the former copy.copy(...) was not needed.
    estimator = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=4),
                                 max_samples=0.5, n_estimators=n)
    estimator.fit(x_train, y_train)
    scores[n] = estimator.score(x_test, y_test)
    models.append(estimator)

sorted_by_scores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

print('Results of 5 best # of estimators:\n')
for n, score in sorted_by_scores[:5]:
    print("№ estimators = ", n)
    y_predicted = models[n - first_n].predict(x_test)
    print('R^2 = ' + str(r2_score(y_test, y_predicted)))
    # Fixed label: the value is the square root of the MSE, i.e. the RMSE.
    print('RMSE = ' + str(np.sqrt(mean_squared_error(y_test, y_predicted))))
import numpy as np
import pandas as pd
from simulator import simulate
from sklearn import metrics
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

np.random.seed(123456)

# Bagging ensemble of depth-1 decision trees.
lr = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=1))

# Load the price history, drop incomplete rows and index it by date.
data = pd.read_csv('BTC-USD.csv').dropna()
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date', drop=True)

# Day-over-day change of the close divided by the same day's close; the first
# entry has no predecessor and is dropped.
diffs = (data['Close'].diff() / data['Close']).values[1:]
diff_len = len(diffs)


def create_x_data(lags=1):
    """Return a ``(diff_len, lags)`` matrix of lagged values of ``diffs``.

    Column ``k`` holds ``diffs`` shifted down by ``k + 1`` rows; the first
    ``k + 1`` rows of that column stay zero.
    """
    lagged = np.zeros((diff_len, lags))
    for column in range(lags):
        shift = column + 1
        lagged[shift:, column] = diffs[:-shift]
    return lagged
def test_parallel():
    """Check parallel computations.

    Predictions must not depend on ``n_jobs``: neither on the value used at
    fit time nor on the value set before predicting.
    """
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        # predict_proba
        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict_proba(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y3)

        # decision_function
        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        decisions1 = ensemble.decision_function(X_test)
        ensemble.set_params(n_jobs=2)
        decisions2 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions2)

        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        decisions3 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions3)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        # Fixed: n_jobs was hard-coded to 3 here, so the loop ran the same
        # configuration twice and never exercised n_jobs=-1.
        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=n_jobs,
                                    random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=1,
                                    random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y3)
# First column is the target; the remaining columns are the feature values.
y = train_np[:, 0]
X = train_np[:, 1:]

# Fit a bagged ensemble of L1-penalised logistic regressions.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6,
                                      solver='liblinear', multi_class='auto',
                                      max_iter=1000)
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8,
                               max_features=1.0, bootstrap=True,
                               bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)
print(bagging_clf)

test = data_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Sex_.*|Pclass_.*')
# BaggingRegressor averages the label predicted by each of the 20 models, so
# `predictions` holds fractions of votes for class 1.
predictions = bagging_clf.predict(test.values)

result = pd.DataFrame({
    'PassengerId': data_test['PassengerId'].values,
    # Fixed: a bare astype(np.int32) truncates toward zero, so a passenger was
    # labelled 1 only when all 20 models agreed. Rounding first turns the
    # averaged votes into a majority decision.
    'Survived': np.rint(predictions).astype(np.int32)
})
result.to_csv("logistic_regression_predictions.csv", index=False)
# Fit the remaining regressors; each fit(...) call returns the fitted
# estimator itself, which is what the *Fit names hold.
mlp = MLPRegressor()
mlpFit = mlp.fit(x_train, y_train)

regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regrFit = regr.fit(x_train, y_train)

clfRidge = Ridge(alpha=1.0)
clfRidgeFit = clfRidge.fit(x_train, y_train)

clfBayesian = linear_model.BayesianRidge()
clfBayesianFit = clfBayesian.fit(x_train, y_train)

reg = linear_model.LassoLars(alpha=0.01)
regFit = reg.fit(x_train, y_train)

bag = BaggingRegressor()
bagFit = bag.fit(x_train, y_train)

# Mean absolute deviation of every model on the test split.
DT_MAD = mean_absolute_error(y_test, DT_regressionFit.predict(x_test))
SVR_MAD = mean_absolute_error(y_test, svr_regressionFit.predict(x_test))
KNN_MAD = mean_absolute_error(y_test, neighFit.predict(x_test))
MLP_MAD = mean_absolute_error(y_test, mlpFit.predict(x_test))
# Fixed: this score was computed from mlpFit, so the AdaBoost result was just
# a second copy of the MLP result.
regr_MAD = mean_absolute_error(y_test, regrFit.predict(x_test))
clfRidge_MAD = mean_absolute_error(y_test, clfRidgeFit.predict(x_test))
clfBayesion_MAD = mean_absolute_error(y_test, clfBayesianFit.predict(x_test))
reg_MAD = mean_absolute_error(y_test, regFit.predict(x_test))
bag_MAD = mean_absolute_error(y_test, bagFit.predict(x_test))

print('Regression Tree MAD: ' + str(DT_MAD))
print('Support Vector Regression MAD ' + str(SVR_MAD))
print('KNN MAD ' + str(KNN_MAD))
# NOTE(review): `i`, `num_iters`, X, Y and the *scores lists are defined outside
# this excerpt. The statements down to the score appends presumably form the
# body of an enclosing loop over `i` whose header is not visible here, and the
# plotting code after them presumably runs once that loop has finished --
# confirm the nesting against the full file.
print('Iteration :',i)
# Hold out 20 samples for testing.
# NOTE(review): random_state is fixed at 123, so repeated executions of this
# statement produce the same split -- confirm that is intended.
(XTrain, XTest, YTrain, YTest) = train_test_split(X, Y, test_size=20, random_state=123)
# Linear Regressions
lreg = LinearRegression()
lreg.fit(XTrain, YTrain)
# Decision Tree Regression
dtr = DecisionTreeRegressor(max_leaf_nodes=34)
dtr.fit(XTrain, YTrain)
# ADA BOOST REGRESSION
sen = AdaBoostRegressor(n_estimators=200)
sen.fit(XTrain, YTrain)
# Bagging Regression
breg = BaggingRegressor(n_estimators=100)
breg.fit(XTrain, YTrain)
# Random Forest
rfreg = RandomForestRegressor(n_estimators=10)
rfreg.fit(XTrain, YTrain)
# Record each model's score on the test split, scaled by 100.
dscores.append(dtr.score(XTest, YTest) * 100)
bscores.append(breg.score(XTest, YTest) * 100)
ascores.append(sen.score(XTest, YTest) * 100)
rscores.append(rfreg.score(XTest, YTest) * 100)
lscores.append(lreg.score(XTest, YTest) * 100)
# Plot the collected scores, starting with the decision tree's.
plt = matplotlib.pyplot
plt.figure(figsize=(15,15))
plt.scatter(range(num_iters), dscores, color='k', label='Decision Tree Regressor')
def process(i):
    """Score benchmark ``i`` with a 1-member and a 100-member DTM bagging ensemble.

    Row ``i`` of the module-level ``header`` table supplies the benchmark id;
    the matching CSV is read from the ``dataname`` benchmark directory. The
    first CSV row is skipped, column 5 is compared against ``'anomaly'`` to
    build the binary labels, and columns 6 onward are parsed as floats.

    Both ensembles are fitted against a uniformly random target, and their
    predictions are negated before being passed to the ranking metrics.

    A third "BaggedDTM" configuration (100 estimators, up to 2048 samples per
    estimator, neighbour count scaled with the sample size) was commented out
    in the original and is not run here.

    Returns:
        Two lists: ``[auc_spDTM, auc_aNNE]`` and ``[ap_spDTM, ap_aNNE]``.
    """
    bench_name = header.iloc[i]['bench.id']
    csv_path = ('/Users/apple/Documents/AD_Datasets/' + dataname
                + '/benchmarks/' + bench_name + '.csv')
    with open(csv_path, 'r') as handle:
        table = np.array(list(csv.reader(handle)))

    samples = table[1:, 6:].astype('double')
    labels = (table[1:, 5] == 'anomaly').astype(float)

    # One RandomState instance is shared by both ensembles, so the second
    # ensemble continues the random stream left by the first.
    rng = np.random.RandomState(42)
    subsample = min(20, samples.shape[0])
    # The regression target is drawn from the global NumPy RNG, not from `rng`.
    dummy_target = np.random.uniform(size=samples.shape[0])
    neighbours = 1

    def fit_and_score(n_members):
        # Fit one bagging ensemble and return its (ROC AUC, average precision).
        ensemble = BaggingRegressor(
            base_estimator=DTM(n_neighbors=neighbours, contamination=0.1),
            n_estimators=n_members,
            max_samples=subsample,
            bootstrap=False,
            random_state=rng)
        raw_scores = ensemble.fit(samples, dummy_target).predict(samples)
        return (roc_auc_score(labels, -raw_scores),
                average_precision_score(labels, -raw_scores))

    # sp: a single base estimator.
    auc_sp, ap_sp = fit_and_score(1)
    # aNNE: one hundred base estimators with otherwise identical settings.
    auc_annes, ap_annes = fit_and_score(100)

    return [auc_sp, auc_annes], [ap_sp, ap_annes]
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Scores of the already fitted random forest on both splits.
rand_pred = random_forest.predict(X_test)
print('train score for random_forest:',
      random_forest.score(X_train, y_train))
print('test score for random_forest:',
      random_forest.score(X_test, y_test))

# Predicted prices for the test split.
y_pred = random_forest.predict(X_test)
# print("Predicted Prices for Single Family House in Ca", y_pred)

# NOTE(review): a fresh forest is cross-validated on the test split only;
# confirm that the training split was not intended here.
clf = RandomForestRegressor()
scores = cross_val_score(clf, X_test, y_test, cv=5)

# Bagging ensemble of random forests; the two score() results are not stored.
bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

# Error in dollars. Despite its name, `mse` holds the mean absolute error.
mse = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is:$", mse)

# Checking R^2.
print("r_Score:", r2_score(y_test, y_pred))
print("Predicted Prices for Single Family House in Ca", y_test, y_pred)
print("crossvalidation:", scores.mean())
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Every model is fitted to map each row of train_scaled to the row after it.
# to set number of jobs to the number of cores, use n_jobs=-1
for _model_name, _estimator_cls in (
        ('GradientBoostingRegressor', GradientBoostingRegressor),
        ('AdaBoostRegressor', AdaBoostRegressor),
        ('BaggingRegressor', BaggingRegressor),
        ('ExtraTreesRegressor', ExtraTreesRegressor),
        ('RandomForestRegressor', RandomForestRegressor),
):
    model = MultiOutputRegressor(_estimator_cls(), n_jobs=1).fit(
        train_scaled[0:-1, :], train_scaled[1:, :])
    models.append(model)
    modeldims.append(2)
    modelnames.append(_model_name)

# The SVR model is fitted here; the visible fragment does not register it.
model = MultiOutputRegressor(SVR(), n_jobs=1).fit(
    train_scaled[0:-1, :], train_scaled[1:, :])
#
# ## Bagging in scikit-learn
#
# Scikit-learn implements the bagging procedure as a "meta-estimator", that is,
# an estimator that wraps another estimator: it takes a base model that is
# cloned several times and trained independently on each bootstrap sample.
#
# The following code snippet shows how to build a bagging ensemble of decision
# trees. We set `n_estimators=100` instead of the 3 used in our manual
# implementation above to get a stronger smoothing effect.

# %%
from sklearn.ensemble import BaggingRegressor

bagged_trees = BaggingRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=3), n_estimators=100
)
_ = bagged_trees.fit(data_train, target_train)

# %% [markdown]
#
# Let us visualize the predictions of the ensemble on the same test data:

# %%
sns.scatterplot(
    x=data_train["Feature"], y=target_train, color="black", alpha=0.5
)
bagged_trees_predictions = bagged_trees.predict(data_test)
plt.plot(data_test, bagged_trees_predictions)
def linear_regression_algo(self):
    """Fit plain, AdaBoost-wrapped and bagged linear regressions and print metrics.

    Reads ``../Data/full_table.csv`` and keeps the 13-column rows whose
    numeric fields parse. The feature dicts are vectorised with
    ``DictVectorizer``, 33% of the rows are held out (``random_state=43``),
    and each model is fitted on the remainder. For every model the mean
    squared error, mean absolute error and R2 score on the held-out rows
    are printed.

    Returns:
        None. All results are printed.
    """
    X = []
    Y = []
    with open('../Data/full_table.csv', 'r') as table:
        for line in csv.reader(table, delimiter=','):
            if len(line) != 13:
                continue
            try:
                x = {
                    'zhvi': float(line[5]),
                    'property_type': line[6],
                    'room_type': line[7],
                    'accommodates': int(line[8]),
                    'bathrooms': float(line[9]),
                    'beds': int(line[10]),
                    'bed_type': line[11]
                }
                y = float(line[12])
            except ValueError:
                # Best effort: rows whose numeric fields do not parse are
                # skipped. Only the conversion error is caught, so unrelated
                # failures are no longer silently swallowed.
                continue
            X.append(x)
            Y.append(y)

    # The DictVectorizer converts data from a dictionary to an array
    vec = DictVectorizer()
    X = vec.fit_transform(X).toarray()

    # Split X and Y into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.33, random_state=43)

    # The same fit/evaluate/report sequence runs for each variant, in the
    # original order: plain, boosted, bagged.
    candidates = [
        ('Linear Regression',
         linear_model.LinearRegression()),
        ('Linear Regression (with AdaBoost)',
         AdaBoostRegressor(linear_model.LinearRegression())),
        ('Linear Regression (with Bagging)',
         BaggingRegressor(linear_model.LinearRegression())),
    ]
    for title, model in candidates:
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)
        print(title)
        print('Mean Squared Error: {0}'.format(mse))
        # The value comes from mean_absolute_error; the label used to read
        # "Mean Average Error".
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))
print("AUC scores of downstream classifiers on test data : ") for i in range(0, len(learners)): score = learners[i].fit(X_syn, y_syn) pred_probs = learners[i].predict_proba(X_test) auc_score = roc_auc_score(y_test, pred_probs[:, 1]) print('-' * 40) print('{0}: {1}'.format(names[i], auc_score)) else: names = ['Ridge', 'Lasso', 'ElasticNet', 'Bagging', 'MLP'] learners.append((Ridge())) learners.append((Lasso())) learners.append((ElasticNet())) learners.append((BaggingRegressor())) learners.append((MLPRegressor())) print("RMSE scores of downstream regressors on test data : ") for i in range(0, len(learners)): score = learners[i].fit(X_syn, y_syn) pred_vals = learners[i].predict(X_test) rmse = np.sqrt(mean_squared_error(y_test, pred_vals)) print('-' * 40) print('{0}: {1}'.format(names[i], rmse)) if opt.model != 'real-data': if opt.save_synthetic: if not os.path.isdir(opt.output_data_path): raise Exception('Output directory does not exist')
#### 3.4 KNN regression ####
from sklearn import neighbors

model_KNeighborsRegressor = neighbors.KNeighborsRegressor()

#### 3.5 Random forest regression ####
from sklearn import ensemble

# 20 decision trees are used here.
model_RandomForestRegressor = ensemble.RandomForestRegressor(n_estimators=20)

#### 3.6 AdaBoost regression ####
# 50 decision trees are used here.
model_AdaBoostRegressor = ensemble.AdaBoostRegressor(n_estimators=50)

#### 3.7 GBRT regression ####
# 200 decision trees are used here (the original comment said 100).
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
    learning_rate=0.2, n_estimators=200)

#### 3.8 Bagging regression ####
from sklearn.ensemble import BaggingRegressor

model_BaggingRegressor = BaggingRegressor()

#### 3.9 ExtraTree (extremely randomized tree) regression ####
from sklearn.tree import ExtraTreeRegressor

model_ExtraTreeRegressor = ExtraTreeRegressor()

########### 4. Calling the chosen method ##########
try_different_method(model_LinearRegression)
# Bootstrap aggregation
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score


def bagging_regressor(x, y, n_estimators=10):
    """Fit a bagging ensemble of decision trees on ``(x, y)``.

    When there are more targets than estimators, each tree is trained on a
    bootstrap sample of at most 50 rows. Otherwise every tree uses a
    full-size bootstrap sample and out-of-bag scoring is enabled, so the
    result exposes ``oob_score_``.

    Args:
        x: Training features; must support ``len()``.
        y: Training targets; must support ``len()``.
        n_estimators: Number of trees in the ensemble. The original body
            referenced this name without defining it; it is now a keyword
            parameter defaulting to 10.

    Returns:
        The fitted ``BaggingRegressor``.
    """
    # The original `for max_number_samples = len(x):` was a syntax error.
    max_number_samples = len(x)

    if n_estimators < len(y):
        bag_reg = BaggingRegressor(
            DecisionTreeRegressor(),
            n_estimators=n_estimators,
            # Clamp to the data size: an integer max_samples larger than
            # the number of rows is rejected by scikit-learn.
            max_samples=min(50, max_number_samples),
            bootstrap=True,
            # n_jobs denotes the number of processors dedicated to the
            # process; -1 uses all of them.
            n_jobs=-1)
    else:
        # Out-of-bag evaluation
        bag_reg = BaggingRegressor(
            DecisionTreeRegressor(),
            n_estimators=n_estimators,
            bootstrap=True,
            n_jobs=-1,
            oob_score=True)

    bag_reg.fit(x, y)
    return bag_reg


# Sequential decision tree regressor / gradient boosting
def decision_tree_stacking(x, y):
    """A number of first level individual learners are generated from the
    training data set and then combined by a metalearner.
    """
    tree_reg1 = DecisionTreeRegressor(max_depth=2)
    tree_reg1.fit(x, y)
    # now train a second DecisionTreeRegressor on the residual errors made by the first predictor
    y2 = y - tree_reg1.predict(x)
def __init__(self, **args):
    """Create the two bagging regressors, ``model_lf`` and ``model_hf``.

    Each regressor receives its own deep copy of the keyword arguments, so
    the two models do not share any argument object.
    """
    lf_kwargs = copy.deepcopy(args)
    hf_kwargs = copy.deepcopy(args)
    self.model_lf = BaggingRegressor(**lf_kwargs)
    self.model_hf = BaggingRegressor(**hf_kwargs)
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

train_df = pd.read_csv(
    "/Users/jianjun.yue/PycharmGItHub/data/house_price/model/house_price_train.csv",
    index_col=0)
test_df = pd.read_csv(
    "/Users/jianjun.yue/PycharmGItHub/data/house_price/model/house_price_test.csv",
    index_col=0)

numeric_cols = train_df.columns[train_df.dtypes != 'object']

y_train = train_df["SalePrice"]
X_train = train_df.drop(['SalePrice'], axis=1)
test_df = test_df.drop(['MSSubClass_90'], axis=1)

ridge = Ridge(alpha=15)

# Bagging puts many small learners together, each trained on a random part
# of the data, and then combines their final results. (The original note
# says "majority vote"; BaggingRegressor averages the predictions.)
# Bagging is best thought of as an algorithmic framework.
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params:
    clf = BaggingRegressor(base_estimator=ridge, n_estimators=param)
    # The scorer returns negated MSE per fold; flip the sign for the RMSE.
    test_score = np.sqrt(-cross_val_score(
        clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

plt.plot(params, test_scores)
plt.title('n_estimators vs CV Error')
plt.show()

br = BaggingRegressor(base_estimator=ridge, n_estimators=25)
br.fit(X_train, y_train)
# NOTE(review): expm1 assumes SalePrice in the training CSV is already
# log1p-transformed; confirm against the preprocessing step.
y_final = np.expm1(br.predict(test_df))
def fit(self, label_correctness_file, point_labels_file, users_file,
        user_quality_features_file='all_users.csv'):
    """Train the label classifier and then the user-accuracy regressor.

    Users with more than 25 validated labels are split in half by position.
    Labels from the first half train ``rfe_labels`` and ``clf_labels``.
    Labels from the second half are scored by ``clf_labels``; the scores are
    summarised per user with ``prob_hist``, joined with the user quality
    features, and used to train ``rfe_accuracy`` (target: accuracy > 65) and
    ``clf_accuracy`` (target: accuracy).

    Args:
        label_correctness_file: Passed to ``extract_label_features``.
        point_labels_file: Passed to ``extract_label_features``.
        users_file: CSV with ``user_id``, ``accuracy`` and
            ``labels_validated`` columns.
        user_quality_features_file: CSV with a ``user_id`` column.
    """
    users = pd.read_csv(users_file).set_index('user_id')
    y_train = users['accuracy']
    users_for_training = users[users['labels_validated'] > 25].index

    self.label_correctness = extract_label_features(
        point_labels_file, label_correctness_file)

    user_quality_features = pd.read_csv(
        user_quality_features_file).set_index('user_id')

    # Positional split of the eligible users. A random-permutation split and
    # an ordinal encoding of CLASS_DESC were commented out in the original.
    half = int(len(users_for_training) / 2)
    users_labels_train = users_for_training[:half]
    users_labels_test = users_for_training[half:]

    # Keep only labels with a known correctness and complete feature values.
    train_labels = self.label_correctness.copy()
    train_labels = train_labels[train_labels['correct'].notna()]
    train_labels = train_labels[train_labels[features].notna().all(axis=1)]

    self.rfe_labels = RFECV(
        estimator=RandomForestClassifier(n_estimators=10),
        step=1,
        cv=StratifiedKFold(5),
        scoring='precision')
    self.clf_labels = RandomForestClassifier(
        random_state=0, n_jobs=-1, n_estimators=30)
    self.clf_accuracy = BaggingRegressor(
        random_state=0, n_jobs=-1, n_estimators=30)
    self.rfe_accuracy = RFECV(
        estimator=RandomForestClassifier(n_estimators=10),
        step=1,
        cv=StratifiedKFold(5),
        scoring='f1')

    print('Training label classifier...')
    first_half = train_labels[
        train_labels['user_id'].isin(users_labels_train)]
    first_half_x = first_half[features].values
    first_half_y = first_half['correct'].astype(int)
    self.rfe_labels.fit(first_half_x, first_half_y)
    self.clf_labels.fit(
        first_half_x[:, self.rfe_labels.support_], first_half_y)

    # Attach the predicted probability of class 1 to second-half labels.
    second_half = train_labels[
        train_labels['user_id'].isin(users_labels_test)]
    second_half_prob = self.clf_labels.predict_proba(
        second_half[features].values[:, self.rfe_labels.support_])[:, 1]
    prob_column = pd.Series(
        data=second_half_prob, index=second_half.index).rename('prob')
    train_labels = train_labels.join(prob_column, how='outer')

    # Re-select after the join, because train_labels is a new frame now.
    second_half = train_labels[
        train_labels['user_id'].isin(users_labels_test)]
    per_user_hist = second_half.groupby('user_id').apply(
        lambda x: prob_hist(x['prob'].values)).rename('prob')
    prob_hist_predictions = pd.DataFrame(per_user_hist)
    prob_hist_predictions = prob_hist_predictions.join(
        user_quality_features)

    print('Training accuracy classifier...')
    # Expanded 'prob' values followed by the remaining user features; built
    # once and reused by both fits below.
    accuracy_x = np.concatenate(
        (dearray(prob_hist_predictions['prob']),
         prob_hist_predictions.drop(columns='prob').values),
        axis=1)
    accuracy_y = y_train.loc[prob_hist_predictions.index]
    self.rfe_accuracy.fit(accuracy_x, accuracy_y > 65)
    self.clf_accuracy.fit(
        accuracy_x[:, self.rfe_accuracy.support_], accuracy_y)