def build_model_and_evaluate_rms(prev_pred=None):
    """Fit a chained XGBoost regressor on the personality targets and score it.

    Args:
        prev_pred: Optional frame of predictions from previous tasks. When
            given, its columns are appended to the combined feature matrix
            as additional features.
            NOTE(review): assumes ``prev_pred`` shares the index of the
            combined feature frame -- confirm against callers.

    Returns:
        tuple: ``(rmse, reg)`` where ``rmse`` holds one root-mean-squared
        error per entry of ``utils.regressor_labels`` (same order) and
        ``reg`` is the fitted ``RegressorChain``.
    """
    model = Model3()
    X_combined, y = model.combined_features(target="personality")

    if prev_pred is not None:
        # Previous predictions are extra *features*, so they must be joined
        # column-wise. The default axis=0 would stack them as new rows and
        # leave X with more samples than y.
        X_combined = pd.concat([X_combined, prev_pred], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y, test_size=0.20, random_state=2
    )

    reg = RegressorChain(
        XGBRegressor(n_estimators=200, max_depth=2,
                     objective="reg:squarederror"),
        order=[0, 3, 1, 4, 2],
    )
    reg = reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # One RMSE per personality label.
    # NOTE(review): column i of y_pred is paired with label i, which assumes
    # y's column order matches utils.regressor_labels -- confirm.
    rmse = [
        sqrt(mean_squared_error(y_pred[:, i], y_test[value]))
        for i, value in enumerate(utils.regressor_labels)
    ]
    return rmse, reg
class LassoModel(RegModels):
    """Regression model that chains Lasso estimators across the targets."""

    def __init__(self, params):
        super().__init__(params)
        self.name = "Lasso"

    def train(self):
        """Fit a Lasso-based ``RegressorChain`` and cache its training output."""
        # The chain wraps a fresh Lasso with default hyper-parameters.
        self.model = RegressorChain(Lasso())
        self.model.fit(self.train_x, self.train_y)
        # Predictions on the training data itself are kept on the instance.
        self.train_output = self.model.predict(self.train_x)
def predict(ticker, interval):
    """Forecast the next closing prices for ``ticker`` at the given interval.

    Price history is downloaded through ``yf``, turned into sliding windows
    of ``LOOK_BACK`` inputs and ``PREDICT_FORWARD`` outputs, and fitted with
    a ``RegressorChain`` of ``LinearSVR`` models. The last ``LOOK_BACK``
    closes are then used to predict the following ``PREDICT_FORWARD`` values.

    Args:
        ticker: Symbol passed to ``yf.Ticker``.
        interval: History interval string. ``"5m"``, ``"30m"``, ``"1d"`` and
            ``"5d"`` have a known timestamp step; any other value keeps a
            step of 0, so every predicted point carries the last timestamp.

    Returns:
        dict: ``{"response_code": 200, "payload": [...]}`` where each payload
        item is ``{"date": <timestamp>, "price": <predicted close>}``.
    """
    period = "max" if interval in ("1d", "5d") else "1mo"
    df = pd.DataFrame(
        yf.Ticker(ticker).history(interval=interval, period=period))
    df.dropna(inplace=True)

    # Epoch timestamp of the last bar, scaled from seconds to milliseconds.
    last_timestamp = int(df.index[-1].timestamp() * 1000)
    print("LAST_TIMESTAMP:", last_timestamp)

    # Reshape the closing prices into supervised (window -> horizon) pairs.
    data = df["Close"].values
    window = LOOK_BACK + PREDICT_FORWARD
    # `+ 1` keeps the final complete window: start index len(data) - window
    # still yields LOOK_BACK inputs and PREDICT_FORWARD targets. Without it
    # the most recent training sample was dropped.
    starts = range(len(data) - window + 1)
    X = [data[i:i + LOOK_BACK] for i in starts]
    y = [data[i + LOOK_BACK:i + window] for i in starts]
    print("X_LENGTH:", len(X))
    print("y_LENGTH:", len(y))

    # Base model wrapped in a chained multi-output regressor, fitted on the
    # whole dataset.
    model = LinearSVR(dual=False, loss="squared_epsilon_insensitive")
    wrapper = RegressorChain(model)
    wrapper.fit(X, y)

    # Predict from the most recent LOOK_BACK closes.
    historic_data = data[len(data) - LOOK_BACK:]
    predictions = wrapper.predict([historic_data])[0].tolist()

    # NOTE(review): the step constants are presumably in milliseconds to
    # match last_timestamp -- confirm where they are defined.
    increments = {
        "5m": FIVE_MINUTES,
        "30m": THIRTY_MINUTES,
        "1d": ONE_DAY,
        "5d": FIVE_DAYS,
    }
    time_increment = increments.get(interval, 0)
    print("TIME_INCREMENT:", time_increment)

    payload = []
    for p in predictions:
        last_timestamp += time_increment
        payload.append({"date": last_timestamp, "price": p})
    return {"response_code": 200, "payload": payload}
def __init__(self, X, Y, model_type, plot_individual_bool=False,
             plot_summary_one_bool=False, output_path=False):
    """Set up a regression run for the requested model type.

    Args:
        X: <np.Array> independent variable(s) data
        Y: <np.Array> dependent variable(s) data
        model_type: <str> regression type; must be a key of dict_reg_type
        plot_individual_bool: <bool> also plot the individual x vs y series
        plot_summary_one_bool: <bool> draw the summary comparison on a single
            graph instead of one chart per y-series
        output_path: <str> where to save output figures; False disables saving
    """
    self.X, self.Y = X, Y
    # Instantiate the base estimator registered for this model type.
    self.model = dict_reg_type[model_type]()
    self.model_type = model_type
    self.plot_individual_bool = plot_individual_bool
    self.plot_summary_one_bool = plot_summary_one_bool
    self.output_path = output_path

    # The part of model_type before the first underscore selects an optional
    # multi-output wrapper around the base estimator.
    wrapper_prefix = model_type.split('_')[0]
    if wrapper_prefix == DirectMultiOutput:
        self.model = MultiOutputRegressor(self.model)
    elif wrapper_prefix == ChainedMultiOutput:
        self.model = RegressorChain(self.model)
def test_multioutput():
    """Smoke-test AutoML inside both scikit-learn multi-output wrappers."""
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split
    from sklearn.multioutput import MultiOutputRegressor, RegressorChain

    # Three-target regression problem, split 70/30 into train and test.
    X, y = make_regression(n_targets=3)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=42)

    # Train and predict once per wrapper: independent models, then a chain.
    for wrapper_cls in (MultiOutputRegressor, RegressorChain):
        model = wrapper_cls(AutoML(task="regression", time_budget=1))
        model.fit(X_train, y_train)
        print(model.predict(X_test))
def findNextTick(df, type):
    """Predict the next row of ``predictionLabels`` values from ``df``.

    For every label a ``next<label>`` column (the label shifted up by one
    row) is added to ``df`` as the training target. All rows but the last
    train a ``RegressorChain`` around an ensemble; the last row is the input
    for the prediction.

    Note:
        ``df`` is modified in place: the ``next<label>`` columns are added
        to the caller's frame.

    Args:
        df: Frame holding at least the ``predictionLabels`` columns.
        type: Ensemble selector -- ``0`` for ``StackingRegressor``, ``1`` for
            ``VotingRegressor``.

    Returns:
        dict: ``{"Y": ..., "X": ...}`` where ``Y`` is an array of the last
        row's current label values followed by the predicted next values,
        and ``X`` is an array of the last index followed by that index + 1.

    Raises:
        ValueError: If ``type`` is neither 0 nor 1.
    """
    # Validate first: previously an unsupported value left `regressor`
    # unbound and only failed (UnboundLocalError) after df had been mutated.
    if type not in (0, 1):
        raise ValueError(
            "type must be 0 (stacking) or 1 (voting), got {!r}".format(type))

    # Create a column holding the next value of each label (the targets).
    nextStrings = []
    for label in predictionLabels:
        nextString = "next" + str(label)
        df[nextString] = df[label].shift(-1)
        nextStrings.append(nextString)

    X_pred = df[-1:].drop(nextStrings, axis=1)  # last row: prediction input
    df = df[0:-1]                               # all but the last row train
    X = df.drop(nextStrings, axis=1)            # features without targets
    y = df[nextStrings]                         # targets

    estimators = [
        ('r1', LinearRegression(n_jobs=-1)),
        ('r2', tree.DecisionTreeRegressor()),
        ('r3', ensemble.RandomForestRegressor(n_jobs=-1)),
    ]
    if type == 0:
        regressor = ensemble.StackingRegressor(
            estimators=estimators,
            final_estimator=ensemble.RandomForestRegressor(
                n_estimators=100, random_state=42, n_jobs=-1),
        )
    else:
        regressor = ensemble.VotingRegressor(estimators=estimators)

    print("I got here!")
    regressor = RegressorChain(regressor)
    regressor.fit(X, y)  # training the algorithm

    # Prepend the current values so the result is [current, predicted].
    y_pred = list(regressor.predict(X_pred))
    y_pred.insert(0, X_pred.iloc[0][predictionLabels])
    y_pred = np.asarray(y_pred)

    # Matching x-axis: the last index and the one after it.
    x_predTime = list(X_pred.index)
    x_predTime.append(x_predTime[0] + 1)
    x_predTime = np.asarray(x_predTime)

    print(y_pred)
    print(x_predTime)
    return {"Y": y_pred, "X": x_predTime}
def test_base_chain_fit_and_predict_with_sparse_data_and_cv():
    """Chains built with ``cv=3`` must fit and predict on CSR input."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)
    base_chains = (
        ClassifierChain(LogisticRegression(), cv=3),
        RegressorChain(Ridge(), cv=3),
    )
    for base_chain in base_chains:
        base_chain.fit(X_sparse, Y)
        # Predictions must come back with the same shape as the targets.
        assert_equal(base_chain.predict(X_sparse).shape, Y.shape)
def test_regressor_chain_w_fit_params():
    """Fit parameters given to the chain must reach every sub-estimator."""
    rng = np.random.RandomState(0)
    X, y = datasets.make_regression(n_targets=3)
    weight = rng.rand(y.shape[0])

    class MySGD(SGDRegressor):
        """SGDRegressor that records the sample_weight it was fitted with."""

        def fit(self, X, y, **fit_params):
            self.sample_weight_ = fit_params['sample_weight']
            super().fit(X, y, **fit_params)

    model = RegressorChain(MySGD())

    # Fit with the extra parameter passed straight through as a keyword.
    model.fit(X, y, sample_weight=weight)

    # Each fitted link must have seen the very same weight object.
    for est in model.estimators_:
        assert est.sample_weight_ is weight
def chainregressor(X, Y):
    """Cross-validate several base regressors wrapped in ``RegressorChain``.

    Each estimator is evaluated with a shuffled 5-fold split. The mean of
    the per-fold mean squared errors and of the per-fold R^2 scores is
    collected per estimator name, and both dictionaries are printed.

    Args:
        X: Feature array; must support integer-array row indexing
            (``X[train_index]``) and expose ``shape``.
        Y: Target array indexed the same way as ``X``.

    Returns:
        None. Results are only printed.
    """
    estimators = {
        "Extra trees + chain": ExtraTreesRegressor(n_estimators=10,
                                                   max_features=X.shape[1],
                                                   random_state=0),
        "K-nn + chain": KNeighborsRegressor(),
        "Linear regression + chain": LinearRegression(),
        "Ridge + chain": RidgeCV(),
    }

    # No random_state is set, so the fold assignment differs between runs.
    kf = KFold(n_splits=5, shuffle=True)

    meansquared_error_es = {}
    r2score_es = {}
    for name, base_estimator in estimators.items():
        chain = RegressorChain(base_estimator)
        fold_mse = []
        fold_r2 = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]
            chain.fit(X_train, y_train)
            y_pred = chain.predict(X_test)
            fold_mse.append(mean_squared_error(y_test, y_pred))
            fold_r2.append(r2_score(y_test, y_pred))
        meansquared_error_es[name] = statistics.mean(fold_mse)
        r2score_es[name] = statistics.mean(fold_r2)

    print(meansquared_error_es)
    print(r2score_es)
def test_sklearn_regressor_chain(self):
    """Converted RegressorChain models must reproduce sklearn predictions."""
    base_classes = (
        DecisionTreeRegressor,
        ExtraTreesRegressor,
        RandomForestRegressor,
        LinearRegression,
    )
    for n_targets in (2, 3, 4):
        for model_class in base_classes:
            # Random chain order over all targets.
            order = list(range(n_targets))
            random.shuffle(order)
            model = RegressorChain(model_class(), order=order)

            X, y = datasets.make_regression(
                n_samples=50,
                n_features=10,
                n_informative=5,
                n_targets=n_targets,
                random_state=2021,
            )
            X = X.astype('float32')
            y = y.astype('float32')
            model.fit(X, y)

            torch_model = hummingbird.ml.convert(model, "torch")
            self.assertTrue(torch_model is not None)
            expected = model.predict(X)
            np.testing.assert_allclose(expected, torch_model.predict(X),
                                       rtol=1e-5, atol=1e-5)
def test_base_chain_random_order():
    """A randomly ordered chain must match a fixed chain using that order.

    Each chain is fitted once with ``order="random"`` and once with the
    order the random chain ended up with; both must expose the same
    ``order_`` and learn the same coefficients.
    """
    X, Y = generate_multilabel_dataset_with_correlations()
    for chain in [ClassifierChain(LogisticRegression()),
                  RegressorChain(Ridge())]:
        chain_random = clone(chain).set_params(order="random",
                                               random_state=42)
        chain_random.fit(X, Y)
        chain_fixed = clone(chain).set_params(order=chain_random.order_)
        chain_fixed.fit(X, Y)
        assert_array_equal(chain_fixed.order_, chain_random.order_)
        # Check the *fitted* permutation. The `order` parameter is still the
        # string "random" here, so comparing it with range(4) could never
        # fail and verified nothing.
        assert list(chain_random.order_) != list(range(4))
        assert len(chain_random.order_) == 4
        assert len(set(chain_random.order_)) == 4
        # Randomly ordered chain should behave identically to a fixed order
        # chain with the same order.
        for est1, est2 in zip(chain_random.estimators_,
                              chain_fixed.estimators_):
            assert_array_almost_equal(est1.coef_, est2.coef_)
def test_base_chain_fit_and_predict():
    """Fit both chain types and check prediction shape and link sizes."""
    X, Y = generate_multilabel_dataset_with_correlations()
    regressor_chain = RegressorChain(Ridge())
    classifier_chain = ClassifierChain(LogisticRegression())

    # Every link sees the original features plus one extra column per
    # earlier link, so coefficient counts grow by one along the chain.
    n_features, n_labels = X.shape[1], Y.shape[1]
    expected_sizes = list(range(n_features, n_features + n_labels))

    for chain in (regressor_chain, classifier_chain):
        chain.fit(X, Y)
        Y_pred = chain.predict(X)
        assert Y_pred.shape == Y.shape
        assert [c.coef_.size for c in chain.estimators_] == expected_sizes

    # Y_pred now holds the classifier chain's output (last loop iteration);
    # thresholding its probabilities at 0.5 must give the same labels.
    Y_prob = classifier_chain.predict_proba(X)
    Y_binary = Y_prob >= .5
    assert_array_equal(Y_binary, Y_pred)

    assert isinstance(classifier_chain, ClassifierMixin)
def test_base_chain_crossval_fit_and_predict():
    """Chains fitted with ``cv=3`` must predict well but differ from plain fits."""
    X, Y = generate_multilabel_dataset_with_correlations()
    chains = (ClassifierChain(LogisticRegression()), RegressorChain(Ridge()))

    for chain in chains:
        chain.fit(X, Y)
        chain_cv = clone(chain).set_params(cv=3)
        chain_cv.fit(X, Y)

        Y_pred_cv = chain_cv.predict(X)
        Y_pred = chain.predict(X)

        # Same shape, but not the exact same predictions.
        assert Y_pred_cv.shape == Y_pred.shape
        assert not np.all(Y_pred == Y_pred_cv)

        # Quality threshold depends on the kind of chain.
        if isinstance(chain, ClassifierChain):
            assert jaccard_score(Y, Y_pred_cv, average="samples") > 0.4
            continue
        assert mean_squared_error(Y, Y_pred_cv) < 0.25
def __init__(self, fl, max_depth=8, num_est=300, chain=False):
    """
    Build a new multi-output boosted decision-tree regressor.

    :param fl: fl class; supplies labels_dim, labels_scaler and normalise_labels
    :param max_depth: max depth of each tree
    :param num_est: number of estimators in the AdaBoost ensemble
    :param chain: regressor chain (True) or independent multi-output (False)
    """
    self.labels_dim = fl.labels_dim
    self.labels_scaler = fl.labels_scaler

    # Same boosted-tree base model either way; only the wrapper differs.
    boosted_trees = AdaBoostRegressor(
        DecisionTreeRegressor(max_depth=max_depth), n_estimators=num_est)
    wrapper_cls = RegressorChain if chain else MultiOutputRegressor
    self.model = wrapper_cls(boosted_trees)

    self.normalise_labels = fl.normalise_labels
def test_sklearn_regressor_chain(self):
    """Converted RegressorChain models must match sklearn for a random seed.

    The seed drives the chain order, the estimator (where it accepts one)
    and the dataset, and is reported in the failure message.
    """
    base_classes = (DecisionTreeRegressor, ExtraTreesRegressor,
                    RandomForestRegressor, LinearRegression)
    for n_targets in (2, 3, 4):
        for model_class in base_classes:
            seed = random.randint(0, 2**32 - 1)
            order = list(range(n_targets))
            random.Random(seed).shuffle(order)

            # LinearRegression takes no random_state argument.
            estimator_kwargs = {}
            if model_class != LinearRegression:
                estimator_kwargs["random_state"] = seed
            model = RegressorChain(model_class(**estimator_kwargs),
                                   order=order)

            X, y = datasets.make_regression(
                n_samples=50,
                n_features=10,
                n_informative=5,
                n_targets=n_targets,
                random_state=seed,
            )
            X = X.astype("float32")
            y = y.astype("float32")
            model.fit(X, y)

            torch_model = hummingbird.ml.convert(
                model, "torch",
                extra_config={constants.TREE_OP_PRECISION_DTYPE: "float64"})
            self.assertTrue(torch_model is not None)
            np.testing.assert_allclose(
                model.predict(X),
                torch_model.predict(X),
                rtol=1e-4,
                atol=1e-4,
                err_msg="{}/{}/{}".format(n_targets, model_class, seed),
            )
kneighborsregressor=KNeighborsRegressor, mlpregressor=MLPRegressor, stackingregressor=StackingRegressor, mlxtendstackingregressor=StackingRegressor, mlxtendstackingcvregressor=StackingCVRegressor, votingregressor=VotingRegressor) _ESTIMATOR_DICT = dict(regression=_REGRESSOR_DICT) _KERNEL_DICT = dict(dotproduct=DotProduct, rbf=RBF, whitekernel=WhiteKernel) # We need to identify chaining as the fit method capitilizes # the target Y, whereas other estimators conventionally write # the target as y. _CHAIN_FLAG = [ RegressorChain(base_estimator=DummyRegressor()).__class__, ClassifierChain(base_estimator=DummyClassifier()).__class__ ] # We need to identify CatBoost as the predict method utilizes the # parameter data for the conventional design matrix parameter X. _CATBOOST_FLAG = cat.CatBoostRegressor().__class__ _MULTI_TARGET = ['continuous-multioutput', 'multiclass-multioutput'] _OPTIMIZE_METHOD = [ 'Nelder-Mead', 'Powell', 'CG', 'BFGS', 'Newton-CG', 'L-BFGS-B', 'TNC', 'COBYLA', 'SLSQP', 'trust-constr', 'dogleg', 'trust-ncg', 'trust-exact', 'trust-krylov', 'custom' ]
print('test Score: ', rf.score(X_test, rf_y_test))


# In[ ]:


# Chained models for each output (RegressorChain)
# https://machinelearningmastery.com/multi-output-regression-models-with-python/
# Single-output regressors can also serve multi-output regression when
# arranged as a linear sequence: the first model predicts one output from
# the input, the second uses the input plus the first model's output, the
# third uses the input plus the first two outputs, and so on.
from sklearn.multioutput import RegressorChain

wrapper = RegressorChain(rf)
wrapper.fit(X_train, y_train)
rf_y_test_pred = wrapper.predict(X_test)

# Show the first five predictions, raw and then truncated to integers;
# the integer version is the one that is kept.
print(rf_y_test_pred[0:5])
rf_y_test_pred = rf_y_test_pred.astype('int')
print(rf_y_test_pred[0:5])


# In[ ]:


# Same summary using the random forest's own predict method on the test data
rf_y_test_pred = rf.predict(X_test)
print(rf_y_test_pred[0:5])
rf_y_test_pred = rf_y_test_pred.astype('int')
print(rf_y_test_pred[0:5])
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Transformer classes selectable by name.
POTENTIAL_TRANSFORMER = {
    'OneHotEncoder': ce.OneHotEncoder,
    'OrdinalEncoder': ce.OrdinalEncoder,
    'TargetEncoder': ce.TargetEncoder,
    'JamesSteinEncoder': ce.JamesSteinEncoder,
    'MinMaxScaler': MinMaxScaler,
    'StandardScaler': StandardScaler
}

# Models selectable by name.
# NOTE(review): 'XGBRegressorChain' maps to a RegressorChain *instance*
# wrapping the XGBRegressor *class*, while the other entries are classes.
# _initialize_model calls its first argument, so confirm the caller treats
# this entry separately.
POTENTIAL_MODELS = {
    'XGBRegressor': XGBRegressor,
    'XGBRegressorChain': RegressorChain(XGBRegressor),
    'ExtraTreesRegressor': ExtraTreesRegressor,
}


def _initialize_model(fc_model, params):
    """Instantiate ``fc_model`` with the subset of ``params`` it accepts.

    Args:
        fc_model: Callable returning an estimator that has ``get_params()``.
        params: Mapping of candidate keyword arguments. Keys missing from
            the estimator's ``get_params()`` are dropped.

    Returns:
        The estimator built as ``fc_model(**accepted_params)``.
    """
    # Build the throw-away default instance once rather than once per key.
    accepted = fc_model().get_params()
    initialized_params = {
        key: value for key, value in params.items() if key in accepted
    }
    return fc_model(**initialized_params)


def get_model(params):
    params = params if params else {}
imputer = Imputer(missing_values='NaN', strategy='mean') #imputer = imputer.fit(dataset) dataset_r8 = imputer.fit_transform(dataset) from sklearn.preprocessing import MinMaxScaler sc_X = MinMaxScaler() dataset_r8 = sc_X.fit_transform(dataset_r8) dataset_r8 = pd.DataFrame(dataset_r8) X = dataset_r8.iloc[:, :4] y = dataset_r8.iloc[:, 4:] # define base model model = LinearSVR(max_iter=5000) #model = LinearRegression() # define the chained multioutput wrapper model wrapper = RegressorChain(model) # fit the model on the whole dataset #wrapper.fit(X, y) # make a single prediction from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1) # fit model wrapper.fit(X_train, y_train) y_pred = wrapper.predict(X_test) from sklearn.metrics import mean_squared_error rmse = (mean_squared_error(y_true=y_test,
'forest__max_features': [50, 100, 150, 200, 250, None], } elif multi_model == 'DecisionTree': # Decision tree regressor (supports multi-output natively) tree = DecisionTreeRegressor() # Creating a pipeline pipe = Pipeline(steps=[('tree', tree)]) param_dists = { 'tree__criterion': ['mse', 'mae'], 'tree__max_depth': [2, 4, 6, 8, None], 'tree__min_samples_leaf': stats.randint(low=1, high=9), } elif multi_model == 'ChainSVR': # Support Vector Regression (does NOT support multi-output natively) # Building a regressor chain from the SVM base estimators. svr = RegressorChain(base_estimator=SVR(kernel='rbf', cache_size=512)) # Creating a pipeline pipe = Pipeline(steps=[('preprocess', 'passthrough'), ('svr', svr)]) # Parameters of pipeline for the randomized search with cross-validation param_dists = { 'preprocess': [None, StandardScaler()], 'svr__base_estimator__C': stats.loguniform(1e0, 1e3), 'svr__base_estimator__epsilon': stats.loguniform(1e-5, 1e-2), 'svr__base_estimator__gamma': ['scale', 'auto'], } elif multi_model == 'MultiSVR': # Support Vector Regression (does NOT support multi-output natively) # Creating a multi-output regressor from the SVR base estimators. svr = MultiOutputRegressor(estimator=SVR(kernel='rbf')) # Creating a pipeline pipe = Pipeline(steps=[('preprocess', 'passthrough'), ('svr', svr)])
alpha_g_fit=alpha_g_fit, alpha_g_split=alpha_g_split, g_cap=g_cap, realize_iter=realize_iter, median_predict=median_predict, bij_novelty=bij_novelty) elif estimator_name == 'TBAB': regressor = AdaBoostRegressor(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, loss=loss, random_state=seed, bij_novelty=bij_novelty) elif estimator_name == 'TBRC': regressor = RegressorChain(base_estimator=base_estimator, order=order, cv=cv, random_state=seed) else: regressor = base_estimator t0 = t.time() regressor.fit(x_train, y_train, tb=tb_train) t1 = t.time() print('\nFinished DecisionTreeRegressor in {:.4f} s'.format(t1 - t0)) # Save estimator dump(regressor, case.resultPaths[time] + estimator_name + '.joblib') score_test = regressor.score(x_test, y_test, tb=tb_test) score_train = regressor.score(x_train, y_train, tb=tb_train)
def training(X_train, Y_train, flag, combine_type="multi_output"):
    """Fit one of the supported regressors inside a multi-output wrapper.

    :param X_train: training features passed to ``fit``
    :param Y_train: training targets passed to ``fit``
    :param flag: integer 0-22 selecting the base regressor (see the table
        in the function body)
    :param combine_type: "multi_output" or "chain"
    :return: tuple ``(fitted_wrapper, base_regressor_class_name)``
    :raises Exception: if ``combine_type`` is not supported
    :raises ValueError: if ``flag`` does not select a known regressor
    """
    # Factories are lambdas so only the selected estimator is constructed.
    # NOTE(review): loss="squared_loss" was renamed "squared_error" in newer
    # scikit-learn releases -- confirm against the installed version.
    factories = {
        0: lambda: KernelRidge(),
        1: lambda: LinearSVR(),
        2: lambda: SVR(),
        3: lambda: NuSVR(),
        4: lambda: LinearRegression(),
        5: lambda: Ridge(),
        6: lambda: Lasso(),
        7: lambda: ElasticNet(),
        8: lambda: Lars(),
        9: lambda: LassoLars(),
        10: lambda: BayesianRidge(),
        11: lambda: SGDRegressor(loss="squared_loss"),
        12: lambda: SGDRegressor(loss="huber"),
        13: lambda: SGDRegressor(loss="epsilon_insensitive"),
        14: lambda: KNeighborsRegressor(),
        15: lambda: GaussianProcessRegressor(),
        16: lambda: DecisionTreeRegressor(),
        17: lambda: RandomForestRegressor(n_estimators=500),
        18: lambda: ExtraTreesRegressor(n_estimators=500),
        19: lambda: BaggingRegressor(n_estimators=500),
        20: lambda: AdaBoostRegressor(n_estimators=500),
        21: lambda: GradientBoostingRegressor(n_estimators=500),
        22: lambda: HistGradientBoostingRegressor(),
    }

    # Checked first so an unsupported combine_type keeps raising the same
    # exception as before, whatever the flag is.
    if combine_type not in ("multi_output", "chain"):
        raise Exception("Unimplemented!!!")

    # An unknown flag used to wrap None and fail deep inside scikit-learn.
    try:
        build_estimator = factories[flag]
    except (KeyError, TypeError):
        raise ValueError(
            "Unknown regressor flag: {!r} (expected 0-22)".format(flag)
        ) from None

    algo_clf = build_estimator()
    if combine_type == "multi_output":
        clf = MultiOutputRegressor(algo_clf).fit(X_train, Y_train)
    else:
        clf = RegressorChain(algo_clf).fit(X_train, Y_train)
    return clf, algo_clf.__class__.__name__
# Feature columns (alternative set tried: ['hour', 'minute', 'weekday'])
feature_cols = ['day', 'hour', 'minute', 'weekday']
X = dataframe.loc[:, feature_cols]

# Target: every AP column when --all_ap is set, otherwise a single AP.
if args.all_ap:
    print("Multi-label")
    y = dataframe.loc[:, "AcadBldg10AP1":"SocBldg9AP1"]
else:
    print("One label")
    y = dataframe.AcadBldg18AP2
# print(dataframe.head(-1))

# Optionally wrap the base model for multi-target prediction.
# NOTE(review): `mode` is not assigned on the raw-model path -- confirm it
# is not read afterwards.
if args.chained:
    print("Using Regressor Chain")
    model = RegressorChain(model, random_state=1)
    mode = "rc"
elif args.binary:
    print("Using Binary Relevance")
    model = MultiOutputRegressor(model)
    mode = "br"
else:
    print("Using Raw Model")

timeSeriesSplitCV(model, X, y, args.split_num, args.show_plot)

# Scatter plot (disabled):
# real = y.tolist()
# fig, ax = plt.subplots()
# ax.scatter(real, predicted)
print(f'Result: {mean(n_scores):.3f}, ({std(n_scores):.3f})')

# Wrappers for algorithms without native multi-output regression support.
# Example base model: a linear support vector machine.
from sklearn.svm import LinearSVR

model = LinearSVR()
# Fitting it directly does not work:
# model.fit(X, y)  # ValueError: bad input shape

# Option 1: MultiOutputRegressor -- a separate model for each output.
# Works well when the outputs are independent or mostly independent.
from sklearn.multioutput import MultiOutputRegressor

wrapper = MultiOutputRegressor(model)
wrapper.fit(X, y)
yhat = wrapper.predict(data_in)
print('MultiOutputRegressor with Support Vector Regressor Model')
print(yhat[0])

# Option 2: RegressorChain -- the chain order can be set on the wrapper.
from sklearn.multioutput import RegressorChain

model = LinearSVR()
wrapper = RegressorChain(model)
wrapper.fit(X, y)
yhat = wrapper.predict(data_in)
print('RegressorChain with Support Vector Regressor Model')
print(yhat[0])
# In[ ]: # Using best hyper-parameters from the single-step ahead regression multi_svr = MultiOutputRegressor(estimator=SVR(kernel='rbf', **svr.best_params_), n_jobs=-1) multi_svr.fit(X_train.values, y_train.values) # In[ ]: # A multi-step model that arranges regressions into a chain. Each model makes a prediction # in the order specified by the chain (i.e. order of columns in the target matrix) using # all of the available features provided to the model plus the predictions of models that # are earlier in the chain. Order of columns is arranged by time-lags. Base model is SVM! chain_svr = RegressorChain(base_estimator=SVR(kernel='rbf', **svr.best_params_)) chain_svr.fit(X_train.values, y_train.values) # ## DecisionTree multi-step regressor # In[ ]: # DecisionTreeRegressor supports multi-step output out-of-the-box! # Grid search with cross-validation parameters = [{'criterion':['mse', 'mae'], 'max_depth':[1, 5, None], 'max_features':['auto', 'log2', 0.5], 'max_leaf_nodes':[2, None]}] tree = GridSearchCV(estimator=DecisionTreeRegressor(),
def fit(self, X, y, sample_weight=None):
    """Build ``n_chains`` randomly ordered regressor chains and fit them.

    Args:
        X: Training features, passed to ``_pre_fit`` and to every chain.
        y: Training targets, passed to ``_pre_fit`` and to every chain.
        sample_weight: Not supported; must be left as ``None``.

    Returns:
        self

    Raises:
        NotImplementedError: If ``sample_weight`` is given.
        AttributeError: If no chain was created.
        ValueError: If ``weights`` does not match the number of chains, or
            every estimator is None / "drop".
    """
    if sample_weight is not None:
        raise NotImplementedError()

    # Default base model when none was configured.
    base_estimator = self.base_estimator
    if base_estimator is None:
        base_estimator = BoostedRegressor([
            LinearRegression(frame_out=True),
            AGPR(n_restarts_optimizer=250),
        ])

    # Reduce random_state to an integer seed (or None): ints are used
    # as-is, anything else non-None is asked for one random integer.
    seed_source = self.random_state
    if isinstance(seed_source, int):
        seed = seed_source
    elif seed_source is not None:
        seed = seed_source.randint(2**30)
    else:
        seed = None

    # One clone of the template per chain, each with its own seed offset.
    self.estimators = []
    template = RegressorChain(base_estimator, order='random', cv=self.cv)
    for idx in range(self.n_chains):
        chain = clone(template)
        if seed is not None:
            chain.random_state = seed + idx
        self.estimators.append((f'chain{idx}', chain))

    self._pre_fit(X, y)

    # Common fit operations.
    if self.estimators is None or len(self.estimators) == 0:
        raise AttributeError('Invalid `estimators` attribute, `estimators`'
                             ' should be a list of (string, estimator)'
                             ' tuples')

    if self.weights is not None:
        if len(self.weights) != len(self.estimators):
            raise ValueError('Number of `estimators` and weights must be equal'
                             '; got %d weights, %d estimators'
                             % (len(self.weights), len(self.estimators)))

    names, _ = zip(*self.estimators)
    self._validate_names(names)

    # The list is non-empty here, so "all dropped" equals the count check.
    if all(est in (None, 'drop') for _, est in self.estimators):
        raise ValueError(
            'All estimators are None or "drop". At least one is required!')

    # Fit a fresh clone of every chain sequentially.
    self.estimators_ = []
    for _, est in self.estimators:
        self.estimators_.append(clone(est).fit(X, y))

    # Expose the fitted chains by name.
    self.named_estimators_ = Bunch()
    for (est_name, _), fitted in zip(self.estimators, self.estimators_):
        self.named_estimators_[est_name] = fitted
    return self
y = df[target_columns].values

# Target order used by the chained K-nn model.
chain_order = [4, 3, 1, 2, 0]

# Candidate models, keyed by display name.
estimators = {
    "K-nn": KNeighborsRegressor(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(random_state=0),
    "MultiO/P AdaB": MultiOutputRegressor(
        AdaBoostRegressor(n_estimators=5)),
    "RegChain K-nn": RegressorChain(
        KNeighborsRegressor(), order=chain_order),
    "RandomForestRegressor": RandomForestRegressor(max_depth=4),
    "Decision Tree Regressor": DecisionTreeRegressor(max_depth=5),
    "Extra trees": ExtraTreesRegressor(n_estimators=10),
    "MultiO/P GBR": MultiOutputRegressor(
        GradientBoostingRegressor(n_estimators=5)),
}

# Score every candidate with 3 folds, then sort by mean score.
scores_df = defined_cross_val_score(
    x, y, estimators, n_folds=3, std_threshold=None
).sort_values(by='mean_score')
def train(self):
    """Fit a Lasso-based ``RegressorChain`` and cache its training output."""
    # The chain wraps a fresh Lasso with default hyper-parameters.
    self.model = RegressorChain(Lasso())
    self.model.fit(self.train_x, self.train_y)
    # Predictions on the training data itself are kept on the instance.
    self.train_output = self.model.predict(self.train_x)