def voting_compile_fit(self):
    # This function compiles and fits the VotingRegressor.
    prev_mse = 0
    i = 0
    # We fit n times and keep the best-scoring VotingRegressor.
    while i < self.n_repetition:
        if i == 0:
            self.voting_reg = VotingRegressor(estimators=self.reg_models)
            self.voting_reg.fit(self.X_train, self.y_train.values.ravel())
            y_pred = self.voting_reg.predict(self.X_test)
            prev_mse = mean_squared_error(self.y_test, y_pred)
            print(i + 1, ". ", "Voting_reg", prev_mse / 1000000)
        else:
            current_reg = VotingRegressor(estimators=self.reg_models)
            current_reg.fit(self.X_train, self.y_train.values.ravel())
            y_pred = current_reg.predict(self.X_test)
            mse = mean_squared_error(self.y_test, y_pred)
            print(i + 1, ". ", "Voting_reg", mse / 1000000)
            if mse < prev_mse:
                self.voting_reg = current_reg
                prev_mse = mse
        i += 1
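# A standalone sketch of the same repeat-and-keep-best idea on synthetic data;
# every name below is illustrative and independent of the class above. Note
# that repetitions only differ when the base estimators are stochastic (here
# the forest's random_state is varied on purpose).
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

best_reg, best_mse = None, np.inf
for rep in range(3):
    candidate = VotingRegressor([
        ('rf', RandomForestRegressor(n_estimators=50, random_state=rep)),
        ('lr', LinearRegression()),
    ])
    candidate.fit(X_train, y_train)
    mse = mean_squared_error(y_test, candidate.predict(X_test))
    if best_reg is None or mse < best_mse:
        best_reg, best_mse = candidate, mse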
def test_weights_regressor():
    """Check weighted average regression prediction on boston dataset."""
    reg1 = DummyRegressor(strategy='mean')
    reg2 = DummyRegressor(strategy='median')
    reg3 = DummyRegressor(strategy='quantile', quantile=.2)
    ereg = VotingRegressor([('mean', reg1), ('median', reg2),
                            ('quantile', reg3)], weights=[1, 2, 10])

    X_r_train, X_r_test, y_r_train, y_r_test = \
        train_test_split(X_r, y_r, test_size=.25)

    reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
    reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
    reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
    ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)

    avg = np.average(np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0,
                     weights=[1, 2, 10])
    assert_almost_equal(ereg_pred, avg, decimal=2)

    ereg_weights_none = VotingRegressor([('mean', reg1), ('median', reg2),
                                         ('quantile', reg3)], weights=None)
    ereg_weights_equal = VotingRegressor([('mean', reg1), ('median', reg2),
                                          ('quantile', reg3)],
                                         weights=[1, 1, 1])
    ereg_weights_none.fit(X_r_train, y_r_train)
    ereg_weights_equal.fit(X_r_train, y_r_train)
    ereg_none_pred = ereg_weights_none.predict(X_r_test)
    ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
    assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
def create_model(tp='rf', rand=0):
    """Initialize a machine-learning model.

    Parameters:
        tp - machine-learning approach: 'rf', 'gb', 'nn', 'voting',
             'tree' or 'lm'
        rand - random seed passed to the underlying estimator

    Returns a sklearn model wrapped in a DeepChem SklearnModel.
    """
    if tp == 'rf':
        sklearn_model = RandomForestRegressor(random_state=rand,
                                              n_estimators=500)
    elif tp == 'lm':
        sklearn_model = LinearRegression()
    elif tp == 'tree':
        sklearn_model = DecisionTreeRegressor(random_state=rand, max_depth=10)
    elif tp == 'gb':
        sklearn_model = GradientBoostingRegressor(random_state=rand,
                                                  n_estimators=500)
    elif tp == 'nn':
        sklearn_model = MLPRegressor(random_state=rand, max_iter=500,
                                     hidden_layer_sizes=(500, ))
    elif tp == 'voting':
        reg1 = GradientBoostingRegressor(random_state=rand, n_estimators=500)
        reg2 = RandomForestRegressor(random_state=rand, n_estimators=500)
        reg3 = DecisionTreeRegressor(random_state=rand, max_depth=10)
        sklearn_model = VotingRegressor(
            estimators=[('gb', reg1), ('rf', reg2), ('tree', reg3)])
    else:
        print('Wrong model type!!!')
        return []
    if tp == 'nn':
        model = dc.models.SklearnModel(sklearn_model, use_weights=False)
    else:
        model = dc.models.SklearnModel(sklearn_model)
    return model
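# Hypothetical usage of the factory above (the dc prefix assumes DeepChem is
# imported as dc elsewhere in this module); 'voting' averages the gb, rf and
# tree regressors into one model.
voting_model = create_model(tp='voting', rand=42)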
def reg_fit_predict(self, x_train, x_test, y_train, y_test, est_name,
                    report_flg=True):
    if est_name == 'vote':
        if len(self.regression_estimators) > 1:
            print(self.regression_estimators.items())
            model = VotingRegressor(
                estimators=list(self.regression_estimators.items()))
        else:
            print('Caution: No models')
            return
    else:
        model = self.base_regression_estimators[est_name]
    model.fit(x_train, y_train)
    # predict test data
    y_pred = model.predict(x_test)
    # report scores
    if report_flg:
        self.reg_score_report(y_test, y_pred)
    # add model to dict
    self.regression_estimators[est_name] = model
    return
def rainfall_runoff(precip_file, delineated_file, discharge_file, plot_fname):
    # Precipitation data and delineated watershed data are given as input;
    # both inputs must be .mat files.
    precip_mat = loadmat(precip_file)['basin_daily_precipitation']
    basin_mat_delineated = loadmat(delineated_file)['basin_mat_delineated']
    # Read discharge data from an .xls input.
    discharge_df = pd.ExcelFile(discharge_file)
    discharge_df = discharge_df.parse(0)
    discharge_df = discharge_df.fillna(0)  # replace the NaN values with 0s
    basin_num = 5
    reg1 = RandomForestRegressor(n_estimators=100, random_state=42)
    reg4 = BaggingRegressor(n_estimators=100, random_state=50)
    voting_reg = VotingRegressor([('br', reg4), ('rf', reg1)])
    X, y = get_data(discharge_df, basin_num, precip_mat,
                    basin_mat_delineated, False)
    voting_reg.fit(X, y)
    y_pred = voting_reg.predict(X)
    plt.scatter(y_pred, y_pred - y, c='r')
    plt.title("Runoff prediction data using a voting-regressor")
    plt.xlabel("Predicted Output")
    plt.ylabel("Error in prediction")
    print(plot_fname)
    plt.savefig(plot_fname)
def voting():
    # dtr model
    tuned_parameters = [{
        'criterion': ['mse', 'mae'],
        'max_depth': np.arange(1, 10)
    }]
    dtr = GridSearchCV(DecisionTreeRegressor(), tuned_parameters, cv=5)
    # rfr model
    tuned_parameters = {
        'min_samples_split': [3, 6, 9],
        'n_estimators': [10, 50, 100]
    }
    rfr = GridSearchCV(RandomForestRegressor(), param_grid=tuned_parameters,
                       cv=5)
    # build voting model
    voting_reg = VotingRegressor(estimators=[('dtr_reg', dtr),
                                             ('rfr_reg', rfr)],
                                 weights=[1, 2])
    # fit the model using the training data
    voting_reg.fit(X_train, Y_train)
    # print the R^2 of the testing predictions (for a regressor, score()
    # returns the coefficient of determination, not classification accuracy)
    train_score = voting_reg.score(X_test, Y_test)
    print("R^2 score for final voting = " + str(round(train_score, 4)))
def voting_test():
    # create_holdout_data(outfile='./data/holdout_split.pkl')
    x_train, x_holdout, y_train, y_holdout = create_holdout_data(
        ratio=.10,
        seed=13,
        targets='distress_TQ',
    )
    estimators = [
        ('svm', SVR(kernel='rbf')),
        ('etree', ExtraTreesRegressor(n_estimators=1000, criterion='mae',
                                      random_state=13)),
        # ('gb', GradientBoostingRegressor())
    ]
    # Sub-estimator parameters are addressed as '<estimator name>__<param>'.
    params = {
        # 'svm__kernel': ('linear', 'rbf'),
        'svm__C': (1, 10, 100, 100),
        'svm__gamma': (1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1),
        # 'svm__degree': (2, 3, 4, 5),
        # 'etree__n_estimators': (100, 500, 1000),
        # 'etree__criterion': ('mse', 'mae'),
    }
    reg = VotingRegressor(estimators=estimators)
    grid = GridSearchCV(estimator=reg, param_grid=params, cv=3, verbose=2)
    grid.fit(x_train, y_train)
    print(grid.best_params_)

    gridfile = './data/distress_TQ_VotingRegressor_GridSearchCV.pkl'
    with open(gridfile, 'wb') as file:
        pkl.dump(grid, file)
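# Minimal self-contained version of the pattern above, assuming only
# scikit-learn and synthetic data: GridSearchCV tunes parameters inside a
# VotingRegressor through the '<name>__<param>' convention.
from sklearn.datasets import make_regression
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
vote = VotingRegressor([('svm', SVR()), ('tree', DecisionTreeRegressor())])
search = GridSearchCV(vote,
                      param_grid={'svm__C': (0.1, 1, 10),
                                  'tree__max_depth': (2, 4)},
                      cv=3)
search.fit(X, y)
print(search.best_params_)  # e.g. {'svm__C': 10, 'tree__max_depth': 4}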
def trail_main():
    n_folds = 10
    train_path = 'data/assign3_students_train.txt'
    test_path = 'data/assign3_students_test.txt'
    train_data = read_process_data(train_path)
    test_data = read_process_data(test_path)
    models_dict = get_models()
    scores_dict = {}
    learned_models_dict = {}
    for df_key, df_val in train_data.items():
        X_train, X_test, y_train, y_test = get_final_score_tts(
            df_val.copy(), test_data[df_key].copy(), n_best=15)
        voting_list = []
        for model_key, model_val in models_dict.items():
            model = model_val.fit(X_train, y_train)
            name = f'{df_key}_{model_key}'
            learned_models_dict[name] = model
            voting_list.append((name, model))
            # print(f"{name}, Train MSE ",
            #       mean_squared_error(y_train, model.predict(X_train)))
            # print(f"{name}, Train RScore ",
            #       r2_score(y_train, model.predict(X_train)))
            # print(f"{name}, Test RScore ",
            #       r2_score(y_test, model.predict(X_test)))
            print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
            print(f"{name}, Test MSE ",
                  mean_squared_error(y_test, model.predict(X_test)))
            print(f"{name}, Test Score", model.score(X_test, y_test))
            print('=' * 75, '\n')
        model = VotingRegressor(voting_list)
        model = model.fit(X_train, y_train)
        print('=' * 75, '\n')
        print(f"{df_key}, Voting Test MSE = ",
              mean_squared_error(y_test, model.predict(X_test)))
        print(f"{df_key}, Voting Test Score", model.score(X_test, y_test))
        print('=' * 75, '\n\n')
def model_fit_save(train_x, train_y, test_x, test_y):
    ## Training the model
    r1 = LinearRegression()
    # r2 = RandomForestRegressor(n_estimators=10, random_state=1)
    r3 = SVR(kernel='rbf')
    er = VotingRegressor([
        ('lr', r1),
        # ('rf', r2),
        ('svr_rbf', r3)
    ])
    er.fit(train_x, train_y)

    ## Evaluating on the test data
    y_pred = er.predict(test_x)
    print('Mean Absolute Error:', mean_absolute_error(test_y, y_pred))
    print('Mean Squared Error:', mean_squared_error(test_y, y_pred))
    print('Root Mean Squared Error:',
          np.sqrt(mean_squared_error(test_y, y_pred)))

    ## Saving the model as a pickle in a file
    joblib.dump(er, 'model.pkl')
def _get_base_ensembler(self, models):
    # @TODO Might want to reflect choice of ensemble / model n_jobs here?
    # When wrapping in an ensemble, parallelism belongs to the ensemble
    # itself, so force n_jobs=1 on each individual model.
    for model in models:
        try:
            model[1].n_jobs = 1
        except AttributeError:
            pass
        # Ensemble-of-ensembles case: the member itself holds sub-estimators
        if hasattr(model[1], 'estimators'):
            for estimator in model[1].estimators:
                try:
                    estimator.n_jobs = 1
                except AttributeError:
                    pass
    if self.spec['problem_type'] == 'regression':
        return VotingRegressor(models, n_jobs=self.spec['n_jobs'])
    return VotingClassifier(models, voting='soft',
                            n_jobs=self.spec['n_jobs'])
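# A sketch of the parallelism trade-off handled above, with stock scikit-learn
# pieces (all names here are illustrative): each member runs single-threaded
# while the VotingRegressor parallelises across members.
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression

members = [('rf', RandomForestRegressor(n_jobs=1)), ('lr', LinearRegression())]
ensemble = VotingRegressor(members, n_jobs=2)  # two members fit in parallel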
def generate_ensemble_regressor(models_to_combine, X_train, y_train, X_test,
                                y_test):
    print(' - Generating ensemble model')
    ensemble = VotingRegressor(estimators=models_to_combine)
    cv = KFold(n_splits=5)
    results = cross_validate(ensemble, X_train, y_train, cv=cv,
                             return_estimator=True, scoring='r2')
    # Keep the fitted fold estimator with the best validation R^2.
    best_model = None
    best_scorer = 0
    for m, s in zip(results['estimator'], results['test_score']):
        if best_model is None or best_scorer < s:
            best_scorer = s
            best_model = m
    y_pred = best_model.predict(X_test)
    print(' - Cross-validation results:')
    print(' - r2:', np.max(results['test_score']))
    print(" - Test set results:")
    print(" - r2:", metrics.r2_score(y_test, y_pred))
    return best_model
def train(self):
    self.gripperjack = self.gripperjack[0]
    self.location = self.location[0]
    generator = pg.generator_factory(self.type)
    self.df: pd.DataFrame = generator.generate(self.gripperjack,
                                               self.location, 1)
    print(self.df.columns)
    self.df = self.df.drop(columns=['Timestamp']).dropna()
    print('DATAFRAME IS LOADED IN')
    y = self.df.pop('next')
    x = self.df
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        shuffle=True)
    r = [('K Neighbour Regressor',
          KNeighborsRegressor(n_neighbors=15, n_jobs=5, leaf_size=50)),
         ('Random Forest Regressor',
          RandomForestRegressor(n_estimators=200, n_jobs=5)),
         ('Ada Regressor',
          AdaBoostRegressor(n_estimators=100, learning_rate=0.1))]
    regressor = VotingRegressor(r, weights=[0.1, 1, 0.1])
    regressor.fit(x_train, y_train)
    print('===================')
    print('SCORE X/Y TEST')
    print(regressor.score(x_test, y_test))
    dump_location = ('Recources\\regressor_dumps\\' + self.type + '\\' +
                     str(self.gripperjack) + '\\' + self.location)
    print('==================')
    print('ACCURACY')
    y_pred = regressor.predict(x_test)
    mae = metrics.mean_absolute_error(y_test, y_pred)
    # Note: this is the MAE normalised by the target range (in percent),
    # not a true mean absolute percentage error.
    mape = (mae / (y.max() - y.min())) * 100
    print('MAE')
    print(mae)
    print('MAPE')
    print(mape)
    if not os.path.exists(dump_location):
        os.makedirs(dump_location)
    pickle.dump(regressor, open(dump_location + '\\regressor.sav', 'wb'))
    return mape
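# For a true MAPE, scikit-learn (>= 0.24) ships the metric directly; a tiny
# sketch with illustrative arrays, separate from the method above.
from sklearn.metrics import mean_absolute_percentage_error

y_true = [100.0, 200.0, 300.0]
y_est = [110.0, 220.0, 330.0]
print(mean_absolute_percentage_error(y_true, y_est))  # 0.1, i.e. 10%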
def train(features: List[str]):
    in_cols = [
        "climate_vs",
        "climate_def",
        "climate_vap",
        "climate_aet",
        "precipitation",
        "landcover_5",
    ]
    target_col = "burn_area"
    date_split = "2013-01-01"
    train_all = get_training_dataset()
    train_ = train_all.loc[train_all.date < date_split]
    valid_ = train_all.loc[train_all.date > date_split]
    X_train, y_train = train_[in_cols], train_[target_col]
    X_valid, y_valid = valid_[in_cols], valid_[target_col]
    xgb_model = xgb.XGBRegressor(n_estimators=300, max_depth=3,
                                 colsample_bytree=0.5,
                                 objective='reg:squarederror')
    xgb_model.fit(X_train, y_train)
    # cat_model = CatBoostRegressor(iterations=300, depth=5,
    #                               learning_rate=0.1, loss_function='RMSE')
    # cat_model.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=True)
    lgb_model = lgb.LGBMRegressor(n_estimators=100, max_depth=8, num_leaves=6,
                                  objective="regression")
    lgb_model.fit(X_train, y_train)
    # voting_regressor = VotingRegressor([('xgb', xgb_model),
    #                                     ('cat', cat_model),
    #                                     ('lgb', lgb_model)])
    voting_regressor = VotingRegressor([('xgb', xgb_model),
                                        ('lgb', lgb_model)])
    voting_regressor.fit(X_train, y_train)
    return voting_regressor
def vote_prediction(X_train, X_test, y_train, y_test, alpha, l1_ratio,
                    n_estimators, max_depth, c, gamma):
    # def vote_prediction(X_train, X_test, y_train, y_test, forest, svr):
    print("******************* VOTING ******************", end="\n\n")
    # forest = RandomForestRegressor(n_estimators=242, max_depth=5)
    # elasic_net = ElasticNet(alpha=0.141, l1_ratio=1.0)
    forest = RandomForestRegressor(n_estimators=n_estimators,
                                   max_depth=max_depth)
    # elasic_net = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    # linear_regressor = LinearRegression()
    svr = SVR(kernel='rbf', C=c, gamma=gamma)
    voting_regressor = VotingRegressor(estimators=[
        ('rf', forest),
        # ('enet', elasic_net),
        # ('lr', linear_regressor),
        ('svr', svr)
    ])
    voting_regressor = voting_regressor.fit(X_train, y_train)
    y_pred = voting_regressor.predict(X_test)
    evaluate('Voting', y_test, y_pred, write_predictions=True)
    print("\n*********************************************", end="\n\n")
def ensemble_lgb_regressor(self):
    try:
        root_dir = ('/Users/lujingze/Programming/SWFusion/'
                    'regression/tc/lightgbm/model/')
        model_dir = {
            'SG-FL': (f'{root_dir}na_101.845662_fl_smogn_'
                      f'final_threshold_square_2/'),
            'MSE': f'{root_dir}na_2.188733/',
        }
        er_name = ''
        estimators = []
        for name, out_dir in model_dir.items():
            er_name += f'{name}_'
            save_file = [f for f in os.listdir(out_dir)
                         if f.endswith('.pkl')
                         and f.startswith(f'{self.basin}')]
            if len(save_file) != 1:
                self.logger.error('Count of Bunch is not ONE')
                exit(1)
            with open(f'{out_dir}{save_file[0]}', 'rb') as f:
                best_result = pickle.load(f)
            estimators.append((name, best_result.model))
        er_name = er_name[:-1]
        er = VotingRegressor(estimators)
        er.fit(self.X_train, self.y_train)
        er_dir = f'{root_dir}{er_name}/'
        os.makedirs(er_dir, exist_ok=True)
        y_pred = er.predict(self.X_test)
        # predict() returns an ndarray, which has no to_pickle();
        # persist it with pickle instead.
        with open(f'{er_dir}y_pred.pkl', 'wb') as f:
            pickle.dump(y_pred, f)
    except Exception as msg:
        breakpoint()
        exit(msg)
def voting_regressor(self):
    estimators_num = 10
    regs = {
        'GBR': GradientBoostingRegressor(
            random_state=1, n_estimators=estimators_num),
        'RF': RandomForestRegressor(
            random_state=1, n_estimators=estimators_num, n_jobs=-1),
        'LR': LinearRegression(),
    }
    ereg_estimators = []
    ereg_name = ''
    for name, reg in regs.items():
        ereg_estimators.append((name, reg))
        ereg_name += f'{name}_'
    ereg = VotingRegressor(estimators=ereg_estimators, n_jobs=-1)
    ereg.fit(self.X_train, self.y_train)
    y_pred = ereg.predict(self.X_test)
    root_dir = ('/Users/lujingze/Programming/SWFusion/'
                'regression/tc/lightgbm/model/')
    ereg_dir = f'{root_dir}{ereg_name[:-1]}/'
    os.makedirs(ereg_dir, exist_ok=True)
    dump(ereg, f'{ereg_dir}voting_model.joblib')
    with open(f'{ereg_dir}test_pred.pkl', 'wb') as f:
        pickle.dump(y_pred, f)
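# Companion sketch: reloading the artifacts the method above writes. The
# directory below is an illustrative placeholder; point it at the ereg_dir
# actually used.
import pickle
from joblib import load

ereg_dir = '/path/to/GBR_RF_LR/'  # illustrative placeholder
ereg = load(f'{ereg_dir}voting_model.joblib')
with open(f'{ereg_dir}test_pred.pkl', 'rb') as f:
    test_pred = pickle.load(f)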
def run_regressors():
    pyplot.plot(y_test, label='Actual')
    pyplot.legend()
    pyplot.xlabel('Time')
    pyplot.ylabel('USD/TRY')
    pyplot.show()

    # Voting Regressor
    reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10)
    reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
    reg3 = LinearRegression()
    model = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2),
                                        ('lr', reg3)])
    model = model.fit(normalized_train_x, numpy.ravel(y_train))
    train_predict, test_predict = make_predictions(model, normalized_train_x,
                                                   normalized_test_x)
    score_regressions('Voting Regressor', y_train, train_predict, y_test,
                      test_predict)
    # score_classifications('Voting Regressor', y_train, train_predict,
    #                       y_test, test_predict)
    # plot_graph(y_test, test_predict, 'Voting Regressor')
    voting = test_predict

    xgb = execute_model('Extreme Gradient Boost Regressor', {}, XGBRegressor)
    linearRegression = execute_model('Linear Regression Regressor',
                                     linearRegressionParameters,
                                     LinearRegression)
    ridge = execute_model('Ridge Regressor', ridgeParameters, Ridge)
    bayesianRidge = execute_model('Bayesian Ridge Regressor',
                                  bayesianRidgeParameters, BayesianRidge)
    lasso = execute_model('Lasso Regressor', lassoParameters, Lasso)
    lassoLars = execute_model('Lasso Lars Regressor', lassoLarsParameters,
                              LassoLars)
    tweedie = execute_model('Tweedie Regressor', tweedieParameters,
                            TweedieRegressor)
    svr = execute_model('SVR Regressor', svrParameters, SVR)
    sgd = execute_model('SGD Regressor', sgdParameters, SGDRegressor)
    kNeighbors = execute_model('K Neighbors Regressor', kNeighborsParameters,
                               KNeighborsRegressor)
    gaussian = execute_model('Gaussian Process Regressor',
                             gaussianProcessorParameters,
                             GaussianProcessRegressor)
    mlp = execute_model('MLP Regressor ( FeedForward ANN )', mlpParameters,
                        MLPRegressor)
def main():
    print(__doc__)
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import VotingRegressor
    from sklearn.linear_model import LinearRegression

    test_paths_file = "../test_paths_1.npy"
    val_paths_file = "../val_paths_1.npy"
    test_paths = np.load(test_paths_file, mmap_mode="r")
    val_paths = np.load(val_paths_file, mmap_mode="r")

    X, y = load_diabetes(return_X_y=True)
    # Train the individual regressors
    reg1 = GradientBoostingRegressor(random_state=1)
    reg2 = RandomForestRegressor(random_state=1)
    reg3 = LinearRegression()
    reg1.fit(X, y)
    reg2.fit(X, y)
    reg3.fit(X, y)
    ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])
    ereg.fit(X, y)
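# Hedged continuation in the spirit of the scikit-learn voting-regressor
# example that main() mirrors: plot each member's predictions against the
# ensemble average for the first 20 samples. Self-contained sketch.
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.ensemble import (GradientBoostingRegressor,
                              RandomForestRegressor, VotingRegressor)
from sklearn.linear_model import LinearRegression

X, y = load_diabetes(return_X_y=True)
reg1 = GradientBoostingRegressor(random_state=1).fit(X, y)
reg2 = RandomForestRegressor(random_state=1).fit(X, y)
reg3 = LinearRegression().fit(X, y)
ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)]).fit(X, y)

xt = X[:20]
plt.figure()
plt.plot(reg1.predict(xt), 'gd', label='GradientBoostingRegressor')
plt.plot(reg2.predict(xt), 'b^', label='RandomForestRegressor')
plt.plot(reg3.predict(xt), 'ys', label='LinearRegression')
plt.plot(ereg.predict(xt), 'r*', ms=10, label='VotingRegressor')
plt.legend(loc='best')
plt.ylabel('predicted')
plt.xlabel('training samples')
plt.title('Regressor predictions and their average')
plt.show()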
def full_train():
    """
    Function to train the model on all of the available data.
    The trained model is saved as a pickle file.

    Returns
    -------
    Nothing is directly returned. The function saves the model in a pickle
    file for later usage in predictions.
    """
    data = df_prep_split()  # call once instead of twice
    X, y = data[0], data[1]
    ridge_reg = Ridge()
    forest_reg = RandomForestRegressor(
        max_features=8, n_estimators=100, n_jobs=-1
    )  # downscaled n_estimators from 500 due to memory issues on server
    boost_reg = GradientBoostingRegressor()
    ensemble_reg = VotingRegressor(estimators=[("ridge", ridge_reg),
                                               ("RF", forest_reg),
                                               ("GB", boost_reg)],
                                   n_jobs=-1)
    ensemble_reg.fit(X, y)
    PATH = os.environ.get(
        'HOME') + "/app/model.pickle"  # CHANGE PATH TO SERVER DIR
    return pickle.dump(ensemble_reg, open(PATH, "wb"))
def voting_predictions(data, base_models, val=True):
    data = data_copy(data)
    Xtrain, Xtest, y = data
    index = 0
    vote_params = []
    for base_model in base_models:
        name = 'model' + str(index)
        index += 1
        model = base_model[0]
        params = base_model[1]
        model = model(**params)
        result = (name, model)
        vote_params.append(result)
    votemodel = VotingRegressor(vote_params)
    votemodel.fit(Xtrain, y)
    y_pred = votemodel.predict(Xtest)
    # invert the log-scale predictions exactly once
    y_pred = np.exp(y_pred)
    if val:
        k_fold_crossval(data, model=votemodel)
    return y_pred
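# If the log transform is part of the modelling intent, scikit-learn's
# TransformedTargetRegressor keeps the exp/log bookkeeping out of the caller;
# a sketch with synthetic data, independent of the helper above.
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
y = np.abs(y) + 1  # keep the target positive so the log is defined
model = TransformedTargetRegressor(
    regressor=VotingRegressor([('rf', RandomForestRegressor(random_state=0)),
                               ('lr', LinearRegression())]),
    func=np.log,
    inverse_func=np.exp)
model.fit(X, y)
pred = model.predict(X)  # already back on the original scale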
def voting_regressor(X, y):
    regressors = [
        MyDummyRegressor(config=1, random_state=0).fit(X, y)
        for _ in range(5)
    ]
    # Bypass fit(): hand the already-fitted estimators to the ensemble
    # directly via the fitted attribute.
    vr = VotingRegressor(estimators=None)
    vr.estimators_ = regressors
    return vr
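# The same pre-fitted trick with stock scikit-learn estimators (a sketch;
# MyDummyRegressor above is project-specific). Because estimators_ is set by
# hand, predict() works without ever calling fit() on the ensemble.
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import VotingRegressor

X = np.arange(10).reshape(-1, 1)
y = np.arange(10, dtype=float)
prefit = [DummyRegressor(strategy='mean').fit(X, y) for _ in range(3)]
vr = VotingRegressor(estimators=None)
vr.estimators_ = prefit
print(vr.predict(X))  # the mean of y, repeated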
def voting_regressor_ensemble_3(self):
    lr, lr_pred = self.linear_regr()
    rf, rf_pred = self.random_forest_regr()
    er = VotingRegressor([
        ('lr', lr),
        ('rf', rf),
    ], n_jobs=-1)
    return er.fit(self.x_train, self.y_train).predict(self.x_test)
def get_voting(self):
    if self.case == 'classifier':
        ensemble = VotingClassifier(estimators=self.models,
                                    voting=self.voting,
                                    weights=self.weights)
    else:
        ensemble = VotingRegressor(estimators=self.models,
                                   weights=self.weights)
    return ensemble
def test_voting_regression(self):
    model = VotingRegressor([('lr', LinearRegression()),
                             ('dt', DecisionTreeRegressor())])
    model, _ = fit_regression_model(model)
    names = list(enumerate_model_names(model))
    assert len(names) == 3
    assert [_[0] for _ in names] == ['', 'lr', 'dt']
    assert all(map(lambda x: isinstance(x, tuple), names))
    assert all(map(lambda x: len(x) == 2, names))
def create_model_sec2():
    # Preprocessing (without fit):
    text_col = 'tit_org_ementa_text'
    colunas_relevantes = ['tipo_edicao'] + [text_col]
    stopwords = ['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um',
                 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por',
                 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à',
                 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu',
                 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela',
                 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem',
                 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem',
                 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas',
                 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas',
                 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus',
                 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa',
                 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes',
                 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto',
                 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive',
                 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos',
                 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos',
                 'estejam', 'estivesse', 'estivéssemos', 'estivessem',
                 'estiver', 'estivermos', 'estiverem', 'hei', 'há',
                 'havemos', 'hão', 'houve', 'houvemos', 'houveram',
                 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam',
                 'houvesse', 'houvéssemos', 'houvessem', 'houver',
                 'houvermos', 'houverem', 'houverei', 'houverá',
                 'houveremos', 'houverão', 'houveria', 'houveríamos',
                 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram',
                 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja',
                 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for',
                 'formos', 'forem', 'serei', 'será', 'seremos', 'serão',
                 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos',
                 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve',
                 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha',
                 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem',
                 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos',
                 'terão', 'teria', 'teríamos', 'teriam']
    dou_extractor = PreprocessDOU(colunas_relevantes, ' xxnuloxx ')
    proc_text = PreProcessText(cargo_tokens=True, lowercase=True,
                               remove_punctuation=True, keep_cash=True,
                               stopwords=stopwords, stemmer=None,
                               strip_accents=False, only_letters=False,
                               text_cols=[text_col])

    # Fit processing and model:
    keywords = ['xxdasdoisxx', 'xxdastresxx', 'xxdasumxx', 'xxfcpedoisxx',
                'xxfcpetresxx', 'xxfcpeumxx', 'substituto', 'substituta',
                'substituir', 'substituto eventual',
                'substituta eventual']  # 0.925
    anti_keywords = ['cargo', 'ambiente', 'comissão', 'comissionada',
                     'educação', 'gabinete', 'meio', 'pessoa', 'Tecnologia',
                     'Tecnologia da', 'da Informação', 'Pessoa com',
                     'Igualdade', 'geral', 'cargo de', 'regional',
                     'comissão de', 'comissionada de', 'comissionado',
                     'comissionado de', 'eventual']
    keywords_df = pd.DataFrame({text_col: keywords})
    anti_keywords_df = pd.DataFrame({text_col: anti_keywords})
    proc_keywords = list(proc_text.transform(keywords_df)[text_col].values)
    proc_anti_keywords = list(
        proc_text.transform(anti_keywords_df)[text_col].values)
    vectorizer = WeightedVectorizer(lowercase=False, binary=True,
                                    ngram_range=(1, 2), max_df=0.2, min_df=1,
                                    keywords=proc_keywords,
                                    anti_keywords=proc_anti_keywords,
                                    keywords_weight=10)
    encoder_extra = OneHotEncoder(drop='first')
    processor = ColumnTransformer([('vec', vectorizer, text_col),
                                   ('extra', encoder_extra, ['tipo_edicao'])])
    # classifier = Ridge(20)
    classifier = VotingRegressor([
        ('ridge', Ridge(80)),
        ('svr', SVR(C=30)),
        ('forest', RandomForestRegressor(max_depth=6, min_samples_split=2,
                                         n_estimators=11))
    ])
    pipeline = Pipeline([('dou', dou_extractor), ('pretext', proc_text),
                         ('proc', processor), ('fit', classifier)])
    return pipeline
def voting_regressor_ensemble_1(self):
    lr, lr_pred = self.linear_regr()
    lasso, lasso_pred = self.lasso_regr()
    rf, rf_pred = self.random_forest_regr()
    er = VotingRegressor([
        ('lr', lr),
        ('lasso', lasso),
        ('rf', rf)
    ], n_jobs=-1)
    return er.fit(self.x_train, self.y_train).predict(self.x_test)
def run_ensemble_run(self, model_name='Ensemble'):
    reg1 = SVR(C=10, kernel="rbf", epsilon=0.1, gamma='auto')
    reg2 = KNeighborsRegressor(n_neighbors=11)
    reg3 = RandomForestRegressor(n_estimators=100)
    # Note: only the random forest is actually ensembled here; reg1 and reg2
    # are built but never passed to the VotingRegressor.
    model = VotingRegressor([('RF', reg3)])
    model.fit(self.X_train, self.Y_train)
    self.evaluate_regression(self.Y_train, model.predict(self.X_train),
                             self.dates_train, model_name + '-OnTrain',
                             slicer=1)
    self.evaluate_regression(self.Y_test, model.predict(self.X_test),
                             self.dates_test, model_name + '-OnTest',
                             slicer=1)
def make_voting_regressor(y, x_vars):
    estimator_list = [
        ("mlp", MLPRegressor(random_state=1, max_iter=250)),
        ("random_forest", RandomForestRegressor(n_jobs=1)),
        ("nearest_neighbor", KNeighborsRegressor(n_neighbors=4)),
        ("decision_tree", DecisionTreeRegressor(random_state=0)),
        ("gradient_boost", GradientBoostingRegressor(random_state=0))
    ]
    ereg = VotingRegressor(estimators=estimator_list)
    ereg.fit(x_vars, y)
    return ereg, {}
def __init__(self, config, train_values, train_labels, test_values, logger):
    super().__init__(config, train_values, train_labels, test_values, logger)
    self.model = VotingRegressor([
        ('svr', SVR(kernel='rbf', gamma=0.1)),
        ('krr', KernelRidge(kernel='rbf', gamma=0.1)),
        ('ada', AdaBoostRegressor()),
        ('rf', RandomForestRegressor()),
        ('et', ExtraTreesRegressor())
    ])
def create_prediction_pipeline(self) -> Pipeline:
    feature_engineering = FeatureEngineering(**self.feat_eng_parameters)
    x_boost = XGBRegressor(**self.algo_hyperparams["x_boost"])
    rf = RandomForestRegressor(**self.algo_hyperparams["rf"])
    vr = VotingRegressor([("x_boost", x_boost), ("rf", rf)])
    return Pipeline(
        steps=[
            ("feature_engineering", feature_engineering),
            ("voting_regressor", vr),
        ]
    )
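# Self-contained sketch of the same composition with stock scikit-learn
# pieces (StandardScaler stands in for the project's FeatureEngineering step):
# a VotingRegressor works as the final estimator of a Pipeline.
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=100, n_features=4, random_state=0)
pipe = Pipeline(steps=[
    ('features', StandardScaler()),
    ('voting_regressor', VotingRegressor([
        ('rf', RandomForestRegressor(random_state=0)),
        ('lr', LinearRegression()),
    ])),
])
pipe.fit(X, y)
print(pipe.predict(X[:3]))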