def test_huber_equals_lr_for_high_epsilon():
    # Check that an unpenalized HuberRegressor with a very large epsilon
    # gives (approximately) the same fit as LinearRegression.
    X, y = make_regression_with_outliers()

    ols = LinearRegression(fit_intercept=True)
    ols.fit(X, y)

    robust = HuberRegressor(fit_intercept=True, epsilon=1e3, alpha=0.0)
    robust.fit(X, y)

    # Coefficients are compared to 3 decimals, the intercept to 2.
    assert_almost_equal(robust.coef_, ols.coef_, 3)
    assert_almost_equal(robust.intercept_, ols.intercept_, 2)
def test_huber_warm_start():
    # Refit a warm-started model on the same data: the coefficients should
    # barely move and no additional iteration should be reported.
    X, y = make_regression_with_outliers()
    model = HuberRegressor(
        fit_intercept=True, alpha=1.0, max_iter=10000, warm_start=True,
        tol=1e-1)

    model.fit(X, y)
    coef_after_first_fit = model.coef_.copy()
    model.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so
    # these would be almost same but not equal.
    assert_array_almost_equal(model.coef_, coef_after_first_fit, 1)

    assert model.n_iter_ == 0
def test_huber_warm_start():
    # Refit a warm-started model on the same data: the coefficients should
    # barely move between the two fits.
    X, y = make_regression_with_outliers()
    model = HuberRegressor(
        fit_intercept=True, alpha=1.0, max_iter=10000, warm_start=True,
        tol=1e-1)

    model.fit(X, y)
    coef_after_first_fit = model.coef_.copy()
    model.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so
    # these would be almost same but not equal.
    assert_array_almost_equal(model.coef_, coef_after_first_fit, 1)

    # No n_iter_ in old SciPy (<=0.9)
    # And as said above, the first iteration seems to be run anyway.
    n_iter = model.n_iter_
    if n_iter is not None:
        assert_equal(1, n_iter)
def test_huber_and_sgd_same_results():
    # HuberRegressor and SGDRegressor(loss="huber") should converge to about
    # the same coefficients when configured with the same parameters.
    X, y = make_regression_with_outliers(n_samples=10, n_features=2)

    # A first fit finds the scale parameter. Dividing X and y by it means
    # the second fit should optimize the scale to 1.0.
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100,
                           epsilon=1.35)
    huber.fit(X, y)
    first_scale = huber.scale_
    X_scale, y_scale = X / first_scale, y / first_scale

    huber.fit(X_scale, y_scale)
    assert_almost_equal(huber.scale_, 1.0, 3)

    sgd = SGDRegressor(
        alpha=0.0, loss="huber", shuffle=True, random_state=0,
        max_iter=10000, fit_intercept=False, epsilon=1.35, tol=None)
    sgd.fit(X_scale, y_scale)

    assert_array_almost_equal(huber.coef_, sgd.coef_, 1)
def test_huber_better_r2_score():
    # Test that huber scores better than ridge on the non-outliers and worse
    # than ridge on the outliers.
    X, y = make_regression_with_outliers()

    huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=100)
    huber.fit(X, y)

    # Samples whose absolute residual stays below epsilon * scale_ are the
    # ones treated as inliers here.
    residuals = np.dot(X, huber.coef_) + huber.intercept_ - y
    inliers = np.abs(residuals) < huber.epsilon * huber.scale_
    outliers = ~inliers

    huber_inlier_r2 = huber.score(X[inliers], y[inliers])
    huber_outlier_r2 = huber.score(X[outliers], y[outliers])

    # The Ridge regressor should be influenced by the outliers and hence
    # give a worse score on the non-outliers as compared to the huber
    # regressor.
    ridge = Ridge(fit_intercept=True, alpha=0.01)
    ridge.fit(X, y)
    ridge_inlier_r2 = ridge.score(X[inliers], y[inliers])
    ridge_outlier_r2 = ridge.score(X[outliers], y[outliers])

    assert_greater(huber_inlier_r2, ridge_inlier_r2)

    # The huber model should also fit poorly on the outliers.
    assert_greater(ridge_outlier_r2, huber_outlier_r2)
def test_huber_sparse():
    # Fitting on a CSR matrix must give the same coefficients as fitting on
    # the equivalent dense array.
    X, y = make_regression_with_outliers()

    dense_model = HuberRegressor(fit_intercept=True, alpha=0.1)
    dense_model.fit(X, y)

    sparse_model = HuberRegressor(fit_intercept=True, alpha=0.1)
    sparse_model.fit(sparse.csr_matrix(X), y)

    assert_array_almost_equal(sparse_model.coef_, dense_model.coef_)
def get_outliers_by_huber(self, table, column_indexes):
    '''
    Detect outlier rows with a Huber regression of the last selected column
    on the other selected columns.

    Original author's note: Huber outperforms RANSAC but doesn't scale well
    when the number of samples is very large; it reached perfect precision
    (100%) and recall (100%) in their experiments.

    Parameters:
        table: 2-D array supporting ``table[:, cols]`` and ``.astype``.
        column_indexes: column positions; all but the last are regressors,
            the last one is the target.

    Returns:
        (outliers, confidences): ``outliers`` is the list of row indices
        flagged by ``HuberRegressor.outliers_``; ``confidences`` holds one
        value per outlier, the min-max scaled absolute residual mapped into
        [0.9, 0.99]. Both are empty when no row is flagged.
    '''
    X = table[:, column_indexes[:-1]].astype(float)
    X = utils.enforce_columns(X)
    y = table[:, column_indexes[-1]].astype(float)

    # Inputs are deliberately NOT min-max scaled: preprocessing made Huber
    # fail on some datasets in the original author's experiments.
    model_huber = HuberRegressor()
    model_huber.fit(X, y)

    outliers = [idx for idx, val in enumerate(model_huber.outliers_) if val]
    residuals = abs(model_huber.predict(X) - y)
    outlier_residuals = residuals[outliers]

    # minmax_scale rejects an empty array, so return early when nothing was
    # flagged instead of letting it raise.
    if not outliers:
        return (outliers, outlier_residuals)

    confidences = preprocessing.minmax_scale(outlier_residuals) * 0.09 + 0.9
    return (outliers, confidences)
def test_huber_scaling_invariant():
    # Test that outliers filtering is scaling independent.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100)

    huber.fit(X, y)
    reference_mask = huber.outliers_
    # The reference fit must not flag every single sample.
    assert_false(np.all(reference_mask))

    # Rescaling y alone, or X and y together, must flag the same samples.
    for X_scaled, y_scaled in ((X, 2. * y), (2. * X, 2. * y)):
        huber.fit(X_scaled, y_scaled)
        assert_array_equal(huber.outliers_, reference_mask)
def test_huber_scaling_invariant():
    """Test that outliers filtering is scaling independent."""
    # The unused local RandomState the original created has been removed.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100,
                           epsilon=1.35)

    huber.fit(X, y)
    n_outliers_mask_1 = huber.outliers_

    # Rescale the target only.
    huber.fit(X, 2. * y)
    n_outliers_mask_2 = huber.outliers_

    # Rescale features and target together.
    huber.fit(2. * X, 2. * y)
    n_outliers_mask_3 = huber.outliers_

    assert_array_equal(n_outliers_mask_2, n_outliers_mask_1)
    assert_array_equal(n_outliers_mask_3, n_outliers_mask_1)
def test_huber_sample_weights():
    # Test sample_weights implementation in HuberRegressor
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=True, alpha=0.1)

    # Unit weights must reproduce the unweighted fit.
    huber.fit(X, y)
    expected_coef = huber.coef_
    expected_intercept = huber.intercept_
    huber.fit(X, y, sample_weight=np.ones(y.shape[0]))
    assert_array_almost_equal(huber.coef_, expected_coef)
    assert_array_almost_equal(huber.intercept_, expected_intercept)

    # Integer weights must match physically repeating the samples: rows 1
    # and 3 are duplicated so that they appear 3 and 2 times respectively.
    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
    repeated = [1, 1, 3]
    X_new = np.vstack((X, X[repeated]))
    y_new = np.concatenate((y, y[repeated]))
    huber.fit(X_new, y_new)
    expected_coef = huber.coef_
    expected_intercept = huber.intercept_

    weights = [1, 3, 1, 2, 1]
    huber.fit(X, y, sample_weight=weights)
    assert_array_almost_equal(huber.coef_, expected_coef, 3)
    assert_array_almost_equal(huber.intercept_, expected_intercept, 3)

    # Test sparse implementation with sample weights.
    huber_sparse = HuberRegressor(fit_intercept=True, alpha=0.1)
    huber_sparse.fit(sparse.csr_matrix(X), y, sample_weight=weights)
    assert_array_almost_equal(huber_sparse.coef_, expected_coef, 3)
Class1 = RANSACRegressor(random_state=42) Class1.fit(X_train, y_train) Class1_predictions = Class1.predict(X_test) Class1_accuracy = accuracy_score(y_true, Class1_predictions, normalize=True, sample_weight=None) Class2 = TheilSenRegressor(random_state=42) Class2.fit(X_train, y_train) Class2_predictions = Class1.predict(X_test) Class2_accuracy = accuracy_score(y_true, Class2_predictions, normalize=True, sample_weight=None) Class3 = LinearRegression() Class3.fit(X_train, y_train) Class3_predictions = Class3.predict(X_test) Class3_accuracy = accuracy_score(y_true, Class3_predictions, normalize=True, sample_weight=None) Class4 = HuberRegressor(alpha=0.0, epsilon=epsilon) Class4.fit(X_train, y_train) Class4_predictions = Class4.predict(X_test) Class4_accuracy = accuracy_score(y_true, Class4_predictions, normalize=True, sample_weight=None) #Print different accuracies print("First Accuracy: ", Class1_accuracy) print("Second Accuracy: ", Class2_accuracy) print("Third Accuracy: ", Class3_accuracy) print("Fourth Accuracy: ", Class4_accuracy) return
def _score_fold(label, model, valid_X, valid_index, price_valid_real, oof, scores):
    """Predict one validation fold, store the predictions and record its RMSLE."""
    preds = model.predict(valid_X)
    oof[valid_index] = preds
    # Predictions are mapped through expm1 before scoring, mirroring how
    # price_valid_real was derived from the validation targets.
    score = rmsle_func(np.expm1(preds), price_valid_real)
    print(label + " RMSLE: " + str(score))
    scores.append(score)


def trainCV(X, y, random, splits):
    """Cross-validate LightGBM, Ridge and Huber models on (X, y).

    For every KFold split the three models are fitted on the training part
    and scored with ``rmsle_func`` on the validation part, after applying
    ``np.expm1`` to both predictions and targets. The loop stops early as
    soon as ``is_stop()`` returns a truthy value.

    Parameters:
        X: feature matrix, indexable by integer index arrays.
        y: target vector, indexable by integer index arrays.
        random: unused; kept so existing callers keep working.
        splits: number of KFold splits.

    Returns:
        ``[y_lgbm, y_ridge, y_huber]``: out-of-fold predictions of each
        model, zero for rows whose fold was never evaluated.
    """
    kf = KFold(n_splits=splits)

    l_lgbm, l_ridge, l_huber = [], [], []
    y_lgbm = np.zeros(len(y))
    y_ridge = np.zeros(len(y))
    y_huber = np.zeros(len(y))

    for nFold, (train_index, valid_index) in enumerate(kf.split(X)):
        if is_stop():
            break
        print("FOLD# " + str(nFold))

        train_X, train_y = X[train_index], y[train_index]
        valid_X, valid_y = X[valid_index], y[valid_index]
        price_valid_real = np.expm1(valid_y)

        d_train = lgb.Dataset(train_X, label=train_y)
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]
        # Built per fold so every lgb.train call receives a fresh dict.
        params = {
            'learning_rate': 0.01,
            'application': 'regression',
            'num_leaves': 311,
            'verbosity': -1,
            'metric': 'RMSE',
            'data_random_seed': 1,
            'bagging_fraction': 0.6,
            'bagging_freq': 0,
            'nthread': 4,
            'max_bin': 255
        }
        model_lgbm = lgb.train(params, train_set=d_train, num_boost_round=810,
                               valid_sets=watchlist, verbose_eval=50,
                               early_stopping_rounds=400)
        _score_fold("LGBM", model_lgbm, valid_X, valid_index,
                    price_valid_real, y_lgbm, l_lgbm)

        model_ridge = Ridge(alpha=.05, copy_X=True, fit_intercept=True,
                            max_iter=50, normalize=False, random_state=101,
                            solver='auto', tol=0.001)
        model_ridge.fit(train_X, train_y)
        _score_fold("RIDGE", model_ridge, valid_X, valid_index,
                    price_valid_real, y_ridge, l_ridge)

        model_huber = HuberRegressor(fit_intercept=True, alpha=0.01,
                                     max_iter=58, epsilon=363)
        model_huber.fit(train_X, train_y)
        _score_fold("HUBER", model_huber, valid_X, valid_index,
                    price_valid_real, y_huber, l_huber)

    # Mean and spread of the per-fold scores for each model.
    for label, fold_scores in (("LGBM", l_lgbm), ("RIDGE", l_ridge), ("HUBER", l_huber)):
        summary = np.array(fold_scores)
        print(label + " RMSLE = " + str(summary.mean()) + " +/- " + str(summary.std()))

    return [y_lgbm, y_ridge, y_huber]
# Stacking: a HuberRegressor meta-model is fitted on the base models'
# out-of-fold predictions; test predictions are averaged over the folds.
train_stack = np.vstack([oof_lgb, oof_lgb1, oof_xgb, oof_cat]).T
test_stack = np.vstack(
    [predictions_lgb, predictions_lgb1, predictions_xgb, predictions_cat]).T

folds_stack = StratifiedKFold(n_splits=10, shuffle=True, random_state=8888)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y_train)):
    print("fold :", fold_ + 1)
    trn_data, val_data = train_stack[trn_idx], train_stack[val_idx]
    trn_y, val_y = y_train[trn_idx], y_train[val_idx]

    stacking = HuberRegressor(epsilon=1.03, alpha=1e-5)
    stacking.fit(trn_data, trn_y)

    oof_stack[val_idx] = stacking.predict(val_data)
    # Each fold contributes an equal share of the final test prediction.
    predictions += stacking.predict(test_stack) / folds_stack.n_splits

# The CV score is derived from the same MAE, so compute the MAE once.
stacking_mae = mean_absolute_error(oof_stack, y_train)
print("stacking MAE score: {:<8.8f}".format(stacking_mae))
print("stacking CV score: {:<8.8f}".format(1 / (stacking_mae + 1)))
print(predictions_lgb.mean(), predictions_lgb1.mean(), y_train.mean(),
      predictions.mean())

# Scores are written out rounded to whole numbers.
result['score'] = predictions
result['score'] = result['score'].round().map(int)
result.to_csv('../result/stacking.csv', index=None)
def test_huber_bool():
    # Fitting on boolean features must not crash.
    features, target = make_regression(
        n_samples=200, n_features=2, noise=4.0, random_state=0)
    boolean_features = features > 0
    HuberRegressor().fit(boolean_features, target)
def forecaster(returns, ff, loss='MSE'):
    """Produce rolling one-step-ahead forecasts from a factor regression.

    ``ff`` and ``returns`` are merged on their indexes, the ``'RF'`` column
    is subtracted from the return column, and for every row from position
    120 onwards a model is fitted on the preceding 120 rows and used to
    predict that row.

    Parameters:
        returns: DataFrame whose first column holds the return series.
        ff: DataFrame with the columns 'RF', 'Mkt.Rf', 'HML', 'Mom', 'RMW'
            and 'CMA'.
        loss: 'MSE' (LinearRegression), 'Ridge', 'Lasso', 'Hub'
            (HuberRegressor), or 'LAD' / '1Q' / '3Q' (QuantReg at q=0.5 /
            0.25 / 0.75). Any other value behaves like 'MSE'.

    Returns:
        ``(name, output, factorLoadings, varianceOfErrors)`` where ``name``
        is the return column name and the three lists hold one entry per
        forecast row. Windows whose target contains a null contribute
        ``nan``, ``np.zeros((1, 5))`` and ``nan`` respectively.
    """
    quantile_by_loss = {'LAD': 0.5, '1Q': 0.25, '3Q': 0.75}
    estimator_by_loss = {
        'MSE': LinearRegression,
        'Ridge': Ridge,
        'Lasso': Lasso,
        'Hub': HuberRegressor,
    }
    # Losses the original code treats as yielding 1-D predictions.
    flat_prediction_losses = ('Lasso', 'Hub')

    output = []
    factorLoadings = []
    varianceOfErrors = []

    df = ff.merge(returns, left_index=True, right_index=True)
    name = returns.columns.tolist()[0]
    df[name] = df[name] - df['RF']
    regressors = ['Mkt.Rf', 'HML', 'Mom', 'RMW', 'CMA']

    for j in range(120, len(df.index)):
        trainData = df.iloc[(j - 120):j, :]
        trainX = trainData[regressors]
        trainY = trainData[[name]]

        if trainY.isnull().values.any():
            output.append(np.nan)
            factorLoadings.append(np.zeros((1, 5)))
            varianceOfErrors.append(np.nan)
            continue

        testData = pd.DataFrame(df.iloc[j, :]).T
        testX = testData[regressors]
        # Flattened target: subtracting a 1-D prediction from the (n, 1)
        # target would broadcast to an (n, n) matrix and corrupt the
        # variance of the errors.
        flatY = np.ravel(np.array(trainY))

        if loss in quantile_by_loss:
            # Quantile regressions are fitted directly; the throw-away
            # LinearRegression fit the original ran first is skipped.
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=quantile_by_loss[loss])
            factorLoadings.append(np.array(res.params))
            fitted = np.ravel(model.predict(res.params, exog=trainX))
            varianceOfErrors.append(np.var(fitted - flatY))
            prediction = model.predict(res.params, exog=testX)
            output.append(prediction[0])
            continue

        model = estimator_by_loss.get(loss, LinearRegression)()
        model.fit(trainX, trainY)
        factorLoadings.append(model.coef_)

        if loss in flat_prediction_losses:
            fitted = np.ravel(model.predict(trainX))
            varianceOfErrors.append(np.var(flatY - fitted))
            output.append(model.predict(testX)[0])
        else:
            varianceOfErrors.append(
                np.var(trainY - model.predict(trainX)).tolist()[0])
            output.append(model.predict(testX)[0][0])

    return (name, output, factorLoadings, varianceOfErrors)
def regress(X_train, y_train):
    """Grid-search a collection of regressors and yield each fitted search.

    For every entry of ``classifiers`` a fraction of the joined training
    data is sampled, a ``GridSearchCV`` is built from the matching entry of
    ``params_dict`` and fitted, and ``(grid, name)`` is yielded. The module
    level settings ``global_data_scale``, ``random_state``, ``verbose``,
    ``folds`` and ``workers`` are read from the enclosing module.

    Parameters:
        X_train: training features (anything ``pd.DataFrame`` accepts).
        y_train: training target; after joining, the target column must be
            named ``"loan_status"``.

    Yields:
        ``(grid, name)``: the fitted ``GridSearchCV`` and the estimator's
        name as used in ``params_dict``.
    """
    # comment out any classifier that should not be used
    classifiers = [
        (SGDRegressor(), "SGDRegressor", 1 * global_data_scale),
        (LinearRegression(), "LinearRegression", 1 * global_data_scale),
        (Ridge(), "Ridge", 1 * global_data_scale),
        (Lasso(), "Lasso", 1 * global_data_scale),
        (ElasticNet(), "ElasticNet", 1 * global_data_scale),
        (Lars(), "Lars", 1 * global_data_scale),
        (OrthogonalMatchingPursuit(), "OrthogonalMatchingPursuit", 1 * global_data_scale),
        (BayesianRidge(), "BayesianRidge", 1 * global_data_scale),
        (ARDRegression(), "ARDRegression", 1 * global_data_scale),
        ### NOTE the scoring might be different of PassiveAggressiveRegressor
        (PassiveAggressiveRegressor(), "PassiveAggressiveRegressor", 1 * global_data_scale),
        ### NOTE the scoring might be different of RANSACRegressor
        (RANSACRegressor(), "RANSACRegressor", 1 * global_data_scale),
        (TheilSenRegressor(), "TheilSenRegressor", 1 * global_data_scale),
        (HuberRegressor(), "HuberRegressor", 1 * global_data_scale),
        (DecisionTreeRegressor(), "DecisionTreeRegressor", 1 * global_data_scale),
        (GaussianProcessRegressor(), "GaussianProcessRegressor", 1 * global_data_scale),
        (MLPRegressor(), "MLPRegressor", 1 * global_data_scale),
        (KNeighborsRegressor(), "KNeighborsRegressor", 1 * global_data_scale),
        (RadiusNeighborsRegressor(), "RadiusNeighborsRegressor", 1 * global_data_scale),
        (SVR(), "SVR", 1 * global_data_scale),
        (NuSVR(), "NuSVR", 1 * global_data_scale),
        (LinearSVR(), "LinearSVR", 1 * global_data_scale),
        # Fixed: the label was misspelled "KernalRidge", which made the
        # params_dict lookup below fail with KeyError.
        (KernelRidge(), "KernelRidge", 1 * global_data_scale),
        (IsotonicRegression(), "IsotonicRegression", 1 * global_data_scale)
    ]

    # set the list of the values that should be used in grid search
    params_dict = {
        "SGDRegressor": {
            "penalty": ["l2", "l1"],
            "alpha": [.001, .0001, .00001],
            "l1_ratio": [.15, .2, .25],
            "fit_intercept": [True, False],
            "max_iter": [1000],
            "shuffle": [True, False],
            "epsilon": [.05, .1, .2],
            "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
            "eta0": [.005, .01, .02],
            "power_t": [.2, .25, .3]
        },
        "LinearRegression": {
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "Ridge": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "tol": [.01, .001, .0001],
            "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
        },
        "Lasso": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "positive": [True, False],
            "precompute": [True, False]
        },
        "ElasticNet": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "positive": [True, False],
            "selection": ["cyclic", "random"]
        },
        "Lars": {
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "n_nonzero_coefs": [np.inf]
        },
        "OrthogonalMatchingPursuit": {
            "n_nonzero_coefs": [np.inf, None],
            "precompute": [True, False],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "BayesianRidge": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "ARDRegression": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "threshold_lambda": [1000, 10000, 100000],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "PassiveAggressiveRegressor": {
            "C": [.8, 1., 1.2],
            "tol": [1e-2, 1e-3, 1e-4],
            "n_iter_no_change": [3, 5, 8],
            "shuffle": [True, False],
            "average": [True, False]
        },
        "RANSACRegressor": {
            "base_estimator": [LinearRegression()]
        },
        "TheilSenRegressor": {
            "max_subpopulation": [1e3, 1e4, 1e5],
            "tol": [1e-2, 1e-3, 1e-4]
        },
        "HuberRegressor": {
            "epsilon": [1.1, 1.35, 1.5],
            "alpha": [1e-3, 1e-4, 1e-5],
            "warm_start": [True, False],
            "fit_intercept": [True, False],
            # Fixed: this key was the empty string, which is not a valid
            # estimator parameter; the values bracket HuberRegressor's tol.
            "tol": [1e-4, 1e-5, 1e-6]
        },
        "DecisionTreeRegressor": {
            "criterion": ["mse", "friedman_mse", "mae"],
            "splitter": ["best", "random"],
            "min_samples_split": [2, 3],
            "min_samples_leaf": [1, 2],
            "min_weight_fraction_leaf": [.0],
            "max_features": ["auto", "sqrt", "log2"],
            "min_impurity_split": [1e-6, 1e-7, 1e-8]
        },
        "GaussianProcessRegressor": {
            "alpha": [1e-8, 1e-10, 1e-12],
            "optimizer": ["fmin_l_bfgs_b"],
            "normalize_y": [True, False]
        },
        "MLPRegressor": {
            "hidden_layer_sizes": [(100,)],
            "activation": ["identity", "logistic", "tanh", "relu"],
            "solver": ["lbfgs", "sgd", "adam"],
            "alpha": [1e-3, 1e-4, 1e-5],
            # Further candidates, currently disabled:
            # "learning_rate": ["constant", "invscaling", "adaptive"],
            # "learning_rate_init": [1e-2, 1e-3, 1e-4],
            # "power_t": [.3, .5, .8],
            # "shuffle": [True, False],
            # "tol": [1e-3, 1e-4, 1e-5],
            # "momentum": [.8, .9, .99],
            # "beta_1": [.8, .9, .99],
            # "beta_2": [.999],
            # "epsilon": [1e-7, 1e-8, 1e-9],
            # "n_iter_no_change": [10],
            # "max_fun": [15000]
        },
        "KNeighborsRegressor": {
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "RadiusNeighborsRegressor": {
            # Fixed: "n_neighbors" was listed here, but
            # RadiusNeighborsRegressor has no such parameter, so the grid
            # search rejected the whole grid.
            "radius": [.8, 1, 1.2],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "SVR": {
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "tol": [1e-2, 1e-3, 1e-4],
            "C": [.8, .1, 1.2],
            "epsilon": [.08, .1, .12],
            "shrinking": [True, False],
            "max_iter": [-1]
        },
        "NuSVR": {
            "nu": [.2, .5, .8],
            "C": [.8, .1, 1.2],
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "shrinking": [True, False],
            "tol": [1e-2, 1e-3, 1e-4],
            "max_iter": [-1]
        },
        "LinearSVR": {
            "epsilon": [.0],
            "tol": [1e-3, 1e-4, 1e-5],
            "C": [.8, .1, 1.2],
            "fit_intercept": [True, False],
            "dual": [True, False],
            "intercept_scaling": [.8, 1., 1.2]
        },
        "KernelRidge": {
            "coef0": [.8, 1, 1.2],
            "degree": [2, 3, 5],
        },
        "IsotonicRegression": {
            "increasing": [True, False],
        }
    }

    # The joined frame does not depend on the estimator, so build it once.
    full = pd.DataFrame(X_train).join(pd.DataFrame(y_train))

    for model, params, frac in classifiers:
        loan_data = full.sample(frac=frac, random_state=random_state)
        X = loan_data.drop("loan_status", axis=1)
        y = loan_data["loan_status"]
        grid = GridSearchCV(model, params_dict[params], verbose=verbose,
                            cv=folds, n_jobs=workers)
        grid.fit(X, y)
        yield grid, params
def __init__(self, **hyperparams):
    """Remember the keyword hyperparameters and build the wrapped ``Op`` from them."""
    self._hyperparams = hyperparams
    wrapped = Op(**self._hyperparams)
    self._wrapped_model = wrapped
def Huber_regressor(features, labels):
    """Fit a default HuberRegressor and plot its in-sample predictions.

    The model is fitted on ``(features, labels)``, then asked to predict the
    same ``features``; labels and predictions are handed to ``AsGraph``.
    Nothing is returned.
    """
    from sklearn.linear_model import HuberRegressor

    regressor = HuberRegressor()
    regressor.fit(features, labels)
    AsGraph(labels, regressor.predict(features))
def _split_factor_subset(data, columns, p_t, p_v, p_test):
    """Restrict ``data`` to ``columns`` and return ``(Xt, yt, Xtest, ytest)``.

    The training and validation periods are split separately and then
    stacked into a single training set; the test period is returned as is.
    """
    subset = data[columns]
    _Xt, _yt = split(subset.loc(axis=0)[:, p_t[0]:p_t[1]])
    _Xv, _yv = split(subset.loc(axis=0)[:, p_v[0]:p_v[1]])
    Xtest, ytest = split(subset.loc(axis=0)[:, p_test[0]:p_test[1]])
    return np.vstack((_Xt, _Xv)), np.vstack((_yt, _yv)), Xtest, ytest


def runModel(data, config, retrain, runGPU, runNN, frequency, pre_dir):
    """Run one model family over an expanding-window backtest.

    For every period end in a frequency-specific date range the data is cut
    into training, validation and test periods, the model selected by the
    first truthy ``config`` flag (checked in the order of the ``if``/``elif``
    chain below) is fitted, and its predictions and R2 values are stored.

    Parameters:
        data: DataFrame with a 'Y' column; rows are selected with
            ``data.loc(axis=0)[:, start:end]``, i.e. the second index level
            is sliced by period strings.
        config: mapping of 'runXXX' flags choosing the model.
        retrain: when truthy, NN and tree models are trained from scratch;
            otherwise the load / fast variants are used.
        runGPU: forwarded to the NN train/load helpers.
        runNN: selects the neural-network path and the longer return value.
        frequency: 'M', 'Q' or 'Y'; anything else raises
            NotImplementedError.
        pre_dir: directory prefix forwarded to the save/load helpers.

    Returns:
        ``(model_name, bcktst_df, container)``, extended with
        ``(nn_valid_r2, nn_oos_r2, model_dir)`` when ``runNN`` is truthy.
    """
    container = {}
    save_model = partial(_save_model, pre_dir=pre_dir)
    save_year_res = partial(_save_year_res, pre_dir=pre_dir)
    if runNN:
        nn_valid_r2 = []
        nn_oos_r2 = []
    bcktst_df = data[['Y']].copy()

    ols3_columns = [
        'Factor46_mom12m', 'Factor07_beta', 'Factor51_mve', 'Factor09_bm',
        'Y'
    ]
    ols5_columns = [
        'Factor46_mom12m', 'Factor07_beta', 'Factor51_mve', 'Factor09_bm',
        'Factor76_roeq', 'Factor05_agr', 'Y'
    ]

    if frequency == 'M':
        date_range = pd.date_range('20131231', '20200831', freq='M')
    elif frequency == 'Q':
        date_range = pd.date_range('20131231', '20200630', freq='Q')
    elif frequency == 'Y':
        date_range = pd.date_range('20131231', '20181231', freq='Y')
    else:
        raise NotImplementedError()

    for year in tqdm(date_range):
        year = datetime.datetime.strftime(year, "%Y-%m")
        p_t = ['1900-01', str(year)]  # period of training
        # p_t = [sub_months(year, 48), str(year)]  # period of training
        if frequency == 'M':
            p_v = [add_months(year, 1), add_months(year, 3)]  # period of validation
            p_test = [add_months(year, 4), add_months(year, 4)]
        elif frequency == 'Q':
            p_v = [add_months(year, 1), add_months(year, 3)]  # period of validation
            p_test = [add_months(year, 4), add_months(year, 6)]
        elif frequency == 'Y':
            p_v = [add_months(year, 1), add_months(year, 12)]  # period of validation
            p_test = [add_months(year, 13), add_months(year, 24)]

        # Training and validation rows are shuffled (sample(frac=1)); the
        # test rows keep their order so they line up with test_df.index.
        _Xt, _yt = split(
            data.loc(axis=0)[:, p_t[0]:p_t[1]].sample(frac=1, random_state=0))
        _Xv, _yv = split(
            data.loc(axis=0)[:, p_v[0]:p_v[1]].sample(frac=1, random_state=0))
        test_df = data.loc(axis=0)[:, p_test[0]:p_test[1]]
        _Xtest, _ytest = split(test_df)

        #OLS
        if config['runOLS3']:
            model_name = "OLS3" + f" {frequency}"
            Xt, yt, Xtest, ytest = _split_factor_subset(
                data, ols3_columns, p_t, p_v, p_test)
            model_fit = LinearRegression().fit(Xt, yt.reshape(-1, ))

        elif config['runOLS3+H']:
            model_name = "OLS3+H" + f" {frequency}"
            Xt, yt, Xtest, ytest = _split_factor_subset(
                data, ols3_columns, p_t, p_v, p_test)
            model_fit = HuberRegressor(epsilon=3).fit(Xt, yt.reshape(-1, ))

        elif config['runOLS5']:
            model_name = "OLS5" + f" {frequency}"
            Xt, yt, Xtest, ytest = _split_factor_subset(
                data, ols5_columns, p_t, p_v, p_test)
            model_fit = LinearRegression().fit(Xt, yt.reshape(-1, ))

        elif config['runOLS5+H']:
            model_name = "OLS5+H" + f" {frequency}"
            Xt, yt, Xtest, ytest = _split_factor_subset(
                data, ols5_columns, p_t, p_v, p_test)
            model_fit = HuberRegressor(epsilon=3).fit(Xt, yt.reshape(-1, ))

        elif config['runOLS']:
            model_name = "OLS" + f" {frequency}"
            Xt = np.vstack((_Xt, _Xv))
            yt = np.vstack((_yt, _yv))
            Xtest, ytest = _Xtest, _ytest
            model_fit = LinearRegression(n_jobs=-1).fit(Xt, yt.reshape(-1, ))
            save_model(model_name, year, model_fit)

        elif config['runOLSH']:
            # OLS + H
            model_name = "OLSH" + f" {frequency}"
            Xt = np.vstack((_Xt, _Xv))
            yt = np.vstack((_yt, _yv))
            Xtest, ytest = _Xtest, _ytest
            model_fit = HuberRegressor().fit(Xt, yt.reshape(-1, ))

        elif config['runENET']:
            from sklearn.linear_model import ElasticNet
            model_name = "ENET" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest
            lambda_ = [0.1, 0.01, 0.001, 0.0001]
            params = [{'lambda': i} for i in lambda_]
            out_cv = []
            # Pick the penalty with the best validation R2.
            for p in tqdm(params):
                model_fit = ElasticNet(alpha=p['lambda'],
                                       l1_ratio=0.5,
                                       random_state=0)
                model_fit.fit(Xt, yt.reshape(-1, ))
                yv_hat = model_fit.predict(Xv).reshape(-1, 1)
                perfor = cal_r2(yv, yv_hat)
                out_cv.append(perfor)
                logger.info('params: ' + str(p) + '. CV r2-validation:' +
                            str(perfor))
            best_p = params[np.argmax(out_cv)]
            print("best p", best_p)
            # Fixed: best_p used to be passed as a logging argument to a
            # message without any placeholder, so it never reached the log.
            logger.info("%s %s %s best hyperparamer %s", model_name, year,
                        params, best_p)
            model_fit = ElasticNet(alpha=best_p['lambda'],
                                   l1_ratio=0.5,
                                   random_state=0)
            model_fit.fit(Xt, yt)
            ytest_hat = model_fit.predict(Xtest).reshape(-1, 1)
            best_perfor = cal_r2(ytest, ytest_hat)
            print(f"{model_name} oss r2:", best_perfor)
            save_model(model_name, year, model_fit)

        elif config['runPLS']:
            from sklearn.cross_decomposition import PLSRegression
            model_name = "PLS" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest
            maxk = min(30, Xt.shape[1])
            ks = np.arange(1, maxk, 2)
            params = [{'k': i} for i in ks]
            out_cv = []
            # Pick the number of components with the best validation R2.
            for p in tqdm(params):
                pls = PLSRegression(n_components=p['k'])
                model_fit = pls.fit(Xt, yt)
                yv_hat = model_fit.predict(Xv)
                perfor = cal_r2(yv, yv_hat)
                out_cv.append(perfor)
                print('params: ' + str(p) + '. CV r2-validation:' +
                      "{0:.3%}".format(perfor))
                logging.info('params: ' + str(p) + '. CV r2-validation:' +
                             "{0:.3%}".format(perfor))
            best_p = params[np.argmax(out_cv)]
            print("best hyper-parameter", best_p)
            pls = PLSRegression(n_components=best_p['k'])
            model_fit = pls.fit(Xt, yt)
            ytest_hat = model_fit.predict(Xtest)
            best_perfor = cal_r2(ytest, ytest_hat)
            print(f"{model_name} oss r2 in {year}:", best_perfor)

        elif config['runPCR']:
            model_name = "PCR" + f" {frequency}"
            pca_name = "PCA" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest
            # An earlier hand-rolled eigen-decomposition PCR (keeping the
            # largest components of Xt.T @ Xt) was replaced by sklearn's PCA
            # followed by LinearRegression.
            # hyper-parameter
            maxk = min(30, Xt.shape[1])
            ks = np.arange(1, maxk, 2)
            params = [{'k': i} for i in ks]
            out_cv = []
            for p in tqdm(params):
                pca = PCA(n_components=p['k'])
                X_reduced = pca.fit_transform(Xt)
                model_fit = LinearRegression()
                model_fit = model_fit.fit(X_reduced, yt)
                xv_r = pca.transform(Xv)
                yv_hat = model_fit.predict(xv_r)
                perfor = cal_r2(yv, yv_hat)
                out_cv.append(perfor)
                print('params: ' + str(p) + '. CV r2-validation:' +
                      "{0:.3%}".format(perfor))
                logging.info('params: ' + str(p) + '. CV r2-validation:' +
                             "{0:.3%}".format(perfor))
            best_p = params[np.argmax(out_cv)]
            print("best hyper-parameter", best_p)
            pca = PCA(n_components=best_p['k'])
            # Xt and Xtest are replaced by their reduced versions so the
            # shared "predict and save" step below feeds the model the
            # inputs it was fitted on.
            Xt = pca.fit_transform(Xt)
            model_fit = LinearRegression()
            model_fit = model_fit.fit(Xt, yt)
            Xtest = pca.transform(Xtest)
            ytest_hat = model_fit.predict(Xtest)
            best_perfor = cal_r2(ytest, ytest_hat)
            print(f"{model_name} oss r2 in {year}:", best_perfor)
            save_model(pca_name, year, pca)
            save_model(model_name, year, model_fit)

        elif runNN:
            import tensorflow as tf
            import tensorflow.keras as keras
            from keras.models import Sequential
            from keras.layers import Dense, LeakyReLU, BatchNormalization, Dropout
            from strategy_func import genNNmodel, _loss_fn
            # The first truthy runNN1..runNN6 flag selects the variant `i`.
            for nn_variant in range(1, 7):
                if config[f"runNN{nn_variant}"]:
                    i = nn_variant
                    break
            model_name = f"NN{i}" + f" {frequency}"
            nn_is_preds = []
            nn_valid_preds = []
            nn_oos_preds = []
            model_cntn = []
            # Five models per period, each on a differently shuffled copy of
            # the training and validation data.
            # NOTE(review): each model's validation rows are shuffled with a
            # different random_state, yet the predictions are later averaged
            # row-wise and scored against the last model's yv - confirm the
            # validation R2 of the ensemble is meaningful.
            for model_num in range(5):
                model_pt = gen_model_pt(model_name,
                                        year,
                                        pre_dir,
                                        runNN=True,
                                        model_num=model_num)
                _Xt, _yt = split(
                    data.loc(axis=0)[:, p_t[0]:p_t[1]].sample(
                        frac=1, random_state=model_num))
                _Xv, _yv = split(
                    data.loc(axis=0)[:, p_v[0]:p_v[1]].sample(
                        frac=1, random_state=model_num + 1))
                Xt, yt = _Xt, _yt
                Xv, yv = _Xv, _yv
                Xtest, ytest = _Xtest, _ytest
                if retrain:
                    model_fit = train_NN_model(Xt, yt, Xv, yv, model_pt,
                                               model_num, i, runGPU)
                else:
                    model_fit = load_NN_model(Xt, yt, Xv, yv, model_pt,
                                              model_num, i, runGPU)
                model_cntn.append(model_fit)
                # In-sample predictions are currently not computed.
                valid_pred = model_fit.predict(Xv)
                oos_pred = model_fit.predict(Xtest)
                r2valid = cal_r2(yv, valid_pred)
                r2oos = cal_r2(ytest, oos_pred)
                print(f"model{model_num} valid r2", "{0:.3%}".format(r2valid))
                print(f"model{model_num} test r2", "{0:.3%}".format(r2oos))
                nn_valid_r2.append(r2valid)
                nn_oos_r2.append(r2oos)
                nn_valid_preds.append(valid_pred)
                nn_oos_preds.append(oos_pred)

        elif config['runRF']:
            logger.info(year)
            model_name = "RF" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest
            if not retrain:
                model_fit = tree_model_fast(model_name,
                                            year,
                                            pre_dir,
                                            Xt,
                                            yt,
                                            Xv,
                                            yv,
                                            runRF=True,
                                            runGBRT=False,
                                            runGBRT2=False)
            else:
                model_fit = tree_model(Xt,
                                       yt,
                                       Xv,
                                       yv,
                                       runRF=True,
                                       runGBRT=False,
                                       runGBRT2=False)
            save_model(model_name, year, model_fit)

        elif config['runGBRT']:
            model_name = "GBRT+H" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest
            if not retrain:
                model_fit = tree_model_fast(model_name,
                                            year,
                                            pre_dir,
                                            Xt,
                                            yt,
                                            Xv,
                                            yv,
                                            runRF=False,
                                            runGBRT=True,
                                            runGBRT2=False)
            else:
                model_fit = tree_model(Xt,
                                       yt,
                                       Xv,
                                       yv,
                                       runRF=False,
                                       runGBRT=True,
                                       runGBRT2=False)
            # Don't use pickle or joblib as that may introduces dependencies on xgboost version.
            # The canonical way to save and restore models is by load_model and save_model.
            model_pt = gen_model_pt(model_name, year, pre_dir)
            model_fit.save_model(model_pt)

        elif config['runGBRT2']:
            model_name = "GBRT+l2" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest
            if not retrain:
                model_fit = tree_model_fast(model_name,
                                            year,
                                            pre_dir,
                                            Xt,
                                            yt,
                                            Xv,
                                            yv,
                                            runRF=False,
                                            runGBRT=False,
                                            runGBRT2=True)
            else:
                model_fit = tree_model(Xt,
                                       yt,
                                       Xv,
                                       yv,
                                       runRF=False,
                                       runGBRT=False,
                                       runGBRT2=True)
            # Don't use pickle or joblib as that may introduces dependencies on xgboost version.
            # The canonical way to save and restore models is by load_model and save_model.
            model_pt = gen_model_pt(model_name, year, pre_dir)
            model_fit.save_model(model_pt)

        # predict and save
        # NOTE(review): this step keys on runNN alone, while the NN branch
        # above only runs when no earlier config flag matched - confirm
        # callers never combine runNN with another model flag.
        if runNN:
            # The ensemble prediction is the row-wise mean over the models.
            yv_hat = np.mean(np.concatenate(nn_valid_preds, axis=1),
                             axis=1).reshape(-1, 1)
            ytest_hat = np.mean(np.concatenate(nn_oos_preds, axis=1),
                                axis=1).reshape(-1, 1)
            # nn_oos_r2 accumulates over all periods processed so far.
            print(f"mean r2 in {year}among models",
                  "{0:.3%}".format(np.mean(nn_oos_r2)))
            save_arrays(container, model_name, year, yv_hat, savekey='yv_hat')
            save_arrays(container, model_name, year, yv, savekey='yv')
            save_arrays(container,
                        model_name,
                        year,
                        ytest_hat,
                        savekey='ytest_hat')
            save_arrays(container, model_name, year, ytest, savekey='ytest')
            bcktst_df.loc[test_df.index, "predict"] = ytest_hat
            save_year_res(model_name, year, cal_r2(yv, yv_hat),
                          cal_r2(ytest, ytest_hat))
        else:
            yt_hat = model_fit.predict(Xt).reshape(-1, 1)
            ytest_hat = model_fit.predict(Xtest).reshape(-1, 1)
            save_arrays(container, model_name, year, yt_hat, savekey='yt_hat')
            save_arrays(container, model_name, year, yt, savekey='yt')
            save_arrays(container,
                        model_name,
                        year,
                        ytest_hat,
                        savekey='ytest_hat')
            save_arrays(container, model_name, year, ytest, savekey='ytest')
            bcktst_df.loc[test_df.index, "predict"] = ytest_hat
            save_year_res(model_name, year, cal_r2(yt, yt_hat),
                          cal_r2(ytest, ytest_hat))

    if runNN:
        # Directory of the last model path generated in the loop.
        model_dir = model_pt.parent
        return model_name, bcktst_df, container, nn_valid_r2, nn_oos_r2, model_dir
    else:
        return model_name, bcktst_df, container
def train1(X, y, random, is_output):
    """Fit LightGBM, Ridge and Huber models on one random split and print RMSLE.

    The data is split twice with ``train_test_split``: 10% is held out, then
    10% of the remainder becomes the validation set. All three models are
    fitted on the remaining training part. Predictions and targets are passed
    through ``np.expm1`` before being scored with ``rmsle_func``, so the
    targets are presumably ``log1p``-transformed prices (confirm with caller).

    Args:
        X: Feature matrix accepted by ``train_test_split``, ``lgb.Dataset``
            and the scikit-learn estimators.
        y: Targets. ``.values`` is read from an array derived from the
            validation targets, so a pandas Series is expected.
        random: Seed forwarded as ``random_state`` to both splits.
        is_output: When truthy, LightGBM logs every 35 rounds instead of 50,
            ``error_dist`` is called on the squared log errors, and one line
            per validation row is printed, largest LightGBM error first.

    Returns:
        The LightGBM RMSLE on the validation split.
    """
    idx = list(range(len(y)))
    # First split: keep 10% aside as a hold-out set.
    process_X, holdout_X, process_y, holdout_y, process_idx, holdout_idx = train_test_split(
        X, y, idx, test_size=0.1, random_state=random)
    # Second split: 10% of the remainder is used for validation.
    train_X, valid_X, train_y, valid_y, train_idx, valid_idx = train_test_split(
        process_X, process_y, process_idx, test_size=0.1, random_state=random)

    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(valid_X, label=valid_y)
    watchlist = [d_train, d_valid]
    params = {
        'learning_rate': 0.03,
        'application': 'regression',
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 0,
        'nthread': 4,
        'max_bin': 255
    }
    eval_out = 35 if is_output else 50

    # LightGBM: validation score.
    model_lgbm = lgb.train(params, train_set=d_train, num_boost_round=6310,
                           valid_sets=watchlist, verbose_eval=eval_out,
                           early_stopping_rounds=400)
    preds_lgbm = model_lgbm.predict(valid_X)
    price_lgbm_pred = np.expm1(preds_lgbm)
    price_valid_real = np.expm1(valid_y)
    o_lgbm = rmsle_func(price_lgbm_pred, price_valid_real)
    print ("LGBM RMSLE: " + str(o_lgbm))

    # LightGBM: hold-out score.
    preds_hold_out_lgbm = model_lgbm.predict(holdout_X)
    price_hold_out_lgbm = np.expm1(preds_hold_out_lgbm)
    price_hold_out_real = np.expm1(holdout_y)
    o_lgbm_holdout = rmsle_func(price_hold_out_lgbm, price_hold_out_real)
    print ("LGBM HOLDOUT RMSLE: " + str(o_lgbm_holdout))

    # Ridge baseline.
    model_ridge = Ridge(solver="lsqr", fit_intercept=False)
    model_ridge.fit(train_X, train_y)
    preds_ridge = model_ridge.predict(valid_X)
    price_ridge_pred = np.expm1(preds_ridge)
    o_ridge = rmsle_func(price_ridge_pred, price_valid_real)
    print ("RIDGE RMSLE: " + str(o_ridge))

    # Huber baseline.
    model_huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=80, epsilon=363)
    model_huber.fit(train_X, train_y)
    preds_huber = model_huber.predict(valid_X)
    price_huber_pred = np.expm1(preds_huber)
    o_huber = rmsle_func(price_huber_pred, price_valid_real)
    print ("HUBER RMSLE: " + str(o_huber))

    # Per-row squared log error of the LightGBM prediction.
    y2 = np.power(np.log1p(price_lgbm_pred) - np.log1p(price_valid_real), 2)
    y2 = y2.values

    # Todo: Display a set of predictions, one for each run model.
    if is_output:
        error_dist(y2, 0.1)
        # Walk the validation rows from the largest error to the smallest.
        for position in (-y2).argsort():
            s = get_by_validation_sequence(valid_idx, price_lgbm_pred, price_ridge_pred,
                                           price_huber_pred, position)
            print (s)

    # The original returned an undefined name `o`; the LightGBM validation
    # score is returned because the error analysis above is based on it.
    return o_lgbm
#importing the library import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error, r2_score from sklearn.metrics import classification_report, confusion_matrix #loading the dataset train = pd.read_csv("C:/Users/HP/Desktop/train (1).csv") test = pd.read_csv("C:/Users/HP/Desktop/test (2).csv") train = train.dropna() test = test.dropna() train.head() X_train = np.array(train.iloc[:, :-1].values) y_train = np.array(train.iloc[:, 1].values) X_test = np.array(test.iloc[:, :-1].values) y_test = np.array(test.iloc[:, 1].values) #Huber Regressor from sklearn.linear_model import HuberRegressor model = HuberRegressor() model.fit(X_train, y_train) y_pred = model.predict(X_test) accuracy = model.score(X_test, y_test) plt.plot(X_train, model.predict(X_train), color='y') plt.show() print(accuracy)
from sklearn.metrics import mean_squared_error

# Candidate regressors, each paired with the label printed next to its score.
# The AdaBoost and GradientBoosting labels previously said "Classifier" even
# though both entries are regressors.
models = [['DecisionTree :', DecisionTreeRegressor()],
          ['Linear Regression :', LinearRegression()],
          ['RandomForest :', RandomForestRegressor()],
          ['KNeighbours :', KNeighborsRegressor(n_neighbors = 2)],
          ['SVM :', SVR()],
          ['AdaBoostRegressor :', AdaBoostRegressor()],
          ['GradientBoostingRegressor: ', GradientBoostingRegressor()],
          ['Xgboost: ', XGBRegressor()],
          ['CatBoost: ', CatBoostRegressor(logging_level='Silent')],
          ['Lasso: ', Lasso()],
          ['Ridge: ', Ridge()],
          ['BayesianRidge: ', BayesianRidge()],
          ['ElasticNet: ', ElasticNet()],
          ['HuberRegressor: ', HuberRegressor()]]

print("Results...")

# Fit every candidate on the training split and print its RMSE on the test split.
for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name, np.sqrt(mean_squared_error(y_test, predictions)))

# Something as simple as Linear Regression performs the best in this case, which shows that more complicated models don't always give better results. There are situations where simple models are much better suited.

# **Generate Feature Importances**
mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"]) store_csv(mpg, name) if "Auto" in datasets: build_auto(AdaBoostRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), random_state = 13, n_estimators = 17), "AdaBoostAuto") build_auto(ARDRegression(normalize = True), "BayesianARDAuto") build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto") build_auto(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAuto", compact = False) build_auto(BaggingRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy = "median"), "DummyAuto") build_auto(ElasticNetCV(cv = 3, random_state = 13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAuto") build_auto(GBDTLMRegressor(RandomForestRegressor(n_estimators = 7, max_depth = 6, random_state = 13), LinearRegression()), "GBDTLMAuto") build_auto(GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 6, random_state = 13), ElasticNet(random_state = 13)), "XGBRFLMAuto") build_auto(GradientBoostingRegressor(init = None, random_state = 13), "GradientBoostingAuto") build_auto(HuberRegressor(), "HuberAuto") build_auto(LarsCV(cv = 3), "LarsAuto") build_auto(LassoCV(cv = 3, random_state = 13), "LassoAuto") build_auto(LassoLarsCV(cv = 3), "LassoLarsAuto") build_auto(LinearRegression(), "LinearRegressionAuto") build_auto(BaggingRegressor(LinearRegression(), max_features = 0.75, random_state = 13), "LinearRegressionEnsembleAuto") build_auto(OrthogonalMatchingPursuitCV(cv = 3), "OMPAuto") build_auto(RandomForestRegressor(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAuto", flat = True) build_auto(RidgeCV(), "RidgeAuto") build_auto(StackingRegressor([("ridge", Ridge(random_state = 13)), ("lasso", Lasso(random_state = 13))], 
final_estimator = GradientBoostingRegressor(n_estimators = 7, random_state = 13)), "StackingEnsembleAuto") build_auto(TheilSenRegressor(n_subsamples = 31, random_state = 13), "TheilSenAuto") build_auto(VotingRegressor([("dt", DecisionTreeRegressor(random_state = 13)), ("knn", KNeighborsRegressor()), ("lr", LinearRegression())], weights = [3, 1, 2]), "VotingEnsembleAuto") build_auto(XGBRFRegressor(n_estimators = 31, max_depth = 6, random_state = 13), "XGBRFAuto") if "Auto" in datasets: build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto")
def test_huber_max_iter():
    """A fit limited to ``max_iter=1`` must report ``n_iter_ == max_iter``."""
    features, target = make_regression_with_outliers()
    model = HuberRegressor(max_iter=1).fit(features, target)
    assert model.n_iter_ == model.max_iter
# Build the remaining base estimators. `lgb_params` and `rf_params` are
# defined earlier in the script.
lgb_model = LGBMRegressor(**lgb_params)
rf_model = RandomForestRegressor(**rf_params)
et_model = ExtraTreesRegressor()
# SVR model; SVM is too slow on more than 10000 samples. It is created here
# but not passed to the ensemble below.
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.05)
# DecisionTree model (also not passed to the ensemble below)
dt_model = DecisionTreeRegressor()
# AdaBoost model
ada_model = AdaBoostRegressor()

# Stack the base models with a HuberRegressor as the stacker, using 7 splits.
# `lgb_model`, `svr_model` and `dt_model` are not among the base models.
stack = Ensemble(n_splits=7, stacker=HuberRegressor(), base_models=(nn, cb_model, gbr_model, rf_model, xgb_model, et_model, ada_model))
# Despite its name, `y_test` holds the stacked predictions for `x_test`.
y_test = stack.fit_predict(x_train, y_train, x_test)

from datetime import datetime
print("submit...")
pre = y_test
sub = pd.read_csv('sample_submission.csv')
# Write the same prediction vector into every column except 'ParcelId'.
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = pre
# The submission file is named after the current timestamp (minute resolution).
submit_file = '{}.csv'.format(datetime.now().strftime('%Y%m%d_%H_%M'))
sub.to_csv(submit_file, index=False, float_format='%.4f')
def test_huber_sample_weights():
    """Check the ``sample_weight`` handling of HuberRegressor.

    Unit weights must reproduce the unweighted fit, integer weights must
    match a fit on data in which the corresponding rows are repeated, and a
    sparse input must give the same coefficients as the dense one.
    """
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=True)
    huber.fit(X, y)
    expected_coef = huber.coef_
    expected_intercept = huber.intercept_

    # All comparisons are done on rescaled values so that the number of
    # decimals checked by assert_array_almost_equal is largely independent of
    # the magnitude of the coefficients, and hence of the scale of the data
    # and of the regularization parameter.
    scale = max(np.mean(np.abs(huber.coef_)),
                np.mean(np.abs(huber.intercept_)))

    # Unit weights versus no weights.
    unit_weights = np.ones(y.shape[0])
    huber.fit(X, y, sample_weight=unit_weights)
    assert_array_almost_equal(huber.coef_ / scale, expected_coef / scale)
    assert_array_almost_equal(huber.intercept_ / scale,
                              expected_intercept / scale)

    # Reference fit on data where row 1 appears three times and row 3 twice.
    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
    repeated_rows = [1, 1, 3]
    X_new = np.vstack((X, X[repeated_rows]))
    y_new = np.concatenate((y, y[repeated_rows]))
    huber.fit(X_new, y_new)
    expected_coef = huber.coef_
    expected_intercept = huber.intercept_

    # The same multiplicities expressed as sample weights.
    sample_weight = np.ones(X.shape[0])
    sample_weight[[1, 3]] = (3, 2)
    huber.fit(X, y, sample_weight=sample_weight)
    assert_array_almost_equal(huber.coef_ / scale, expected_coef / scale)
    assert_array_almost_equal(huber.intercept_ / scale,
                              expected_intercept / scale)

    # Test sparse implementation with sample weights.
    huber_sparse = HuberRegressor(fit_intercept=True).fit(
        sparse.csr_matrix(X), y, sample_weight=sample_weight)
    assert_array_almost_equal(huber_sparse.coef_ / scale,
                              expected_coef / scale)
other data points, e.g., due to measurement errors.""" from sklearn import linear_model from sklearn.linear_model import HuberRegressor m = 10 # we use 100 data points of the house sales database max_r = 10 # maximum number of features used X,y = GetFeaturesLabels(m,max_r) # read in 100 data points using 10 features linreg_time = np.zeros(max_r) # vector for storing the exec. times of LinearRegresion.fit() for each r linreg_error = np.zeros(max_r) # vector for storing the training error of LinearRegresion.fit() for each r for r in range(max_r): reg_hub = HuberRegressor(fit_intercept=False) start_time = time.time() reg_hub = reg_hub.fit(X[:,:(r+1)], y) end_time = (time.time() - start_time)*1000 linreg_time[r] = end_time pred = reg_hub.predict(X[:,:(r+1)]) linreg_error[r] = mean_squared_error(y, pred) plot_x = np.linspace(1, max_r, max_r, endpoint=True) fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4)) axes[0].plot(plot_x, linreg_error, label='MSE', color='red') axes[1].plot(plot_x, linreg_time, label='time', color='green') axes[0].set_xlabel('features') axes[0].set_ylabel('empirical error') axes[1].set_xlabel('features') axes[1].set_ylabel('Time (ms)')
desig = np.array(df["designation"].tolist()) features = np.column_stack((BV, BR, BI, VR, VI, RI, totCounts, randomFeature)) features_train, features_test, temp_train, temp_test = train_test_split( features, temps, test_size=0.1) names = [ "Random Forest", "Ada Boost", "Huber", "Linear Regression", "K Neighbours", "RANSAC", "TheilSen", "Gaussian Process", "SVR" ] classifiers = [ RandomForestRegressor(), AdaBoostRegressor(), HuberRegressor(), LinearRegression(), KNeighborsRegressor(), RANSACRegressor(), TheilSenRegressor(), GaussianProcessRegressor(), SVR(kernel='rbf', gamma=0.1) ] fig, axes = plt.subplots(3, 3, sharex=True, sharey=True) fig.suptitle('Regressor Comparison', y=1.03, fontsize=18) fig.text(0.5, -0.02, 'Actual Temperature / K', ha='center') fig.text(-0.01, 0.5, 'Predicted Temperature / K', va='center',
def test_huber_sample_weights():
    """Check the ``sample_weight`` handling of HuberRegressor.

    Covers three cases: unit weights against an unweighted fit, integer
    weights against physically repeated rows, and sparse against dense input.
    """
    X, y = make_regression_with_outliers()
    huber = HuberRegressor()
    huber.fit(X, y)
    ref_coef, ref_intercept = huber.coef_, huber.intercept_

    # Rescale before comparing so that the number of decimal places checked
    # by assert_array_almost_equal is somewhat insensitive to the amplitude
    # of the coefficients, and therefore to the scale of the data and the
    # regularization parameter.
    scale = max(np.mean(np.abs(huber.coef_)),
                np.mean(np.abs(huber.intercept_)))

    def check_against_reference(model):
        # Compare the current fit with the stored reference fit.
        assert_array_almost_equal(model.coef_ / scale, ref_coef / scale)
        assert_array_almost_equal(model.intercept_ / scale,
                                  ref_intercept / scale)

    # Weights of one everywhere must not change the solution.
    huber.fit(X, y, sample_weight=np.ones(y.shape[0]))
    check_against_reference(huber)

    # Reference: rows 1 and 3 physically repeated (row 1 twice, row 3 once).
    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
    extra_X = np.vstack((X[1], X[1], X[3]))
    extra_y = [y[1], y[1], y[3]]
    huber.fit(np.vstack((X, extra_X)), np.concatenate((y, extra_y)))
    ref_coef, ref_intercept = huber.coef_, huber.intercept_

    # Same multiplicities given as weights: row 1 counts 3x, row 3 counts 2x.
    sample_weight = np.ones(X.shape[0])
    sample_weight[1] = 3
    sample_weight[3] = 2
    huber.fit(X, y, sample_weight=sample_weight)
    check_against_reference(huber)

    # Test sparse implementation with sample weights.
    X_csr = sparse.csr_matrix(X)
    huber_sparse = HuberRegressor()
    huber_sparse.fit(X_csr, y, sample_weight=sample_weight)
    assert_array_almost_equal(huber_sparse.coef_ / scale, ref_coef / scale)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# Default-configured regressors as (label, estimator) pairs. Labels run from
# 'model1' to 'model23' and follow the order of the estimators below.
aml_basic_regressors = [
    ('model%d' % position, estimator)
    for position, estimator in enumerate(
        (
            LinearRegression(),
            Lasso(),
            Ridge(),
            ElasticNet(),
            Lars(),
            LassoLars(),
            OrthogonalMatchingPursuit(),
            BayesianRidge(),
            ARDRegression(),
            PassiveAggressiveRegressor(),
            RANSACRegressor(),
            TheilSenRegressor(),
            HuberRegressor(),
            KernelRidge(),
            SVR(),
            KNeighborsRegressor(),
            DecisionTreeRegressor(),
            RandomForestRegressor(),
            ExtraTreesRegressor(),
            AdaBoostRegressor(),
            GradientBoostingRegressor(),
            MLPRegressor(),
            XGBRegressor(),
        ),
        1,
    )
]
) #train the algorithm on training data and predict using the testing data y_predransac = ransac.predict(X_test) print('Betas: ', list(zip(ransac.coef_, X))) print('Beta0: %.2f' % ransac.intercept_) #Beta0 # 5.1.5.2 Theil-Sen regression ts = TheilSenRegressor() pred_ts = ts.fit(X_train, y_train).predict( X_test ) #train the algorithm on training data and predict using the testing data y_predts = ts.predict(X_test) print('Betas: ', list(zip(ts.coef_, X))) print('Beta0: %.2f' % ts.intercept_) #Beta0 # 5.1.5.3 Huber regression huber = HuberRegressor(alpha=0.0) pred_huber = huber.fit(X_train, y_train).predict( X_test ) #train the algorithm on training data and predict using the testing data y_predhuber = huber.predict(X_test) print('Betas: ', list(zip(huber.coef_, X))) print('Beta0: %.2f' % huber.intercept_) #Beta0 """# Regression Model selection After calculating different regression models it is necessary to compare models and evaluate which is the best given the database. - MAE - MSE - RMSE - R² - Adjusted R² """
def _fit_score_stage(prefix, cols, model_names, models, train, test, df_scale,
                     y, seed, folder_preds, folder_models):
    """Fit one group of first-level models on ``cols`` and save its blended score.

    The out-of-fold blend is written to ``<prefix>_score_seed<seed>.npy`` and
    the test prediction to ``<prefix>_score_test_seed<seed>.npy`` inside
    ``folder_preds``; the fitted models are saved to ``folder_models``.
    """
    print('Creating {} score...'.format(prefix.upper()))

    # prepare datasets for this score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, cols)

    names = [name + '_{}_seed{}'.format(prefix, seed) for name in model_names]

    # train models (every model of the stage sees the same dataset)
    zoo = TrendsModelSklearn(models, seed=seed)
    zoo.fit([train_for_score] * len(models), y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * len(models), names)

    # save oof, pred, models
    np.save(folder_preds + '{}_score_seed{}.npy'.format(prefix, seed),
            score_blend)
    np.save(folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed),
            pred)
    zoo.save_models(names, folder=folder_models)


def run(seed):
    """Train the ``domain2_var1`` stack for one seed and save all artifacts.

    Steps:
        1. Load biases, raw competition data and engineered features, and
           merge everything on ``Id``.
        2. Split into train (ids present in the labels file) and test, and
           add the site-specific or generic bias to the test columns.
        3. Fit five groups of first-level models (fnc, agg, pca, im, dl) and
           store their blended out-of-fold and test scores as ``.npy`` files.
        4. Add those scores as features, fit the second-level models on the
           score, kernel and linear datasets, and write models, out-of-fold
           predictions and the blended test prediction CSV.

    Args:
        seed: Seed passed to ``TrendsModelSklearn`` and embedded in every
            model name and output file name.

    Returns:
        None. All results are written under ``./models/domain2_var1/`` and
        ``./predicts/domain2_var1/``.
    """
    # create folders for scores models and preds
    folder_models = './models/domain2_var1/scores/'
    os.makedirs(folder_models, exist_ok=True)
    folder_preds = './predicts/domain2_var1/scores/'
    os.makedirs(folder_preds, exist_ok=True)

    print('Loading data...')

    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competiton data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    ids_df = pd.concat([ids_df, extra_site])
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    # The PCA features are stored in six files (k0..k5); only the first one
    # keeps its 'Id' column, the others are appended column-wise.
    pca_parts = [
        pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    ]
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_parts.append(part)
    pca_df = pd.concat(pca_parts, axis=1)
    del pca_parts

    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    pca0_cols = [c for c in pca_cols if 'k0' in c]

    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')

    del loading_df, fnc_df, agg_df, im_df, pca_df, dl_df
    gc.collect()

    # split train and test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1

    # Explicit copies: both frames are modified in place below, which would
    # otherwise be chained assignment on a selection of `df`.
    train = df.query('is_test==0').copy()
    del train['is_test']
    test = df.query('is_test==1').copy()
    del test['is_test']
    y = train['domain2_var1'].copy().reset_index(drop=True)
    d21_index = list(train['domain2_var1'].dropna().index)

    # apply biases: site2 rows get the site-specific bias, all others the
    # generic one
    is_site2 = test['Id'].isin(ids_df['Id'])
    bias_pairs = (
        (ic_bias, ic_bias_site),
        (fnc_bias, fnc_bias_site),
        (pca_bias, pca_bias_site),
    )
    for bias, bias_site in bias_pairs:
        for c in bias_site.keys():
            test.loc[~is_site2, c] += bias[c]
            test.loc[is_site2, c] += bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # I-V. Create fnc, agg, pca, im and dl scores.
    # Each entry: (prefix, feature columns, model name stems, estimators).
    score_stages = [
        ('fnc', fnc_cols, ['ENet', 'BRidge'], [
            ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
            BayesianRidge()
        ]),
        ('agg', agg_cols, ['RGF', 'ENet', 'Huber'], [
            RGFRegressor(max_leaf=1000,
                         reg_depth=5,
                         min_samples_leaf=100,
                         normalize=True),
            ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
            HuberRegressor(epsilon=2.5, alpha=1)
        ]),
        ('pca', pca_cols, ['ENet', 'BRidge', 'OMP'], [
            ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
            BayesianRidge(),
            OrthogonalMatchingPursuit()
        ]),
        ('im', im_cols, ['ENet', 'BRidge', 'OMP'], [
            ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
            BayesianRidge(),
            OrthogonalMatchingPursuit()
        ]),
        ('dl', dl_cols, ['ENet', 'BRidge', 'OMP'], [
            ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
            BayesianRidge(),
            OrthogonalMatchingPursuit()
        ]),
    ]
    for prefix, cols, model_names, models in score_stages:
        _fit_score_stage(prefix, cols, model_names, models, train, test,
                         df_scale, y, seed, folder_preds, folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')

    # add scores
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train.loc[d21_index, prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test.loc[:, prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create differents datasets
    # linear
    linear_cols = sorted(
        list(set(ic_cols + fnc_cols + pca0_cols) - set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)

    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca0_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_factor=0.2,
                                                  scale_cols=pca0_cols,
                                                  sc=StandardScaler())

    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # learning process on different datasets
    names = ['GP', 'SVM1', 'SVM2', 'Lasso', 'BgR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        GaussianProcessRegressor(DotProduct(), random_state=0),
        NuSVR(C=3, kernel='rbf'),
        NuSVR(C=3, kernel='rbf'),
        Lasso(alpha=0.1, random_state=0),
        BaggingRegressor(Ridge(alpha=1),
                         n_estimators=100,
                         max_samples=0.2,
                         max_features=0.2,
                         random_state=0)
    ]

    # Datasets are matched to `pack` by position: GP and SVM1 use the score
    # dataset, SVM2 the kernel dataset, Lasso and BgR the linear dataset.
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2,
                        names,
                        is_blend=True)

    # rewrite folders for models and preds
    folder_models = './models/domain2_var1/stack/'
    os.makedirs(folder_models, exist_ok=True)
    folder_preds = './predicts/domain2_var1/stack/'
    os.makedirs(folder_preds, exist_ok=True)

    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    d21_prediction = pd.DataFrame()
    d21_prediction['Id'] = test['Id'].values
    d21_prediction['pred'] = preds
    d21_prediction.to_csv(folder_preds +
                          'domain2_var1_stack_seed{}.csv'.format(seed),
                          index=False)
    print('domain2_var1 seed pred is saved as',
          folder_preds + 'domain2_var1_stack_seed{}.csv'.format(seed))
def preprocess(data, fps=100., old_fps=60, filter=None, verbosity=0, fps_threshold=.1):
    """
    Normalize calcium traces and spike trains.

    This function does three things:

        1. Remove any linear trends using robust (Huber) linear regression.
        2. Normalize the range of the calcium trace by the 5th and 80th percentile.
        3. Change the sampling rate of the calcium trace and spike train.

    If C{filter} is set, the first step is replaced by estimating and removing a baseline
    using a percentile filter (40 seconds seems like a good value for the percentile filter).

    The input list is deep-copied before it is modified. C{seed(42)} is called on
    every invocation, so the random jitter added to spike times is reproducible.
    On return the calcium trace and, when present, the spike arrays are flat 1-D
    vectors, and C{spike_count} is stored for recordings that have spikes.

    @type  data: list
    @param data: list of dictionaries containing calcium/fluorescence traces

    @type  fps: float
    @param fps: desired sampling rate of signals

    @type  old_fps: int
    @param old_fps: not used by this function; kept so existing calls keep working

    @type  filter: float/None
    @param filter: percentile filter length in seconds

    @type  verbosity: int
    @param verbosity: if positive, print messages indicating progress

    @type  fps_threshold: float
    @param fps_threshold: only resample if sampling rate differs more than this

    @rtype: list
    @return: list of preprocessed recordings
    """

    seed(42)

    data = deepcopy(data)

    for k, entry in enumerate(data):
        if verbosity > 0:
            print('Preprocessing calcium trace {0}...'.format(k))

        entry['fps'] = float(entry['fps'])

        if filter is None:
            # remove any linear trend: fit a HuberRegressor on the sample
            # index and subtract its prediction
            X_temp = arange(0, len(entry['calcium'])).reshape(-1, 1)
            model = HuberRegressor()
            model.fit(X_temp, entry['calcium'])
            trend = model.predict(X_temp)
            entry['calcium'] = entry['calcium'] - trend
        else:
            # remove a slowly varying baseline (5th percentile in a window of
            # `filter` seconds)
            entry['calcium'] = entry['calcium'] - \
                percentile_filter(entry['calcium'],
                                  window_length=int(entry['fps'] * filter),
                                  perc=5)

        # normalize dispersion
        calcium05 = percentile(entry['calcium'], 5)
        calcium80 = percentile(entry['calcium'], 80)

        if calcium80 - calcium05 > 0.:
            entry['calcium'] = ((entry['calcium'] - calcium05) /
                                float(calcium80 - calcium05)).reshape(
                                    (len(entry['calcium']), ))

        # compute spike times if binned spikes are given
        if 'spikes' in entry and 'spike_times' not in entry:
            spikes = asarray(entry['spikes'].ravel(), dtype='uint16')

            # compute spike times in milliseconds; each spike gets a random
            # offset inside its bin
            spike_times = where(spikes > 0)[0]
            spike_times = repeat(spike_times, spikes[spike_times])
            spike_times = (spike_times + rand(*spike_times.shape)) * (1000. / entry['fps'])

            entry['spike_times'] = sort(spike_times).reshape(1, -1)

        # normalize sampling rate
        if fps is not None and fps > 0. and abs(entry['fps'] - fps) > fps_threshold:
            # number of samples after update of sampling rate
            num_samples = int(float(entry['calcium'].size) * fps / entry['fps'] + .5)

            if num_samples != entry['calcium'].size:
                # factor by which number of samples will actually be changed
                factor = num_samples / float(entry['calcium'].size)

                # resample calcium signal
                entry['calcium'] = resample(entry['calcium'].ravel(), num_samples).reshape(1, -1)
                entry['fps'] = entry['fps'] * factor
        else:
            # don't change sampling rate
            num_samples = entry['calcium'].size

        # compute binned spike trains if missing
        if 'spike_times' in entry and ('spikes' not in entry or num_samples != entry['spikes'].size):
            # spike times in bins
            spike_times = asarray(entry['spike_times'] * (entry['fps'] / 1000.), dtype=int).ravel()
            spike_times = spike_times[spike_times < num_samples]
            spike_times = spike_times[spike_times >= 0]

            # create binned spike train; the explicit loop is needed because
            # several spikes can fall into the same bin
            entry['spikes'] = zeros([1, num_samples], dtype='uint16')
            for t in spike_times:
                entry['spikes'][0, t] += 1

        # flatten spike times and spike trains to 1-D vectors and store the
        # total spike count; skipped for recordings without spike data
        if 'spikes' in entry:
            entry['spike_times'] = entry['spike_times'].reshape(-1, )
            entry['spikes'] = entry['spikes'].reshape(-1, )
            entry['spike_count'] = int(sum(entry['spikes']))

        # added by Gavin: the calcium trace is returned as a flat vector too
        entry['calcium'] = entry['calcium'].reshape(-1, )

    return data
): #para percorrer por todas as pastas da pasta os.chdir(folder) name_folder = folder.split("/")[6] train_data = np.array(pd.read_csv('train_data.csv', sep=';')) test_data = np.array(pd.read_csv('test_data.csv', sep=';')) train_labels = np.array(pd.read_csv('train_labels.csv', sep=';')) test_labels = np.array(pd.read_csv('test_labels.csv', sep=';')) inicio = time.time() # importar o modelo de regressão linear from sklearn.linear_model import HuberRegressor # treinar o modelo no conjunto de dados regression = HuberRegressor().fit(train_data, train_labels) # prever predictions_labels = regression.predict(test_data) fim = time.time() df_time = pd.DataFrame({'Execution Time:': [fim - inicio]}) output_path = os.path.join('/home/isadorasalles/Documents/Regressao/huber', 'time_' + name_folder) df_time.to_csv(output_path, sep=';') from sklearn import metrics df_metrics = pd.DataFrame({ 'Mean Absolute Error':
import os
import pandas as pd
from sklearn.model_selection import cross_val_score
import random
import math
import numpy as np
from sklearn import metrics
# HuberRegressor is used below; the import was missing from this block.
from sklearn.linear_model import HuberRegressor

data_train = pd.read_csv("train_dataset.csv")
data_test = pd.read_csv("test_dataset.csv")

# Feature columns: every training column except the target and the columns
# that are deliberately left out of the model.
excluded_columns = {
    'death_infection_rate', 'country', 'num', 'sqrt-factor', 'ICU/thousand'
}
feature = [i for i in data_train.columns if i not in excluded_columns]

train_feature = data_train[feature]
train_target = data_train['death_infection_rate']
test_feature = data_test[feature]

# Fit a Huber regressor and predict the target for the test set.
LiR = HuberRegressor()
LiR.fit(train_feature, train_target)
predictions_LiR = LiR.predict(test_feature)
print(LiR.coef_)
print(LiR.intercept_)

# Put the predictions next to the test data.
# axis=1 concatenates column-wise; axis=0 would concatenate row-wise.
result1 = [pd.DataFrame(data_test), pd.DataFrame(predictions_LiR)]
result1_new = pd.concat(result1, axis=1)
result1_new.to_csv('CDR.csv', index=False)
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

if __name__ == "__main__":
    # Compare three regressors on the dataset: fit each one on a 70/30
    # train/test split and print its MSE and score on the test part.
    dataset = pd.read_csv('./data/felicidad_corrupt.csv')
    print(dataset.head(5))

    X = dataset.drop(['country', 'score'], axis=1)
    # Select the target as a 1-D Series. The original `dataset[['score']]`
    # produced a single-column DataFrame, i.e. a column-vector target, which
    # the estimators used here do not expect.
    y = dataset['score']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # English summary of the note below: the default epsilon is 1.35 and it
    # is best left unchanged, as it works well for about 95% of the data.
    """
    El valor epsilon por defecto es 1.35 y por conveniencia es mejor dejarlo asi ya que
    el 95% de los datos resulta mejor con este valor de epsilon
    """
    estimadores = {
        'SVR' : SVR(gamma='auto', C=1.0, epsilon=0.1),
        'RANSAC': RANSACRegressor(),
        'HBER': HuberRegressor(epsilon=1.35)
    }

    for name, estimador in estimadores.items():
        estimador.fit(X_train, y_train)
        predictions = estimador.predict(X_test)
        print("="*64)
        print(name)
        print("MSE: ", mean_squared_error(y_test, predictions))
        print("Score: ", estimador.score(X_test, y_test))
class TDPRegressor:
    """Scale features, fit a linear regressor and plot diagnostics.

    The instance keeps the raw features (``X``), the target (``y``), a
    ``StandardScaler`` fitted on ``X`` and a regressor chosen by ``model``.
    Figures and exported models are written under
    ``fig/final_v5/<tag>_<model>``.

    Typical call order, as implied by the attributes each method reads:
    ``transform()`` -> ``fit()`` -> ``predict()`` -> plotting methods.
    """

    def __init__(self, features=None, target=None, model='ols', tag='train'):
        """Store the data, create the output directory and fit the scaler.

        Parameters
        ----------
        features : array-like, optional
            Feature matrix handed to ``StandardScaler.fit``; the plotting
            methods index it as ``X[:, 0]``. Defaults to an empty list.
        target : array-like, optional
            Target values. Defaults to an empty list.
        model : str
            ``'ols'`` selects ``LinearRegression`` and ``'huber'`` selects
            ``HuberRegressor``. Any other value leaves ``self.regr`` unset
            (it can still be supplied through ``import_model``).
        tag : str
            Prefix of the run tag; the tag becomes ``tag + '_' + model``.
        """
        import os
        import warnings

        self.tag = tag + '_' + model
        self.outdir = 'fig/final_v5/' + self.tag
        self.model = model

        # Create the directory directly instead of building a shell command
        # ('mkdir -p ' + path): no shell quoting issues and no dependency on
        # a POSIX shell. Like the shell call, a failure does not abort
        # construction; it is only reported.
        try:
            os.makedirs(self.outdir)
        except OSError:
            if not os.path.isdir(self.outdir):
                warnings.warn(
                    'could not create output directory %s' % self.outdir)

        # setup analysis
        self.X = [] if features is None else features
        self.y = [] if target is None else target

        # Scale
        self.scaler = StandardScaler(with_mean=True,
                                     with_std=True).fit(self.X)

        if model == 'ols':
            self.regr = skl_lm.LinearRegression()
        elif model == 'huber':
            self.regr = HuberRegressor(fit_intercept=True, alpha=0.0,
                                       max_iter=100, epsilon=1.35)
        print(self)

    def __repr__(self):
        """Return the tag and the zero-padded number of entries in ``X``."""
        return "Regression " + self.tag + " --- %.3d entries" % len(self.X)

    def Add(self, b):
        """Append the rows of another instance ``b`` to this one.

        ``X``, ``y``, ``X_scaled`` and ``yhat`` are concatenated along axis 0,
        so both instances must already carry ``X_scaled`` and ``yhat``.
        """
        self.X = np.append(self.X, b.X, axis=0)
        self.y = np.append(self.y, b.y, axis=0)
        self.X_scaled = np.append(self.X_scaled, b.X_scaled, axis=0)
        self.yhat = np.append(self.yhat, b.yhat, axis=0)

    def transform(self):
        """Apply the stored scaler to ``X`` and keep it as ``X_scaled``."""
        self.X_scaled = self.scaler.transform(self.X)

    def fit(self):
        """Fit the regressor on ``X_scaled`` and print its parameters."""
        X_scaled = self.X_scaled
        self.regr.fit(X_scaled, self.y)
        print(self.regr.intercept_)
        print(self.regr.coef_)

    def predict(self):
        """Predict on ``X_scaled`` into ``yhat``; score it if ``y`` is set."""
        X_scaled = self.X_scaled
        self.yhat = self.regr.predict(X_scaled)
        if len(self.y) > 0:
            self.CalcErrorMetric()

    def CalcErrorMetric(self):
        """Compute and print error metrics of ``yhat`` against ``y``.

        Sets ``frac_ame`` and ``frac_err`` (both the mean absolute error
        divided by the mean of ``y``) and ``R2``.
        """
        y = self.y
        lin_rmse = np.sqrt(mean_squared_error(y, self.yhat))
        lin_ame = mean_absolute_error(y, self.yhat)
        lin_mad = mad(y - self.yhat)
        ymean = np.mean(y)
        self.frac_ame = lin_ame / ymean
        self.frac_err = lin_ame / ymean
        self.R2 = r2_score(self.y, self.yhat)
        # Single-argument print calls produce the same text on Python 2 and
        # Python 3; the former print statements were a SyntaxError on 3.
        # The field layout (including the repeated mean absolute error after
        # the mad value) is kept exactly as before.
        print('residual standard error (rse): %s '
              'residual mean_absolute_error: %s '
              'residual mad %s %s <y>:  %s'
              % (lin_rmse, lin_ame, lin_mad, lin_ame, ymean))
        print('ratio (err):  %s' % (self.frac_err,))
        print('R^2 score:  %s' % (self.R2,))

    def PlotInputs(self, xmin=0.5, xmax=5000, xc='linear'):
        """Scatter the first feature column against ``y``; save x_vs_t.png.

        ``xc`` is used as the scale of both axes.
        """
        X = self.X
        y = self.y

        # vars to fit
        fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(7, 7))
        axes.scatter(X[:, 0], y, color='red', marker='o', alpha=0.2)
        axes.set_xlabel('x', fontsize='xx-large')
        axes.set_ylabel('Time (min)', fontsize='xx-large')
        plt.xscale(xc)
        plt.yscale(xc)
        plt.xlim(xmin, xmax)
        # Limits are positional: the ymin=/ymax= keywords are no longer
        # accepted by current Matplotlib.
        plt.ylim(10, 100000)
        plt.savefig(self.outdir + '/x_vs_t.png')

    def PlotPerformanceSingle(self, xmin=1, xmax=5000, xc='linear'):
        """Overlay data and predictions vs the first feature column.

        Saves data_model_vs_x.png. Both axes are always linear; ``xc`` is
        accepted but not used by this method.
        """
        X = self.X
        y = self.y
        yhat = self.yhat

        fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(7, 7),
                                  sharex=False)
        axarr.scatter(X[:, 0], y, color='red', marker='o', alpha=0.2,
                      label='data')
        axarr.scatter(X[:, 0], yhat, color='blue', marker='s', alpha=0.5,
                      s=5)
        axarr.set_xlabel('Volume (cm^3)', fontsize=20)
        axarr.set_ylabel('Time (min)', fontsize=20)
        axarr.yaxis.set_tick_params(labelsize=20)
        axarr.set_xscale('linear')
        axarr.set_yscale('linear')
        axarr.set_xlim(xmin, xmax)
        axarr.set_ylim(10, 15000)
        fig.savefig(self.outdir + '/data_model_vs_x.png')

    def PlotPerformance(self, xmin=1, xmax=5000, xc='linear', plotLeg=True):
        """Save a data-vs-model figure and a residual figure.

        ``y`` and ``yhat`` are divided by 60 before plotting (the axis label
        reads hours, so the stored values are presumably minutes - confirm).
        The first figure (data_model_data_vs_x.png) is always linear; ``xc``
        sets the x scale of the residual figure
        (data_model_residual_vs_x.png). The residual legend entry reads
        ``frac_ame`` and ``R2``, which ``CalcErrorMetric`` sets.
        """
        X = self.X
        y = self.y / 60
        yhat = self.yhat / 60

        # Data and model against the first feature column.
        fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(7, 7),
                                  sharex=False)
        axarr.scatter(X[:, 0], y, color='red', marker='o', alpha=0.2,
                      label='data')
        axarr.scatter(X[:, 0], yhat, color='blue', marker='s', alpha=0.5,
                      s=5)
        axarr.set_xscale('linear')
        axarr.set_yscale('linear')
        axarr.set_xlim(xmin, xmax)
        axarr.set_ylim(0, 250)
        axarr.xaxis.set_tick_params(labelsize=20)
        axarr.yaxis.set_tick_params(labelsize=20)
        axarr.set_xlabel('Volume (cm^3)', fontsize=20)
        axarr.set_ylabel('Build Time (hours)', fontsize=20)
        fig.savefig(self.outdir + '/data_model_data_vs_x.png')

        # Residuals (data minus model) against the first feature column.
        fig, axarr = plt.subplots(nrows=1, ncols=1, figsize=(7, 7),
                                  sharex=False)
        axarr.scatter(X[:, 0], y - yhat, color='red', alpha=0.2)
        axarr.set_xscale(xc)
        axarr.set_yscale('linear')
        axarr.set_xlim(xmin, xmax)
        axarr.set_ylim(-50, 50)
        axarr.xaxis.set_tick_params(labelsize=20)
        axarr.yaxis.set_tick_params(labelsize=20)
        axarr.set_xlabel('Volume (cm^3)', fontsize=20)
        axarr.set_ylabel('Build Time (hours)', fontsize=20)
        # Zero-residual reference points; they also carry the legend text.
        axarr.scatter(X[:, 0].T, X[:, 0].T * 0, color='blue', marker='s',
                      alpha=0.5, s=5,
                      label=self.tag + '\nFrac Error = '
                      + "%.3f" % self.frac_ame + '\n' + r'$R^2$ = '
                      + "%.3f" % self.R2)
        if plotLeg:
            axarr.legend(loc='lower left', framealpha=0, fontsize=16)
        fig.savefig(self.outdir + '/data_model_residual_vs_x.png')

    def export_model(self):
        """Dump ``[scaler, regr]`` to ``<outdir>/<model>.pkl`` with joblib."""
        # sklearn.externals.joblib is missing from newer scikit-learn
        # releases; keep using it where it exists and otherwise fall back to
        # the standalone joblib package.
        try:
            from sklearn.externals import joblib
        except ImportError:
            import joblib
        joblib.dump([self.scaler, self.regr],
                    self.outdir + '/' + self.model + '.pkl')

    def import_model(self, scaler, regr):
        """Replace the scaler and regressor and print their parameters."""
        self.scaler = scaler
        self.regr = regr
        print('scaler: %s' % (scaler.mean_,))
        print('regr coefficients: %s %s' % (regr.intercept_, regr.coef_))
def __init__(self):
    """Build the pool of named base estimators plus the LR and SVC models.

    Sets ``self.base_models`` (a list of ``[name, estimator]`` pairs),
    ``self.LR`` (the same LogisticRegression instance that is listed as
    'LR1') and ``self.svc``.
    """
    seed = 8240

    # NOTE(review): `n_iter=` and `loss='log'` are passed exactly as before;
    # newer scikit-learn releases spell these differently - confirm the
    # library version this code targets before upgrading.
    def sgd(penalty):
        return SGDClassifier(alpha=5e-05, average=False,
                             class_weight='balanced', loss='log', n_iter=30,
                             penalty=penalty, n_jobs=-1, random_state=seed)

    def logreg(C, **extra):
        return LogisticRegression(C=C, n_jobs=-1, max_iter=100,
                                  class_weight='balanced', random_state=seed,
                                  **extra)

    def passive_aggressive(C, loss, n_iter, **extra):
        return PassiveAggressiveClassifier(C=C, loss=loss, n_iter=n_iter,
                                           n_jobs=-1, **extra)

    # The first five estimators are shared with the hard-voting ensemble.
    sgd_l2 = sgd('l2')
    mnb_01 = MultinomialNB(alpha=0.1)
    lsvc_01 = LinearSVC(C=0.1, random_state=seed)
    logreg_c1 = logreg(1.0)
    bnb_01 = BernoulliNB(alpha=0.1)
    hard_vote = VotingClassifier(
        estimators=[('sgd', sgd_l2), ('mb', mnb_01), ('bb', lsvc_01),
                    ('lf', logreg_c1), ('bnb', bnb_01)],
        voting='hard')

    self.base_models = [
        ['sgd', sgd_l2],
        ['nb', mnb_01],
        ['lsvc1', lsvc_01],
        ['LR1', logreg_c1],
        ['bb', bnb_01],
        ['vote', hard_vote],
        ['sgdl1', sgd('l1')],
        ['lsvc2', LinearSVC(C=0.9, random_state=seed)],
        ['LR2', logreg(0.5)],
        ['nb2', MultinomialNB(alpha=0.9)],
        ['bb2', BernoulliNB(alpha=0.9)],
        ['LR3', logreg(0.2, penalty='l1')],
        ['LR4', logreg(0.8, penalty='l1')],
        ['rc1', RidgeClassifier(alpha=8)],
        ['pac1', passive_aggressive(0.01, 'squared_hinge', 20)],
        ['rc2', RidgeClassifier(alpha=2)],
        ['pac2', passive_aggressive(0.5, 'squared_hinge', 30)],
        ['lsvc3', LinearSVC(C=0.5, random_state=seed)],
        ['nb3', MultinomialNB(alpha=0.5)],
        ['bb3', BernoulliNB(alpha=0.5)],
        ['lr5', Lasso(alpha=0.1, max_iter=20, random_state=seed)],
        ['lr6', Lasso(alpha=0.9, max_iter=30, random_state=seed)],
        ['rc3', passive_aggressive(0.1, 'hinge', 30, random_state=seed)],
        ['pac3', passive_aggressive(0.9, 'hinge', 30, random_state=seed)],
        ['hub', HuberRegressor(max_iter=30)],
    ]
    self.LR = logreg_c1
    self.svc = SVC(C=1, random_state=seed, cache_size=1000)
def _ellip_smooth(R, E, deg):
    """Return a robust polynomial smoothing of ``E`` as a function of log10(R).

    A ``PolynomialFeatures(deg)`` + ``HuberRegressor(epsilon=2.)`` pipeline is
    fitted to ``_inv_x_to_eps(E)`` against ``log10(R)``; the fitted curve is
    evaluated at the same radii and mapped back with ``_x_to_eps``.
    """
    # Single-feature design matrix: one row per radius.
    log_radius = np.log10(R).reshape(-1, 1)
    target = _inv_x_to_eps(E)

    smoother = make_pipeline(PolynomialFeatures(deg),
                             HuberRegressor(epsilon=2.))
    smoother.fit(log_radius, target)

    fitted = smoother.predict(log_radius)
    return _x_to_eps(fitted)
"kr": SklearnWrapper(KernelRidge(), accept_singleton=True), "rf": SklearnWrapper(RandomForestRegressor(), accept_singleton=True), "gb": SklearnWrapper(MultiOutputRegressor(GradientBoostingRegressor()), accept_singleton=True), "lr": SklearnWrapper(Pipeline([("poly", PolynomialFeatures(2)), ("regressor", MultiOutputRegressor(LinearRegression()))]), accept_singleton=True), "hr": SklearnWrapper(Pipeline([("poly", PolynomialFeatures(2)), ("regressor", MultiOutputRegressor(HuberRegressor()))]), accept_singleton=True), "ran": SklearnWrapper(Pipeline([("poly", PolynomialFeatures(2)), ("regressor", MultiOutputRegressor(RANSACRegressor()))]), accept_singleton=True), "gpr": SklearnWrapper(MultiOutputRegressor(GaussianProcessRegressor()), accept_singleton=True), "wei": SklearnWrapper(MultiOutputRegressor(WeightedCurver(maxfev=100000)), accept_singleton=True), "sum": SklearnWrapper(MultiOutputRegressor( SummedCurver(maxfev=2000, method="dogbox")),
# Add four strong outliers to the dataset: two beyond the largest X with low
# y, two beyond the smallest X with high y.
y_outliers = rng.normal(0, 2.0, size=4)
X_outliers[:2, :] += X.max() + X.mean() / 4.
X_outliers[2:, :] += X.min() - X.mean() / 4.
y_outliers[:2] += y.min() - y.mean() / 4.
y_outliers[2:] += y.max() + y.mean() / 4.
X = np.vstack((X, X_outliers))
y = np.concatenate((y, y_outliers))
plt.plot(X, y, 'b.')

# Fit the huber regressor over a series of epsilon values.
colors = ['r-', 'b-', 'y-', 'm-']

x = np.linspace(X.min(), X.max(), 7)
epsilon_values = [1.35, 1.5, 1.75, 1.9]
for color, epsilon in zip(colors, epsilon_values):
    huber = HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100,
                           epsilon=epsilon)
    huber.fit(X, y)
    # Despite its name, coef_ holds the fitted line evaluated on the grid x.
    coef_ = huber.coef_ * x + huber.intercept_
    plt.plot(x, coef_, color, label="huber loss, %s" % epsilon)

# Fit a ridge regressor to compare it to huber regressor.
# `normalize=True` is no longer passed: the parameter was removed from recent
# scikit-learn releases, and with alpha=0.0 (no penalty) rescaling the
# features does not change the fitted line.
ridge = Ridge(fit_intercept=True, alpha=0.0, random_state=0)
ridge.fit(X, y)
coef_ridge = ridge.coef_
coef_ = ridge.coef_ * x + ridge.intercept_
plt.plot(x, coef_, 'g-', label="ridge regression")

plt.title("Comparison of HuberRegressor vs Ridge")
plt.xlabel("X")
plt.ylabel("y")
plt.legend(loc=0)
# doesn't appear to be any trend with the year.
fig, ax = plt.subplots()
train[['SalePrice', 'YrSold']].boxplot(by='YrSold', column='SalePrice', ax=ax)
plt.xlabel('Year sold')
plt.ylabel('Price ($)')
plt.suptitle("")
plt.show()

# %% From a human perspective, the living space looks like the strongest
# indicator of price, let's see whether a basic fit can be made. Need to use
# HuberRegressor because it is more robust to outliers.
fig, ax = plt.subplots()
ax.scatter(train.GrLivArea, train.SalePrice, alpha=0.2, label='Real data')

# Single-feature design matrix (one column), built once and reused.
living_area = train.GrLivArea.values.reshape(-1, 1)

clf = HuberRegressor()
# Pass a 1-D target: a column vector makes scikit-learn emit a
# DataConversionWarning before flattening it.
clf.fit(living_area, train.SalePrice.values.ravel())
salePredictGrLivArea = clf.predict(living_area)

# Reuse the prediction computed above instead of predicting a second time.
ax.plot(living_area, salePredictGrLivArea, 'black', label='Linear fit')
plt.xlabel('Living area')
plt.ylabel('Price ($)')
plt.legend()
plt.show()

# %% Let's look at correlations in the dataset (at least between numeric values)
# We only care about correlations with sale price, so let's visualise that.
# It turns out a number of variables have a large positive correlation