def get_models(models=None):
    # Use None as the default to avoid the shared mutable-default pitfall.
    if models is None:
        models = dict()
    # linear models
    models['lr'] = LinearRegression()
    models['lasso'] = Lasso()
    models['ridge'] = Ridge()
    models['en'] = ElasticNet()
    models['huber'] = HuberRegressor()
    models['lars'] = Lars()
    models['llars'] = LassoLars()
    models['pa'] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3)
    models['ransac'] = RANSACRegressor()
    models['sgd'] = SGDRegressor(max_iter=1000, tol=1e-3)
    print('Defined %d models' % len(models))
    return models
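# A minimal usage sketch for get_models(), assuming a feature matrix X and
# target y already exist; the scoring choice and CV setup here are
# illustrative assumptions, not part of the original code.
from sklearn.model_selection import cross_val_score

def evaluate_models(models, X, y):
    for name, model in models.items():
        scores = cross_val_score(model, X, y,
                                 scoring='neg_mean_absolute_error', cv=5)
        print('%s: %.3f (+/- %.3f)' % (name, scores.mean(), scores.std()))

# evaluate_models(get_models(), X, y)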
def test_vs_huber():
    # The robust reweighted regressor should roughly recover the same
    # coefficient as HuberRegressor on this dataset.
    reg1 = RobustWeightedRegressor(
        max_iter=100,
        weighting="huber",
        k=5,
        c=1,
        burn_in=0,
        sgd_args={"learning_rate": "adaptive"},  # test sgd_args
        random_state=rng,
    )
    reg2 = HuberRegressor()
    reg1.fit(X_rcy, y_rcy)
    reg2.fit(X_rcy, y_rcy)
    assert np.abs(reg1.coef_[0] - reg2.coef_[0]) < 1e-2
def test_quantile_equals_huber_for_low_epsilon(fit_intercept, default_solver):
    X, y = make_regression(n_samples=100, n_features=20, random_state=0,
                           noise=1.0)
    alpha = 1e-4
    huber = HuberRegressor(
        epsilon=1 + 1e-4, alpha=alpha, fit_intercept=fit_intercept
    ).fit(X, y)
    quant = QuantileRegressor(
        alpha=alpha, fit_intercept=fit_intercept, solver=default_solver
    ).fit(X, y)
    assert_allclose(huber.coef_, quant.coef_, atol=1e-1)
    if fit_intercept:
        assert huber.intercept_ == approx(quant.intercept_, abs=1e-1)
    # check that we still predict the median fraction (default quantile 0.5)
    assert np.mean(y < quant.predict(X)) == approx(0.5, abs=1e-1)
def build_model(model_type, params):
    """Build a regression model wrapped in a preprocessing pipeline.

    Supports LinearRegression, HuberRegressor, Lasso, DecisionTreeRegressor,
    RandomForestRegressor, SupportVectorMachine (SVR), XGBoost and
    MultiPerceptron (MLPRegressor). Input data is normalized before fitting.

    Parameters
    ----------
    model_type : str
        Type of model; one of the supported types listed above.
    params : dict
        Parameters for the corresponding model. An optional 'degree' key
        controls the PolynomialFeatures step.

    Returns
    -------
    estimator : Pipeline
        Pipeline of scalers, polynomial features and the chosen model.
    """
    support_type = [
        'LinearRegression', 'HuberRegressor', 'Lasso', 'DecisionTreeRegressor',
        'RandomForestRegressor', 'SupportVectorMachine', 'XGBoost',
        'MultiPerceptron'
    ]
    assert model_type in support_type, 'Expected one of: {}'.format(
        ', '.join(support_type))

    steps = [('minmax-scaler', MinMaxScaler()),
             ('standard-scaler', StandardScaler()),
             ('polynomial', PolynomialFeatures(params.pop('degree', 1)))]

    # Choose the model type
    if model_type == 'LinearRegression':
        steps.append(('model', LinearRegression(**params)))
    elif model_type == 'HuberRegressor':
        steps.append(('model', HuberRegressor(**params)))
    elif model_type == 'Lasso':
        steps.append(('model', Lasso(**params)))
    elif model_type == 'DecisionTreeRegressor':
        steps.append(('model', DecisionTreeRegressor(**params)))
    elif model_type == 'RandomForestRegressor':
        steps.append(('model', RandomForestRegressor(**params)))
    elif model_type == 'SupportVectorMachine':
        steps.append(('model', SVR(**params)))
    elif model_type == 'XGBoost':
        steps.append(('model', XGBRegressor(**params)))
    elif model_type == 'MultiPerceptron':
        steps.append(('model', MLPRegressor(**params)))

    estimator = Pipeline(steps)
    return estimator
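# A hedged usage sketch for build_model(); the parameter values and the
# X_train/y_train names are illustrative assumptions, not from the original.
model = build_model('HuberRegressor',
                    {'degree': 2, 'epsilon': 1.5, 'alpha': 1e-3})
# model.fit(X_train, y_train)
# predictions = model.predict(X_test)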
def test_huber_warm_start():
    X, y = make_regression_with_outliers()
    huber_warm = HuberRegressor(
        fit_intercept=True, alpha=1.0, max_iter=10000, warm_start=True,
        tol=1e-1)
    huber_warm.fit(X, y)
    huber_warm_coef = huber_warm.coef_.copy()
    huber_warm.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so
    # these would be almost same but not equal.
    assert_array_almost_equal(huber_warm.coef_, huber_warm_coef, 1)

    # No n_iter_ in old SciPy (<=0.9)
    if huber_warm.n_iter_ is not None:
        assert_equal(0, huber_warm.n_iter_)
def __init__(self, x_train, y_train, test_split_available=False,
             test_size=0.1, shuffle=True, number_of_estimator=10,
             estimator=None, estimators=None, random_state=None):
    if test_split_available:
        self.x_train, self.x_test, self.y_train, self.y_test = \
            train_test_split(x_train, y_train, test_size=test_size,
                             shuffle=shuffle, random_state=random_state)
    else:
        self.x_test = x_train
        self.y_test = y_train
        self.x_train = x_train
        self.y_train = y_train
    self.y_predict_test = {}
    self.y_predict_train = {}
    # Base model zoo, defined once so the bagging and voting ensembles can
    # be added after the base estimator has been chosen.
    self.models = {'svr': SVR(),
                   'knn': KNeighborsRegressor(),
                   'tree': DecisionTreeRegressor(),
                   'logistic': LogisticRegression(),
                   'linear': LinearRegression(),
                   'ridge': Ridge(),
                   'ridgecv': RidgeCV(),
                   'lasso': Lasso(),
                   'lassolars': LassoLars(alpha=0.1),
                   'bayesian': BayesianRidge(),
                   'ElasticNet': ElasticNet(),
                   'TheilSenRegressor': TheilSenRegressor(),
                   'ARDRegression': ARDRegression(),
                   'RANSACRegressor': RANSACRegressor(),
                   'HuberRegressor': HuberRegressor(),
                   'randomForest': RandomForestRegressor(n_estimators=50),
                   'boost': AdaBoostRegressor(random_state=0,
                                              n_estimators=100)}
    self.estimator = self.models[estimator]
    estimators_list = [(name, self.models[name]) for name in estimators]
    self.models['bagging'] = BaggingRegressor(
        base_estimator=self.estimator,
        n_estimators=number_of_estimator,
        max_features=0.8)
    self.models['voting'] = VotingRegressor(estimators=estimators_list)
def get_models(models=None):
    if models is None:
        models = dict()
    # linear models
    models['linear regression'] = LinearRegression()
    models['lasso'] = Lasso()
    models['ridge'] = Ridge()
    models['elastic net'] = ElasticNet()
    models['huber regressor'] = HuberRegressor()
    # models['lars'] = Lars()
    models['lasso lars'] = LassoLars()
    models['passive aggressive regressor'] = PassiveAggressiveRegressor(
        max_iter=1000, tol=1e-3)
    models['ransac regressor'] = RANSACRegressor(min_samples=4)
    models['sgd regressor'] = SGDRegressor(max_iter=5000, tol=1e-3)
    print('Defined %d models' % len(models))
    return models
def test_huber_scaling_invariant():
    # Test that outliers filtering is scaling independent.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0)
    huber.fit(X, y)
    n_outliers_mask_1 = huber.outliers_
    assert not np.all(n_outliers_mask_1)

    huber.fit(X, 2.0 * y)
    n_outliers_mask_2 = huber.outliers_
    assert_array_equal(n_outliers_mask_2, n_outliers_mask_1)

    huber.fit(2.0 * X, 2.0 * y)
    n_outliers_mask_3 = huber.outliers_
    assert_array_equal(n_outliers_mask_3, n_outliers_mask_1)
def _algorithm(self):
    if self.algorithm.lower() == 'svr':
        tuned_parameters = [{
            'C': np.arange(1, 4, 0.5),
            'epsilon': np.arange(0.5, 2, 0.2),
            'tol': [1, 1e-1, 1e-2, 1e-3]
        }]
        # return GridSearchCV(
        #     SVR(kernel='rbf', shrinking=True, gamma='auto'),
        #     tuned_parameters, cv=5, error_score=0, n_jobs=4, verbose=1)
        return SVR(kernel='rbf', C=1)
    elif self.algorithm.lower() == 'mlp':
        return MLPRegressor()
    elif self.algorithm.lower() == 'huber':
        return HuberRegressor()
    elif self.algorithm.lower() == 'lr':
        return linear_model.LinearRegression()
    elif self.algorithm.lower() == 'ridge':
        return linear_model.Ridge(alpha=0.5)
    elif self.algorithm.lower() == 'rf':
        return RandomForestRegressor(random_state=0, n_estimators=200)
    elif self.algorithm.lower() == 'gbr':
        tuned_parameters = [{
            'n_estimators': [160, 170, 180],
            'subsample': [0.6, 0.7, 0.8],
        }]
        # return GridSearchCV(
        #     GradientBoostingRegressor(
        #         loss='ls', warm_start=False, max_features=0.2,
        #         learning_rate=0.05, alpha=0.4, max_depth=13,
        #         subsample=0.6, n_estimators=180),
        #     tuned_parameters, cv=5, error_score=0, n_jobs=4, verbose=1)
        return GradientBoostingRegressor(loss='ls',
                                         warm_start=False,
                                         max_features=0.2,
                                         learning_rate=0.05,
                                         alpha=0.4,
                                         max_depth=13,
                                         subsample=0.6,
                                         n_estimators=180)
    elif self.algorithm.lower() == 'adb':
        return AdaBoostRegressor()
    else:
        raise ValueError(
            'Sklearn algorithm options: svr, mlp, huber, lr, ridge, rf, '
            'gbr, adb')
def forecaster(returns, ff, loss='MSE'):
    output = []
    dates = sorted(list(ff.index))
    dataset = ff.merge(returns, left_index=True, right_index=True)
    columnNames = ['MktPremium', 'HML', 'Mom']
    name = returns.columns.tolist()[0]
    i = dates.index('200201')
    for j in range(i, len(dates)):
        trainData = dataset.loc['199801':dates[j], :]
        trainX = trainData[columnNames]
        trainY = trainData[[name]]
        # Pick the sklearn model for the requested loss; OLS is the default
        # (and handles the 'MSE' case).
        model = LinearRegression()
        if loss == 'Ridge':
            model = Ridge()
        if loss == 'Lasso':
            model = Lasso()
        if loss == 'Hub':
            model = HuberRegressor()
        if loss == 'ElasticNet':
            model = ElasticNet()
        model.fit(trainX, trainY)
        testData = pd.DataFrame(dataset.loc[dates[j], :]).T
        testX = testData[columnNames]
        prediction = model.predict(testX)
        # The quantile losses use statsmodels' QuantReg; the sklearn fit
        # above is discarded and the prediction is overwritten.
        if loss == 'LAD':
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=0.5)
            prediction = model.predict(res.params, exog=testX)
        if loss == '1Q':
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=0.25)
            prediction = model.predict(res.params, exog=testX)
        if loss == '3Q':
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=0.75)
            prediction = model.predict(res.params, exog=testX)
        # These estimators return a 1-D prediction; the others return 2-D.
        if loss in ['Lasso', 'Hub', 'ElasticNet', 'LAD', '1Q', '3Q']:
            output.append(prediction[0])
        else:
            output.append(prediction[0][0])
    return (name, output)
def test_huber_scaling_invariant():
    """Test that outliers filtering is scaling independent."""
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100)
    huber.fit(X, y)
    n_outliers_mask_1 = huber.outliers_
    assert_false(np.all(n_outliers_mask_1))

    huber.fit(X, 2. * y)
    n_outliers_mask_2 = huber.outliers_
    assert_array_equal(n_outliers_mask_2, n_outliers_mask_1)

    huber.fit(2. * X, 2. * y)
    n_outliers_mask_3 = huber.outliers_
    assert_array_equal(n_outliers_mask_3, n_outliers_mask_1)
def test_huber_warm_start():
    X, y = make_linear_regression_with_outliers()
    huber_warm = HuberRegressor(alpha=1.0, max_iter=10000, warm_start=True,
                                tol=1e-1)
    huber_warm.fit(X, y)
    huber_warm_coef = huber_warm.coef_.copy()
    huber_warm.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so
    # these would be almost same but not equal.
    assert_array_almost_equal(huber_warm.coef_, huber_warm_coef, 1)
    assert huber_warm.n_iter_ == 0
def __init__(self, observations, groups, features, peaked, tail_prob=0.4,
             regressor=HuberRegressor(),
             classifier=LinearSVC(random_state=42)):
    super().__init__(observations, groups, features)
    if len(observations) != len(features) or len(observations) != len(peaked):
        raise ValueError(
            'observations, features and peaked must have the same length')
    self.peaked = peaked
    self.regressor = regressor
    self.classifier = classifier
    self.tail_prob = tail_prob
def fit(x, y, axis, ic=ip, xlab=None, ylab=None):
    # Note: the default ic=ip assumes a module-level colour index `ip`.
    mx, my, sx, sy = x.mean(), y.mean(), x.std(), y.std()
    # mask = (x > mx - 3 * sx) & (x < mx + 3 * sx)
    # x, y = x[mask], y[mask]
    # m, c = np.polyfit(x, y, deg=1)  # superseded by the robust fit below
    linearfit = HuberRegressor().fit(x.reshape(-1, 1), y)
    m, c = linearfit.coef_[0], linearfit.intercept_
    print(m, c)
    axis.plot(x, y, 'C%d.' % ic, ms=2)
    axis.axvline(mx, lw=0.5, color='gray')
    xx = np.linspace(x.min(), x.max())
    yy = xx * m + c
    axis.plot(xx, yy, 'C%d' % ip, label='m={:.2f}'.format(m))
    # axis.set_xlim(mx - 2 * sx, mx + 2 * sx)
    # axis.set_ylim(m * (mx - 2 * sx) + c, m * (mx + 2 * sx) + c)
    return xx, m * xx + c, m, c
def get_models_multioutput(models=None):
    if models is None:
        models = dict()
    # linear models
    models['lr'] = MultiOutputRegressor(LinearRegression())
    alpha = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for a in alpha:
        models['lasso-' + str(a)] = MultiOutputRegressor(Lasso(alpha=a))
    for a in alpha:
        models['ridge-' + str(a)] = MultiOutputRegressor(Ridge(alpha=a))
    for a1 in alpha:
        for a2 in alpha:
            name = 'en-' + str(a1) + '-' + str(a2)
            models[name] = MultiOutputRegressor(
                ElasticNet(alpha=a1, l1_ratio=a2))
    models['huber'] = MultiOutputRegressor(HuberRegressor())
    models['lars'] = MultiOutputRegressor(Lars())
    models['llars'] = MultiOutputRegressor(LassoLars())
    models['pa'] = MultiOutputRegressor(
        PassiveAggressiveRegressor(max_iter=1000, tol=1e-3))
    models['ransac'] = MultiOutputRegressor(RANSACRegressor())
    models['sgd'] = MultiOutputRegressor(SGDRegressor(max_iter=1000, tol=1e-3))
    models['theil'] = MultiOutputRegressor(TheilSenRegressor())
    # non-linear models
    n_neighbors = range(1, 21)
    for k in n_neighbors:
        models['knn-' + str(k)] = MultiOutputRegressor(
            KNeighborsRegressor(n_neighbors=k))
    models['cart'] = MultiOutputRegressor(DecisionTreeRegressor())
    models['extra'] = MultiOutputRegressor(ExtraTreeRegressor())
    models['svml'] = MultiOutputRegressor(SVR(kernel='linear'))
    models['svmp'] = MultiOutputRegressor(SVR(kernel='poly'))
    c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for c in c_values:
        # wrap SVR so it supports multi-output targets like the rest
        models['svmr' + str(c)] = MultiOutputRegressor(SVR(C=c))
    # ensemble models
    n_trees = 100
    models['ada'] = MultiOutputRegressor(
        AdaBoostRegressor(n_estimators=n_trees))
    models['bag'] = MultiOutputRegressor(
        BaggingRegressor(n_estimators=n_trees))
    models['rf'] = MultiOutputRegressor(
        RandomForestRegressor(n_estimators=n_trees))
    models['et'] = MultiOutputRegressor(
        ExtraTreesRegressor(n_estimators=n_trees))
    models['gbm'] = MultiOutputRegressor(
        GradientBoostingRegressor(n_estimators=n_trees))
    print('Defined %d models' % len(models))
    return models
def __default_regressors():
    return {
        'huber': HuberRegressor(),
        'theil_sen': TheilSenRegressor(),
        'linear': LinearRegression(),
        'ard': ARDRegression(),
        'orthogonal_matching': OrthogonalMatchingPursuit(),
        'elastic_net': ElasticNet(),
        'bayesian_ridge': BayesianRidge(),
        'lasso_lars': LassoLars(),
        'lasso': Lasso(),
        'ridge': Ridge(),
        'gaussian_process': GaussianProcessRegressor(),
        'decision_tree': DecisionTreeRegressor(),
        'svr': SVR(),
        'nu_svr': NuSVR(),
        'kernel_ridge': KernelRidge()
    }
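# A minimal sketch of how such a registry might be consumed, assuming
# __default_regressors is reachable in the current scope (it may be a
# name-mangled class member in the original) and X, y exist; the scoring
# choice and loop are illustrative, not from the original code.
from sklearn.model_selection import cross_val_score

def score_default_regressors(X, y):
    for name, reg in __default_regressors().items():
        scores = cross_val_score(reg, X, y, scoring='r2', cv=3)
        print('%s: mean R^2 = %.3f' % (name, scores.mean()))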
def validate(params):
    category_encoding = params['category_encoding']
    if category_encoding == 'onehot':
        df2dict = FunctionTransformer(
            lambda x: x.to_dict(orient='records'), validate=False)
        transf = make_pipeline(
            FunctionTransformer(days_to_delta, validate=False),
            df2dict,
            DictVectorizer(sparse=False),
        )
    elif category_encoding == 'count':
        transf = make_pipeline(
            FunctionTransformer(days_to_delta, validate=False),
            count_encoder(),
            SimpleImputer())
    else:
        raise AssertionError(
            f'unknown category encoding type: {category_encoding}')

    reg_type = params['regressor_type']
    if reg_type == 'rfr':
        reg = make_pipeline(
            SelectKBest(f_regression, k=params['k_best']),
            RandomForestRegressor(n_jobs=params['n_jobs'],
                                  n_estimators=params['n_estimators'],
                                  max_features=params['max_features'],
                                  max_depth=params['max_depth'],
                                  random_state=1))
    elif reg_type == 'huber':
        reg = HuberRegressor(epsilon=params['epsilon'])
    elif reg_type == 'ard':
        reg = ARDRegression()
    else:
        raise AssertionError(f'unknown regressor type: {reg_type}')

    est = make_pipeline(transf, reg)
    if params['drop_outliers']:
        est = no_outliers_pipeline(est)

    valid_mode = params['valid_mode']
    n_folds = params['n_folds']
    if valid_mode == 'split':
        return split_test(est, n_folds)
def main():
    data = data_loading('international-airline-passengers.csv')
    regressors = [
        ('AdaBoostRegressor', AdaBoostRegressor()),
        ('BaggingRegressor', BaggingRegressor()),
        ('ExtraTreesRegressor', ExtraTreesRegressor()),
        ('GaussianProcessRegressor',
         Pipeline([('scaler', MinMaxScaler()),
                   ('gauss', GaussianProcessRegressor(
                       n_restarts_optimizer=0, normalize_y=True))])),
        ('GradientBoostingRegressor', GradientBoostingRegressor()),
        ('HuberRegressor', HuberRegressor()),
        ('SGDRegressor', Pipeline([('scaler', StandardScaler()),
                                   ('sgd', SGDRegressor())])),
        ('PassiveAggressiveRegressor', PassiveAggressiveRegressor()),
        ('RANSACRegressor', RANSACRegressor()),
        ('RandomForestRegressor', RandomForestRegressor()),
        ('Lasso', Lasso()),
        ('ElasticNet', ElasticNet()),
        ('Linear SVR', Pipeline([('scaler', StandardScaler()),
                                 ('svr', SVR(kernel='linear'))])),
        ('SVR', Pipeline([('scaler', StandardScaler()),
                          ('svr', SVR(kernel='rbf'))])),
    ]

    # Fit them all
    regressor_data = {}
    for reg_name, model in regressors:
        print("#" * 80)
        print("Start fitting '%s' regressor." % reg_name)
        examples = 100000  # Reduce data to make training faster
        t0 = time.time()
        model.fit(data['train']['X'][:examples],
                  data['train']['y'][:examples])
        t1 = time.time()
        an_data = analyze(model, data['all'], t1 - t0, reg_name)
        regressor_data[reg_name] = {
            'name': reg_name,
            'training_time': (t1 - t0) * 1000
        }
        for key, value in an_data.items():
            regressor_data[reg_name][key] = value
    print_website(regressor_data)
def get_ensembles_many_regressors(x: np.array, y: np.array,
                                  metric: Callable[[np.array, np.array], float],
                                  metric_max_better: bool = True) -> None:
    """
    Tries a few solid regressors in sklearn and prints the best performing one.

    :param x: numpy array of the features
    :param y: numpy array of the target
    :param metric: the evaluation metric to use
    :param metric_max_better: True if a higher metric value is better
    """
    gb = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=42)
    gb2 = GradientBoostingRegressor(learning_rate=0.05, max_features='sqrt',
                                    loss='huber', min_impurity_split=None,
                                    min_samples_leaf=15, min_samples_split=10,
                                    n_estimators=12000, random_state=42)
    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=42))
    elastic = make_pipeline(RobustScaler(),
                            ElasticNet(alpha=0.0005, l1_ratio=.9,
                                       max_iter=10000, random_state=42))
    rf = RandomForestRegressor(n_estimators=200, min_samples_leaf=3,
                               random_state=42)
    rrf = ExtraTreesRegressor(n_estimators=200, min_samples_leaf=3,
                              random_state=42)
    huber = HuberRegressor()
    linear = LinearRegression()
    nn = MLPRegressor(hidden_layer_sizes=(1000, 10), learning_rate='adaptive',
                      max_iter=1000, random_state=42, early_stopping=True)
    svm_r = svm.SVR(kernel='poly', gamma='auto')
    knn = KNeighborsRegressor(n_neighbors=5)
    regressors = [gb, gb2, lasso, elastic, rf, rrf, huber, linear, nn,
                  svm_r, knn]
    scores = np.zeros(len(regressors))
    for i, r in enumerate(regressors):
        print('Running k-fold cross validation for', r.__class__.__name__)
        scores[i] = cross_validate(r, x, y, metric)
    best_index = np.argmax if metric_max_better else np.argmin
    best = np.amax if metric_max_better else np.amin
    # argmax/argmin return a scalar index here, so it can be used directly.
    print('Best performing model:',
          regressors[best_index(scores)].__class__.__name__)
    print('Best', metric.__name__, ':', best(scores))
def tune_huber_regression_hyperparameters():
    # hyperparameters
    alpha = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]
    # epsilon controls the number of samples that should be classified as
    # outliers: the smaller the epsilon, the more robust it is to outliers.
    # It must be a float greater than or equal to 1.0; sklearn's default
    # is 1.35 (a Python range() cannot generate the float grid, hence
    # np.linspace).
    epsilon = np.append(np.linspace(1, 5, 9), [1.35])
    tol = [0.01, .001, 0.0001, .00001]

    # trackers for the best model and its scores
    best_model = None
    best_score = None
    best_hyperparameters = []
    run_once_flag = False
    for alpha_element in alpha:
        for epsilon_element in epsilon:
            for tol_element in tol:
                myModel = HuberRegressor(alpha=alpha_element,
                                         epsilon=epsilon_element,
                                         tol=tol_element)
                # evaluate_model returns a list of scores
                # (least squares loss with L2 regularization)
                myScores = evaluate_model(myModel, 'Huber Regression')
                # On the first iteration just record the model and scores,
                # since there is nothing to compare against yet.
                if not run_once_flag:
                    best_model = myModel
                    best_score = myScores
                    best_hyperparameters = [alpha_element, epsilon_element,
                                            tol_element]
                    run_once_flag = True
                # Update the best model if the validation MSE improved.
                if myScores[1] < best_score[1]:
                    best_model = myModel
                    best_score = myScores
                    best_hyperparameters = [alpha_element, epsilon_element,
                                            tol_element]
    # Having gone through all hyperparameter combinations, store everything
    # in a BestModelObject (a custom container class) and return it.
    return BestModelObject(best_model, best_score, best_hyperparameters)
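# The same sweep can be expressed with sklearn's GridSearchCV; this is a
# hedged alternative sketch, not the original code, and it assumes X_train
# and y_train are available.
import numpy as np
from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20],
    'epsilon': np.append(np.linspace(1, 5, 9), [1.35]),
    'tol': [1e-2, 1e-3, 1e-4, 1e-5],
}
search = GridSearchCV(HuberRegressor(), param_grid,
                      scoring='neg_mean_squared_error', cv=5)
# search.fit(X_train, y_train)
# print(search.best_params_, -search.best_score_)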
def __init__(self, algorithm_name):
    """
    It initiates the class of the corresponding algorithm.

    :param algorithm_name: str, the tag of the algorithm's name.
    """
    if algorithm_name == 'RF':
        from sklearn.ensemble import RandomForestRegressor
        self.reg = RandomForestRegressor(n_estimators=100, criterion="mse")
    elif algorithm_name == 'RT':
        from sklearn.tree import DecisionTreeRegressor
        self.reg = DecisionTreeRegressor(criterion="mse")
    elif algorithm_name == 'SLR':
        from sklearn.linear_model import LinearRegression
        self.reg = LinearRegression()
    elif algorithm_name == 'HR':
        from sklearn.linear_model import HuberRegressor
        # Note: 1.35 is HuberRegressor's default *epsilon*; passing it as
        # alpha (the L2 regularization strength) may not be intended.
        self.reg = HuberRegressor(fit_intercept=True, alpha=1.35,
                                  max_iter=100)
def fit_model(self, window_size=2000, summit_dis_cutoff=500):
    """Fit M-A normalization model."""
    if not self.processed:
        raise ProcessNotReadyError("fit the M-A model", 'process peaks')
    self._count_reads(window_size=window_size)
    m_values = []
    a_values = []
    for chrom in self.peaks_merged.chroms:
        for peak in self.peaks_merged.fetch(chrom):
            if peak.summit_dis <= summit_dis_cutoff:
                m_values.append(peak.m_raw)
                a_values.append(peak.a_raw)
    m_values = np.array(m_values)
    a_values = np.array(a_values)
    # Drop extreme M values before the robust linear fit of M on A.
    mask = abs(m_values) <= 10
    huber = HuberRegressor()
    huber.fit(a_values[mask].reshape(-1, 1), m_values[mask])
    self.ma_params = [huber.intercept_, huber.coef_[0]]
    self.fitted = True
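# A hedged sketch of how the fitted ma_params might be applied: subtract
# the fitted M-on-A trend line from each raw M value. The attribute names
# mirror fit_model() above, but this helper is illustrative and is not
# part of the original class.
def normalize_m(self, m_raw, a_raw):
    intercept, slope = self.ma_params
    return m_raw - (intercept + slope * a_raw)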
def plot_huber_vs_ridge():
    # Generate toy data.
    rng = np.random.RandomState(0)
    X, y = make_regression(n_samples=20, n_features=1, random_state=0,
                           noise=4.0, bias=100.0)

    # Add four strong outliers to the dataset.
    X_outliers = rng.normal(0, 0.5, size=(4, 1))
    y_outliers = rng.normal(0, 2.0, size=4)
    X_outliers[:2, :] += X.max() + X.mean() / 4.
    X_outliers[2:, :] += X.min() - X.mean() / 4.
    y_outliers[:2] += y.min() - y.mean() / 4.
    y_outliers[2:] += y.max() + y.mean() / 4.
    X = np.vstack((X, X_outliers))
    y = np.concatenate((y, y_outliers))
    plt.plot(X, y, 'b.')

    # Fit the Huber regressor over a series of epsilon values.
    colors = ['r-', 'b-', 'y-', 'm-']
    x = np.linspace(X.min(), X.max(), 7)
    epsilon_values = [1.35, 1.5, 1.75, 1.9]
    for k, epsilon in enumerate(epsilon_values):
        huber = HuberRegressor(alpha=0.0, epsilon=epsilon)
        huber.fit(X, y)
        coef_ = huber.coef_ * x + huber.intercept_
        plt.plot(x, coef_, colors[k], label="huber loss, %s" % epsilon)

    # Fit a ridge regressor to compare it to the Huber regressor.
    # Note: `normalize` was removed from Ridge in scikit-learn 1.2.
    ridge = Ridge(alpha=0.0, random_state=0, normalize=True)
    ridge.fit(X, y)
    coef_ = ridge.coef_ * x + ridge.intercept_
    plt.plot(x, coef_, 'g-', label="ridge regression")

    plt.title("Comparison of HuberRegressor vs Ridge")
    plt.xlabel("X")
    plt.ylabel("y")
    plt.legend(loc=0)
    plt.show()
def buildScoreCard(df, features, labelCol):
    binning_process = BinningProcess(features)
    estimator = HuberRegressor(max_iter=200)
    scorecard = Scorecard(binning_process=binning_process,
                          target=labelCol,
                          estimator=estimator,
                          scaling_method=None,
                          scaling_method_params={"min": 0, "max": 100},
                          reverse_scorecard=True)
    scorecard.verbose = True
    scorecard.fit(df, check_input=False)
    scorecard.information(print_level=2)
    print(scorecard.table(style="summary"))

    score = scorecard.score(df)
    y_pred = scorecard.predict(df)
    plt.scatter(score, df[labelCol], alpha=0.01, label="Average profit")
    plt.plot(score, y_pred, label="Huber regression", linewidth=2,
             color="orange")
    plt.ylabel("Average profit value (unit=100,000)")
    plt.xlabel("Score")
    plt.legend()
    plt.show()
def get_regressors_outlierrobust(nmodels='all'):
    """
    Returns one or all of the outlier-robust linear regressors.
    """
    # 1. HuberRegressor
    lr1 = HuberRegressor()
    # 2. RANSACRegressor
    lr2 = RANSACRegressor()
    # 3. TheilSenRegressor
    lr3 = TheilSenRegressor()

    if nmodels == 'all':
        models = [lr1, lr2, lr3]
    else:
        # Select the requested model instance rather than its name string.
        models = [{1: lr1, 2: lr2, 3: lr3}[nmodels]]

    return models
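# A brief usage sketch for get_regressors_outlierrobust(), assuming X and y
# are defined; the fit loop is illustrative, not from the original code.
for reg in get_regressors_outlierrobust('all'):
    print('Would fit:', reg.__class__.__name__)
    # reg.fit(X, y)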
def hr_example(df_ref, tkr, calc_date, window=500):
    # Example of an alternative formulation for a least-squares fit
    # using some ML machinery.
    print('Fitting a Huber Regressor model')
    calc_date = pd.to_datetime(calc_date, errors='coerce')
    date_index = df_ref.index.get_level_values(1)  # avoids repeated lookups
    msk = ((date_index >= calc_date - dt.timedelta(days=window))
           & (date_index <= calc_date))
    df_res = df_ref[msk].copy()
    df_ts = df_res.xs(tkr, level=0)  # time series dataframe for given ticker
    s = df_ts['Returns'].values
    mkt = df_ts['Market'].values
    X = mkt.reshape(-1, 1)

    # construct a pipeline
    mdl = Pipeline([('scaler', None),
                    ('hr', HuberRegressor(fit_intercept=True))])
    parameters = {
        'hr__epsilon': np.linspace(1, 4, 20),
        'hr__alpha': np.logspace(-4, -2, 3)
    }
    mdl = GridSearchCV(mdl, param_grid=parameters, n_jobs=-1,
                       cv=KFold(n_splits=10, shuffle=True, random_state=0),
                       scoring='neg_median_absolute_error',
                       return_train_score=True, refit=True,
                       error_score=np.nan)
    mdl.fit(X, s)
    return (mdl, X, s)
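# A hedged usage sketch of hr_example()'s return values; the inputs here
# (df_ref, the ticker, the date) are placeholders, not from the original.
# mdl, X, s = hr_example(df_ref, 'AAPL', '2020-06-30')
# print(mdl.best_params_)  # the tuned hr__epsilon and hr__alpha
# beta = mdl.best_estimator_.named_steps['hr'].coef_[0]  # robust market beta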
def test_huber_and_sgd_same_results():
    # Test they should converge to same coefficients for same parameters
    X, y = make_regression_with_outliers(n_samples=10, n_features=2)

    # Fit once to find out the scale parameter. Scale down X and y by scale
    # so that the scale parameter is optimized to 1.0
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100,
                           epsilon=1.35)
    huber.fit(X, y)
    X_scale = X / huber.scale_
    y_scale = y / huber.scale_
    huber.fit(X_scale, y_scale)
    assert_almost_equal(huber.scale_, 1.0, 3)

    sgdreg = SGDRegressor(alpha=0.0, loss="huber", shuffle=True,
                          random_state=0, max_iter=10000,
                          fit_intercept=False, epsilon=1.35, tol=None)
    sgdreg.fit(X_scale, y_scale)
    assert_array_almost_equal(huber.coef_, sgdreg.coef_, 1)
def test_huber_better_r2_score():
    # Test that Huber gives a better r2 score than Ridge on the non-outliers.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=100)
    huber.fit(X, y)
    linear_loss = np.dot(X, huber.coef_) + huber.intercept_ - y
    mask = np.abs(linear_loss) < huber.epsilon * huber.scale_
    huber_score = huber.score(X[mask], y[mask])
    huber_outlier_score = huber.score(X[~mask], y[~mask])

    # The Ridge regressor should be influenced by the outliers and hence
    # give a worse score on the non-outliers than the Huber regressor.
    ridge = Ridge(fit_intercept=True, alpha=0.01)
    ridge.fit(X, y)
    ridge_score = ridge.score(X[mask], y[mask])
    ridge_outlier_score = ridge.score(X[~mask], y[~mask])
    assert_greater(huber_score, ridge_score)

    # The Huber model should also fit poorly on the outliers.
    assert_greater(ridge_outlier_score, huber_outlier_score)
def predict(self, X, window=180):
    """
    Predict whether a particular sample is an outlier or not.

    :param X: the time series to detect on
    :type X: numpy.ndarray
    :param window: the length of the window
    :type window: int
    """
    # Build a sawtooth time index covering two full windows plus one.
    x_train = (list(range(0, 2 * window + 1))
               + list(range(0, 2 * window + 1))
               + list(range(0, window + 1)))
    x_train = np.array(x_train)
    x_train = x_train[:, np.newaxis]
    # Normalize by the mean of the most recent window when it is large
    # enough to matter.
    avg_value = np.mean(X[-(window + 1):])
    if avg_value > 1:
        y_train = X / avg_value
    else:
        y_train = X
    # y = X.reshape(-1, 1)
    model = HuberRegressor().fit(x_train, y_train)
    return model.predict(x_train)
def count_cars():
    gt = pd.read_csv('data/imgs.csv')
    worker = Parallel(n_jobs=-1, verbose=1, backend='threading')
    train_features = worker(
        delayed(make_img_features)(id_) for id_ in gt['id'].values)
    x_data = np.vstack(tuple(train_features))
    y_data = gt['car_count'].values

    x_test_ids = [
        x.split('/')[-1].split('.')[0]
        for x in glob('data/tif/tif_test/*.tif')
    ]
    test_features = worker(
        delayed(make_img_features)(id_, train=False) for id_ in x_test_ids)
    x_test = np.vstack(tuple(test_features))

    # Note: MAPE is a loss, so greater_is_better=False may be intended here.
    scorer = make_scorer(mape, greater_is_better=True)
    scaler = StandardScaler()
    x_data = scaler.fit_transform(x_data)
    x_test = scaler.transform(x_test)

    preds = []
    for est in HuberRegressor(), BayesianRidge(), RandomForestRegressor():
        score = cross_val_score(est, x_data, y_data, scoring=scorer, cv=5)
        logger.info(
            f'Score for {est.__class__.__name__} is {score.mean():.3f}')
        est.fit(x_data, y_data)
        preds.append(est.predict(x_test))
    preds = np.array(preds).mean(axis=0)
    pd.DataFrame({'id': x_test_ids, 'car_count': [int(x) for x in preds]}) \
        .to_csv('predicts/final/imgs.csv', index=False)