Example #1
def get_models(models=None):
	if models is None:  # avoid a shared mutable default argument
		models = {}
	# linear models
	models['lr'] = LinearRegression()
	models['lasso'] = Lasso()
	models['ridge'] = Ridge()
	models['en'] = ElasticNet()
	models['huber'] = HuberRegressor()
	models['lars'] = Lars()
	models['llars'] = LassoLars()
	models['pa'] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3)
	models['ransac'] = RANSACRegressor()
	models['sgd'] = SGDRegressor(max_iter=1000, tol=1e-3)
	print('Defined %d models' % len(models))
	return models
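A minimal usage sketch for this helper, assuming scikit-learn's make_regression and cross_val_score for toy data and scoring (the evaluation loop is illustrative, not part of the original):

from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score

X, y = make_regression(n_samples=200, n_features=5, noise=5.0, random_state=0)
for name, model in get_models().items():
    # lower MAE is better; the sign flip undoes sklearn's neg_ scoring convention
    scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
    print('%s: %.3f' % (name, -scores.mean()))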
Example #2
def test_vs_huber():
    reg1 = RobustWeightedRegressor(
        max_iter=100,
        weighting="huber",
        k=5,
        c=1,
        burn_in=0,
        sgd_args={"learning_rate": "adaptive"},  # test sgd_args
        random_state=rng,
    )
    reg2 = HuberRegressor()
    reg1.fit(X_rcy, y_rcy)
    reg2.fit(X_rcy, y_rcy)
    assert np.abs(reg1.coef_[0] - reg2.coef_[0]) < 1e-2
Example #3
def test_quantile_equals_huber_for_low_epsilon(fit_intercept, default_solver):
    X, y = make_regression(n_samples=100, n_features=20, random_state=0, noise=1.0)
    alpha = 1e-4
    huber = HuberRegressor(
        epsilon=1 + 1e-4, alpha=alpha, fit_intercept=fit_intercept
    ).fit(X, y)
    quant = QuantileRegressor(
        alpha=alpha, fit_intercept=fit_intercept, solver=default_solver
    ).fit(X, y)
    assert_allclose(huber.coef_, quant.coef_, atol=1e-1)
    if fit_intercept:
        assert huber.intercept_ == approx(quant.intercept_, abs=1e-1)
        # check that roughly half of the targets fall below the prediction
        assert np.mean(y < quant.predict(X)) == approx(0.5, abs=1e-1)
Example #4
def build_model(model_type, params):
    """Implement function build model
    Support LinearRegression, HuberRegression, LassoRegression,
    SupportVectorRegression, DecisionTreeRegressor, RandomForest, XGBoost and
    MultiPerceptron
    Apply normalization input data

    Parameters:
    -----------
    model_type: str, type of model, support LinearRegression, HuberRegressor,
          Lasso, DecisionTreeRegressor, SupportVectorRegressor and
          XGBRegressor
    params: dict, parameter responding each model
    Returns:
    --------
    estimator: instance of class model
    """
    support_type = [
        'LinearRegression', 'HuberRegressor', 'Lasso', 'DecisionTreeRegressor',
        'RandomForestRegressor', 'SupportVectorMachine', 'XGBoost',
        'MultiPerceptron'
    ]
    assert (model_type in support_type), 'Expected one of: {}'.format(
        ', '.join(support_type))

    steps = [('minmax-scaler', MinMaxScaler()),
             ('standard-scaler', StandardScaler()),
             ('polynomial', PolynomialFeatures(params.pop('degree', 1)))]

    # Choose the model type
    if model_type == 'LinearRegression':
        steps.append(('model', LinearRegression(**params)))
    elif model_type == 'HuberRegressor':
        steps.append(('model', HuberRegressor(**params)))
    elif model_type == 'Lasso':
        steps.append(('model', Lasso(**params)))
    elif model_type == 'DecisionTreeRegressor':
        steps.append(('model', DecisionTreeRegressor(**params)))
    elif model_type == 'RandomForestRegressor':
        steps.append(('model', RandomForestRegressor(**params)))
    elif model_type == 'SupportVectorMachine':
        steps.append(('model', SVR(**params)))
    elif model_type == 'XGBoost':
        steps.append(('model', XGBRegressor(**params)))
    elif model_type == 'MultiPerceptron':
        steps.append(('model', MLPRegressor(**params)))

    estimator = Pipeline(steps)
    return estimator
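A short usage sketch (the model choice and parameter values here are illustrative assumptions; X_train, y_train and X_test are assumed to exist):

params = {'degree': 2, 'epsilon': 1.5, 'alpha': 1e-3}  # 'degree' feeds PolynomialFeatures
model = build_model('HuberRegressor', params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)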
Example #5
def test_huber_warm_start():
    X, y = make_regression_with_outliers()
    huber_warm = HuberRegressor(
        fit_intercept=True, alpha=1.0, max_iter=10000, warm_start=True, tol=1e-1)
    huber_warm.fit(X, y)
    huber_warm_coef = huber_warm.coef_.copy()
    huber_warm.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so
    # these should be almost the same but not exactly equal.
    assert_array_almost_equal(huber_warm.coef_, huber_warm_coef, 1)

    # No n_iter_ in old SciPy (<=0.9)
    if huber_warm.n_iter_ is not None:
        assert_equal(0, huber_warm.n_iter_)
Example #6
    def __init__(self, x_train, y_train, test_split_available=False, test_size=0.1, shuffle=True, number_of_estimator=10, estimator=None, estimators=None, random_state=None):
        if test_split_available:
            self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x_train, y_train,
                                                                                    test_size=test_size,
                                                                                    shuffle=shuffle,
                                                                                    random_state=random_state)
        else:
            self.x_test = x_train
            self.y_test = y_train
            self.x_train = x_train
            self.y_train = y_train
        self.y_predict_test = {}
        self.y_predict_train = {}
        self.models = {'svr': SVR(), 'knn': KNeighborsRegressor(), 'tree': DecisionTreeRegressor(),
                       'logistic': LogisticRegression(), 'linear': LinearRegression(), 'ridge': Ridge(),
                       'ridgecv': RidgeCV(), 'lasso': Lasso(), 'lassolars': LassoLars(alpha=0.1),
                       'bayesian': BayesianRidge(), 'ElasticNet': ElasticNet(),
                       'TheilSenRegressor': TheilSenRegressor(),
                       'ARDRegression': ARDRegression(), 'RANSACRegressor': RANSACRegressor(),
                       'HuberRegressor': HuberRegressor(), 'randomForest': RandomForestRegressor(n_estimators=50),
                       'boost': AdaBoostRegressor(random_state=0, n_estimators=100)}

        self.estimator = self.models[estimator]
        estimators_list = [(name, self.models[name]) for name in estimators]

        # Add the ensemble wrappers on top of the base models instead of
        # rebuilding the entire dictionary a second time.
        self.models['bagging'] = BaggingRegressor(base_estimator=self.estimator,
                                                  n_estimators=number_of_estimator,
                                                  max_features=0.8)
        self.models['voting'] = VotingRegressor(estimators=estimators_list)
Example #7
def get_models(models=None):
    if models is None:  # avoid a shared mutable default argument
        models = {}
    # linear models
    models['linear regression'] = LinearRegression()
    models['lasso'] = Lasso()
    models['ridge'] = Ridge()
    models['elastic net'] = ElasticNet()
    models['huber regressor'] = HuberRegressor()
    #models['lars'] = Lars()
    models['lasso lars'] = LassoLars()
    models['passive aggressive regressor'] = PassiveAggressiveRegressor(
        max_iter=1000, tol=1e-3)
    models['ransac regressor'] = RANSACRegressor(min_samples=4)
    models['sgd regressor'] = SGDRegressor(max_iter=5000, tol=1e-3)
    print('Defined %d models' % len(models))
    return models
Example #8
def test_huber_scaling_invariant():
    # Test that outliers filtering is scaling independent.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0)
    huber.fit(X, y)
    n_outliers_mask_1 = huber.outliers_
    assert not np.all(n_outliers_mask_1)

    huber.fit(X, 2.0 * y)
    n_outliers_mask_2 = huber.outliers_
    assert_array_equal(n_outliers_mask_2, n_outliers_mask_1)

    huber.fit(2.0 * X, 2.0 * y)
    n_outliers_mask_3 = huber.outliers_
    assert_array_equal(n_outliers_mask_3, n_outliers_mask_1)
Example #9
 def _algorithm(self):
     if self.algorithm.lower() == 'svr':
         tuned_parameters = [{
             'C': np.arange(1, 4, 0.5),
             'epsilon': np.arange(0.5, 2, 0.2),
             'tol': [1, 1e-1, 1e-2, 1e-3]
         }]
         #return GridSearchCV(
         #    SVR(kernel='rbf', shrinking=True, gamma='auto'), tuned_parameters,
         #    cv=5, error_score=0, n_jobs=4, verbose=1
         #)
         return SVR(kernel='rbf', C=1)
     elif self.algorithm.lower() == 'mlp':
         return MLPRegressor()
     elif self.algorithm.lower() == 'huber':
         return HuberRegressor()
     elif self.algorithm.lower() == 'lr':
         return linear_model.LinearRegression()
     elif self.algorithm.lower() == 'rigid':
         return linear_model.Ridge(alpha=0.5)
     elif self.algorithm.lower() == 'rf':
         return RandomForestRegressor(random_state=0, n_estimators=200)
     elif self.algorithm.lower() == 'gbr':
         tuned_parameters = [{
             'n_estimators': [160, 170, 180],
             'subsample': [0.6, 0.7, 0.8],
         }]
         '''
         return GridSearchCV(
             GradientBoostingRegressor(
                 loss='ls', warm_start=False, max_features=0.2,
                 learning_rate=0.05, alpha=0.4, max_depth=13, subsample=0.6,
                 n_estimators=180),
             tuned_parameters,
             cv=5, error_score=0, n_jobs=4, verbose=1)
         '''
         return GradientBoostingRegressor(loss='ls',
                                          warm_start=False,
                                          max_features=0.2,
                                          learning_rate=0.05,
                                          alpha=0.4,
                                          max_depth=13,
                                          subsample=0.6,
                                          n_estimators=180)
     elif self.algorithm.lower() == 'adb':
         return AdaBoostRegressor()
     else:
         raise Exception(
             "Sklearn algorithm options: 'svr', 'mlp', 'huber', 'lr', "
             "'rigid', 'rf', 'gbr', 'adb'")
Example #10
def forecaster(returns, ff, loss='MSE'):

    output = []
    dates = sorted(list(ff.index))
    dataset = ff.merge(returns, left_index=True, right_index=True)
    columnNames = ['MktPremium', 'HML', 'Mom']
    name = returns.columns.tolist()[0]

    i = dates.index('200201')

    for j in range(i, len(dates)):
        trainData = dataset.loc['199801':dates[j], :]
        trainX = trainData[columnNames]
        trainY = trainData[[name]]
        testData = pd.DataFrame(dataset.loc[dates[j], :]).T
        testX = testData[columnNames]

        if loss in ('LAD', '1Q', '3Q'):
            # quantile losses use statsmodels' QuantReg; LAD is the median
            q = {'LAD': 0.5, '1Q': 0.25, '3Q': 0.75}[loss]
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=q)
            prediction = model.predict(res.params, exog=testX)
        else:
            sk_models = {'MSE': LinearRegression, 'Ridge': Ridge,
                         'Lasso': Lasso, 'Hub': HuberRegressor,
                         'ElasticNet': ElasticNet}
            model = sk_models.get(loss, LinearRegression)()
            model.fit(trainX, trainY)
            prediction = model.predict(testX)

        # Lasso/Hub/ElasticNet and the quantile models return 1-D predictions;
        # LinearRegression/Ridge return 2-D because trainY is a DataFrame
        if loss in ['Lasso', 'Hub', 'ElasticNet', 'LAD', '1Q', '3Q']:
            output.append(prediction[0])
        else:
            output.append(prediction[0][0])

    return (name, output)
Example #11
def test_huber_scaling_invariant():
    """Test that outliers filtering is scaling independent."""
    rng = np.random.RandomState(0)
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100)
    huber.fit(X, y)
    n_outliers_mask_1 = huber.outliers_
    assert_false(np.all(n_outliers_mask_1))

    huber.fit(X, 2. * y)
    n_outliers_mask_2 = huber.outliers_
    assert_array_equal(n_outliers_mask_2, n_outliers_mask_1)

    huber.fit(2. * X, 2. * y)
    n_outliers_mask_3 = huber.outliers_
    assert_array_equal(n_outliers_mask_3, n_outliers_mask_1)
Example #12
def test_huber_warm_start():
    X, y = make_linear_regression_with_outliers()
    huber_warm = HuberRegressor(alpha=1.0,
                                max_iter=10000,
                                warm_start=True,
                                tol=1e-1)

    huber_warm.fit(X, y)
    huber_warm_coef = huber_warm.coef_.copy()
    huber_warm.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so
    # these should be almost the same but not exactly equal.
    assert_array_almost_equal(huber_warm.coef_, huber_warm_coef, 1)

    assert huber_warm.n_iter_ == 0
Example #13
 def __init__(self,
              observations,
              groups,
              features,
              peaked,
              tail_prob=0.4,
              regressor=HuberRegressor(),
              classifier=LinearSVC(random_state=42)):
     super().__init__(observations, groups, features)
      if len(observations) != len(features) or len(observations) != len(
              peaked):
          raise ValueError(
              'observations, features and peaked must have the same length')
     self.peaked = peaked
     self.regressor = regressor
     self.classifier = classifier
     self.tail_prob = tail_prob
Example #14
                def fit(x, y, axis, ic=ip, xlab=None, ylab=None):
                    mx, my, sx, sy = x.mean(), y.mean(), x.std(), y.std()
                    #mask = (x>mx-3*sx) & (x<mx+3*sx)
                    #x, y = x[mask], y[mask]
                    # robust line fit; a plain np.polyfit would be skewed by outliers
                    linearfit = HuberRegressor().fit(x.reshape(-1, 1), y)
                    m, c = linearfit.coef_[0], linearfit.intercept_

                    print(m, c)
                    axis.plot(x, y, 'C%d.' % ic, ms=2)
                    axis.axvline(mx, lw=0.5, color='gray')
                    xx = np.linspace(x.min(), x.max())
                    yy = xx * m + c
                    axis.plot(xx, yy, 'C%d' % ip, label='m={:.2f}'.format(m))
                    #axis.set_xlim(mx-2*sx, mx+2*sx)
                    #axis.set_ylim(m*(mx-2*sx)+c, m*(mx+2*sx)+c)
                    return xx, m * xx + c, m, c
Example #15
def get_models_multioutput(models=None):
    if models is None:  # avoid a shared mutable default argument
        models = {}
    # linear models
    models['lr'] = MultiOutputRegressor(LinearRegression())
    alpha = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for a in alpha:
        models['lasso-' + str(a)] = MultiOutputRegressor(Lasso(alpha=a))
    for a in alpha:
        models['ridge-' + str(a)] = MultiOutputRegressor(Ridge(alpha=a))
    for a1 in alpha:
        for a2 in alpha:
            name = 'en-' + str(a1) + '-' + str(a2)
            models[name] = MultiOutputRegressor(ElasticNet(a1, a2))
    models['huber'] = MultiOutputRegressor(HuberRegressor())
    models['lars'] = MultiOutputRegressor(Lars())
    models['llars'] = MultiOutputRegressor(LassoLars())
    models['pa'] = MultiOutputRegressor(
        PassiveAggressiveRegressor(max_iter=1000, tol=1e-3))
    models['ransac'] = MultiOutputRegressor(RANSACRegressor())
    models['sgd'] = MultiOutputRegressor(SGDRegressor(max_iter=1000, tol=1e-3))
    models['theil'] = MultiOutputRegressor(TheilSenRegressor())
    # non-linear models
    n_neighbors = range(1, 21)
    for k in n_neighbors:
        models['knn-' + str(k)] = MultiOutputRegressor(
            KNeighborsRegressor(n_neighbors=k))
    models['cart'] = MultiOutputRegressor(DecisionTreeRegressor())
    models['extra'] = MultiOutputRegressor(ExtraTreeRegressor())
    models['svml'] = MultiOutputRegressor(SVR(kernel='linear'))
    models['svmp'] = MultiOutputRegressor(SVR(kernel='poly'))
    c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for c in c_values:
        # wrap SVR like the other models; a bare SVR cannot fit multioutput y
        models['svmr' + str(c)] = MultiOutputRegressor(SVR(C=c))
    # ensemble models
    n_trees = 100
    models['ada'] = MultiOutputRegressor(
        AdaBoostRegressor(n_estimators=n_trees))
    models['bag'] = MultiOutputRegressor(
        BaggingRegressor(n_estimators=n_trees))
    models['rf'] = MultiOutputRegressor(
        RandomForestRegressor(n_estimators=n_trees))
    models['et'] = MultiOutputRegressor(
        ExtraTreesRegressor(n_estimators=n_trees))
    models['gbm'] = MultiOutputRegressor(
        GradientBoostingRegressor(n_estimators=n_trees))
    print('Defined %d models' % len(models))
    return models
Example #16
 def __default_regressors():
     return {
         'huber': HuberRegressor(),
         'theil_sen': TheilSenRegressor(),
         'linear': LinearRegression(),
         'ard': ARDRegression(),
         'orthogonal_matching': OrthogonalMatchingPursuit(),
         'elastic_net': ElasticNet(),
         'bayesian_ridge': BayesianRidge(),
         'lasso_lars': LassoLars(),
         'lasso': Lasso(),
         'ridge': Ridge(),
         'gaussian_process': GaussianProcessRegressor(),
         'decision_tree': DecisionTreeRegressor(),
         'svr': SVR(),
         'nu_svr': NuSVR(),
         'kernel_ridge': KernelRidge()
     }
Example #17
def validate(params):
    category_encoding = params['category_encoding']

    if category_encoding == 'onehot':
        df2dict = FunctionTransformer(lambda x: x.to_dict(orient='records'),
                                      validate=False)

        transf = make_pipeline(
            FunctionTransformer(days_to_delta, validate=False),
            df2dict,
            DictVectorizer(sparse=False),
        )
    elif category_encoding == 'count':
        transf = make_pipeline(
            FunctionTransformer(days_to_delta, validate=False),
            count_encoder(), SimpleImputer())
    else:
        raise AssertionError(
            f'unknown category encoding type: {category_encoding}')

    reg_type = params['regressor_type']

    if reg_type == 'rfr':
        reg = make_pipeline(
            SelectKBest(f_regression, params['k_best']),
            RandomForestRegressor(n_jobs=params['n_jobs'],
                                  n_estimators=params['n_estimators'],
                                  max_features=params['max_features'],
                                  max_depth=params['max_depth'],
                                  random_state=1))
    elif reg_type == 'huber':
        reg = HuberRegressor(epsilon=params['epsilon'])
    elif reg_type == 'ard':
        reg = ARDRegression()
    else:
        raise AssertionError(f'unknown regressor type: {reg_type}')

    est = make_pipeline(transf, reg)

    if params['drop_outliers']:
        est = no_outliers_pipeline(est)

    valid_mode = params['valid_mode']
    n_folds = params['n_folds']
    if valid_mode == 'split':
        return split_test(est, n_folds)
Example #18
def main():
    data = data_loading('international-airline-passengers.csv')

    regressors = [
        ('AdaBoostRegressor', AdaBoostRegressor()),
        ('BaggingRegressor', BaggingRegressor()),
        ('ExtraTreesRegressor', ExtraTreesRegressor()),
        ('GaussianProcessRegressor',
         Pipeline([('scaler', MinMaxScaler()),
                   ('gauss',
                    GaussianProcessRegressor(n_restarts_optimizer=0,
                                             normalize_y=True))])),
        ('GradientBoostingRegressor', GradientBoostingRegressor()),
        ('HuberRegressor', HuberRegressor()),
        ('SGDRegressor',
         Pipeline([('scaler', StandardScaler()), ('sgd', SGDRegressor())])),
        ('PassiveAggressiveRegressor', PassiveAggressiveRegressor()),
        ('RANSACRegressor', RANSACRegressor()),
        ('RandomForestRegressor', RandomForestRegressor()),
        ('Lasso', Lasso()),
        ('ElasticNet', ElasticNet()),
        ('Linear SVR',
         Pipeline([('scaler', StandardScaler()),
                   ('svr', SVR(kernel='linear'))])),
        ('SVR',
         Pipeline([('scaler', StandardScaler()), ('svr', SVR(kernel='rbf'))])),
    ]
    # Fit them all
    regressor_data = {}
    for reg_name, model in regressors:
        print("#" * 80)
        print("Start fitting '%s' regressor." % reg_name)
        examples = 100000  # Reduce data to make training faster
        t0 = time.time()
        model.fit(data['train']['X'][:examples], data['train']['y'][:examples])
        t1 = time.time()
        an_data = analyze(model, data['all'], t1 - t0, reg_name)
        regressor_data[reg_name] = {
            'name': reg_name,
            'training_time': (t1 - t0) * 1000
        }
        for key, value in an_data.items():
            regressor_data[reg_name][key] = value
    print_website(regressor_data)
Example #19
def get_ensembles_many_regressors(x: np.array, y: np.array, metric: Callable[[np.array, np.array], float],
                                  metric_max_better: bool = True) -> None:
    """
    Tries a few solid regressors in sklearn and returns the best performing one
    :param x: numpy array of the features
    :param y: numpy array of the predictor
    :param metric: Function, the evaluation metric to use.
    :param metric_max_better: If the metric's higher value means better value
    """

    gb = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=42)

    gb2 = GradientBoostingRegressor(learning_rate=0.05, max_features='sqrt', loss='huber',
                                    min_impurity_split=None, min_samples_leaf=15,
                                    min_samples_split=10, n_estimators=12000,
                                    random_state=42)

    lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=42))
    elastic = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, max_iter=10000, random_state=42))
    rf = RandomForestRegressor(n_estimators=200, min_samples_leaf=3, random_state=42)
    rrf = ExtraTreesRegressor(n_estimators=200, min_samples_leaf=3, random_state=42)
    huber = HuberRegressor()
    linear = LinearRegression()
    nn = MLPRegressor(hidden_layer_sizes=(1000, 10), learning_rate='adaptive',
                      max_iter=1000, random_state=42, early_stopping=True)
    svm_r = svm.SVR(kernel='poly', gamma='auto')
    knn = KNeighborsRegressor(n_neighbors=5)

    regressors = [gb, gb2, lasso, elastic, rf, rrf, huber, linear, nn, svm_r, knn]
    scores = np.zeros(len(regressors))

    for i, r in enumerate(regressors):
        print('Running k-fold cross validation for', r.__class__.__name__)
        scores[i] = cross_validate(r, x, y, metric)

    best_index = np.argmax if metric_max_better else np.argmin
    best = np.amax if metric_max_better else np.amin

    print('Best performing model: ', regressors[int(best_index(scores))].__class__.__name__)
    print('Best', metric.__name__, ':', best(scores))
Example #20
def tune_huber_regression_hyperparameters():
    #hyperparameter grids
    alpha = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]
    #epsilon controls how many samples are treated as outliers: the smaller
    #epsilon, the more robust the fit is to outliers (float > 1.0, default 1.35)
    epsilon = np.append(np.linspace(1, 5, 9), [1.35])
    tol = [0.01, .001, 0.0001, .00001]

    #trackers for the best model and its scores
    best_model = None
    best_score = None
    best_hyperparameters = []

    run_once_flag = False
    for alpha_element in alpha:
        for epsilon_element in epsilon:
            for tol_element in tol:
                
                myModel = HuberRegressor(alpha=alpha_element, epsilon=epsilon_element, tol=tol_element)

                #we get back a list of the scores
                myScores = evaluate_model(myModel, 'Huber Regression') # Huber loss with L2 reg.

                #on the first iteration there is nothing to compare against yet,
                #so just record this model and its scores
                if run_once_flag == False:
                    best_model = myModel
                    best_score = myScores
                    best_hyperparameters.append(alpha_element)
                    best_hyperparameters.append(epsilon_element)
                    best_hyperparameters.append(tol_element)
                    run_once_flag = True


                #check if we have a better model based on validation MSE score, and update if we do
                if myScores[1] < best_score[1]: #we want the validation MSE
                    best_model = myModel
                    best_score = myScores
                    best_hyperparameters =[]#clear any old ones
                    best_hyperparameters.append(alpha_element)
                    best_hyperparameters.append(epsilon_element)
                    best_hyperparameters.append(tol_element)

    #after trying every combination of hyperparameters, wrap the best model,
    #its scores and hyperparameters in a BestModelObject (a custom container
    #class) and return it
    return BestModelObject(best_model, best_score, best_hyperparameters)
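For comparison, the same grid can be searched more compactly with scikit-learn's GridSearchCV; a sketch assuming X_train/y_train exist and using 5-fold negative MSE in place of the custom evaluate_model helper:

import numpy as np
from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20],
    'epsilon': np.append(np.linspace(1, 5, 9), [1.35]),
    'tol': [0.01, 0.001, 0.0001, 0.00001],
}
search = GridSearchCV(HuberRegressor(), param_grid,
                      scoring='neg_mean_squared_error', cv=5)
search.fit(X_train, y_train)  # X_train/y_train assumed to exist
print(search.best_params_, -search.best_score_)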
Example #21
 def __init__(self, algorithm_name):
     """
     It initiates the class of the corresponding algorithm.
      :param algorithm_name: str, the tag of the algorithm's name.
     """
     if algorithm_name == 'RF':
         from sklearn.ensemble import RandomForestRegressor
         self.reg = RandomForestRegressor(n_estimators=100, criterion="mse")
     if algorithm_name == 'RT':
         from sklearn.tree import DecisionTreeRegressor
         self.reg = DecisionTreeRegressor(criterion="mse")
     if algorithm_name == 'SLR':
         from sklearn.linear_model import LinearRegression
         self.reg = LinearRegression()
     if algorithm_name == 'HR':
         from sklearn.linear_model import HuberRegressor
         self.reg = HuberRegressor(fit_intercept=True,
                                   alpha=1.35,
                                   max_iter=100)
Example #22
 def fit_model(self, window_size=2000, summit_dis_cutoff=500):
     """Fit M-A normalization model."""
     if not self.processed:
         raise ProcessNotReadyError("fit the M-A model", 'process peaks')
     self._count_reads(window_size=window_size)
     m_values = []
     a_values = []
     for chrom in self.peaks_merged.chroms:
         for peak in self.peaks_merged.fetch(chrom):
             if peak.summit_dis <= summit_dis_cutoff:
                 m_values.append(peak.m_raw)
                 a_values.append(peak.a_raw)
     m_values = np.array(m_values)
     a_values = np.array(a_values)
     mask = abs(m_values) <= 10
     huber = HuberRegressor()
     huber.fit(a_values[mask].reshape(-1, 1), m_values[mask])
     self.ma_params = [huber.intercept_, huber.coef_[0]]
     self.fitted = True
Example #23
def plot_huber_vs_ridge():
    # Generate toy data.
    rng = np.random.RandomState(0)
    X, y = make_regression(n_samples=20,
                           n_features=1,
                           random_state=0,
                           noise=4.0,
                           bias=100.0)

    # Add four strong outliers to the dataset.
    X_outliers = rng.normal(0, 0.5, size=(4, 1))
    y_outliers = rng.normal(0, 2.0, size=4)
    X_outliers[:2, :] += X.max() + X.mean() / 4.
    X_outliers[2:, :] += X.min() - X.mean() / 4.
    y_outliers[:2] += y.min() - y.mean() / 4.
    y_outliers[2:] += y.max() + y.mean() / 4.
    X = np.vstack((X, X_outliers))
    y = np.concatenate((y, y_outliers))
    plt.plot(X, y, 'b.')

    # Fit the huber regressor over a series of epsilon values.
    colors = ['r-', 'b-', 'y-', 'm-']

    x = np.linspace(X.min(), X.max(), 7)
    epsilon_values = [1.35, 1.5, 1.75, 1.9]
    for k, epsilon in enumerate(epsilon_values):
        huber = HuberRegressor(alpha=0.0, epsilon=epsilon)
        huber.fit(X, y)
        coef_ = huber.coef_ * x + huber.intercept_
        plt.plot(x, coef_, colors[k], label="huber loss, %s" % epsilon)

    # Fit a ridge regressor to compare it to huber regressor.
    ridge = Ridge(alpha=0.0, random_state=0, normalize=True)
    ridge.fit(X, y)
    coef_ridge = ridge.coef_
    coef_ = ridge.coef_ * x + ridge.intercept_
    plt.plot(x, coef_, 'g-', label="ridge regression")

    plt.title("Comparison of HuberRegressor vs Ridge")
    plt.xlabel("X")
    plt.ylabel("y")
    plt.legend(loc=0)
    plt.show()
Example #24
def buildScoreCard(df, features, labelCol):
    binning_process = BinningProcess(features)
    estimator = HuberRegressor(max_iter=200)
    scorecard = Scorecard(binning_process=binning_process, target=labelCol,
                          estimator=estimator, scaling_method=None,
                          scaling_method_params={"min": 0, "max": 100},
                          reverse_scorecard=True)
    scorecard.verbose = True
    scorecard.fit(df, check_input=False)
    scorecard.information(print_level=2)
    print(scorecard.table(style="summary"))
    score = scorecard.score(df)
    y_pred = scorecard.predict(df)
    plt.scatter(score, df[labelCol], alpha=0.01, label="Average profit")
    plt.plot(score, y_pred, label="Huber regression", linewidth=2, color="orange")
    plt.ylabel("Average profit value (unit=100,000)")
    plt.xlabel("Score")
    plt.legend()
    plt.show()
Example #25
def get_regressors_outlierrobust(nmodels='all'):
    """
		Returns one or all of Outlier-Robust linear regressors 
	"""
    # 1. HuberRegressor
    lr1 = HuberRegressor()

    # 2. RANSACRegressor
    lr2 = RANSACRegressor()

    # 3. TheilSenRegressor
    lr3 = TheilSenRegressor()

    if (nmodels == 'all'):
        models = [lr1, lr2, lr3]
    else:
        # return the requested model itself (1-based index), not its name string
        models = [[lr1, lr2, lr3][int(nmodels) - 1]]

    return models
Example #26
def hr_example(df_ref, tkr, calc_date, window=500):
    # example of an alternative formulation for least squares fit
    # using some ml

    print('Fitting a Huber Regressor model')
    calc_date = pd.to_datetime(calc_date, errors='coerce')

    date_index = df_ref.index.get_level_values(1)
    # prevents having to call it all the time

    msk = (date_index >=
           calc_date - dt.timedelta(days=window)) & (date_index <= calc_date)

    df_res = df_ref[msk].copy()

    df_ts = df_res.xs(tkr, level=0)  # time series dataframe for given ticker

    s = df_ts['Returns'].values
    mkt = df_ts['Market'].values

    X = mkt.reshape(-1, 1)

    # construct a pipeline
    mdl = Pipeline([('scaler', None),
                    ('hr', HuberRegressor(fit_intercept=True))])

    parameters = {
        'hr__epsilon': np.linspace(1, 4, 20),
        'hr__alpha': np.logspace(-4, -2, 3)
    }

    mdl = GridSearchCV(mdl,
                       param_grid=parameters,
                       n_jobs=-1,
                       cv=KFold(n_splits=10, shuffle=True, random_state=0),
                       scoring='neg_median_absolute_error',
                       return_train_score=True,
                       refit=True,
                       error_score=np.nan)

    mdl.fit(X, s)

    return (mdl, X, s)
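Once fitted, the refit GridSearchCV object exposes the winning configuration; a brief follow-up sketch (the ticker and date are placeholders, the attribute names are standard GridSearchCV):

mdl, X, s = hr_example(df_ref, 'AAPL', '2020-06-30', window=500)
print(mdl.best_params_)  # e.g. {'hr__alpha': ..., 'hr__epsilon': ...}
beta = mdl.best_estimator_.named_steps['hr'].coef_[0]  # robust beta vs. the market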
Example #27
def test_huber_and_sgd_same_results():
    # Test they should converge to same coefficients for same parameters

    X, y = make_regression_with_outliers(n_samples=10, n_features=2)

    # Fit once to find out the scale parameter. Scale down X and y by scale
    # so that the scale parameter is optimized to 1.0
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100,
                           epsilon=1.35)
    huber.fit(X, y)
    X_scale = X / huber.scale_
    y_scale = y / huber.scale_
    huber.fit(X_scale, y_scale)
    assert_almost_equal(huber.scale_, 1.0, 3)

    sgdreg = SGDRegressor(
        alpha=0.0, loss="huber", shuffle=True, random_state=0, max_iter=10000,
        fit_intercept=False, epsilon=1.35, tol=None)
    sgdreg.fit(X_scale, y_scale)
    assert_array_almost_equal(huber.coef_, sgdreg.coef_, 1)
Example #28
def test_huber_better_r2_score():
    # Test that huber gives a better r2 score than Ridge on the non-outliers
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=100)
    huber.fit(X, y)
    linear_loss = np.dot(X, huber.coef_) + huber.intercept_ - y
    mask = np.abs(linear_loss) < huber.epsilon * huber.scale_
    huber_score = huber.score(X[mask], y[mask])
    huber_outlier_score = huber.score(X[~mask], y[~mask])

    # The Ridge regressor should be influenced by the outliers and hence
    # give a worse score on the non-outliers as compared to the huber regressor.
    ridge = Ridge(fit_intercept=True, alpha=0.01)
    ridge.fit(X, y)
    ridge_score = ridge.score(X[mask], y[mask])
    ridge_outlier_score = ridge.score(X[~mask], y[~mask])
    assert_greater(huber_score, ridge_score)

    # The huber model should also fit poorly on the outliers.
    assert_greater(ridge_outlier_score, huber_outlier_score)
Example #29
 def predict(self, X, window=180):
     """
     Predict if a particular sample is an outlier or not.
     :param X: the time series to detect of
     :param type X: numpy
     :param window: the length of window
     :param type window: int
     """
     x_train = list(range(0, 2 * window + 1)) + list(
         range(0, 2 * window + 1)) + list(range(0, window + 1))
     x_train = np.array(x_train)
     x_train = x_train[:, np.newaxis]
     avg_value = np.mean(X[-(window + 1):])
     if avg_value > 1:
         y_train = X / avg_value
     else:
         y_train = X
     #y = X.reshape(-1, 1)
     model = HuberRegressor().fit(x_train, y_train)
     return model.predict(x_train)
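A calling sketch (the sine-plus-noise series and the 'detector' instance are illustrative assumptions; note that X must contain exactly 5*window + 3 points, since x_train is built from (2w+1) + (2w+1) + (w+1) indices):

import numpy as np

window = 180
n = 5 * window + 3
X = np.sin(np.linspace(0, 20, n)) + np.random.RandomState(0).normal(0, 0.1, n)
baseline = detector.predict(X, window=window)  # 'detector': an instance of the class above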
Example #30
def count_cars():
    gt = pd.read_csv('data/imgs.csv')
    worker = Parallel(n_jobs=-1, verbose=1, backend='threading')

    train_features = worker(
        delayed(make_img_features)(id_) for id_ in gt['id'].values)
    x_data = np.vstack(tuple(train_features))
    y_data = gt['car_count'].values

    x_test_ids = [
        x.split('/')[-1].split('.')[0] for x in glob('data/tif/tif_test/*.tif')
    ]

    test_features = worker(
        delayed(make_img_features)(id_, train=False) for id_ in x_test_ids)
    x_test = np.vstack(tuple(test_features))

    scorer = make_scorer(mape, greater_is_better=True)
    scaler = StandardScaler()

    x_data = scaler.fit_transform(x_data)
    x_test = scaler.transform(x_test)

    preds = []
    for est in HuberRegressor(), BayesianRidge(), RandomForestRegressor():
        score = cross_val_score(
            est,
            x_data,
            y_data,
            scoring=scorer,
            cv=5,
        )
        logger.info(f'Score for {est.__class__} is {score.mean():.3f}')

        est.fit(x_data, y_data)
        preds.append(est.predict(x_test))

    preds = np.array(preds).mean(axis=0)

    pd.DataFrame({'id': x_test_ids, 'car_count': [int(x) for x in preds]}) \
        .to_csv('predicts/final/imgs.csv', index=False)