Example #1
def background_null_distance_resid_correlation(x, y, z, data, model=linear_model.LassoLarsCV(), Reps=100):
    backgrounds = []

    X, Y, Z = prepare_data_for_regression_stat(x, y, z, data)
    n = X.shape[0]
    UCMD_A = DistCor.U_centered_matrix(DistCor.dist_matrix(X))
    triu_indices = np.triu_indices_from(UCMD_A, k=1)
    UCMD_B = DistCor.U_centered_matrix(DistCor.dist_matrix(Y))
    U_vector_B = UCMD_B[triu_indices].reshape(-1, 1)

    indices = np.arange(n)

    if Z is not None:
        UCMD_Cs = list(map(DistCor.U_centered_matrix, map(DistCor.dist_matrix, Z.T)))
        U_matrix_C = np.vstack([UCMD_Cs[i][triu_indices] for i in range(len(UCMD_Cs))]).T
        for i in range(Reps):
            np.random.shuffle(indices)
            shuffled_UCMD_A = UCMD_A[:, indices][indices]
            shuffled_U_vector_A = shuffled_UCMD_A[triu_indices].reshape(-1, 1)

            background_statistic = reg_correlation(shuffled_U_vector_A, U_vector_B, U_matrix_C, model)
            backgrounds.append(background_statistic)
    else:
        for i in range(Reps):
            np.random.shuffle(indices)
            shuffled_UCMD_A = UCMD_A[:, indices][indices]
            shuffled_U_vector_A = shuffled_UCMD_A[triu_indices].reshape(-1, 1)
            background_statistic = np.corrcoef(shuffled_U_vector_A.flatten(), U_vector_B.flatten())[0][1]
            backgrounds.append(background_statistic)
    return backgrounds
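
A minimal, self-contained sketch of how such a permutation null is typically turned into an empirical p-value, using plain Pearson correlation in place of the distance/residual statistic above (the synthetic data and the number of permutations are illustrative assumptions):

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(200)
y = 0.3 * x + rng.randn(200)

observed = np.corrcoef(x, y)[0, 1]

backgrounds = []
for _ in range(500):
    x_perm = rng.permutation(x)  # break the pairing between x and y
    backgrounds.append(np.corrcoef(x_perm, y)[0, 1])

# Empirical p-value: how often does a null statistic reach the observed one?
p_value = (np.sum(np.abs(backgrounds) >= abs(observed)) + 1) / (len(backgrounds) + 1)
print("observed r = %.3f, permutation p-value = %.3f" % (observed, p_value))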
Example #2
def train_lassolars_model(train_x, train_y, predict_x):
    print_title("LassoLars Regressor")
    reg = linear_model.LassoLarsCV(cv=10,
                                   n_jobs=3,
                                   max_iter=2000,
                                   normalize=False)
    reg.fit(train_x, train_y)
    print("alphas and cv_alphas: {0} and {1}".format(reg.alphas_.shape,
                                                     reg.cv_alphas_.shape))
    print("alphas[%d]: %s" % (len(reg.cv_alphas_), reg.cv_alphas_))
    print("mse shape: {0}".format(reg.cv_mse_path_.shape))
    # print("mse: %s" % np.mean(_mse, axis=0))
    # print("mse: %s" % np.mean(_mse, axis=1))
    # index = np.where(reg.alphas_ == reg.alpha_)
    # print("itemindex: %s" % index)
    index = np.where(reg.cv_alphas_ == reg.alpha_)
    _mse_v = np.mean(reg.cv_mse_path_[index, :])
    print("mse value: %f" % _mse_v)

    print("best alpha: %f" % reg.alpha_)
    best_alpha = reg.alpha_
    reg = linear_model.LassoLars(alpha=best_alpha)
    reg.fit(train_x, train_y)
    n_nonzeros = (reg.coef_ != 0).sum()
    print("Non-zeros coef: %d" % n_nonzeros)
    predict_y = reg.predict(predict_x)
    return {'y': predict_y, "coef": reg.coef_}
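
Note that cv_mse_path_ and the normalize argument used above belong to older scikit-learn releases; in recent versions the per-fold CV error path is exposed as mse_path_ (rows indexed by cv_alphas_) and normalize has been removed. A small sketch of the same alpha lookup with synthetic data, assuming a recent scikit-learn:

import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(0)
X = rng.randn(100, 8)
y = X[:, 0] - 2.0 * X[:, 3] + 0.1 * rng.randn(100)

reg = linear_model.LassoLarsCV(cv=10).fit(X, y)
idx = np.where(reg.cv_alphas_ == reg.alpha_)[0]  # best alpha is one of cv_alphas_
print("best alpha: %f" % reg.alpha_)
print("mean CV MSE at best alpha: %f" % reg.mse_path_[idx, :].mean())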
Example #3
    def choose_optimizer(self,
                         LassoType='Lasso',
                         RegCoef=0.00001,
                         cv=5,
                         criterion='aic',
                         maxiter=10000,
                         tolerance=0.0001,
                         normalize=True):

        if LassoType == 'Lasso':
            lin = linear_model.Lasso(alpha=RegCoef,
                                     max_iter=maxiter,
                                     normalize=normalize,
                                     tol=tolerance)
        elif LassoType == 'LassoCV':
            lin = linear_model.LassoCV(cv=cv,
                                       normalize=normalize,
                                       max_iter=maxiter)
        elif LassoType == 'LassoLarsCV':
            lin = linear_model.LassoLarsCV(cv=cv,
                                           normalize=normalize,
                                           max_iter=maxiter)
        elif LassoType == 'LarsCV':
            lin = linear_model.LarsCV(cv=cv,
                                      normalize=normalize,
                                      max_iter=maxiter)
        elif LassoType == 'LassoLarsIC':
            lin = linear_model.LassoLarsIC(criterion=criterion,
                                           normalize=normalize,
                                           max_iter=maxiter)
        else:
            raise ValueError("Unknown LassoType: %s" % LassoType)

        return lin
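
An illustrative use of the options offered by this factory, written outside the enclosing class (which is not shown); the synthetic data are assumptions. Note also that the normalize argument was removed from these estimators in scikit-learn 1.2, so recent versions would drop it:

import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(0)
X = rng.randn(80, 10)
y = X[:, 1] + 0.5 * X[:, 4] + 0.05 * rng.randn(80)

candidates = {
    'LassoCV': linear_model.LassoCV(cv=5, max_iter=10000),
    'LassoLarsCV': linear_model.LassoLarsCV(cv=5, max_iter=10000),
    'LassoLarsIC': linear_model.LassoLarsIC(criterion='aic', max_iter=10000),
}
for name, lin in candidates.items():
    lin.fit(X, y)
    print(name, "non-zero coefficients:", int(np.sum(lin.coef_ != 0)))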
Example #4
def scale2fitting(scale, x_train, y_train, x_test, y_test):
    global test_a, test_b
    # Choose gamma: scale is an upper bound (typically 2^n) selected via cross-validation; gamma ~ U[0, scale]
    parameters = {}
    # parameters stores the fitted coefficients coef_, the x_train reference points, and gamma
    parameters['x_train'] = x_train
    singleScale = np.random.uniform(0, scale, size=1)
    gamma = np.ones([x_train.shape[0]])*singleScale
    parameters['gamma'] = gamma
    trainMap = featureMap(x_train, gamma, x_train)
    try:
        F1 = lm.LassoLarsCV(cv=5, normalize=False)
        F1.fit(trainMap, y_train)
        parameters['coef'] = F1.coef_
        y_train_fit = F1.predict(trainMap)
        mseTrain = (float(1) / len(y_train)) * np.linalg.norm((y_train_fit - y_train), ord=2) ** 2
        rmseTrain = mseTrain ** 0.5

        testMap = featureMap(x_test, gamma, x_train)
        y_test_fit = F1.predict(testMap)
        mseTest = (float(1) / len(y_test)) * np.linalg.norm((y_test_fit - y_test), ord=2) ** 2
        rmseTest = mseTest ** 0.5

        return {'mseTrain': mseTrain, 'mseTest': mseTest, 'rmseTrain': rmseTrain, 'rmseTest': rmseTest,
                'parameters': parameters, 'scale': scale, 'x_train': x_train, 'gamma': gamma, 'model': F1}

    except Exception:
        print('lasso/lars error')
        return {'mseTest': 999999999}
Example #5
def test_lasso():
    alphaNum = 6
    print('*' * 80)
    inputData = pd.read_hdf(
        './rise_DM_fraud/dev1/preprocessing/preprocessing_result.h5')
    target = 'fpd'
    Y = inputData[target]
    X = inputData.drop(target, axis=1)
    X.fillna(-999, inplace=True)
    lars_cv = linear_model.LassoLarsCV(cv=6).fit(X, Y)
    skf = cv.StratifiedKFold(y=Y, n_folds=5)
    for i, (_, test_index) in enumerate(skf):
        print('Fold', i)
        test_X = X.iloc[test_index, :]
        test_Y = Y[test_index]
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0],
                             alphaNum)
        clf = linear_model.RandomizedLasso(alphas, random_state=33,
                                           n_jobs=1).fit(test_X, test_Y)
        featureImportance = pd.DataFrame(sorted(zip(
            map(lambda x: round(x, 4), clf.scores_), X.columns),
                                                reverse=True),
                                         columns=['importance', 'name'])
        featureImportance.to_csv(
            './rise_DM_fraud/dev1/feature_ranking/feature_importance_lasso_fold_%d.csv'
            % (i + 1),
            index=False)
Example #6
def test_sk_LassoLarsCV():
    print("Testing sklearn, LassoLarsCV...")
    mod = linear_model.LassoLarsCV()
    X, y = iris_data
    mod.fit(X, y)
    docs = {'name': "LassoLarsCV test"}
    fv = X[0, :]
    upload(mod, fv, docs)
Example #7
def reg_correlation_with_residuals(X, Y, Z, model=linear_model.LassoLarsCV()):
    model.fit(Z, X.ravel())
    X_res = X.ravel() - model.predict(Z)

    model.fit(Z, Y.ravel())
    Y_res = Y.ravel() - model.predict(Z)
    if np.isclose(np.linalg.norm(X_res), 0) or np.isclose(np.linalg.norm(Y_res), 0):
        # Degenerate residuals: report zero correlation but keep a consistent return shape
        return 0, X_res, Y_res
    return np.corrcoef(X_res.flatten(), Y_res.flatten())[0][1], X_res, Y_res
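
An illustrative, self-contained use of the same residualization idea: X and Y both depend on Z, so their raw correlation is high while the Z-adjusted (partial) correlation is near zero (the synthetic data below are assumptions):

import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(0)
Z = rng.randn(300, 2)
X = Z @ np.array([1.0, -0.5]) + 0.1 * rng.randn(300)
Y = Z @ np.array([0.8, 0.3]) + 0.1 * rng.randn(300)

model = linear_model.LassoLarsCV(cv=5)
X_res = X - model.fit(Z, X).predict(Z)  # residualize X on Z
Y_res = Y - model.fit(Z, Y).predict(Z)  # residualize Y on Z

print("raw correlation:     %.3f" % np.corrcoef(X, Y)[0, 1])
print("partial correlation: %.3f" % np.corrcoef(X_res, Y_res)[0, 1])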
Example #8
def test_lars_cv_max_iter():
    with warnings.catch_warnings(record=True) as w:
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = diabetes.data
        X = np.c_[X, x, x]  # add correlated features
        lars_cv = linear_model.LassoLarsCV(max_iter=5)
        lars_cv.fit(X, y)
    assert len(w) == 0
Example #9
    def _train(self):
        x = self._train_set.features
        y = self._train_set.outputs

        self._transform = preprocessing.PolynomialFeatures(1)

        clf = linear_model.LassoLarsCV(fit_intercept=True)
        clf.fit(self._transform.fit_transform(x, y), y)

        self._model = clf.predict
Example #10
def test_lars_cv_max_iter():
    with warnings.catch_warnings(record=True) as w:
        X = diabetes.data
        y = diabetes.target
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = np.c_[X, x, x]  # add correlated features
        lars_cv = linear_model.LassoLarsCV(max_iter=5)
        lars_cv.fit(X, y)
    # Expected single FutureWarning for deprecation of n_splits=3
    assert_true(len(w) != 0)
Example #11
 def test_model_lasso_lars_cv(self):
     model, X = fit_regression_model(linear_model.LassoLarsCV())
     model_onnx = convert_sklearn(
         model,
         "lasso lars cv", [("input", FloatTensorType([None, X.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(X,
                         model,
                         model_onnx,
                         basename="SklearnLassoLarsCV-Dec4")
Example #12
def _noise_filtering(X, target, good_cols=[], problem_type="regression"):
    """
    Trains a prediction model with additional noise features and selects only those of the
    original features that have a higher coefficient than any of the noise features.

    Inputs:
        - X: n x d numpy array with d features
        - target: n dimensional array with targets corresponding to the data points in X
        - good_cols: list of column names for the features in X
        - problem_type: str, either "regression" or "classification" (default: "regression")
    Returns:
        - good_cols: list of noise filtered column names
    """
    n_feat = X.shape[1]
    if not good_cols:
        good_cols = list(range(n_feat))
    assert len(good_cols) == n_feat, "fewer column names provided than features in X."
    # perform noise filtering on these features
    if problem_type == "regression":
        model = lm.LassoLarsCV(cv=5, eps=1e-8)
    elif problem_type == "classification":
        model = lm.LogisticRegressionCV(cv=5,
                                        penalty="l1",
                                        solver="saga",
                                        class_weight="balanced")
    else:
        print(
            "[featsel] WARNING: Unknown problem_type %r - not performing noise filtering."
            % problem_type)
        model = None
    if model is not None:
        X = _add_noise_features(X)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # TODO: remove if sklearn least_angle issue is fixed
            try:
                model.fit(X, target)
            except ValueError:
                rand_idx = np.random.permutation(X.shape[0])
                model.fit(X[rand_idx], target[rand_idx])
            # model.fit(X, target)
        if problem_type == "regression":
            coefs = np.abs(model.coef_)
        else:
            # model.coef_ is n_classes x n_features, but we need n_features
            coefs = np.max(np.abs(model.coef_), axis=0)
        weights = dict(zip(good_cols, coefs[:len(good_cols)]))
        # only include features that are more important than our known noise features
        noise_w_thr = np.max(coefs[n_feat:])
        good_cols = [c for c in good_cols if weights[c] > noise_w_thr]
    return good_cols
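
The _add_noise_features helper is not shown above. A plausible, hypothetical version (name and details are assumptions, not the original implementation) appends pure-noise columns and row-shuffled copies of real columns, so that the threshold np.max(coefs[n_feat:]) used above has something meaningful to beat:

import numpy as np

def _add_noise_features_sketch(X, n_noise=3, random_state=0):
    """Hypothetical stand-in for the _add_noise_features helper used above."""
    rng = np.random.RandomState(random_state)
    pure_noise = rng.randn(X.shape[0], n_noise)
    # Row-shuffled copies keep each column's marginal distribution but destroy
    # any relationship with the target.
    shuffled = rng.permutation(X[:, :min(n_noise, X.shape[1])])
    return np.hstack([X, pure_noise, shuffled])

Only features whose absolute coefficient exceeds the largest coefficient assigned to any of these appended columns survive the filter.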
Example #13
def test_lars_cv():
    # Test the LassoLarsCV object by checking that the optimal alpha
    # increases as the number of samples increases.
    # This property is not actually guaranteed in general and is just a
    # property of the given dataset, with the given steps chosen.
    old_alpha = 0
    lars_cv = linear_model.LassoLarsCV()
    for length in (400, 200, 100):
        X = diabetes.data[:length]
        y = diabetes.target[:length]
        lars_cv.fit(X, y)
        np.testing.assert_array_less(old_alpha, lars_cv.alpha_)
        old_alpha = lars_cv.alpha_
Example #14
def cross_validated_estimators_tests():
    models = [
        linear_model.ElasticNetCV(),
        linear_model.LarsCV(),
        linear_model.LassoCV(),
        linear_model.LassoLarsCV(),
        linear_model.LogisticRegressionCV(),
        linear_model.OrthogonalMatchingPursuitCV(),
        linear_model.RidgeClassifierCV(),
        linear_model.RidgeCV()
    ]
    for model in models:
        cross_validated_estimators(model)
Example #15
 def test_model_lasso_lars_cv(self):
     model, X = _fit_model(linear_model.LassoLarsCV())
     model_onnx = convert_sklearn(model, "lasso lars cv",
                                  [("input", FloatTensorType(X.shape))])
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         X.astype(numpy.float32),
         model,
         model_onnx,
         basename="SklearnLassoLarsCV-Dec4",
         allow_failure="StrictVersion("
         "onnxruntime.__version__)"
         "<= StrictVersion('0.2.1')",
     )
Example #16
def predict_profit(feature_pred=None):

    df = clean.doit()
    df = df[df['title_year'] >= 1990]
    df.keys()
    m = preprocessing.LabelEncoder()
    u = m.fit_transform(df['content_rating'])
    y = pd.Series(u, index=df.index)
    ya = pd.DataFrame({"Rating": y})
    df = df.join(ya)
    s = df['genres'].str.split('|').apply(pd.Series, 1)
    s = s.fillna('')
    le = defaultdict(preprocessing.LabelEncoder)

    genres_num = s.apply(lambda x: le[x.name].fit_transform(x))
    df = df.join(genres_num)
    feature = df.loc[:, [
        'bud', 'director_avg_profit', 'director_movie_count',
        'actor_1_avg_profit', 'actor_1_movie_count', 'actor_2_avg_profit',
        'actor_2_movie_count', 'actor_3_avg_profit', 'actor_3_movie_count'
    ]]  #,'title_year',0,1,2,3,4,'Rating']]
    label = df['profit']
    feat_train, feat_test, lab_train, lab_test = train_test_split(
        feature, label, random_state=1)
    regress = linear_model.LassoLarsCV(cv=10, precompute=False)
    regress.fit(feat_train, lab_train)
    sco = cross_val_score(regress, feat_test, lab_test, cv=10)
    cross_score = sco.mean()
    print("cross validated score:", cross_score)
    print("coefficients:", regress.coef_)
    print("intercept:", regress.intercept_)

    plt.clf()
    plt.scatter(feat_train['actor_1_avg_profit'],
                lab_train,
                color='blue',
                label='training data')
    plt.scatter(feat_test['actor_1_avg_profit'],
                lab_test,
                color='red',
                label='testing data')
    plt.plot(feat_test['actor_1_avg_profit'],
             regress.predict(feat_test),
             color='black',
             linewidth=2)
    plt.xlabel('director_profit')
    plt.ylabel('profit_of_movie')
    plt.show()
    with open("prediction.pickle", "wb") as f:
        pickle.dump(regress, f)
Example #17
 def test_model_lasso_lars_cv(self):
     model, X = fit_regression_model(linear_model.LassoLarsCV())
     model_onnx = convert_sklearn(
         model,
         "lasso lars cv", [("input", FloatTensorType([None, X.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnLassoLarsCV-Dec4",
         allow_failure="StrictVersion("
         "onnxruntime.__version__)"
         "<= StrictVersion('0.2.1')",
     )
Example #18
def RunLassoLARS( args, verbose=True ):
    '''
    Run a LassoLARS model.
    trainX, trainY, testX: training features, training targets, and test features
    '''
    trainX, trainY, testX = args
    if verbose: print('\nChoosing best alpha and fitting the model')
    model = linear_model.LassoLarsCV( cv=5, verbose=int(verbose), normalize=False )
    model.fit( trainX, trainY )
    
    if verbose: print('\nUsing alpha =', model.alpha_, '\nProducing estimates')
    predictions = model.predict( testX )
    if verbose: print('\nComplete.')
    
    return predictions
Example #19
def test_lars_cv_max_iter(recwarn):
    warnings.simplefilter('always')
    with np.errstate(divide='raise', invalid='raise'):
        X = diabetes.data
        y = diabetes.target
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = diabetes.data
        X = np.c_[X, x, x]  # add correlated features
        lars_cv = linear_model.LassoLarsCV(max_iter=5, cv=5)
        lars_cv.fit(X, y)
    # Check that there is no warning in general and no ConvergenceWarning
    # in particular.
    # Materialize the string representation of the warning to get a more
    # informative error message in case of AssertionError.
    recorded_warnings = [str(w) for w in recwarn]
    assert recorded_warnings == []
Example #20
def sklearn_liner_model_regressions(xTrain, xTest, yTrain, yTest):
    modelForConsideration: pd.DataFrame = pd.DataFrame()
    LinerModels = \
        [
            linear_model.ARDRegression(), linear_model.BayesianRidge(), linear_model.ElasticNet(),
            linear_model.ElasticNetCV(),
            linear_model.HuberRegressor(), linear_model.Lars(), linear_model.LarsCV(), linear_model.Lasso(),
            linear_model.LassoCV(), linear_model.LassoLars(), linear_model.LassoLarsCV(), linear_model.LassoLarsIC(),
            linear_model.LinearRegression(), linear_model.MultiTaskLasso(),
            linear_model.MultiTaskElasticNet(), linear_model.MultiTaskLassoCV(), linear_model.MultiTaskElasticNetCV(),
            linear_model.OrthogonalMatchingPursuit(),
            linear_model.OrthogonalMatchingPursuitCV(), linear_model.PassiveAggressiveClassifier(),
            linear_model.PassiveAggressiveRegressor(), linear_model.Perceptron(),
            linear_model.RANSACRegressor(), linear_model.Ridge(), linear_model.RidgeClassifier(),
            linear_model.RidgeClassifierCV(),
            linear_model.RidgeCV(), linear_model.SGDClassifier(), linear_model.SGDRegressor(),
            linear_model.TheilSenRegressor(),
            linear_model.enet_path(xTrain, yTrain),
            linear_model.lars_path(xTrain, yTrain), linear_model.lasso_path(xTrain, yTrain),
            # linear_model.LogisticRegression()
            # ,linear_model.LogisticRegressionCV(),linear_model.logistic_regression_path(xTrain, yTrain), linear_model.orthogonal_mp(xTrain, yTrain), linear_model.orthogonal_mp_gram(), linear_model.ridge_regression()
        ]
    for model in LinerModels:
        modelName: str = model.__class__.__name__
        try:
            # print(f"Preparing Model {modelName}")
            if modelName == "LogisticRegression":
                model = linear_model.LogisticRegression(random_state=0)
            model.fit(xTrain, yTrain)
            yTrainPredict = model.predict(xTrain)
            yTestPredict = model.predict(xTest)
            errorList = calculate_prediction_error(modelName, yTestPredict,
                                                   yTest, yTrainPredict,
                                                   yTrain)

            if errorList["Test Average Error"][0] < 30 and errorList[
                    "Train Average Error"][0] < 30:
                try:
                    modelForConsideration = modelForConsideration.append(
                        errorList)
                except (Exception) as e:
                    print(e)

        except (Exception, ArithmeticError) as e:
            print(f"Error occurred while preparing Model {modelName}: {e}")
    return modelForConsideration
Example #21
def _l1_graph_setup(X, positive, alpha):
  n, d = X.shape
  # Choose an efficient Lasso solver
  if alpha is not None:
    if positive or d < n:
      clf = linear_model.Lasso(positive=positive, alpha=alpha)
    else:
      clf = linear_model.LassoLars(alpha=alpha)
  else:
    cv = min(d, 3)
    if positive or d < n:
      clf = linear_model.LassoCV(positive=positive, cv=cv)
    else:
      clf = linear_model.LassoLarsCV(cv=cv)
  # Normalize all samples
  X = X / np.linalg.norm(X, ord=2, axis=1)[:,None]
  return clf, X
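
A sketch of how such a setup is commonly continued to build an l1-graph: each normalized sample is sparsely reconstructed from the remaining samples, and the absolute coefficients become edge weights. This continuation is illustrative only (the rest of the original graph-construction code is not shown; the data are synthetic):

import numpy as np
from sklearn import linear_model

rng = np.random.RandomState(0)
X = rng.randn(30, 20)
X = X / np.linalg.norm(X, ord=2, axis=1)[:, None]  # same normalization as above

n = X.shape[0]
W = np.zeros((n, n))                    # sparse affinity / edge-weight matrix
clf = linear_model.LassoLarsCV(cv=3)    # or the clf returned by _l1_graph_setup
for i in range(n):
    others = np.delete(np.arange(n), i)
    clf.fit(X[others].T, X[i])          # regress sample i on the other samples
    W[i, others] = np.abs(clf.coef_)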
Example #22
def make_prediction(train, test):

    y_train = train.SalePrice.values.tolist()

    train = train.drop('SalePrice', axis=1)
    # test = train.drop('Id', 1)
    # train = train.drop('Id', 1)

    x_train = train.values.tolist()
    x_test = test.values.tolist()

    model = linear_model.LassoLarsCV(normalize=False)
    model = model.fit(x_train, y_train)
    answer = model.predict(test.values.tolist())

    df = return_csv_from_arr(answer)
    df.to_csv(predictionsFolder + 'submission.csv', index=False)
Example #23
    def build(self, **kwargs):
        """
        builds and returns estimator

        Args:
            hyperparameters (dictionary): Dictionary of hyperparameters to be used for tuning the estimator.
            **kwargs (key-value arguments): Ignored in this implementation. Added for compatibility with :func:`mlaut.estimators.nn_estimators.Deep_NN_Classifier`.
        
        Returns:
            `sklearn pipeline` object: pipeline for transforming the features and training the estimator
        """

        estimator = linear_model.LassoLarsCV(
            max_n_alphas=self._hyperparameters['max_n_alphas'],
            cv=self._num_cv_folds,
            n_jobs=self._n_jobs)

        return self._create_pipeline(estimator=estimator)
Example #24
def test_lars_cv_max_iter(recwarn):
    warnings.simplefilter("always")
    with np.errstate(divide="raise", invalid="raise"):
        X = diabetes.data
        y = diabetes.target
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = diabetes.data
        X = np.c_[X, x, x]  # add correlated features
        lars_cv = linear_model.LassoLarsCV(max_iter=5, cv=5)
        lars_cv.fit(X, y)
    # Check that there is no warning in general and no ConvergenceWarning
    # in particular.
    # Materialize the string representation of the warning to get a more
    # informative error message in case of AssertionError.
    recorded_warnings = [str(w) for w in recwarn]
    # FIXME: when 'normalize' is removed, exchange the check below for:
    # assert recorded_warnings == []
    assert len(recorded_warnings) == 1
    assert "normalize' will be set to False in version 1.2" in recorded_warnings[0]
Example #25
    def _estimate_model(self):
        """Estimates lasso regression object.

        Returns
        -------
        model : sklearn lasso regression or lasso cv object
            Fitted lasso model.
        """
        ###Lars Algorithm
        if self.solver == "Lars":
            self.underlying = linear_model.LassoLars(
                fit_intercept=self.intercept, normalize=False)
            if self.cv_folds == 'IC':  # For AIC/BIC, a criterion kwarg should be provided.
                model = linear_model.LassoLarsIC(fit_intercept=self.intercept,
                                                 normalize=False,
                                                 **self.kwargs)
            elif self.cv_folds is not None:
                model = linear_model.LassoLarsCV(fit_intercept=self.intercept,
                                                 cv=self.cv_folds,
                                                 normalize=False,
                                                 **self.kwargs)
            else:
                model = linear_model.Lasso(fit_intercept=self.intercept,
                                           **self.kwargs)
        ###Coordinate Descent Algorithm
        elif self.solver == "Coordinate Descent":
            self.underlying = linear_model.Lasso(fit_intercept=self.intercept)
            if self.cv_folds is not None:
                model = linear_model.LassoCV(fit_intercept=self.intercept,
                                             cv=self.cv_folds,
                                             **self.kwargs)
            else:
                model = linear_model.Lasso(fit_intercept=self.intercept,
                                           **self.kwargs)
        else:
            raise NotImplementedError(
                'Solver not implemented. Choices are Lars or Coordinate Descent.'
            )
        #self.model.fit(np.asanyarray(self.x_train.values,order='F'), self.y_train)
        model.fit(self.x_train, self.y_train)
        return model
Example #26
def cross_validate_model(X_train, Y_train):
    """
    Here we perform cross-validation of models to choose the best one.
    """
    # Hold out half of the data for internal validation
    train, test, y_train_cv, y_test_cv = train_test_split(X_train,
                                                          Y_train,
                                                          test_size=0.5,
                                                          random_state=42)

    # List the regression methods to use.
    clf_random_forest = ensemble.RandomForestRegressor(n_estimators=50)
    clf_adaboost_reg = ensemble.AdaBoostRegressor(n_estimators=50)
    clf_lasso_larscv = sklinear.LassoLarsCV(cv=9)
    clf_ridge = sklinear.RidgeCV()
    clf_elastic_net = sklinear.ElasticNet()
    clf_extra_tree = ensemble.ExtraTreesRegressor(n_estimators=50)
    clf_mlpr = neural_network.MLPRegressor(solver='adam')

    # Add the above methods in an array
    # More ameable for looping
    methods = [
        clf_random_forest, clf_adaboost_reg, clf_lasso_larscv, clf_elastic_net,
        clf_extra_tree, clf_mlpr
    ]
    methods_label = [
        'clf_random_forest', 'clf_adaboost_reg', 'clf_lasso_larscv',
        'clf_elastic_net', 'clf_extra_tree', 'clf_mlpr'
    ]

    method_mse = np.zeros((len(methods), 1))
    # Fit and predict for each method
    for i in range(len(methods)):
        methods[i].fit(train, y_train_cv)
        method_predict = methods[i].predict(test)
        method_mse[i] = metrics.mean_squared_error(y_test_cv, method_predict)
        print('MSE for %s during cross-validation: %f' %
              (methods_label[i], method_mse[i]))

    # Return the index of the method with the minimum MSE
    return np.argmin(method_mse)
Example #27
    def __init__(self, params=None):
        self.clf = xgb.sklearn.XGBRegressor(
            max_depth=3,
            learning_rate=0.1,
            n_estimators=300,
            silent=True,
            objective='reg:linear',
            nthread=1,
            gamma=0,
            min_child_weight=1,
            max_delta_step=0,
            subsample=1,
            colsample_bytree=1,
            colsample_bylevel=.25,  #.5
            reg_alpha=0,  #1
            reg_lambda=.5,  #.2
            scale_pos_weight=1,
            base_score=0.5,
            seed=0,
            missing=None)

        self.clf2 = linear_model.LassoLarsCV(fit_intercept=True)
Example #28
 def _model_fitting_cv(cls, x, y, num_cv, plotting=False):
     # Compute paths
     # print("Computing regularization path using the Lars lasso...")
     model = linear_model.LassoLarsCV(cv=num_cv).fit(x, y)
     # Display results
     if plotting:
         import matplotlib.pyplot as plt
         m_log_alphas = -np.log10(model.cv_alphas_)
         plt.figure(figsize=(20, 10))
         plt.plot(m_log_alphas, model.cv_mse_path_, ':')
         plt.plot(m_log_alphas,
                  model.cv_mse_path_.mean(axis=-1),
                  'k',
                  label='Average across the folds',
                  linewidth=2)
         plt.axvline(-np.log10(model.alpha_),
                     linestyle='--',
                     color='k',
                     label='alpha CV')
         plt.legend()
         plt.xlabel('-log(alpha)')
         plt.ylabel('Mean square error')
         plt.axis('tight')
         plt.savefig('cross_validation',
                     dpi=None,
                     facecolor='w',
                     edgecolor='w',
                     orientation='portrait',
                     papertype=None,
                     format=None,
                     transparent=False,
                     bbox_inches=None,
                     pad_inches=0.1,
                     frameon=None)
         plt.plot()
     return model
Example #29
    def __init__(
        self,
        method,
        yrange,
        params,
        i=0
    ):  #TODO: yrange doesn't currently do anything. Remove or do something with it!
        self.algorithm_list = [
            'PLS',
            'GP',
            'OLS',
            'OMP',
            'Lasso',
            'Elastic Net',
            'Ridge',
            'Bayesian Ridge',
            'ARD',
            'LARS',
            'LASSO LARS',
            'SVR',
            'KRR',
        ]
        self.method = method
        self.outliers = None
        self.ransac = False

        print(params)
        if self.method[i] == 'PLS':
            self.model = PLSRegression(**params[i])

        if self.method[i] == 'OLS':
            self.model = linear.LinearRegression(**params[i])

        if self.method[i] == 'OMP':
            # check whether to do CV or not
            self.do_cv = params[i]['CV']
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # Remove CV parameter
            params_temp.pop('CV')
            if self.do_cv is False:
                self.model = linear.OrthogonalMatchingPursuit(**params_temp)
            else:
                params_temp.pop('precompute')
                self.model = linear.OrthogonalMatchingPursuitCV(**params_temp)

        if self.method[i] == 'LASSO':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # check whether to do CV or not
            try:
                self.do_cv = params[i]['CV']
                # Remove CV parameter
                params_temp.pop('CV')
            except KeyError:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.Lasso(**params_temp)
            else:
                params_temp.pop('alpha')
                self.model = linear.LassoCV(**params_temp)

        if self.method[i] == 'Elastic Net':
            params_temp = copy.copy(params[i])
            try:
                self.do_cv = params[i]['CV']
                params_temp.pop('CV')
            except KeyError:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.ElasticNet(**params_temp)
            else:
                params_temp['l1_ratio'] = [.1, .5, .7, .9, .95, .99, 1]
                self.model = linear.ElasticNetCV(**params_temp)

        if self.method[i] == 'Ridge':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            try:
                # check whether to do CV or not
                self.do_cv = params[i]['CV']

                # Remove CV parameter
                params_temp.pop('CV')
            except KeyError:
                self.do_cv = False

            if self.do_cv:
                self.model = linear.RidgeCV(**params_temp)
            else:
                self.model = linear.Ridge(**params_temp)

        if self.method[i] == 'BRR':
            self.model = linear.BayesianRidge(**params[i])

        if self.method[i] == 'ARD':
            self.model = linear.ARDRegression(**params[i])

        if self.method[i] == 'LARS':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            try:
                # check whether to do CV or not
                self.do_cv = params[i]['CV']

                # Remove CV parameter
                params_temp.pop('CV')
            except KeyError:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.Lars(**params_temp)
            else:
                self.model = linear.LarsCV(**params_temp)

        if self.method[i] == 'LASSO LARS':
            model = params[i]['model']
            params_temp = copy.copy(params[i])
            params_temp.pop('model')

            if model == 0:
                self.model = linear.LassoLars(**params_temp)
            elif model == 1:
                self.model = linear.LassoLarsCV(**params_temp)
            elif model == 2:
                self.model = linear.LassoLarsIC(**params_temp)
            else:
                print("Something went wrong, 'model' should be 0, 1, or 2")

        if self.method[i] == 'SVR':
            self.model = svm.SVR(**params[i])

        if self.method[i] == 'KRR':
            self.model = kernel_ridge.KernelRidge(**params[i])

        if self.method[i] == 'GP':
            # get the method for dimensionality reduction and the number of components
            self.reduce_dim = params[i]['reduce_dim']
            self.n_components = params[i]['n_components']
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # Remove parameters not accepted by Gaussian Process
            params_temp.pop('reduce_dim')
            params_temp.pop('n_components')
            self.model = GaussianProcess(**params_temp)
Example #30
    def fit_transform(self, X, y):
        """
        Fits the regression model and returns a new dataframe with the additional features.

        Inputs:
            - X: pandas dataframe or numpy array with original features (n_datapoints x n_features)
            - y: pandas dataframe or numpy array with targets for all n_datapoints
        Returns:
            - new_df: new pandas dataframe with all the original features (except categorical features transformed
                      into multiple 0/1 columns) and the most promising engineered features. This df can then be
                      used to train your final model.

        Please ensure that X only contains valid feature columns (including possible categorical variables).

        Note: we strongly encourage you to name your features X1 ...  Xn or something simple like this before passing
              a DataFrame to this model. This can help avoid potential problems with sympy later on.
              The data should only contain finite values (no NaNs etc.)
        """
        # store column names as they'll be lost in the other check
        cols = [str(c) for c in X.columns] if isinstance(X, pd.DataFrame) else []
        # check input variables
        X, target = check_X_y(X, y, y_numeric=self.problem_type == "regression", dtype=None)
        if not cols:
            # the additional zeros in the name are because of the variable check in _generate_features,
            # where we check if the column name occurs in the expression. this would lead to many
            # false positives if we have features x1 and x10...x19 instead of x001...x019.
            cols = ["x%03i" % i for i in range(X.shape[1])]
        self.original_columns_ = cols
        # transform X into a dataframe (again)
        df = pd.DataFrame(X, columns=cols)
        # possibly convert categorical columns
        df = self._transform_categorical_cols(df)
        # if we're not given specific feateng_cols, then just take all columns except categorical
        if self.feateng_cols:
            fcols = []
            for c in self.feateng_cols:
                if c not in self.original_columns_:
                    raise ValueError("[AutoFeat] feateng_col %r not in df.columns" % c)
                if c in self.categorical_cols_map_:
                    fcols.extend(self.categorical_cols_map_[c])
                else:
                    fcols.append(c)
            self.feateng_cols_ = fcols
        else:
            self.feateng_cols_ = list(df.columns)
        # convert units to proper pint units
        if self.units:
            # we need units for all (and only) the feateng columns
            self.units = {c: self.units[c] if c in self.units else "" for c in self.feateng_cols_}
            # apply pi-theorem -- additional columns are not used for regular feature engineering (for now)!
            df = self._apply_pi_theorem(df)
        # subsample data points and targets in case we'll generate too many features
        # (n_rows * n_cols * 32/8)/1000000000 <= max_gb
        n_cols = n_cols_generated(len(self.feateng_cols_), self.feateng_steps, len(self.transformations))
        n_gb = (len(df) * n_cols) / 250000000
        if self.verbose:
            print("[AutoFeat] The %i step feature engineering process could generate up to %i features." % (self.feateng_steps, n_cols))
            print("[AutoFeat] With %i data points this new feature matrix would use about %.2f gb of space." % (len(df), n_gb))
        if self.max_gb and n_gb > self.max_gb:
            n_rows = int(self.max_gb * 250000000 / n_cols)
            if self.verbose:
                print("[AutoFeat] As you specified a limit of %.1f gb, the number of data points is subsampled to %i" % (self.max_gb, n_rows))
            subsample_idx = np.random.permutation(list(df.index))[:n_rows]
            df_subs = df.iloc[subsample_idx]
            df_subs.reset_index(drop=True, inplace=True)
            target_sub = target[subsample_idx]
        else:
            df_subs = df.copy()
            target_sub = target.copy()
        # generate features
        df_subs, self.feature_formulas_ = engineer_features(df_subs, self.feateng_cols_, _parse_units(self.units, verbose=self.verbose),
                                                            self.feateng_steps, self.transformations, self.verbose)
        # select predictive features
        if self.featsel_runs <= 0:
            if self.verbose:
                print("[AutoFeat] WARNING: Not performing feature selection.")
            good_cols = df_subs.columns
        else:
            if self.problem_type in ("regression", "classification"):
                good_cols = select_features(df_subs, target_sub, self.featsel_runs, None, self.problem_type, self.n_jobs, self.verbose)
                # if no features were selected, take the original features
                if not good_cols:
                    good_cols = list(df.columns)
            else:
                print("[AutoFeat] WARNING: Unknown problem_type %r - not performing feature selection." % self.problem_type)
                good_cols = df_subs.columns
        # filter out those columns that were original features or generated otherwise
        self.new_feat_cols_ = [c for c in good_cols if c not in list(df.columns)]
        self.good_cols_ = good_cols
        # re-generate all good feature again; for all data points this time
        self.feature_functions_ = {}
        df = self._generate_features(df, self.new_feat_cols_)
        # filter out unnecessary junk from self.feature_formulas_
        self.feature_formulas_ = {f: self.feature_formulas_[f] for f in self.new_feat_cols_ + self.feateng_cols_}
        self.feature_functions_ = {f: self.feature_functions_[f] for f in self.new_feat_cols_}
        self.all_columns_ = list(df.columns)
        # train final prediction model on all selected features
        if self.verbose:
            # final dataframe contains original columns and good additional columns
            print("[AutoFeat] Final dataframe with %i feature columns (%i new)." % (len(df.columns), len(df.columns) - len(self.original_columns_)))

        # train final prediction model
        if self.problem_type == "regression":
            model = lm.LassoLarsCV(cv=5)
        elif self.problem_type == "classification":
            model = lm.LogisticRegressionCV(cv=5, class_weight="balanced")
        else:
            print("[AutoFeat] WARNING: Unknown problem_type %r - not fitting a prediction model." % self.problem_type)
            model = None
        if model is not None:
            if self.verbose:
                print("[AutoFeat] Training final %s model." % self.problem_type)
            X = df[self.good_cols_].to_numpy()
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                model.fit(X, target)
            self.prediction_model_ = model
            # sklearn requires a "classes_" attribute
            if self.problem_type == "classification":
                self.classes_ = model.classes_
            if self.verbose:
                if self.problem_type == "regression":
                    coefs = model.coef_
                else:
                    # model.coef_ is n_classes x n_features, but we need n_features
                    coefs = np.max(np.abs(model.coef_), axis=0)
                weights = dict(zip(self.good_cols_, coefs))
                print("[AutoFeat] Trained model: largest coefficients:")
                print(model.intercept_)
                for c in sorted(weights, key=lambda x: abs(weights[x]), reverse=True):
                    if abs(weights[c]) < 1e-5:
                        break
                    print("%.6f * %s" % (weights[c], c))
                print("[AutoFeat] Final score: %.4f" % model.score(X, target))
        if self.always_return_numpy:
            return df.to_numpy()
        return df
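
For reference, a sketch of how this fit_transform is typically called. The enclosing class is not shown in the snippet above; in the autofeat package the regression variant is named AutoFeatRegressor, but treat the constructor name and arguments below as assumptions:

import numpy as np
import pandas as pd

# Synthetic data; suitable engineered features would be log(x001) and x002**2.
rng = np.random.RandomState(0)
df = pd.DataFrame({"x001": rng.uniform(1, 10, 500), "x002": rng.randn(500)})
y = 2.0 * np.log(df["x001"]) + df["x002"] ** 2 + 0.1 * rng.randn(500)

# Assumed usage (see note above):
# from autofeat import AutoFeatRegressor
# model = AutoFeatRegressor(feateng_steps=2, verbose=1)
# df_new = model.fit_transform(df, y)   # original columns + selected engineered features
# print(model.score(df, y))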