Example #1
def test_multitask_enet_and_lasso_cv():
    X, y, _, _ = build_dataset(n_features=50, n_targets=3)
    clf = MultiTaskElasticNetCV(cv=3).fit(X, y)
    assert_almost_equal(clf.alpha_, 0.00556, 3)
    clf = MultiTaskLassoCV(cv=3).fit(X, y)
    assert_almost_equal(clf.alpha_, 0.00278, 3)

    X, y, _, _ = build_dataset(n_targets=3)
    clf = MultiTaskElasticNetCV(n_alphas=10,
                                eps=1e-3,
                                max_iter=100,
                                l1_ratio=[0.3, 0.5],
                                tol=1e-3,
                                cv=3)
    clf.fit(X, y)
    assert 0.5 == clf.l1_ratio_
    assert (3, X.shape[1]) == clf.coef_.shape
    assert (3, ) == clf.intercept_.shape
    assert (2, 10, 3) == clf.mse_path_.shape
    assert (2, 10) == clf.alphas_.shape

    X, y, _, _ = build_dataset(n_targets=3)
    clf = MultiTaskLassoCV(n_alphas=10, eps=1e-3, max_iter=100, tol=1e-3, cv=3)
    clf.fit(X, y)
    assert (3, X.shape[1]) == clf.coef_.shape
    assert (3, ) == clf.intercept_.shape
    assert (10, 3) == clf.mse_path_.shape
    assert 10 == len(clf.alphas_)
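
The tests above rely on a build_dataset() helper from scikit-learn's test
suite. A rough stand-in built on make_regression might look like this (the
noise level and the 50/50 train/test split are assumptions):

from sklearn.datasets import make_regression

def build_dataset(n_samples=50, n_features=200, n_informative_features=10,
                  n_targets=1):
    # Stand-in for the test helper: returns train and test halves.
    X, y = make_regression(n_samples=2 * n_samples, n_features=n_features,
                           n_informative=n_informative_features,
                           n_targets=n_targets, noise=0.1, random_state=0)
    return X[:n_samples], y[:n_samples], X[n_samples:], y[n_samples:]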
Example #2
def test_enet_l1_ratio():
    # Test that an error message is raised if an estimator that
    # uses _alpha_grid is called with l1_ratio=0
    msg = ("Automatic alpha grid generation is not supported for l1_ratio=0. "
           "Please supply a grid by providing your estimator with the "
           "appropriate `alphas=` argument.")
    X = np.array([[1, 2, 4, 5, 8], [3, 5, 7, 7, 8]]).T
    y = np.array([12, 10, 11, 21, 5])

    assert_raise_message(ValueError, msg,
                         ElasticNetCV(l1_ratio=0, random_state=42).fit, X, y)
    assert_raise_message(
        ValueError, msg,
        MultiTaskElasticNetCV(l1_ratio=0, random_state=42).fit, X, y[:, None])

    # Test that l1_ratio=0 is allowed if we supply a grid manually
    alphas = [0.1, 10]
    estkwds = {'alphas': alphas, 'random_state': 42}
    est_desired = ElasticNetCV(l1_ratio=0.00001, **estkwds)
    est = ElasticNetCV(l1_ratio=0, **estkwds)
    with ignore_warnings():
        est_desired.fit(X, y)
        est.fit(X, y)
    assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)

    est_desired = MultiTaskElasticNetCV(l1_ratio=0.00001, **estkwds)
    est = MultiTaskElasticNetCV(l1_ratio=0, **estkwds)
    with ignore_warnings():
        est.fit(X, y[:, None])
        est_desired.fit(X, y[:, None])
    assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)
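
The fix the error message points to, as a minimal user-facing sketch (the toy
data via make_regression is an assumption, and convergence warnings are
possible):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNetCV

X, y = make_regression(n_samples=60, n_features=8, random_state=0)
# Pure ridge (l1_ratio=0) requires the alpha grid to be supplied by hand:
ridge_like = ElasticNetCV(l1_ratio=0, alphas=np.logspace(-2, 2, 20), cv=3)
ridge_like.fit(X, y)
print(ridge_like.alpha_)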
Example #3
def test_enet_path():
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                         n_informative_features=100)
    max_iter = 150

    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                       l1_ratio=[0.5, 0.7], cv=3,
                       max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert clf.l1_ratio_ == min(clf.l1_ratio)

    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                       l1_ratio=[0.5, 0.7], cv=3,
                       max_iter=max_iter, precompute=True)
    ignore_warnings(clf.fit)(X, y)

    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert clf.l1_ratio_ == min(clf.l1_ratio)

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert clf.score(X_test, y_test) > 0.99

    # Multi-output/target case
    X, y, X_test, y_test = build_dataset(n_features=10, n_targets=3)
    clf = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7],
                                cv=3, max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert clf.score(X_test, y_test) > 0.99
    assert clf.coef_.shape == (3, 10)

    # Mono-output should have same cross-validated alpha_ and l1_ratio_
    # in both cases.
    X, y, _, _ = build_dataset(n_features=10)
    clf1 = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf1.fit(X, y)
    clf2 = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf2.fit(X, y[:, np.newaxis])
    assert_almost_equal(clf1.l1_ratio_, clf2.l1_ratio_)
    assert_almost_equal(clf1.alpha_, clf2.alpha_)
Example #4
def test_1d_multioutput_enet_and_multitask_enet_cv():
    X, y, _, _ = build_dataset(n_features=10)
    y = y[:, np.newaxis]
    clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf.fit(X, y[:, 0])
    clf1 = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf1.fit(X, y)
    assert_almost_equal(clf.l1_ratio_, clf1.l1_ratio_)
    assert_almost_equal(clf.alpha_, clf1.alpha_)
    assert_almost_equal(clf.coef_, clf1.coef_[0])
    assert_almost_equal(clf.intercept_, clf1.intercept_[0])
Example #5
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import MultiTaskElasticNetCV


def elastic_net(X, Y):
    print(X.shape)
    # The original call spelled out every scikit-learn default, including
    # normalize=False, which was removed in scikit-learn 1.2; the defaults
    # are simply kept here.
    clf = MultiTaskElasticNetCV(l1_ratio=0.5, eps=0.001, n_alphas=100,
                                max_iter=1000, tol=0.0001, selection='cyclic')

    fit = clf.fit(X, Y)
    sfm = SelectFromModel(fit, prefit=True)
    values = sfm.get_support(indices=True)
    new_features = sfm.transform(X)

    return new_features, values
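
A hypothetical call, assuming X is (n_samples, n_features) and Y is
(n_samples, n_targets); the toy data below is not from the original:

import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(40, 12)
Y_demo = X_demo[:, :3] + 0.01 * rng.rand(40, 3)  # targets driven by 3 features
new_features, kept_idx = elastic_net(X_demo, Y_demo)
print(new_features.shape, kept_idx)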
Example #6
class _MultiTaskElasticNetCVImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
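
Hypothetical usage of the wrapper, assuming Op was bound to scikit-learn's
MultiTaskElasticNetCV at import time (that import is not shown in the
snippet):

import numpy as np
from sklearn.linear_model import MultiTaskElasticNetCV as Op  # assumed binding

rng = np.random.RandomState(0)
X = rng.rand(30, 4)
y = X @ rng.rand(4, 2)
wrapped = _MultiTaskElasticNetCVImpl(cv=3)
print(wrapped.fit(X, y).predict(X[:3]).shape)  # (3, 2)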
Example #7
def test_uniform_targets():
    enet = ElasticNetCV(n_alphas=3)
    m_enet = MultiTaskElasticNetCV(n_alphas=3)
    lasso = LassoCV(n_alphas=3)
    m_lasso = MultiTaskLassoCV(n_alphas=3)

    models_single_task = (enet, lasso)
    models_multi_task = (m_enet, m_lasso)

    rng = np.random.RandomState(0)

    X_train = rng.random_sample(size=(10, 3))
    X_test = rng.random_sample(size=(10, 3))

    y1 = np.empty(10)
    y2 = np.empty((10, 2))

    for model in models_single_task:
        for y_values in (0, 5):
            y1.fill(y_values)
            assert_array_equal(model.fit(X_train, y1).predict(X_test), y1)
            assert_array_equal(model.alphas_, [np.finfo(float).resolution] * 3)

    for model in models_multi_task:
        for y_values in (0, 5):
            y2[:, 0].fill(y_values)
            y2[:, 1].fill(2 * y_values)
            assert_array_equal(model.fit(X_train, y2).predict(X_test), y2)
            assert_array_equal(model.alphas_, [np.finfo(float).resolution] * 3)
Example #8
def train_glm_model(
    xtrain: Union[np.ndarray, pd.DataFrame],
    ytrain: Union[np.ndarray, pd.DataFrame],
    verbose: int = 0,
) -> BaseEstimator:
    """Train a basic Generalized Linear Model (GLM)

    Parameters
    ----------
    xtrain : np.ndarray, pd.DataFrame 
             (n_samples x d_features)
             input training data
    
    ytrain : np.ndarray, pd.DataFrame 
             (n_samples x p_outputs)
             labeled training data 
    
    verbose : int, default=0
        option to print out training messages 

    Returns 
    -------
    gl_model : BaseEstimator
        the trained model
    """
    # Initialize GLM
    gl_model = MultiTaskElasticNetCV(
        alphas=None,
        cv=3,
        random_state=123,
        n_jobs=-1,
        normalize=False,
        selection="random",
        verbose=verbose,
    )

    # train GLM
    t0 = time.time()
    gl_model.fit(xtrain, ytrain)
    t1 = time.time() - t0
    if verbose > 0:
        print(f"Training time: {t1:.3f} secs.")
    return gl_model
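
A hypothetical call with toy data (make_regression is an assumption; the
normalize= argument above requires scikit-learn < 1.2):

from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=100, n_features=5, n_targets=2,
                                 random_state=0)
glm = train_glm_model(X_demo, y_demo, verbose=1)
print(glm.alpha_, glm.l1_ratio_)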
Example #9
 def test_model_multi_task_elasticnet_cv(self):
     model, X = fit_regression_model(MultiTaskElasticNetCV(), n_targets=2)
     model_onnx = convert_sklearn(
         model, "multi-task elasticnet cv",
         [("input", FloatTensorType([None, X.shape[1]]))])
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(X,
                         model,
                         model_onnx,
                         verbose=False,
                         basename="SklearnMultiTaskElasticNetCV-Dec4")
Example #10
def fit_enet(X, flavors):
    # derive the flavor profiles by fitting the elastic net
    flavors[flavors == 0] = 0.01  # logit(0) and logit(1) are not finite
    flavors[flavors == 1] = 0.99
    y = logit(flavors)
    idx = np.all(np.isfinite(y), axis=1)

    print('Performing multi-task elastic net...')
    enet = MultiTaskElasticNetCV(cv=7, n_jobs=7, fit_intercept=False, verbose=1).fit(X[idx], y[idx])
    weights = inv_logit(enet.coef_.T)  # transform to 0 to 1 scale

    return weights
Example #11
    def select_mtelastic(self, X, y):
        # MultiTaskElasticNetCV from sklearn is used to determine the best
        # alpha for multi-task Elastic-Net regression.

        mtlasso_alphas = MultiTaskElasticNetCV(alphas=[
            0.00001, .0001, .001, .002, .003, .004, .005, .006, .007, .008,
            .009, .099, .01, .011, .012, .013, .014, .015, .016, .017, .018,
            .019, .02, .025, .026, .027, .028, .029, .03, .031, .032, .033,
            .034, .035, .036, .037, .038, .039, .04, .041, .042, .043, .044,
            .045, .05, .06, .07, .071, .072, .073, .074, .075, .076, .077,
            .078, .079, .08, .1, .2, .225, .23, .24, .245, .246, .247, .248,
            .249, .25, .251, .252, .253, .254, .255, .26, .27, .275, .3, .35,
            .4, .45, .46, .47, .48, .481, .482, .483, .484, .485, .486, .487,
            .488, .489, .49, .491, .492, .493, .494, .495, .496, .497, .498,
            .499, .5, .51, .511, .512, .513, .514, .515, .516, .517, .518,
            .519, .52, .525, .53, .54, .55, .6, .75, .752, .7527, .7528, .7529,
            .753, .7531, .754, .7545, .755, .756, .76, .765, .77, .78, .79, .8,
            .9, 1.0, 1.2, 1.25, 1.5, 1.75, 2.0
        ])

        sel_alpha = mtlasso_alphas.fit(X, y)
        print(sel_alpha.alpha_)
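
The hand-written grid above spans roughly 1e-5 to 2.0; a log-spaced grid with
similar coverage is usually easier to maintain (the 150-point count is an
assumption):

import numpy as np
from sklearn.linear_model import MultiTaskElasticNetCV

alphas = np.logspace(-5, np.log10(2.0), 150)
mt_enet_cv = MultiTaskElasticNetCV(alphas=alphas)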
Example #12
def regtsls(data, opts):
    T_test, Z, T, Y = data
    trans = PolynomialFeatures(degree=_get(opts, 'lin_degree', 1),
                               include_bias=False)
    polyT = trans.fit_transform(T)
    first = Pipeline([('poly',
                       PolynomialFeatures(degree=_get(opts, 'lin_degree', 1))),
                      ('elasticnet', MultiTaskElasticNetCV(cv=3))])
    first.fit(Z, polyT)
    second = ElasticNetCV(cv=3)
    second.fit(first.predict(Z), Y.ravel())
    polyT_test = trans.fit_transform(T_test)
    return second.predict(polyT_test).reshape(T_test.shape[:1] + Y.shape[1:])
Example #13
 def _fit(self):
     """
     Fit regression model with training dataset, update self._regressor and self._param.
     """
     warnings.filterwarnings("ignore", category=ConvergenceWarning)
     # Model for Elastic Net regression
     cv = MultiTaskElasticNetCV(
         alphas=[0, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
         l1_ratio=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
         cv=5,
         n_jobs=-1)
     # Fit with pipeline
     steps = [
         ("scaler", MinMaxScaler()),
         ("regressor", cv),
     ]
     pipeline = Pipeline(steps=steps)
     pipeline.fit(self._X_train, self._y_train)
     reg_output = pipeline.named_steps.regressor
     # Update regressor
     self._regressor = pipeline
     # Intercept/coef
     intercept_df = pd.DataFrame(reg_output.coef_,
                                 index=self._y_train.columns,
                                 columns=self._X_train.columns)
     intercept_df.insert(0, "Intercept", None)
     intercept_df["Intercept"] = reg_output.intercept_
     # Update param
     param_dict = {
         **{k: type(v)
            for (k, v) in steps},
         "alpha": reg_output.alpha_,
         "l1_ratio": reg_output.l1_ratio_,
         "intercept": intercept_df,
         "coef": intercept_df,
     }
     self._param.update(param_dict)
Example #14
def train_multi_elasticnet(train_features, train_labels, num_alphas,
                           skip_cross_validation, alpha, l1_ratio, num_jobs):
    """
  Performs the cross validation of multi elastic net model, and returns the trained model
  with best params. Assume features are scaled/normalized. Assumes train_labels has more
  than one column.
  """

    best_alpha = alpha
    best_l1_ratio = l1_ratio
    max_iter = 10000
    tol = 0.0005
    if not skip_cross_validation:
        # use 5 fold cross validation
        model = MultiTaskElasticNetCV(l1_ratio=[
            0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.925, 0.95, 0.975, 0.99, 0.999,
            0.9999
        ],
                                      max_iter=max_iter,
                                      cv=5,
                                      n_alphas=num_alphas,
                                      n_jobs=num_jobs,
                                      normalize=False,
                                      tol=tol)
        model.fit(train_features, train_labels)
        best_alpha = model.alpha_
        best_l1_ratio = model.l1_ratio_
        #print("number of iterations were {}".format(model.n_iter_))

    model = MultiTaskElasticNet(alpha=best_alpha,
                                l1_ratio=best_l1_ratio,
                                normalize=False,
                                max_iter=max_iter,
                                tol=tol)
    model.fit(train_features, train_labels)

    return (model, {'alpha': best_alpha, 'l1_ratio': best_l1_ratio})
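
A hypothetical call (the argument values are assumptions; the normalize=
argument inside the function requires scikit-learn < 1.2):

from sklearn.datasets import make_regression

X_demo, Y_demo = make_regression(n_samples=120, n_features=15, n_targets=3,
                                 random_state=0)
model, best_params = train_multi_elasticnet(X_demo, Y_demo, num_alphas=20,
                                            skip_cross_validation=False,
                                            alpha=1.0, l1_ratio=0.5,
                                            num_jobs=1)
print(best_params)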
Example #15
def train_linear_model(X, y, random_state=1, test_size=0.2,
                       regularization_type='elasticnet', k_fold=5,
                       max_iter=1000000, tol=0.0001,
                       l1_ratio=None):
    """
    Function to train linear model with regularization and cross-validation.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model evaluation.
        regularization_type (str): lasso or ridge or elastic-net (with cv).
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization.
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.

    Returns:
        sklearn.linear_model.LinearModel: fitted model.
        mu (float): Mean value of descriptors used in training.
        s (float): Std dev of descriptors used in training.

    """
    if l1_ratio is None:
        l1_ratio = [.1, .5, .7, .9, .95, 1]
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s
    hyperparameters = {'random_state': random_state,
                       'test_size': test_size,
                       'k_fold': k_fold,
                       'tol': tol,
                       'max_iter': max_iter
                       }
    if regularization_type == 'lasso' and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True, alphas=None, tol=tol,
                          cv=k_fold, max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True, alpha=alpha_opt,
                             max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters['l1_ratio'] = 1

    elif regularization_type == 'ridge' and y.shape[1] == 1:
        ridgecv = RidgeCV(fit_intercept=True, alphas=None, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = 0

    elif regularization_type == 'elasticnet' and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(fit_intercept=True, normalize=False,
                                    alphas=None, cv=k_fold,
                                    l1_ratio=l1_ratio, max_iter=max_iter)
        elasticnetcv.fit(X_scaled, y_train.values.ravel())

        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(fit_intercept=True, normalize=False,
                                  l1_ratio=l1_ratio_opt,
                                  alpha=alpha_opt, max_iter=max_iter)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt

    # If more than 1 outcome present, perform multitask regression
    elif regularization_type == 'elasticnet' and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(fit_intercept=True, cv=k_fold,
                                                    normalize=False,
                                                    l1_ratio=l1_ratio, max_iter=max_iter)
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True, normalize=False,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt
    else:
        raise NotImplementedError

    y_pred = linear_model.predict((X_test-mu)/s)
    Rsq = linear_model.score((X_test - mu) / s, y_test)
    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output
    pred_actual_ratio = [x/y for x, y in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96*np.sqrt(mean_squared_error(np.ones(y_pred.shape),
                                                                pred_actual_ratio,
                                                                multioutput='raw_values')/y_pred.shape[0])
    hyperparameters['alpha'] = alpha_opt
    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
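
A hypothetical multi-target call (the toy DataFrames are assumptions; the
elastic-net branches pass normalize=, so scikit-learn < 1.2 is assumed):

import pandas as pd
from sklearn.datasets import make_regression

Xa, Ya = make_regression(n_samples=80, n_features=6, n_targets=2,
                         random_state=0)
outputs = train_linear_model(pd.DataFrame(Xa), pd.DataFrame(Ya),
                             regularization_type='elasticnet')
linear_model, mu, s, rel_err, Rsq, hyperparameters = outputs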
Example #16
def main(X, Y, Params, print_info=False, is_regression=True, Y_other=None):

    parameters = Params['Algorithm'][1]
    is_cv_run = False
    starttime = time.time()

    if print_info:
        print('Fitting model \'%s\' for %s' %
              (Params['Algorithm'][0],
               'regression' if is_regression else 'classification'))

    if Params['Algorithm'][0] == 'BayesianRidge':
        if not is_regression:
            model = BayesianRidge(n_iter=300,
                                  tol=0.001,
                                  compute_score=False,
                                  fit_intercept=True,
                                  normalize=False,
                                  copy_X=True,
                                  verbose=False,
                                  **parameters)
            #parameters = {'alpha_1': [1e-6,1e-5,1e-4],'alpha_2': [1e-6,1e-5,1e-4], 'lambda_1': [1e-6,1e-5,1e-4], 'lambda_2': [1e-6,1e-5,1e-4]}
        else:
            model = BayesianRidge(n_iter=300,
                                  tol=0.001,
                                  compute_score=False,
                                  fit_intercept=True,
                                  normalize=False,
                                  copy_X=True,
                                  verbose=False,
                                  **parameters)
    elif Params['Algorithm'][0] == 'StringKernel':
        if not is_regression:
            raise Exception('not implemented')
        else:
            # we create an instance of SVM and fit out data.
            #
            # model = KernelRidge(alpha=parameters['alpha'], kernel='precomputed')
            model = SVR(kernel='precomputed',
                        gamma='auto',
                        coef0=0.0,
                        shrinking=True,
                        tol=0.001,
                        cache_size=400,
                        verbose=False,
                        max_iter=-1)
            param_grid = {
                'C': np.logspace(np.log10(0.0001), np.log10(500), 25)
            }

            model = NuSVR(
                kernel='precomputed'
            )  #cache_size=400, coef0=0.0, gamma='auto', max_iter=-1, shrinking=True, tol=0.001, verbose=False,**parameters)
            param_grid = {'nu': (0.50, )}

            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=7,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True

    elif Params['Algorithm'][0] == 'XGBoost':
        # max_depth = 3, learning_rate = 0.1, n_estimators = 100, silent = True, objective = 'reg:linear',
        # booster = 'gbtree', n_jobs = 1, nthread = None, gamma = 0, min_child_weight = 1,
        # max_delta_step = 0, subsample = 1, colsample_bytree = 1, colsample_bylevel = 1, reg_alpha = 0,
        # reg_lambda = 1, scale_pos_weight = 1, base_score = 0.5, random_state = 0, seed = None,
        # missing = None
        if not is_regression:
            model = xgboost.XGBClassifier(
                missing=None,
                silent=True,
                learning_rate=0.10,
                objective='rank:pairwise',
                booster='gbtree',
                n_jobs=1,
                max_delta_step=0,
                colsample_bylevel=1,
                scale_pos_weight=1,
                base_score=0.5,
                random_state=666,
                colsample_bytree=0.75,  # default 1
                subsample=0.75,
                gamma=0,
                reg_alpha=0.01,  # default 0
                min_child_weight=6,
                **parameters)
        else:
            # model=xgboost.XGBRegressor(missing=None, silent=True,
            #                            learning_rate=0.10,
            #                            objective='reg:linear',#'rank:pairwise' booster='gbtree'
            #                            n_jobs=1,
            #                            booster='gbtree',
            #                            max_delta_step=0,
            #                            colsample_bylevel=1,
            #                            scale_pos_weight=1,
            #                            base_score=0.5,
            #                            random_state=666,
            #                            colsample_bytree=0.75, # default 1
            #                            subsample=0.75,
            #                            gamma=0,
            #                            reg_alpha=0.01, # default 0
            #                            reg_lambda=1.0,
            #                            min_child_weight=6,
            #                            **parameters)

            model = xgboost.XGBRegressor(
                missing=None,
                silent=True,
                learning_rate=0.10,
                objective='reg:linear',  #'rank:pairwise' booster='gbtree'
                n_jobs=1,
                booster='gbtree',
                random_state=666,
                **parameters)

            param_grid = {
                'colsample_bytree': (0.75, 1.0),
                'subsample': (0.75, 1.0),
                'min_child_weight': (3, 6, 9),
                'reg_lambda': (0.80, 1.0, 1.20),
                'reg_alpha': (0.001, 0.01)
            }
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=7,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True

    elif Params['Algorithm'][0] == "Keras_ElasticNet":

        #use_keras_CPU()

        if not is_regression:
            raise Exception('ElasticNet is only for regression!')
        else:
            param_grid = {
                'l1_ratio': (Params['Algorithm'][1]['l1_ratio'], ),
                'alpha': np.logspace(-3, 1, 15)
            }

            model = GridSearchCV(KerasENet(),
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=5,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            # first_output = Dense(1,activation='sigmoid')(first_output)
            is_cv_run = True

    elif Params['Algorithm'][0] == "Ridge":
        if not is_regression:
            raise Exception('Ridge is only for regression!')
        else:
            model = RidgeCV(alphas=np.logspace(-1, np.log10(700),
                                               parameters['n_alphas']),
                            fit_intercept=True,
                            normalize=False,
                            scoring=None,
                            cv=8,
                            gcv_mode=None,
                            store_cv_values=False)

    elif Params['Algorithm'][0] == "ElasticNet":
        tol = 0.0001
        selection = 'cyclic'
        n_alphas = 90
        max_iter = 1300
        if X.shape[1] > 4000:
            tol = 0.001
            selection = 'random'
            n_alphas = 60
            max_iter = 1000
        if not is_regression:
            raise Exception('ElasticNet is only for regression!')
        else:
            if Params['is_multitarget']:
                model = MultiTaskElasticNetCV(eps=0.001,
                                              alphas=None,
                                              fit_intercept=True,
                                              normalize=False,
                                              max_iter=max_iter,
                                              tol=tol,
                                              cv=7,
                                              copy_X=True,
                                              verbose=0,
                                              n_alphas=n_alphas,
                                              n_jobs=1,
                                              random_state=666,
                                              selection=selection,
                                              **parameters)
            else:
                model = ElasticNetCV(eps=0.001,
                                     alphas=None,
                                     fit_intercept=True,
                                     normalize=False,
                                     max_iter=max_iter,
                                     tol=tol,
                                     cv=7,
                                     copy_X=True,
                                     verbose=0,
                                     n_alphas=n_alphas,
                                     n_jobs=1,
                                     random_state=666,
                                     selection=selection,
                                     **parameters)

    elif Params['Algorithm'][0] == "RandomForest":

        if not is_regression:
            raise Exception('not set up (lazy)')
        else:
            model = RandomForestRegressor(criterion='mse',
                                          min_samples_leaf=1,
                                          min_weight_fraction_leaf=0.0,
                                          max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          bootstrap=True,
                                          oob_score=False,
                                          n_jobs=1,
                                          random_state=None,
                                          verbose=0,
                                          warm_start=False,
                                          **parameters)
            param_grid = {
                'max_features': ('auto', 'sqrt'),
                'min_samples_split': (
                    2,
                    4,
                ),
            }
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=7,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True

    elif Params['Algorithm'][0] == 'SVM':
        # 0.001, 0.005, 0.01, 0.05, 0.1, 0.5,1.0,1.5,2.0,3.0,4.0,5.0,10.0
        if not is_regression:
            model = SVC(cache_size=400,
                        coef0=0.0,
                        gamma='auto',
                        max_iter=-1,
                        shrinking=True,
                        tol=0.001,
                        verbose=False,
                        **parameters)
            #parameters = {'reg__C':[0.5],'reg__epsilon':[0.1]}
        else:
            model = SVR(cache_size=400,
                        coef0=0.0,
                        gamma='auto',
                        max_iter=-1,
                        shrinking=True,
                        tol=0.001,
                        verbose=False,
                        **parameters)
            param_grid = {'C': np.logspace(np.log10(0.0005), np.log10(10), 30)}
            #param_grid = {'nu':(0.1,0.3,0.5,0.7,0.9)}
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=8,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True

    elif Params['Algorithm'][0] == 'GradientBoosting':
        if not is_regression:
            model = GradientBoostingClassifier(random_state=1, **parameters)
            #parameters = {'reg__n_estimators': [140], 'reg__max_depth': [6],'learning_rate':[0.01,0.03,0.1],'min_samples_leaf':[2,3,4]}
        else:
            model = GradientBoostingRegressor(random_state=1, **parameters)
            #parameters = {'reg__n_estimators': [140], 'reg__max_depth': [6]}
    elif Params['Algorithm'][0] == 'MLP':
        #parameters['hidden_layer_sizes']=[parameters['hidden_layer_sizes']]
        #model = MLPRegressorCV(hidden_layer_sizes=parameters['hidden_layer_sizes'])
        model = MLPRegressor(
            activation="relu",
            solver="lbfgs",
            learning_rate="constant",
            learning_rate_init=0.0011,
            max_iter=450,
            random_state=None,
            tol=0.00013,
            epsilon=1e-08,
            hidden_layer_sizes=parameters['hidden_layer_sizes'])

        param_grid = {'alpha': np.logspace(0, np.log10(350), 20)}
        model = GridSearchCV(model,
                             param_grid,
                             n_jobs=1,
                             iid=True,
                             refit=True,
                             cv=7,
                             verbose=0,
                             scoring=neg_mean_squared_error_scorer)
        is_cv_run = True
        #model = MLPRegressor(activation="relu", solver ="lbfgs",learning_rate ="constant",
        #             learning_rate_init = 0.001, power_t = 0.5, max_iter = 500, shuffle = True, random_state = None,
        #             tol = 0.0001, verbose = False, warm_start = False, momentum = 0.9, epsilon = 1e-08,**parameters)
    elif Params['Algorithm'][0] == 'MLP_KERAS':

        from keras.models import Sequential
        from keras import regularizers
        from keras.layers import Dense, Dropout
        from keras.callbacks import EarlyStopping
        from sklearn.preprocessing import LabelEncoder
        from keras.utils import np_utils
        import tensorflow as tf
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        session = tf.Session(config=config)

        early_stopping = EarlyStopping(monitor='val_loss', patience=5)

        model = Sequential()
        model.add(
            Dense(
                parameters['layers_and_nodes'][0],
                activation='tanh',
                input_shape=(X.shape[1], ),
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(
                    parameters['l2_regularization']),
            ))
        model.add(Dropout(parameters['dropout'], noise_shape=None, seed=1))
        for layer in range(1, len(parameters['layers_and_nodes'])):
            model.add(
                Dense(parameters['layers_and_nodes'][layer],
                      activation='relu',
                      input_shape=(parameters['layers_and_nodes'][layer -
                                                                  1], ),
                      kernel_initializer='glorot_normal',
                      kernel_regularizer=regularizers.l2(
                          parameters['l2_regularization'])))
            model.add(Dropout(parameters['dropout'], noise_shape=None, seed=1))

        if not is_regression:
            model.add(
                Dense(1,
                      activation='softmax',
                      input_shape=(parameters['nodes'][-1], )))
            model.compile(loss='categorical_crossentropy',
                          optimizer='rmsprop',
                          metrics=['f1'])
            encoder = LabelEncoder()
            encoder.fit(Y)
            encoded_Y = encoder.transform(Y)
            # convert integers to dummy variables (i.e. one hot encoded)
            Y = np_utils.to_categorical(encoded_Y)
        else:
            model.add(
                Dense(1,
                      activation='linear',
                      input_shape=(parameters['layers_and_nodes'][-1], )))
            model.compile(loss='mean_squared_error',
                          optimizer='adam',
                          metrics=['mse'])

        model.fit(X,
                  Y,
                  batch_size=X.shape[0],
                  epochs=100,
                  validation_split=0,
                  verbose=0)  #,callbacks=[early_stopping])

        return model

    else:

        raise Exception('unknown model')
    #decomposer = LatentDirichletAllocation(n_topics=10, max_iter=10,learning_method='online',learning_offset=50.,random_state=1)
    #decomposer = TruncatedSVD(n_components=100,random_state=666)
    """
    X = data.iloc[:]['text'].values
    y = data.iloc[:]['mylabel'].values.astype(str)
    
    dat = vect.fit_transform(X)
    dat = tfidf.fit_transform(dat)
    dat = decomposer.fit_transform(dat)  
    
    for a in numpy.unique(y):
        plt.scatter(dat[y==a,0],dat[y==a,1])
    """
    """
    START LOOP
    """

    #t0 = time()
    # if get_set_count(parameters)>1:
    #     grid_search = GridSearchCV(model, parameters, n_jobs=6,verbose=1,cv=10,refit=True)
    #     grid_search.fit(X=X,y=Y)
    #     best_parameters = grid_search.best_estimator_.get_params()
    #     print('--> best parameters: %s' % best_parameters)
    #     return grid_search
    # else:

    if 1:
        start_time = time.time()
        print('... training model (X.shape=%s)' % str(X.shape), end='')

    warnings.filterwarnings("ignore")

    if Y_other is not None and Params['is_multitarget']:
        Y = np.expand_dims(Y, axis=1)
        model.fit(X=X, y=np.concatenate((Y, Y_other), axis=1))
    else:
        Y = Y.flatten()
        model.fit(X=X, y=Y)

    if is_cv_run:
        print(' [best gridsearch params: %s] ' % model.best_params_, end='')

    if 1:
        end_time = time.time()
        print(' ... done (%1.1f min)' % ((end_time - start_time) / 60.0))

    #elapsedtime = (time.time() - starttime) / 60.0
    #print('fit done (took %f minutes)' % elapsedtime)

    return model
Example #17
 evolved_freq.append(f2)
 
 #get the average metabolite usage from the evolved population
 used_mets=[]
 for mm in g2:
     reacs=[react_dict[z] for z in mm]
     m=Model(reacs)
     used_mets.append(m.ex_reactants)
 used_mets = list(chain.from_iterable(used_mets))
 mf=[]
 for mm in dm:
     mf.append(used_mets.count(mm)/len(g2))
 true_used_env.append(mf)
 
 from sklearn.linear_model import MultiTaskElasticNetCV as EN
 enet  = EN(cv=50, max_iter=100000)
 x = full_freq_m.T[m_diff_freq_m>.005].T
 y = used_environment.T[m_diff_used_env>0.005].T
 mod=enet.fit(x, y)
 p = mod.predict(f2[m_diff_freq_m>.005].reshape(1,-1))
 
 
 p=p.flatten()
 p = p+abs(min(p))
 p=p/max(p)
 
 c = [sts.pearsonr(mf,used_environment[ee][m_diff_used_env>0.005])[0] for ee in range(len(used_environment))]
 
 
 predicted.append(sts.pearsonr(p, mf)[0])
 
Example #18
    def train(self,
              vectors_path,
              bound_morphemes_path=None,
              word_segmentations_path=None,
              graphemes_to_phonemes_path=None,
              n_jobs=1,
              l1_ratio=0.5):
        train_config = locals()
        train_config.pop("self")
        train_config.pop("__class__", None)
        self.config["train_config"] = train_config
        logger.info("Train config: ")
        pprint.pprint(train_config)

        # Load vectors, where the keys can be words represented as
        # sequences of characters (normal word vectors) or words represented
        # as sequences of phonemes (phonemicized vectors).
        logger.info("Reading vectors from {}".format(vectors_path))
        self.vectors = OrderedDict()
        with open(vectors_path) as vectors_file:
            for line in tqdm(vectors_file,
                             total=get_line_number(vectors_path)):
                split_line = line.rstrip("\n").split()
                word = split_line[0]
                # If we have phonemicized vectors, the keys to the dict are
                # tuples of comma-separated phonemes representing a word.
                if graphemes_to_phonemes_path is not None:
                    word = tuple(word.split(","))
                embedding = np.array([float(val) for val in split_line[1:]])
                self.vectors[word] = embedding

        # Randomly shuffle the OrderedDict
        random_seed = 0
        logger.info(
            "Shuffling vectors with random seed {}".format(random_seed))
        random.seed(random_seed)
        vector_items = list(self.vectors.items())
        # random.shuffle is in-place
        random.shuffle(vector_items)
        self.vectors = OrderedDict(vector_items)

        vocabulary = list(self.vectors.keys())
        targets = np.asarray(list(self.vectors.values()))

        # Load phonemes to graphemes if we were given g2p data
        if graphemes_to_phonemes_path:
            logger.info("Reading graphemes to phonemes data "
                        "from {}".format(graphemes_to_phonemes_path))
            self.phonemes_to_graphemes = {}
            # Load the graphemes to phonemes data
            with open(
                    graphemes_to_phonemes_path) as graphemes_to_phonemes_file:
                for line in tqdm(
                        graphemes_to_phonemes_file,
                        total=get_line_number(graphemes_to_phonemes_path)):
                    split_line = line.rstrip("\n").split("\t")
                    word = split_line[0]
                    phonemes = tuple(split_line[1].split(" "))
                    self.phonemes_to_graphemes[phonemes] = word

        if bound_morphemes_path is not None:
            # Load morpheme data if we were given bound morphemes
            word_segmentations, bound_morphemes = self._load_morpheme_data(
                word_segmentations_path, bound_morphemes_path)
            # Update targets with predictions of the morpheme model. This is equivalent
            # to using the model residuals as the new targets.
            targets = self._get_morpheme_residuals(vocabulary,
                                                   targets,
                                                   bound_morphemes,
                                                   graphemes_to_phonemes_path,
                                                   word_segmentations,
                                                   n_jobs=n_jobs)

        # Get the ngram features for the vocabulary.
        self.X_ngram, self.ngram_to_idx = build_ngram_features(
            vocabulary=vocabulary,
            one_hot=self.one_hot,
            ngram_range=self.ngrams,
            mode=self.mode,
            freq_thres=self.min_count)
        logger.info("Shape of ElasticNet input (number of words, "
                    "number of candidate phonesthemes): {}".format(
                        self.X_ngram.shape))
        logger.info("Shape of ElasticNet targets (number of words, "
                    "vector dimension): {}".format(targets.shape))
        # Fit a MultiTaskElasticNetCV model to extract phonesthemes.
        logger.info("Fitting MultiTaskElasticNetCV")
        self.phonesthemes_reg = MultiTaskElasticNetCV(l1_ratio=l1_ratio,
                                                      n_jobs=n_jobs,
                                                      random_state=0,
                                                      cv=5)
        self.phonesthemes_reg.fit(self.X_ngram, targets)
        logger.info("Done fitting MultiTaskElasticNetCV")

        self.is_trained = True
Example #19
    
    
    def scorer(pipe, X, y):
        pred = pipe.predict(X)
        return metrics.f1_score(y, pred)

    accum = np.zeros((X.shape[1],))
    for y in np.transpose(Y):
        selector = SelectKBest(f_classif, k=selectedFeaureNum)
        selector = selector.fit(X, y)
        accum += selector.pvalues_
    selectedIndices = accum.argsort()[:selectedFeaureNum]
    def transform(X):
        return X[:, selectedIndices]     
    X_filtered, X_test_filtered =  transform(X), transform(X_test)
    clf = MultiTaskElasticNetCV(normalize=True)
    #clf = MultiTaskLasso(normalize=True)
    clf.fit(X_filtered, Y)
    predTrain = np.array(clf.predict(X_filtered))
    splits = []
    for col in range(predTrain.shape[1]):
        bestSplit, bestF1 = labanUtil.getSplitThreshold(predTrain[:, col], Y[:, col])
        splits.append(bestSplit)
    pred =  np.array(clf.predict(X_test_filtered))
    for col in range(pred.shape[1]):
        pred[:, col] = [1 if e>=splits[col] else 0 for e in pred[:, col]]
        predTrain[:, col] = [1 if e>=splits[col] else 0 for e in predTrain[:, col]]
    ps.append(metrics.precision_score(Y_test, pred))
    rs.append(metrics.recall_score(Y_test, pred))
    teF  = metrics.f1_score(Y_test, pred)
    teFs.append(teF)
Example #20
logging.info("Starting outer CV, N = {}".format(N_outer))

#Outer loop over N splits
split_index = 0
for train_idx, test_idx in cvsplitter_outer.split(X, Y, groups):
    groups_train = groups[train_idx]
    X_train = X[train_idx]
    Y_train = Y[train_idx]

    groups_test = groups[test_idx]
    X_test = X[test_idx]
    Y_test = Y[test_idx]
    
    regressor = MultiTaskElasticNetCV(
        l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
        n_jobs=args.cores,
        cv=list(cvsplitter_inner.split(X_train, Y_train, groups_train)))
    estimator = make_pipeline(imputer, regressor)
    logging.info("Training...")
    estimator.fit(X_train, Y_train)
    
    logging.info('Training: {:1.3} Testing: {:1.3}'.format(estimator.score(X_train, Y_train), estimator.score(X_test, Y_test)))
    
    out_dict={"score_train" : estimator.score(X_train, Y_train),
              "score_test" : estimator.score(X_test, Y_test),
              "intercept" : estimator.named_steps['multitaskelasticnetcv'].intercept_ ,
              "coef" : estimator.named_steps['multitaskelasticnetcv'].coef_ ,
              "alpha" : estimator.named_steps['multitaskelasticnetcv'].alpha_ ,
              "alphas" : estimator.named_steps['multitaskelasticnetcv'].alphas_ ,
              "mse_path" : estimator.named_steps['multitaskelasticnetcv'].mse_path_ ,
              "l1_ratio" : estimator.named_steps['multitaskelasticnetcv'].l1_ratio_ ,
Example #21
from sklearn.linear_model import MultiTaskElasticNet, MultiTaskElasticNetCV

#cross-validating to find best hyperparams
cv_model = MultiTaskElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                                 verbose=1)
cv_model.fit(X_train, y_train)

#fitting model with hyperparameters from above
model = MultiTaskElasticNet(alpha=cv_model.alpha_,
                            l1_ratio=cv_model.l1_ratio_,
                            random_state=0)
model.fit(X_train, y_train)

#predicting
preds = model.predict(X_test)
test_df[[
    'age', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2'
]] = preds
test_df.drop(columns=["is_train"], inplace=True)
test_df.head()

#predictions housekeeping
sub_df = cudf.melt(test_df[[
    "Id", "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
]],
                   id_vars=["Id"],
                   value_name="Predicted")
sub_df["Id"] = sub_df["Id"].astype("str") + "_" + sub_df["variable"].astype(
    "str")
sub_df = sub_df.drop("variable", axis=1).sort_values("Id")
assert sub_df.shape[0] == test_df.shape[0] * 5
Example #22
train_labels = np.vstack((import_test_labels["Ytest"], import_train["Ytrain"]))  # labels of the original train data

## Standardization
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

## PCA and Feature Selection
pca = PCA(n_components=800)
selection = SelectKBest(k=850)
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
combined_features.fit(X_train_scaled, train_labels.ravel())
# print(pca.explained_variance_ratio_)
X_train_reduced = combined_features.transform(X_train_scaled)
X_test_reduced = combined_features.transform(X_test_scaled)

## Elastic Net CV for parameter optimization
t1 = time.time()
alps = np.linspace(0.1, 0.625, 15)  # note: defined but never passed to the model
model = MultiTaskElasticNetCV(cv=3, n_jobs=-1, max_iter=25).fit(X_train_reduced, Y_train_raw)
t_lasso_cv = time.time() - t1
print("time to train", t_lasso_cv)
print("alpha", model.alpha_)
print("l1 ratio", model.l1_ratio_)


Y_predicted = model.predict(X_test_reduced)

## Save results to csv
np.savetxt("prediction.csv", Y_predicted, fmt="%.5f", delimiter=",")
Example #23
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 21 23:51:12 2016

@author: patanjali
"""

from sklearn.linear_model import MultiTaskElasticNetCV
from utils2 import load_dataset
import pandas

train, validate, test = load_dataset()

no_classes = train[:,0].max()+1
train_y = pandas.get_dummies(train[:,0])

print(no_classes, train.shape)

train = train[:201]
validate = validate[:201]
test = test[:201]

for l1_ratio in [.1, .5, .7, .9, .95, .99, 1]:
    
    model = MultiTaskElasticNetCV(l1_ratio=l1_ratio, normalize=True, verbose=True, n_jobs=3)
    model.fit(train[:,1:], train_y)
    predicted_classes = (model.predict(validate[:,1:])).argmax(1)
    
    correct = sum(predicted_classes==validate[:,0])
    print(l1_ratio, correct, correct * 1.0 / validate.shape[0])
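
The same one-hot + argmax trick as a self-contained sketch in modern Python
(make_classification and the class count are assumptions; normalize=True from
the snippet needs an older scikit-learn and is omitted here):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import MultiTaskElasticNetCV

X, y = make_classification(n_samples=200, n_features=20, n_informative=10,
                           n_classes=3, random_state=0)
Y = pd.get_dummies(y).to_numpy(dtype=float)  # one indicator column per class
clf = MultiTaskElasticNetCV(l1_ratio=0.5, cv=3).fit(X, Y)
pred = clf.predict(X).argmax(axis=1)
print("train accuracy:", (pred == y).mean())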
    
Example #24
######## Elastic Net ###################################

####Fit the model####
ElNet = ElasticNet(alpha=0.5, random_state=0).fit(x, y)
ElNet.score(x, y)  #-1.1142739679728243e-16

#Try with cross validation prediction
y_pred_ElNet = cross_val_predict(ElNet, x, y, cv=3)
r2_score(y, y_pred_ElNet)  #-0.0002686650433182912
#the best value is 0.0
mean_squared_error(y, y_pred_ElNet)  #7.85883e-05
#the best value is 0.0
mean_absolute_error(y, y_pred_ElNet)  #0.0005987262

# Multi-Task Elastic Net with CV
ElNetCV = MultiTaskElasticNetCV(random_state=0, verbose=1).fit(x, y)
## UserWarning: Objective did not converge. You might want to increase
## the number of iterations.

#Plot
start = 10000
plt.figure()
plt.pcolormesh(np.log(x[start:start + 1000, :]))
plt.ylabel('time')
plt.xlabel('freq')
plt.figure()
plt.pcolormesh(np.log(y[start:start + 1000, :]))
plt.ylabel('time')
plt.xlabel('freq')

##### S_clean #################################################################
Example #25
    SVC(kernel='poly', probability=True, degree=3),
    SVC(kernel='poly', probability=True, degree=4),
    SVC(kernel='poly', probability=True, degree=5),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    ElasticNetCV(max_iter=10000),
    LarsCV(),
    LassoCV(max_iter=10000),
    LassoLarsCV(),
    LogisticRegressionCV(scoring=multi_class_log_loss),
    MultiTaskElasticNetCV(),
    MultiTaskLassoCV(),
    OrthogonalMatchingPursuitCV(),
    RidgeClassifierCV()
]
algorithm = 17
if len(sys.argv) > 1:
    algorithm = int(sys.argv[1])

name = names[algorithm]
clf = classifiers[algorithm]
output_file_name = output_file_names[algorithm] + file_identifier

t = time.time()
random_state = np.random.RandomState(0)
print "Fitting classifier " + name
Example #26
def GetAllModelsForComparison(X_train, Y_train):
    models = {
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        #'Huber': Huber(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        #'RandomizedLasso': RandomizedLasso(),
        #'RandomizedLogisticRegression': RandomizedLogisticRegression(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'BaseEstimator': BaseEstimator(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        #'BallTree': BallTree(),
        #'DistanceMetric': DistanceMetric(),
        #'KDTree': KDTree(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        #'LSHForest': LSHForest(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        #'GaussianProcess': GaussianProcess(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        #'ABCMeta': ABCMeta(),
        #'BaseDiscreteNB': BaseDiscreteNB(),
        'BaseEstimator': BaseEstimator(),
        #'BaseNB': BaseNB(),
        'BernoulliNB': BernoulliNB(),
        'ClassifierMixin': ClassifierMixin(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        #'BaseEnsemble': BaseEnsemble(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        #'VotingClassifier': VotingClassifier(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LabelBinarizer': LabelBinarizer(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'OneVsOneClassifier': OneVsOneClassifier(),
        #'OneVsRestClassifier': OneVsRestClassifier(),
        #'OutputCodeClassifier': OutputCodeClassifier(),
        'Parallel': Parallel(),
        #'ABCMeta': ABCMeta(),
        'BaseEstimator': BaseEstimator(),
        #'ClassifierChain': ClassifierChain(),
        'ClassifierMixin': ClassifierMixin(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'MultiOutputClassifier': MultiOutputClassifier(),
        #'MultiOutputEstimator': MultiOutputEstimator(),
        #'MultiOutputRegressor': MultiOutputRegressor(),
        'Parallel': Parallel(),
        'RegressorMixin': RegressorMixin(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'BaseEstimator': BaseEstimator(),
        'IsotonicRegression': IsotonicRegression(),
        'RegressorMixin': RegressorMixin(),
        'TransformerMixin': TransformerMixin(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor()
    }
    return models
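
A registry like this is handy for smoke-testing which estimators even accept a multi-output y; a minimal sketch, assuming the enclosing function is exposed as get_models() (a hypothetical name):

import numpy as np

# Toy multi-output data; many entries in the registry will reject a 2-D y.
rng = np.random.RandomState(0)
X_toy = rng.randn(40, 5)
Y_toy = rng.randn(40, 2)

for name, model in get_models().items():  # get_models is a hypothetical name
    try:
        model.fit(X_toy, Y_toy)
        print(name, "supports multi-output y")
    except Exception as exc:
        print(name, "failed:", type(exc).__name__)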
Apply sparse linear regression (ElasticNet) for easier analysis.
Force the coefficients to be non-negative, since no drug should increase the presence of the bacteria.
'''

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=1)

folds = 5
alphas = np.logspace(1, 5, 3)
# linspace(0, 1, 2) yields only [0.0, 1.0]; l1_ratio=0 is permitted here
# because an explicit alpha grid is supplied.
l1_ratios = np.linspace(0, 1, 2, endpoint=True)

models = MultiTaskElasticNetCV(l1_ratio=l1_ratios,
                               alphas=alphas,
                               verbose=1,
                               cv=folds,
                               n_jobs=-1)
models.fit(X_train, Y_train)

print("Alpha: ", models.alpha_)
print("L1 ratio: ", models.l1_ratio_)
print("Score of Elastic-net on test data: ", models.score(X_test, Y_test))

model_EN = ElasticNet(l1_ratio=models.l1_ratio_, alpha=models.alpha_)
model_EN.fit(np.concatenate((X_train, X_test)),
             np.concatenate((Y_train, Y_test)))

test = np.rint(models.predict(X_test)).astype('int16')
coeff = model_EN.coef_.T
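
The docstring above asks for non-negative coefficients, but MultiTaskElasticNet(CV) exposes no positive option; one workaround is to refit per target with single-task ElasticNet, which does accept positive=True. A minimal sketch reusing the cross-validated alpha_ and l1_ratio_ (note this forfeits the joint sparsity pattern of the multi-task penalty):

from sklearn.linear_model import ElasticNet
import numpy as np

# One non-negative ElasticNet per target column, reusing the CV-selected
# hyperparameters from `models` above.
nonneg_coefs = []
for col in range(Y_train.shape[1]):
    en = ElasticNet(alpha=models.alpha_, l1_ratio=models.l1_ratio_,
                    positive=True)  # constrains coef_ >= 0
    en.fit(X_train, Y_train[:, col])
    nonneg_coefs.append(en.coef_)
coeff_nonneg = np.stack(nonneg_coefs, axis=1)  # (n_features, n_targets), like coeff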
Пример #28
0
class PhonesthemesModel(object):
    """
    Attributes
    ----------
    self.config: Dict
        A dictionary of the arguments passed into the object.

    self.ngrams: List[int]
        A list of integers that refer to the ngram sizes to use.

    self.mode: List[str]
        List of str indicating the positions in the word to use
        as candidate phonesthemes. Possible elements are "start",
        "end", and "all".

    self.min_count: int
        Minimum number of ngram occurrences for an ngram to be included
        as a feature.

    self.one_hot: bool
        Whether or not to use one-hot features instead of counts for
        the phonestheme ngram features.

    self.vectors
        Dictionary of word to vector, where word is either a string or a
        tuple of strings (phoneme representation).

    self.phonesthemes_reg
        The MultiTaskElasticNetCV model fit on the phonestheme feature vectors to
        predict the phonestheme targets.

    self.X_ngram
        The input feature vectors used to fit the Elastic Net.

    self.ngram_to_idx
        A mapping from ngram to feature index of X_ngram.

    self.is_trained
        A boolean describing whether this model has been trained or not.
    """
    def __init__(self, ngrams, mode, min_count, one_hot):
        self.config = locals()
        self.config.pop("self")
        self.config.pop("__class__", None)

        logger.info("Config: ")
        pprint.pprint(self.config)

        self.ngrams = ngrams
        self.mode = mode
        self.min_count = min_count
        self.one_hot = one_hot

        # Placeholder values, these get set when we call train
        self.vectors = None
        self.phonesthemes_reg = None
        self.X_ngram = None
        self.ngram_to_idx = None
        self.phonemes_to_graphemes = None

        self.is_trained = False

    def get_phonesthemes(self):
        return get_phonesthemes_from_model(self)

    def train(self,
              vectors_path,
              bound_morphemes_path=None,
              word_segmentations_path=None,
              graphemes_to_phonemes_path=None,
              n_jobs=1,
              l1_ratio=0.5):
        train_config = locals()
        train_config.pop("self")
        train_config.pop("__class__", None)
        self.config["train_config"] = train_config
        logger.info("Train config: ")
        pprint.pprint(train_config)

        # Load vectors, where the keys can be words represented as
        # sequences of characters (normal word vectors) or words represented
        # as sequences of phonemes (phonemicized vectors).
        logger.info("Reading vectors from {}".format(vectors_path))
        self.vectors = OrderedDict()
        with open(vectors_path) as vectors_file:
            for line in tqdm(vectors_file,
                             total=get_line_number(vectors_path)):
                split_line = line.rstrip("\n").split()
                word = split_line[0]
                # If we have phonemicized vectors, the keys to the dict are
                # tuples of comma-separated phonemes representing a word.
                if graphemes_to_phonemes_path is not None:
                    word = tuple(word.split(","))
                embedding = np.array([float(val) for val in split_line[1:]])
                self.vectors[word] = embedding

        # Randomly shuffle the OrderedDict
        random_seed = 0
        logger.info(
            "Shuffling vectors with random seed {}".format(random_seed))
        random.seed(random_seed)
        vector_items = list(self.vectors.items())
        # random.shuffle is in-place
        random.shuffle(vector_items)
        self.vectors = OrderedDict(vector_items)

        vocabulary = list(self.vectors.keys())
        targets = np.asarray(list(self.vectors.values()))

        # Load phonemes to graphemes if we were given g2p data
        if graphemes_to_phonemes_path:
            logger.info("Reading graphemes to phonemes data "
                        "from {}".format(graphemes_to_phonemes_path))
            self.phonemes_to_graphemes = {}
            # Load the graphemes to phonemes data
            with open(
                    graphemes_to_phonemes_path) as graphemes_to_phonemes_file:
                for line in tqdm(
                        graphemes_to_phonemes_file,
                        total=get_line_number(graphemes_to_phonemes_path)):
                    split_line = line.rstrip("\n").split("\t")
                    word = split_line[0]
                    phonemes = tuple(split_line[1].split(" "))
                    self.phonemes_to_graphemes[phonemes] = word

        if bound_morphemes_path is not None:
            # Load morpheme data if we were given bound morphemes
            word_segmentations, bound_morphemes = self._load_morpheme_data(
                word_segmentations_path, bound_morphemes_path)
            # Update targets with predictions of the morpheme model. This is equivalent
            # to using the model residuals as the new targets.
            targets = self._get_morpheme_residuals(vocabulary,
                                                   targets,
                                                   bound_morphemes,
                                                   graphemes_to_phonemes_path,
                                                   word_segmentations,
                                                   n_jobs=n_jobs)

        # Get the ngram features for the vocabulary.
        self.X_ngram, self.ngram_to_idx = build_ngram_features(
            vocabulary=vocabulary,
            one_hot=self.one_hot,
            ngram_range=self.ngrams,
            mode=self.mode,
            freq_thres=self.min_count)
        logger.info("Shape of ElasticNet input (number of words, "
                    "number of candidate phonesthemes): {}".format(
                        self.X_ngram.shape))
        logger.info("Shape of ElasticNet targets (number of words, "
                    "vector dimension): {}".format(targets.shape))
        # Fit a MultiTaskElasticNetCV model to extract phonesthemes.
        logger.info("Fitting MultiTaskElasticNetCV")
        self.phonesthemes_reg = MultiTaskElasticNetCV(l1_ratio=l1_ratio,
                                                      n_jobs=n_jobs,
                                                      random_state=0,
                                                      cv=5)
        self.phonesthemes_reg.fit(self.X_ngram, targets)
        logger.info("Done fitting MultiTaskElasticNetCV")

        self.is_trained = True

    def _load_morpheme_data(self, word_segmentations_path,
                            bound_morphemes_path):
        # Load word segmentations
        word_segmentations = {}
        if word_segmentations_path:
            logger.info("Loading word segmentations from {}".format(
                word_segmentations_path))
            with open(word_segmentations_path) as word_segmentations_file:
                for line in tqdm(
                        word_segmentations_file,
                        total=get_line_number(word_segmentations_path)):
                    split_line = line.rstrip("\n").split("\t")
                    assert len(split_line) == 2
                    word = split_line[0]
                    morphemes = split_line[1].split(" ")
                    word_segmentations[word] = morphemes
            logger.info("Loaded {} word segmentations".format(
                len(word_segmentations)))

        # Load the list of bound morphemes
        logger.info(
            "Loading bound morphemes from {}".format(bound_morphemes_path))
        bound_morphemes = []
        with open(bound_morphemes_path) as bound_morphemes_file:
            for line in tqdm(bound_morphemes_file,
                             total=get_line_number(bound_morphemes_path)):
                bound_morphemes.append(line.rstrip("\n"))
        logger.info("Loaded {} bound morphemes".format(len(bound_morphemes)))
        return (word_segmentations, bound_morphemes)

    def _get_morpheme_residuals(self,
                                vocabulary,
                                targets,
                                bound_morphemes,
                                graphemes_to_phonemes_path,
                                word_segmentations=None,
                                n_jobs=1):
        # Get the vectors vocabulary, and convert to string if we are using
        # phonemicized vectors.
        if graphemes_to_phonemes_path is None:
            string_vectors_vocab = vocabulary
        else:
            # The vocab of the phonemicized vectors converted to graphemes.
            string_vectors_vocab = [
                self.phonemes_to_graphemes[phonemes] for phonemes in vocabulary
            ]
        # Build the morpheme feature vectors.
        morpheme_features = build_morpheme_features(string_vectors_vocab,
                                                    bound_morphemes,
                                                    word_segmentations)
        logger.info("Input shape for morpheme pretraining linear regression "
                    "(number of words, number of morphemes): {}".format(
                        morpheme_features.shape))
        logger.info("Target shape for morpheme pretraining linear regression "
                    "(number of words, vector dimension): {}".format(
                        targets.shape))
        morph_reg = LinearRegression(n_jobs=n_jobs)
        logger.info("Pretraining on morpheme features.")
        morph_reg = morph_reg.fit(morpheme_features, targets)
        logger.info("Calculating residuals of of linear regression done "
                    "on morpheme features and using that as the train "
                    "vectors for the ngram feature model.")

        # Get the residuals of the model for use in the second model.
        morph_reg_pred_y = morph_reg.predict(morpheme_features)
        morph_reg_residuals = np.subtract(targets, morph_reg_pred_y)
        return morph_reg_residuals

    def __eq__(self, other):
        # Two PhonesthemesModel objects are the same if their members are
        # the same.
        # Compare their ngrams
        if self.ngrams != other.ngrams:
            return False
        # Compare their mode
        if self.mode != other.mode:
            return False
        # Compare their min count
        if self.min_count != other.min_count:
            return False
        # Compare whether they use one-hot or frequency features
        if self.one_hot != other.one_hot:
            return False
        # Compare that they have the same set of vectors in the same order
        if len(self.vectors) != len(other.vectors):
            return False
        for this_word, other_word in zip(self.vectors, other.vectors):
            if this_word != other_word:
                return False
            if not np.allclose(self.vectors[this_word],
                               other.vectors[this_word]):
                return False
        # Check that they were trained on the same features
        if not np.allclose(self.X_ngram, other.X_ngram):
            return False
        # Check that they have the same mapping of ngram to feature idx
        if self.ngram_to_idx != other.ngram_to_idx:
            return False
        return True

    if six.PY2:

        def __ne__(self, other):
            equal = self.__eq__(other)
            return equal if equal is NotImplemented else not equal
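
The two-stage scheme in train() and _get_morpheme_residuals (regress the vectors on morpheme features, then fit the elastic net on the residuals) can be exercised standalone; a toy sketch on synthetic data, where all names, shapes, and sizes are illustrative:

import numpy as np
from sklearn.linear_model import LinearRegression, MultiTaskElasticNetCV

rng = np.random.RandomState(0)
n_words, n_morphemes, n_ngrams, dim = 200, 10, 30, 16
morpheme_features = rng.randint(0, 2, size=(n_words, n_morphemes)).astype(float)
ngram_features = rng.randint(0, 2, size=(n_words, n_ngrams)).astype(float)
targets = rng.randn(n_words, dim)

# Stage 1: explain away whatever the morphemes account for.
morph_reg = LinearRegression().fit(morpheme_features, targets)
residuals = targets - morph_reg.predict(morpheme_features)

# Stage 2: fit the multi-task elastic net on the residual signal, so the
# selected ngrams capture structure not already explained by morphemes.
phon_reg = MultiTaskElasticNetCV(l1_ratio=0.5, cv=5, random_state=0)
phon_reg.fit(ngram_features, residuals)
print(phon_reg.alpha_, np.count_nonzero(phon_reg.coef_.any(axis=0)))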
Пример #29
0
# Concatenate the discrete and continuous features
x_vec = np.concatenate((x_vec_con, x_vec_dis), axis=1)

# Extract the prediction targets
y_registered = bike_rel['registered'].values.astype(float)
y_casual = bike_rel['casual'].values.astype(float)

y = np.stack((y_registered, y_casual), axis=1)

# Build the models and predict
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import MultiTaskElasticNetCV
x1, x2, y1, y2 = train_test_split(x_vec, y, test_size=0.2, random_state=20)

############ Lasso
mtl = MultiTaskLassoCV(alphas=np.logspace(-3, -1, 3), cv=8, verbose=3)
mtl.fit(x1, y1)
print(mtl.score(x1, y1))
print(mtl.score(x2, y2))

############ ElasticNetCV
mte = MultiTaskElasticNetCV(l1_ratio=np.logspace(-3, -1, 3),
                            alphas=np.logspace(-3, -1, 3),
                            cv=8,
                            verbose=3)
mte.fit(x1, y1)
print(mte.score(x1, y1))
print(mte.score(x2, y2))
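
A quick sanity check after fitting: if the selected alpha sits on an edge of the supplied grid, the grid probably needs widening. A short sketch using attributes the two fitted models expose (mse_path_ follows the (n_l1_ratio, n_alphas, n_folds) convention):

# If the chosen alpha is at the boundary of the grid, widen the grid.
print("Lasso alpha:", mtl.alpha_, "grid:", mtl.alphas_)
print("ENet alpha:", mte.alpha_, "l1_ratio:", mte.l1_ratio_)
# mse_path_ is (n_l1_ratio, n_alphas, n_folds); average out the folds.
print("Mean CV MSE per (l1_ratio, alpha):")
print(mte.mse_path_.mean(axis=-1))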
Пример #30
0
p(mean_squared_error(lasso_predict, Y_test))

# ## Ridge
#


ridge_model = Ridge(alpha=0.01)
ridge_model = ridge_model.fit(X=X_train, y=Y_train)

ridge_predict = ridge_model.predict(X_test)

p(mean_absolute_error(ridge_predict, Y_test))
p(mean_squared_error(ridge_predict, Y_test))

# ## Elastic Net


enet_params = {
    'alpha': [1e-7],
}

enet_model = MultiTaskElasticNetCV(alphas=enet_params['alpha'])
enet_model = enet_model.fit(X=X_train, y=Y_train)

enet_predict = enet_model.predict(X_test)

p(mean_absolute_error(enet_predict, Y_test))
p(mean_squared_error(enet_predict, Y_test))
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel

# Linear Models
from sklearn.linear_model import LassoCV
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import MultiTaskElasticNetCV
from sklearn.linear_model import RidgeCV

# SVM
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.svm import NuSVC

clf = MultiTaskElasticNetCV()

# Set a minimum threshold of 0.23
sfm = SelectFromModel(clf, threshold=0.23)
sfm.fit(trX, trY)
new_trX = sfm.transform(trX)
n_features = new_trX.shape[1]
print(sfm.get_support(indices=True))




pd.plotting.scatter_matrix(ttl_X, diagonal="kde")
plt.tight_layout()
plt.show()
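
For a multi-task estimator, SelectFromModel aggregates each feature's coefficients across all targets into a single importance before comparing it with the threshold; a self-contained sketch on synthetic data where only the first five of twenty features carry signal (all names and sizes are illustrative):

import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import MultiTaskElasticNetCV

# Synthetic multi-output data: only features 0..4 influence the targets.
rng = np.random.RandomState(0)
X_demo = rng.randn(100, 20)
W = np.zeros((20, 3))
W[:5] = rng.randn(5, 3)
Y_demo = X_demo @ W + 0.1 * rng.randn(100, 3)

selector = SelectFromModel(MultiTaskElasticNetCV(cv=3), threshold='mean')
selector.fit(X_demo, Y_demo)
print(selector.get_support(indices=True))  # should recover mostly [0..4]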
Пример #32
0
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)
netTrainFs = []
lastX = np.zeros((X_raw.shape[0], hiddenSize))

for i in range(epochs // quanta):
    print('Epoch:', i * quanta)
    an.trainSupervised(quanta, trndata,
        initialLearningrate=learningrate,
        decay=1,  # 0.999
        myWeightdecay=weightDecay,
        momentum=momentum)
    netTrainFs.append(an.scoreOnDS(trndata))
    X, X_test = an.transform(X_raw), an.transform(X_test_raw)
    if (lastX == X).all():
        raise RuntimeError('transformed features did not change between epochs')
    lastX = copy.deepcopy(X)
    clf = MultiTaskElasticNetCV()
    clf.fit(X, Y)
    predTrain = np.array(clf.predict(X))
    splits = []
    for col in range(predTrain.shape[1]):
        bestSplit, bestF1 = labanUtil.getSplitThreshold(predTrain[:, col], Y[:, col])
        splits.append(bestSplit)
    pred = np.array(clf.predict(X_test))
    for col in range(pred.shape[1]):
        pred[:, col] = [1 if e >= splits[col] else 0 for e in pred[:, col]]
        predTrain[:, col] = [1 if e >= splits[col] else 0 for e in predTrain[:, col]]

    # Multilabel targets require an explicit average for f1_score.
    testFs.append(metrics.f1_score(Y_test, pred, average='macro'))
    trainFs.append(metrics.f1_score(Y, predTrain, average='macro'))
#des+='\n EN test f1: '+ str(testF)
#des+=' , EN train f1: '+ str(trainF)
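
labanUtil.getSplitThreshold is external to this snippet; a hypothetical sketch of what such a helper might do — scan candidate cutoffs over the continuous predictions for one column and keep the one maximizing binary F1 (the signature and behavior are assumptions, not the original implementation):

import numpy as np
from sklearn import metrics

def get_split_threshold(scores, labels):
    """Pick the cutoff on continuous scores that maximizes binary F1.

    Hypothetical stand-in for labanUtil.getSplitThreshold; the real
    helper may differ.
    """
    best_split, best_f1 = 0.5, -1.0
    for split in np.unique(scores):
        pred = (scores >= split).astype(int)
        f1 = metrics.f1_score(labels, pred)
        if f1 > best_f1:
            best_split, best_f1 = split, f1
    return best_split, best_f1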