Example #1
def LassoICReg(df, z, Ncy0, Ty0):
    reg = linear_model.LassoLarsIC(criterion='bic')
    reg2 = linear_model.LassoLarsIC(criterion='bic')
    X = df.loc[:, 'ya0':'Tx']
    y = df.loc[:, 'Ty']
    y2 = df.loc[:, 'Ncy']
    #z= df.loc[:,'ya0':'Tx']
    reg.fit(X, y)
    reg2.fit(X, y2)
    predicted = reg.predict(z)
    predicted2 = reg2.predict(z)
    print('-----------------------------')
    print("Regressão de Lasso IC (BIC)")
    print(print('Ty: ', predicted))
    print(print('Ncy: ', predicted2))

    index = list(range(len(z.index)))
    d = {
        'index': index,
        'Ty': predicted,
        'Ncy': predicted2,
        'Ncy0': Ncy0,
        'Ty0': Ty0
    }
    df2 = pd.DataFrame(data=d)
    return df2
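A hedged usage sketch for LassoICReg above, assuming the imports it needs and a frame containing the 'ya0'..'Tx' feature block plus the 'Ty' and 'Ncy' targets; the toy data and column layout are illustrative assumptions, not taken from the original project.

import numpy as np
import pandas as pd
from sklearn import linear_model

rng = np.random.RandomState(0)
train = pd.DataFrame({'ya0': rng.rand(50), 'Tx': rng.rand(50),
                      'Ty': rng.rand(50), 'Ncy': rng.rand(50)})
z = train.loc[:4, 'ya0':'Tx']   # rows to score with the fitted models
print(LassoICReg(train, z, Ncy0=0.0, Ty0=0.0))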
Example #2
def test_no_warning_for_zero_mse():
    # LassoLarsIC should not warn for log of zero MSE.
    y = np.arange(10, dtype=float)
    X = y.reshape(-1, 1)
    lars = linear_model.LassoLarsIC(normalize=False)
    assert_no_warnings(lars.fit, X, y)
    assert_true(np.any(np.isinf(lars.criterion_)))
Example #3
def selector(ts_name, lang, a_df, r_df, cutoff_date):
    # for each language, include only regressors that correlate well with the time series to forecast
    # use lasso to find out which ones are to be kept
    r_list = [c for c in r_df.columns if c not in ['ds', 'language']]
    ycol = [c for c in a_df.columns if c not in ['ds', 'language']][0]

    if len(a_df) == 0:
        s_ut.my_print('WARNING: empty actuals for ' + lang + ' and ts ' +
                      ts_name)
        return None
    if len(r_df) == 0:
        s_ut.my_print('WARNING: empty regressors for ' + lang + ' and ts ' +
                      ts_name)
        return None

    ds_min = max(a_df['ds'].min(), r_df['ds'].min())
    s_df = r_df[(r_df['ds'] >= ds_min) & (r_df['ds'] <= cutoff_date)].copy()
    b_df = a_df[(a_df['ds'] >= ds_min) & (a_df['ds'] <= cutoff_date)].copy()

    df = b_df.merge(s_df, on=['ds', 'language'], how='left')
    df.dropna(inplace=True)
    X_train = df[r_list].values
    y_train = df[ycol].values
    lasso_mdl = l_mdl.LassoLarsIC(criterion='aic', normalize=True)
    lasso_mdl.fit(X_train, y_train)
    m_pars = lasso_mdl.coef_  # pars excluding intercept
    new_r_list = [r_list[i] for i in range(len(r_list)) if m_pars[i] != 0.0]
    s_ut.my_print('regressor selector::regressors for ' + ts_name +
                  ' and language ' + lang + ': ' + str(new_r_list))
    return r_df[['ds', 'language'] +
                new_r_list].copy() if len(new_r_list) > 0 else None
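The selection idea above (keep only the regressors whose LassoLarsIC coefficient is non-zero) as a minimal standalone sketch on synthetic data; the column names and the AIC criterion are illustrative.

import numpy as np
import pandas as pd
from sklearn import linear_model

rng = np.random.RandomState(0)
regressors = pd.DataFrame(rng.randn(200, 5), columns=list('abcde'))
target = 3.0 * regressors['a'] - 2.0 * regressors['c'] + 0.1 * rng.randn(200)

mdl = linear_model.LassoLarsIC(criterion='aic')
mdl.fit(regressors.values, target.values)
kept = [c for c, w in zip(regressors.columns, mdl.coef_) if w != 0.0]
print('kept regressors:', kept)   # the informative columns 'a' and 'c' are typically kept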
Example #4
def lasso_selection(lang, lang_f, af, normalize=True):
    cfg_list = lang_f['cfg_idx'].unique()
    f = lang_f[lang_f['cfg_idx'].isin(cfg_list)]
    if len(f) == 0:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: no data for cfg_list: ' + str(cfg_list))
        return None
    pf = pd.pivot_table(f[['ds', 'yhat', 'cfg_idx', 'language']], index=['ds', 'language'], columns='cfg_idx', values='yhat').reset_index(level=0).reset_index(drop=True)
    pf.dropna(inplace=True)  # ensure all fcast cfgs have the same time range
    df = pf.merge(af, on='ds', how='inner')
    pars = len(cfg_list)     # regression parameters: one per fcast_cfg plus 3 extra pars: constant + 2 regularization
    ndata = len(df)          # data points
    while ndata < pars:      # more regression unknowns than data
        s_ut.my_print('pid: ' + str(os.getpid()) + ' WARNING: not enough data for cfg_list: ' + str(cfg_list))
        cfg_list = cfg_list[:int(0.75 * ndata)]   # pick the top one by score
        pars = len(cfg_list)
    if len(cfg_list) == 0:
        s_ut.my_print('pid: ' + str(os.getpid()) + ' ERROR: not enough data for cfg_list: ' + str(cfg_list))
        return None

    X_train = df[cfg_list].values
    y_train = df['y'].values
    lasso_mdl = l_mdl.LassoLarsIC(criterion='aic', normalize=normalize)
    lasso_mdl.fit(X_train, y_train)
    m_pars = lasso_mdl.coef_              # pars excluding intercept
    new_cfg_list = [int(cfg_list[i]) for i in range(len(cfg_list)) if m_pars[i] != 0.0]
    return {'cfg_idx': new_cfg_list, 'alpha': lasso_mdl.alpha_, 'language': lang, 'normalize': normalize}
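lasso_selection above weights competing forecast configs by regressing actuals on their predictions and keeping the configs with non-zero coefficients; a minimal standalone sketch of that idea on synthetic forecasts (the config ids and noise levels are illustrative).

import numpy as np
import pandas as pd
from sklearn import linear_model

rng = np.random.RandomState(1)
actual = rng.randn(120).cumsum()
forecasts = pd.DataFrame({
    0: actual + 0.2 * rng.randn(120),   # a config that tracks the actuals
    1: actual + 0.3 * rng.randn(120),   # another reasonable config
    2: rng.randn(120).cumsum(),         # an unrelated config
})
mdl = linear_model.LassoLarsIC(criterion='aic')
mdl.fit(forecasts.values, actual)
kept_cfgs = [int(c) for c, w in zip(forecasts.columns, mdl.coef_) if w != 0.0]
print('configs kept:', kept_cfgs, 'alpha:', mdl.alpha_)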
Example #5
 def _regression_B(self, X):
     residual_flag_ = []
     causal_matrix = self.B_prune.copy()
     reg_list = {i: causal_matrix[i, :] != 0 for i in range(self.n_dim)}
     for i in range(self.n_dim):
         if np.sum(reg_list[i]) != 0:
             y_reg = X[:, i]
             X_reg = X.T[reg_list[i]].T
             if self.reg_type == "lasso":
                 clf = linear_model.LassoLarsIC(criterion=self.criterion,
                                                max_iter=self.max_iter_ls,
                                                precompute="auto")
             else:
                 clf = linear_model.LinearRegression()
             clf.fit(y=y_reg.reshape(self.n_samples, -1),
                     X=X_reg.reshape(self.n_samples, -1))
             residual = y_reg.reshape(self.n_samples, -1) - clf.predict(
                 X_reg.reshape(self.n_samples, -1))
             if self.shapiro == True:
                 if stats.shapiro(residual)[1] > 0.05:
                     norm_flag = True
                 else:
                     norm_flag = False
                 residual_flag_.append(norm_flag)
             causal_matrix[i, reg_list[i]] = clf.coef_
         else:
             norm_flag = False
             residual_flag_.append(norm_flag)
     self.residual_flag = residual_flag_
     return causal_matrix
Example #6
    def choose_optimizer(self,
                         LassoType='Lasso',
                         RegCoef=0.00001,
                         cv=5,
                         criterion='aic',
                         maxiter=10000,
                         tolerance=0.0001,
                         normalize=True):

        if LassoType == 'Lasso':
            lin = linear_model.Lasso(alpha=RegCoef,
                                     max_iter=maxiter,
                                     normalize=normalize,
                                     tol=tolerance)
        elif LassoType == 'LassoCV':
            lin = linear_model.LassoCV(cv=cv,
                                       normalize=normalize,
                                       max_iter=maxiter)
        elif LassoType == 'LassoLarsCV':
            lin = linear_model.LassoLarsCV(cv=cv,
                                           normalize=normalize,
                                           max_iter=maxiter)
        elif LassoType == 'LarsCV':
            lin = linear_model.LarsCV(cv=cv,
                                      normalize=normalize,
                                      max_iter=maxiter)
        elif LassoType == 'LassoLarsIC':
            lin = linear_model.LassoLarsIC(criterion=criterion,
                                           normalize=normalize,
                                           max_iter=maxiter)
        else:
            raise Exception("wrong option")

        return lin
Example #7
def test_sk_LassoLarsIC():
    print("Testing sklearn, LassoLarsIC...")
    mod = linear_model.LassoLarsIC()
    X, y = iris_data
    mod.fit(X, y)
    docs = {'name': "LassoLarsIC test"}
    fv = X[0, :]
    upload(mod, fv, docs)
Example #8
def test_lasso_lars_ic():
    """ Test the LassoLarsIC object by checking that
        - some good features are selected.
        - alpha_bic > alpha_aic
        - n_nonzero_bic < n_nonzero_aic
    """
    lars_bic = linear_model.LassoLarsIC('bic')
    lars_aic = linear_model.LassoLarsIC('aic')
    rng = np.random.RandomState(42)
    X = diabetes.data
    y = diabetes.target
    X = np.c_[X, rng.randn(X.shape[0], 4)]  # add 4 bad features
    lars_bic.fit(X, y)
    lars_aic.fit(X, y)
    nonzero_bic = np.where(lars_bic.coef_)[0]
    nonzero_aic = np.where(lars_aic.coef_)[0]
    assert_greater(lars_bic.alpha_, lars_aic.alpha_)
    assert_less(len(nonzero_bic), len(nonzero_aic))
    assert_less(np.max(nonzero_bic), diabetes.data.shape[1])
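The assertions above rest on BIC penalizing model complexity more heavily than AIC; a minimal standalone sketch of the same comparison, assuming scikit-learn and its bundled diabetes dataset.

import numpy as np
from sklearn import datasets, linear_model

X, y = datasets.load_diabetes(return_X_y=True)
rng = np.random.RandomState(0)
X = np.c_[X, rng.randn(X.shape[0], 4)]   # append pure-noise features

aic = linear_model.LassoLarsIC(criterion='aic').fit(X, y)
bic = linear_model.LassoLarsIC(criterion='bic').fit(X, y)
print('AIC: alpha_ =', aic.alpha_, 'non-zero coefs =', int(np.sum(aic.coef_ != 0)))
print('BIC: alpha_ =', bic.alpha_, 'non-zero coefs =', int(np.sum(bic.coef_ != 0)))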
Example #9
def bootstrap(miR_exp, mRNA_exp, num_mrna):
    clf = linear_model.LassoLarsIC(criterion='bic')
    boot_coll = []
    std = []
    for i in range(1000):
        rand_mrna = np.transpose(np.random.permutation(np.transpose(mRNA_exp)))
        clf.fit(np.transpose(rand_mrna), np.transpose(miR_exp))
        boot_coll.append(clf.coef_)
    stdev = np.std(boot_coll, axis=0)
    return stdev
Example #10
 def test_model_lasso_lars_ic(self):
     model, X = fit_regression_model(linear_model.LassoLarsIC())
     model_onnx = convert_sklearn(
         model,
         "lasso lars cv", [("input", FloatTensorType([None, X.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(X,
                         model,
                         model_onnx,
                         basename="SklearnLassoLarsIC-Dec4")
Example #11
def test_lasso_lars_ic():
    # Test the LassoLarsIC object by checking that
    # - some good features are selected.
    # - alpha_bic > alpha_aic
    # - n_nonzero_bic < n_nonzero_aic
    lars_bic = linear_model.LassoLarsIC('bic')
    lars_aic = linear_model.LassoLarsIC('aic')
    rng = np.random.RandomState(42)
    X = diabetes.data
    X = np.c_[X, rng.randn(X.shape[0], 5)]  # add 5 bad features
    lars_bic.fit(X, y)
    lars_aic.fit(X, y)
    nonzero_bic = np.where(lars_bic.coef_)[0]
    nonzero_aic = np.where(lars_aic.coef_)[0]
    assert lars_bic.alpha_ > lars_aic.alpha_
    assert len(nonzero_bic) < len(nonzero_aic)
    assert np.max(nonzero_bic) < diabetes.data.shape[1]

    # test error on unknown IC
    lars_broken = linear_model.LassoLarsIC('<unknown>')
    assert_raises(ValueError, lars_broken.fit, X, y)
Example #12
def regressionMethods(independent, dependent, regType=0):
	if regType == 0:
		clf = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
	elif regType == 1:
		clf = linear_model.LassoCV(alphas=[0.1, 1.0, 10.0])
	elif regType == 2:
		clf = linear_model.LassoLarsIC(criterion='bic')
	elif regType == 3:
		clf = linear_model.ElasticNetCV(alphas=[0.1, 1.0, 10.0])
		
	clf.fit(independent, dependent)
	return clf
Example #13
 def test_model_lasso_lars_ic(self):
     model, X = _fit_model(linear_model.LassoLarsIC())
     model_onnx = convert_sklearn(model, "lasso lars cv",
                                  [("input", FloatTensorType(X.shape))])
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         X.astype(numpy.float32),
         model,
         model_onnx,
         basename="SklearnLassoLarsIC-Dec4",
         allow_failure="StrictVersion("
         "onnxruntime.__version__)"
         "<= StrictVersion('0.2.1')",
     )
Example #14
 def test_model_lasso_lars_ic(self):
     model, X = fit_regression_model(linear_model.LassoLarsIC())
     model_onnx = convert_sklearn(
         model,
         "lasso lars cv", [("input", FloatTensorType([None, X.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnLassoLarsIC-Dec4",
         allow_failure="StrictVersion("
         "onnxruntime.__version__)"
         "<= StrictVersion('0.2.1')",
     )
Example #15
def cal_dep(Seq_dict, Deg_dict, miR_exp, mRNA_exp):
    f = open('MicroTrans_results.txt', "w")
    f.write('microRNA')
    f.write('\t')
    f.write('mRNA')
    f.write('\t')
    f.write('p-value')
    f.write('\n')
    #perform lasso regression
    lasso_reg = {}
    for (Key_mirna, Value_mrna) in Seq_dict.items():
        print('processing microRNA ' + Key_mirna, end="\r")
        TotalmRNA = []
        for i in range(len(Value_mrna)):
            TotalmRNA.append(mRNA_exp[Value_mrna[i]])
        clf = linear_model.LassoLarsIC(criterion='bic')
        clf.fit(np.transpose(np.asarray(TotalmRNA)),
                np.asarray(miR_exp[Key_mirna]))
        if len(np.nonzero(clf.coef_)[0]) == 0:
            continue
        stdev = bootstrap(np.asarray(miR_exp[Key_mirna]),
                          np.asarray(TotalmRNA), len(Value_mrna))
        for j in range(len(clf.coef_)):
            if clf.coef_[j] != 0:
                lasso_reg[(Key_mirna, Value_mrna[j])] = 1 - round(
                    scipy.stats.norm(0, 1).cdf(clf.coef_[j] / stdev[j]), 3)
    lasso_reg_set = set(lasso_reg)
    deg_set = set(Deg_dict)
    sharedKey = {}
    for inter_key in lasso_reg_set.intersection(deg_set):
        sharedKey[(inter_key[0], inter_key[1])] = 1
        Pvalue = scipy.stats.combine_pvalues(
            [float(lasso_reg[inter_key]),
             float(Deg_dict[inter_key][0])],
            method='fisher')
        output(inter_key[0], inter_key[1], Pvalue, f)
    for uniq_key in lasso_reg.keys():
        if uniq_key not in sharedKey.keys():
            output(uniq_key[0], uniq_key[1], lasso_reg[uniq_key], f)
    for uniq_key in Deg_dict.keys():
        if uniq_key not in sharedKey.keys():
            output(uniq_key[0], uniq_key[1], Deg_dict[uniq_key][0], f)
    f.close()
    print('Successfully finished, the results are in MicroTrans_results.txt')
    return None
Example #16
 def models(self) -> Dict[str, LinearModel]:
     return {
         "LinearRegression": linear_model.LinearRegression(
         ),  # LinearRegression([…])	Ordinary least squares Linear Regression.
         "RidgeCV": linear_model.RidgeCV(
             cv=5
         ),  # RidgeCV([alphas, …])	Ridge regression with built-in cross-validation.
         "LassoLars": linear_model.LassoLars(
             eps=0.01
         ),  # LassoLars([alpha, …])	Lasso model fit with Least Angle Regression a.k.a.
         "LassoLarsIC": linear_model.LassoLarsIC(
             eps=0.01
         ),  # LassoLarsIC([criterion, …])	Lasso model fit with Lars using BIC or AIC for model selection
         "ARDRegression": linear_model.ARDRegression(
         ),  #  ARDRegression([n_iter, tol, …])	Bayesian ARD regression.
         "ElasticNet": linear_model.ElasticNet(
         ),  # linear_model.ElasticNet([alpha, l1_ratio, …])	Linear regression with combined L1 and L2 priors as regularizer.
     }
Example #17
    def _estimate_model(self):
        """Estimates lasso regression object.

        Returns
        -------
        model : sklearn lasso regression or lasso cv object
            Fitted lasso model.
        """
        ###Lars Algorithm
        if self.solver == "Lars":
            self.underlying = linear_model.LassoLars(
                fit_intercept=self.intercept, normalize=False)
            if self.cv_folds == 'IC':  # For AIC/BIC, the criterion kwarg should be provided.
                model = linear_model.LassoLarsIC(fit_intercept=self.intercept,
                                                 normalize=False,
                                                 **self.kwargs)
            elif self.cv_folds is not None:
                model = linear_model.LassoLarsCV(fit_intercept=self.intercept,
                                                 cv=self.cv_folds,
                                                 normalize=False,
                                                 **self.kwargs)
            else:
                model = linear_model.Lasso(fit_intercept=self.intercept,
                                           **self.kwargs)
        ###Coordinate Descent Algorithm
        elif self.solver == "Coordinate Descent":
            self.underlying = linear_model.Lasso(fit_intercept=self.intercept)
            if self.cv_folds is not None:
                model = linear_model.LassoCV(fit_intercept=self.intercept,
                                             cv=self.cv_folds,
                                             **self.kwargs)
            else:
                model = linear_model.Lasso(fit_intercept=self.intercept,
                                           **self.kwargs)
        else:
            raise NotImplementedError(
                'Solver not implemented. Choices are Lars or Coordinate Descent.'
            )
        #self.model.fit(np.asanyarray(self.x_train.values,order='F'), self.y_train)
        model.fit(self.x_train, self.y_train)
        return model
Example #18
    def test_LassoLarsIC(self, criterion):
        diabetes = datasets.load_diabetes()
        X = diabetes.data
        y = diabetes.target

        X = pp.normalize(X)

        df = pdml.ModelFrame(diabetes)
        df.data = df.data.pp.normalize()

        mod1 = lm.LassoLarsIC(criterion=criterion)
        mod1.fit(X, y)

        mod2 = df.lm.LassoLarsIC(criterion=criterion)
        df.fit(mod2)
        self.assertAlmostEqual(mod1.alpha_, mod2.alpha_)

        expected = mod1.predict(X)
        predicted = df.predict(mod2)
        self.assertIsInstance(predicted, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(predicted.values, expected)
Example #19
print("Set up KFolds...")
n_splits = 5
kf = KFold(n_splits=n_splits)
kf.get_n_splits(X)
predictions0 = np.zeros((X.shape[0], n_splits))
predictions1 = np.zeros((test.shape[0], n_splits))
score = 0

print("Starting ", n_splits, "-fold CV loop...")
oof_predictions = np.zeros(X.shape[0])
for fold, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_valid = X[train_index, :], X[test_index, :]
    y_train, y_valid = y[train_index], y[test_index]

    clf = linear_model.LassoLarsIC()
    clf.fit(X_train, y_train)

    pred0 = clf.predict(X)
    pred1 = clf.predict(test)
    oof_predictions[test_index] = clf.predict(X_valid)
    predictions0[:, fold] = pred0
    predictions1[:, fold] = pred1
    score += r2_score(y_train, clf.predict(X_train))
    print('Fold %d: Score %f' % (fold, clf.score(X_train, y_train)))

prediction0 = predictions0.mean(axis=1)
prediction1 = predictions1.mean(axis=1)
score /= n_splits
oof_score = r2_score(y, oof_predictions)
Example #20
def run_models(fullname, choose, delete):
    folder, filename = os.path.split(fullname)
    folder = folder.replace('/uploads', '/static')
    a = choose
    b = delete
    df = pd.read_csv(fullname)
    y_data = np.array(df.loc[:, a])
    X_picked = df.drop(a, 1)
    for row in b:
        X_picked = X_picked.drop(row, 1)
    num_column = len(X_picked.columns.values)
    imp = Imputer()
    X_picked = imp.fit_transform(X_picked)
    y_data = imp.fit_transform(y_data)
    y_data = np.reshape(y_data, (len(df), ))
    X_picked = preprocessing.minmax_scale(X_picked, feature_range=(0, 1))
    y_data = preprocessing.minmax_scale(y_data, feature_range=(0, 1))

    def add_layer(x, in_size, out_size, activation_function, layer, dropout):
        num_layer = 'layer%s' % layer
        with tf.name_scope(num_layer):
            global W
            tf.set_random_seed(666)
            W = tf.Variable(tf.random_uniform([in_size, out_size]))
        with tf.name_scope('y'):
            output = tf.matmul(x, W)
            if activation_function == 1:
                out = tf.nn.softsign(output)
            elif activation_function == 0:
                out = tf.nn.dropout(output, dropout)
            else:
                out = output
        return out

    # Create input variable
    with tf.name_scope('Input'):
        x = tf.placeholder(tf.float32, [None, num_column])
        y_ = tf.placeholder(tf.float32, [None, 1])
        keep_prob = tf.placeholder(tf.float32)  #dropout probability
        dropout = 0.85

    # Build up layers
    with tf.name_scope('Layer'):
        y = add_layer(x, num_column, 1, 2, 1, dropout)

    # input
    header = list(df.columns.values)
    list_header = []
    for i in range(len(header)):
        if header[i] != a:
            list_header.append(header[i])

    # cost functiontensoflow
    with tf.name_scope('Cost'):
        cost = tf.reduce_mean(tf.square(y_ - y))

    # training
    Training = tf.train.AdamOptimizer(0.01)
    train = Training.minimize(cost)

    with tf.Session() as sess:
        sess = tf.Session()
        init = tf.initialize_all_variables()
        sess.run(init)
        feed = {}
        chi_sum = 0
        predict = []
        actual = []
        chi_plot = []
        i_count = []
        counter = 0
        df1 = np.float32(df)
        for i in range(len(df.values)):
            xs = np.array(np.float32([X_picked[i]]))
            ys = np.array(np.float32([[y_data[i]]]))
            feed = {x: xs, y_: ys, keep_prob: dropout}
            prediction = sess.run(y, feed_dict=feed)
            sess.run(train, feed_dict=feed)
            chi = (np.square(ys - prediction)) / (prediction)
            chi_sum += chi
            predict.append(float(prediction) * 1000)
            actual.append(float(ys) * 1000)
            chi_plot.append(float(chi))
            i_count.append(i)
    weight = np.array(sess.run(W)).flatten()
    r2_score_NN = r2_score(actual, predict)

    # scikit learn machine learning
    # Ridge Regression
    model1 = Ridge(alpha=1.0)
    model1.fit(X_picked, y_data)
    prediction_ridge = cross_val_predict(model1, X_picked, y_data, cv=10)
    r2_score_LR = r2_score(y_data, prediction_ridge)
    r_linear_ridge = stats.pearsonr(prediction_ridge, y_data)
    prediction_ridge = np.array(prediction_ridge) * 1000
    ridge_coef = model1.coef_

    # Lasso Regression
    clf = linear_model.LassoLarsIC(criterion='aic')
    clf.fit(X_picked, y_data)
    prediction_lasso = cross_val_predict(clf, X_picked, y_data, cv=10)
    r2_score_lasso_aic = r2_score(y_data, prediction_lasso)
    r_linear_lassoAIC = stats.pearsonr(prediction_lasso, y_data)
    prediction_lasso = np.array(prediction_lasso) * 1000
    lasso_coef = clf.coef_

    # Stats and Plot
    c = stats.pearsonr(actual, predict)
    plt.figure()
    plt.scatter(predict, actual, s=3)
    plt.xlabel('Prediction')
    plt.ylabel('Observation')
    plt.title('This is Neural Network')
    uuid = hashlib.md5(filename + '_NN:' + a).hexdigest()
    # uuid = str(uuid.uuid4())
    id_nn = uuid
    filename = os.path.join(folder, 'images/%s.png' % id_nn)
    plt.savefig(filename)
    plt.figure()
    plt.scatter(prediction_ridge, actual, s=3)
    plt.xlabel('Prediction')
    plt.ylabel('Observation')
    plt.title('This is Ridge Regression')
    uuid = hashlib.md5(filename + '_RIDGE:' + a).hexdigest()
    id_ridge = uuid
    filename1 = os.path.join(folder, 'images/%s.png' % id_ridge)
    plt.savefig(filename1)
    plt.figure()
    plt.scatter(prediction_lasso, actual, s=3)
    plt.xlabel('Prediction')
    plt.ylabel('Observation')
    plt.title('This is Lasso Regression')
    uuid = hashlib.md5(filename + '_LASSO:' + a).hexdigest()
    id_lasso = uuid
    filename2 = os.path.join(folder, 'images/%s.png' % id_lasso)
    plt.savefig(filename2)
    return dict(a=a,
                id_nn=id_nn,
                id_ridge=id_ridge,
                id_lasso=id_lasso,
                ridge_coef=ridge_coef,
                lasso_coef=lasso_coef,
                weight=weight,
                r_linear_lassoAIC=r_linear_lassoAIC,
                r_linear_ridge=r_linear_ridge,
                r2_score_NN=r2_score_NN,
                uuid=uuid)
Example #21
    def __init__(
        self,
        method,
        yrange,
        params,
        i=0
    ):  #TODO: yrange doesn't currently do anything. Remove or do something with it!
        self.algorithm_list = [
            'PLS',
            'GP',
            'OLS',
            'OMP',
            'Lasso',
            'Elastic Net',
            'Ridge',
            'Bayesian Ridge',
            'ARD',
            'LARS',
            'LASSO LARS',
            'SVR',
            'KRR',
        ]
        self.method = method
        self.outliers = None
        self.ransac = False

        print(params)
        if self.method[i] == 'PLS':
            self.model = PLSRegression(**params[i])

        if self.method[i] == 'OLS':
            self.model = linear.LinearRegression(**params[i])

        if self.method[i] == 'OMP':
            # check whether to do CV or not
            self.do_cv = params[i]['CV']
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # Remove CV parameter
            params_temp.pop('CV')
            if self.do_cv is False:
                self.model = linear.OrthogonalMatchingPursuit(**params_temp)
            else:
                params_temp.pop('precompute')
                self.model = linear.OrthogonalMatchingPursuitCV(**params_temp)

        if self.method[i] == 'LASSO':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # check whether to do CV or not
            try:
                self.do_cv = params[i]['CV']
                # Remove CV parameter
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.Lasso(**params_temp)
            else:
                params_temp.pop('alpha')
                self.model = linear.LassoCV(**params_temp)

        if self.method[i] == 'Elastic Net':
            params_temp = copy.copy(params[i])
            try:
                self.do_cv = params[i]['CV']
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.ElasticNet(**params_temp)
            else:
                params_temp['l1_ratio'] = [.1, .5, .7, .9, .95, .99, 1]
                self.model = linear.ElasticNetCV(**params_temp)

        if self.method[i] == 'Ridge':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            try:
                # check whether to do CV or not
                self.do_cv = params[i]['CV']

                # Remove CV parameter
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv:
                self.model = linear.RidgeCV(**params_temp)
            else:
                self.model = linear.Ridge(**params_temp)

        if self.method[i] == 'BRR':
            self.model = linear.BayesianRidge(**params[i])

        if self.method[i] == 'ARD':
            self.model = linear.ARDRegression(**params[i])

        if self.method[i] == 'LARS':
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            try:
                # check whether to do CV or not
                self.do_cv = params[i]['CV']

                # Remove CV parameter
                params_temp.pop('CV')
            except:
                self.do_cv = False

            if self.do_cv is False:
                self.model = linear.Lars(**params_temp)
            else:
                self.model = linear.LarsCV(**params_temp)

        if self.method[i] == 'LASSO LARS':
            model = params[i]['model']
            params_temp = copy.copy(params[i])
            params_temp.pop('model')

            if model == 0:
                self.model = linear.LassoLars(**params_temp)
            elif model == 1:
                self.model = linear.LassoLarsCV(**params_temp)
            elif model == 2:
                self.model = linear.LassoLarsIC(**params_temp)
            else:
                print("Something went wrong, \'model\' should be 0, 1, or 2")

        if self.method[i] == 'SVR':
            self.model = svm.SVR(**params[i])

        if self.method[i] == 'KRR':
            self.model = kernel_ridge.KernelRidge(**params[i])

        if self.method[i] == 'GP':
            # get the method for dimensionality reduction and the number of components
            self.reduce_dim = params[i]['reduce_dim']
            self.n_components = params[i]['n_components']
            # create a temporary set of parameters
            params_temp = copy.copy(params[i])
            # Remove parameters not accepted by Gaussian Process
            params_temp.pop('reduce_dim')
            params_temp.pop('n_components')
            self.model = GaussianProcess(**params_temp)
Example #22
def model_selection():
    # This is to avoid division by zero while doing np.log10
    EPSILON = 1e-5

    # #############################################################################
    # LassoLarsIC: least angle regression with BIC/AIC criterion

    model_bic = linear_model.LassoLarsIC(criterion='bic')
    model_bic.fit(data, label)
    # alpha_bic_ = model_bic.alpha_

    model_aic = linear_model.LassoLarsIC(criterion='aic')
    model_aic.fit(data, label)
    # alpha_aic_ = model_aic.alpha_

    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b', EPSILON)
    plot_ic_criterion(model_bic, 'BIC', 'r', EPSILON)
    plt.legend()
    plt.title('Information-criterion for model selection')
    plt.savefig('information_criterion_model_selection.png')

    # #############################################################################
    # LassoCV: coordinate descent

    # Compute paths
    model = linear_model.LassoCV(cv=10).fit(data, label)

    # Display results
    m_log_alphas = -np.log10(model.alphas_ + EPSILON)

    plt.figure()
    ymin, ymax = 20, 300
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas,
             model.mse_path_.mean(axis=-1),
             'k',
             label='Average across the folds',
             linewidth=2)
    plt.axvline(-np.log10(model.alpha_ + EPSILON),
                linestyle='--',
                color='k',
                label='alpha: CV estimate')

    plt.legend()

    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: coordinate descent ')
    plt.axis('tight')
    plt.ylim(ymin, ymax)
    plt.savefig('lasso_model_selection.png')

    # #############################################################################
    # LassoLarsCV: least angle regression

    # Compute paths
    model = linear_model.LassoLarsCV(cv=10).fit(data, label)

    # Display results
    m_log_alphas = -np.log10(model.cv_alphas_ + EPSILON)

    plt.figure()
    plt.plot(m_log_alphas, model.mse_path_, ':')
    plt.plot(m_log_alphas,
             model.mse_path_.mean(axis=-1),
             'k',
             label='Average across the folds',
             linewidth=2)
    plt.axvline(-np.log10(model.alpha_),
                linestyle='--',
                color='k',
                label='alpha CV')
    plt.legend()

    plt.xlabel('-log(alpha)')
    plt.ylabel('Mean square error')
    plt.title('Mean square error on each fold: Lars')
    plt.axis('tight')
    plt.ylim(ymin, ymax)
    plt.savefig('lasso_Lars_model_selection.png')
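model_selection above calls a plot_ic_criterion helper that is not shown; a hedged sketch compatible with how it is called here (model, name, color, eps) and built only on attributes LassoLarsIC exposes (alphas_, criterion_, alpha_).

import numpy as np
import matplotlib.pyplot as plt

def plot_ic_criterion(model, name, color, eps):
    # Plot the information criterion along the regularization path and mark
    # the selected alpha_; eps keeps np.log10 away from zero alphas.
    plt.plot(-np.log10(model.alphas_ + eps), model.criterion_, '--',
             color=color, linewidth=2, label='%s criterion' % name)
    plt.axvline(-np.log10(model.alpha_ + eps), color=color, linewidth=2,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)')
    plt.ylabel('criterion')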
Example #23
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.decomposition import PCA

# Problem 1
dataset = pd.read_csv("ecs171.dataset.txt", sep=r'\s+', header=None).values
dataset_x = dataset[1:,7:4502].astype(float)
dataset_y = dataset[1:,5].astype(float)
# Find optimal alpha using bic criteria
bic_model = linear_model.LassoLarsIC(criterion = 'bic')
bic_model.fit(dataset_x, dataset_y)
optimal_alpha = bic_model.alpha_
clf = linear_model.Lasso(alpha = optimal_alpha)
clf.fit(dataset_x, dataset_y)
#count the number of non-zero features
count = 0
for feature in clf.coef_:
    if feature != 0:
        count += 1
print("The number of non-zero features is", count)
# Cross validation
cv_model = linear_model.LassoLarsCV(cv = 10).fit(dataset_x, dataset_y)
print("Ten fold cross-validation error is", cv_model.cv_mse_path_.mean())

# Problem 2
Example #24
        ], n_jobs=-1),
     ),
    ])

X, y = make_xy_data('./data/merged_data.csv', ['surface_m2', 'piece'])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=2)

X_tr = features.fit_transform(X_train, None)
X_te = features.transform(X_test)

###############################################################################

model_bic = lm.LassoLarsIC(criterion='bic', verbose=2)
t1 = time.time()
model_bic.fit(X_tr, y_train)
t_bic = time.time() - t1
alpha_bic_ = model_bic.alpha_

model_aic = lm.LassoLarsIC(criterion='aic', verbose=2)
model_aic.fit(X_tr, y_train)
alpha_aic_ = model_aic.alpha_

plt.figure()
plot_ic_criterion(model_aic, 'AIC', 'b')
plot_ic_criterion(model_bic, 'BIC', 'r')
plt.legend()
plt.title('Information-criterion for model selection (training time %.3fs)'
          % t_bic)
Example #25
        classification_binary(svm.NuSVC(kernel="rbf", **SVC_PARAMS)),
        classification(svm.SVC(kernel="rbf", **SVC_PARAMS)),
        classification(svm.NuSVC(kernel="rbf", **SVC_PARAMS)),

        # Linear Regression
        regression(linear_model.LinearRegression()),
        regression(linear_model.HuberRegressor()),
        regression(linear_model.ElasticNet(random_state=RANDOM_SEED)),
        regression(linear_model.ElasticNetCV(random_state=RANDOM_SEED)),
        regression(linear_model.TheilSenRegressor(random_state=RANDOM_SEED)),
        regression(linear_model.Lars()),
        regression(linear_model.LarsCV()),
        regression(linear_model.Lasso(random_state=RANDOM_SEED)),
        regression(linear_model.LassoCV(random_state=RANDOM_SEED)),
        regression(linear_model.LassoLars()),
        regression(linear_model.LassoLarsIC()),
        regression(linear_model.OrthogonalMatchingPursuit()),
        regression(linear_model.OrthogonalMatchingPursuitCV()),
        regression(linear_model.Ridge(random_state=RANDOM_SEED)),
        regression(linear_model.RidgeCV()),
        regression(linear_model.BayesianRidge()),
        regression(linear_model.ARDRegression()),
        regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)),
        regression(
            linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)),

        # Logistic Regression
        classification(
            linear_model.LogisticRegression(random_state=RANDOM_SEED)),
        classification(
            linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
Example #26
    rowitems = line.rstrip('\n').split('\t')
    X_item = list(map(float, rowitems[0:-2]))
    y_item = float(rowitems[-2])
    sid_item = int(rowitems[-1])
    X.append(X_item)
    y.append(y_item)
    sid.append(sid_item)
    count += 1

# print count
X_train = np.array(X)
y_train = np.array(y)
sid_train = np.array(sid)

# regr = linear_model.LinearRegression()            # this runs linear regression
regr = linear_model.LassoLarsIC(
    criterion='bic', fit_intercept=False)  # this runs lasso regression

regr.fit(X_train, y_train)

fit_coef = regr.coef_
# P   # for GLM, P is diagonal variance matrix estimated using fit_coef_
fit_covinv = np.dot(
    X_train.T,
    X_train)  # for GLM, use np.dot(np.dot(X_train.T, P), X_train) instead
fit_lambda = regr.alpha_

A = np.dot(inv(fit_covinv), X_train.T)
e = y - np.dot(
    X_train, fit_coef
)  # for GLM, transform np.dot(X_train, fit_coef) using link function
betahat_c = fit_coef + np.dot(A, e)
Example #27
    spca = dcm.SparsePCA()
    sparse_alpha_opts = [0.1, 0.5, 1, 2, 5, 10]
    kpca = dcm.KernelPCA()
    kernel_opts = ["linear", "rbf", "sigmoid"]
    n_component_opts = [0.7, 0.8, 0.9, 0.95, 0.99]
    lasso = linear_model.LassoCV(max_iter=100000,
                                 n_jobs=28,
                                 alphas=np.arange(0.1, 50, 0.1))
    lasso_lars = linear_model.LassoLarsCV(n_jobs=28,
                                          max_iter=100000,
                                          max_n_alphas=10000)
    eps_opts = [10.0, 5.0, 2.0, 1.5, 0.9, 0.1, 0.01, 0.001, 0.0001]
    elastic = linear_model.ElasticNetCV(alphas=np.arange(0.1, 50, 0.1),
                                        max_iter=100000)
    l1_ratio_opts = [0.1, 0.5, 0.9, 0.95, 0.99]
    lasso_lars_bay = linear_model.LassoLarsIC(max_iter=100000)
    cv_opts = [5, 10, 15, 20]

    param_dict = [
        {
            'clf': (lasso, ),
            'pca': (spca, ),
            'pca__alpha': sparse_alpha_opts,
            'clf__selection': ['random', 'cyclic'],
            'clf__cv': cv_opts,
        },
        {
            'clf': (lasso, ),
            'pca': (kpca, ),
            'pca__kernel': kernel_opts,
            'clf__selection': ['random', 'cyclic'],
Example #28
def fit_regression(P, x, u, rule="LS", retall=False, **kws):
    """
Fit a polynomial chaos expansion using linear regression.

Parameters
----------
P : Poly
    Polynomial chaos expansion with `P.shape=(M,)` and `P.dim=D`.
x : array_like
    Collocation nodes with `x.shape=(D,K)`.
u : array_like
    Model evaluations with `len(u)=K`.
retall : bool
    If True return uhat in addition to R
rule : str
    Regression method used.

    The following methods use scikit-learn as backend.
    See `sklearn.linear_model` for more details.

    Key     Scikit-learn    Description
    ---     ------------    -----------
        Parameters      Description
        ----------      -----------

    "BARD"  ARDRegression   Bayesian ARD Regression
        n_iter=300      Maximum iterations
        tol=1e-3        Optimization tolerance
        alpha_1=1e-6    Gamma scale parameter
        alpha_2=1e-6    Gamma inverse scale parameter
        lambda_1=1e-6   Gamma shape parameter
        lambda_2=1e-6   Gamma inverse scale parameter
        threshold_lambda=1e-4   Upper pruning threshold

    "BR"    BayesianRidge   Bayesian Ridge Regression
        n_iter=300      Maximum iterations
        tol=1e-3        Optimization tolerance
        alpha_1=1e-6    Gamma scale parameter
        alpha_2=1e-6    Gamma inverse scale parameter
        lambda_1=1e-6   Gamma shape parameter
        lambda_2=1e-6   Gamma inverse scale parameter

    "EN"    ElastiNet       Elastic Net
        alpha=1.0       Dampening parameter
        rho             Mixing parameter in [0,1]
        max_iter=300    Maximum iterations
        tol             Optimization tolerance

    "ENC"   ElasticNetCV    EN w/Cross Validation
        rho             Dampening parameter(s)
        eps=1e-3        min(alpha)/max(alpha)
        n_alphas        Number of alphas
        alphas          List of alphas
        max_iter        Maximum iterations
        tol             Optimization tolerance
        cv=3            Cross validation folds

    "LA"    Lars            Least Angle Regression
        n_nonzero_coefs Number of non-zero coefficients
        eps             Cholesky regularization

    "LAC"   LarsCV          LAR w/Cross Validation
        max_iter        Maximum iterations
        cv=5            Cross validation folds
        max_n_alphas    Max points for residuals in cv

    "LAS"   Lasso           Least Absolute Shrinkage and
                            Selection Operator
        alpha=1.0       Dampening parameter
        max_iter        Maximum iterations
        tol             Optimization tolerance

    "LASC"  LassoCV         LAS w/Cross Validation
        eps=1e-3        min(alpha)/max(alpha)
        n_alphas        Number of alphas
        alphas          List of alphas
        max_iter        Maximum iterations
        tol             Optimization tolerance
        cv=3            Cross validation folds

    "LL"    LassoLars       Lasso and Lars model
        max_iter        Maximum iterations
        eps             Cholesky regularization

    "LLC"   LassoLarsCV     LL w/Cross Validation
        max_iter        Maximum iterations
        cv=5            Cross validation folds
        max_n_alphas    Max points for residuals in cv
        eps             Cholesky regularization

    "LLIC"  LassoLarsIC     LL w/AIC or BIC
        criterion       "AIC" or "BIC" criterion
        max_iter        Maximum iterations
        eps             Cholesky regularization

    "OMP"   OrthogonalMatchingPursuit
        n_nonzero_coefs Number of non-zero coefficients
        tol             Max residual norm (instead of non-zero coef)

    Local methods

    Key     Description
    ---     -----------
    "LS"    Ordenary Least Squares

    "T"     Ridge Regression/Tikhonov Regularization
        order           Order of regularization (or custom matrix)
        alpha           Damping parameter (else estimated from gcv)

    "TC"    T w/Cross Validation
        order           Order of regularization (or custom matrix)
        alpha           Damping parameter (else estimated from gcv)


Returns
-------
R[, uhat]

R : Poly
    Fitted polynomial with `R.shape=u.shape[1:]` and `R.dim=D`.
uhat : np.ndarray
    The Fourier coefficients in the estimation.

Examples
--------
>>> P = cp.Poly([1, x, y])
>>> s = [[-1,-1,1,1], [-1,1,-1,1]]
>>> u = [0,1,1,2]
>>> print(fit_regression(P, s, u))
0.5q1+0.5q0+1.0

    """

    x = np.array(x)
    if len(x.shape) == 1:
        x = x.reshape(1, *x.shape)
    u = np.array(u)

    Q = P(*x).T
    shape = u.shape[1:]
    u = u.reshape(u.shape[0], int(np.prod(u.shape[1:])))

    rule = rule.upper()

    # Local rules
    if rule == "LS":
        uhat = la.lstsq(Q, u)[0].T

    elif rule == "T":
        uhat, alphas = rlstsq(Q, u, kws.get("order", 0),
                              kws.get("alpha", None), False, True)
        uhat = uhat.T

    elif rule == "TC":
        uhat = rlstsq(Q, u, kws.get("order", 0), kws.get("alpha", None), True)
        uhat = uhat.T

    else:

        # Scikit-learn wrapper
        try:
            _ = lm
        except:
            raise NotImplementedError("sklearn not installed")

        if rule == "BARD":
            solver = lm.ARDRegression(fit_intercept=False, copy_X=False, **kws)

        elif rule == "BR":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.BayesianRidge(**kws)

        elif rule == "EN":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.ElasticNet(**kws)

        elif rule == "ENC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.ElasticNetCV(**kws)

        elif rule == "LA":  # success
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.Lars(**kws)

        elif rule == "LAC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LarsCV(**kws)

        elif rule == "LAS":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.Lasso(**kws)

        elif rule == "LASC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LassoCV(**kws)

        elif rule == "LL":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LassoLars(**kws)

        elif rule == "LLC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LassoLarsCV(**kws)

        elif rule == "LLIC":
            kws["fit_intercept"] = kws.get("fit_intercept", False)
            solver = lm.LassoLarsIC(**kws)

        elif rule == "OMP":
            solver = lm.OrthogonalMatchingPursuit(**kws)

        uhat = solver.fit(Q, u).coef_

    u = u.reshape(u.shape[0], *shape)

    R = po.sum((P * uhat), -1)
    R = po.reshape(R, shape)

    if retall == 1:
        return R, uhat
    elif retall == 2:
        if rule == "T":
            return R, uhat, Q, alphas
        return R, uhat, Q
    return R
Example #29
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=1)

alphas = np.logspace(-4, 0.5, 30)

models = {
    'LinReg': linear_model.LinearRegression(),
    'Ridge': linear_model.Ridge(),
    'RidgeCV': linear_model.RidgeCV(),
    'Lasso': linear_model.Lasso(),
    'LassoCV': linear_model.LassoCV(),
    # 'LassoLarsCV': linear_model.LassoLarsCV(),
    'LassoLarsIC_AIC': linear_model.LassoLarsIC(criterion='aic'),
    'LassoLarsIC_BIC': linear_model.LassoLarsIC(criterion='bic'),
    'LassoLars': linear_model.LassoLars(),
    'ElasticNet': linear_model.ElasticNet()
}

scores = {}
t_check = []
t2 = 0
for name, model in models.items():
    for alpha in alphas:
        model.alpha = alpha
        if 'CV' in name:
            model.cv = 20
        #check time of training
        t1 = time.time()
Example #30
def lasso_predict(df):
    K=0.65306122
    a_list=list(df.columns)
    names = [item for item in a_list if item not in inter_col]
    xdata=df[names]
    labels=np.array(df.REVENUE[1:],dtype=np.float64)
    xList=np.array(xdata[:-1],dtype=np.float64)
    X,Y=normalize(xList,labels)
    X[np.isnan(X)]=0
    #give value of cv
    n=len(df)
    if n>8:
        cv=8
    else:
        cv=n-2
    #Call LassoCV from sklearn.linear_model
    X=np.nan_to_num(X)
    Rev_Model = LassoLarsCV(cv=cv).fit(X, Y)
    alphas, coefs, _ = linear_model.lasso_path(X, Y)
    nattr, nalpha = coefs.shape
    #find coefficient ordering
    nzList = []
    for iAlpha in range(1,nalpha):
        coefList = list(coefs[: ,iAlpha])
        nzCoef = [index for index in range(nattr) if coefList[index] != 0.0]
        for q in nzCoef:
            if not(q in nzList):
                nzList.append(q)
    #find coefficients corresponding to best alpha value. alpha value corresponding to
    #normalized X and normalized Y is Rev_Model.alpha_
    alphaStar =Rev_Model.alpha_
    indexLTalphaStar = [index for index in range(100) if alphas[index] > alphaStar]
    indexStar = max(indexLTalphaStar)
    #here's the set of coefficients to deploy
    coefStar = list(coefs[:,indexStar])
    #The coefficients on normalized attributes give another slightly different ordering
    absCoef = [abs(a) for a in coefStar]
    #sort by magnitude
    coefSorted = sorted(absCoef, reverse=True)
    idxCoefSize = [absCoef.index(a) for a in coefSorted if not(a == 0.0)]
    vari_nm= [xdata.columns[idxCoefSize[i]] for i in range(len(idxCoefSize))]
    
    #use variables in vari_nmto regress
    feat=min(len(vari_nm),int(K*len(df)))
    vari_nm=vari_nm[:feat]
    y=np.array(xdata.REVENUE[1:],dtype=np.float64)
    x=np.array(xdata[vari_nm][:-1],dtype=np.float64)
    xpred=np.array(xdata[vari_nm][-2:])
    rev_q1_true=float(xdata.REVENUE[-1:])/1000000
    X1,Y1=normalize(x,y)
    reg = linear_model.LassoLarsIC(criterion='aic')
    reg.fit(X1,Y1)
    coefs=reg.coef_
    score=reg.score(X1,Y1)
    vari_nm1=[vari_nm[i] for i in range(len(vari_nm)) if coefs[i]!=0]
    if (len(vari_nm1)>1)&(score>0.412626):
        x=np.array(xdata[vari_nm1][:-1],dtype=np.float64)
        xpred=np.array(xdata[vari_nm1][-2:])
    linreg=LinearRegression()
    linreg.fit(x, y)
    score=linreg.score(x,y)
    rev_q1=linreg.predict(xpred)[0]/1000000
    rev_q2=linreg.predict(xpred)[1]/1000000+rev_q1_true
    return  [rev_q1_true,rev_q1,rev_q2]