Example #1
def regression_NumMosquitos(Xtr, ytr, Xte):
    from sklearn.linear_model import ElasticNetCV
    #model_nm = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], max_iter=10000, cv=4)
    model_nm = ElasticNetCV()
    model_nm.fit(Xtr, ytr)
    results_nm = model_nm.predict(Xte)
    return results_nm
def learn_for(reviews, i):
    reg = ElasticNetCV(fit_intercept=True, alphas=[
                       0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    u = reviews[i]
    us = np.arange(reviews.shape[0])
    us = np.delete(us, i)
    ps, = np.where(u.toarray().ravel() > 0)
    x = reviews[us][:, ps].T
    y = u.data
    kf = KFold(len(y), n_folds=4)
    predictions = np.zeros(len(ps))
    for train, test in kf:
        xc = x[train].copy().toarray()
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)

        for i in range(xc.shape[0]):
            xc[i] -= (xc[i] > 0) * x1[i]

        reg.fit(xc, y[train] - x1)

        xc = x[test].copy().toarray()
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)

        for i in range(xc.shape[0]):
            xc[i] -= (xc[i] > 0) * x1[i]

        p = reg.predict(xc).ravel()
        predictions[test] = p
    return predictions
Example #3
    def learn_for(self, i):
        reviews        = AbstractEstimateBase.reviews
        reg            = ElasticNetCV(fit_intercept=True, alphas=[
                           0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
        nusers,nmovies = reviews.shape
        u              = reviews[i]
        us             = np.arange(reviews.shape[0])
        us             = np.delete(us, i)
        ps,            = np.where(u.ravel() > 0)
        x              = reviews[us][:, ps].T
        kf             = KFold(len(ps), n_folds=4)
        predictions    = np.zeros(len(ps))
        for train, test in kf:
            xc = x[train].copy()
            x1 = np.array([xi[xi > 0].mean() for xi in xc])
            x1 = np.nan_to_num(x1)

            for i in range(xc.shape[0]):
                xc[i] -= (xc[i] > 0) * x1[i]

            reg.fit(xc, u[train] - x1)

            xc = x[test].copy()
            x1 = np.array([xi[xi > 0].mean() for xi in xc])
            x1 = np.nan_to_num(x1)

            for i in range(xc.shape[0]):
                xc[i] -= (xc[i] > 0) * x1[i]

            p = reg.predict(xc).ravel()
            predictions[test] = p
        fill_preds = np.zeros(nmovies)
        fill_preds[ps] = predictions

        return fill_preds
Example #4
def enetCV():
    print ("Doing elastic net")
    cross_val = cross_validation.ShuffleSplit(len(base_X), n_iter=5, test_size=0.2, random_state=0)
    clf4 = ElasticNetCV(cv=cross_val)
    clf4.fit(base_X, base_Y)
    print ("Score = %f" % clf4.score(base_X, base_Y))
    clf4_pred = clf4.predict(X_test)
    write_to_file("elasticCV.csv", clf4_pred)
Example #5
def train_model(data, target, n_iter, rate):
    """Bootstraps, trains ElasticNetCV model, selects features, and
    trains final linear regression model.

    Returns model and selected features.

    """
    coefs = []
    for i in range(n_iter):
        print "bootstrap iter {}".format(i)
        indices = np.random.choice(len(data), size=len(data), replace=True)
        sample_data = data[indices]
        sample_target = target[indices]
        model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                             max_iter=10000, n_jobs=4)
        model.fit(sample_data, sample_target)
        coefs.append(model.coef_)
    coefs = np.vstack(coefs)
    rate_selected = make_rates(coefs)
    selected1 = np.nonzero(rate_selected >= rate)[0]
    selected2 = np.argsort(rate_selected)[-50:]
    selected = selected1 if len(selected1) < len(selected2) else selected2
    model = LinearRegression()
    model.fit(data[:, selected], target)

    model_full = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                              max_iter=10000, n_jobs=4)
    model_full.fit(data, target)

    return model_full, model, selected, coefs
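train_model calls a make_rates helper that is not included in the excerpt. From context it appears to compute, per feature, how often the bootstrap runs produced a nonzero coefficient; a minimal sketch assuming that behavior (the name and tolerance are hypothetical):

import numpy as np

def make_rates(coefs, tol=1e-10):
    # Hypothetical helper: for each feature, the fraction of bootstrap
    # iterations (rows of the stacked coefs matrix) whose coefficient
    # was effectively nonzero.
    return np.mean(np.abs(coefs) > tol, axis=0)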
Example #6
    def elastic_net_cv(self, drug_name, l1_ratio=0.5, alphas=None, n_folds=10):

        # Get the data for the requested drug
        xscaled, Y = self._get_one_drug_data(drug_name)

        en = ElasticNetCV(l1_ratio=l1_ratio, alphas=alphas, cv=n_folds)

        encv = en.fit(xscaled, Y)

        self.encv = encv
        print("Best alpha on %s folds : %s" % (n_folds, encv.alpha_))
        #df.sort_values().plot(kind='bar')
        return encv.alpha_
Example #7
File: fea-sel.py Project: ox040c/bnp
def elasticNet(argv):
    data = pd.read_csv(argv, index_col=0)
    y = data['target']
    X = data.drop('target', axis=1)
    featureNames = X.columns.values
    enet = ElasticNetCV(n_jobs=-1, normalize=True)
    enet.fit(X, y)
    dropIdx = featureNames[enet.coef_ < 1e-5]
    print "Elastic Net drop: %d" % len(dropIdx)
    print dropIdx
    data.drop(dropIdx, axis=1, inplace=True)
    data.to_csv(argv+'.enet.csv')
    return enet
Example #8
    def run(self):
        allcomments = self._aggregateComments(self.data)
        self._buildDictionary(allcomments)

        # create representation of documents
        tfidfArray = self.vectorizer.transform(allcomments)

        # create labelling
        labels = []
        for datum in self.data:
            labels.append(len(datum.meta()['favorites']))
        labels = np.array(labels)

        print(self.vectorizer.get_params())
        print(self.vectorizer.get_feature_names())

        # training
        self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, copy_X=True, tol=0.0001, rho=None, cv=self._n_folds)
        self.elasticNet.fit(tfidfArray,labels)

        for i,l1_ratio in enumerate(self._l1_ratio):
            for j,alpha in enumerate(self._alpha):
                print "alpha: %f, l1_ratio: %f --> %f" % (alpha,l1_ratio,np.mean(self.elasticNet.mse_path_[i,j,:]))

        print(self.vectorizer.inverse_transform(self.elasticNet.coef_))
Example #9
File: lccb.py Project: jmmcd/PODI
def LCCB_coevo(fitness_fn, pop):
    y = fitness_fn.train_y
    # Make a new array composed of pop[i].semantics for all i
    # (pop[i].semantics has already been calculated)
    X = None
    for ind in pop:
        if (ind.phenotype and ind.fitness != sys.maxsize
            and all(np.isfinite(ind.semantics))):
            col = ind.semantics
        else:
            print("Omitting a column")
            col = np.zeros(len(y))
        if X is None:
            X = col
        else:
            X = np.c_[X, col]

    eps = 5e-3

    # FIXME FFX processes the data so that has zero mean and unit
    # variance before applying the LR... should we do that?

    # Use ElasticNet with cross-validation, which will automatically
    # get a good value for regularisation
    model = ElasticNetCV()
    model.fit(X, y)
    coefs = model.coef_
    output = model.predict(X)
    rmse = fitness_fn.rmse(y, output)
    print("rmse", rmse)

    # Assign the magnitude of coefficients as individual fitness
    # values. Have to construct a new individual because tuples are
    # immutable. FIXME this is not a great method -- it's likely that
    # the population will converge on one or a few basis functions,
    # and then the performance of the ENet will decrease because there
    # won't be enough independent basis functions to work with.
    pop = [variga.Individual(genome=pop[i].genome,
                             used_codons=pop[i].used_codons,
                             fitness=-abs(coefs[i]),
                             phenotype=pop[i].phenotype,
                             readable_phenotype=pop[i].readable_phenotype,
                             semantics=pop[i].semantics)
           for i in range(len(pop))]

    pop.sort(key=variga.ind_compare)
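The FIXME above asks whether to standardize the basis-function matrix to zero mean and unit variance before the regression, as FFX does. A minimal sketch of one way to do that with a scikit-learn pipeline (not part of the PODI source; the coefficients are then in scaled-feature units):

from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize each basis-function column before the regression, as FFX does.
model = make_pipeline(StandardScaler(), ElasticNetCV())
model.fit(X, y)
coefs = model.named_steps['elasticnetcv'].coef_  # coefficients in scaled units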
Example #10
def predict(train):
    binary = (train > 0)
    reg = ElasticNetCV(fit_intercept=True, alphas=[
                       0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    norm = NormalizePositive()
    train = norm.fit_transform(train)

    filled = train.copy()
    # iterate over all users
    for u in range(train.shape[0]):
        # remove the current user from the training data
        curtrain = np.delete(train, u, axis=0)
        bu = binary[u]
        if np.sum(bu) > 5:
            reg.fit(curtrain[:,bu].T, train[u, bu])

            # fill in the values that were not already there
            filled[u, ~bu] = reg.predict(curtrain[:,~bu].T)
    return norm.inverse_transform(filled)
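predict() relies on a NormalizePositive helper that is not shown in the excerpt. Judging from its use (fit_transform before filling, inverse_transform after), it centers each user's observed ratings and can undo that shift; a minimal sketch under that assumption — the real class may also rescale by the standard deviation:

import numpy as np

class NormalizePositive:
    # Hypothetical stand-in: center each user's observed (positive)
    # ratings by their mean; inverse_transform adds the means back.

    def fit_transform(self, train):
        binary = (train > 0)
        counts = binary.sum(axis=1)
        sums = np.where(binary, train, 0.).sum(axis=1)
        self.mean_ = sums / np.maximum(counts, 1)
        return np.where(binary, train - self.mean_[:, None], 0.)

    def inverse_transform(self, filled):
        return filled + self.mean_[:, None]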
Example #12
def regress(x, y, title):
    clf = ElasticNetCV(max_iter=200, cv=10, l1_ratio = [.1, .5, .7, .9, .95, .99, 1])

    clf.fit(x, y)
    print "Score", clf.score(x, y)

    pred = clf.predict(x)
    plt.title("Scatter plot of prediction and " + title)
    plt.xlabel("Prediction")
    plt.ylabel("Target")
    plt.scatter(y, pred)

    # Show perfect fit line
    if "Boston" in title:
        plt.plot(y, y, label="Perfect Fit")
        plt.legend()

    plt.grid(True)
    plt.show()
Example #13
def enet_granger_causality_cv(X_t, y_t, cv, alphas, top_num=None, top_perc=4,max_iter=100, lambdas=None):

    # alpha is the l1_ratio
    if lambdas is not None:
        use_lambdas = np.tile(lambdas, len(alphas)).reshape(len(alphas), len(lambdas))
        enet = ElasticNetCV(l1_ratio=alphas, alphas=use_lambdas, cv=cv, max_iter=max_iter)
        fit = enet.fit(X_t, y_t)

        use_lambdas = fit.alphas_
        use_lambdas = np.tile(use_lambdas, len(alphas)).reshape(len(alphas), len(lambdas))
        print "Used lambdas"
        print use_lambdas

    else:
        enet = ElasticNetCV(l1_ratio=alphas,  cv=cv, max_iter=max_iter)
        fit  = enet.fit(X_t, y_t)
        use_lambdas = fit.alphas_


    # lambdas is a matrix

    cv_mses = enet.mse_path_.sum(axis=2).flatten()


    cv_alphas = np.repeat(alphas, use_lambdas.shape[1])
    cv_lambdas = use_lambdas.flatten()

    if top_num is None:
        print("Num cv alphas: ", len(cv_alphas))

        top_num = int(len(cv_alphas) * top_perc / 100.0)
        print "Top num ", top_num

    # this will keep the smallest
    top_indices, top_mses = get_min_k(cv_mses, top_num)

    top_lambdas = cv_lambdas[top_indices]
    top_alphas = cv_alphas[top_indices]

    top_df = pd.DataFrame(data={"lambda.min": top_lambdas, "alpha": top_alphas, "error.min": top_mses})

    return top_df
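get_min_k is not defined in the excerpt; per the comment above it, it keeps the smallest top_num errors. A minimal sketch assuming it returns the indices and values of the k smallest entries:

import numpy as np

def get_min_k(values, k):
    # Hypothetical helper: indices and values of the k smallest entries,
    # in ascending order of value.
    idx = np.argpartition(values, k)[:k]
    idx = idx[np.argsort(values[idx])]
    return idx, values[idx]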
Example #14
    def elasticNetRegNT(self, X, Y, nCV, l1_weights=None):
        """Run elastic net with the given params

        :param X: design matrix
        :param Y: true labels
        :param nCV: number of CVs
        :param l1_weights: weights of the lasso term
        :return:
        """

        # very difficult to choose alpha, better use CV
        # enet = ElasticNet(alpha=self.alpha, l1_ratio=0.8, fit_intercept=False)
        # enet = ElasticNetCV(fit_intercept=False, cv=nCV)
        if (self.useCV):
            enet = ElasticNetCV(cv=nCV, max_iter=self.maxItr, l1_weights=l1_weights,
                                fit_intercept=self.fit_intercept,
                                alphas=self.alphas, l1_ratio=self.l1_ratio)
            enet.fit(X, Y)
            self.cv_alpha = enet.alpha_
        else:
            enet = ElasticNet(alpha=self.alpha, l1_ratio=self.l1_ratio,
                              max_iter=self.maxItr, l1_weights=l1_weights)
            enet.fit(X, Y)

        if self.verbose:
            print("Num of iter: %d"%enet.n_iter_)
        # print("Best alpha: {}, l1_ratio: {}"
        #       .format(enet.alpha_, enet.l1_ratio_))
        # print(enet.get_params())
        ## plot regulation path for testing
        # testReg.lassoElasticnetPaths(X, Y)

        return enet.coef_, enet.intercept_
Example #15
def eNetModel(data, labels, featureNames, texts, documents, nFolds):
    # run SVM with grid search for parameters and leave-one-out cross validation
    kf = KFold(len(texts), n_folds=nFolds)
    acc = 0
    mean_coefs = []
    for train, test in kf:

        # test_docs = {}
        label_train = labels[train]
        #selected_feats = getSelectedFeatures(train, test, texts, featureNames, documents, label_train, nFeats)

        full_train_data, full_test_data, label_train, label_test = data[train], data[test], labels[train], labels[test]

        #data_train = sortBySelected(full_train_data, selected_feats, featureNames)
        #data_test = sortBySelected(full_test_data, selected_feats, featureNames)

        data_train = full_train_data
        data_test = full_test_data

        enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],n_alphas=1000,alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])

        enet.fit(data_train, label_train)

        data_train = np.asarray(data_train,dtype=float)
        label_train = np.asarray(label_train,dtype=float)

        vals = enet.path(data_train, label_train)
        mean_coefs.append(np.mean(vals[1],axis=1))

        if label_test == 1 and enet.predict(data_test) > 0.5:
            acc += 1
        elif label_test == 0 and enet.predict(data_test) < 0.5:
            acc += 1

        if len(mean_coefs) % 10 == 0:
            print(len(mean_coefs), 'out of %s subs finished' % len(data))

    mean_coefs = np.mean(np.array(mean_coefs), axis=0)

    return Decimal(acc)/Decimal(len(data)), mean_coefs
Example #16
# Scale our Data with Robust Scaler to minimise outlier influence (Approx 4% of the data are significant outliers as measured by Cook's Distance)
rb_scaler = RobustScaler()
X_scaled = pd.DataFrame(rb_scaler.fit_transform(X), columns=X.columns)

std_scaler = StandardScaler()
X_standard = pd.DataFrame(std_scaler.fit_transform(X), columns=X.columns)

## Define CV Root Mean Square Error ##
def cv_rmse(estimator, X, y, cv=5):
    rmse = np.mean(np.sqrt(-cross_val_score(estimator, X, y, cv=cv, scoring="neg_mean_squared_error")))
    return rmse

## Regression Models ##
# Elastic Net Regressor
elastic_reg = ElasticNetCV(cv=5, max_iter=15000)
# Lasso Model for Comparison
lasso_reg = LassoCV(cv=5, alphas=[0.011], max_iter=15000) # Previously Optimised

## Model Evaluation & Hyperparameter Tuning ##
# CV Root Mean Squared Error on Training Set (Robust Scaled)
cv_rmse(lasso_reg, X_scaled, np.ravel(y)) # LASSO: 0.319
cv_rmse(elastic_reg, X_scaled, np.ravel(y)) # Elastic Net (ratio = 0.5): 0.317

# CV Root Mean Squared Error on Training Set (Standardised)
cv_rmse(lasso_reg, X_standard, np.ravel(y)) # LASSO: 0.2992
cv_rmse(elastic_reg, X_standard, np.ravel(y)) # Elastic Net (ratio = 0.5): 0.3012


# Alpha Selection
alphas = np.logspace(-10, 1, 400)
Example #17
l = []
with h5py.File("ECoG_big_data.h5", "r+") as f1:
    with h5py.File("selected.h5", "r+") as f2:
        for i in range(1, 4):
            sid = "sub" + str(i)
            X = f1[sid]["train_data"][:]
            Y = f1[sid]["train_clabel"][:]
            Yb = f1[sid]["train_blabel"][:]
            Xt = f1[sid]["test_data"][:]
            Yt = f1[sid]["test_clabel"][:]
            Ytb = f1[sid]["test_blabel"][:]
            for finger in range(5):
                for method in ["l1", "mcp", "scad"]:
                    idxc = f2[sid]["finger" + str(finger + 1)][method][:] - 1
                    idxb = f2[sid]["finger" + str(finger + 1)]["l1_l"][:] - 1
                    en = ElasticNetCV()
                    en.fit(X[:, idxc].astype("float64"), Y[:, finger])
                    yp = en.predict(Xt[:, idxc])
                    corr = np.corrcoef(yp, Yt[:, finger])[0, 1]
                    if corr < 0.3:
                        break
                    else:
                        l.append([sid + "//" + "finger" + str(finger + 1), corr])
                        lr = LogisticRegressionCV()
                        lr.fit(X[:, idxc], Yb[:, finger])
                        tp = yp * fun(lr.predict(Xt[:, idxc]))
                        m = np.where(np.convolve(tp, np.ones((40,)) / 40, mode="same") < 0.5, 0, 1)
                        b, a = butter(2, 9.0 / 25, "low")
                        yy = relu(filtfilt(b, a, tp * m))
                    print(corr, np.corrcoef(Yt[:, finger], yy)[0, 1])
def main():
    path = '../data/states'
    data = pd.read_csv(f'{path}/state_mean_accident_data.csv',
                       header=0,
                       index_col='state_name')

    non_feature_cols = [
        'state_number', 'state_code', 'accidents', 'fatalities',
        'fatalities_per_accident', 'accidents_per_100k', 'num_vehicles',
        'hour_of_day', 'num_fatalities', 'num_drunk_drivers'
    ]

    labels = data['accidents_per_100k']
    features = data.drop(non_feature_cols, axis=1)
    feature_names = features.columns

    scores_df = features.corrwith(labels, axis=0,
                                  method='pearson').to_frame('r_coef')
    scores_df['f_score'], scores_df['p_value'] = fs.f_regression(
        features, labels)

    # Sort by p value
    scores_df.sort_values('p_value', inplace=True)
    print(scores_df)

    # feature_subset_cols = scores_df[scores_df['p_value'] < 0.05].index.tolist()
    # features = features[feature_subset_cols]

    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    # X_train, X_test, y_train, y_test = train_test_split(features, labels,
    #                                                     test_size=0.2, random_state=2020)

    models = {
        'Linear Regression': (LinearRegression(), 'linreg'),
        'Ridge': (RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0],
                          cv=5,
                          scoring='neg_mean_squared_error'), 'ridge'),
        'Elastic Net':
        (ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0],
                      alphas=[0.01, 0.1, 1.0, 10.0],
                      max_iter=3000,
                      cv=5), 'elastic_net'),
        'Linear SVR': (LinearSVR(), 'svr')
    }

    for name, (model, suffix) in models.items():
        print(name)
        print('-' * 20)
        model.fit(features, labels)

        y_pred = model.predict(features)

        utils.print_regression_metrics(labels, y_pred)

        utils.hist_resids(labels, y_pred, name, suffix)
        utils.resid_qq(labels, y_pred, name, suffix)
        utils.resid_plot(labels, y_pred, name, suffix)

        utils.feature_importance_regression(model, feature_names, name, suffix)
        utils.permutation_importances(model, features, labels, feature_names,
                                      name, suffix)
        print('#' * 50)
Example #19
#Lasso
lasso = Lasso(alpha=0.0005, random_state=1)
print("Lasso score: {:.4f} \n".format(cv_rmse(lasso).mean()))

alphas_alt = [
    14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5, 10, 5
]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]

#Ridge
ridge = RidgeCV(alphas=alphas_alt, cv=kfolds)
print("Ridge score: {:.4f} \n".format(cv_rmse(ridge).mean()))

#ElasticNet
elasticnet = ElasticNetCV(cv=kfolds, alphas=e_alphas)
print("ElasticNet score: {:.4f} \n".format(cv_rmse(elasticnet).mean()))

#Svr
#svr = SVR()
#print(cv_rmse(svr).mean())

#XGBoost
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=3460,
                       max_depth=3,
                       min_child_weight=0,
                       gamma=0,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
features = features.dropna(axis=1)

alpha_values = []
for a in range(1, 10001):
    alpha_values.append(a / 100)

print "Started at " + str(datetime.now())

estimator_ridge = RidgeCV(alphas=alpha_values, cv=3)
estimator_ridge.fit(features, goal)
scores = cross_val_score(Ridge(alpha=estimator_ridge.alpha_), features, goal, cv=5)
print "Ridge alpha " + str(estimator_ridge.alpha_)
print str(np.mean(scores))
print scores

estimator_lasso = LassoCV(alphas=alpha_values, cv=3)
estimator_lasso.fit(features, goal)
scores = cross_val_score(Lasso(alpha=estimator_lasso.alpha_), features, goal, cv=5)
print "Lasso alpha " + str(estimator_lasso.alpha_)
print str(np.mean(scores))
print scores


estimator_elastic_net = ElasticNetCV(alphas=alpha_values, cv=3, n_jobs=-1)
estimator_elastic_net.fit(features, goal)
scores = cross_val_score(ElasticNet(alpha=estimator_elastic_net.alpha_), features, goal, cv=5)
print "ElasticNet alpha " + str(estimator_elastic_net.alpha_)
print str(np.mean(scores))
print scores

print "Finished at " + str(datetime.now())
seed = sys.argv[1]
p = sys.argv[2]
corr = ('corr' if sys.argv[3] == 'TRUE' else '')
np.random.seed(int(seed))

Data = np.loadtxt('Data_' + p + 'p_' + corr + seed + '.csv', delimiter=',')
y = Data[:, 0][0:100]
X = Data[:, 1:][0:100, :]
y_test = Data[:, 0][100:200]
X_test = Data[:, 1:][100:200, :]

#----------------------------------------------------------------------------------------------------------------
# Elastic net with cross-validation over lambda and alpha.
#----------------------------------------------------------------------------------------------------------------
Output = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                      cv=10,
                      max_iter=100000)
Output.fit(X, y)

# MAP model.
m = (abs(Output.coef_) > 0)

# Covariates in the MAP model.
covariates = np.where(m == True)[0]

# Root mean squared error of the MAP model on the test set.
rmse = np.mean((y_test - X_test @ Output.coef_)**2)**.5

postProb = -99

items = pd.Series([covariates, postProb, rmse],
    # Now, we use 5 fold cross-validation to estimate generalization error
    kf = KFold(len(x), n_folds=5)
    p = np.zeros_like(y)
    for train, test in kf:
        met.fit(x[train], y[train])
        p[test] = met.predict(x[test])

    r2_cv = r2_score(y, p)
    print('Method: {}'.format(name))
    print('R2 on training: {}'.format(r2_train))
    print('R2 on 5-fold CV: {}'.format(r2_cv))
    print()

# Construct an ElasticNetCV object (use all available CPUs)
met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99])

kf = KFold(len(x), n_folds=5)
pred = np.zeros_like(y)
for train, test in kf:
    met.fit(x[train], y[train])
    pred[test] = met.predict(x[test])


print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(y, pred))))
print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(y, pred)))
print('')


'''
# unit version
#X = np.array(tfidf_array)
X = X_uni_bi_gram
y = np.array(engagement_rate)
print X

binary_y_pre = []

for i in range(len(y)):
	if y[i]>0: binary_y_pre.append(1)
	else: binary_y_pre.append(0)
binary_y = np.array(binary_y_pre)

coef_path_linear_cv = LinearRegression(normalize=Normalize,fit_intercept=Fit_Intercept) 
coef_path_lasso_cv = LassoCV(normalize=Normalize, max_iter=Max_Iter, copy_X=True, cv=CV, verbose=Verbose, fit_intercept=Fit_Intercept, tol=Tol)#, alphas=Alphas) 
coef_path_elastic_cv = ElasticNetCV(normalize=Normalize,max_iter=Max_Iter, tol=Tol)#,alphas=Alphas)
coef_path_logistic_cv = LogisticRegression( tol=Tol)
coef_path_binary_x_logistic_cv = LogisticRegression( tol=Tol)
coef_path_forest_cv = RandomForestClassifier(n_estimators = N_Estimators, max_features=number_of_features)

binary_X = vectorizer_binary.fit_transform(corpus)
coef_path_forest_cv.fit(X,binary_y)
coef_path_lasso_cv.fit(X,y)
coef_path_binary_x_logistic_cv.fit(binary_X,binary_y)
coef_path_logistic_cv.fit(X,binary_y)
coef_path_elastic_cv.fit(X,y)

forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
logistic_cv_score = cross_validation.cross_val_score(coef_path_logistic_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
Example #24
                                            min_samples_leaf=5),
                      random_state=13,
                      n_estimators=17), "AdaBoostAuto")
build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
           "DecisionTreeAuto",
           compact=False)
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                           min_samples_leaf=5),
                     random_state=13,
                     n_estimators=3,
                     max_features=0.5), "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy="median"), "DummyAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
build_auto(HuberRegressor(), "HuberAuto")
build_auto(LarsCV(), "LarsAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LassoLarsCV(), "LassoLarsAuto")
build_auto(
    OptimalLGBMRegressor(objective="regression",
                         n_estimators=17,
                         num_iteration=11), "LGBMAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
    BaggingRegressor(LinearRegression(), random_state=13, max_features=0.75),
l1_ratio=0.7

enet = ElasticNet(alpha = alpha, l1_ratio = l1_ratio)
enet_model = enet.fit(X_train, y_train)
y_pred_enet = enet_model.predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)

print(enet)
print("r^2 on test data : %f" % r2_score_enet)
# r^2 on test data : 0.100723

# plt.plot(enet.coef_, label='Elastic net coefficients')
# plt.plot(coef, '--', label='original coefficients')
# plt.legend(loc='best')
# plt.title("R^2: %f" % (r2_score_enet))
# plt.show()

# set the parameters alpha and l1_ratio by cross-validation
from sklearn.linear_model import ElasticNetCV

enetcv = ElasticNetCV(l1_ratio=[.1,.2,.3,.4,.5,.6,.7,.8,.9])
enetcv_model = enetcv.fit(X_train, y_train)
y_pred_enetcv = enetcv_model.predict(X_test)
r2_score_enetcv = r2_score(y_test, y_pred_enetcv)

print(enetcv)
print("r^2 on test data : %f" % r2_score_enetcv)
# r^2 on test data : 0.22553

assert(r2_score_enetcv > r2_score_enet)
    def fit(self, raw_array, aux_data_a_d=None, diff=False, feature_s_l=[], holdout_col=0, lag=1, positive_control=False, regression_algorithm_s = 'elastic_net', **kwargs):
        """ Performs an auto-regression of a given lag on the input array. Axis 0 indexes observations (schools) and axis 1 indexes years. For holdout_col>0, the last holdout_col years of data will be withheld from the fitting, which is ideal for training the algorithm. """

        # Apply optional parameters
        if holdout_col > 0:
            raw_array = raw_array[:, :-holdout_col]
        if diff:
            array = np.diff(raw_array, 1, axis=1)
        else:
            array = raw_array

        # Create model and fit parameters
        Y = array[:, lag:].reshape(-1)
        X = np.ndarray((Y.shape[0], 0))
        for i in range(lag):
            X = np.concatenate((X, array[:, i:-lag+i].reshape(-1, 1)), axis=1)
            # Y = X_t = A_1 * X_(t-lag) + A_2 * X_(t-lag+1)) + ... + A_lag * X_(t-1) + A_(lag+1)
        if positive_control:
            X = np.concatenate((X, array[:, lag:].reshape(-1, 1)), axis=1)
        if aux_data_a_d:
            for feature_s in feature_s_l:
                if holdout_col > 0:
                    raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
                else:
                    raw_array = aux_data_a_d[feature_s]
                if diff:
                    array = np.diff(raw_array, 1, axis=1)
                else:
                    array = raw_array
                for i in range(lag):
                    X = np.concatenate((X, array[:, i:-lag+i].reshape(-1, 1)), axis=1)
        estimatorX = Imputer(axis=0)
        X = estimatorX.fit_transform(X)
        estimatorY = Imputer(axis=0)
        Y = estimatorY.fit_transform(Y.reshape(-1, 1)).reshape(-1)

        if regression_algorithm_s == 'elastic_net':
            l1_ratio_l = [.1, .5, .7, .9, .95, .99, 1]
            alpha_l = np.logspace(-15, 5, num=11).tolist()
            max_iter = 1e5
            # It's too slow when I make it high, so I'll keep it low for now
            model = ElasticNetCV(l1_ratio=l1_ratio_l, alphas=alpha_l, max_iter=max_iter,
                                 fit_intercept=True, normalize=True)
        elif regression_algorithm_s == 'gaussian_process':
            model = GaussianProcess()
            # This currently gives the following error: "Exception: Multiple input features cannot have the same target value."
        elif regression_algorithm_s == 'gradient_boosting':
            model = GradientBoostingRegressor(max_features='sqrt')
        elif regression_algorithm_s == 'linear_regression':
            model = LinearRegression(fit_intercept=True, normalize=True)
        elif regression_algorithm_s == 'random_forest':
            model = RandomForestRegressor(max_features='auto')
        model.fit(X, Y)
        if regression_algorithm_s in ['elastic_net', 'linear_regression']:
            with open(os.path.join(config.plot_path, 'coeff_list.txt'), 'a') as f:
                f.write('Lag of {0:d}:\n'.format(lag))
#                f.write('\nElastic net: R^2 = %0.5f, l1_ratio = %0.2f, alpha = %0.1g' %
#                      (model.score(X, Y), model.l1_ratio_, model.alpha_))
                coeff_t = model.coef_
                assert(not positive_control) # The coefficients won't currently line up
                for i_lag in range(lag):
                    f.write('\ti_lag = {0:d}: {1:0.2g}\n'.format(lag-i_lag, coeff_t[i_lag]))
                for i_feature, feature_s in enumerate(feature_s_l):
                    for i_lag in range(lag):
                        f.write('\t{0}:\n\t\ti_lag = {1:d}: {2:0.2g}\n'.format(feature_s, lag-i_lag, coeff_t[lag*(i_feature+1) + i_lag]))

        return model
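The commented model equation above (X_t regressed on the preceding lag values) is easiest to see on a toy array. A small standalone sketch of the same lag-matrix construction, not part of the original class:

import numpy as np

# Toy check: 2 schools, 5 years, lag = 2.
array = np.arange(10).reshape(2, 5)
lag = 2
Y = array[:, lag:].reshape(-1)                 # targets X_t for t = lag .. T-1
X = np.ndarray((Y.shape[0], 0))
for i in range(lag):
    X = np.concatenate((X, array[:, i:-lag + i].reshape(-1, 1)), axis=1)
# Row j of X holds the lag values preceding Y[j]; e.g. Y[0] == array[0, 2]
# and X[0] == [array[0, 0], array[0, 1]].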
示例#27
0
alphas = np.random.uniform(low=0, high=10, size=(50, ))
ridgecv = RidgeCV(alphas=alphas, cv=10, normalize=True)
ridgecv.fit(x_train, y_train)

ridgecv.alpha_

ridge_model = Ridge(alpha=ridgecv.alpha_)
ridge_model.fit(x_train, y_train)

ridge_model.score(x_test, y_test)

# we got the same r2 score using Ridge regression as well, so it's safe to say there is no overfitting.

# Elastic net

elasticCV = ElasticNetCV(alphas=None, cv=10)

elasticCV.fit(x_train, y_train)

elasticCV.alpha_

# l1_ratio gives how close the model is to L1 regularization; the value below
# indicates we are giving equal preference to L1 and L2
elasticCV.l1_ratio

elasticnet_reg = ElasticNet(alpha=elasticCV.alpha_, l1_ratio=0.5)
elasticnet_reg.fit(x_train, y_train)

elasticnet_reg.score(x_test, y_test)

# So, we can see that with different types of regularization we still get the same r2 score. That means our OLS model has been well trained over the training data and there is no overfitting.
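Note that elasticCV.l1_ratio above reads back the constructor parameter (0.5 by default), not a cross-validated choice. When a list of ratios is passed, ElasticNetCV also selects the mixing ratio and exposes it as l1_ratio_ after fitting; a minimal sketch reusing x_train and y_train from above:

from sklearn.linear_model import ElasticNetCV

# Cross-validate over the mixing ratio as well as alpha; the chosen values
# appear as fitted attributes (note the trailing underscores).
encv = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=10)
encv.fit(x_train, y_train)
print(encv.alpha_, encv.l1_ratio_)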
sales1 = sales.reset_index()
brands_new1 = brands_new.reset_index()
sales_merged = sales1.merge(brands_new1, how='left', on=['index', 'Brand_Family', 'SubBrand_Family'])
sales_merged = sales_merged.set_index(['index'])
print('Merged')
# Brands with no recorded affinity were assigned the "average" affinity. This may have degraded the dataset somewhat.
sales_merged.Affinity = sales_merged.Affinity.fillna(sales_merged.Affinity.mean())
sales_merged['Brand Character']= sales_merged['Brand Character'].fillna(sales_merged['Brand Character'].mean())
sales_merged['Functional Performance'] = sales_merged['Functional Performance'].fillna(sales_merged['Functional Performance'].mean())


# I was running short on time here; I tried to build at least some model, but it came out quite poor.
from sklearn.cross_validation import KFold
from sklearn.linear_model import ElasticNetCV
met = ElasticNetCV()

features = sales_merged[['PMI_Portfolio_AVB_Boost', 'PMI_Portfolio_PFP_Boost',
       'PMI_Portfolio_PPRP', 'PMI_Portfolio_SA', 'SubFam_Hostess',
       'SubFam_PFP_Boost', 'SubFam_RAP', 'SubFam_SA', 'Fam_AVB_Boost',
       'Fam_Hostess', 'Fam_PFP_Boost', 'Fam_RAP', 't', 'Affinity',
       'Brand Character', 'Functional Performance']].as_matrix()
target = sales_merged['Volume_Sales'].as_matrix()

met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99])

kf = KFold(len(target), n_folds=5)
pred = np.zeros_like(target)
for train, test in kf:
    met.fit(features[train], target[train])
    pred[test] = met.predict(features[test])
plt.axis('tight')
plt.show()

#RidgeCV
from sklearn.linear_model import RidgeCV
model = RidgeCV(cv=20)
model_ridge = model.fit(ratings_ext_input_sim2[X_features], ratings_ext_input_sim2['rating'])
rating_predicted = model_ridge.predict(ratings_ext_input_sim2[X_features])
error = (rating_predicted - ratings_ext_input_sim2['rating'])
np.mean(error*error) #  4.77 (0.633 good?)
score=model_ridge.score(ratings_ext_input_sim2[X_features], ratings_ext_input_sim2['rating'])
model_ridge.coef_

# Elastic Net
from sklearn.linear_model import ElasticNetCV
enet = ElasticNetCV(l1_ratio=0.5,cv = 10) # 1 for LASSO
model_enet = enet.fit(ratings_ext_input_sim2[X_features], ratings_ext_input_sim2['rating'])
rating_predicted = model_enet.predict(ratings_ext_input_sim2[X_features])
error = (rating_predicted - ratings_ext_input_sim2['rating'])
np.mean(error*error)  # 4.168
# alpha = 1, l1_ratio = 0: very high 4.67
# alpha = 0.1, l1_ratio = 0: very high 4.57
# alpha = 0.5, l1_ratio = 0: very high 4.64
# alpha = 0.7, l1_ratio = 0: very high 4.65
from sklearn.linear_model import lasso_path, enet_path
model_enet.mse_path_
plt.figure(1)
ax = plt.gca()
ax.set_color_cycle(2 * ['b', 'r', 'g', 'c', 'k'])
#l1 = plt.plot(-np.log10(alphas_lasso), coefs_lasso.T)
l1 = plt.plot(-np.log10(model_enet.alphas_), model_enet.coef_, linestyle='--')
Example #30
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: -114.8406727584057
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    ElasticNetCV(l1_ratio=0.6000000000000001, tol=0.01))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #31
#times
X_train,X_test,Y_train,Y_test = train_test_split(alldata,newY,test_size=0.3)

#frequencies
X_train,X_test,Y_train,Y_test = train_test_split(allfreqdata,newY,test_size=0.3)

svr = SVR(cache_size=1500)
svr_params = { 'C' : [1e-2,1,1e2] , 'epsilon' : [1e-3,1e-2,1e-1]  }

#fit without transforms 0.009
#fit with kld 0.017

#test with newy hier. interc.
#takes looong

enet_cv = ElasticNetCV(l1_ratio=[0.1,0.3,0.5,0.7,0.9],max_iter=2000)
enet_cv.fit(X_tr_new,Y_train)

rcv = RidgeCV(alphas=[1e-2,1e-1,1,10])
#rcv.fit(X_train,Y_train)

svr_gs = GridSearchCV(svr,svr_params,verbose=1,n_jobs=-1)
#svr_gs.fit(X_train,Y_train)


#%%
#visualization of posterior ERPs averaged over Pbs and epochs
#for chan Fz
posteriors = np.unique(np.round(bc_dict["01"],decimals=2))

avr_ERP_p_post_list = [get_average_ERPs_per_posterior(mat_dict[k],bc_dict[k],chan=4) for k in sorted(mat_dict.keys())] 
    
#### assessing performance of the negative binomial regression model
performance_negativebinomial = []
for x in [0.01,0.1,1,5,10]:
    cost = []
    for a,b in cross_validation_object:
        resultingmodel = sm.NegativeBinomial(Y[a],X[a],loglike_method = 'geometric')
        #res = resultingmodel.fit(disp=False, maxiter = 200)
        res2 = resultingmodel.fit_regularized(alpha = x, maxiter = 200)
        cost.append(mean_squared_error(res2.predict(X[b]), Y[b]))
    performance_negativebinomial.append(np.mean(cost))


##### Log linear model ########## not even close. 
from sklearn.linear_model import ElasticNetCV
linear_fit = ElasticNetCV(cv = cross_validation_object, alphas = [0.01,0.1,1,5,10])
linear_fit.fit(X,np.log(Y+1))
mean_squared_error(np.exp(linear_fit.predict(X)) - 1, Y)


########## creating final model using train data + test data


X_test,Y_test,junk = prepare_for_model('Dogs_Final_Test.csv',1)
X,Y,junk = prepare_for_model('Dogs_Final_Train.csv',1)
scaler = MinMaxScaler([0,1])
X_all = scaler.fit_transform(np.vstack((X_test,X)))
Y_all = np.hstack((Y_test,Y))
Y_all = np.array([30 if i > 30 else i for i in Y_all])
final_model = sm.NegativeBinomial(Y_all,X_all,loglike_method = 'geometric')
res2 = final_model.fit_regularized( alpha = 5, maxiter = 200)
Example #33
                                  warm_start=True)

    if (method == 5):
        print('Random forest 02')
        str_method = 'RandomForest02'
        r = RandomForestRegressor(n_estimators=90,
                                  max_depth=4,
                                  n_jobs=-1,
                                  random_state=ra1,
                                  verbose=0,
                                  warm_start=True)

    if (method == 6):
        print('ElasticNet')
        str_method = 'Elastic Net'
        r = ElasticNetCV()

    if (method == 7):
        print('GradientBoosting 01')
        str_method = 'GradientBoosting01'
        r = GradientBoostingRegressor(n_estimators=80,
                                      max_depth=5,
                                      learning_rate=0.05,
                                      random_state=ra1,
                                      verbose=0,
                                      warm_start=True,
                                      subsample=0.6,
                                      max_features=0.6)
    if (method == 8):
        print('GradientBoosting 02')
        str_method = 'GradientBoosting02'
Example #34
X = scaler.transform(X)

#add intercept
X = np.hstack((np.ones(X.shape[0])[:,None],X))

train_X,test_X,train_Y,test_Y = train_test_split(X,y,test_size=0.1)


#%%
#try elastic net

#alpha equals lambda here
lambda_grid = [0.01, 0.1 , 1, 10,100]
l1_ratio_grid = [0.1,0.3,0.5,0.7,0.9]

enet_CV = ElasticNetCV(l1_ratio=l1_ratio_grid,alphas=lambda_grid,cv=3,n_jobs=-1,verbose=True)

enet_CV.fit(train_X,train_Y)

#%%
#show
enet_CV.score(test_X,test_Y)
plt.plot(enet_CV.predict(test_X),test_Y,'o')
#%%
#try svr

svr = SVR(kernel = 'rbf',C=1,cache_size=2000)

SVR_params = { 'C' : [1e-1,1.0,1e2,1e3,1e4] }
svr_rs = grid_search.RandomizedSearchCV(svr,SVR_params,verbose=True,n_jobs=-1)
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model.tests.test_sparse_coordinate_descent import make_sparse_data
from time import time
import pylab as pl 
import numpy as np

X, y = make_sparse_data(n_samples=500, n_features=2000, n_informative=200)
n_cores = [1, 2, 4]
n_alpha = [5, 10, 50, 100]
times = [0] * 12

counter = 0
for _ in range(3):
    for core in n_cores:
        for alpha in n_alpha:
            clf = ElasticNetCV(n_jobs=core, n_alphas=alpha,
                               l1_ratio=0.5, cv=10)
            print("core = %d, alpha = %d" % (core, alpha))
            t = time()
            clf.fit(X, y)
            times[counter % 12] += (time() - t)
            print(times)
            counter += 1

# Got after doing the above. Just for future reference.
core1_mp = [57.457534631093345, 72.31527137756348, 210.2204163869222, 379.9918119907379]
core2_mp = [55.89718206723531, 51.196732918421425, 138.35079900423685, 239.67310031255087]
core3_mp = [42.53018967310587, 49.97517212231954, 122.26631005605061, 204.76643363634744]

core1_t = [60.99967805544535, 75.41305232048035, 219.61244002978006, 390.601344982783]
core2_t = [46.21716833114624, 54.701584259668984, 144.06910300254822, 242.6696043809255]
core3_t = [43.21849703788757, 49.07820804913839, 122.74103697141011, 205.75086871782938]
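The script imports pylab but the plotting code is not shown. A minimal sketch of one way to chart the recorded multiprocessing timings (assuming the core3_* arrays are the 4-core runs, matching n_cores above):

import matplotlib.pyplot as plt

# Fit time vs. number of alphas for each core count recorded above.
for label, series in [("1 core", core1_mp), ("2 cores", core2_mp),
                      ("4 cores", core3_mp)]:
    plt.plot(n_alpha, series, marker="o", label=label)
plt.xlabel("n_alphas")
plt.ylabel("total fit time (s)")
plt.legend()
plt.show()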
Example #36
# check maybe 10 kfolds would be better
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

# Kernel Ridge Regression : made robust to outliers
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

# LASSO Regression : made robust to outliers
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=1e7, alphas=alphas2, random_state=14, cv=kfolds))

# Elastic Net Regression : made robust to outliers
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))

# Gradient Boosting for regression
gboost = GradientBoostingRegressor(n_estimators=3000,
                                   learning_rate=0.05,
                                   max_depth=4,
                                   max_features='sqrt',
                                   min_samples_leaf=15,
                                   min_samples_split=10,
                                   loss='huber',
                                   random_state=5)

# LightGBM regressor.
lgbm = lgb.LGBMRegressor(objective='regression',
                         num_leaves=4,
                         learning_rate=0.01,
md=dnn_reg(X_train,y_train,X_test,y_test)
reg_eval(X_test,y_test,md)

###Lasso CV regression

def reg_eval2(y_test,model):
    y_pred=model.predict(X_test)
    print("evaluation the results for model:",model)
    print("MSE:",mean_squared_error(y_test,y_pred))
    print("R2:",r2_score(y_test,y_pred))
    print("EVS:",explained_variance_score(y_test,y_pred))

lasso = LassoCV(cv=5, random_state=0,max_iter=10000)
lasso.fit(X_train,y_train)
reg_eval2(y_test,lasso)

#ElasticNet Regressionb
ela = ElasticNetCV(l1_ratio=0.8,normalize=True,max_iter=5000,random_state=77)
ela.fit(X_train,y_train)
print("R square:",ela.score(X_test,y_test))
reg_eval2(y_test,ela)


#SVR Regression
from sklearn.svm import LinearSVR
LSVR=LinearSVR(epsilon=0.1,random_state=0, tol=1e-5,max_iter=10000)
# scaler=RobustScaler()
# pipe=Pipeline(steps=[("scaling",scaler),("rg",LSVR)])
LSVR.fit(X_train,y_train)
reg_eval2(y_test, LSVR)
Example #38

#%%
interesting_ones = ['G13','G14','G15','G19','G21']



r2_mfcc = []
r2_stft = []
for chan in interesting_ones:
    y = Y[:,electrode_names.index(chan)]
    train_X,test_X,train_Y,test_Y = train_test_split(np.hstack([mfcc_X,X]),y,test_size=0.3)
    mfcctrain_X = train_X[:,:325]
    train_X = train_X[:,325:]
    l1_ratio_grid = [0.1,0.3,0.5,0.7,0.9]
    enet_CV = ElasticNetCV(l1_ratio=l1_ratio_grid,n_jobs=-1,verbose=True)
    enet_CV.fit(train_X,train_Y)
    r2_stft.append(enet_CV.score(test_X[:,325:],test_Y))
    enet_CV.fit(mfcctrain_X,train_Y)
    r2_mfcc.append(enet_CV.score(test_X[:,:325],test_Y))


#%%

# for standardizing in lagged stimuli space
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

#add intercept
#X = np.hstack((np.ones(X.shape[0])[:,None],X))
Example #39
def do_validation(data_path, steps=10):
    allfiles = initialize(data_path)
    gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, min_samples_leaf=5, subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    svr = SVR(kernel="linear", probability=True)
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)

    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    for i in range(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y        
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))

        train = df[:100]
        label = train['label']
        del train['label']

        test = df[100:400]
        Y = test['label']
        del test['label']

        #to_drop = ['driver', 'trip', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'speed6', 'speed7', 'speed8', 'speed9', 
        #        'speed10', 'speed11', 'speed12', 'speed13', 'speed14', 'speed15', 'speed16', 'speed17', 'speed18', 'speed19', 
        #        'speed20', 'speed21', 'speed22', 'speed23', 'speed24', 'speed25', 'speed26', 'speed27', 'speed28', 'speed29', 
        #        'speed30', 'speed31', 'speed32', 'speed33', 'speed34', 'speed35', 'speed36', 'speed37', 'speed38', 'speed39', 
        #        'speed40', 'speed41', 'speed42', 'speed43', 'speed44', 'speed45', 'speed46', 'speed47', 'speed48', 'speed49', 
        #        'speed50', 'speed51', 'speed52', 'speed53', 'speed54', 'speed55', 'speed56', 'speed57', 'speed58', 'speed59', 
        #        'speed60', 'speed61', 'speed62', 'speed63', 'speed64', 'speed65', 'speed66', 'speed67', 'speed68', 'speed69', 
        #        'speed70', 'speed71', 'speed72', 'speed73', 'speed74', 'speed75', 'speed76', 'speed77', 'speed78', 'speed79', 'speed80']
        to_drop = ['driver', 'trip']

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)
        
        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr) 
        
        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)
    
        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)
        
        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)
        
        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)

        # Linear models.
        to_drop = ['driver', 'trip', 'distance', 'sd_acceleration', 'final_angle', 'mean_acceleration', 'mean_avg_speed', 'sd_inst_speed',
                'sd_avg_speed', 'mean_inst_speed', 'points']

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)
        
        logit.fit(X_train, label)
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)

        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)
        
        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)

        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)

        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)

        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)

    print ""
    print "GBM:", gbm_metrics/steps
    print "AdaBoost:", ada_metrics/steps
    print "Extra Trees:", etree_metrics/steps
    print "RF:", rf_metrics/steps
    print "KN:", kn_metrics/steps
    print ""
    print "Logit:", logit_metrics/steps
    print "SVR:", svr_metrics/steps
    print "Ridge:", ridge_metrics/steps
    print "BayesianRidge:", bridge_metrics/steps
    print "Elastic Net:", enet_metrics/steps
    print "Neural Networks:", nnet_metrics/steps
    print ""
Example #40
            ('Poly', PolynomialFeatures(include_bias=True)),
            # alpha sets the weight of the L2 penalty in the Ridge model
            # alphas gives the candidate alpha values to search during cross-validation
            ('Linear', RidgeCV(alphas=np.logspace(-3, 2, 50), fit_intercept = False))
        ]),
        Pipeline([
            ('Poly', PolynomialFeatures(include_bias=True)),
            ('Linear',
                LassoCV(alphas=np.logspace(0, 1, 10), fit_intercept=False))
        ]),
        Pipeline([
            ('Poly', PolynomialFeatures(include_bias=True)),
            # l1_ratio: the proportion of the L1 penalty within the ElasticNet penalty; a list here,
            # giving the candidate L1 weight ratios to search during cross-validation
            ('Linear',
                ElasticNetCV(alphas=np.logspace(0, 1, 10), l1_ratio=[.1, .5, .7, .9, .95, 1], fit_intercept=False))
        ])
    ]

    # Visual check for overfitting of the linear models
    plt.figure(facecolor='w')
    degree = np.arange(1, N, 4)
    dm = degree.size
    colors = []
    for c in np.linspace(16711680, 255, dm):
        colors.append('#%06x' % int(c))
    model = models[0]
    for i,d in enumerate(degree):
        plt.subplot(int(np.ceil(dm/2)), 2, i+1)
        plt.plot(x, y, 'ro', ms=10, zorder=N)
        model.set_params(Poly__degree=d)