import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: -0.0036575106543468203
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=RandomForestRegressor(bootstrap=False,
                                                      max_features=0.25,
                                                      min_samples_leaf=1,
                                                      min_samples_split=2,
                                                      n_estimators=100)),
    ElasticNetCV(l1_ratio=0.9, tol=0.01))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #2
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler, StandardScaler

# Scale the data with RobustScaler to minimise outlier influence
# (approx. 4% of the data are significant outliers as measured by Cook's distance)
rb_scaler = RobustScaler()
X_scaled = pd.DataFrame(rb_scaler.fit_transform(X), columns=X.columns)

std_scaler = StandardScaler()
X_standard = pd.DataFrame(std_scaler.fit_transform(X), columns=X.columns)

## Define CV Root Mean Square Error ##
def cv_rmse(estimator, X, y, cv=5):
    rmse = np.mean(np.sqrt(-cross_val_score(estimator, X, y, cv=cv, scoring="neg_mean_squared_error")))
    return rmse

## Regression Models ##
# Elastic Net Regressor
elastic_reg = ElasticNetCV(cv=5, max_iter=15000)
# Lasso Model for Comparison
lasso_reg = LassoCV(cv=5, alphas=[0.011], max_iter=15000) # Previously Optimised

## Model Evaluation & Hyperparameter Tuning ##
# CV Root Mean Squared Error on Training Set (Robust Scaled)
cv_rmse(lasso_reg, X_scaled, np.ravel(y)) # LASSO: 0.319
cv_rmse(elastic_reg, X_scaled, np.ravel(y)) # Elastic Net (ratio = 0.5): 0.317

# CV Root Mean Squared Error on Training Set (Standardised)
cv_rmse(lasso_reg, X_standard, np.ravel(y)) # LASSO: 0.2992
cv_rmse(elastic_reg, X_standard, np.ravel(y)) # Elastic Net (ratio = 0.5): 0.3012


# Alpha Selection
alphas = np.logspace(-10, 1, 400)
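# The cell is truncated here; a hedged sketch of how this alpha grid could be
# used for selection (an assumption, reusing the standardised features above):
alpha_search = ElasticNetCV(alphas=alphas, l1_ratio=0.5, cv=5, max_iter=15000)
alpha_search.fit(X_standard, np.ravel(y))
print("Selected alpha:", alpha_search.alpha_)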
Example #3
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: -114.8406727584057
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    ElasticNetCV(l1_ratio=0.6000000000000001, tol=0.01))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Likely imports for this excerpt (assumptions: pj is os.path.join and the
# fitted pipelines were saved with joblib)
import numpy as np
import pandas as pd
from itertools import product
from os.path import join as pj
from joblib import load
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# Loading data
data_train = pd.read_csv(pj(
    data_dir, 'Training_Set_YESregressBYeTIVifCorr_LogScaled_combat_SVA.txt'),
                         header=0,
                         sep='\t')
#Extracting numerical features as NumPy array
feats = data_train.loc[:, 'lh_bankssts_area':'rh.Whole_hippocampus'].values
y = data_train['age_floor'].values
X = feats  #n_features = 954 n_samples = 2364

#Defining all the Regularization tools and Penalized Regression tools
alphas = np.arange(0.001, 10, 0.005)
lasso = LassoCV(alphas=alphas, max_iter=100000, cv=5)
ridge = RidgeCV(alphas=alphas, cv=5)
elnet = ElasticNetCV(alphas=alphas, max_iter=100000, cv=5)
regressors = [lasso, ridge, elnet]
scalers = [MinMaxScaler(), StandardScaler(), RobustScaler()]

#%% Loading Results
pipes = []
coefs = []
for sca, reg in product(scalers, regressors):
    pipe = Pipeline([('scaler', sca), ('regressor', reg)])
    filename = "{!s:.5}_{!s:.5}.joblib".format(pipe.named_steps['scaler'],
                                               pipe.named_steps['regressor'])
    pipe = load(pj(results_dir, filename))
    pipes.append(pipe)
    coefs.append(list(pipe.named_steps['regressor'].coef_))

#%% Computing scores for each feature
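# A hedged sketch of one way to tabulate the loaded coefficients per feature
# (the original cell is truncated here); assumes the feature names can be taken
# from the training frame loaded above.
feat_names = data_train.loc[:, 'lh_bankssts_area':'rh.Whole_hippocampus'].columns
coef_table = pd.DataFrame(
    coefs,
    columns=feat_names,
    index=["{}_{}".format(type(s).__name__, type(r).__name__)
           for s, r in product(scalers, regressors)])
# Features with the largest mean absolute coefficient across the nine pipelines
print(coef_table.abs().mean().sort_values(ascending=False).head())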
Example #5
# (the beginning of this print statement was lost in extraction; reconstructed
# with a generic message)
print("Training RMSE: %f"
      % np.sqrt(mean_squared_error(y_true=y, y_pred=lass_train_out)))


# In[59]:


result = pd.DataFrame()
result['Id'] = df_test['Id']
result['SalePrice'] = np.exp(lass_test_out)
result.to_csv('lass_result.csv',index=False)


# In[60]:


en = ElasticNetCV(alphas = [0.0001, 0.0003, 0.001, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
en.fit(X_train, y)
alpha = en.alpha_
print(alpha)

print("Try again for more precision with alphas centered around " + str(alpha))
en = ElasticNetCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85, 
                          alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                          alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4], 
                cv = 10)
en.fit(X_train, y)
alpha = en.alpha_
print("Best alpha :", alpha)


# In[61]:
Example #6
def get_model(X, y, X_sub):
    # Set up the parameters for k-fold cross-validation.
    kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

    # Define the root mean squared logarithmic error (RMSLE).
    # (The target y is already log-transformed, so the RMSE of the logged
    # values is the RMSLE of the raw prices.)
    def rmsle(y, y_pred):
        return np.sqrt(mean_squared_error(y, y_pred))

    # Create the model-scoring function used to rate each model's performance.
    # "cv" stands for cross-validation.
    def cv_rmse(model, X=X):
        rmse = np.sqrt(-cross_val_score(
            model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
        return (rmse)

    # Create the individual machine learning models (model declaration and parameter settings) - [start]
    alphas_alt = [
        14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5
    ]
    alphas2 = [
        5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008
    ]
    e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
    e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

    # Define the ridge regression model (L2 norm as the regularisation term; whether L1
    # or L2, the penalty is introduced to reduce the risk of overfitting).
    # Note: with an L2 penalty, no parameter is driven to exactly zero when the
    # optimisation stops, although some estimates become small enough to ignore;
    # L2 therefore keeps information from all variables and performs no PCA-like
    # variable selection.
    # Note: an L1 penalty yields a "sparse" solution more readily than L2,
    # i.e. its solution contains more zero components.
    ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt,
                                                  cv=kfolds))

    # Define the LASSO shrinkage model (L1 norm as the regularisation term). Because
    # many components of its solution are exactly zero, it is also called a
    # shrinkage model.
    # (The same notes on L1 vs. L2 penalties as for the ridge model above apply.)
    lasso = make_pipeline(
        RobustScaler(),
        LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))

    # Define the elastic net model (elastic net combines the strengths of ridge and
    # lasso by using both the L1 and L2 norms as regularisation terms).
    elasticnet = make_pipeline(
        RobustScaler(),
        ElasticNetCV(max_iter=1e7,
                     alphas=e_alphas,
                     cv=kfolds,
                     l1_ratio=e_l1ratio))

    # Define the SVR support vector regression model
    svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))

    # Define the gradient boosting (GB) model (expansion to first-order derivatives)
    gbr = GradientBoostingRegressor(n_estimators=3000,
                                    learning_rate=0.05,
                                    max_depth=4,
                                    max_features='sqrt',
                                    min_samples_leaf=15,
                                    min_samples_split=10,
                                    loss='huber',
                                    random_state=42)

    # Define the lightgbm model
    lightgbm = LGBMRegressor(
        objective='regression',
        num_leaves=4,
        learning_rate=0.01,
        n_estimators=5000,
        max_bin=200,
        bagging_fraction=0.75,
        bagging_freq=5,
        bagging_seed=7,
        feature_fraction=0.2,
        feature_fraction_seed=7,
        verbose=-1,
        # min_data_in_leaf=2,
        # min_sum_hessian_in_leaf=11
    )

    # Define the xgboost model (expansion to second-order derivatives)
    xgboost = XGBRegressor(
        learning_rate=0.01,
        n_estimators=3460,
        max_depth=3,
        min_child_weight=0,
        gamma=0,
        subsample=0.7,
        colsample_bytree=0.7,
        objective='reg:linear',
        nthread=-1,
        scale_pos_weight=1,
        seed=27,
        reg_alpha=0.00006,
        param={
            'objective': 'reg:squarederror',  # squared-error regression objective
            'tree_method': 'gpu_hist'  # use the GPU-accelerated algorithm
        })

    # Create the individual machine learning models (model declaration and parameter settings) - [end]

    # Combine multiple individual learners into an ensemble - [start]
    stack_gen = StackingCVRegressor(
        regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
        meta_regressor=xgboost,
        use_features_in_secondary=True)  # note: the svr model defined above is not included in regressors=(...)
    # Combine multiple individual learners into an ensemble - [end]

    # Cross-validation scoring - [start]
    # Run cross-validation and score each model's performance.
    # (Because this is cross-validation, each model is scored on several different
    # folds, so every model yields a sequence of scores; the mean and standard
    # deviation of that sequence are reported.)
    print('Running cross-validation to compute the TEST score on CV for each model')

    # Print the score of the ridge (L2) regression model
    score = cv_rmse(ridge)
    print(
        "Ridge (L2) regression score: {:.4f} ({:.4f})\n".format(score.mean(),
                                                                score.std()),
        datetime.now(),
    )

    # Print the score of the LASSO (L1) shrinkage model
    score = cv_rmse(lasso)
    print(
        "LASSO (L1) shrinkage model score: {:.4f} ({:.4f})\n".format(score.mean(),
                                                                     score.std()),
        datetime.now(),
    )

    # Print the score of the elastic net model
    score = cv_rmse(elasticnet)
    print(
        "Elastic net score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )

    # Print the score of the SVR support vector regression model
    score = cv_rmse(svr)
    print(
        "SVR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )

    # Print the score of the lightgbm model
    score = cv_rmse(lightgbm)
    print(
        "lightgbm score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )

    # Print the score of the gbr gradient boosting model
    score = cv_rmse(gbr)
    print(
        "gbr gradient boosting score: {:.4f} ({:.4f})\n".format(score.mean(),
                                                                score.std()),
        datetime.now(),
    )

    # Print the score of the xgboost model
    score = cv_rmse(xgboost)
    print(
        "xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )
    # Cross-validation scoring - [end]

    # Train each model on the full training data (the feature matrix X as input,
    # the log-transformed sale price y as output) - [start]
    # Fit every model, then combine them with stacking
    print('Training model parameters: START Fit')

    print(datetime.now(), 'Fitting the stack_gen stacking ensemble')
    stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

    print(datetime.now(), 'Fitting the elastic net model')
    elastic_model_full_data = elasticnet.fit(X, y)

    print(datetime.now(), 'Fitting the LASSO (L1) shrinkage model')
    lasso_model_full_data = lasso.fit(X, y)

    print(datetime.now(), 'Fitting the ridge (L2) regression model')
    ridge_model_full_data = ridge.fit(X, y)

    print(datetime.now(), 'Fitting the SVR support vector regression model')
    svr_model_full_data = svr.fit(X, y)

    print(datetime.now(), 'Fitting the GradientBoosting model')
    gbr_model_full_data = gbr.fit(X, y)

    print(datetime.now(), 'Fitting the xgboost (second-order) boosting model')
    xgb_model_full_data = xgboost.fit(X, y)

    print(datetime.now(), 'Fitting the lightgbm model')
    lgb_model_full_data = lightgbm.fit(X, y)

    # Train each model on the full training data - [end]

    # Cross-validation scoring - [end]

    # Define the function that blends the individual learners' predictions and
    # check how the blending strategy performs - [start]
    # The weighted combination of the individual predictions is used as the
    # prediction of the combined learner
    def blend_models_predict(X):
        return ((0.05 * elastic_model_full_data.predict(X)) +
                (0.05 * lasso_model_full_data.predict(X)) +
                (0.05 * ridge_model_full_data.predict(X)) +
                (0.05 * svr_model_full_data.predict(X)) +
                (0.05 * gbr_model_full_data.predict(X)) +
                (0.3 * xgb_model_full_data.predict(X)) +
                (0.15 * lgb_model_full_data.predict(X)) +
                (0.3 * stack_gen_model.predict(np.array(X))))

    # Print the RMSLE of the combined learner under the model weights above.
    # Cross-validation on the training data tunes each model's configuration;
    # after that, the fitted blend is evaluated on the full training data to
    # measure its reconstruction error.
    print('RMSLE score on train data for the blended model:')
    print(rmsle(y, blend_models_predict(X)))
    # Define the prediction-blending function and check its effect - [end]

    # Feed the test-set feature matrix to the trained models and write the output
    # into the second column of the submission .csv - [start]
    # print('Predict submission with the test-set features', datetime.now(), )
    # submission = pd.read_csv("../data/sample_submission.csv")
    # # Note: .iloc[:, 1] selects by positional index, [index1:index2], left-closed / right-open.
    # submission.iloc[:, 1] = np.floor(np.expm1(blend_models_predict(X_sub)))
    # Feed the test-set feature matrix to the trained models and write the output - [end]

    return np.floor(np.expm1(blend_models_predict(X_sub)))
Example #7
File: HAPLEXR.py  Project: rapturous/HAPLEX
def predict_holdout(gene):
    try:
        exppheno = expression_phenotypes.loc[gene].values

        # Select cis-snps and corresponding dosages
        gene_annotation = gene_annotations.loc[gene]
        gene_chr = gene_annotation["chr"]
        gene_start, gene_end = int(gene_annotation["start"]), int(gene_annotation["end"])
        ciswindow_start, ciswindow_end = gene_start - ciswindow_size, gene_end + ciswindow_size
        snp_annotations_chr = snp_annotations_by_chr[gene_chr]
        cissnp_annotations = snp_annotations_chr[snp_annotations_chr["pos"].between(ciswindow_start, ciswindow_end)]
        cissnps = list(cissnp_annotations["varID"])
        # Extract cissnp dosages
        cissnps_subset = list(set(cissnps).intersection(list(dosages.index)))
        cisgenos = dosages.loc[cissnps_subset,:] # TODO: Ensure all cissnps are in cisgenos (necessary?)
        # Extract cishaplo clusterings
        cissnps_haplo = []
        for cissnp in cissnps: cissnps_haplo += [cissnp+"_h0", cissnp+"_h1"]
        cissnps_haplo = [c for c in cissnps_haplo if c in clusterings.index]
        cisclusterings = clusterings.loc[cissnps_haplo,:]

        # Skip this gene if there are fewer than two variants
        if len(cisgenos) < 2:
            return [gene] + ["NA"]*(dosages.shape[1]//10), (gene, "NA", "NA", 0, 0, "CS<2")

        # Encode categorical clusterings with dummy variables
        for _ in range(0, len(cissnps_haplo), 2):    
            dummies = get_haplotype_cluster_dummies(cisclusterings.iloc[0:2,:].values)
            dummies_index = [cisclusterings.iloc[0:2,:].index[0].split("h0")[0]+"d"+str(i) for i in range(dummies.shape[0])]
            dummies_df = pd.DataFrame(dummies, index=dummies_index, columns=cisclusterings.columns)
            cisclusterings = pd.concat([cisclusterings, dummies_df])
            cisclusterings.drop(cisclusterings.index[0:2], axis=0, inplace=True)

        cisgenos = pd.concat([cisgenos, cisclusterings]).T

        # Separate cisgenos and expphenos into train and test sets
        cisgenos_train = cisgenos.drop(cisgenos.index[::10], axis=0)
        cisgenos_test = cisgenos.loc[cisgenos.index[::10]]
        exppheno_train = [Y for i,Y in enumerate(exppheno) if i%10 != 0]
        exppheno_test = [Y for i,Y in enumerate(exppheno) if i%10 == 0]

        # Fit model for this gene
        model = ElasticNetCV(l1_ratio=0.5, n_alphas=n_alphas, cv=n_folds, tol=tol, random_state=SEED, n_jobs=1)
        model.fit(cisgenos_train.values, exppheno_train)

        if all(coef==0 for coef in model.coef_): # NOTE: Near-zero betas are not captured here
            r_train = "NA"
            return [gene] + ["NA"]*(dosages.shape[1]//10), (gene, "NA", "NA", 0, 0, "NB")
        
        # Save regression coefficients
        pd.DataFrame(model.coef_, index=cisgenos.columns).T.to_csv("output/model{}/betas/{}/{}.txt".format(MODEL_NUM, tissue, gene),
                                                                   sep=" ", index=False)
        n_dosage_nonzero_betas = np.count_nonzero(model.coef_[:-len(cisclusterings)])
        n_clustering_nonzero_betas = np.count_nonzero(model.coef_[-len(cisclusterings):])
        
        r_train = pearsonr(exppheno_train, model.predict(cisgenos_train.values))[0]

        exppheno_pred = model.predict(cisgenos_test.values)
        r_test = pearsonr(exppheno_test, exppheno_pred)[0]

        return [gene] + list(exppheno_pred), (gene, r_train, r_test, n_dosage_nonzero_betas, n_clustering_nonzero_betas, "OK")
    except Exception:
        print("{}: unknown error".format(gene))
        return [gene] + ["NA"]*(dosages.shape[1]//10), (gene, "NA", "NA", "NA", "NA", "ERR")
Example #8
from sklearn.model_selection import cross_val_predict

X_train_stack = pd.DataFrame(columns=['econ_r', 'nlp_r'])
X_train_stack['econ_r'] = cross_val_predict(ridge_e, X_train_e, y_train, cv=10)
X_train_stack['nlp_r'] = cross_val_predict(ridge_n, X_train_n, y_train, cv=10)

X_test_stack = pd.DataFrame(columns=['econ_r', 'nlp_r'])
X_test_stack['econ_r'] = ridge_e.predict(X_test_e)
X_test_stack['nlp_r'] = ridge_n.predict(X_test_n)

X_train_stack.to_csv("Stack_train.csv")
X_test_stack.to_csv("Stack_test.csv")

from sklearn.linear_model import ElasticNetCV
stack = ElasticNetCV(cv=tscv)
stack.fit(X_train_stack, y_train)
cv_score = cross_val_score(stack, X_train_stack, y_train, cv=tscv, scoring=scorer)
stack_performance = {'Model': 'ElasticNetCV stack', 'MSE': np.mean(cv_score), 'SD': np.std(cv_score)}
stack_performance

mape(y_test, stack.predict(X_test_stack))


# In[81]:


coefs = stack.coef_
ridge_coefs = pd.DataFrame({'Coef': coefs,
                           'Name': list(X_train_stack.columns)})
ridge_coefs["abs"] = ridge_coefs.Coef.apply(np.abs)
Example #9
x.shape = -1, 1
y.shape = -1, 1

models = [
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear', LinearRegression(fit_intercept=False))]),
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear',
               RidgeCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False))]),
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear',
               LassoCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False))]),
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear',
               ElasticNetCV(alphas=np.logspace(-3, 2, 50),
                            l1_ratio=[.1, .5, .7, .9, .95, 1],
                            fit_intercept=False))])
]

plt.figure(facecolor='w')
degree = np.arange(1, N, 4)  # polynomial degrees to try
dm = degree.size
colors = []  # line colours
for c in np.linspace(16711680, 255, dm):
    colors.append('#%06x' % int(c))  # cast to int: '%x' rejects floats

model = models[0]
for i, d in enumerate(degree):
    plt.subplot(int(np.ceil(dm / 2.0)), 2, i + 1)
    plt.plot(x, y, 'ro', ms=10, zorder=N)
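    # The original loop body is truncated here; a hedged continuation (an
    # assumption, not the original code) would set the polynomial degree,
    # fit the pipeline, and overlay the fitted curve.
    model.set_params(Poly__degree=d)
    model.fit(x, y.ravel())
    x_hat = np.linspace(x.min(), x.max(), 100).reshape(-1, 1)
    y_hat = model.predict(x_hat)
    plt.plot(x_hat, y_hat, color=colors[i], lw=2, label='degree %d' % d)
    plt.legend(loc='upper left')
plt.tight_layout()
plt.show()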
Example #10
def EAS_VAR(scrY, scrX, steps, burnin, po=None, d=None, N=None, weights=None):

    p, n = scrY.shape
    N = (200 if N is None else N)

    # By default, assume the least restrictive sparsity for po, given fixed n
    # and p. Note that this is not compatible with asymptotic considerations,
    # but suffices for finite samples.
    po = (min(p**2, n) if po is None else po)

    Y = (scrY[:, ].flatten('F')).reshape((n * p, 1))
    vecX = (scrX[:, ].flatten('F')).reshape((n * p, 1))
    scrZ = sparse.csr_matrix(np.kron(np.transpose(scrX), np.eye(p)))

    # Jacobian columns corresponding only to nonzero A.
    Ip = np.eye(p)
    scrB_s = sparse.csr_matrix((n * p, 0))
    scrB_A = sparse.csr_matrix((n * p, 0))
    for k in range(p):
        for l in range(p):
            coef_mat = linalg.block_diag(
                *([Ip[:, k].reshape((p, 1)) @ Ip[:, l].reshape((1, p))] * n))
            scrB_A = sparse.hstack(
                [scrB_A, sparse.csr_matrix(coef_mat @ vecX)], format='csr')
            if l == k:
                scrB_s = sparse.hstack(
                    [scrB_s, sparse.csr_matrix(coef_mat)], format='csr')

    # If d or weights are not provided, compute default values using elastic net
    if d is None or weights is None:
        tscv = TimeSeriesSplit(n_splits=3)
        preprocess = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                                  cv=tscv,
                                  verbose=False,
                                  fit_intercept=False)
        #preprocess = RidgeCV(cv=tscv, fit_intercept=False)
        preprocess.fit(scrZ, Y.flatten())
        G_enet = np.where((preprocess.coef_)**2 > 0)[0]
        G_enet = (G_enet if len(G_enet) > 0 else [0])

        if d is None:
            m_G_enet = comp_m_G(scrY, scrX, Y, scrZ, G_enet, p)[0]
            # Set d equal to 10 percent of the minimum m_G component for the lasso
            # selected graph
            d = min(m_G_enet) * .1

        if weights is None:
            min_coef_sq = min((preprocess.coef_[G_enet])**2)
            weights = (preprocess.coef_)**2 + (min_coef_sq /
                                               10 if min_coef_sq > 0 else 1)

    # Starting graph
    G = [np.where(weights == np.max(weights))[0][0]]
    logf = log_mass_fun(scrY, scrX, Y, scrZ, scrB_s, scrB_A, G, n, p, N, po, d)

    # Begin MCMC ----------------------------------------------------------------
    AcceptRatio = 0
    for t in range(0, steps):

        # Propose a new graph
        AddVar = np.random.randint(0, 3)
        proposed_G, qratio = proposal_graph(AddVar, G, p, weights)

        # Compute/estimate the log mass of the proposed graph
        newlogf = log_mass_fun(scrY, scrX, Y, scrZ, scrB_s, scrB_A, proposed_G,
                               n, p, N, po, d)

        # Determine whether to accept or reject the proposed graph
        if np.log(np.random.uniform(0, 1)) < newlogf - logf + np.log(qratio):
            G = proposed_G
            logf = newlogf
            AcceptRatio = AcceptRatio + 1

        # Record the Markov Chain after the burn-in period ------------------------
        if t == burnin:
            graph = np.zeros(p**2, dtype=bool) * 1
            graph[G] = 1
            postSample = graph.reshape((1, p**2))
            postProbs = [1]
            chain = np.zeros((steps - burnin - 1, p**2), dtype=bool) * 1

        elif t > burnin:
            graph = np.zeros(p**2, dtype=bool) * 1
            graph[G] = 1
            chain[t - burnin - 1, :] = graph
            for k in range(postSample.shape[0]):
                if np.array_equal(graph, postSample[k, :]) == True:
                    postProbs[k] = postProbs[k] + 1
                    break

            if (k == (postSample.shape[0] - 1)
                    and np.array_equal(graph, postSample[k, :]) == False):
                postSample = np.append(postSample,
                                       graph.reshape((1, p**2)),
                                       axis=0)
                postProbs.append(1)

        #print(G)
        print('---> ', t, flush=True)

    postProbs = np.array(postProbs) / sum(postProbs)
    AcceptRatio = AcceptRatio / steps

    return pd.Series(
        [chain, postSample, postProbs, AcceptRatio, d],
        index=['chain', 'postSample', 'postProbs', 'AcceptRatio', 'd'])
def test_multioutput_enetcv_error():
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    y = rng.randn(10, 2)
    clf = ElasticNetCV()
    assert_raises(ValueError, clf.fit, X, y)
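# For reference, a multi-output target is handled by MultiTaskElasticNetCV rather
# than ElasticNetCV; a minimal sketch (not part of the original test module):
from sklearn.linear_model import MultiTaskElasticNetCV


def multioutput_enetcv_sketch():
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    y = rng.randn(10, 2)
    return MultiTaskElasticNetCV(cv=3).fit(X, y)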
Example #12
File: usermodel.py  Project: the07/ML
import numpy as np
from scipy import sparse
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import KFold

data = np.array([[int(tok) for tok in line.split('\t')[:3]]
                 for line in open('data/ml-100k/u.data')])
ij = data[:, :2]
ij -= 1  # original data is in 1-based system
values = data[:, 2]
reviews = sparse.csc_matrix((values, ij.T)).astype(float)

reg = ElasticNetCV(fit_intercept=True,
                   alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])


def movie_norm(xc):
    '''Normalize per movie'''
    xc = xc.copy().toarray()
    # x1 is the mean of the positive items
    x1 = np.array([xi[xi > 0].mean() for xi in xc])
    x1 = np.nan_to_num(x1)

    for i in range(xc.shape[0]):
        xc[i] -= (xc[i] > 0) * x1[i]
    return xc, x1


def learn_for(i):
    u = reviews[i]
    us = np.delete(np.arange(reviews.shape[0]), i)
Example #13
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=25)

# Average CV score on the training set was: -0.7849110533550745
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.1, tol=0.0001)),
    RandomForestRegressor(bootstrap=False,
                          max_features=0.3,
                          min_samples_leaf=8,
                          min_samples_split=8,
                          n_estimators=100))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 25)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #14
                       w_min=w_min)  # get the training set
        if window_type == "expanding":
            all_pred.append(linear_i(train, linear_func=linear_func))
        else:
            all_pred.append(
                np.mean([linear_i(x, linear_func=linear_func) for x in train]))

    return pd.Series(all_pred, index=out_months)


# Ridge regression with generalised cross-validation
ridge_pre_expanding = linear_prediction(RidgeCV())
# Lasso with 5-fold cross-validation
lasso_pre_expanding = linear_prediction(LassoCV(cv=5))
# Elastic net with 5-fold cross-validation
elasticnet_pre_expanding = linear_prediction(ElasticNetCV(cv=5))
# Least-angle regression (LARS) with 5-fold cross-validation
lars_pre_expanding = linear_prediction(LarsCV(cv=5))
# Orthogonal matching pursuit with 5-fold cross-validation
omp_pre_expanding = linear_prediction(OrthogonalMatchingPursuitCV(cv=5))

# Save the results (directory and file names are kept as in the original)
pd.DataFrame({
    "ridge": ridge_pre_expanding,
    "lasso": lasso_pre_expanding,
    "elastic net": elasticnet_pre_expanding,
    "lars": lars_pre_expanding,
    "OMP": omp_pre_expanding
}).to_csv(os.path.join(path, "预测结果", "expanding方式窗口预测.csv"))
linear_pre_expanding = pd.read_csv(os.path.join(path, "预测结果",
                                                "expanding方式窗口预测.csv"),
Example #15
Y_train = X_train['SalePrice_log']
X_train.drop(labels='SalePrice_log', axis=1, inplace=True)
X_test = pd.read_csv(data_dir + 'clean_test.csv')
# ----

#
# ─── BUILD MODEL ────────────────────────────────────────────────────────────────
#

# Build ElasticNet regressor,
# Optimise hyperparameters with CV
enr_cv = ElasticNetCV(
    n_jobs=-1,
    random_state=5,
    cv=5,
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
    fit_intercept=True,
    precompute='auto',
    copy_X=True,
    verbose=2)

enr_cv.fit(X_train, Y_train)
print(enr_cv.alpha_)
print(enr_cv.mse_path_)

#
# ─── CREATE .CSV FOR SUBMISSION ──────────────────────────────────────────────────
#

X_test['SalePrice'] = pd.DataFrame(
    enr_cv.predict(X_test.drop(labels='Id', axis=1))).apply(lambda x: 10**x)
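# A hedged sketch of the submission write-out (the original snippet stops here);
# the output file name is an assumption.
X_test[['Id', 'SalePrice']].to_csv('submission.csv', index=False)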
Example #16
def regression_stage1(dmi,
                      divs,
                      dtreatments,
                      gm_wbc,
                      dinfections,
                      pidmeta,
                      dw,
                      wbc_type='neutrophils'):
    """Stage 1 Elastic Net regression
    dmi: microbiota composition for intervals
    divs: microbiota diversity for intervals, contains sample additional sample info (pid)
    dtreatments: immunomodulatory medications and treatments administered during interval
    gm_wbc: geometric mean of absolute white blood cell counts during interval
    dinfections: positive blood cultures detected during interval
    pidmeta: patient and HCT meta data
    dw: daily change in WBC count, "y"
    wbc_type: WBC type considered
    """
    # identify patients with microbiota data (stage 2) to exclude in stage 1
    # by inner-joining with microbiota diversity table
    Xmi = pd.merge(divs, dw, on=["pid", "anon-date"], how="inner")
    mi_pids = Xmi.reset_index().pid.unique()
    # stage 1 feature selection regression (and stage 2 as regularized ML version instead of full Bayesian for internal checks)
    for MIINDICATOR in ["stage1", "stage2"]:
        X = pd.merge(
            dw,
            gm_wbc,
            on=["pid", "anon-date"],
            how="inner",
            suffixes=["", "_REMOVEgmwbc"])
        if MIINDICATOR == "stage2":
            # ML-version of the Bayesian model for stage 2, include microbiome
            X = pd.merge(
                X,
                dmi,
                on=["pid", "anon-date"],
                how="inner",
                suffixes=["", "_REMOVE_dmi"])
            X = pd.merge(
                X,
                divs.reset_index()[["pid", "anon-date", "inverseSimpson"]],
                on=["pid", "anon-date"],
                how="inner",
                suffixes=["", "_REMOVE_ivs"])
        X = pd.merge(
            X,
            dtreatments,
            on=["pid", "anon-date"],
            how="left",
            suffixes=["", "_REMOVEdreatments"])
        X = pd.merge(
            X,
            dinfections,
            on=["pid", "anon-date"],
            how="left",
            suffixes=["", "_REMOVEdinfections"])
        X = pd.merge(
            X,
            pidmeta,
            on=["pid", "n_bmt"],
            how="left",
            suffixes=["", "_REMOVEpidmeta"])
        X = X.loc[~X["log_%s" % wbc_type].isna()]
        X = X[[x for x in X.columns if "REMOVE" not in x]]
        X = X.drop(columns=["n_bmt"])
        X_columns = X.columns

        if MIINDICATOR == "stage1":
            # mi_pids: patients with microbiota data
            X = X.loc[X.pid.apply(lambda v: v not in mi_pids)]
        elif MIINDICATOR == "stage2":
            # ML-version of the Bayesian model for stage 2
            X = X.loc[X.pid.apply(lambda v: v in mi_pids)]

        # intercepts per transplant type
        X = X.join(pd.get_dummies(X["hct_source"]))
        # drop original HCT type column
        X = X.drop(columns=['hct_source', "PBSC"])  #PBSC as reference
        # intercepts per intensity
        X = X.join(pd.get_dummies(X["Intensity"].fillna("unknown")))
        X = X.drop(columns=["ABLATIVE", "unknown",
                            "Intensity"])  #ABLATIVE as reference
        # intercepts female
        X = X.join(pd.get_dummies(X.sex)['F'])
        X = X.drop(columns='sex')

        # only after engraftment and before day 100
        X = X.query('day>6 ').copy()  # day > 6: smallest observed engraftment day
        # only analyze daily changes
        X = X.query('dt == 1').copy()
        #
        # from join, fill gaps
        X.loc[:, dinfections.columns] = X[dinfections.columns].fillna(0)
        X.loc[:, dtreatments.columns] = X[dtreatments.columns].fillna(0)
        # missing patient ages, fill with mean for ML feature selection
        X["age"] = X["age"].fillna(X["age"].mean())

        ### drop columns
        # delta time columns
        dtcols = [x for x in X.columns if 'dt' in x]
        X = X.drop(columns=dtcols)
        # time point columns
        anoncols = [x for x in X.columns if 'anon' in x]
        X = X.drop(columns=anoncols)
        # patient id columns
        pidcols = [x for x in X.columns if ('pid' in x) and (x != 'pid')]
        remaining_pids = X.reset_index().pid.unique()
        print("pid count in regression", len(remaining_pids))
        X = X.drop(columns=pidcols)
        print("shape before dropna", X.shape)
        X = X.dropna()
        print("shape after dropna (should not change)", X.shape)

        # drop all zero columns
        drop_zero_columns = (X.sum() == 0) & (X.max() == 0)
        drop_zero_columns = drop_zero_columns.loc[
            drop_zero_columns.values].index
        X = X.drop(columns=drop_zero_columns)
        # drop HCT day
        daycolumns = [
            x for x in X.columns if ("day" in x) and (x not in ["day", "eday"])
        ]
        X.drop(columns=daycolumns, inplace=True)

        #### data transformations and standardizations
        from sklearn.preprocessing import StandardScaler
        X.loc[:, [
            'neutrophils', 'lymphocytes', 'monocytes', 'eosinophils',
            'platelets'
        ]] = np.log10(X[[
            'neutrophils', 'lymphocytes', 'monocytes', 'eosinophils',
            'platelets'
        ]] + 0.1 / 2)
        from sklearn.linear_model import ElasticNetCV
        # Fit and summarize OLS model
        # CV on patient sub samples, X contains 'pid' column
        Xpid = X.copy()
        X = X.drop(columns=['pid'])
        _drug_cols = [x for x in X.columns if x in dtreatments.columns]
        _other_cols = [x for x in X.columns if x not in dtreatments.columns]
        groups = Xpid.pid
        from sklearn.model_selection import GroupKFold
        group_kfold = GroupKFold(n_splits=10)
        cv = list(
            group_kfold.split(
                X.drop(columns='log_%s' % wbc_type),
                X['log_%s' % wbc_type],
                groups))
        mod = ElasticNetCV(
            cv=cv, positive=False, normalize=False, fit_intercept=True)
        res = mod.fit(
            MinMaxScaler().fit_transform(X.drop(columns='log_%s' % wbc_type)),
            X['log_%s' % wbc_type],
        )
        r2 = mod.score(
            MinMaxScaler().fit_transform(X.drop(columns='log_%s' % wbc_type)),
            X['log_%s' % wbc_type],
        )
        print('chosen alpha: %f' % mod.alpha_)
        print("R2 = %f" % r2)
        coefs = pd.Series(
            res.coef_,
            index=X.drop(columns='log_%s' % wbc_type).columns).replace(
                0, np.nan).dropna().sort_values()
        coefs["gr"] = mod.intercept_
        coefs["N"] = len(np.unique(Xpid.pid))
        coefs["n"] = X.shape[0]
        coefs["r2"] = r2
        coefs["alpha"] = mod.alpha_
    return (coefs)
Example #17
    'filesReg': filesReg,
    'filesBinClass': filesBinClass,
    'filesMultiClass': filesMultiClass
}
regsNames = ['LinearRegression', 'RidgeCV', 'LassoCV', 'ElasticNetCV']

Regs = [
    LinearRegression(normalize=True),
    RidgeCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0],
            cv=10,
            normalize=True),
    LassoCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0],
            cv=10,
            normalize=True),
    ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0],
                 cv=10,
                 normalize=True)
]
Classifiers = [
    LogisticRegressionCV(cv=10),
    DecisionTreeClassifier(max_depth=3),
    svm.SVC(kernel='rbf', probability=True),
    svm.SVC(kernel='linear', probability=True),
    neighbors.KNeighborsClassifier(n_neighbors=7)
]
#naive_bayes.GaussianNB(),neighbors.KNeighborsClassifier(n_neighbors=7)]
ClassifiersNames = [
    'LogisticRegressionCV', 'DecisionTreeClassifier', 'svm.SVC_rbf',
    'svm.SVC_linear', 'neighbors.KNeighborsClassifier'
]
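# A hedged usage sketch (not part of the original file): pair each name in
# regsNames with its regressor in Regs and score it with 10-fold CV on synthetic
# data. The regressors above use the legacy normalize= keyword, so this assumes
# an older scikit-learn.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_regression(n_samples=200, n_features=10, noise=5.0,
                                 random_state=0)
for reg_name, reg in zip(regsNames, Regs):
    cv_scores = cross_val_score(reg, X_demo, y_demo, cv=10,
                                scoring='neg_mean_squared_error')
    print(reg_name, np.sqrt(-cv_scores).mean())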
Example #18
def main_method():
    configure_logging()
    train = load_training_dataset()
    logging.info(train.shape)

    # Duplicates check
    check_for_duplicates(train)
    # exit()  # NOTE: an unconditional exit() here would halt before the modelling below

    # ## Pre-processing
    plot_col_vs_sale_price(train, "GrLivArea", title="Looking for outliers")

    # Drop the houses with more than 4000 sq feet following dataset author recommendations:
    # https://ww2.amstat.org/publications/jse/v19n3/decock.pdf
    # ### Pre-processing steps added
    #
    # * Filter large house outliers
    # * Log transform the target (Sale Price)
    # * errors have same effect whether the house is cheap or not
    #
    train = preprocessing_pipeline(train)
    y = train.SalePrice

    plot_col_vs_sale_price(train, "GrLivArea", title="Area vs Sale Price AFTER Log transform")

    train = fill_null_values(train)

    # Numerical features that are really categories
    # TODO: pull into pipeline method
    train = create_sub_class_categories(train)
    train = create_month_sold_category(train)

    # Encode categoricals as ordered number features
    # when there is information in the order
    train = create_ordinal_categories(train)

    # Simplifications of existing features
    create_simple_overall_quality(train)
    create_simple_overall_condition(train)
    create_simple_pool_quality(train)
    create_simple_garage_condition(train)
    create_simple_garage_quality(train)
    create_simple_fireplace_quality(train)
    create_simple_functional_feature(train)
    create_simple_kitchen_quality(train)
    create_simple_heating_quality(train)
    create_simple_basement_finish(train, "BsmtFinType1")
    create_simple_basement_finish(train, "BsmtFinType2")

    create_simple_basement_condition(train)
    create_simple_basement_quality(train)
    create_simple_exterior_condition(train)
    create_simple_exterior_quality(train)

    # Combinations of existing features
    create_interaction_features(train)

    # Has masonry veneer or not
    create_simple_has_masonry_veneer(train)

    create_house_bought_pre_build(train)

    create_polynomial_features(train)

    # #### Split numerical and categorical features
    categorical_features = train.select_dtypes(include=["object"]).columns
    numerical_features = train.select_dtypes(exclude=["object"]).columns.drop("SalePrice")
    logging.info(f"Numerical features: {len(numerical_features)}")
    logging.info(f"Categorical features: {len(categorical_features)}")
    train_num = train[numerical_features]
    train_cat = train[categorical_features]

    # Handle missing values in numerical features by using the median
    logging.info(f"Missing numerical values: {train_num.isnull().values.sum()}")
    train_num = train_num.fillna(train_num.median())
    logging.info(f"Remaining missing numerical values: {train_num.isnull().values.sum()}")

    ## Log transform skewed numerical features to lessen impact of outliers
    # Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
    # As a general rule of thumb, a skewness with an absolute value > 0.5 is considered at least moderately skewed
    skewness = train_num.apply(lambda x: skew(x))
    skewness = skewness[abs(skewness) > 0.5]
    skewed_features = skewness.index
    train_num[skewed_features] = np.log1p(train_num[skewed_features])

    ## One hot encode categorical variables
    train_cat = pd.get_dummies(train_cat)

    train = pd.concat([train_num, train_cat], axis=1)
    logging.info(f"Number of features: {train.shape[1]}")

    # ## Modelling
    #
    # * Split dataset
    # * Standardisation (don't want to fit on observations that will be in the test set)
    #
    # ### Modelling techniques tried
    #
    # * Linear regression
    # * Ridge Regression (L2)
    # * LASSO (L1)
    # * ElasticNET (L1 AND L2)
    #
    # Split training set
    X_train, X_test, y_train, y_test = train_test_split(
        train,
        y,
        test_size=0.3,
        random_state=0
    )
    logging.info("X_train", str(X_train.shape))
    logging.info("X_test", str(X_test.shape))
    logging.info("y_train", str(y_train.shape))
    logging.info("y_test", str(y_test.shape))
    # Standard scale the features
    # Done after partitioning to avoid fitting scaler to observations in the test set
    # Should the scaler be pickled for deployment use cases then?
    scaler = StandardScaler()
    X_train.loc[:, numerical_features] = scaler.fit_transform(X_train.loc[:, numerical_features])
    X_test.loc[:, numerical_features] = scaler.transform(X_test.loc[:, numerical_features])
    # Official error measure for scoring: RMSE
    scorer = make_scorer(mean_squared_error, greater_is_better=False)
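    # rmse_cv_train / rmse_cv_test are used below but are not defined in this
    # snippet; a minimal sketch of what they might look like (an assumption, not
    # the author's original helpers; assumes cross_val_score is imported):
    def rmse_cv_train(model):
        return np.sqrt(-cross_val_score(model, X_train, y_train,
                                        scoring=scorer, cv=10))

    def rmse_cv_test(model):
        return np.sqrt(-cross_val_score(model, X_test, y_test,
                                        scoring=scorer, cv=10))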
    # ## Linear regression without regularisation
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train)
    logging.info(f"RMSE on Training set: {rmse_cv_train(linear_regression).mean()}")
    logging.info(f"RMSE on Test set: {rmse_cv_test(linear_regression).mean()}")
    y_train_pred = linear_regression.predict(X_train)
    y_test_pred = linear_regression.predict(X_test)
    # Plot residuals
    plt.scatter(y_train_pred,
                y_train_pred - y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_pred,
                y_test_pred - y_test,
                c="lightgreen",
                marker="s",
                label="Validation data")
    plt.title("Linear Regression")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()
    # Plot predictions
    plt.scatter(y_train_pred,
                y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_pred,
                y_test,
                c="lightgreen",
                marker="s",
                label="Validation")
    plt.title("Linear Regression")
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()
    # ## Linear Regression with Ridge Regression (L2 Penalty)
    #
    # * Regularisation is a good way to handle collinearity, filter out noise and prevent overfitting.
    # * L2 penalty add the squared sum of weights to cost function
    ridge_regression = RidgeCV(
        alphas=[0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60]
    )
    ridge_regression.fit(X_train, y_train)
    best_alpha = ridge_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    logging.info("Re-fit with alphas around the best alpha")
    ridge_regression = RidgeCV(
        alphas=[
            best_alpha * .6,
            best_alpha * .65,
            best_alpha * .7,
            best_alpha * .75,
            best_alpha * .8,
            best_alpha * .85,
            best_alpha * .9,
            best_alpha * .9,
            best_alpha * .95,
            best_alpha,
            best_alpha * 1.05,
            best_alpha * 1.1,
            best_alpha * 1.15,
            best_alpha * 1.2,
            best_alpha * 1.25,
            best_alpha * 1.3,
            best_alpha * 1.35,
            best_alpha * 1.4,
        ],
        cv=10
    )
    ridge_regression.fit(X_train, y_train)
    best_alpha = ridge_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    logging.info(f"Ridge RMSE on Training set: {rmse_cv_train(ridge_regression).mean()}")
    logging.info(f"Ridge RMSE on Test set: {rmse_cv_test(ridge_regression).mean()}")
    y_train_pred = ridge_regression.predict(X_train)
    y_test_pred = ridge_regression.predict(X_test)
    # Plot residuals
    plt.scatter(y_train_pred,
                y_train_pred - y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_pred,
                y_test_pred - y_test,
                c="lightgreen",
                marker="s",
                label="Validation data")
    plt.title("Linear Regression with Ridge regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()
    # Plot predictions
    plt.scatter(y_train_pred,
                y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_pred,
                y_test,
                c="lightgreen",
                marker="s",
                label="Validation")
    plt.title("Linear Regression with Ridge regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()
    ## Plot important coefficients
    coefs = pd.Series(ridge_regression.coef_, index=X_train.columns)
    logging.info(f"Ridge picked {sum(coefs != 0)} features and eliminated the other {sum(coefs == 0)} features")
    important_coefficients = pd.concat([coefs.sort_values().head(10),
                                        coefs.sort_values().tail(10)])
    important_coefficients.plot(kind="barh")
    plt.title("Coefficients in the Ridge Model")
    plt.show()
    # Results:
    # * Better RMSE for Ridge
    # * Small difference between training and test suggests that overfitting has been eliminated
    # * Ridge used almost all of the features (only 3 dropped)
    #
    # TODO: Understand the coefficient plot (if possible?)
    # ## LASSO Regression
    #
    # Least Absolute Shrinkage and Selection Operator.
    #
    # Alternative regularisation method, L1 Regularisation, use the absolute value of weights rather than squares.
    #
    # Most weights will be 0. Useful in high dimensional dataset where most features are irrelevant.
    #
    # Hypothesis: more efficient than Ridge Regression.
    lasso_regression = LassoCV(
        alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1],
        max_iter=50000,
        cv=10
    )
    lasso_regression.fit(X_train, y_train)
    best_alpha = lasso_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    logging.info("Re-fit with alphas around the best alpha")
    lasso_regression = LassoCV(
        alphas=[
            best_alpha * .6,
            best_alpha * .65,
            best_alpha * .7,
            best_alpha * .75,
            best_alpha * .8,
            best_alpha * .85,
            best_alpha * .9,
            best_alpha * .9,
            best_alpha * .95,
            best_alpha,
            best_alpha * 1.05,
            best_alpha * 1.1,
            best_alpha * 1.15,
            best_alpha * 1.2,
            best_alpha * 1.25,
            best_alpha * 1.3,
            best_alpha * 1.35,
            best_alpha * 1.4,
        ],
        max_iter=50000,
        cv=10
    )
    lasso_regression.fit(X_train, y_train)
    best_alpha = lasso_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")
    logging.info(f"LASSO RMSE on Training set: {rmse_cv_train(lasso_regression).mean()}")
    logging.info(f"LASSO RMSE on Test set: {rmse_cv_test(lasso_regression).mean()}")
    y_train_pred = lasso_regression.predict(X_train)
    y_test_pred = lasso_regression.predict(X_test)
    # Plot residuals
    plt.scatter(y_train_pred,
                y_train_pred - y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_pred,
                y_test_pred - y_test,
                c="lightgreen",
                marker="s",
                label="Validation data")
    plt.title("Linear Regression with LASSO regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()
    # Plot predictions
    plt.scatter(y_train_pred,
                y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_pred,
                y_test,
                c="lightgreen",
                marker="s",
                label="Validation")
    plt.title("Linear Regression with Ridge regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()
    ## Plot important coefficients
    coefs = pd.Series(lasso_regression.coef_, index=X_train.columns)
    logging.info(f"LASSO picked {sum(coefs != 0)} features and eliminated the other {sum(coefs == 0)} features")
    important_coefficients = pd.concat([coefs.sort_values().head(10),
                                        coefs.sort_values().tail(10)])
    important_coefficients.plot(kind="barh")
    plt.title("Coefficients in the Ridge Model")
    plt.show()
    # ## ElasticNET
    #
    # * Compromise between L1 and L2 penalties
    elastic_net_regression = ElasticNetCV(
        l1_ratio=[0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1],
        alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1],
        max_iter=50000,
        cv=10
    )
    elastic_net_regression.fit(X_train, y_train)
    best_l1_ratio = elastic_net_regression.l1_ratio_
    best_alpha = elastic_net_regression.alpha_
    logging.info(f"Best L1 Ratio {best_l1_ratio}")
    logging.info(f"Best alpha {best_alpha}")
    logging.info("Re-fit with alphas around the best alpha")
    elastic_net_regression = ElasticNetCV(
        l1_ratio=[
            best_l1_ratio * .85,
            best_l1_ratio * .9,
            best_l1_ratio * .9,
            best_l1_ratio * .95,
            best_l1_ratio,
            best_l1_ratio * 1.05,
            best_l1_ratio * 1.1,
            best_l1_ratio * 1.15
        ],
        alphas=[
            best_alpha * .6,
            best_alpha * .65,
            best_alpha * .7,
            best_alpha * .75,
            best_alpha * .8,
            best_alpha * .85,
            best_alpha * .9,
            best_alpha * .9,
            best_alpha * .95,
            best_alpha,
            best_alpha * 1.05,
            best_alpha * 1.1,
            best_alpha * 1.15,
            best_alpha * 1.2,
            best_alpha * 1.25,
            best_alpha * 1.3,
            best_alpha * 1.35,
            best_alpha * 1.4,
        ],
        max_iter=50000,
        cv=10
    )
    elastic_net_regression.fit(X_train, y_train)
    best_l1_ratio = elastic_net_regression.l1_ratio_
    best_alpha = elastic_net_regression.alpha_
    logging.info(f"Best L1 Ratio {best_l1_ratio}")
    logging.info(f"Best alpha {best_alpha}")
    logging.info(f"ElasticNet RMSE on Training set: {rmse_cv_train(elastic_net_regression).mean()}")
    logging.info(f"ElasticNet RMSE on Test set: {rmse_cv_test(elastic_net_regression).mean()}")
    y_train_pred = elastic_net_regression.predict(X_train)
    y_test_pred = elastic_net_regression.predict(X_test)
    # Plot residuals
    plt.scatter(y_train_pred,
                y_train_pred - y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_pred,
                y_test_pred - y_test,
                c="lightgreen",
                marker="s",
                label="Validation data")
    plt.title("Linear Regression with ElasticNET regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()
    # Plot predictions
    plt.scatter(y_train_pred,
                y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_pred,
                y_test,
                c="lightgreen",
                marker="s",
                label="Validation")
    plt.title("Linear Regression with ElasticNET regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()
    ## Plot important coefficients
    coefs = pd.Series(elastic_net_regression.coef_, index=X_train.columns)
    logging.info(f"ElasticNET picked {sum(coefs != 0)} features and eliminated the other {sum(coefs == 0)} features")
    important_coefficients = pd.concat([coefs.sort_values().head(10),
                                        coefs.sort_values().tail(10)])
    important_coefficients.plot(kind="barh")
    plt.title("Coefficients in the ElasticNET Model")
    plt.show()
Example #19
classifiers = [
    SVC(kernel="rbf", probability=True),
    SVC(kernel='linear', probability=True),
    SVC(kernel='sigmoid', probability=True),
    SVC(kernel='poly', probability=True, degree=3),
    SVC(kernel='poly', probability=True, degree=4),
    SVC(kernel='poly', probability=True, degree=5),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    ElasticNetCV(max_iter=10000),
    LarsCV(),
    LassoCV(max_iter=10000),
    LassoLarsCV(),
    LogisticRegressionCV(),  #17
    MultiTaskElasticNetCV(),
    MultiTaskLassoCV(),
    OrthogonalMatchingPursuitCV(),
    RidgeClassifierCV()
]
algorithm = 17
if len(sys.argv) > 1:
    algorithm = int(sys.argv[1])

name = names[algorithm]
clf = classifiers[algorithm]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=12)

np_Xtrain = np.array(X_train.iloc[:, -len(X_train.columns) + 1:-1])
np_Xtest = np.array(X_test.iloc[:, -len(X_test.columns) + 1:-1])
np_ytest = np.array(y_test.iloc[:, 1])
np_ytrain = np.array(y_train.iloc[:, 1])

#Performing elastic net cv to determine optimal alpha and l1 ratio

X_cv = np.array(X.iloc[:, -len(X.columns) + 1:-1])
y_cv = np.array(y.iloc[:, 1])
from sklearn.linear_model import ElasticNetCV
enet_cv = ElasticNetCV(cv=10, n_jobs=2, tol=0.5)
enet_cvfit = enet_cv.fit(X_cv, y_cv)

#Performing elastic net
from sklearn.linear_model import ElasticNet
alpha = enet_cvfit.alpha_
l1ratio = enet_cvfit.l1_ratio_
tolerance = 0.5
enet = ElasticNet(alpha=alpha, l1_ratio=l1ratio, tol=tolerance, max_iter=10000)
enet_fitted = enet.fit(np_Xtrain, np_ytrain)
y2_pred = enet_fitted.predict(np_Xtest)
from sklearn.metrics import r2_score
r2_score_model = r2_score(np_ytest, y2_pred)
print(enet)
print("r^2 on test data : %f" % r2_score_model)
ax = plt.gca()

ax.plot(lasso_alphas, lasso_coefs)
ax.set_xscale('log')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Lasso coefficients as a function of the regularization')
plt.axis('tight')
plt.show()

# In[51]:

lcv.coef_

# Elastic net

# In[52]:

from sklearn.linear_model import ElasticNetCV

l1_ratio = [0, .1, .5, .7, .9, .95, .99, 1]

encv = ElasticNetCV(l1_ratio=l1_ratio)
encv.fit(X, y)

# In[53]:

print('The best l1_ratio is {}'.format(encv.l1_ratio_))
print('The best alpha is {}'.format(encv.alpha_))
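# A possible follow-up (not in the original cell): refit a plain ElasticNet with
# the CV-selected hyperparameters before inspecting its coefficients.
from sklearn.linear_model import ElasticNet

enet_best = ElasticNet(alpha=encv.alpha_, l1_ratio=encv.l1_ratio_)
enet_best.fit(X, y)
print('Non-zero coefficients:', (enet_best.coef_ != 0).sum())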
Example #22
def train_linear_model(
    X,
    y,
    random_state=1,
    test_size=0.2,
    regularization_type="elasticnet",
    k_fold=5,
    max_iter=1000000,
    tol=0.0001,
    l1_ratio=None,
):
    """
    Function to train linear model with regularization and cross-validation.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model evaluation.
        regularization_type (str): lasso or ridge or elastic-net (with cv).
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization.
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.

    Returns:
        sklearn.linear_model.LinearModel: fitted model.
        mu (float): Mean value of descriptors used in training.
        s (float): Std dev of descriptors used in training.

    """
    if l1_ratio is None:
        l1_ratio = [0.1, 0.5, 0.7, 0.9, 0.95, 1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s
    hyperparameters = {
        "random_state": random_state,
        "test_size": test_size,
        "k_fold": k_fold,
        "tol": tol,
        "max_iter": max_iter,
    }
    if regularization_type == "lasso" and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True,
                          alphas=None,
                          tol=tol,
                          cv=k_fold,
                          max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True,
                             alpha=alpha_opt,
                             max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters["l1_ratio"] = 1

    elif regularization_type == "ridge" and y.shape[1] == 1:
        # RidgeCV does not accept alphas=None; use its default alpha grid
        ridgecv = RidgeCV(fit_intercept=True, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters["l1_ratio"] = 0

    elif regularization_type == "elasticnet" and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(
            fit_intercept=True,
            alphas=None,
            cv=k_fold,
            l1_ratio=l1_ratio,
            tol=tol,
            max_iter=max_iter,
        )
        elasticnetcv.fit(X_scaled, y_train.values.ravel())

        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(
            fit_intercept=True,
            l1_ratio=l1_ratio_opt,
            alpha=alpha_opt,
            max_iter=max_iter,
        )
        linear_model.fit(X_scaled, y_train)
        hyperparameters["l1_ratio"] = l1_ratio_opt

    # If more than one outcome is present, perform multi-task regression
    elif regularization_type == "elasticnet" and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(
            fit_intercept=True,
            cv=k_fold,
            l1_ratio=l1_ratio,
            max_iter=max_iter,
        )
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters["l1_ratio"] = l1_ratio_opt
    else:
        raise NotImplementedError

    y_pred = linear_model.predict((X_test - mu) / s)
    Rsq = linear_model.score((X_test - mu) / s, y_test)
    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output
    pred_actual_ratio = [x / y for x, y in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96 * np.sqrt(
        mean_squared_error(np.ones(y_pred.shape),
                           pred_actual_ratio,
                           multioutput="raw_values") / y_pred.shape[0])
    hyperparameters["alpha"] = alpha_opt
    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
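
# A minimal usage sketch for train_linear_model. The dataframe construction below is
# a hypothetical stand-in (random data), not part of the original module; it only
# illustrates the expected call signature and return values.
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.randn(100, 5), columns=[f"d{i}" for i in range(5)])
    y_demo = pd.DataFrame({"cycle_life": X_demo.sum(axis=1) + 0.1 * rng.randn(100)})

    model, mu, s, rel_err, rsq, hp = train_linear_model(
        X_demo, y_demo, regularization_type="elasticnet", k_fold=5)
    print("R^2 on held-out data:", rsq)
    print("Selected alpha / l1_ratio:", hp["alpha"], hp["l1_ratio"])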
Example #23
# n_jobs=-1 lets the computation use every available CPU core.
# Lasso is generally more computationally expensive to fit than ridge.
# Its main advantage is that the L1 penalty drives the coefficients of less
# important features exactly to zero, so it performs feature selection as it
# fits. This makes it a good choice when the number of features is very large
# and you don't know which ones are important.
lasso = LassoCV(n_alphas=50, cv=5)
# ElasticNetCV(l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None,
# fit_intercept=True, normalize=False, precompute='auto', max_iter=1000,
# tol=0.0001, cv='warn', copy_X=True, verbose=0, n_jobs=None, positive=False,
# random_state=None, selection='cyclic')
# Elastic net combines the ridge and lasso penalties.
# l1_ratio controls the mix: 1 is pure Lasso, 0 is pure Ridge.
elastic = ElasticNetCV(cv=5)


import time


def search_timer(c, X_train, y_train):
    """Fit the estimator and return the elapsed wall-clock time in seconds."""
    start = time.time()
    c.fit(X_train, y_train)
    end = time.time()
    return end - start


# RandomizedSearchCV(estimator, param_distributions, n_iter=10, scoring=None,
# n_jobs=None, refit=True, cv='warn', verbose=0, pre_dispatch='2*n_jobs',
# random_state=None, return_train_score=False) searches for the best parameters
# for a model randomly. The best model, score and parameters can then be
# accessed using the object's properties.
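# A hedged sketch of the RandomizedSearchCV usage described above, here tuning a
# plain ElasticNet over alpha and l1_ratio and timing the search with search_timer.
# The parameter grids, and the X_train/y_train referenced in the commented-out lines,
# are illustrative assumptions, not part of the original snippet.
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    "alpha": np.logspace(-4, 1, 50),          # candidate regularization strengths
    "l1_ratio": np.linspace(0.05, 1.0, 20),   # 1 -> Lasso, 0 -> Ridge
}
search = RandomizedSearchCV(ElasticNet(max_iter=10000),
                            param_distributions,
                            n_iter=20,
                            cv=5,
                            n_jobs=-1,
                            random_state=0)
# elapsed = search_timer(search, X_train, y_train)
# print(search.best_params_, search.best_score_, elapsed)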
from sklearn.linear_model import ElasticNetCV, enet_path

data = np.loadtxt('spike_cluster5.csv', delimiter=",")
waves = np.loadtxt('waves_cluster5.csv', delimiter=",")
ndir = np.loadtxt('ndir_cluster5.csv', delimiter=",")

#print(waves.shape) # waves (90(sum of all directions) x 25704(12S*102ch*20ms))
#print(data.shape) # data (2040(102channels*20ms) x 273(nspikes*R))

# speeds = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
nspikes = len(ndir) # number of spikes in the cluster
R = int(data.shape[1]/nspikes) # number of shifts in sliding window
S = int(waves.shape[1]/data.shape[0]) # number of propagation speeds
T = data.shape[0]

regression = ElasticNetCV(positive=True, normalize=True, cv=5, max_iter=10000) # elastic net regression
cumdir = 0

bestind = np.zeros([nspikes,], dtype=int) # indices of best speeds
numdir = np.zeros(nspikes,) # number of nonzero directions in optimum
#bestcoef = np.zeros([nspikes, int(ndir.T.max())])
finalscore = np.zeros(nspikes,)

for ind_sp in range(0,nspikes):
    Ndir = int(ndir[ind_sp]) # number of propagation directions
    datasp = data[:, ind_sp*R:(ind_sp + 1)*R]  # spike 21x2040
    wavessp = waves[cumdir:cumdir + Ndir]  # waves for this spike Ndir x 12S*102ch*20ms

    coefs = np.zeros([R,S,Ndir]) # regression coefficients
    intercept = np.zeros([R,S]) # regression intercept
    score = np.zeros([R,S]) # R-squared scores
Example #25
                                  warm_start=True)

    if (method == 5):
        print('Random forest 02')
        str_method = 'RandomForest02'
        r = RandomForestRegressor(n_estimators=90,
                                  max_depth=4,
                                  n_jobs=-1,
                                  random_state=ra1,
                                  verbose=0,
                                  warm_start=True)

    if (method == 6):
        print('ElasticNet')
        str_method = 'Elastic Net'
        r = ElasticNetCV()

    if (method == 7):
        print('GradientBoosting 01')
        str_method = 'GradientBoosting01'
        r = GradientBoostingRegressor(n_estimators=80,
                                      max_depth=5,
                                      learning_rate=0.05,
                                      random_state=ra1,
                                      verbose=0,
                                      warm_start=True,
                                      subsample=0.6,
                                      max_features=0.6)
    if (method == 8):
        print('GradientBoosting 02')
        str_method = 'GradientBoosting02'
Example #26
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [
    5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008
]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=int(1e7), alphas=alphas2, random_state=42, cv=kfolds))

elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=int(1e7), alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))

svr = make_pipeline(RobustScaler(), SVR(
    C=20,
    epsilon=0.008,
    gamma=0.0003,
))

gbr = GradientBoostingRegressor(n_estimators=3000,
                                learning_rate=0.05,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)
Example #27
#Lasso
lasso = Lasso(alpha=0.0005, random_state=1)
print("Lasso score: {:.4f} \n".format(cv_rmse(lasso).mean()))

alphas_alt = [
    14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5, 10, 5
]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]

#Ridge
ridge = RidgeCV(alphas=alphas_alt, cv=kfolds)
print("Ridge score: {:.4f} \n".format(cv_rmse(ridge).mean()))

#ElasticNet
elasticnet = ElasticNetCV(cv=kfolds, alphas=e_alphas)
print("ElasticNet score: {:.4f} \n".format(cv_rmse(elasticnet).mean()))

#Svr
#svr = SVR()
#print(cv_rmse(svr).mean())

#XGBoost
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=3460,
                       max_depth=3,
                       min_child_weight=0,
                       gamma=0,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror',
Example #28
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 26 15:48:46 2017

@author: kaifeng
"""
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNetCV

data, target = load_svmlight_file('C:/Users/carne/Desktop/E2006.train')

met = ElasticNetCV(fit_intercept=True)

kf = KFold(n_splits=2)

err = 0
for train, test in kf.split(data):
    met.fit(data[train], target[train])
    # predict the whole test fold at once instead of mapping over rows
    p = met.predict(data[test])
    e = p - target[test]
    err += np.dot(e, e)

rmse_cv = np.sqrt(err / len(target))

# Don't run this on a personal laptop... it's a bit of a waste of time~
Example #29
                                            min_samples_leaf=5),
                      random_state=13,
                      n_estimators=17), "AdaBoostAuto")
build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
           "DecisionTreeAuto",
           compact=False)
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                           min_samples_leaf=5),
                     random_state=13,
                     n_estimators=3,
                     max_features=0.5), "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy="median"), "DummyAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
build_auto(HuberRegressor(), "HuberAuto")
build_auto(LarsCV(), "LarsAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LassoLarsCV(), "LassoLarsAuto")
build_auto(
    OptimalLGBMRegressor(objective="regression",
                         n_estimators=17,
                         num_iteration=11), "LGBMAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
    BaggingRegressor(LinearRegression(), random_state=13, max_features=0.75),
Example #30
import numpy as np

from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    StandardScaler(),
    MinMaxScaler(),
    ElasticNetCV(l1_ratio=0.15000000000000002, tol=0.0001)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)