import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: -0.0036575106543468203
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=RandomForestRegressor(bootstrap=False, max_features=0.25,
                                                      min_samples_leaf=1, min_samples_split=2,
                                                      n_estimators=100)),
    ElasticNetCV(l1_ratio=0.9, tol=0.01)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler, StandardScaler

# Scale our data with RobustScaler to minimise outlier influence
# (approx. 4% of the data are significant outliers as measured by Cook's Distance)
rb_scaler = RobustScaler()
X_scaled = pd.DataFrame(rb_scaler.fit_transform(X), columns=X.columns)
std_scaler = StandardScaler()
X_standard = pd.DataFrame(std_scaler.fit_transform(X), columns=X.columns)

## Define CV Root Mean Square Error ##
def cv_rmse(estimator, X, y, cv=5):
    rmse = np.mean(np.sqrt(-cross_val_score(estimator, X, y, cv=cv,
                                            scoring="neg_mean_squared_error")))
    return rmse

## Regression Models ##
# Elastic Net Regressor
elastic_reg = ElasticNetCV(cv=5, max_iter=15000)
# Lasso Model for Comparison
lasso_reg = LassoCV(cv=5, alphas=[0.011], max_iter=15000)  # Previously Optimised

## Model Evaluation & Hyperparameter Tuning ##
# CV Root Mean Squared Error on Training Set (Robust Scaled)
cv_rmse(lasso_reg, X_scaled, np.ravel(y))    # LASSO: 0.319
cv_rmse(elastic_reg, X_scaled, np.ravel(y))  # Elastic Net (ratio = 0.5): 0.317

# CV Root Mean Squared Error on Training Set (Standardised)
cv_rmse(lasso_reg, X_standard, np.ravel(y))    # LASSO: 0.2992
cv_rmse(elastic_reg, X_standard, np.ravel(y))  # Elastic Net (ratio = 0.5): 0.3012

# Alpha Selection
alphas = np.logspace(-10, 1, 400)
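# A possible continuation (a sketch, not part of the original script): the alpha grid
# defined above can be passed straight to ElasticNetCV. X_standard, y and the 0.5
# l1_ratio are taken from the comparison earlier in this snippet.
enet_alpha_search = ElasticNetCV(alphas=alphas, l1_ratio=0.5, cv=5, max_iter=15000)
enet_alpha_search.fit(X_standard, np.ravel(y))
print("Selected alpha:", enet_alpha_search.alpha_)
print("CV RMSE at the selected alpha:", cv_rmse(enet_alpha_search, X_standard, np.ravel(y)))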
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: -114.8406727584057
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    ElasticNetCV(l1_ratio=0.6000000000000001, tol=0.01)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Loading data
data_train = pd.read_csv(pj(data_dir, 'Training_Set_YESregressBYeTIVifCorr_LogScaled_combat_SVA.txt'),
                         header=0, sep='\t')

# Extracting numerical features as a NumPy array
feats = data_train.loc[:, 'lh_bankssts_area':'rh.Whole_hippocampus'].values
y = data_train['age_floor'].values
X = feats  # n_features = 954, n_samples = 2364

# Defining all the regularization tools and penalized regression tools
alphas = np.arange(0.001, 10, 0.005)
lasso = LassoCV(alphas=alphas, max_iter=100000, cv=5)
ridge = RidgeCV(alphas=alphas, cv=5)
elnet = ElasticNetCV(alphas=alphas, max_iter=100000, cv=5)

regressors = [lasso, ridge, elnet]
scalers = [MinMaxScaler(), StandardScaler(), RobustScaler()]

#%% Loading results
pipes = []
coefs = []
for sca, reg in product(scalers, regressors):
    pipe = Pipeline([('scaler', sca), ('regressor', reg)])
    filename = "{!s:.5}_{!s:.5}.joblib".format(pipe.named_steps['scaler'],
                                               pipe.named_steps['regressor'])
    pipe = load(pj(results_dir, filename))
    pipes.append(pipe)
    coefs.append(list(pipe.named_steps['regressor'].coef_))

#%%
# Computing scores for each feature
print("RMSE on training data: %f"
      % np.sqrt(mean_squared_error(y_true=y, y_pred=lass_train_out)))

# In[59]:

result = pd.DataFrame()
result['Id'] = df_test['Id']
result['SalePrice'] = np.exp(lass_test_out)
result.to_csv('lass_result.csv', index=False)

# In[60]:

en = ElasticNetCV(alphas=[0.0001, 0.0003, 0.001, 0.01, 0.03, 0.06, 0.1,
                          0.3, 0.6, 1, 3, 6, 10, 30, 60])
en.fit(X_train, y)
alpha = en.alpha_
print(alpha)
print("Try again for more precision with alphas centered around " + str(alpha))

en = ElasticNetCV(alphas=[alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8,
                          alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05,
                          alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3,
                          alpha * 1.35, alpha * 1.4],
                  cv=10)
en.fit(X_train, y)
alpha = en.alpha_
print("Best alpha :", alpha)

# In[61]:
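# The hand-written multiplier lists above (alpha * .6 ... alpha * 1.4) can be built
# programmatically; this helper is a sketch, not part of the original notebook.
def refined_alphas(best_alpha, factors=np.arange(0.6, 1.45, 0.05)):
    """Return a grid of alphas centred on a previously selected best alpha."""
    return [best_alpha * f for f in factors]

# e.g. en = ElasticNetCV(alphas=refined_alphas(alpha), cv=10).fit(X_train, y)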
def get_model(X, y, X_sub):
    # Set the parameters for k-fold cross-validation.
    kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

    # Define the Root Mean Squared Logarithmic Error (RMSLE).
    def rmsle(y, y_pred):
        return np.sqrt(mean_squared_error(y, y_pred))

    # Create the model scoring function used to rate each model's performance.
    # cv stands for cross-validation.
    def cv_rmse(model, X=X):
        rmse = np.sqrt(-cross_val_score(
            model, X, y, scoring="neg_mean_squared_error", cv=kfolds))
        return (rmse)

    # Creation of the individual learners (model declaration and parameter settings) - [start]
    alphas_alt = [
        14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5
    ]
    alphas2 = [
        5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008
    ]
    e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
    e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

    # Define the ridge regression model (uses the L2 norm as the regularization term;
    # whether L1 or L2 is used, the regularization term is introduced to reduce overfitting).
    # Note: with an L2 penalty no coefficient is ever driven to exactly zero at the end of
    # the optimization, even though some estimates become small enough to ignore; all
    # variables are kept, so there is no PCA-like variable selection.
    # Note: an L1 penalty yields "sparse" solutions more easily than L2, i.e. more
    # coefficients end up exactly zero.
    ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

    # Define the LASSO shrinkage model (uses the L1 norm as the regularization term;
    # because many coefficients in the solution are driven to zero, it is also called
    # a shrinkage model). The same L1-vs-L2 notes as above apply.
    lasso = make_pipeline(
        RobustScaler(),
        LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))

    # Define the elastic net model (elastic net combines the characteristics of ridge
    # and lasso, using both L1 and L2 as regularization terms).
    elasticnet = make_pipeline(
        RobustScaler(),
        ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds,
                     l1_ratio=e_l1ratio))

    # Define the SVM support vector machine model.
    svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))

    # Define the GB gradient boosting model (expanded to first-order derivatives).
    gbr = GradientBoostingRegressor(n_estimators=3000,
                                    learning_rate=0.05,
                                    max_depth=4,
                                    max_features='sqrt',
                                    min_samples_leaf=15,
                                    min_samples_split=10,
                                    loss='huber',
                                    random_state=42)

    # Define the lightgbm model.
    lightgbm = LGBMRegressor(
        objective='regression',
        num_leaves=4,
        learning_rate=0.01,
        n_estimators=5000,
        max_bin=200,
        bagging_fraction=0.75,
        bagging_freq=5,
        bagging_seed=7,
        feature_fraction=0.2,
        feature_fraction_seed=7,
        verbose=-1,
        # min_data_in_leaf=2,
        # min_sum_hessian_in_leaf=11
    )

    # Define the xgboost model (expanded to second-order derivatives).
    xgboost = XGBRegressor(
        learning_rate=0.01,
        n_estimators=3460,
        max_depth=3,
        min_child_weight=0,
        gamma=0,
        subsample=0.7,
        colsample_bytree=0.7,
        objective='reg:linear',
        nthread=-1,
        scale_pos_weight=1,
        seed=27,
        reg_alpha=0.00006,
        param={
            'objective': 'reg:squarederror',  # Specify the regression objective
            'tree_method': 'gpu_hist'  # Use GPU accelerated algorithm
        })
    # Creation of the individual learners (model declaration and parameter settings) - [end]

    # Ensemble the individual learners - [start]
    stack_gen = StackingCVRegressor(
        regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
        meta_regressor=xgboost,
        use_features_in_secondary=True)
    # regressors=(...) does not include the svr model defined above
    # Ensemble the individual learners - [end]

    # Cross-validated scoring - [start]
    # Run cross-validation and score each model's performance.
    # (Because this is cross-validation, the same model is scored on different data
    # splits, so each model gets a sequence of scores; report its mean and std dev.)
    print('Running cross-validation to score each model, TEST score on CV')

    # Print the score of the ridge (L2) regression model
    score = cv_rmse(ridge)
    print(
        "Ridge (L2) regression model score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )

    # Print the score of the LASSO (L1) shrinkage model
    score = cv_rmse(lasso)
    print(
        "LASSO (L1) shrinkage model score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )

    # Print the score of the elastic net model
    score = cv_rmse(elasticnet)
    print(
        "Elastic net model score: {:.4f} ({:.4f})\n".format(
            score.mean(), score.std()),
        datetime.now(),
    )

    # Print the score of the SVR support vector machine model
    score = cv_rmse(svr)
    print(
        "SVR support vector machine model score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )

    # Print the score of the lightgbm model
    score = cv_rmse(lightgbm)
    print(
        "lightgbm gradient boosting model score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )

    # Print the score of the gbr gradient boosting regression model
    score = cv_rmse(gbr)
    print(
        "gbr gradient boosting regression model score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )

    # Print the score of the xgboost model
    score = cv_rmse(xgboost)
    print(
        "xgboost model score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()),
        datetime.now(),
    )
    # Cross-validated scoring - [end]

    # Train each model, using the training feature matrix as input and the
    # log-transformed sale price of the training data as output - [start]
    # Combine all models with the stacking approach.
    print('Starting model fitting, START Fit')

    print(datetime.now(), 'fitting the stack_gen stacking ensemble')
    stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

    print(datetime.now(), 'fitting the elasticnet model')
    elastic_model_full_data = elasticnet.fit(X, y)

    print(datetime.now(), 'fitting the LASSO (L1) shrinkage model')
    lasso_model_full_data = lasso.fit(X, y)

    print(datetime.now(), 'fitting the ridge (L2) regression model')
    ridge_model_full_data = ridge.fit(X, y)

    print(datetime.now(), 'fitting the svr support vector machine model')
    svr_model_full_data = svr.fit(X, y)

    print(datetime.now(), 'fitting the GradientBoosting model')
    gbr_model_full_data = gbr.fit(X, y)

    print(datetime.now(), 'fitting the xgboost (second-order gradient boosting) model')
    xgb_model_full_data = xgboost.fit(X, y)

    print(datetime.now(), 'fitting the lightgbm model')
    lgb_model_full_data = lightgbm.fit(X, y)
    # Train each model - [end]

    # Define the blending function for the individual learners' predictions and check
    # how well the blending strategy performs - [start]
    # Combine the predictions of the individual models into the prediction of the blended learner.
    def blend_models_predict(X):
        return ((0.05 * elastic_model_full_data.predict(X)) +
                (0.05 * lasso_model_full_data.predict(X)) +
                (0.05 * ridge_model_full_data.predict(X)) +
                (0.05 * svr_model_full_data.predict(X)) +
                (0.05 * gbr_model_full_data.predict(X)) +
                (0.3 * xgb_model_full_data.predict(X)) +
                (0.15 * lgb_model_full_data.predict(X)) +
                (0.3 * stack_gen_model.predict(np.array(X))))

    # Print the Root Mean Squared Logarithmic Error (RMSLE) of the blended learner under the weights above.
    # k-fold cross-validation on the training data tunes each model's configuration; once
    # cross-validation finishes, the resulting configuration is evaluated on the full
    # training data to measure the reconstruction error.
    print('RMSLE of the blended model when reconstructing the training data, RMSLE score on train data:')
    print(rmsle(y, blend_models_predict(X)))
    # Define the blending function - [end]

    # Feed the test-set feature matrix to the trained models and write the output
    # to the second column of the .csv file - [start]
    # print('Predicting house prices from the test-set features, Predict submission', datetime.now(), )
    # submission = pd.read_csv("../data/sample_submission.csv")
    # # Note: .iloc[:, 1] selects data by positional index, [index1:index2], left-closed right-open.
    # submission.iloc[:, 1] = np.floor(np.expm1(blend_models_predict(X_sub)))
    # Feed the test-set feature matrix to the trained models - [end]

    return np.floor(np.expm1(blend_models_predict(X_sub)))
def predict_holdout(gene):
    try:
        exppheno = expression_phenotypes.loc[gene].values

        # Select cis-snps and corresponding dosages
        gene_annotation = gene_annotations.loc[gene]
        gene_chr = gene_annotation["chr"]
        gene_start, gene_end = int(gene_annotation["start"]), int(gene_annotation["end"])
        ciswindow_start, ciswindow_end = gene_start - ciswindow_size, gene_end + ciswindow_size
        snp_annotations_chr = snp_annotations_by_chr[gene_chr]
        cissnp_annotations = snp_annotations_chr[snp_annotations_chr["pos"].between(ciswindow_start, ciswindow_end)]
        cissnps = list(cissnp_annotations["varID"])

        # Extract cissnp dosages
        cissnps_subset = list(set(cissnps).intersection(list(dosages.index)))
        cisgenos = dosages.loc[cissnps_subset, :]  # TODO: Ensure all cissnps are in cisgenos (necessary?)

        # Extract cishaplo clusterings
        cissnps_haplo = []
        for cissnp in cissnps:
            cissnps_haplo += [cissnp + "_h0", cissnp + "_h1"]
        cissnps_haplo = [c for c in cissnps_haplo if c in clusterings.index]
        cisclusterings = clusterings.loc[cissnps_haplo, :]

        # Skip this gene if there are fewer than two variants
        if len(cisgenos) < 2:
            return [gene] + ["NA"] * (dosages.shape[1] // 10), (gene, "NA", "NA", 0, 0, "CS<2")

        # Encode categorical clusterings with dummy variables
        for _ in range(0, len(cissnps_haplo), 2):
            dummies = get_haplotype_cluster_dummies(cisclusterings.iloc[0:2, :].values)
            dummies_index = [cisclusterings.iloc[0:2, :].index[0].split("h0")[0] + "d" + str(i)
                             for i in range(dummies.shape[0])]
            dummies_df = pd.DataFrame(dummies, index=dummies_index, columns=cisclusterings.columns)
            cisclusterings = cisclusterings.append(dummies_df)
            cisclusterings.drop(cisclusterings.index[0:2], axis=0, inplace=True)
        cisgenos = cisgenos.append(cisclusterings).T

        # Separate cisgenos and expphenos into train and test sets
        cisgenos_train = cisgenos.drop(cisgenos.index[::10], axis=0)
        cisgenos_test = cisgenos.loc[cisgenos.index[::10]]
        exppheno_train = [Y for i, Y in enumerate(exppheno) if i % 10 != 0]
        exppheno_test = [Y for i, Y in enumerate(exppheno) if i % 10 == 0]

        # Fit model for this gene
        model = ElasticNetCV(l1_ratio=0.5, n_alphas=n_alphas, cv=n_folds, tol=tol,
                             random_state=SEED, n_jobs=1)
        model.fit(cisgenos_train.values, exppheno_train)
        if all(coef == 0 for coef in model.coef_):
            # NOTE: Near-zero betas are not captured here
            r_train = "NA"
            return [gene] + ["NA"] * (dosages.shape[1] // 10), (gene, "NA", "NA", 0, 0, "NB")

        # Save regression coefficients
        pd.DataFrame(model.coef_, index=cisgenos.columns).T.to_csv(
            "output/model{}/betas/{}/{}.txt".format(MODEL_NUM, tissue, gene), sep=" ", index=False)

        n_dosage_nonzero_betas = np.count_nonzero(model.coef_[:-len(cisclusterings)])
        n_clustering_nonzero_betas = np.count_nonzero(model.coef_[-len(cisclusterings):])

        r_train = pearsonr(exppheno_train, model.predict(cisgenos_train.values))[0]
        exppheno_pred = model.predict(cisgenos_test.values)
        r_test = pearsonr(exppheno_test, exppheno_pred)[0]

        return [gene] + list(exppheno_pred), (gene, r_train, r_test, n_dosage_nonzero_betas,
                                              n_clustering_nonzero_betas, "OK")
    except:
        print("{}: unknown error".format(gene))
        return [gene] + ["NA"] * (dosages.shape[1] // 10), (gene, "NA", "NA", "NA", "NA", "ERR")
from sklearn.model_selection import cross_val_predict

X_train_stack = pd.DataFrame(columns=['econ_r', 'nlp_r'])
X_train_stack['econ_r'] = cross_val_predict(ridge_e, X_train_e, y_train, cv=10)
X_train_stack['nlp_r'] = cross_val_predict(ridge_n, X_train_n, y_train, cv=10)

X_test_stack = pd.DataFrame(columns=['econ_r', 'nlp_r'])
X_test_stack['econ_r'] = ridge_e.predict(X_test_e)
X_test_stack['nlp_r'] = ridge_n.predict(X_test_n)

X_train_stack.to_csv("Stack_train.csv")
X_test_stack.to_csv("Stack_test.csv")

from sklearn.linear_model import ElasticNetCV

stack = ElasticNetCV(cv=tscv)
stack.fit(X_train_stack, y_train)
cv_score = cross_val_score(stack, X_train_stack, y_train, cv=tscv, scoring=scorer)
stack_performance = {'Model': 'ElasticNetCV stack',
                     'MSE': np.mean(cv_score),
                     'SD': np.std(cv_score)}
stack_performance

mape(y_test, stack.predict(X_test_stack))

# In[81]:

coefs = stack.coef_
ridge_coefs = pd.DataFrame({'Coef': coefs, 'Name': list(X_train_stack.columns)})
ridge_coefs["abs"] = ridge_coefs.Coef.apply(np.abs)
x.shape = -1, 1
y.shape = -1, 1

models = [
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear', LinearRegression(fit_intercept=False))]),
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear', RidgeCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False))]),
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear', LassoCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False))]),
    Pipeline([('Poly', PolynomialFeatures()),
              ('Linear', ElasticNetCV(alphas=np.logspace(-3, 2, 50),
                                      l1_ratio=[.1, .5, .7, .9, .95, 1],
                                      fit_intercept=False))])
]

plt.figure(facecolor='w')
degree = np.arange(1, N, 4)  # polynomial degrees
dm = degree.size
colors = []  # colors
for c in np.linspace(16711680, 255, dm):
    colors.append('#%06x' % int(c))

model = models[0]
for i, d in enumerate(degree):
    plt.subplot(int(np.ceil(dm / 2.0)), 2, i + 1)
    plt.plot(x, y, 'ro', ms=10, zorder=N)
def EAS_VAR(scrY, scrX, steps, burnin, po=None, d=None, N=None, weights=None):
    p, n = scrY.shape
    N = (200 if N is None else N)
    # By default, assume the least restrictive sparsity for po, given fixed n
    # and p. Note that this is not compatible with asymptotic considerations,
    # but suffices for finite samples.
    po = (min(p**2, n) if po is None else po)

    Y = (scrY[:, ].flatten('F')).reshape((n * p, 1))
    vecX = (scrX[:, ].flatten('F')).reshape((n * p, 1))
    scrZ = sparse.csr_matrix(np.kron(np.transpose(scrX), np.eye(p)))

    # Jacobian columns corresponding only to nonzero A.
    Ip = np.eye(p)
    scrB_s = sparse.csr_matrix((n * p, 0))
    scrB_A = sparse.csr_matrix((n * p, 0))
    for k in range(p):
        for l in range(p):
            coef_mat = linalg.block_diag(
                *([Ip[:, k].reshape((p, 1)) @ Ip[:, l].reshape((1, p))] * n))
            scrB_A = sparse.hstack(
                [scrB_A, sparse.csr_matrix(coef_mat @ vecX)], format='csr')
            if l == k:
                scrB_s = sparse.hstack(
                    [scrB_s, sparse.csr_matrix(coef_mat)], format='csr')

    # If d or weights are not provided, compute default values using elastic net
    if d is None or weights is None:
        tscv = TimeSeriesSplit(n_splits=3)
        preprocess = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                                  cv=tscv, verbose=False, fit_intercept=False)
        #preprocess = RidgeCV(cv=tscv, fit_intercept=False)
        preprocess.fit(scrZ, Y.flatten())
        G_enet = np.where((preprocess.coef_)**2 > 0)[0]
        G_enet = (G_enet if len(G_enet) > 0 else [0])
        if d is None:
            m_G_enet = comp_m_G(scrY, scrX, Y, scrZ, G_enet, p)[0]
            # Set d equal to 10 percent of the minimum m_G component for the lasso
            # selected graph
            d = min(m_G_enet) * .1
        if weights is None:
            min_coef_sq = min((preprocess.coef_[G_enet])**2)
            weights = (preprocess.coef_)**2 + (min_coef_sq / 10
                                               if min_coef_sq > 0 else 1)

    # Starting graph
    G = [np.where(weights == np.max(weights))[0][0]]
    logf = log_mass_fun(scrY, scrX, Y, scrZ, scrB_s, scrB_A, G, n, p, N, po, d)

    # Begin MCMC ----------------------------------------------------------------
    AcceptRatio = 0
    for t in range(0, steps):
        # Propose a new graph
        AddVar = np.random.randint(0, 3)
        proposed_G, qratio = proposal_graph(AddVar, G, p, weights)

        # Compute/estimate the log mass of the proposed graph
        newlogf = log_mass_fun(scrY, scrX, Y, scrZ, scrB_s, scrB_A, proposed_G,
                               n, p, N, po, d)

        # Determine whether to accept or reject the proposed graph
        if np.log(np.random.uniform(0, 1)) < newlogf - logf + np.log(qratio):
            G = proposed_G
            logf = newlogf
            AcceptRatio = AcceptRatio + 1

        # Record the Markov Chain after the burn-in period ------------------------
        if t == burnin:
            graph = np.zeros(p**2, dtype=bool) * 1
            graph[G] = 1
            postSample = graph.reshape((1, p**2))
            postProbs = [1]
            chain = np.zeros((steps - burnin - 1, p**2), dtype=bool) * 1
        elif t > burnin:
            graph = np.zeros(p**2, dtype=bool) * 1
            graph[G] = 1
            chain[t - burnin - 1, :] = graph
            for k in range(postSample.shape[0]):
                if np.array_equal(graph, postSample[k, :]) == True:
                    postProbs[k] = postProbs[k] + 1
                    break
            if (k == (postSample.shape[0] - 1)
                    and np.array_equal(graph, postSample[k, :]) == False):
                postSample = np.append(postSample, graph.reshape((1, p**2)), axis=0)
                postProbs.append(1)

        #print(G)
        print('---> ', t, flush=True)

    postProbs = np.array(postProbs) / sum(postProbs)
    AcceptRatio = AcceptRatio / steps

    return pd.Series(
        [chain, postSample, postProbs, AcceptRatio, d],
        index=['chain', 'postSample', 'postProbs', 'AcceptRatio', 'd'])
def test_multioutput_enetcv_error():
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    y = rng.randn(10, 2)
    clf = ElasticNetCV()
    assert_raises(ValueError, clf.fit, X, y)
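# For reference (not part of the test above): the multi-output counterpart is
# MultiTaskElasticNetCV, which accepts the 2-D y that plain ElasticNetCV rejects.
# A minimal sketch using the same random fixture:
def example_multitask_enetcv():
    from sklearn.linear_model import MultiTaskElasticNetCV
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)
    y = rng.randn(10, 2)
    clf = MultiTaskElasticNetCV(cv=3)
    clf.fit(X, y)           # no ValueError: 2-D targets are supported here
    return clf.coef_.shape  # (n_targets, n_features) == (2, 2)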
import numpy as np
from scipy import sparse
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.cross_validation import KFold

data = np.array([[int(tok) for tok in line.split('\t')[:3]]
                 for line in open('data/ml-100k/u.data')])
ij = data[:, :2]
ij -= 1  # original data is in 1-based system
values = data[:, 2]
reviews = sparse.csc_matrix((values, ij.T)).astype(float)
reg = ElasticNetCV(fit_intercept=True,
                   alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])


def movie_norm(xc):
    '''Normalize per movie'''
    xc = xc.copy().toarray()
    # x1 is the mean of the positive items
    x1 = np.array([xi[xi > 0].mean() for xi in xc])
    x1 = np.nan_to_num(x1)
    for i in range(xc.shape[0]):
        xc[i] -= (xc[i] > 0) * x1[i]
    return xc, x1


def learn_for(i):
    u = reviews[i]
    us = np.delete(np.arange(reviews.shape[0]), i)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=25)

# Average CV score on the training set was: -0.7849110533550745
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.1, tol=0.0001)),
    RandomForestRegressor(bootstrap=False, max_features=0.3, min_samples_leaf=8,
                          min_samples_split=8, n_estimators=100))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 25)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
                             w_min=w_min)  # get the training set
        if window_type == "expanding":
            all_pred.append(linear_i(train, linear_func=linear_func))
        else:
            all_pred.append(
                np.mean([linear_i(x, linear_func=linear_func) for x in train]))
    return pd.Series(all_pred, index=out_months)


# Ridge regression with generalized cross-validation
ridge_pre_expanding = linear_prediction(RidgeCV())
# Lasso regression with 5-fold cross-validation
lasso_pre_expanding = linear_prediction(LassoCV(cv=5))
# Elastic net with 5-fold cross-validation
elasticnet_pre_expanding = linear_prediction(ElasticNetCV(cv=5))
# Least angle regression with 5-fold cross-validation
lars_pre_expanding = linear_prediction(LarsCV(cv=5))
# Orthogonal matching pursuit with 5-fold cross-validation
omp_pre_expanding = linear_prediction(OrthogonalMatchingPursuitCV(cv=5))

# Save the results
pd.DataFrame({
    "ridge": ridge_pre_expanding,
    "lasso": lasso_pre_expanding,
    "elastic net": elasticnet_pre_expanding,
    "lars": lars_pre_expanding,
    "OMP": omp_pre_expanding
}).to_csv(os.path.join(path, "预测结果", "expanding方式窗口预测.csv"))

linear_pre_expanding = pd.read_csv(os.path.join(path, "预测结果",
                                                "expanding方式窗口预测.csv"),
Y_train = X_train['SalePrice_log']
X_train.drop(labels='SalePrice_log', axis=1, inplace=True)

X_test = pd.read_csv(data_dir + 'clean_test.csv')

# ----

#
# ─── BUILD MODEL ────────────────────────────────────────────────────────────────
#

# Build ElasticNet regressor,
# Optimise hyperparameters with CV
enr_cv = ElasticNetCV(
    n_jobs=-1,
    random_state=5,
    cv=5,
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
    fit_intercept=True,
    precompute='auto',
    copy_X=True,
    verbose=2)

enr_cv.fit(X_train, Y_train)
print(enr_cv.alpha_)
print(enr_cv.mse_path_)

#
# ─── CREATE .CSV FOR SUBMISSION ──────────────────────────────────────────────────
#

X_test['SalePrice'] = pd.DataFrame(
    enr_cv.predict(X_test.drop(labels='Id', axis=1))).apply(lambda x: 10**x)
def regression_stage1(dmi, divs, dtreatments, gm_wbc, dinfections, pidmeta, dw,
                      wbc_type='neutrophils'):
    """Stage 1 Elastic Net regression

    dmi: microbiota composition for intervals
    divs: microbiota diversity for intervals, contains additional sample info (pid)
    dtreatments: immunomodulatory medications and treatments administered during interval
    gm_wbc: geometric mean of absolute white blood cell counts during interval
    dinfections: positive blood cultures detected during interval
    pidmeta: patient and HCT meta data
    dw: daily change in WBC count, "y"
    wbc_type: WBC type considered
    """
    # identify patients with microbiota data (stage 2) to exclude in stage 1
    # by inner-joining with microbiota diversity table
    Xmi = pd.merge(divs, dw, on=["pid", "anon-date"], how="inner")
    mi_pids = Xmi.reset_index().pid.unique()

    # stage 1 feature selection regression (and stage 2 as regularized ML version
    # instead of full Bayesian for internal checks)
    for MIINDICATOR in ["stage1", "stage2"]:
        X = pd.merge(
            dw,
            gm_wbc,
            on=["pid", "anon-date"],
            how="inner",
            suffixes=["", "_REMOVEgmwbc"])
        if MIINDICATOR == "stage2":
            # ML-version of the Bayesian model for stage 2, include microbiome
            X = pd.merge(
                X, dmi, on=["pid", "anon-date"], how="inner",
                suffixes=["", "_REMOVE_dmi"])
            X = pd.merge(
                X,
                divs.reset_index()[["pid", "anon-date", "inverseSimpson"]],
                on=["pid", "anon-date"],
                how="inner",
                suffixes=["", "_REMOVE_ivs"])
        X = pd.merge(
            X, dtreatments, on=["pid", "anon-date"], how="left",
            suffixes=["", "_REMOVEdreatments"])
        X = pd.merge(
            X, dinfections, on=["pid", "anon-date"], how="left",
            suffixes=["", "_REMOVEdinfections"])
        X = pd.merge(
            X, pidmeta, on=["pid", "n_bmt"], how="left",
            suffixes=["", "_REMOVEpidmeta"])
        X = X.loc[~X["log_%s" % WBCTYPE].isna()]
        X = X[[x for x in X.columns if "REMOVE" not in x]]
        X = X.drop(columns=["n_bmt"])
        X_columns = X.columns

        if MIINDICATOR == "stage1":
            # mi_pids: patients with microbiota data
            X = X.loc[X.pid.apply(lambda v: v not in mi_pids)]
        elif MIINDICATOR == "stage2":
            # ML-version of the Bayesian model for stage 2
            X = X.loc[X.pid.apply(lambda v: v in mi_pids)]

        # intercepts per transplant type
        X = X.join(pd.get_dummies(X["hct_source"]))
        # drop original HCT type column
        X = X.drop(columns=['hct_source', "PBSC"])  # PBSC as reference
        # intercepts per intensity
        X = X.join(pd.get_dummies(X["Intensity"].fillna("unknown")))
        X = X.drop(columns=["ABLATIVE", "unknown", "Intensity"])  # ABLATIVE as reference
        # intercepts female
        X = X.join(pd.get_dummies(X.sex)['F'])
        X = X.drop(columns='sex')

        # only after engraftment and before day 100
        X = X.query('day>6 ').copy()  # smallest observed engraftment day
        # only analyze daily changes
        X = X.query('dt == 1').copy()

        # from join, fill gaps
        X.loc[:, dinfections.columns] = X[dinfections.columns].fillna(0)
        X.loc[:, dtreatments.columns] = X[dtreatments.columns].fillna(0)
        # missing patient ages, fill with mean for ML feature selection
        X["age"] = X["age"].fillna(X["age"].mean())

        ### drop columns
        # delta time columns
        dtcols = [x for x in X.columns if 'dt' in x]
        X = X.drop(columns=dtcols)
        # time point columns
        anoncols = [x for x in X.columns if 'anon' in x]
        X = X.drop(columns=anoncols)
        # patient id columns
        pidcols = [x for x in X.columns if ('pid' in x) and (x != 'pid')]
        remaining_pids = X.reset_index().pid.unique()
        print("pid count in regression", len(remaining_pids))
        X = X.drop(columns=pidcols)

        print("shape before dropna", X.shape)
        X = X.dropna()
        print("shape after dropna (should not change)", X.shape)

        # drop all zero columns
        drop_zero_columns = (X.sum() == 0) & (X.max() == 0)
        drop_zero_columns = drop_zero_columns.loc[drop_zero_columns.values].index
        X = X.drop(columns=drop_zero_columns)

        # drop HCT day
        daycolumns = [
            x for x in X.columns if ("day" in x) and (x not in ["day", "eday"])
        ]
        X.drop(columns=daycolumns, inplace=True)

        #### data transformations and standardizations
        from sklearn.preprocessing import StandardScaler
        X.loc[:, [
            'neutrophils', 'lymphocytes', 'monocytes', 'eosinophils', 'platelets'
        ]] = np.log10(X[[
            'neutrophils', 'lymphocytes', 'monocytes', 'eosinophils', 'platelets'
        ]] + 0.1 / 2)

        from sklearn.linear_model import ElasticNetCV
        # Fit and summarize OLS model
        # CV on patient sub samples, X contains 'pid' column
        Xpid = X.copy()
        X = X.drop(columns=['pid'])
        _drug_cols = [x for x in X.columns if x in dtreatments.columns]
        _other_cols = [x for x in X.columns if x not in dtreatments.columns]
        groups = Xpid.pid

        from sklearn.model_selection import GroupKFold
        group_kfold = GroupKFold(n_splits=10)
        cv = list(
            group_kfold.split(
                X.drop(columns='log_%s' % wbc_type), X['log_%s' % wbc_type],
                groups))

        mod = ElasticNetCV(
            cv=cv, positive=False, normalize=False, fit_intercept=True)
        res = mod.fit(
            MinMaxScaler().fit_transform(X.drop(columns='log_%s' % wbc_type)),
            X['log_%s' % wbc_type],
        )
        r2 = mod.score(
            MinMaxScaler().fit_transform(X.drop(columns='log_%s' % wbc_type)),
            X['log_%s' % wbc_type],
        )
        print('chosen alpha: %f' % mod.alpha_)
        print("R2 = %f" % r2)

        coefs = pd.Series(
            res.coef_,
            index=X.drop(columns='log_%s' % wbc_type).columns).replace(
                0, np.nan).dropna().sort_values()
        coefs["gr"] = mod.intercept_
        coefs["N"] = len(np.unique(Xpid.pid))
        coefs["n"] = X.shape[0]
        coefs["r2"] = r2
        coefs["alpha"] = mod.alpha_

    return (coefs)
    'filesReg': filesReg,
    'filesBinClass': filesBinClass,
    'filesMultiClass': filesMultiClass
}

regsNames = ['LinearRegression', 'RidgeCV', 'LassoCV', 'ElasticNetCV']
Regs = [
    LinearRegression(normalize=True),
    RidgeCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0],
            cv=10, normalize=True),
    LassoCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0],
            cv=10, normalize=True),
    ElasticNetCV(alphas=[0.1, 0.3, 0.5, 0.7, 1.0, 3.0, 5.0, 7.0, 10.0],
                 cv=10, normalize=True)
]

Classifiers = [
    LogisticRegressionCV(cv=10),
    DecisionTreeClassifier(max_depth=3),
    svm.SVC(kernel='rbf', probability=True),
    svm.SVC(kernel='linear', probability=True),
    neighbors.KNeighborsClassifier(n_neighbors=7)
]
#naive_bayes.GaussianNB(), neighbors.KNeighborsClassifier(n_neighbors=7)]
ClassifiersNames = [
    'LogisticRegressionCV', 'DecisionTreeClassifier', 'svm.SVC_rbf',
    'svm.SVC_linear', 'neighbors.KNeighborsClassifier'
]
def main_method():
    configure_logging()
    train = load_training_dataset()
    logging.info(train.shape)

    # Duplicates check
    check_for_duplicates(train)
    exit()

    # ## Pre-processing
    plot_col_vs_sale_price(train, "GrLivArea", title="Looking for outliers")

    # Drop the houses with more than 4000 sq feet following dataset author recommendations:
    # https://ww2.amstat.org/publications/jse/v19n3/decock.pdf

    # ### Pre-processing steps added
    #
    # * Filter large house outliers
    # * Log transform the target (Sale Price)
    #   * errors have same effect whether the house is cheap or not
    #
    train = preprocessing_pipeline(train)
    y = train.SalePrice

    plot_col_vs_sale_price(train, "GrLivArea", title="Area vs Sale Price AFTER Log transform")

    train = fill_null_values(train)

    # Numerical features that are really categories
    # TODO: pull into pipeline method
    train = create_sub_class_categories(train)
    train = create_month_sold_category(train)

    # Encode categoricals as ordered number features
    # when there is information in the order
    train = create_ordinal_categories(train)

    # Simplifications of existing features
    create_simple_overall_quality(train)
    create_simple_overall_condition(train)
    create_simple_pool_quality(train)
    create_simple_garage_condition(train)
    create_simple_garage_quality(train)
    create_simple_fireplace_quality(train)
    create_simple_functional_feature(train)
    create_simple_kitchen_quality(train)
    create_simple_heating_quality(train)
    create_simple_basement_finish(train, "BsmtFinType1")
    create_simple_basement_finish(train, "BsmtFinType2")
    create_simple_basement_condition(train)
    create_simple_basement_quality(train)
    create_simple_exterior_condition(train)
    create_simple_exterior_quality(train)

    # Combinations of existing features
    create_interaction_features(train)

    # Has masonry veneer or not
    create_simple_has_masonry_veneer(train)
    create_house_bought_pre_build(train)
    create_polynomial_features(train)

    # #### Split numerical and categorical features
    categorical_features = train.select_dtypes(include=["object"]).columns
    numerical_features = train.select_dtypes(exclude=["object"]).columns.drop("SalePrice")
    logging.info(f"Numerical features: {len(numerical_features)}")
    logging.info(f"Categorical features: {len(categorical_features)}")
    train_num = train[numerical_features]
    train_cat = train[categorical_features]

    # Handle missing values in numerical features by using the median
    logging.info(f"Missing numerical values: {train_num.isnull().values.sum()}")
    train_num = train_num.fillna(train_num.median())
    logging.info(f"Remaining missing numerical values: {train_num.isnull().values.sum()}")

    ## Log transform skewed numerical features to lessen impact of outliers
    # Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
    # As a general rule of thumb, a skewness with an absolute value > 0.5 is considered at least moderately skewed
    skewness = train_num.apply(lambda x: skew(x))
    skewness = skewness[abs(skewness) > 0.5]
    skewed_features = skewness.index
    train_num[skewed_features] = np.log1p(train_num[skewed_features])

    ## One hot encode categorical variables
    train_cat = pd.get_dummies(train_cat)

    train = pd.concat([train_num, train_cat], axis=1)
    logging.info(f"Number of features: {train.shape[1]}")

    # ## Modelling
    #
    # * Split dataset
    # * Standardisation (don't want to fit on observations that will be in the test set)
    #
    # ### Modelling techniques tried
    #
    # * Linear regression
    # * Ridge Regression (L2)
    # * LASSO (L1)
    # * ElasticNET (L1 AND L2)
    #
    # Split training set
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=0.3, random_state=0
    )
    logging.info(f"X_train: {X_train.shape}")
    logging.info(f"X_test: {X_test.shape}")
    logging.info(f"y_train: {y_train.shape}")
    logging.info(f"y_test: {y_test.shape}")

    # Standard scale the features
    # Done after partitioning to avoid fitting scaler to observations in the test set
    # Should the scaler be pickled for deployment use cases then?
    scaler = StandardScaler()
    X_train.loc[:, numerical_features] = scaler.fit_transform(X_train.loc[:, numerical_features])
    X_test.loc[:, numerical_features] = scaler.transform(X_test.loc[:, numerical_features])

    # Official error measure for scoring: RMSE
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # ## Linear regression without regularisation
    linear_regression = LinearRegression()
    linear_regression.fit(X_train, y_train)

    logging.info(f"RMSE on Training set: {rmse_cv_train(linear_regression).mean()}")
    logging.info(f"RMSE on Test set: {rmse_cv_test(linear_regression).mean()}")

    y_train_pred = linear_regression.predict(X_train)
    y_test_pred = linear_regression.predict(X_test)

    # Plot residuals
    plt.scatter(y_train_pred, y_train_pred - y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test_pred - y_test, c="lightgreen", marker="s", label="Validation data")
    plt.title("Linear Regression")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(y_train_pred, y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test, c="lightgreen", marker="s", label="Validation")
    plt.title("Linear Regression")
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    # ## Linear Regression with Ridge Regression (L2 Penalty)
    #
    # * Regularisation is a good way to handle collinearity, filter out noise and prevent overfitting.
    # * L2 penalty adds the squared sum of weights to the cost function
    ridge_regression = RidgeCV(
        alphas=[0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60]
    )
    ridge_regression.fit(X_train, y_train)
    best_alpha = ridge_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")

    logging.info("Re-fit with alphas around the best alpha")
    ridge_regression = RidgeCV(
        alphas=[
            best_alpha * .6, best_alpha * .65, best_alpha * .7, best_alpha * .75,
            best_alpha * .8, best_alpha * .85, best_alpha * .9, best_alpha * .9,
            best_alpha * .95, best_alpha, best_alpha * 1.05, best_alpha * 1.1,
            best_alpha * 1.15, best_alpha * 1.2, best_alpha * 1.25, best_alpha * 1.3,
            best_alpha * 1.35, best_alpha * 1.4,
        ],
        cv=10
    )
    ridge_regression.fit(X_train, y_train)
    best_alpha = ridge_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")

    logging.info(f"Ridge RMSE on Training set: {rmse_cv_train(ridge_regression).mean()}")
    logging.info(f"Ridge RMSE on Test set: {rmse_cv_test(ridge_regression).mean()}")

    y_train_pred = ridge_regression.predict(X_train)
    y_test_pred = ridge_regression.predict(X_test)

    # Plot residuals
    plt.scatter(y_train_pred, y_train_pred - y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test_pred - y_test, c="lightgreen", marker="s", label="Validation data")
    plt.title("Linear Regression with Ridge regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(y_train_pred, y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test, c="lightgreen", marker="s", label="Validation")
    plt.title("Linear Regression with Ridge regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    ## Plot important coefficients
    coefs = pd.Series(ridge_regression.coef_, index=X_train.columns)
    logging.info(f"Ridge picked {sum(coefs != 0)} features and eliminated the other {sum(coefs == 0)} features")
    important_coefficients = pd.concat([coefs.sort_values().head(10), coefs.sort_values().tail(10)])
    important_coefficients.plot(kind="barh")
    plt.title("Coefficients in the Ridge Model")
    plt.show()

    # Results:
    # * Better RMSE for Ridge
    # * Small difference between training and test suggests that overfitting has been eliminated
    # * Ridge used almost all of the features (only 3 dropped)
    #
    # TODO: Understand the coefficient plot (if possible?)

    # ## LASSO Regression
    #
    # Least Absolute Shrinkage and Selection Operator.
    #
    # Alternative regularisation method, L1 Regularisation, uses the absolute value of weights rather than squares.
    #
    # Most weights will be 0. Useful in high dimensional datasets where most features are irrelevant.
    #
    # Hypothesis: more efficient than Ridge Regression.
    lasso_regression = LassoCV(
        alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01,
                0.03, 0.06, 0.1, 0.3, 0.6, 1],
        max_iter=50000,
        cv=10
    )
    lasso_regression.fit(X_train, y_train)
    best_alpha = lasso_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")

    logging.info("Re-fit with alphas around the best alpha")
    lasso_regression = LassoCV(
        alphas=[
            best_alpha * .6, best_alpha * .65, best_alpha * .7, best_alpha * .75,
            best_alpha * .8, best_alpha * .85, best_alpha * .9, best_alpha * .9,
            best_alpha * .95, best_alpha, best_alpha * 1.05, best_alpha * 1.1,
            best_alpha * 1.15, best_alpha * 1.2, best_alpha * 1.25, best_alpha * 1.3,
            best_alpha * 1.35, best_alpha * 1.4,
        ],
        max_iter=50000,
        cv=10
    )
    lasso_regression.fit(X_train, y_train)
    best_alpha = lasso_regression.alpha_
    logging.info(f"Best alpha {best_alpha}")

    logging.info(f"LASSO RMSE on Training set: {rmse_cv_train(lasso_regression).mean()}")
    logging.info(f"LASSO RMSE on Test set: {rmse_cv_test(lasso_regression).mean()}")

    y_train_pred = lasso_regression.predict(X_train)
    y_test_pred = lasso_regression.predict(X_test)

    # Plot residuals
    plt.scatter(y_train_pred, y_train_pred - y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test_pred - y_test, c="lightgreen", marker="s", label="Validation data")
    plt.title("Linear Regression with LASSO regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(y_train_pred, y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test, c="lightgreen", marker="s", label="Validation")
    plt.title("Linear Regression with LASSO regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    ## Plot important coefficients
    coefs = pd.Series(lasso_regression.coef_, index=X_train.columns)
    logging.info(f"LASSO picked {sum(coefs != 0)} features and eliminated the other {sum(coefs == 0)} features")
    important_coefficients = pd.concat([coefs.sort_values().head(10), coefs.sort_values().tail(10)])
    important_coefficients.plot(kind="barh")
    plt.title("Coefficients in the LASSO Model")
    plt.show()

    # ## ElasticNET
    #
    # * Compromise between L1 and L2 penalties
    elastic_net_regression = ElasticNetCV(
        l1_ratio=[0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1],
        alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01,
                0.03, 0.06, 0.1, 0.3, 0.6, 1],
        max_iter=50000,
        cv=10
    )
    elastic_net_regression.fit(X_train, y_train)
    best_l1_ratio = elastic_net_regression.l1_ratio_
    best_alpha = elastic_net_regression.alpha_
    logging.info(f"Best L1 Ratio {best_l1_ratio}")
    logging.info(f"Best alpha {best_alpha}")

    logging.info("Re-fit with alphas around the best alpha")
    elastic_net_regression = ElasticNetCV(
        l1_ratio=[
            best_l1_ratio * .85, best_l1_ratio * .9, best_l1_ratio * .9,
            best_l1_ratio * .95, best_l1_ratio, best_l1_ratio * 1.05,
            best_l1_ratio * 1.1, best_l1_ratio * 1.15
        ],
        alphas=[
            best_alpha * .6, best_alpha * .65, best_alpha * .7, best_alpha * .75,
            best_alpha * .8, best_alpha * .85, best_alpha * .9, best_alpha * .9,
            best_alpha * .95, best_alpha, best_alpha * 1.05, best_alpha * 1.1,
            best_alpha * 1.15, best_alpha * 1.2, best_alpha * 1.25, best_alpha * 1.3,
            best_alpha * 1.35, best_alpha * 1.4,
        ],
        max_iter=50000,
        cv=10
    )
    elastic_net_regression.fit(X_train, y_train)
    best_l1_ratio = elastic_net_regression.l1_ratio_
    best_alpha = elastic_net_regression.alpha_
    logging.info(f"Best L1 Ratio {best_l1_ratio}")
    logging.info(f"Best alpha {best_alpha}")

    logging.info(f"ElasticNet RMSE on Training set: {rmse_cv_train(elastic_net_regression).mean()}")
    logging.info(f"ElasticNet RMSE on Test set: {rmse_cv_test(elastic_net_regression).mean()}")

    y_train_pred = elastic_net_regression.predict(X_train)
    y_test_pred = elastic_net_regression.predict(X_test)

    # Plot residuals
    plt.scatter(y_train_pred, y_train_pred - y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test_pred - y_test, c="lightgreen", marker="s", label="Validation data")
    plt.title("Linear Regression with ElasticNET regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(y_train_pred, y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_pred, y_test, c="lightgreen", marker="s", label="Validation")
    plt.title("Linear Regression with ElasticNET regularisation")
    plt.xlabel("Predicted Values")
    plt.ylabel("Real Values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    ## Plot important coefficients
    coefs = pd.Series(elastic_net_regression.coef_, index=X_train.columns)
    logging.info(f"ElasticNET picked {sum(coefs != 0)} features and eliminated the other {sum(coefs == 0)} features")
    important_coefficients = pd.concat([coefs.sort_values().head(10), coefs.sort_values().tail(10)])
    important_coefficients.plot(kind="barh")
    plt.title("Coefficients in the ElasticNET Model")
    plt.show()
classifiers = [
    SVC(kernel="rbf", probability=True),
    SVC(kernel='linear', probability=True),
    SVC(kernel='sigmoid', probability=True),
    SVC(kernel='poly', probability=True, degree=3),
    SVC(kernel='poly', probability=True, degree=4),
    SVC(kernel='poly', probability=True, degree=5),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    ElasticNetCV(max_iter=10000),
    LarsCV(),
    LassoCV(max_iter=10000),
    LassoLarsCV(),
    LogisticRegressionCV(),  # 17
    MultiTaskElasticNetCV(),
    MultiTaskLassoCV(),
    OrthogonalMatchingPursuitCV(),
    RidgeClassifierCV()
]

algorithm = 17
if len(sys.argv) > 1:
    algorithm = int(sys.argv[1])

name = names[algorithm]
clf = classifiers[algorithm]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=12)

np_Xtrain = np.array(X_train.iloc[:, -len(X_train.columns) + 1:-1])
np_Xtest = np.array(X_test.iloc[:, -len(X_test.columns) + 1:-1])
np_ytest = np.array(y_test.iloc[:, 1])
np_ytrain = np.array(y_train.iloc[:, 1])

# Performing elastic net cv to determine optimal alpha and l1 ratio
X_cv = np.array(X.iloc[:, -len(X.columns) + 1:-1])
y_cv = np.array(y.iloc[:, 1])

from sklearn.linear_model import ElasticNetCV
enet_cv = ElasticNetCV(cv=10, n_jobs=2, tol=0.5)
enet_cvfit = enet_cv.fit(X_cv, y_cv)

# Performing elastic net
from sklearn.linear_model import ElasticNet
alpha = enet_cvfit.alpha_
l1ratio = enet_cvfit.l1_ratio_
tolerance = 0.5
enet = ElasticNet(alpha=alpha, l1_ratio=l1ratio, tol=tolerance, max_iter=10000)
enet_fitted = enet.fit(np_Xtrain, np_ytrain)
y2_pred = enet_fitted.predict(np_Xtest)

from sklearn.metrics import r2_score
r2_score_model = r2_score(np_ytest, y2_pred)
print(enet)
print("r^2 on test data : %f" % r2_score_model)
ax = plt.gca()
ax.plot(lasso_alphas, lasso_coefs)
ax.set_xscale('log')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Lasso coefficients as a function of the regularization')
plt.axis('tight')
plt.show()

# In[51]:

lcv.coef_

# Elastic net

# In[52]:

from sklearn.linear_model import ElasticNetCV

l1_ratio = [0, .1, .5, .7, .9, .95, .99, 1]
encv = ElasticNetCV(l1_ratio=l1_ratio)
encv.fit(X, y)

# In[53]:

print('The best l1_ratio is {}'.format(encv.l1_ratio_))
print('The best alpha is {}'.format(encv.alpha_))
def train_linear_model(
    X,
    y,
    random_state=1,
    test_size=0.2,
    regularization_type="elasticnet",
    k_fold=5,
    max_iter=1000000,
    tol=0.0001,
    l1_ratio=None,
):
    """
    Function to train linear model with regularization and cross-validation.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model evaluation.
        regularization_type (str): lasso or ridge or elastic-net (with cv).
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization.
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.

    Returns:
        sklearn.linear_model.LinearModel: fitted model.
        mu (float): Mean value of descriptors used in training.
        s (float): Std dev of descriptors used in training.
    """
    if l1_ratio is None:
        l1_ratio = [0.1, 0.5, 0.7, 0.9, 0.95, 1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s

    hyperparameters = {
        "random_state": random_state,
        "test_size": test_size,
        "k_fold": k_fold,
        "tol": tol,
        "max_iter": max_iter,
    }

    if regularization_type == "lasso" and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True, alphas=None, tol=tol,
                          cv=k_fold, max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True, alpha=alpha_opt, max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters["l1_ratio"] = 1

    elif regularization_type == "ridge" and y.shape[1] == 1:
        ridgecv = RidgeCV(fit_intercept=True, alphas=None, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters["l1_ratio"] = 0

    elif regularization_type == "elasticnet" and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(
            fit_intercept=True,
            normalize=False,
            alphas=None,
            cv=k_fold,
            l1_ratio=l1_ratio,
            max_iter=max_iter,
        )
        elasticnetcv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(
            fit_intercept=True,
            normalize=False,
            l1_ratio=l1_ratio_opt,
            alpha=alpha_opt,
            max_iter=max_iter,
        )
        linear_model.fit(X_scaled, y_train)
        hyperparameters["l1_ratio"] = l1_ratio_opt

    # If more than 1 outcome present, perform multitask regression
    elif regularization_type == "elasticnet" and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(
            fit_intercept=True,
            cv=k_fold,
            normalize=False,
            l1_ratio=l1_ratio,
            max_iter=max_iter,
        )
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True, normalize=False,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters["l1_ratio"] = l1_ratio_opt

    else:
        raise NotImplementedError

    y_pred = linear_model.predict((X_test - mu) / s)
    Rsq = linear_model.score((X_test - mu) / s, y_test)

    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output
    pred_actual_ratio = [x / y for x, y in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96 * np.sqrt(
        mean_squared_error(np.ones(y_pred.shape), pred_actual_ratio,
                           multioutput="raw_values") / y_pred.shape[0])
    hyperparameters["alpha"] = alpha_opt

    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
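# Usage sketch for train_linear_model (not from the original module): the synthetic
# descriptors and single-column outcome below are hypothetical stand-ins, used only
# to illustrate the call signature and the returned standardization statistics.
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    X_demo = pd.DataFrame(rng.rand(50, 4), columns=["d1", "d2", "d3", "d4"])
    y_demo = pd.DataFrame({"cycle_life": rng.rand(50)})

    model, mu, s, rel_err, Rsq, hp = train_linear_model(
        X_demo, y_demo, regularization_type="elasticnet", k_fold=5)
    print(Rsq, hp["alpha"], hp["l1_ratio"])
    # New descriptors must be standardized with the training statistics:
    predictions = model.predict((X_demo - mu) / s)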
# n_jobs=-1 allows your computer to use all its cores for the computation.
# Lasso is more computationally intensive than ridge for a small number of
# features.
# The lasso regression is more resistant to outliers compared to ridge, and it
# has the added advantage of being able to eliminate less important features.
# For this reason it is used when the number of features is very large and you
# don't know which are important.
lasso = LassoCV(n_alphas=50, cv=5)

# ElasticNetCV(l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None,
# fit_intercept=True, normalize=False, precompute='auto', max_iter=1000,
# tol=0.0001, cv='warn', copy_X=True, verbose=0, n_jobs=None, positive=False,
# random_state=None, selection='cyclic')
# A combination of Ridge and Lasso for large datasets.
# l1_ratio determines how much of Ridge or Lasso to use. 1 is Lasso, 0 is
# Ridge.
elastic = ElasticNetCV(cv=5)


def search_timer(c, X_train, y_train):
    start = time.time()
    c.fit(X_train, y_train)
    end = time.time()
    total = end - start
    return total


# RandomizedSearchCV(estimator, param_distributions, n_iter=10, scoring=None,
# n_jobs=None, refit=True, cv='warn', verbose=0, pre_dispatch='2*n_jobs',
# random_state=None, return_train_score=False) searches for the best parameters
# for a model randomly. The best model, score and parameters can then be
# accessed using the object's properties.
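# Sketch of the RandomizedSearchCV usage described in the comment above (X_train and
# y_train are assumed to exist, as in search_timer; the ElasticNet parameter
# distributions here are illustrative, not taken from the source).
from scipy.stats import uniform
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {"alpha": uniform(0.0001, 1.0), "l1_ratio": uniform(0, 1)}
random_search = RandomizedSearchCV(ElasticNet(max_iter=10000), param_distributions,
                                   n_iter=20, cv=5, n_jobs=-1, random_state=0)
random_search.fit(X_train, y_train)
# Best model, score and parameters are exposed as properties of the search object:
# random_search.best_estimator_, random_search.best_score_, random_search.best_params_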
from sklearn.linear_model import ElasticNetCV, enet_path

data = np.loadtxt('spike_cluster5.csv', delimiter=",")
waves = np.loadtxt('waves_cluster5.csv', delimiter=",")
ndir = np.loadtxt('ndir_cluster5.csv', delimiter=",")
#print(waves.shape)  # waves (90(sum of all directions) x 25704(12S*102ch*20ms))
#print(data.shape)   # data (2040(102channels*20ms) x 273(nspikes*R))
# speeds = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

nspikes = len(ndir)                      # number of spikes in the cluster
R = int(data.shape[1] / nspikes)         # number of shifts in sliding window
S = int(waves.shape[1] / data.shape[0])  # number of propagation speeds
T = data.shape[0]

# elastic net regression
regression = ElasticNetCV(positive=True, normalize=True, cv=5, max_iter=10000)

cumdir = 0
bestind = np.zeros([nspikes, ], dtype=int)  # indices of best speeds
numdir = np.zeros(nspikes, )                # number of nonzero directions in optimum
#bestcoef = np.zeros([nspikes, int(ndir.T.max())])
finalscore = np.zeros(nspikes, )

for ind_sp in range(0, nspikes):
    Ndir = int(ndir[ind_sp])  # number of propagation directions
    datasp = data[:, ind_sp * R:(ind_sp + 1) * R]  # spike 21x2040
    wavessp = waves[cumdir:cumdir + Ndir]  # waves for this spike Ndir x 12S*102ch*20ms
    coefs = np.zeros([R, S, Ndir])  # regression coefficients
    intercept = np.zeros([R, S])    # regression intercept
    score = np.zeros([R, S])        # R-squared scores
                              warm_start=True)

if (method == 5):
    print('Random forest 02')
    str_method = 'RandomForest02'
    r = RandomForestRegressor(n_estimators=90,
                              max_depth=4,
                              n_jobs=-1,
                              random_state=ra1,
                              verbose=0,
                              warm_start=True)

if (method == 6):
    print('ElasticNet')
    str_method = 'Elastic Net'
    r = ElasticNetCV()

if (method == 7):
    print('GradientBoosting 01')
    str_method = 'GradientBoosting01'
    r = GradientBoostingRegressor(n_estimators=80,
                                  max_depth=5,
                                  learning_rate=0.05,
                                  random_state=ra1,
                                  verbose=0,
                                  warm_start=True,
                                  subsample=0.6,
                                  max_features=0.6)

if (method == 8):
    print('GradientBoosting 02')
    str_method = 'GradientBoosting02'
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [
    5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008
]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))

elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))

svr = make_pipeline(RobustScaler(), SVR(
    C=20,
    epsilon=0.008,
    gamma=0.0003,
))

gbr = GradientBoostingRegressor(n_estimators=3000,
                                learning_rate=0.05,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)
# Lasso
lasso = Lasso(alpha=0.0005, random_state=1)
print("Lasso score: {:.4f} \n".format(cv_rmse(lasso).mean()))

alphas_alt = [
    14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5, 10, 5
]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]

# Ridge
ridge = RidgeCV(alphas=alphas_alt, cv=kfolds)
print("Ridge score: {:.4f} \n".format(cv_rmse(ridge).mean()))

# ElasticNet
elasticnet = ElasticNetCV(cv=kfolds, alphas=e_alphas)
print("ElasticNet score: {:.4f} \n".format(cv_rmse(elasticnet).mean()))

# Svr
#svr = SVR()
#print(cv_rmse(svr).mean())

# XGBoost
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=3460,
                       max_depth=3,
                       min_child_weight=0,
                       gamma=0,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 26 15:48:46 2017

@author: kaifeng
"""

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.cross_validation import KFold
from sklearn.linear_model import ElasticNetCV

data, target = load_svmlight_file('C:/Users/carne/Desktop/E2006.train')

met = ElasticNetCV(fit_intercept=True)

kf = KFold(len(target), n_folds=2)
err = 0  # accumulate the squared error across folds
for train, test in kf:
    met.fit(data[train], target[train])
    p = map(met.predict, data[test])
    p = np.array(p).ravel()
    e = p - target[test]
    err += np.dot(e, e)

rmse_10cv = np.sqrt(err / len(target))
# Don't run this on a personal laptop... it's a bit of a waste of time~
                                 min_samples_leaf=5),
           random_state=13, n_estimators=17), "AdaBoostAuto")
build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
           "DecisionTreeAuto", compact=False)
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                     random_state=13, n_estimators=3, max_features=0.5),
    "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy="median"), "DummyAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5), "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None), "GradientBoostingAuto")
build_auto(HuberRegressor(), "HuberAuto")
build_auto(LarsCV(), "LarsAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LassoLarsCV(), "LassoLarsAuto")
build_auto(
    OptimalLGBMRegressor(objective="regression", n_estimators=17, num_iteration=11),
    "LGBMAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
    BaggingRegressor(LinearRegression(), random_state=13, max_features=0.75),
import numpy as np

from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    StandardScaler(),
    MinMaxScaler(),
    ElasticNetCV(l1_ratio=0.15000000000000002, tol=0.0001)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)