def test_multitask_enet_and_lasso_cv():
    X, y, _, _ = build_dataset(n_features=50, n_targets=3)
    clf = MultiTaskElasticNetCV(cv=3).fit(X, y)
    assert_almost_equal(clf.alpha_, 0.00556, 3)
    clf = MultiTaskLassoCV(cv=3).fit(X, y)
    assert_almost_equal(clf.alpha_, 0.00278, 3)

    X, y, _, _ = build_dataset(n_targets=3)
    clf = MultiTaskElasticNetCV(n_alphas=10, eps=1e-3, max_iter=100,
                                l1_ratio=[0.3, 0.5], tol=1e-3, cv=3)
    clf.fit(X, y)
    assert 0.5 == clf.l1_ratio_
    assert (3, X.shape[1]) == clf.coef_.shape
    assert (3,) == clf.intercept_.shape
    assert (2, 10, 3) == clf.mse_path_.shape
    assert (2, 10) == clf.alphas_.shape

    X, y, _, _ = build_dataset(n_targets=3)
    clf = MultiTaskLassoCV(n_alphas=10, eps=1e-3, max_iter=100, tol=1e-3, cv=3)
    clf.fit(X, y)
    assert (3, X.shape[1]) == clf.coef_.shape
    assert (3,) == clf.intercept_.shape
    assert (10, 3) == clf.mse_path_.shape
    assert 10 == len(clf.alphas_)
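# The test above depends on scikit-learn's private build_dataset test helper.
# A minimal, self-contained stand-in (an assumption, using make_regression
# rather than the original helper) that reproduces the same shape conventions:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import MultiTaskLassoCV

X_demo, y_demo = make_regression(n_samples=60, n_features=10, n_targets=3,
                                 random_state=0)
cv_demo = MultiTaskLassoCV(n_alphas=10, cv=3).fit(X_demo, y_demo)
assert cv_demo.coef_.shape == (3, X_demo.shape[1])  # (n_tasks, n_features)
assert cv_demo.intercept_.shape == (3,)             # one intercept per task
assert cv_demo.mse_path_.shape == (10, 3)           # (n_alphas, n_folds)
assert len(cv_demo.alphas_) == 10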
class MultiLasso_model(Lasso_model):
    def __init__(self, train_path, test_path, pred_path):
        super().__init__(train_path, test_path, pred_path)
        self.multiLasso_model = MultiTaskLassoCV(
            alphas=[float(i) * 0.05 for i in range(1, 100)],
            cv=8, max_iter=1000000)

    def train(self, X_train, Y_train):
        self.multiLasso_model.fit(X_train, Y_train)

    def pred(self, X_test):
        return self.multiLasso_model.predict(X_test)

    def run(self):
        X_train_PMNF, X_test_PMNF, y_trains, y_tests = super().get_train_test()
        self.train(X_train_PMNF, np.asarray(y_trains).T)
        y_preds = self.pred(X_test_PMNF).T
        print(y_preds.shape, np.asarray(y_tests).shape)
        with open(self.pred_path, "w", newline='') as f:
            csv_writer = csv.writer(f)
            for i in range(len(y_trains)):
                for row in self.data_train_split[i]:
                    csv_writer.writerow(row)
                group = self.data_test_split[i][self.split_train_len:, :]
                for j in range(len(group)):
                    row = np.append(group[j, :], y_preds[i][j])
                    csv_writer.writerow(row)
def _informativeness(self, z_p, z):
    regressor = self.regressor
    if isinstance(regressor, LassoCV):
        # promote to the multi-task variant when z has several columns
        regressor = MultiTaskLassoCV(cv=self.regressor.cv, max_iter=2000,
                                     selection='random')
    regressor.fit(z_p, z)
    # score() needs targets as well; the original self.regressor.score(z_p)
    # would raise a TypeError and ignored the freshly fitted model
    return regressor.score(z_p, z)
def test_1d_multioutput_lasso_and_multitask_lasso_cv():
    X, y, _, _ = build_dataset(n_features=10)
    y = y[:, np.newaxis]
    clf = LassoCV(n_alphas=5, eps=2e-3)
    clf.fit(X, y[:, 0])
    clf1 = MultiTaskLassoCV(n_alphas=5, eps=2e-3)
    clf1.fit(X, y)
    assert_almost_equal(clf.alpha_, clf1.alpha_)
    assert_almost_equal(clf.coef_, clf1.coef_[0])
    assert_almost_equal(clf.intercept_, clf1.intercept_[0])
class MultiTaskLassoCVImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
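# The wrapper above assumes a module-level Op bound to the underlying
# estimator (in this style of binding, scikit-learn's MultiTaskLassoCV).
# A minimal usage sketch under that assumption:
import numpy as np
from sklearn.linear_model import MultiTaskLassoCV as Op

rng = np.random.RandomState(0)
X_w = rng.randn(30, 4)
Y_w = X_w[:, :2] + 0.01 * rng.randn(30, 2)  # two closely related targets
impl = MultiTaskLassoCVImpl(n_alphas=5, cv=3)
print(impl.fit(X_w, Y_w).predict(X_w).shape)  # (30, 2)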
def select_features(x, y, methods=('variance', 'correlation', 'l1', 'forest')):
    '''
    methods = ('variance', 'correlation', 'l1', 'forest')
    - variance: use a variance threshold to discard features that are mostly 0 or 1
    - correlation: use the chi2 test to remove the most strongly correlated features
    - l1: use an l1 penalty to remove features so that the solution is sparse
    - forest: use ExtraTreesClassifier to rank feature importances and
      select the important ones
    '''
    features = x.loc[:, 'Feature_1':'Feature_2']
    idx_list = []  # support sets of the fitted selectors, intersected below
    if 'variance' in methods:
        vt = VT(threshold=(0.99 * (1 - 0.99)))
        vt.fit(features)
        idx_list.append(set(np.where(vt.get_support())[0]))
    if 'correlation' in methods:
        # note: constructed but never fitted in the original code
        cr = SP(f_regression, percentile=80)
    if 'l1' in methods:
        rgr = MultiTaskLassoCV(cv=5, n_jobs=-1)
        m = SFM(rgr)
        m.fit(x.values, y.values)
        idx_list.append(set(np.where(m.get_support())[0]))
    if 'forest' in methods:
        # 'RandomRorestRegressor' in the original was a typo
        clf = RandomForestRegressor(n_estimators=300, max_features=0.7,
                                    n_jobs=-1).fit(x, y)
        m = SFM(clf)
        m.fit(x.values, y.values)
        idx_list.append(set(np.where(m.get_support())[0]))
    # intersect the supports; the original never initialized x_indices
    x_indices = set(range(x.shape[1]))
    for indices in idx_list:
        x_indices = x_indices & indices
    print('All: %s' % len(x_indices))
    return list(x_indices)
def _MTLassoCV_MatchSpace(X, Y, v_pens=None, n_v_cv=5, sample_frac=1,
                          Y_col_block_size=None, se_factor=None,
                          normalize=True, **kwargs):  # pylint: disable=missing-param-doc, unused-argument
    # A fake MT would do Lasso on y_mean = Y.mean(axis=1)
    if sample_frac < 1:
        N = X.shape[0]
        sample = np.random.choice(N, int(sample_frac * N), replace=False)
        X = X[sample, :]
        Y = Y[sample, :]
    if Y_col_block_size is not None:
        Y = _block_summ_cols(Y, Y_col_block_size)
    varselectorfit = MultiTaskLassoCV(normalize=normalize, cv=n_v_cv,
                                      alphas=v_pens).fit(X, Y)
    best_v_pen = varselectorfit.alpha_
    if se_factor is not None:
        best_v_pen = _neg_se_rule(varselectorfit, factor=se_factor)
        varselectorfit = MultiTaskLasso(alpha=best_v_pen,
                                        normalize=normalize).fit(X, Y)
    V = np.sqrt(np.sum(np.square(varselectorfit.coef_), axis=0))  # n_tasks x n_features -> n_features
    m_sel = V != 0
    transformer = SelMatchSpace(m_sel)
    return transformer, V[m_sel], best_v_pen, (V, varselectorfit)
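# The feature weight V above collapses the fitted (n_tasks, n_features)
# coefficient matrix to one nonnegative weight per feature via the
# column-wise L2 norm across tasks; a feature is kept iff some task uses it.
# A minimal sketch of that reduction:
import numpy as np

coef = np.array([[0.0, 1.0, -2.0],
                 [0.0, 3.0,  0.0]])           # 2 tasks x 3 features
V = np.sqrt(np.sum(np.square(coef), axis=0))  # per-feature weights
print(V)        # [0.         3.16227766 2.        ]
print(V != 0)   # selection mask: [False  True  True]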
def fit(self, df_X, df_y):
    logger.info("Fitting MultiTaskLasso")
    if not df_y.shape[0] == df_X.shape[0]:
        raise ValueError("number of regions is not equal")
    if self.scale:
        logger.debug("Scaling motif scores")
        # Scale motif scores
        df_X.loc[:, :] = scale(df_X, axis=0)

    # logger.debug("Scaling y")
    # Normalize across samples and features
    # y = df_y.apply(scale, 1).apply(scale, 0)
    y = df_y
    X = df_X.loc[y.index]

    model = Pipeline([
        ("scale", StandardScaler()),
        ("reg", MultiTaskLassoCV(fit_intercept=False, n_alphas=20,
                                 n_jobs=self.ncpus)),
    ])
    logger.debug("Fitting model")
    model.fit(df_X, df_y)
    logger.info("Done")
    self.act_ = pd.DataFrame(model.steps[1][1].coef_,
                             index=y.columns, columns=X.columns).T
def _MTLassoMixed_MatchSpace(X, Y, fit_model_wrapper, v_pens=None, n_v_cv=5, **kwargs):  # pylint: disable=missing-param-doc, unused-argument
    # Note that MultiTaskLasso(CV).path with the same alpha doesn't produce
    # the same results as MultiTaskLasso(CV)
    mtlasso_cv_fit = MultiTaskLassoCV(normalize=True, cv=n_v_cv,
                                      alphas=v_pens).fit(X, Y)
    # V_cv = np.sqrt(np.sum(np.square(mtlasso_cv_fit.coef_), axis=0))  # n_tasks x n_features -> n_features
    # v_pen_cv = mtlasso_cv_fit.alpha_
    # m_sel_cv = (V_cv != 0)
    # sc_fit_cv = fit_model_wrapper(SelMatchSpace(m_sel_cv), V_cv[m_sel_cv])

    v_pens = mtlasso_cv_fit.alphas_
    # fits_single = {}
    Vs_single = {}
    scores = np.zeros((len(v_pens)))
    # R2s = np.zeros((len(v_pens)))
    for i, v_pen in enumerate(v_pens):
        mtlasso_i_fit = MultiTaskLasso(alpha=v_pen, normalize=True).fit(X, Y)
        V_i = np.sqrt(np.sum(np.square(mtlasso_i_fit.coef_), axis=0))
        m_sel_i = (V_i != 0)
        sc_fit_i = fit_model_wrapper(SelMatchSpace(m_sel_i), V_i[m_sel_i])
        # fits_single[i] = sc_fit_i
        Vs_single[i] = V_i
        scores[i] = sc_fit_i.score
        # R2s[i] = sc_fit_i.score_R2

    i_best = np.argmin(scores)
    # v_pen_best = v_pens[i_best]
    # i_cv = np.where(v_pens == v_pen_cv)[0][0]
    # print("CV alpha: " + str(v_pen_cv) + " (" + str(R2s[i_cv]) + ")." +
    #       " Best alpha: " + str(v_pen_best) + " (" + str(R2s[i_best]) + ").")
    best_v_pen = v_pens[i_best]
    V_best = Vs_single[i_best]
    m_sel_best = (V_best != 0)
    return SelMatchSpace(m_sel_best), V_best[m_sel_best], best_v_pen, V_best
def initialize(self, experiences=[]):
    scaler = StandardScaler()
    # Define the value-function estimator
    if self.estimate_func == "Linear":
        estimator = LinearRegression()
    elif self.estimate_func == "NN":
        estimator = MLPRegressor(hidden_layer_sizes=(10, 10), max_iter=1)
    elif self.estimate_func == "Ridge":
        estimator = Ridge(alpha=self.alpha)
    elif self.estimate_func == "Ridge(withoutIntercept)":
        estimator = Ridge(alpha=self.alpha, fit_intercept=False)
    elif self.estimate_func == "Lasso":
        estimator = Lasso(alpha=self.alpha)
    elif self.estimate_func == "RidgeCV":
        estimator = RidgeCV(alphas=10**np.arange(-6, 1, 0.1), cv=5)
    elif self.estimate_func == "LassoCV":
        estimator = MultiTaskLassoCV(alphas=10**np.arange(-6, 1, 0.1), cv=5)
    self.model = Pipeline([("scaler", scaler), ("estimator", estimator)])

    states = np.vstack([e.s for e in experiences])
    # self.my_logger.write(states)
    self.model.named_steps["scaler"].fit(states)

    # Avoid predicting before fit.
    self.update([experiences[0]], gamma=0)
    self.initialized = True
def test_uniform_targets():
    enet = ElasticNetCV(n_alphas=3)
    m_enet = MultiTaskElasticNetCV(n_alphas=3)
    lasso = LassoCV(n_alphas=3)
    m_lasso = MultiTaskLassoCV(n_alphas=3)
    models_single_task = (enet, lasso)
    models_multi_task = (m_enet, m_lasso)

    rng = np.random.RandomState(0)
    X_train = rng.random_sample(size=(10, 3))
    X_test = rng.random_sample(size=(10, 3))
    y1 = np.empty(10)
    y2 = np.empty((10, 2))

    for model in models_single_task:
        for y_values in (0, 5):
            y1.fill(y_values)
            assert_array_equal(model.fit(X_train, y1).predict(X_test), y1)
            assert_array_equal(model.alphas_, [np.finfo(float).resolution] * 3)

    for model in models_multi_task:
        for y_values in (0, 5):
            y2[:, 0].fill(y_values)
            y2[:, 1].fill(2 * y_values)
            assert_array_equal(model.fit(X_train, y2).predict(X_test), y2)
            assert_array_equal(model.alphas_, [np.finfo(float).resolution] * 3)
def Lasso(train, test):
    # Select features using Lasso (L1) regularisation
    print(len(train.Events[0]))
    sfm = SelectFromModel(MultiTaskLassoCV())
    sfm.fit(train.Events[:270010], test.Events)
    trainE = sfm.transform(train.Events[:270010])
    testE = sfm.transform(test.Events)
    print(len(testE[0]))
def test_multi_task_lasso_cv_dtype():
    n_samples, n_features = 10, 3
    rng = np.random.RandomState(42)
    X = rng.binomial(1, .5, size=(n_samples, n_features))
    X = X.astype(int)  # make it explicit that X is int
    y = X[:, [0, 0]].copy()
    est = MultiTaskLassoCV(n_alphas=5, fit_intercept=True).fit(X, y)
    assert_array_almost_equal(est.coef_, [[1, 0, 0]] * 2, decimal=3)
def _MTLassoCV_MatchSpace(X, Y, v_pens=None, n_v_cv=5, **kwargs):  # pylint: disable=missing-param-doc, unused-argument
    # A fake MT would do Lasso on y_mean = Y.mean(axis=1)
    varselectorfit = MultiTaskLassoCV(normalize=True, cv=n_v_cv,
                                      alphas=v_pens).fit(X, Y)
    V = np.sqrt(np.sum(np.square(varselectorfit.coef_), axis=0))  # n_tasks x n_features -> n_features
    best_v_pen = varselectorfit.alpha_
    m_sel = (V != 0)
    transformer = SelMatchSpace(m_sel)
    return transformer, V[m_sel], best_v_pen, V
def select_mtlasso(self, X, y):
    mtlasso_alphas = MultiTaskLassoCV(alphas=[
        0.00001, .0001, .001, .002, .003, .004, .005, .006, .007, .008, .009,
        .099, .01, .011, .012, .013, .014, .015, .016, .017, .018, .019, .02,
        .025, .03, .035, .036, .037, .038, .039, .04, .041, .042, .043, .044,
        .045, .05, .06, .075, .1, .2, .225, .23, .24, .245, .246, .247, .248,
        .249, .25, .251, .252, .253, .254, .255, .26, .27, .275, .3, .35, .4,
        .45, .46, .47, .48, .481, .482, .483, .484, .485, .486, .487, .488,
        .489, .49, .491, .492, .493, .494, .495, .496, .497, .498, .499, .5,
        .51, .511, .512, .513, .514, .515, .516, .517, .518, .519, .52, .525,
        .53, .54, .55, .6, .75, .752, .7527, .7528, .7529, .753, .7531, .754,
        .7545, .755, .756, .76, .765, .77, .78, .79, .8, .9, 1.0, 1.2, 1.25,
        1.5, 1.75, 2.0
    ])
    sel_alpha = mtlasso_alphas.fit(X, y)
    # the original also had a bare `sel_alpha.alpha_` expression, a no-op
    print(sel_alpha.alpha_)
def test_model_multi_task_lasso_cv(self):
    model, X = fit_regression_model(MultiTaskLassoCV(), n_targets=2)
    model_onnx = convert_sklearn(
        model, "multi-task lasso cv",
        [("input", FloatTensorType([None, X.shape[1]]))])
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(X, model, model_onnx, verbose=False,
                        basename="SklearnMultiTaskLassoCV-Dec4")
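# A minimal sketch of the same conversion outside the test harness, assuming
# skl2onnx is installed (fit_regression_model and dump_data_and_model are
# test-suite helpers and are replaced here with plain scikit-learn calls):
import numpy as np
from sklearn.linear_model import MultiTaskLassoCV
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

rng = np.random.RandomState(0)
X_onx = rng.randn(40, 3).astype(np.float32)
Y_onx = rng.randn(40, 2).astype(np.float32)
reg = MultiTaskLassoCV(cv=3).fit(X_onx, Y_onx)
onx = convert_sklearn(reg, "multi-task lasso cv",
                      [("input", FloatTensorType([None, X_onx.shape[1]]))])
assert onx is not None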
def compare_to_lasso_analysis(adata, ccdtranscript):
    '''Perform a comparison of pseudotime analysis to LASSO analysis for finding CCD proteins'''
    prevPlotSize = plt.rcParams['figure.figsize']
    plt.rcParams['figure.figsize'] = (6, 5)

    print("ANALYZING SC-RNA-SEQ WITH LASSO")
    warnings.filterwarnings("ignore")
    fucci_rna_data = [(adata.obs["Red585"][ii], adata.obs["Green530"][ii])
                      for ii in np.arange(len(adata.obs))]
    imputer = KNNImputer(missing_values=0)
    expression = imputer.fit_transform(adata.X)
    fucci_rna_path = "output/pickles/fucci_rna_imputed_lasso.pkl"
    if os.path.exists(fucci_rna_path):
        fucci_rna = np.load(open(fucci_rna_path, 'rb'), allow_pickle=True)
    else:
        fucci_rna = MultiTaskLassoCV()
        fucci_rna.fit(expression, fucci_rna_data)
        pickle.dump(fucci_rna, open(fucci_rna_path, 'wb'))
    nz_coef = np.sum(fucci_rna.coef_, axis=0) != 0
    print(f"{sum(nz_coef)}: number of nonzero lasso coefficients")
    print(f"{adata.var_names[nz_coef]}: genes with nonzero lasso coeff")
    print(f"{sum(ccdtranscript[nz_coef]) / sum(nz_coef)}: % nonzero lasso found as CCD transcripts")
    print(f"{np.sum(fucci_rna.coef_, axis=0)[nz_coef]}: coefficients for nonzero lasso coeff")

    # Generate UMAPs for the CCD and non-CCD genes from the LASSO model
    adataCCd = adata[:, nz_coef]
    sc.pp.neighbors(adataCCd, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adataCCd)
    sc.pl.umap(adataCCd, color="fucci_time", show=False, save=True)
    shutil.move("figures/umap.pdf", "figures/umapRNALassoCCD.pdf")
    adataNonCCd = adata[:, ~nz_coef]
    sc.pp.neighbors(adataNonCCd, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adataNonCCd)
    sc.pl.umap(adataNonCCd, color="fucci_time", show=False, save=True)
    shutil.move("figures/umap.pdf", "figures/umapRNALassoNonCCD.pdf")

    plt.rcParams['figure.figsize'] = prevPlotSize
    warnings.filterwarnings("default")
def fit_lasso(X, flavors):
    # derive the flavor profiles by fitting the LASSO
    flavors[flavors == 0] = 0.01  # logit(0) and logit(1) are not finite
    flavors[flavors == 1] = 0.99
    y = logit(flavors)
    idx = np.all(np.isfinite(y), axis=1)
    print('Performing multi-task LASSO...')
    lasso = MultiTaskLassoCV(cv=7, n_jobs=7, fit_intercept=False,
                             verbose=1).fit(X[idx], y[idx])
    weights = inv_logit(lasso.coef_.T)  # transform back to the 0-1 scale
    return weights
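# The clamping above keeps the transform finite at the boundaries, since
# logit(0) = -inf and logit(1) = +inf. A quick check of the transform pair,
# assuming logit/inv_logit correspond to scipy.special's logit/expit:
from scipy.special import expit, logit

print(logit(0.01), logit(0.99))  # finite after clamping to [0.01, 0.99]
print(expit(logit(0.3)))         # round-trips: 0.3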
def fit(self, X, Y):
    assert shape(X)[0] == shape(Y)[0]
    assert ndim(Y) <= 2
    self.needs_unravel = False
    if ndim(Y) == 2 and shape(Y)[1] > 1:
        self.model = MultiTaskLassoCV(*self.args, **self.kwargs)
    else:
        if ndim(Y) == 2 and shape(Y)[1] == 1:
            Y = np.ravel(Y)
            self.needs_unravel = True
        self.model = LassoCV(*self.args, **self.kwargs)
    self.model.fit(X, Y)
    return self
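# A standalone sketch of the shape-based dispatch used in fit() above: a
# single-column Y is raveled and handled by LassoCV, while a genuinely
# multi-column Y gets MultiTaskLassoCV (the function name is illustrative):
import numpy as np
from sklearn.linear_model import LassoCV, MultiTaskLassoCV

def pick_cv_lasso(Y):
    if np.ndim(Y) == 2 and np.shape(Y)[1] > 1:
        return MultiTaskLassoCV(cv=3)
    return LassoCV(cv=3)

rng = np.random.RandomState(0)
print(type(pick_cv_lasso(rng.randn(40, 2))).__name__)  # MultiTaskLassoCV
print(type(pick_cv_lasso(rng.randn(40))).__name__)     # LassoCV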
def pls_screen_as726x(x, y, n_comps=8):
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
    pls = PLSRegression(n_components=n_comps)
    lasso = MultiTaskLassoCV(max_iter=40000)
    regr = make_pipeline(PolynomialFeatures(), pls)
    # regr = make_pipeline(PolynomialFeatures(), lasso)
    plot_learning_curve(regr, "Learning Curve", x, y, ax=ax2)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=42)
    print('1')
    plt.show()
def fit_force_params(self, alpha=None):
    """
    Fit a sparse linear regression on the remaining n_variables - q variables.
    alpha is the penalization parameter; None triggers cross-validation.
    """
    if alpha is None:
        # do cross validation
        self.force_model = MultiTaskLassoCV(eps=1e-3, n_alphas=50, cv=10,
                                            n_jobs=-1, fit_intercept=False,
                                            normalize=False)
    else:
        self.force_model = MultiTaskLasso(alpha=alpha, fit_intercept=False,
                                          normalize=False)
    self.force_model.fit(self.features_forcing[self.mask_f], self.eps)
def fit_lin_model(self, alpha=None):
    """
    Fit a sparse linear regression on the first q variables.
    alpha is the penalization parameter; None triggers cross-validation.
    """
    if alpha is None:
        # do cross validation
        self.lin_model = MultiTaskLassoCV(eps=1e-3, n_alphas=50, cv=10,
                                          n_jobs=-1, fit_intercept=False,
                                          normalize=False, max_iter=3500)
    else:
        self.lin_model = MultiTaskLasso(alpha=alpha, fit_intercept=False,
                                        normalize=False)
    self.lin_model.fit(self.features_lin_model[self.mask_l_m],
                       self.delta_v[self.mask_l_m])
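# In both fits above, eps and n_alphas define the CV grid: scikit-learn
# log-spaces n_alphas values between a data-derived alpha_max and
# eps * alpha_max, so the grid spans three decades at eps=1e-3. A minimal
# sketch inspecting the grid actually used (normalize is omitted here, as it
# was removed from recent scikit-learn releases):
import numpy as np
from sklearn.linear_model import MultiTaskLassoCV

rng = np.random.RandomState(0)
X_g, Y_g = rng.randn(50, 6), rng.randn(50, 2)
cv_fit = MultiTaskLassoCV(eps=1e-3, n_alphas=50, cv=10,
                          fit_intercept=False).fit(X_g, Y_g)
print(cv_fit.alphas_.max() / cv_fit.alphas_.min())  # ~1/eps = 1000
print(cv_fit.alpha_)                                # penalty chosen by CV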
def _D_LassoCV_MatchSpace(X, Y, X_full, D_full, v_pens=None, n_v_cv=5,
                          sample_frac=1, y_V_share=0.5, **kwargs):  # pylint: disable=missing-param-doc, unused-argument
    if sample_frac < 1:
        N_y = X.shape[0]
        sample_y = np.random.choice(N_y, int(sample_frac * N_y), replace=False)
        X = X[sample_y, :]
        Y = Y[sample_y, :]
        N_d = D_full.shape[0]
        sample_d = np.random.choice(N_d, int(sample_frac * N_d), replace=False)
        X_full = X_full[sample_d, :]
        D_full = D_full[sample_d]
    y_varselectorfit = MultiTaskLassoCV(normalize=True, cv=n_v_cv,
                                        alphas=v_pens).fit(X, Y)
    y_V = np.sqrt(np.sum(np.square(y_varselectorfit.coef_), axis=0))  # n_tasks x n_features -> n_features
    best_y_v_pen = y_varselectorfit.alpha_

    d_varselectorfit = LassoCV(normalize=True, cv=n_v_cv,
                               alphas=v_pens).fit(X_full, D_full)
    d_V = np.abs(d_varselectorfit.coef_)
    best_d_v_pen = d_varselectorfit.alpha_

    m_sel = (y_V + d_V) != 0
    transformer = SelMatchSpace(m_sel)
    if y_V.sum() == 0:
        V = d_V
    elif d_V.sum() == 0:
        V = y_V
    else:
        V = y_V_share * y_V / y_V.sum() + (1 - y_V_share) * d_V / (2 * d_V.sum())
    return transformer, V[m_sel], (best_y_v_pen, best_d_v_pen), V
def _compare_with_lasso_cv(self, lasso_X, lasso_y, wlasso_X, wlasso_y,
                           sample_weight, alphas, lasso_cv=3, wlasso_cv=3,
                           params={}, tol=1e-8):
    # Check if multitask
    if np.ndim(lasso_y) > 1:
        lassoCV = MultiTaskLassoCV(alphas=alphas, cv=lasso_cv)
        wlassoCV = WeightedMultiTaskLassoCV(alphas=alphas, cv=wlasso_cv)
    else:
        lassoCV = LassoCV(alphas=alphas, cv=lasso_cv)
        wlassoCV = WeightedLassoCV(alphas=alphas, cv=wlasso_cv)
    lassoCV.set_params(**params)
    lassoCV.fit(lasso_X, lasso_y)
    wlassoCV.set_params(**params)
    wlassoCV.fit(wlasso_X, wlasso_y, sample_weight)
    # Check that the same alpha is chosen
    self.assertEqual(lassoCV.alpha_, wlassoCV.alpha_)
    # Check that the coefficients are similar
    if np.ndim(lasso_y) > 1:
        for i in range(lasso_y.shape[1]):
            np.testing.assert_allclose(lassoCV.coef_[i], wlassoCV.coef_[i],
                                       atol=tol)
            if lassoCV.get_params()["fit_intercept"]:
                self.assertAlmostEqual(lassoCV.intercept_[i],
                                       wlassoCV.intercept_[i])
    else:
        np.testing.assert_allclose(lassoCV.coef_, wlassoCV.coef_, atol=tol)
        self.assertAlmostEqual(lassoCV.intercept_, wlassoCV.intercept_)
def lassoCV(self, name):
    '''Lasso Regression'''
    sciLasso = MultiTaskLassoCV(fit_intercept=True, normalize=False,
                                cv=12, tol=0.001)
    sciLasso.fit(self.X_train, self.Y_train)
    predict_test = sciLasso.predict(self.X_test)
    MSE = mean_squared_error(predict_test, self.Y_test)
    print("Sci LassoCV (MSE: %f)" % MSE)
    # print(sciLasso.score(self.X_test, self.Y_test))
    print(sciLasso.coef_)
    print(np.nonzero(sciLasso.coef_))
    predict_final = sciLasso.predict(self.X_final)
    genCSV(name + '_MSE' + str(MSE), self.index_final, predict_final)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

## PCA and Feature Selection
pca = PCA(n_components=800)
selection = SelectKBest(k=850)
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
combined_features.fit(X_train_scaled, train_labels.ravel())
# print(pca.explained_variance_ratio_)
X_train_reduced = combined_features.transform(X_train_scaled)
X_test_reduced = combined_features.transform(X_test_scaled)

## Lasso CV for parameter optimization
t1 = time.time()
alps = np.linspace(.1, .625, 15)
model = MultiTaskLassoCV(cv=5, alphas=alps).fit(X_train_reduced, Y_train_raw)
t_lasso_cv = time.time() - t1
print('time to train', t_lasso_cv)

# Display results
m_log_alphas = -np.log10(model.alphas_)
plt.figure()
plt.plot(m_log_alphas, model.mse_path_, ':')
plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha: CV estimate')
plt.legend()
print("Getting data") path_train = 'data_train.txt' path_test = 'data_test.txt' X, Y = get_data_own(path_train) print(X.shape) print(Y.shape) print("Split data for CV") X_train, X_test , y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1) lasso = MultiTaskLasso(max_iter = max_iter, normalize = True) print("Init train with multitasklassocv") lassocv = MultiTaskLassoCV(alphas=None, cv=10, max_iter=max_iter, verbose=True, normalize=True) lassocv.fit(X_train, y_train) print("Fit multitasklasso with alpha from cv lasso") lasso.set_params(alpha=lassocv.alpha_) lasso.fit(X_train, y_train) print("get mean square error") mae = mean_absolute_error(y_test, lasso.predict(X_test)) print("mae: {}".format(mae)) rmsle = mean_squared_log_error(y_test, lasso.predict(X_test)) print("rmsle: {}".format(rmsle)) mape = mean_absolute_percentage_error(y_test, lasso.predict(X_test)) print("mape: {}".format(mape))
def GetAllModelsForComparison(X_train, Y_train):
    # NB: the original listed several keys more than once ('BaseEstimator',
    # 'ClassifierMixin', 'SGDClassifier', ...); duplicate dict keys collapse
    # silently, so each is listed once here.
    models = {
        'ARDRegression': ARDRegression(), 'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(), 'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),  # 'Huber': Huber(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(), 'LarsCV': LarsCV(),
        'Lasso': Lasso(), 'LassoCV': LassoCV(),
        'LassoLars': LassoLars(), 'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(), 'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(), 'RANSACRegressor': RANSACRegressor(),
        # 'RandomizedLasso': RandomizedLasso(),
        # 'RandomizedLogisticRegression': RandomizedLogisticRegression(),
        'Ridge': Ridge(), 'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(), 'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(), 'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(), 'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'KernelRidge': KernelRidge(), 'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(), 'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(), 'NuSVR': NuSVR(), 'OneClassSVM': OneClassSVM(),
        'SVC': SVC(), 'SVR': SVR(),
        # 'BallTree': BallTree(), 'DistanceMetric': DistanceMetric(),
        # 'KDTree': KDTree(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),  # 'LSHForest': LSHForest(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        # 'GaussianProcess': GaussianProcess(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(), 'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(), 'PLSSVD': PLSSVD(),
        # 'ABCMeta': ABCMeta(), 'BaseDiscreteNB': BaseDiscreteNB(),
        # 'BaseNB': BaseNB(),
        'BernoulliNB': BernoulliNB(), 'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(), 'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        # 'BaseEnsemble': BaseEnsemble(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        # 'VotingClassifier': VotingClassifier(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        # 'OneVsOneClassifier': OneVsOneClassifier(),
        # 'OneVsRestClassifier': OneVsRestClassifier(),
        # 'OutputCodeClassifier': OutputCodeClassifier(),
        'Parallel': Parallel(),
        # 'ClassifierChain': ClassifierChain(),
        # 'MultiOutputClassifier': MultiOutputClassifier(),
        # 'MultiOutputEstimator': MultiOutputEstimator(),
        # 'MultiOutputRegressor': MultiOutputRegressor(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'IsotonicRegression': IsotonicRegression(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(), 'MLPRegressor': MLPRegressor()
    }
    return models
    SVC(kernel='poly', probability=True, degree=4),
    SVC(kernel='poly', probability=True, degree=5),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    ElasticNetCV(max_iter=10000),
    LarsCV(),
    LassoCV(max_iter=10000),
    LassoLarsCV(),
    LogisticRegressionCV(scoring=multi_class_log_loss),
    MultiTaskElasticNetCV(),
    MultiTaskLassoCV(),
    OrthogonalMatchingPursuitCV(),
    RidgeClassifierCV()
]

algorithm = 17
if len(sys.argv) > 1:
    algorithm = int(sys.argv[1])

name = names[algorithm]
clf = classifiers[algorithm]
output_file_name = output_file_names[algorithm] + file_identifier

t = time.time()
random_state = np.random.RandomState(0)

print("Fitting classifier " + name)
classifier = OneVsRestClassifier(clf, n_jobs=2)
# print('## Logistic Regression Results ##')
# logreg = LogisticRegression(penalty='l2')
# logreg.fit(X_train, y_train)
# y_pred_logreg = logreg.predict(X_test)
# print('R2: ', r2_score(y_test, y_pred_logreg))
# print('MAE: ', metrics.mean_absolute_error(y_test, y_pred_logreg))
# print('MSE: ', metrics.mean_squared_error(y_test, y_pred_logreg))
# print('RMSE: ', np.sqrt(np.absolute(metrics.mean_squared_error(y_test, y_pred_logreg))))
# print('variance score:', explained_variance_score(y_test, y_pred_logreg, multioutput='uniform_average'))

# -----------------------------------------------------------------------------
# Method 3: MultiTaskLassoCV regression with 10-fold CV
# -----------------------------------------------------------------------------
print(' ')
print('## 2. Lasso Regression Results ##')
lasso = MultiTaskLassoCV(cv=10, eps=0.01, max_iter=1000)
t = time.time()
lasso.fit(X_train, y_train)
t_lasso = time.time() - t
y_pred_lasso = lasso.predict(X_test)
print('R2: ', r2_score(y_test, y_pred_lasso))
print('MAE: ', metrics.mean_absolute_error(y_test, y_pred_lasso))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred_lasso))
print('RMSE: ', np.sqrt(np.absolute(metrics.mean_squared_error(y_test, y_pred_lasso))))
print('variance score: ',
      explained_variance_score(y_test, y_pred_lasso, multioutput='uniform_average'))
print('training time: ', t_lasso)
    Xl_tr[:, k] = (Xl_tr[:, k] - mea_l[k]) / sig_l[k]

mea_h = np.zeros(Dh)
sig_h = np.zeros(Dh)
for k in range(Dh):
    mea_h[k] = np.mean(Xh_tr[:, k])
    sig_h[k] = np.std(Xh_tr[:, k])
    Xh_tr[:, k] = (Xh_tr[:, k] - mea_h[k]) / sig_h[k]

############## LassoCV ########################################################
from sklearn.linear_model import MultiTaskLassoCV

n_alphas = 5
alphas = np.logspace(-10, 0, n_alphas)
lasso = MultiTaskLassoCV(alphas=alphas, cv=5, fit_intercept=False,
                         normalize=False, n_jobs=3)
lasso.fit(Xl_tr, Xh_tr)
Lasso_lambda_opt = lasso.alpha_
print('\n Optimal lambda:', Lasso_lambda_opt)

############ Validation curve #################################################
"""
# validation curve
from sklearn.linear_model import Lasso
from sklearn.learning_curve import validation_curve
lambdas_range = np.append(0, np.logspace(0, 6, 28))
train_MSE, test_MSE = validation_curve(Lasso(), Xl_tr, Xh_tr,
                                       param_name="alpha",
                                       param_range=lambdas_range,
                                       scoring="mean_squared_error", cv=10)
"""