def findLassoAlpha(alpha, y, X, returnPred=False):
    """Walk-forward evaluation of a MultiTaskLasso for one penalty value.

    For each date in the hold-out window ('2015-05-01'..'2016-04-01') a fresh
    MultiTaskLasso is fit on all data from '2013-10-01' up to the previously
    predicted date, then used to predict the current date.

    Parameters
    ----------
    alpha : float
        L1/L2 penalty for MultiTaskLasso.
    y, X : pandas.DataFrame
        Date-indexed targets and features.
    returnPred : bool, default False
        If True return ``(y_test, prediction)``; otherwise return the MSE.
    """
    X_test = X.loc['2015-05-01':'2016-04-01']
    y_test = y.loc['2015-05-01':'2016-04-01']
    datestotest = y_test.index

    # Training window initially ends just before the test window; it is
    # expanded through each test date after that date has been predicted.
    X_train = X.loc['2013-10-01':'2015-04-01']
    y_train = y.loc['2013-10-01':'2015-04-01']

    # Collect per-date predictions and concatenate once at the end instead of
    # re-concatenating inside the loop (the original also duplicated the whole
    # first-iteration body before the loop).
    preds = []
    for dt in datestotest:
        model = MultiTaskLasso(alpha=alpha, max_iter=1e5)
        model.fit(X_train, y_train)
        # .values.reshape: Series.reshape was deprecated/removed in pandas.
        y_pred = model.predict(X_test.loc[dt].values.reshape(1, -1))
        preds.append(pd.DataFrame(y_pred, columns=y.columns))
        # Grow the training window through the date just predicted.
        X_train = X.loc['2013-10-01':dt]
        y_train = y.loc['2013-10-01':dt]

    prediction = pd.concat(preds)
    prediction.index = y_test.index
    if returnPred:
        return (y_test, prediction)
    return mean_squared_error(y_test, prediction)
def test_dml(self):
    """End-to-end pandas-integration checks for the DML estimator family."""
    # --- single treatment / single outcome ---
    X = TestPandasIntegration.df[TestPandasIntegration.features]
    W = TestPandasIntegration.df[TestPandasIntegration.controls]
    Y = TestPandasIntegration.df[TestPandasIntegration.outcome]
    T = TestPandasIntegration.df[TestPandasIntegration.cont_treat]
    # LinearDML
    est = LinearDML(model_y=LassoCV(), model_t=LassoCV())
    est.fit(Y, T, X=X, W=W, inference='statsmodels')
    effects = est.effect(X)
    lower, upper = est.effect_interval(X, alpha=0.05)
    # column names should propagate into the summary
    self._check_input_names(est.summary())
    # refit with renamed feature columns and check propagation again
    X1 = X.rename(columns={c: "{}_1".format(c) for c in X.columns})
    est.fit(Y, T, X=X1, W=W, inference='statsmodels')
    self._check_input_names(est.summary(), feat_comp=X1.columns)
    # SparseLinearDML
    est = SparseLinearDML(model_y=LassoCV(), model_t=LassoCV())
    est.fit(Y, T, X=X, W=W, inference='debiasedlasso')
    effects = est.effect(X)
    lower, upper = est.effect_interval(X, alpha=0.05)
    self._check_input_names(est.summary())
    # ForestDML
    est = ForestDML(model_y=GradientBoostingRegressor(),
                    model_t=GradientBoostingRegressor())
    est.fit(Y, T, X=X, W=W, inference='blb')
    effects = est.effect(X)
    lower, upper = est.effect_interval(X, alpha=0.05)
    # --- multiple treatments / outcomes ---
    Y = TestPandasIntegration.df[TestPandasIntegration.outcome_multi]
    T = TestPandasIntegration.df[TestPandasIntegration.cont_treat_multi]
    # LinearDML with multi-task nuisance models
    est = LinearDML(model_y=MultiTaskLasso(), model_t=MultiTaskLasso())
    est.fit(Y, T, X=X, W=W, inference='statsmodels')
    self._check_input_names(est.summary(), True, True)
    self._check_popsum_names(
        est.effect_inference(X).population_summary(), True)
    est.fit(Y, T, X=X, W=W, inference='bootstrap')  # bootstrap inference too
    self._check_input_names(est.summary(), True, True)
    self._check_popsum_names(
        est.effect_inference(X).population_summary(), True)
    # SparseLinearDML, multi-task
    est = SparseLinearDML(model_y=MultiTaskLasso(), model_t=MultiTaskLasso())
    est.fit(Y, T, X=X, W=W, inference='debiasedlasso')
    effects = est.effect(X)
    lower, upper = est.effect_interval(X, alpha=0.05)
    self._check_input_names(est.summary(), True, True)
    self._check_popsum_names(
        est.effect_inference(X).population_summary(), True)
def test_warm_start_multitask_lasso():
    """Two warm-started 5-iteration fits must equal one 10-iteration fit."""
    X, y, _, _ = build_dataset()
    targets = np.c_[y, y]
    warm = MultiTaskLasso(alpha=0.1, max_iter=5, warm_start=True)
    ignore_warnings(warm.fit)(X, targets)
    ignore_warnings(warm.fit)(X, targets)  # second round: 5 more iterations
    cold = MultiTaskLasso(alpha=0.1, max_iter=10)
    ignore_warnings(cold.fit)(X, targets)
    assert_array_almost_equal(cold.coef_, warm.coef_)
def main():
    """Load a pickled QM dataset, fit a MultiTaskLasso and print per-target MAE.

    Expects the pickle file name as the first command-line argument.
    Fixed: the original used Python 2 ``print`` statements and ``xrange``,
    which are syntax errors under Python 3.
    """
    pickledname = sys.argv[1]
    _qmDL = qmDL()
    dataset = _qmDL.load(pickledname=pickledname)
    X, Y, labels = dataset['XX'], dataset['T'], dataset['names']
    # 5000 training samples, with 2211 test samples
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=2211, random_state=42)
    print('Len X train , test:', len(X_train), len(X_test))
    regressor = MultiTaskLasso().fit(X_train, Y_train)
    Y_pred = regressor.predict(X_test)
    print(Y_pred)
    print('Y_pred', Y_pred.shape)
    for i in range(len(labels)):
        print('*** MAE ', labels[i],
              mean_absolute_error(Y_test[:, i], Y_pred[:, i]))
def get_signature_genes(X, n, lda=10):
    """Select roughly ``n`` signature genes by sparse self-regression.

    X (genes x samples) is regressed onto itself with a multi-task lasso; a
    gene is "selected" when its row of coefficients is not all zero. The
    penalty ``lda`` is relaxed (/10) until at least ``n`` genes survive, then
    tightened again coarsely (x2 down to 1.2*n) and finely (x1.1 down to n).
    Returns the indices of the selected genes.
    """
    n_genes = X.shape[0]
    W = np.zeros((n_genes, n_genes))

    def n_selected(coefs):
        # genes whose coefficient row has any non-zero entry
        return (abs(coefs).sum(1) > 0).sum()

    model = None
    # coarse search from the bottom: weaken the penalty until enough genes enter
    while n_selected(W) < n:
        lda /= 10.
        model = MultiTaskLasso(alpha=lda, max_iter=100, tol=.001,
                               selection='random', warm_start=True)
        model.fit(X.T, X.T)
        W = model.coef_.T
    # fine search from the top: double the penalty while > 20% over target
    while n_selected(W) > n * 1.2:
        lda *= 2.
        model.set_params(alpha=lda)
        model.fit(X.T, X.T)
        W = model.coef_.T
    # finer search: 10% steps down to the target
    while n_selected(W) > n:
        lda *= 1.1
        model.set_params(alpha=lda)
        model.fit(X.T, X.T)
        W = model.coef_.T
    return np.nonzero(abs(W).sum(1))[0]
def _MTLassoMixed_MatchSpace(X, Y, fit_model_wrapper, v_pens=None, n_v_cv=5, **kwargs):  # pylint: disable=missing-param-doc, unused-argument
    """Pick a feature-selection penalty by downstream model score.

    Runs MultiTaskLassoCV once to get a grid of alphas, then refits a plain
    MultiTaskLasso at every alpha and scores the resulting match space via
    ``fit_model_wrapper``, keeping the alpha with the lowest score.
    Note that MultiTaskLasso(CV).path with the same alpha doesn't produce the
    same results as MultiTaskLasso(CV), hence the per-alpha refits.
    """
    cv_fit = MultiTaskLassoCV(normalize=True, cv=n_v_cv, alphas=v_pens).fit(X, Y)
    v_pens = cv_fit.alphas_
    Vs_single = {}
    scores = np.zeros(len(v_pens))
    for idx, v_pen in enumerate(v_pens):
        single_fit = MultiTaskLasso(alpha=v_pen, normalize=True).fit(X, Y)
        # collapse n_tasks x n_features coefficients to one weight per feature
        V_idx = np.sqrt(np.sum(np.square(single_fit.coef_), axis=0))
        sel_idx = (V_idx != 0)
        fitted = fit_model_wrapper(SelMatchSpace(sel_idx), V_idx[sel_idx])
        Vs_single[idx] = V_idx
        scores[idx] = fitted.score
    i_best = np.argmin(scores)
    best_v_pen = v_pens[i_best]
    V_best = Vs_single[i_best]
    m_sel_best = (V_best != 0)
    return SelMatchSpace(m_sel_best), V_best[m_sel_best], best_v_pen, V_best
def _MTLassoCV_MatchSpace(X, Y, v_pens=None, n_v_cv=5, sample_frac=1, Y_col_block_size=None, se_factor=None, normalize=True, **kwargs):  # pylint: disable=missing-param-doc, unused-argument
    """Feature selection via cross-validated multi-task lasso.

    (A fake MT would do Lasso on y_mean = Y.mean(axis=1).)
    Optionally subsamples rows and/or block-summarizes Y columns first, picks
    the penalty by CV (or a one-SE-style rule when ``se_factor`` is given),
    refits at that penalty, and returns the selected match space.
    """
    if sample_frac < 1:
        # row subsample for speed
        n_rows = X.shape[0]
        keep = np.random.choice(n_rows, int(sample_frac * n_rows), replace=False)
        X = X[keep, :]
        Y = Y[keep, :]
    if Y_col_block_size is not None:
        Y = _block_summ_cols(Y, Y_col_block_size)
    cv_fit = MultiTaskLassoCV(normalize=normalize, cv=n_v_cv,
                              alphas=v_pens).fit(X, Y)
    best_v_pen = cv_fit.alpha_
    if se_factor is not None:
        best_v_pen = _neg_se_rule(cv_fit, factor=se_factor)
    refit = MultiTaskLasso(alpha=best_v_pen, normalize=normalize).fit(X, Y)
    # n_tasks x n_features coefficients -> one weight per feature
    V = np.sqrt(np.sum(np.square(refit.coef_), axis=0))
    m_sel = V != 0
    return SelMatchSpace(m_sel), V[m_sel], best_v_pen, (V, refit)
def asd_multitasklasso():
    """Train an ASDMultitask model on the pickled JJA test data and write
    predictions to a netCDF file.

    Fixed: the pickle file was opened in text mode (``'r'``), which fails on
    Python 3, and the handle was never closed.
    """
    model = MultiTaskLasso()
    f = "/home/vandal.t/repos/pydownscale/pydownscale/test_data/testdata.pkl"
    # pickle files are binary: open with 'rb' and close deterministically
    with open(f, 'rb') as fh:
        data = pickle.load(fh)
    asdm = ASDMultitask(data, model, season='JJA')
    asdm.train()
    out = asdm.predict(test_set=False)
    out.to_netcdf("test_data/mtl_test.nc")
def test_multi_task_lasso_readonly_data():
    """MultiTaskLasso must fit read-only (memmapped) inputs.

    Fixed: the original rebuilt ``Y = np.c_[y, y]`` *inside* the TempMemmap
    context, rebinding Y to a fresh writable in-memory array and so bypassing
    the read-only data the test is meant to exercise.
    """
    X, y, X_test, y_test = build_dataset()
    Y = np.c_[y, y]
    with TempMemmap((X, Y)) as (X, Y):
        # X and Y are now read-only memmaps; fit must not try to write them.
        clf = MultiTaskLasso(alpha=1, tol=1e-8).fit(X, Y)
        assert 0 < clf.dual_gap_ < 1e-5
        assert_array_almost_equal(clf.coef_[0], clf.coef_[1])
def constrained_multiclass_solve(w, psi, alpha=1.0, **lasso_kws):
    """Solve

    .. math::

        \\text{argmin}_s \\|s\\|_0 \\\\
        \\text{subject to} \\|w - \\psi s\\|_2^2 \\leq tol

    via a MultiTaskLasso relaxation; returns the transposed coefficients.
    """
    solver = MultiTaskLasso(alpha=alpha, **lasso_kws)
    solver.fit(psi, w)
    return solver.coef_.T
def test_model_multi_task_lasso(self):
    """ONNX conversion round-trip for a two-target MultiTaskLasso."""
    model, X = fit_regression_model(MultiTaskLasso(), n_targets=2)
    onnx_model = convert_sklearn(
        model,
        "multi-task lasso",
        [("input", FloatTensorType([None, X.shape[1]]))],
    )
    self.assertIsNotNone(onnx_model)
    dump_data_and_model(
        X,
        model,
        onnx_model,
        verbose=False,
        basename="SklearnMultiTaskLasso-Dec4",
    )
def get_hyperparameters_model():
    """Return the model spec for MultiTaskLasso.

    The hyperparameter distribution dict is intentionally empty: no
    parameters are searched for this estimator.
    """
    return {
        'multi_task_lasso': {
            'model': MultiTaskLasso(),
            'param_distributions': {},
        }
    }
def __init__(self, scale=True, kfolds=4, alpha_stepsize=1.0, ncpus=None):
    """Predict motif activities using Lasso MultiTask regression

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled before classification

    kfolds : integer, optional, default 4
        number of kfolds for parameter search
        (docstring fixed: it previously claimed the default was 5, but the
        signature default is 4)

    alpha_stepsize : float, optional, default 1.0
        stepsize for use in alpha gridsearch

    ncpus : int, optional
        Number of threads. Default is the number specified in the config.

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted motif activities

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than the 1%t from
        random permutation
    """
    self.kfolds = kfolds
    self.act_description = "activity values: coefficients from fitted model"
    self.scale = scale
    if ncpus is None:
        ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))
    self.ncpus = ncpus
    # initialize attributes
    self.act_ = None
    self.sig_ = None
    mtk = MultiTaskLasso()
    # alpha grid: e^0 down to e^-9 in steps of alpha_stepsize
    parameters = {
        "alpha": [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]
    }
    self.clf = GridSearchCV(mtk, parameters, cv=kfolds,
                            n_jobs=self.ncpus, scoring="r2")
    self.pref_table = "score"
    self.supported_tables = ["score", "count"]
    self.ptype = "regression"
def make_dictionary(X, n_components=20, alpha=5., write_dir='/tmp/',
                    contrasts=[], method='multitask', l1_ratio=.5,
                    n_subjects=13):
    """Create dictionary + encoding.

    Builds an initial dictionary from X, saves it to ``dictionary.npz``, then
    computes the encoding with one of three methods: 'online' dictionary
    learning, 'sparse' encoding, or per-voxel 'multitask' elastic-net fits.
    Returns (dictionary, components).

    Fixed: the 'multitask' branch created a ``MultiTaskLasso(alpha=alpha)``
    that was immediately overwritten by the elastic net — the dead assignment
    is removed; only MultiTaskElasticNet is fit, as before.
    """
    from sklearn.decomposition import dict_learning_online, sparse_encode
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import MultiTaskLasso, MultiTaskElasticNet
    mem = Memory(write_dir, verbose=0)
    dictionary = mem.cache(initial_dictionary)(n_components, X)
    np.savez(os.path.join(write_dir, 'dictionary.npz'),
             loadings=dictionary, contrasts=contrasts)
    if method == 'online':
        components, dictionary = dict_learning_online(
            X.T, n_components, alpha=alpha, dict_init=dictionary,
            batch_size=200, method='cd', return_code=True, shuffle=True,
            n_jobs=1, positive_code=True)
        np.savez(os.path.join(write_dir, 'dictionary.npz'),
                 loadings=dictionary, contrasts=contrasts)
    elif method == 'sparse':
        components = sparse_encode(
            X.T, dictionary, alpha=alpha, max_iter=10, n_jobs=1,
            check_input=True, verbose=0, positive=True)
    elif method == 'multitask':
        # too many hard-typed parameters !!!
        n_voxels = X.shape[1] // n_subjects
        components = np.zeros((X.shape[1], n_components))
        clf = MultiTaskElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        for i in range(n_voxels):
            # strided view: the same voxel across all subjects
            x = X[:, i:i + n_subjects * n_voxels:n_voxels]
            components[i: i + n_subjects * n_voxels: n_voxels] = \
                clf.fit(dictionary.T, x).coef_
    return dictionary, components
def main():
    """Compare independent Lasso fits with MultiTaskLasso on synthetic data
    whose relevant coefficients are sine waves, and plot both results."""
    rng = np.random.RandomState(42)
    # Generate some 2D coefficients with sine waves with random frequency and phase
    n_samples, n_features, n_tasks = 100, 30, 40
    n_relevant_features = 5
    coef = np.zeros((n_tasks, n_features))
    times = np.linspace(0, 2 * np.pi, n_tasks)
    for k in range(n_relevant_features):
        coef[:, k] = np.sin((1. + rng.randn(1)) * times + 3 * rng.randn(1))
    X = rng.randn(n_samples, n_features)
    Y = np.dot(X, coef.T) + rng.randn(n_samples, n_tasks)
    coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
    coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_

    # Plot the support (non-zero pattern) of both estimates side by side
    fig = plt.figure(figsize=(8, 5))
    panels = [(coef_lasso_, 'Lasso'), (coef_multi_task_lasso_, 'MultiTaskLasso')]
    for pos, (coefs, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, pos)
        plt.spy(coefs)
        plt.xlabel('Feature')
        plt.ylabel('Time (or Task)')
        plt.text(10, 5, title)
    fig.suptitle('Coefficient non-zero location')

    # Time series of one feature's coefficient across tasks
    feature_to_plot = 0
    plt.figure()
    lw = 2
    plt.plot(coef[:, feature_to_plot], color='seagreen', linewidth=lw,
             label='Ground truth')
    plt.plot(coef_lasso_[:, feature_to_plot], color='cornflowerblue',
             linewidth=lw, label='Lasso')
    plt.plot(coef_multi_task_lasso_[:, feature_to_plot], color='gold',
             linewidth=lw, label='MultiTaskLasso')
    plt.legend(loc='upper center')
    plt.axis('tight')
    plt.ylim([-1.1, 1.1])
    plt.show()
def fit_force_params(self, alpha=None):
    """Fit a sparse linear regression on the remaining n_variables - q variables.

    ``alpha`` is the penalization parameter; ``None`` triggers cross-validation.
    """
    if alpha is None:
        # cross-validate the penalty over 50 alphas, 10 folds
        self.force_model = MultiTaskLassoCV(
            eps=1e-3, n_alphas=50, cv=10, n_jobs=-1,
            fit_intercept=False, normalize=False)
    else:
        self.force_model = MultiTaskLasso(
            alpha=alpha, fit_intercept=False, normalize=False)
    self.force_model.fit(self.features_forcing[self.mask_f], self.eps)
def test_multi_task_lasso_and_enet():
    """Both multi-task estimators must tie the two duplicated tasks together,
    and a one-iteration fit must emit a ConvergenceWarning."""
    X, y, _, _ = build_dataset()
    Y = np.c_[y, y]
    for estimator_cls in (MultiTaskLasso, MultiTaskElasticNet):
        clf = estimator_cls(alpha=1, tol=1e-8).fit(X, Y)
        assert 0 < clf.dual_gap_ < 1e-5
        # identical targets -> identical coefficient rows
        assert_array_almost_equal(clf.coef_[0], clf.coef_[1])
    clf = MultiTaskElasticNet(alpha=1.0, tol=1e-8, max_iter=1)
    assert_warns_message(ConvergenceWarning, 'did not converge', clf.fit, X, Y)
def _get_minimizer(self):
    """Return the estimator for the configured method.

    The factor 0.5 on alpha compensates for the 1/(2 * n_sample) factor in
    the OLS term of the Lasso/LassoLars objective.
    """
    half_alpha = self.cv_lambdas[0] / 2.0
    if self.method == "multi-task":
        return MultiTaskLasso(
            alpha=half_alpha,
            fit_intercept=False,
            max_iter=self.max_iterations,
            tol=self.tolerance,
            copy_X=True,
            random_state=None,
            warm_start=True,
            selection="random",
        )
    if self.method == "gradient_decent":
        return Lasso(
            alpha=half_alpha,
            fit_intercept=False,
            precompute=True,
            max_iter=self.max_iterations,
            tol=self.tolerance,
            copy_X=True,
            positive=self.positive,
            random_state=None,
            warm_start=True,
            selection="random",
        )
    if self.method == "lars":
        return LassoLars(
            alpha=half_alpha,
            fit_intercept=False,
            verbose=True,
            precompute="auto",
            max_iter=self.max_iterations,
            eps=2.220446049250313e-16,  # machine epsilon for float64
            copy_X=True,
            fit_path=False,
            positive=self.positive,
            jitter=None,
            random_state=None,
        )
def fit_lin_model(self, alpha=None):
    """Fit a sparse linear regression on the first q variables.

    ``alpha`` is the penalization parameter; ``None`` triggers cross-validation.
    """
    if alpha is None:
        # cross-validate the penalty over 50 alphas, 10 folds
        self.lin_model = MultiTaskLassoCV(
            eps=1e-3, n_alphas=50, cv=10, n_jobs=-1,
            fit_intercept=False, normalize=False, max_iter=3500)
    else:
        self.lin_model = MultiTaskLasso(
            alpha=alpha, fit_intercept=False, normalize=False)
    self.lin_model.fit(self.features_lin_model[self.mask_l_m],
                       self.delta_v[self.mask_l_m])
def get_regressors_multitask(nmodels='all'):
    """
    Returns one or all of Multi-task linear regressors.

    Parameters
    ----------
    nmodels : 'all' or int
        'all' returns every model; an integer (1-based) selects one:
        1 = MultiTaskElasticNet, 2 = MultiTaskLasso.

    Fixed: the non-'all' branch previously returned the *string*
    ``'lr<n>'`` instead of the estimator object.
    """
    regressors = [
        MultiTaskElasticNet(),  # 1
        MultiTaskLasso(),       # 2
    ]
    if nmodels == 'all':
        return regressors
    return [regressors[int(nmodels) - 1]]
def mtlasso_model(self, X_train, y_train, X_test, y_test):
    """Fit a MultiTaskLasso (alpha=.005) and print train/test score, MSE, R^2."""
    estimator = MultiTaskLasso(alpha=.005)
    estimator.fit(X_train, y_train)
    pred_train = estimator.predict(X_train)
    pred_test = estimator.predict(X_test)
    # Scoring the model
    print(estimator.score(X_train, y_train))
    print(estimator.score(X_test, y_test))
    print('MSE train: %.6f, MSE test: %.6f' % (
        mean_squared_error(y_train, pred_train),
        mean_squared_error(y_test, pred_test)))
    print('R^2 train: %.6f, R^2 test: %.6f' % (
        r2_score(y_train, pred_train),
        r2_score(y_test, pred_test)))
def MultiTaskLasso_regression(self, X_train, y_train, X_test, y_test):
    """Grid-search alpha for MultiTaskLasso with repeated 10-fold CV.

    Returns (best_model, mse, mae, r2), the last three evaluated on the
    test split.
    """
    tuned_parameters = [{"alpha": np.logspace(-5, 5, 100)}]
    cv_scheme = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
    search = GridSearchCV(estimator=MultiTaskLasso(),
                          param_grid=tuned_parameters,
                          scoring="neg_mean_squared_error",
                          cv=cv_scheme,
                          n_jobs=-1)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_model.fit(X_train, y_train)  # refit on the full training split
    y_pred = best_model.predict(X_test)
    return (best_model,
            mean_squared_error(y_test, y_pred),
            mean_absolute_error(y_test, y_pred),
            r2_score(y_test, y_pred))
def run_one_configuration(
    full_train_covariate_matrix,
    complete_target,
    new_valid_covariate_data_frames,
    new_valid_target_data_frame,
    std_data_frame,
    target_clusters,
    featurizer,
    model_name,
    parameters,
    log_file,
):
    """Train one baseline configuration and append its skill and cosine
    similarity metrics to ``log_file``."""
    baseline = {"type": model_name, "target_clusters": target_clusters}
    if model_name == "multi_task_lasso":
        estimator = MultiTaskLasso(max_iter=5000, **parameters)
    elif model_name == "xgboost":
        estimator = MultiOutputRegressor(
            XGBRegressor(n_jobs=10, objective="reg:squarederror",
                         verbosity=0, **parameters))
    estimator.fit(featurizer(full_train_covariate_matrix),
                  complete_target.to_numpy(copy=True))
    # the metric helpers expect a callable under the "model" key
    baseline["model"] = lambda x: estimator.predict(featurizer(x))
    skill, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        baseline,
        "skill",
    )
    cos_sim, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        baseline,
        "cosine-sim",
    )
    with open(log_file, "a") as f:
        f.write(f"{len(target_clusters)} {parameters} {skill} {cos_sim}\n")
def test_multitasklasso(gaussian_data, fit_intercept, normalize, alpha):
    """GroupLasso on a duplicated design must agree with sklearn's
    MultiTaskLasso on the single design."""
    X, y = gaussian_data
    X = [X[0], X[0]]
    n_samples = y.shape[1]
    # rescale alpha relative to alpha_max of this design
    Xty = np.array([xx.T.dot(yy) for xx, yy in zip(X, y)])
    alpha = alpha * np.linalg.norm(Xty, axis=0).max() / n_samples
    group_est = GroupLasso(alpha=alpha, fit_intercept=fit_intercept,
                           normalize=normalize)
    group_est.fit(X, y)
    assert hasattr(group_est, 'is_fitted_')
    reference = MultiTaskLasso(alpha=alpha, fit_intercept=fit_intercept,
                               normalize=normalize)
    reference.fit(X[0], y.T)
    assert_allclose(group_est.coef_, reference.coef_.T, rtol=1e-2)
def constrained_multiclass_solve(w, psi, alpha=1.0, quiet=False, **lasso_kws):
    """Solve

    .. math::

        \\text{argmin}_s \\|s\\|_0 \\\\
        \\text{subject to} \\|w - \\psi s\\|_2^2 \\leq tol

    via a MultiTaskLasso relaxation; ``quiet`` silences solver warnings.
    """
    solver = MultiTaskLasso(alpha=alpha, **lasso_kws)
    if quiet:
        # suppress convergence / numerical chatter from the solver
        with warnings.catch_warnings():
            for category in (RuntimeWarning, UserWarning):
                warnings.filterwarnings("ignore", category=category)
            solver.fit(psi, w)
    else:
        solver.fit(psi, w)
    return solver.coef_.T
def fit(self, X, y):
    """Fit the random-weight hidden layer and a MultiTaskLasso readout.

    ``y`` may be 1-D class labels (one-hot encoded internally) or an
    already 2-D indicator array.
    Fixed idiom: direct dunder calls (``__len__``) replaced with ``len()``
    and the stray trailing comma in the unpacking removed.
    """
    # work on copies so the caller's arrays are not mutated downstream
    X, y = copy.deepcopy(X), copy.deepcopy(y)
    self.sample_weight = None
    if len(y.shape) != 2:
        # 1-D labels: derive the class set and one-hot encode
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        y = self.__one2array(y, self.n_classes_)
    else:
        self.classes_ = np.arange(y.shape[1])
        self.n_classes_ = len(self.classes_)
    # random hidden layer: W is (n_features, n_hidden), b is (n_hidden,)
    self.W = np.random.uniform(self.lower_bound, self.upper_bound,
                               size=(X.shape[1], self.n_hidden))
    self.b = np.random.uniform(self.lower_bound, self.upper_bound,
                               size=self.n_hidden)
    # sigmoid activation of the hidden layer
    H = expit(np.dot(X, self.W) + self.b)
    self.multi_lasso = MultiTaskLasso(self.C, max_iter=self.max_iter).fit(H, y)
def multivariate_regression(output_filename):
    """Run cross-validated multi-task lasso regressions over the game,
    game+comp and game+AU feature sets.

    Fixed: ``output_filename`` was opened but never closed (leaked handle);
    a ``with`` block now guarantees closure.
    """
    lm = MultiTaskLasso(alpha=0.1)
    reg_name = "MTLassoRegression"
    with open(output_filename, 'w') as regression_output:
        gcvr2, gr2 = cv_regression(
            lm, n_data, Game_cols,
            ["NormalizedLearningGain", "Presence"], show=True)
        gccvr2, gcr2 = cv_regression(
            lm, n_data, Game_cols + Comp_cols,
            ["NormalizedLearningGain", "Presence"], show=True)
        gaucvr2, gaur2 = cv_regression(
            lm, n_data, Game_cols + AU_cols,
            ["NormalizedLearningGain", "Presence"], show=True)
def __init__(self, scale=True, kfolds=5, alpha_stepsize=1 / 3.0):
    """Predict motif activities using Lasso MultiTask regression

    Parameters
    ----------
    scale : boolean, optional, default True
        If ``True``, the motif scores will be scaled before classification

    kfolds : integer, optional, default 5
        number of kfolds for parameter search

    alpha_stepsize : float, optional, default 0.333
        stepsize for use in alpha gridsearch

    Attributes
    ----------
    act_ : DataFrame, shape (n_motifs, n_clusters)
        fitted motif activities

    sig_ : DataFrame, shape (n_motifs,)
        boolean values, if coefficients are higher/lower than the 1%t from
        random permutation
    """
    self.kfolds = kfolds
    self.act_description = "activity values: coefficients from fitted model"
    # initialize attributes
    self.act_ = None
    self.sig_ = None
    # alpha grid: e^0 down to e^-9 in steps of alpha_stepsize
    alphas = [np.exp(-x) for x in np.arange(0, 10, alpha_stepsize)]
    self.clf = GridSearchCV(MultiTaskLasso(), {"alpha": alphas},
                            cv=kfolds, n_jobs=4)
def multi_task_lasso(df):
    """Randomized search over the MultiTaskLasso alpha using a multi-task
    ROC-AUC scorer; prints the best params, score and coefficients."""
    X = df[['X0', 'X1']]
    # X = df[['X0', 'X1', 'X2', 'X3']]
    Y = df[['y1', 'y2', 'y3']]
    scorer = make_scorer(mtl_roc_auc, greater_is_better=True)
    search = RandomizedSearchCV(
        MultiTaskLasso(fit_intercept=False, alpha=0.05),
        {'alpha': uniform(0, 10)},
        n_iter=200,
        scoring=scorer,
        verbose=10,
        n_jobs=1,
        cv=5,
    )
    search.fit(X, Y)
    print(search.best_params_)
    print(search.best_score_)
    print(search.best_estimator_.coef_)
def predict(
    self,
    forecast_length: int,
    future_regressor=[],
    just_point_forecast: bool = False,
):
    """Generates forecast data immediately following dates of index supplied to .fit()

    Args:
        forecast_length (int): Number of periods of data to forecast ahead
        regressor (numpy.Array): additional regressor
        just_point_forecast (bool): If True, return a pandas.DataFrame of just point forecasts

    Returns:
        Either a PredictionObject of forecasts and metadata, or
        if just_point_forecast == True, a dataframe of point forecasts
    """
    if not _has_tsfresh:
        raise ImportError("Package tsfresh is required")
    # num_subsamples = 10
    predictStartTime = datetime.datetime.now()
    # from tsfresh import extract_features
    from tsfresh.utilities.dataframe_functions import make_forecasting_frame
    # from sklearn.ensemble import AdaBoostRegressor
    from tsfresh.utilities.dataframe_functions import impute as tsfresh_impute
    # from tsfresh.feature_extraction import EfficientFCParameters, MinimalFCParameters
    # defaults below are immediately overridden by the instance attributes
    max_timeshift = 10
    regression_model = 'Adaboost'
    feature_selection = None
    max_timeshift = self.max_timeshift
    regression_model = self.regression_model
    feature_selection = self.feature_selection
    sktraindata = self.df_train.copy()
    # Build one wide feature frame: tsfresh features per input column,
    # prefixed with the column counter to keep names unique.
    X = pd.DataFrame()
    y = pd.DataFrame()
    counter = 0
    for column in sktraindata.columns:
        df_shift, current_y = make_forecasting_frame(
            sktraindata[column],
            kind="time_series",
            max_timeshift=max_timeshift,
            rolling_direction=1,
        )
        # disable_progressbar = True MinimalFCParameters EfficientFCParameters
        current_X = extract_features(
            df_shift,
            column_id="id",
            column_sort="time",
            column_value="value",
            impute_function=tsfresh_impute,
            show_warnings=False,
            default_fc_parameters=EfficientFCParameters(),
            n_jobs=1,
        )
        # current_X["feature_last_value"] = current_y.shift(1)
        # NOTE(review): feature_last_value is added in the per-step loop below
        # but not here -- confirm the train/predict feature sets are meant to differ.
        current_X.rename(columns=lambda x: str(counter) + '_' + x, inplace=True)
        X = pd.concat([X, current_X], axis=1)
        y = pd.concat([y, current_y], axis=1)
        counter += 1
    # drop constant features
    X = X.loc[:, X.apply(pd.Series.nunique) != 1]
    # replace infinities, then zero-fill features; ffill/bfill targets
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(0)
    y = y.fillna(method='ffill').fillna(method='bfill')
    # Optional feature selection step, chosen by name.
    if feature_selection == 'Variance':
        from sklearn.feature_selection import VarianceThreshold
        sel = VarianceThreshold(threshold=(0.15))
        X = pd.DataFrame(sel.fit_transform(X))
    if feature_selection == 'Percentile':
        from sklearn.feature_selection import SelectPercentile, chi2
        # NOTE(review): chi2 requires non-negative features -- confirm inputs.
        X = pd.DataFrame(
            SelectPercentile(chi2, percentile=20).fit_transform(
                X, y[y.columns[0]]))
    if feature_selection == 'DecisionTree':
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.feature_selection import SelectFromModel
        clf = DecisionTreeRegressor()
        clf = clf.fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        # NOTE(review): transform returns an ndarray; the later
        # X.iloc / X.columns uses presume a DataFrame -- verify this path.
        X = model.transform(X)
    if feature_selection == 'Lasso':
        from sklearn.linear_model import MultiTaskLasso
        from sklearn.feature_selection import SelectFromModel
        clf = MultiTaskLasso(max_iter=2000)
        clf = clf.fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X = model.transform(X)
    """
    decisionTreeList = X.columns[model.get_support()]
    LassoList = X.columns[model.get_support()]
    feature_list = decisionTreeList.to_list()
    set([x for x in feature_list if feature_list.count(x) > 1])
    from collections import Counter
    repeat_features = Counter(feature_list)
    repeat_features = repeat_features.most_common(20)
    """
    # Drop first line
    X = X.iloc[1:, ]
    y = y.iloc[1:]
    y = y.fillna(method='ffill').fillna(method='bfill')
    index = self.create_forecast_index(forecast_length=forecast_length)
    # Choose the regressor by name; unknown names fall back to RandomForest.
    if regression_model == 'ElasticNet':
        from sklearn.linear_model import MultiTaskElasticNet
        regr = MultiTaskElasticNet(alpha=1.0, random_state=self.random_seed)
    elif regression_model == 'DecisionTree':
        from sklearn.tree import DecisionTreeRegressor
        regr = DecisionTreeRegressor(random_state=self.random_seed)
    elif regression_model == 'MLP':
        from sklearn.neural_network import MLPRegressor
        # relu/tanh lbfgs/adam layer_sizes (100) (10)
        regr = MLPRegressor(
            hidden_layer_sizes=(10, 25, 10),
            verbose=self.verbose_bool,
            max_iter=200,
            activation='tanh',
            solver='lbfgs',
            random_state=self.random_seed,
        )
    elif regression_model == 'KNN':
        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.neighbors import KNeighborsRegressor
        # NOTE(review): KNeighborsRegressor takes no random_state parameter --
        # this branch looks like it would raise a TypeError; confirm.
        regr = MultiOutputRegressor(
            KNeighborsRegressor(random_state=self.random_seed))
    elif regression_model == 'Adaboost':
        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.ensemble import AdaBoostRegressor
        regr = MultiOutputRegressor(AdaBoostRegressor(
            n_estimators=200))  # , random_state=self.random_seed))
    else:
        regression_model = 'RandomForest'
        from sklearn.ensemble import RandomForestRegressor
        regr = RandomForestRegressor(random_state=self.random_seed,
                                     n_estimators=1000,
                                     verbose=self.verbose)
    regr.fit(X, y)
    combined_index = self.df_train.index.append(index)
    forecast = pd.DataFrame()
    sktraindata.columns = [x for x in range(len(sktraindata.columns))]
    # Iterated one-step-ahead forecasting: predict one row, append it to the
    # training frame, recompute features from the tail, repeat.
    for x in range(forecast_length):
        x_dat = pd.DataFrame()
        y_dat = pd.DataFrame()
        counter = 0
        for column in sktraindata.columns:
            df_shift, current_y = make_forecasting_frame(
                sktraindata.tail(max_timeshift)[column],
                kind="time_series",
                max_timeshift=max_timeshift,
                rolling_direction=1,
            )
            # disable_progressbar = True MinimalFCParameters EfficientFCParameters
            current_X = extract_features(
                df_shift,
                column_id="id",
                column_sort="time",
                column_value="value",
                impute_function=tsfresh_impute,
                show_warnings=False,
                n_jobs=1,
                default_fc_parameters=EfficientFCParameters(),
            )
            # default_fc_parameters=MinimalFCParameters(),
            current_X["feature_last_value"] = current_y.shift(1)
            current_X.rename(columns=lambda x: str(counter) + '_' + x,
                             inplace=True)
            x_dat = pd.concat([x_dat, current_X], axis=1)
            y_dat = pd.concat([y_dat, current_y], axis=1)
            counter += 1
        # align prediction features to the training feature set
        x_dat = x_dat[X.columns]
        rfPred = pd.DataFrame(regr.predict(x_dat.tail(1).values))
        forecast = pd.concat([forecast, rfPred], axis=0, ignore_index=True)
        sktraindata = pd.concat([sktraindata, rfPred], axis=0,
                                ignore_index=True)
        sktraindata.index = combined_index[:len(sktraindata.index)]
    forecast.columns = self.column_names
    forecast.index = index
    if just_point_forecast:
        return forecast
    else:
        # wrap the point forecast with probabilistic bounds and metadata
        upper_forecast, lower_forecast = Point_to_Probability(
            self.df_train,
            forecast,
            prediction_interval=self.prediction_interval)
        predict_runtime = datetime.datetime.now() - predictStartTime
        prediction = PredictionObject(
            model_name=self.name,
            forecast_length=forecast_length,
            forecast_index=forecast.index,
            forecast_columns=forecast.columns,
            lower_forecast=lower_forecast,
            forecast=forecast,
            upper_forecast=upper_forecast,
            prediction_interval=self.prediction_interval,
            predict_runtime=predict_runtime,
            fit_runtime=self.fit_runtime,
            model_parameters=self.get_params(),
        )
        return prediction