def apply_regression_model(X_train, y_train, X_test, y_test, indices, selectModel):
    """Fit one of three linear models and collect data, model and predictions.

    Parameters
    ----------
    X_train, y_train
        Training predictors and responses, passed to ``model.fit``.
    X_test, y_test
        Held-out predictors and responses. Only ``X_test`` is used here (for
        prediction); ``y_test`` is stored in the result unchanged.
    indices
        Stored in the result unchanged.
    selectModel : int
        ``0`` -> ``LinearRegression``, ``1`` -> ``RidgeCV``,
        ``2`` -> ``MultiTaskLassoCV``.

    Returns
    -------
    dict
        Keys ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``indices``,
        ``predictions`` (on ``X_test``), ``model`` (the fitted estimator) and
        ``predictions_train`` (on ``X_train``).

    Raises
    ------
    ValueError
        If ``selectModel`` is not 0, 1 or 2.
    """
    model_names = {
        0: "Linear Regression",
        1: "Ridge Regression",
        2: "Lasso Regression",
    }
    # Fail early and clearly; previously an unknown selector fell through
    # every branch and died with an UnboundLocalError on `predictions`.
    if selectModel not in model_names:
        raise ValueError(
            "selectModel must be 0, 1 or 2, got %r" % (selectModel,))

    Result = {}
    Result['X_train'] = X_train
    Result['y_train'] = y_train
    Result['X_test'] = X_test
    Result['y_test'] = y_test
    Result['indices'] = indices

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(model_names[selectModel])

    # NOTE(review): the alpha grid (0.1, 0.1, 10) repeats 0.1 -- possibly
    # (0.1, 1, 10) was intended. Kept unchanged; confirm with the author.
    if selectModel == 0:
        model = linear_model.LinearRegression()
    elif selectModel == 1:
        model = linear_model.RidgeCV(alphas=(0.1, 0.1, 10))
    else:
        model = linear_model.MultiTaskLassoCV(
            eps=0.001, n_alphas=100, alphas=(0.1, 0.1, 10))

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    predictions_train = model.predict(X_train)

    Result['predictions'] = predictions
    Result['model'] = model
    Result['predictions_train'] = predictions_train
    return Result
def test_model_multi_task_lasso_cv(self):
    """Convert a fitted two-target MultiTaskLassoCV to ONNX and dump it.

    The test asserts only that the conversion returns a model; the fitted
    estimator, its input and the ONNX model are then handed to
    ``dump_data_and_model``.
    """
    estimator = linear_model.MultiTaskLassoCV()
    fitted, features = fit_regression_model(estimator, n_targets=2)

    # Input signature: any batch size, one column per training feature.
    input_spec = [("input", FloatTensorType([None, features.shape[1]]))]
    onnx_model = convert_sklearn(
        fitted,
        "mutli-task lasso cv",
        input_spec,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(onnx_model)

    dump_data_and_model(
        features,
        fitted,
        onnx_model,
        verbose=False,
        basename="SklearnMultiTaskLassoCV-Dec4",
    )
def multi_task_lasso(X, q, cv=False, alpha=0.002):
    """Fit a multi-task Lasso (targets forced to share features).

    Parameters
    ----------
    X : design matrix passed to ``fit``.
    q : multi-column target passed to ``fit``.
    cv : if truthy, select the penalty with a 10-fold ``MultiTaskLassoCV``
        instead of using ``alpha``.
    alpha : penalty used when ``cv`` is falsy. The original note states that
        running with cross-validation gives 0.002, hence the default.

    Returns
    -------
    (theta, res) : the transposed coefficient matrix and the residual
        ``q - X @ theta``. No intercept is fitted in either mode.
    """
    if not cv:
        estimator = lm.MultiTaskLasso(alpha=alpha, fit_intercept=False)
    else:
        estimator = lm.MultiTaskLassoCV(
            eps=1e-3,
            n_alphas=100,
            alphas=None,
            fit_intercept=False,
            cv=10,
            verbose=True,
            n_jobs=-1,
        )

    estimator.fit(X, q)

    # coef_ is transposed so that X @ weights lines up with q.
    weights = estimator.coef_.T
    residual = q - np.dot(X, weights)
    return weights, residual
def test_model_multi_task_lasso_cv(self):
    """Convert a fitted MultiTaskLassoCV to ONNX and dump it.

    The ONNX input is declared with 4 float columns. The dump step is given
    an ``allow_failure`` expression naming onnxruntime versions <= 0.2.1.
    """
    fitted, features = _fit_model_multi(linear_model.MultiTaskLassoCV())

    input_spec = [("input", FloatTensorType([None, 4]))]
    onnx_model = convert_sklearn(fitted, "linear regression", input_spec)
    self.assertIsNotNone(onnx_model)

    # Expression string handed to dump_data_and_model unchanged.
    failure_condition = ("StrictVersion("
                         "onnxruntime.__version__)"
                         "<= StrictVersion('0.2.1')")
    float_features = features.astype(numpy.float32)
    dump_data_and_model(
        float_features,
        fitted,
        onnx_model,
        verbose=False,
        basename="SklearnMultiTaskLassoCV-Dec4",
        allow_failure=failure_condition,
    )
def lasso_regression(self):
    """Select features with a cross-validated multi-task Lasso.

    Fits ``MultiTaskLassoCV`` on ``self.training_data_X`` /
    ``self.training_data_Y`` and unpacks ``coef_`` into exactly two rows
    (named v3 and v4 here). For every non-zero coefficient the feature
    position plus 5 is appended to ``self.lasso_v3`` / ``self.lasso_v4``.

    Returns
    -------
    tuple
        ``(self.lasso_v3, self.lasso_v4)`` after the appends.
    """
    # Candidate penalties; cross-validation picks the best one.
    candidate_alphas = [0.0005, 0.001, 0.1, 1, 3]
    lasso_cv = linear_model.MultiTaskLassoCV(alphas=candidate_alphas,
                                             max_iter=100000)
    lasso_cv.fit(self.training_data_X, self.training_data_Y)

    v3_weights, v4_weights = lasso_cv.coef_
    # NOTE(review): the +5 offset presumably maps a feature position to a
    # column index in the source table -- confirm against the caller.
    for position, (w3, w4) in enumerate(zip(v3_weights, v4_weights)):
        if w3 != 0:
            self.lasso_v3.append(position + 5)
        if w4 != 0:
            self.lasso_v4.append(position + 5)

    return self.lasso_v3, self.lasso_v4
def sklearn_liner_model_regressions(xTrain, xTest, yTrain, yTest):
    """Fit many ``sklearn.linear_model`` estimators and keep the good ones.

    Every estimator named below is built with default arguments, fitted on
    ``(xTrain, yTrain)`` and scored with ``calculate_prediction_error``. An
    estimator's error table is kept only when both its
    ``"Test Average Error"`` and ``"Train Average Error"`` first entries are
    below 30.

    Parameters
    ----------
    xTrain, yTrain : training predictors and targets.
    xTest, yTest : held-out predictors and targets.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the error tables of all retained models (empty if
        none qualified).

    Notes
    -----
    A failure in one estimator (construction, fit, predict or scoring) is
    reported and skipped so that the remaining estimators still run.
    """
    modelForConsideration = pd.DataFrame()

    # Estimator class names only. The solver *functions* that used to be in
    # this list (enet_path, lars_path, lasso_path, orthogonal_mp,
    # orthogonal_mp_gram, ridge_regression) were called eagerly while building
    # the list; the last two were called without their required arguments, so
    # the whole function raised TypeError before fitting anything. They return
    # arrays rather than estimators and are therefore not candidates.
    estimator_names = (
        "ARDRegression",
        "BayesianRidge",
        "ElasticNet",
        "ElasticNetCV",
        "HuberRegressor",
        "Lars",
        "LarsCV",
        "Lasso",
        "LassoCV",
        "LassoLars",
        "LassoLarsCV",
        "LassoLarsIC",
        "LinearRegression",
        "MultiTaskLasso",
        "MultiTaskElasticNet",
        "MultiTaskLassoCV",
        "MultiTaskElasticNetCV",
        "OrthogonalMatchingPursuit",
        "OrthogonalMatchingPursuitCV",
        "PassiveAggressiveClassifier",
        "PassiveAggressiveRegressor",
        "Perceptron",
        "RANSACRegressor",
        "Ridge",
        "RidgeClassifier",
        "RidgeClassifierCV",
        "RidgeCV",
        "SGDClassifier",
        "SGDRegressor",
        "TheilSenRegressor",
    )

    for modelName in estimator_names:
        try:
            # Built lazily inside the try so that a class missing from the
            # installed scikit-learn only skips that one candidate.
            model = getattr(linear_model, modelName)()
            model.fit(xTrain, yTrain)
            yTrainPredict = model.predict(xTrain)
            yTestPredict = model.predict(xTest)
            errorList = calculate_prediction_error(
                modelName, yTestPredict, yTest, yTrainPredict, yTrain)
            if (errorList["Test Average Error"][0] < 30
                    and errorList["Train Average Error"][0] < 30):
                try:
                    # DataFrame.append was removed in pandas 2.0; concat is
                    # the supported equivalent.
                    if isinstance(errorList, pd.DataFrame):
                        new_rows = errorList
                    else:
                        new_rows = pd.DataFrame(errorList)
                    modelForConsideration = pd.concat(
                        [modelForConsideration, new_rows])
                except Exception as e:
                    print(e)
        except Exception as e:
            # Include the cause; the bare message made failures undiagnosable.
            print(f"Error occurred while preparing Model {modelName}: {e}")

    return modelForConsideration
def test_model_multi_task_lasso_cv(self):
    """Convert a fitted two-target MultiTaskLassoCV to ONNX and dump it.

    The ONNX input width follows the fitted data. The dump step is given an
    ``allow_failure`` expression naming onnxruntime versions <= 0.2.1.
    """
    estimator = linear_model.MultiTaskLassoCV()
    fitted, features = fit_regression_model(estimator, n_targets=2)

    n_features = features.shape[1]
    input_spec = [("input", FloatTensorType([None, n_features]))]
    onnx_model = convert_sklearn(fitted, "mutli-task lasso cv", input_spec)
    self.assertIsNotNone(onnx_model)

    # Expression string handed to dump_data_and_model unchanged.
    failure_condition = ("StrictVersion("
                         "onnxruntime.__version__)"
                         "<= StrictVersion('0.2.1')")
    dump_data_and_model(
        features,
        fitted,
        onnx_model,
        verbose=False,
        basename="SklearnMultiTaskLassoCV-Dec4",
        allow_failure=failure_condition,
    )
def build_models(predictors, responses, modelNo):
    """Fit a linear model on the full data and report its in-sample fit.

    Parameters
    ----------
    predictors, responses
        Data passed to ``model.fit``; predictions are made on the same
        ``predictors``.
    modelNo : int
        ``0`` -> ``LinearRegression``, ``1`` -> ``RidgeCV``,
        ``2`` -> ``MultiTaskLassoCV``.

    Returns
    -------
    dict
        Keys ``modelName``, ``predictions``, ``model`` (the fitted
        estimator) and ``Corr`` (``pearsonr(predictions, responses)[0][0]``).

    Raises
    ------
    ValueError
        If ``modelNo`` is not 0, 1 or 2.
    """
    # NOTE(review): the alpha grid (0.1, 0.1, 10) repeats 0.1 -- possibly
    # (0.1, 1, 10) was intended. Kept unchanged; confirm with the author.
    if modelNo == 0:
        model = linear_model.LinearRegression()
        modelName = "Linear Regression"
    elif modelNo == 1:
        model = linear_model.RidgeCV(alphas=(0.1, 0.1, 10))
        modelName = "Ridge Regression"
    elif modelNo == 2:
        model = linear_model.MultiTaskLassoCV(
            eps=0.001, n_alphas=100, alphas=(0.1, 0.1, 10))
        modelName = "Lasso Regression"
    else:
        # Previously an unknown number fell through every branch and died
        # with an UnboundLocalError on `model`.
        raise ValueError("modelNo must be 0, 1 or 2, got %r" % (modelNo,))

    model.fit(predictors, responses)
    predictions = model.predict(predictors)

    Result = {}
    Result['modelName'] = modelName
    Result['predictions'] = predictions
    Result['model'] = model
    Result['Corr'] = pearsonr(predictions, responses)[0][0]
    return Result
def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y, model_type, out_dir=None, class_weight=None):
    """Train one scikit-learn model chosen by name and return test accuracy.

    Parameters
    ----------
    train_x, train_y : training data passed to ``fit``.
    dev_x, dev_y : accepted for interface compatibility; not used here.
    test_x, test_y : data used for the reported test accuracy.
    model_type : str
        Name of a ``sklearn.linear_model`` estimator class, or ``'LinearSVC'``
        / ``'SVC'`` from ``sklearn.svm``.
    out_dir : str or None
        If given, rounded test predictions are written to
        ``<out_dir>/preds/best_test_pred.txt``. If ``None`` nothing is saved.
    class_weight
        Forwarded to the classifiers that accept a ``class_weight`` argument.

    Returns
    -------
    float
        ``accuracy_score(test_y, rounded test predictions)``.

    Raises
    ------
    NotImplementedError
        If ``model_type`` is unknown, or names one of the solver functions
        (``lars_path`` etc.), which are not estimators and cannot be trained
        through this helper.
    """
    import os

    from sklearn import linear_model, svm

    # linear_model estimators that are built with default arguments.
    # NOTE: 'RandomizedLasso' and 'RandomizedLogisticRegression' no longer
    # exist in current scikit-learn; selecting them raises AttributeError
    # there, exactly as the direct attribute access did before.
    default_estimators = (
        'ARDRegression', 'BayesianRidge', 'ElasticNet', 'ElasticNetCV',
        'HuberRegressor', 'Lars', 'LarsCV', 'Lasso', 'LassoCV', 'LassoLars',
        'LassoLarsCV', 'LassoLarsIC', 'LinearRegression', 'MultiTaskLasso',
        'MultiTaskElasticNet', 'MultiTaskLassoCV', 'MultiTaskElasticNetCV',
        'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV',
        'PassiveAggressiveRegressor', 'RandomizedLasso',
        'RandomizedLogisticRegression', 'RANSACRegressor', 'Ridge', 'RidgeCV',
        'SGDRegressor', 'TheilSenRegressor',
    )
    # linear_model classifiers that receive class_weight.
    weighted_estimators = (
        'LogisticRegression', 'LogisticRegressionCV',
        'PassiveAggressiveClassifier', 'Perceptron', 'RidgeClassifier',
        'RidgeClassifierCV', 'SGDClassifier',
    )
    # Plain functions, not estimators: they need (X, y) at call time and
    # return arrays without a .fit method, so the former
    # `linear_model.<name>().fit(...)` branches always raised TypeError.
    solver_functions = (
        'lars_path', 'lasso_path', 'lasso_stability_path',
        'logistic_regression_path', 'orthogonal_mp', 'orthogonal_mp_gram',
    )

    startTrainTime = time()
    logger.info("Start training...")
    if model_type in default_estimators:
        model = getattr(linear_model, model_type)().fit(train_x, train_y)
    elif model_type in weighted_estimators:
        model = getattr(linear_model, model_type)(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'LinearSVC':
        model = svm.LinearSVC(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'SVC':
        model = svm.SVC(class_weight=class_weight, degree=3).fit(train_x, train_y)
    elif model_type in solver_functions:
        raise NotImplementedError(
            "'%s' is a solver function, not an estimator; it cannot be "
            "fitted by this helper" % model_type)
    else:
        raise NotImplementedError('Model not implemented')
    logger.info("Finished training.")

    endTrainTime = time()
    trainTime = endTrainTime - startTrainTime
    logger.info("Training time : %d seconds" % trainTime)

    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")

    # Measured from the end of training, so it covers both predict calls.
    endTestTime = time()
    testTime = endTestTime - endTrainTime
    logger.info("Testing time : %d seconds" % testTime)

    # Regressors give real-valued output; round so accuracy can be computed.
    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)

    # out_dir defaults to None; concatenating None with a string used to
    # raise TypeError, so only write the file when a directory was supplied.
    if out_dir is not None:
        pred_path = os.path.join(out_dir, 'preds', 'best_test_pred.txt')
        np.savetxt(pred_path, test_pred_y, fmt='%i')
    else:
        logger.info("No out_dir given; test predictions are not saved.")

    test_accuracy = accuracy_score(test_y, test_pred_y)
    logger.info('[TRAIN] Acc: %.3f' % (accuracy_score(train_y, train_pred_y)))
    logger.info('[TEST] Acc: %.3f' % (test_accuracy))
    return test_accuracy
print(
    f'just to double check the shapes spike_count_slice has a shape of {spike_count_slice.shape} and intended_kin_slice has the shape of {intended_kin_slice.shape}'
)

# In[ ]:

# eps is the ratio alpha_min / alpha_max that sets the length of the
# regularisation path searched by the cross-validation.
eps = 1e-4
CV = 5
# Only reported below; the estimator is left on its own default grid size.
n_alphas = 100

print(
    f'to work with lasso regression, we are using an alpha range of {eps} as defined by alpha_min / alpha_max'
)
print(f'and we are doing {CV} fold validation')
print(f'and by default, we are using {n_alphas} alphas ')

lassoCV_result = linear_model.MultiTaskLassoCV(eps=eps, cv=CV).fit(
    spike_count_slice, intended_kin_slice)

print(f'lassoCV_result returns the best alpha of {lassoCV_result.alpha_}')
# `alphas` is the constructor argument (None here, since no grid was passed);
# the grid that was actually searched is the fitted attribute `alphas_`.
print(f'and the alphas we used are {lassoCV_result.alphas_}')

# ## Visualize the lasso path

# In[ ]:

from sklearn.linear_model import lasso_path

# determine the error tolerance
# NOTE(review): this eps is assigned but never passed to lasso_path below,
# so the path uses lasso_path's default -- confirm whether eps=eps was meant.
eps = 3e-5

# NOTE(review): the arguments are in the opposite order to the fit above
# (there X=spike_count_slice, y=intended_kin_slice). Left unchanged; confirm
# which direction is intended.
alpha_lasso, coefs_lasso, _ = lasso_path(intended_kin_slice, spike_count_slice)
def estimate_baseline_lasso(Ton, Tamb=273.0, order=0, timechunk=80, cv=None, progress=True, **kwargs):
    """Estimate ultra-wideband baseline using the multi-task LASSO.

    Args:
        Ton (xarray.DataArray): Calibrated De:code array of ON point.
        Tamb (float, optional): Ambient temperature used in calibration.
            Accepted for interface compatibility; not used in this function.
        order (int, optional): Maximum order of a polynomial function
            which is assumed to represent a continuum emission spectrum.
            Default is 0 (flat continuum emission).
        timechunk (int, optional): The number of samples to be used for
            a multi-task LASSO. Default is 80 (~0.5 s for DESHIMA data).
            If ``Ton`` has fewer samples than this, all of them are fitted
            as a single chunk.
        cv (int, optional): The number of folds for cross validation (CV).
            If not specified (or not greater than 1), CV is not conducted
            (default alpha is used).
        progress (bool, optional): If True, then a progress bar is shown.
        kwargs (dict, optional): Keyword arguments for model initialization.
            ``fit_intercept=False`` is applied unless overridden here.

    Returns:
        Tbase (xarray.DataArray): De:code array of estimated baseline.
            Each chunk carries ``basis_{i}`` / ``coeff_{i}`` coordinates for
            every design-matrix column and, when CV is used, ``alpha``.

    """
    # Design matrix: column 0 is the normalised dtau/dpwv slope, columns
    # 1..N_poly are normalised powers of the median-centred frequency.
    freq = np.asarray(Ton.kidfq).copy()
    slope = fn.models._calculate_dtau_dpwv(freq)
    freq -= np.median(freq)

    N_freq = len(freq)
    N_poly = order + 1
    X = np.zeros([N_freq, N_poly + 1])
    X[:, 0] = slope / np.linalg.norm(slope)

    for i in range(N_poly):
        poly = freq**i
        X[:, i + 1] = poly / np.linalg.norm(poly)

    default_kwargs = {'fit_intercept': False}
    kwargs = {**default_kwargs, **kwargs}

    # At least one chunk: with fewer than `timechunk` samples the division
    # gives 0 bins, which groupby_bins cannot handle.
    n_chunk = max(1, int(len(Ton) / timechunk))
    is_cv = cv is not None and cv > 1

    if is_cv:
        model = linear_model.MultiTaskLassoCV(cv=cv, **kwargs)
    else:
        model = linear_model.MultiTaskLasso(**kwargs)

    with tqdm(total=n_chunk, disable=not progress) as bar:

        def func(Ton_):
            # One task per time sample: rows of the target are channels.
            model.fit(X, Ton_.values.T)

            # The returned baseline is built from the slope column only; the
            # polynomial terms are exposed through the coordinates below.
            Tbase_ = np.outer(model.coef_[:, 0], X[:, 0])
            Tbase_ = dc.full_like(Ton_, Tbase_)

            for i in range(N_poly + 1):
                Tbase_.coords[f'basis_{i}'] = 'ch', X[:, i]
                Tbase_.coords[f'coeff_{i}'] = 't', model.coef_[:, i]

            if is_cv:
                alpha = np.full(len(Ton_), model.alpha_)
                Tbase_.coords['alpha'] = 't', alpha

            bar.update(1)
            return Tbase_

        return Ton.groupby_bins('t', n_chunk).apply(func)
# Give every task in the support a sinusoidal coefficient profile with a
# random frequency and phase.
for k in support:
    coef[:, k] = np.sin((1. + rng.randn(1)) * times + 3 * rng.randn(1))

# Random design, noisy multi-task targets, then scale Y to unit Frobenius norm.
X = rng.randn(n_samples, n_features)
Y = np.dot(X, coef.T) + rng.randn(n_samples, n_tasks)
Y /= norm(Y, ord='fro')

###############################################################################
# Fit with sklearn and celer, using the same API
params = dict(tol=1e-6, cv=4, n_jobs=-1)

# The unqualified MultiTaskLassoCV is timed as the celer implementation
# (presumably imported from celer above this fragment -- verify).
t0 = time.perf_counter()
clf = MultiTaskLassoCV(**params).fit(X, Y)
t_celer = time.perf_counter() - t0

# Same parameters and data, scikit-learn implementation.
t0 = time.perf_counter()
clf_sklearn = linear_model.MultiTaskLassoCV(**params).fit(X, Y)
t_sklearn = time.perf_counter() - t0

###############################################################################
# Celer is faster
print("Time for celer : %.2f s" % t_celer)
print("Time for sklearn: %.2f s" % t_sklearn)

###############################################################################
# Both packages find the same solution
print("Celer's optimal regularizer : %s" % clf.alpha_)
print("Sklearn's optimal regularizer: %s" % clf_sklearn.alpha_)
# Relative difference between the two coefficient matrices, in percent.
print("Relative norm difference between optimal coefs: %.2f %%" % (100 * norm(clf.coef_ - clf_sklearn.coef_) / norm(clf.coef_)))
# Map each odor name to an integer group id.
odor_group_dict = dict(zip(odor_set,range(len(odor_set))))
# Group label per sample: the 'stim1' index level with its last two
# characters stripped, looked up in odor_group_dict.
# NOTE(review): presumably the stripped suffix is a concentration/trial tag --
# confirm against the data loader.
stim_group = X.index.get_level_values(level='stim1').str[:-2]
stim_group = stim_group.map(lambda x: odor_group_dict[x]).tolist()
# One fold per odor, so each fold holds out one odor group.
group_kfold = model_selection.GroupKFold(n_splits=len(odor_set))
# stim_kfold = group_kfold.split(X, Y, groups=stim_group)

""" Build regression model with CV """
reg_param = dict( max_iter=10000, selection='random' )

from sklearn import linear_model
# 20 log-spaced alphas from 1e-4 to 1; the CV splits come from the group
# k-fold generator above.
mtlcv = linear_model.MultiTaskLassoCV(copy_X=True, alphas=10**np.linspace(-4,0,num=20), cv=group_kfold.split(X, Y, groups=stim_group), n_jobs=12,#group_kfold.n_splits,
                                      verbose=1, **reg_param)

""" Fit model """
# mtlcv.fit(X, syn_Y)
mtlcv.fit(X, Y)

from sklearn.linear_model import lasso_path
# Regularisation path on the same data, started from random coefficients of
# shape (n_targets, n_features).
path_result = mtlcv.path(X, Y, coef_init=np.random.rand(Y.shape[1],X.shape[1]))

#https://stackoverflow.com/questions/50410037/multiple-linear-regression-with-specific-constraint-on-each-coefficients-on-pyth
#
# """ Random simulation """
# syn_coef0 = np.random.rand(Y.shape[1], X.shape[1]) # random coefs for simulation
# row_sum = np.sum(syn_coef0, axis=1)
# syn_coef = pd.DataFrame(syn_coef0 / row_sum[:, None], index=Y.columns, columns=X.columns)
'Training data \nx-axis: time (sec), y-axis: state value, $x$ - blue, $xd$ - dotted blue, $\\theta$ - red, ' '$\\theta_d$ - dotted red', y=0.94) plt.tight_layout() #==================================================== LEARN MODELS ====================================================: if learn_models: #Learn linear model with DMD: basis = lambda x: x C_dmd = np.eye(n) optimizer_dmd = linear_model.MultiTaskLasso(alpha=alpha_dmd, fit_intercept=False, selection='random') cv_dmd = linear_model.MultiTaskLassoCV(fit_intercept=False, n_jobs=-1, cv=3, selection='random') standardizer_dmd = preprocessing.StandardScaler(with_mean=False) model_dmd = Edmd(n, m, basis, n, n_traj_dc, optimizer_dmd, cv=cv_dmd, standardizer=standardizer_dmd, C=C_dmd, first_obs_const=False, continuous_mdl=False, dt=dt)
from sklearn import datasets
# linear_model is used below but was missing from the visible imports.
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
import matplotlib.pyplot as plt
import numpy as np

# Synthetic multi-task regression problem: 1 feature, 10 targets.
x, y = datasets.make_regression(n_samples=1000,
                                n_features=1,
                                n_targets=10,
                                noise=10,
                                random_state=0)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# Four alternative multi-task models. Each assignment replaces the previous
# one, so only the last (MultiTaskElasticNetCV) is actually fitted below.
# The CV variants take keyword-only arguments in current scikit-learn; the
# keywords name the parameter that the former positional 0.1 was bound to.
reg = linear_model.MultiTaskElasticNet(alpha=0.1)  # multi-task elastic net
reg = linear_model.MultiTaskLasso(alpha=0.1)  # multi-task lasso
reg = linear_model.MultiTaskLassoCV(eps=0.1)  # multi-task lasso with CV
reg = linear_model.MultiTaskElasticNetCV(l1_ratio=0.1)  # multi-task elastic net with CV

reg.fit(x_train, y_train)
print(reg.coef_, reg.intercept_)

y_pred = reg.predict(x_test)

# Mean absolute error
print(mean_absolute_error(y_test, y_pred))
# Mean squared error
print(mean_squared_error(y_test, y_pred))
def get_regression_estimators(r, regression_models):
    """Store a default-constructed ``linear_model`` estimator under its name.

    Parameters
    ----------
    r : str
        Name of the estimator class in ``sklearn.linear_model``.
    regression_models : dict
        Updated in place: ``regression_models[r]`` is set to a new instance
        of the named estimator.

    Returns
    -------
    None
        For a name that is not in the supported list a message is printed
        and ``regression_models`` is left untouched.
    """
    supported_names = (
        'ARDRegression',
        'BayesianRidge',
        'ElasticNet',
        'ElasticNetCV',
        'HuberRegressor',
        'Lars',
        'LarsCV',
        'Lasso',
        'LassoCV',
        'LassoLars',
        'LassoLarsCV',
        'LassoLarsIC',
        'LinearRegression',
        'LogisticRegression',
        'LogisticRegressionCV',
        'MultiTaskElasticNet',
        'MultiTaskElasticNetCV',
        'MultiTaskLasso',
        'MultiTaskLassoCV',
        'OrthogonalMatchingPursuit',
        'OrthogonalMatchingPursuitCV',
        'PassiveAggressiveClassifier',
        'PassiveAggressiveRegressor',
        'Perceptron',
        'RANSACRegressor',
        'Ridge',
        'RidgeClassifier',
        'RidgeClassifierCV',
        'RidgeCV',
        'SGDClassifier',
        'SGDRegressor',
        'TheilSenRegressor',
    )

    if r not in supported_names:
        print(
            r +
            " is an unsupported regression type. Check if you have misspelled the name."
        )
        return

    # Every supported name is the class name inside linear_model.
    estimator_class = getattr(linear_model, r)
    regression_models[r] = estimator_class()
def predict_atlas(fpaths_refspace_train,
                  fpaths_secspace_train,
                  fpaths_refspace_predict,
                  outlier_removal_ref=None,
                  outlier_removal_sec=None,
                  outlier_removal_cov=None,
                  covariates_to_use=None,
                  regressor='MO-SVR',
                  n_jobs=1,
                  save_predictions=False,
                  save_pipeline=False,
                  verbose=False,
                  outlier_options_ref=None,
                  outlier_options_sec=None,
                  outlier_options_cov=None,
                  regressor_options=None,
                  pipeline_options=None):
    """Predict a secondary channel feature space by fitting an atlas regression
    model on paired "secondary channel - reference channel" training data and
    then performing regression on "reference channel"-only test data.

    Input data is retrieved from files specified in lists of file paths and the
    predicted output data is written to the corresponding paths, appropriately
    named and tagged as 'PREDICTED'.

    The channel names for the predicted channels are added to the metadata
    channels index (also tagged as 'PREDICTED'). The fitted atlas pipeline can
    be saved to a separate per-prim pickle file (see `save_pipeline`).

    Parameters
    ----------
    fpaths_refspace_train : single string or list of strings
        A path or list of paths (either local from cwd or global) to npy files
        containing training feature space data for the reference channel used
        as the basis of prediction (usually the shape space).
    fpaths_secspace_train : single string or list of strings
        A path or list of paths (either local from cwd or global) to npy files
        containing training feature space data for the secondary channel that
        is to be the target of the regression.
    fpaths_refspace_predict : single string or list of strings
        A path or list of paths (either local from cwd or global) to npy files
        containing prediction feature space data for the reference channel
        based on which the target secondary channel will be predicted.
    outlier_removal_ref : string or None, optional, default None
        If None, no outlier removal is done on the reference feature space.
        Otherwise this must be a string denoting the method for outlier removal
        (one of `absolute_thresh`, `percentile_thresh`,
        `merged_percentile_thresh` or `isolation_forest`). Note that outlier
        removal is only done on training data, not on prediction data.
        See katachi.utilities.outlier_removal.RemoveOutliers for more info.
    outlier_removal_sec : string or None, optional, default None
        If None, no outlier removal is done on the target feature space.
        Otherwise this must be a string denoting the method for outlier removal
        (see outlier_removal_ref above).
    outlier_removal_cov : string or None, optional, default None
        If None, no outlier removal is done based on covariate information.
        Otherwise this must be a string denoting the method for outlier removal
        (see outlier_removal_ref above).
    covariates_to_use : string, list of strings or None, optional, default None
        A string denoting the selection tree to select a covariate to be used
        for outlier detection from the HierarchicalData covariate object. Can
        also be a list of multiple such strings, in which case the covariates
        are merged into an fspace. The specified covariates must each be single
        numeric columns.
    regressor : string or sklearn regressor instance, optional, default 'MO-SVR'
        If a string, must be one of 'MO-SVR', 'MT-ENetCV', 'MT-Lasso', 'MLP'.
        In the first case a multioutput SVR is used for regression, in the
        second a Multi-Task Elastic Net with Cross Validation, in the third a
        Multi-Task Lasso linear regression (with cross validation), and in the
        fourth a Multi-Layer Perceptron.
        If an sklearn(-like) regressor instance is passed, it must be a
        multivariate-multivariable regressor that supports the fit and predict
        methods.
    n_jobs : int, optional, default 1
        Number of processes available for use during multi-processed model
        fitting and prediction. Works for 'MO-SVR', 'MT-ENetCV' and 'MT-Lasso'
        regressors.
        WARNING: The 'MLP' regressor also performs multi-processing but does
        not seem to support an n_jobs argument.
    save_predictions : bool, optional, default False
        If True, the predictions are saved in the corresponding paths and the
        metadata is updated.
    save_pipeline : bool, optional, default False
        If True, the atlas pipeline object is saved in the corresponding paths
        as a separate file with the name `<prim_ID>_atlas_pipeline.pkl`.
    verbose : bool, optional, default False
        If True, more information is printed.
    outlier_options_ref : dict or None, optional, default None
        kwarg dictionary for the chosen outlier removal method to be applied
        to the reference feature space. None means no extra kwargs.
        See katachi.utilities.outlier_removal.RemoveOutliers for more info.
    outlier_options_sec : dict or None, optional, default None
        kwarg dictionary for the chosen outlier removal method to be applied
        to the target feature space. None means no extra kwargs.
        See katachi.utilities.outlier_removal.RemoveOutliers for more info.
    outlier_options_cov : dict or None, optional, default None
        kwarg dictionary for the chosen outlier removal method to be applied
        to the covariates. None means no extra kwargs, i.e. the defaults of
        katachi.utilities.outlier_removal.RemoveOutliers are used.
    regressor_options : dict or None, optional, default None
        kwarg dictionary for the chosen regressor's instantiation. None means
        ``{'kernel': 'rbf'}`` for 'MO-SVR' and no extra kwargs for every other
        regressor. Not forwarded for 'MT-ENetCV' or when a regressor instance
        is passed. See the chosen regressor's doc string for more information.
    pipeline_options : dict or None, optional, default None
        kwarg dictionary for AtlasPipeline instantiation. None means no
        additional processing (all z-scoring/PCA switches False, all
        sub-selection and covariate options None).
        See the AtlasPipeline doc string for more information.

    Returns
    -------
    secspace_predict : array of shape (n_predict_samples, n_secspace_features)
        Predicted secondary channel feature space.
    refspace_predict_idx : array of shape (n_predict_samples)
        Index array mapping rows (cells) of secspace_predict to paths (prims)
        in fpaths_refspace_predict.
    atlas_pipeline : predict_atlas.AtlasPipeline instance
        Fitted instance of the regressor pipeline.

    Raises
    ------
    IOError
        If the numbers of training paths, training cells or features are
        inconsistent, or if covariate-based outlier removal is requested
        without `covariates_to_use`.
    ValueError
        If `regressor` is an unknown string or an object lacking callable
        `fit` / `predict`.
    """

    #--------------------------------------------------------------------------

    ### Resolve defaults

    # None sentinels replace the former dict defaults. The old shared default
    # {'kernel': 'rbf'} was also forwarded to MultiTaskLassoCV and
    # MLPRegressor, which do not accept `kernel`, so 'MT-Lasso' and 'MLP'
    # failed unless the caller overrode regressor_options.
    if outlier_options_ref is None:
        outlier_options_ref = {}
    if outlier_options_sec is None:
        outlier_options_sec = {}
    if outlier_options_cov is None:
        outlier_options_cov = {}
    if regressor_options is None:
        if isinstance(regressor, str) and regressor == 'MO-SVR':
            regressor_options = {'kernel': 'rbf'}
        else:
            regressor_options = {}
    if pipeline_options is None:
        pipeline_options = {
            'zscore_X': False,
            'zscore_y': False,
            'pca_X': False,
            'pca_y': False,
            'rezscore_X': False,
            'rezscore_y': False,
            'subselect_X': None,
            'subselect_y': None,
            'add_covariates': None,
        }

    #--------------------------------------------------------------------------

    ### Load data

    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    if verbose: print("\n# Loading data...")

    # Handle cases of single paths for training data
    sec_is_single = isinstance(fpaths_secspace_train, str)
    ref_is_single = isinstance(fpaths_refspace_train, str)
    if sec_is_single and ref_is_single:
        fpaths_secspace_train = [fpaths_secspace_train]
        fpaths_refspace_train = [fpaths_refspace_train]
    elif (sec_is_single or ref_is_single
          or len(fpaths_secspace_train) != len(fpaths_refspace_train)):
        raise IOError("Different number of secondary and reference space " +
                      "input file paths specified.")

    # Handle cases of single paths for prediction data
    if isinstance(fpaths_refspace_predict, str):
        fpaths_refspace_predict = [fpaths_refspace_predict]

    # Load training data
    secspace_train = []
    refspace_train = []
    for secpath, refpath in zip(fpaths_secspace_train, fpaths_refspace_train):
        secspace_train.append(np.load(secpath))
        refspace_train.append(np.load(refpath))
    secspace_train = np.concatenate(secspace_train, axis=0)
    refspace_train = np.concatenate(refspace_train, axis=0)

    # Check that everything is fine
    if not secspace_train.shape[0] == refspace_train.shape[0]:
        raise IOError("Secondary and reference space do not have the same " +
                      "number of cells.")

    # Load prediction data; remember for every row which path it came from
    refspace_predict = []
    refspace_predict_idx = []
    for idx, refpath in enumerate(fpaths_refspace_predict):
        refspace_predict.append(np.load(refpath))
        refspace_predict_idx.append([idx] * refspace_predict[-1].shape[0])
    refspace_predict = np.concatenate(refspace_predict, axis=0)
    refspace_predict_idx = np.concatenate(refspace_predict_idx, axis=0)

    # Check that everything is fine
    if not refspace_train.shape[1] == refspace_predict.shape[1]:
        raise IOError("Reference feature spaces for training and prediction " +
                      "do not have the same number of features!")

    # Handle covariate loading
    if outlier_removal_cov is not None:

        # Sanity checks
        if covariates_to_use is None:
            raise IOError(
                "When outlier_removal_cov is not None, covariates " +
                "to use for determining outliers must be specified " +
                "in covariates_to_use!")

        # Handle single covariates
        if isinstance(covariates_to_use, str):
            covariates_to_use = [covariates_to_use]

        # Load covariates
        covars = []
        for refpath in fpaths_refspace_train:

            # Create covarpath (the first 10 characters of the file name are
            # used as the prim ID)
            revdir, reffile = os.path.split(refpath)
            covpath = os.path.join(revdir, reffile[:10] + '_covariates.pkl')

            # Load covar file
            # NOTE: pickle executes code on load; only use trusted files.
            with open(covpath, 'rb') as covfile:
                covtree = pickle.load(covfile)

            # Get relevant covariates, one column per requested covariate
            covs2use = [np.expand_dims(covtree._gad(c2u), -1)
                        for c2u in covariates_to_use]
            covs2use = np.concatenate(covs2use, axis=1)

            # Add to other samples
            covars.append(covs2use)

        # Concatenate
        covars = np.concatenate(covars)

    #--------------------------------------------------------------------------

    ### Prepare regressor

    # Report
    if verbose: print("\n# Preparing regressor...")

    # Multi-Output Support Vector Regression with RBF Kernel
    if regressor == 'MO-SVR':
        svr = svm.SVR(**regressor_options)
        regressor = multioutput.MultiOutputRegressor(svr, n_jobs=n_jobs)

    # Multi-task Elastic Net Regression with Cross Validation
    # (regressor_options is not forwarded here, as before)
    elif regressor == 'MT-ENetCV':
        regressor = linear_model.MultiTaskElasticNetCV(random_state=42,
                                                       n_jobs=n_jobs)

    # Multivariate-Multivariable Linear Regression by Multi-Task Lasso
    elif regressor == 'MT-Lasso':
        regressor = linear_model.MultiTaskLassoCV(random_state=42,
                                                  n_jobs=n_jobs,
                                                  **regressor_options)

    # Multi-Layer Perceptron Regressor
    elif regressor == 'MLP':
        regressor = neural_network.MLPRegressor(random_state=42,
                                                **regressor_options)

    # Other regressor strings
    elif isinstance(regressor, str):
        raise ValueError('Regressor not recognized.')

    # Regressor object given as argument
    else:

        # Check if object has fit method
        fit_attr = getattr(regressor, "fit", False)
        if not callable(fit_attr):
            raise ValueError("Regressor object has no 'fit' method.")

        # Check if object has predict method
        predict_attr = getattr(regressor, "predict", False)
        if not callable(predict_attr):
            raise ValueError("Regressor object has no 'predict' method.")

    #--------------------------------------------------------------------------

    ### Remove outliers from training data

    # Find and remove outliers based on covariate values
    if outlier_removal_cov is not None:

        # Report
        if verbose: print("\n# Removing outliers based on covariates...")
        n_before = refspace_train.shape[0]

        # Find and remove outliers
        orem_cov = RemoveOutliers(outlier_removal_cov, **outlier_options_cov)
        orem_cov.fit(covars)
        covars, (refspace_train, secspace_train) = orem_cov.transform(
            covars, [refspace_train, secspace_train])

        # Report (one line, same text the two former print statements gave)
        if verbose:
            print("Started with %i, removed %i, kept %i samples" %
                  (n_before, orem_cov.X_removed_, refspace_train.shape[0]))

    # Find and remove outliers based on reference space
    if outlier_removal_ref is not None:

        # Report
        if verbose: print("\n# Removing reference outliers...")
        n_before = refspace_train.shape[0]

        # Find and remove outliers
        orem_ref = RemoveOutliers(outlier_removal_ref, **outlier_options_ref)
        orem_ref.fit(refspace_train)
        refspace_train, secspace_train = orem_ref.transform(
            refspace_train, secspace_train)

        # Report
        if verbose:
            print("Started with %i, removed %i, kept %i samples" %
                  (n_before, orem_ref.X_removed_, refspace_train.shape[0]))

    # Find and remove outliers based on secondary space
    if outlier_removal_sec is not None:

        # Report
        if verbose: print("\n# Removing target outliers...")
        n_before = refspace_train.shape[0]

        # Find and remove outliers
        orem_sec = RemoveOutliers(outlier_removal_sec, **outlier_options_sec)
        orem_sec.fit(secspace_train)
        secspace_train, refspace_train = orem_sec.transform(
            secspace_train, refspace_train)

        # Report
        if verbose:
            print("Started with %i, removed %i, kept %i samples" %
                  (n_before, orem_sec.X_removed_, refspace_train.shape[0]))

    #--------------------------------------------------------------------------

    ### Fit and predict

    # Construct pipeline
    atlas_pipeline = AtlasPipeline(regressor,
                                   verbose=verbose,
                                   **pipeline_options)

    # Fit
    if verbose: print("\n# Fitting...")
    atlas_pipeline.fit(refspace_train, secspace_train)

    # Predict
    if verbose: print("\n# Predicting...")
    secspace_predict = atlas_pipeline.predict(refspace_predict)

    #--------------------------------------------------------------------------

    ### Update the metadata

    if save_predictions:

        if verbose: print("\n# Saving metadata...")

        # Construct channel designation from the first secondary-space path
        # (identical for every prim, so computed once)
        pattern = re.compile(r"8bit_(.+?(?=_))")
        secpath = fpaths_secspace_train[0]
        channel = re.search(pattern, secpath).group(1) + "_PREDICTED"

        # For each path...
        for idx, refpath in enumerate(fpaths_refspace_predict):

            # Load metadata file
            refdir, reffname = os.path.split(refpath)
            prim_ID = reffname[:10]
            metapath = os.path.join(refdir, prim_ID + "_stack_metadata.pkl")
            with open(metapath, "rb") as metafile:
                metadict = pickle.load(metafile)

            # Add channel to metadata
            if channel not in metadict["channels"]:
                metadict["channels"].append(channel)

            # Save metadata
            with open(metapath, "wb") as outfile:
                pickle.dump(metadict,
                            outfile,
                            protocol=pickle.HIGHEST_PROTOCOL)

    #--------------------------------------------------------------------------

    ### Save fitted atlas pipeline as separate metadata file

    if save_pipeline:

        if verbose: print("\n# Saving pipeline...")

        # Construct designation from the first secondary-space path
        # (identical for every prim, so computed once)
        pattern = re.compile(r"8bit_(.+?(?=\.))")
        secpath = fpaths_secspace_train[0]
        atlasname = re.search(pattern, secpath).group(1) + "_ATLASPIP"

        # For each path...
        for idx, refpath in enumerate(fpaths_refspace_predict):

            # Load atlas metadata file if it exists
            refdir, reffname = os.path.split(refpath)
            prim_ID = reffname[:10]
            atlaspath = os.path.join(refdir, prim_ID + "_atlas_pipeline.pkl")
            if os.path.isfile(atlaspath):
                with open(atlaspath, "rb") as atlasfile:
                    atlasdict = pickle.load(atlasfile)
            else:
                atlasdict = {}

            # Add pipeline to dict
            atlasdict[atlasname] = atlas_pipeline

            # Save atlas dict
            with open(atlaspath, "wb") as outfile:
                pickle.dump(atlasdict,
                            outfile,
                            protocol=pickle.HIGHEST_PROTOCOL)

    #--------------------------------------------------------------------------

    ### Save the predictions

    if save_predictions:

        if verbose: print("\n# Saving predictions...")

        # Suffix taken from the first secondary-space path: everything after
        # "8bit_", with the 4-character extension replaced
        secpath = fpaths_secspace_train[0]
        replace_by = secpath[secpath.index("8bit_") + 5:]
        replace_by = replace_by[:-4] + "_PREDICTED.npy"

        # For each path...
        for idx, refpath in enumerate(fpaths_refspace_predict):

            # Construct outpath
            to_replace = refpath[refpath.index("8bit_") + 5:]
            outpath = refpath.replace(to_replace, replace_by)

            # Write file (only the rows that belong to this path)
            np.save(outpath, secspace_predict[refspace_predict_idx == idx])

    #--------------------------------------------------------------------------

    ### Return results

    # Report
    if verbose: print("\nDone!")

    # Return
    return secspace_predict, refspace_predict_idx, atlas_pipeline
FullData = TrainData + TestData print("Total samples (training + testing)", len(FullData)) SiteOrder = list(FullData[0]["Alpha"].keys( )) #np.unique([ky for TD in FullData for ky in list(TD["Alpha"].keys())]) TranscOrder = list(existingTable.index) trainDataIn = np.array([[ dat["Alpha"][ky] if ky in dat["Alpha"].keys() else 0 for ky in SiteOrder ] for dat in FullData]) trainDataOut = np.array([[ dat["Transcript"][ky] if ky in dat["Transcript"].keys() else 0 for ky in TranscOrder ] for dat in FullData]) Lasso = lm.MultiTaskLassoCV(max_iter=3000) Lasso.fit(trainDataIn, trainDataOut) parameters["MTLASSOalpha"] = Lasso.alpha_ print("Fit MT LASSO alpha") trainDataOut_byGene = trainDataOut.T regionInfo = pd.read_csv(network_bindingSite_file, sep='\t') network_bygene = {} for gn in TranscOrder: ### get the info for that gene sites = regionInfo[regionInfo.gene == gn]['region'].values # print([si in SiteOrder for si in sites]) network_bygene[gn] = [