def findLassoAlpha(alpha, y, X, returnPred=False):
    """Walk-forward validation: refit MultiTaskLasso on an expanding window,
    predict one date ahead, and return the test MSE (or the predictions)."""
    X_train = X.loc['2013-10-01':'2015-04-01']
    X_test = X.loc['2015-05-01':'2016-04-01']
    y_train = y.loc['2013-10-01':'2015-04-01']
    y_test = y.loc['2015-05-01':'2016-04-01']
    predictions = []
    for dt in y_test.index:
        lassoreg2 = MultiTaskLasso(alpha=alpha, max_iter=100000)
        lassoreg2.fit(X_train, y_train)
        # .values.reshape(1, -1): a pandas Series has no .reshape of its own
        y_pred2 = lassoreg2.predict(X_test.loc[dt].values.reshape(1, -1))
        predictions.append(pd.DataFrame(y_pred2, columns=y.columns))
        # Expand the training window to include the date just predicted
        X_train = X.loc['2013-10-01':dt]
        y_train = y.loc['2013-10-01':dt]
    prediction = pd.concat(predictions)
    prediction.index = y_test.index
    if returnPred:
        return y_test, prediction
    return mean_squared_error(y_test, prediction)
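# A minimal sketch of how findLassoAlpha might be used to pick alpha, assuming
# y and X are DataFrames indexed by dates spanning 2013-10 through 2016-04
# (both names come from the snippet above; the alpha grid below is illustrative).
import numpy as np

candidate_alphas = np.logspace(-4, 1, 12)
mses = {a: findLassoAlpha(a, y, X) for a in candidate_alphas}
best_alpha = min(mses, key=mses.get)
y_true, y_hat = findLassoAlpha(best_alpha, y, X, returnPred=True)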
def constrained_multiclass_solve(w, psi, alpha=1.0, **lasso_kws):
    """Solve

    .. math::

        \\text{argmin}_s \\|s\\|_0 \\\\
        \\text{subject to} \\|w - \\psi s\\|_2^2 \\leq tol
    """
    model = MultiTaskLasso(alpha=alpha, **lasso_kws)
    model.fit(psi, w)
    return model.coef_.T
class MultiTaskLassoImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
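# A hedged usage sketch: the snippet never shows what Op is; any scikit-learn
# style estimator fits the wrapper, so we assume Op = MultiTaskLasso purely
# for illustration.
from sklearn.linear_model import MultiTaskLasso

Op = MultiTaskLasso  # assumption: Op is the wrapped estimator class
impl = MultiTaskLassoImpl(alpha=0.1, max_iter=10000)
impl.fit([[0, 0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]])
print(impl.predict([[1.5, 1.5]]))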
def mtlasso_model(self, X_train, y_train, X_test, y_test):
    mtlasso_model = MultiTaskLasso(alpha=.005)
    mtlasso_model.fit(X_train, y_train)
    y_train_pred = mtlasso_model.predict(X_train)
    y_test_pred = mtlasso_model.predict(X_test)
    # Scoring the model
    print(mtlasso_model.score(X_train, y_train))
    print(mtlasso_model.score(X_test, y_test))
    print('MSE train: %.6f, MSE test: %.6f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
    print('R^2 train: %.6f, R^2 test: %.6f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))
def run_one_configuration(
    full_train_covariate_matrix,
    complete_target,
    new_valid_covariate_data_frames,
    new_valid_target_data_frame,
    std_data_frame,
    target_clusters,
    featurizer,
    model_name,
    parameters,
    log_file,
):
    model_baseline = dict()
    model_baseline["type"] = model_name
    model_baseline["target_clusters"] = target_clusters
    if model_name == "multi_task_lasso":
        model = MultiTaskLasso(max_iter=5000, **parameters)
    elif model_name == "xgboost":
        model = MultiOutputRegressor(
            XGBRegressor(n_jobs=10, objective="reg:squarederror",
                         verbosity=0, **parameters))
    model.fit(featurizer(full_train_covariate_matrix),
              complete_target.to_numpy(copy=True))
    model_baseline["model"] = lambda x: model.predict(featurizer(x))
    skill, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        model_baseline,
        "skill",
    )
    cos_sim, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        model_baseline,
        "cosine-sim",
    )
    with open(log_file, "a") as f:
        f.write(f"{len(target_clusters)} {parameters} {skill} {cos_sim}\n")
import warnings


def constrained_multiclass_solve(w, psi, alpha=1.0, quiet=False, **lasso_kws):
    """Solve

    .. math::

        \\text{argmin}_s \\|s\\|_0 \\\\
        \\text{subject to} \\|w - \\psi s\\|_2^2 \\leq tol
    """
    model = MultiTaskLasso(alpha=alpha, **lasso_kws)
    if quiet:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            warnings.filterwarnings("ignore", category=UserWarning)
            model.fit(psi, w)
    else:
        model.fit(psi, w)
    return model.coef_.T
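# Minimal sketch of calling constrained_multiclass_solve, assuming psi is a
# dictionary matrix of shape (n_samples, n_atoms) and w stacks several targets
# column-wise; the shapes below are illustrative.
import numpy as np

rng = np.random.default_rng(0)
psi = rng.standard_normal((50, 20))
w = psi @ rng.standard_normal((20, 3))  # three tasks sharing the same support
s = constrained_multiclass_solve(w, psi, alpha=0.5, quiet=True)
print(s.shape)  # (20, 3): one coefficient column per task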
def test_multitasklasso(gaussian_data, fit_intercept, normalize, alpha):
    X, y = gaussian_data
    X = [X[0], X[0]]
    n_samples = y.shape[1]
    Xty = np.array([xx.T.dot(yy) for xx, yy in zip(X, y)])
    alpha_max = np.linalg.norm(Xty, axis=0).max()
    alpha *= alpha_max / n_samples
    est = GroupLasso(alpha=alpha, fit_intercept=fit_intercept,
                     normalize=normalize)
    est.fit(X, y)
    assert hasattr(est, 'is_fitted_')
    mtlasso = MultiTaskLasso(alpha=alpha, fit_intercept=fit_intercept,
                             normalize=normalize)
    mtlasso.fit(X[0], y.T)
    assert_allclose(est.coef_, mtlasso.coef_.T, rtol=1e-2)
def get_signature_genes(X, n, lda=10):
    W = np.zeros((X.shape[0], X.shape[0]))
    # coarse search from the bottom: shrink alpha until at least n genes enter
    while (abs(W).sum(1) > 0).sum() < n:
        lda /= 10.
        model = MultiTaskLasso(alpha=lda, max_iter=100, tol=.001,
                               selection='random', warm_start=True)
        model.fit(X.T, X.T)
        W = model.coef_.T
        # print(len(np.nonzero(abs(W).sum(1))[0]), model.score(X.T, X.T))
    # fine search from the top: grow alpha until at most 1.2 * n genes remain
    while (abs(W).sum(1) > 0).sum() > n * 1.2:
        lda *= 2.
        model.set_params(alpha=lda)
        model.fit(X.T, X.T)
        W = model.coef_.T
        # print(len(np.nonzero(abs(W).sum(1))[0]), model.score(X.T, X.T))
    # finer search
    while (abs(W).sum(1) > 0).sum() > n:
        lda *= 1.1
        model.set_params(alpha=lda)
        model.fit(X.T, X.T)
        W = model.coef_.T
        # print(len(np.nonzero(abs(W).sum(1))[0]), model.score(X.T, X.T))
    return np.nonzero(abs(W).sum(1))[0]
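# A hypothetical call to get_signature_genes: X is assumed to be a
# genes-by-samples expression matrix and n the number of signature genes to
# keep; the random matrix below is made up, and the small max_iter in the
# function may emit convergence warnings on it.
import numpy as np

rng = np.random.default_rng(1)
X_expr = rng.standard_normal((200, 30))  # 200 genes, 30 samples (illustrative)
signature_idx = get_signature_genes(X_expr, n=25)
print(len(signature_idx))  # at most 25 once the alpha search settles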
def make_dictionary(X, n_components=20, alpha=5., write_dir='/tmp/',
                    contrasts=[], method='multitask', l1_ratio=.5,
                    n_subjects=13):
    """Create dictionary + encoding"""
    from sklearn.decomposition import dict_learning_online, sparse_encode
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import MultiTaskLasso, MultiTaskElasticNet
    mem = Memory(write_dir, verbose=0)
    dictionary = mem.cache(initial_dictionary)(n_components, X)
    np.savez(os.path.join(write_dir, 'dictionary.npz'),
             loadings=dictionary, contrasts=contrasts)
    if method == 'online':
        components, dictionary = dict_learning_online(
            X.T, n_components, alpha=alpha, dict_init=dictionary,
            batch_size=200, method='cd', return_code=True, shuffle=True,
            n_jobs=1, positive_code=True)
        np.savez(os.path.join(write_dir, 'dictionary.npz'),
                 loadings=dictionary, contrasts=contrasts)
    elif method == 'sparse':
        components = sparse_encode(X.T, dictionary, alpha=alpha, max_iter=10,
                                   n_jobs=1, check_input=True, verbose=0,
                                   positive=True)
    elif method == 'multitask':
        # too many hard-typed parameters !!!
        n_voxels = X.shape[1] // n_subjects
        components = np.zeros((X.shape[1], n_components))
        clf = MultiTaskLasso(alpha=alpha)
        # note: the next line overrides the MultiTaskLasso above, so the
        # elastic-net estimator is the one actually used
        clf = MultiTaskElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        for i in range(n_voxels):
            x = X[:, i:i + n_subjects * n_voxels:n_voxels]
            components[i: i + n_subjects * n_voxels: n_voxels] = \
                clf.fit(dictionary.T, x).coef_
    return dictionary, components
# print(pca.explained_variance_ratio_)
X_train_reduced = combined_features.transform(X_train_scaled)
X_test_reduced = combined_features.transform(X_test_scaled)

## Create K folds (modern KFold API: n_splits plus an explicit split call)
k_fold = KFold(n_splits=10)
for train, test in k_fold.split(X_train_reduced):
    X1 = X_train_reduced[train]
    Y1 = Y_train_raw[train]
    X2 = X_train_reduced[test]
    Y2 = Y_train_raw[test]

    ## Train Classifiers on fold
    mcl_clf = MultiTaskLasso(alpha=.3)
    mcl_clf.fit(X1, Y1)

    ## Score Classifiers on fold
    mcl_clf_score = mcl_clf.score(X2, Y2)
    print("MultiTaskLasso: ", mcl_clf_score)

## Lasso CV for parameter optimization
t1 = time.time()
clf = MultiTaskLasso(alpha=.3).fit(X_train_reduced, Y_train_raw)
t_lasso_cv = time.time() - t1
print('time to train', t_lasso_cv)
def __init__(
        self,
        species: str,
        reprocess: Optional[bool] = False,
        gene_selection_method: Optional[Literal['deg', 'lasso', 'elastic-net']] = 'deg',
        model_cache_dir: Optional[str] = None,
        alpha: Optional[Union[float, Sequence[float]]] = 1e-2,
        learning_rate: Optional[float] = 1e-3,
        equal_weight: Optional[bool] = True,
        train_split: Optional[float] = 0.8,
        n_jobs: Optional[int] = 15,
        remove_correlated: Optional[Literal['both', 'ct', 'region']] = None,
        normalize: Optional[bool] = False,
        dim_reduction: Optional[str] = None,
        n_components: Optional[int] = None):
    super().__init__()
    torch.set_num_threads(n_jobs)
    filename = f'{species}_ex_colors'
    self.learning_rate = learning_rate
    self.device = 'cpu'

    # Use saved data if possible
    if not reprocess and os.path.exists(
            f'withcolors_preprocessed/{filename}.pickle'):
        with open(f'withcolors_preprocessed/{filename}.pickle',
                  mode='rb') as file:
            data_dict = pickle.load(file)
        self.data = data_dict['data']
        self.ct_axis_mask = data_dict['ct_axis_mask']
        self.r_axis_mask = data_dict['r_axis_mask']
        # No need to do anything else
        return

    species_data = sc.read(f'withcolors/{filename}.h5ad')

    if dim_reduction is not None:
        sc.pp.pca(species_data, n_comps=n_components)
        sc.pp.highly_variable_genes(species_data)
        sc.pp.neighbors(species_data, n_pcs=n_components)
        if dim_reduction == 'pca':
            sc.tl.pca(species_data, n_comps=n_components)
        elif dim_reduction == 'umap':
            sc.tl.umap(species_data, n_components=n_components)
        elif dim_reduction == 'tsne':
            sc.tl.tsne(species_data, n_pcs=n_components)
        species_data = AnnData(species_data.obsm[f'X_{dim_reduction}'],
                               obs=species_data.obs)
        species_data.var.index = pd.Index([
            f'{dim_reduction}{x}'
            for x in range(len(species_data.var.index))
        ])

    # Label each observation with its subregion and species
    species_data.obs['clusters'] = species_data.obs['clusters'].apply(
        lambda s: species[0].upper() + '_' + s)
    species_data.obs['subregion'] = species_data.obs['clusters'].apply(
        lambda s: s.split('.')[0])

    self.n_var = len(species_data.var.index)
    self.n_subregions = len(np.unique(species_data.obs['subregion']))
    self.n_clusters = len(np.unique(species_data.obs['clusters']))
    self.n_obs = len(species_data.obs.index)

    if gene_selection_method == 'deg':
        self._deg_select(dim_reduction, species_data)
    elif gene_selection_method in ['lasso', 'elastic-net']:
        # if isinstance(alpha, float):
        #     alpha = [alpha]
        for label in ['subregion', 'clusters']:
            if equal_weight:
                # Get count of number of occurrences of each label
                label_to_count = species_data.obs[label].value_counts(
                    normalize=True).to_dict()
                # Map each observation to its label appearance frequency
                w = species_data.obs[label].map(label_to_count)
                # Diagonalize and take square root to appropriately normalize data
                w = np.diag(np.sqrt(w))
                # Normalize data
                transcriptomes = np.matmul(w, species_data.X.toarray())
            else:
                transcriptomes = species_data.X.toarray()

            model_file = f'{model_cache_dir}/{gene_selection_method}/' \
                         f'{species[0].upper()}_normalized-{equal_weight}_{label}_a-{alpha}.pt'
            if model_cache_dir is not None and os.path.exists(model_file):
                with open(model_file, 'rb') as file:
                    model = pickle.load(file)
            else:
                # Create one-hot encoding of labels
                num_labels = self.n_subregions if label == 'subregion' else self.n_clusters
                label_to_id = {
                    r: i
                    for i, r in enumerate(np.unique(species_data.obs[label]))
                }
                labels = species_data.obs[label].map(label_to_id)
                labels_expanded = np.zeros((self.n_obs, num_labels))
                labels_expanded[np.arange(self.n_obs), labels] = 1
                if gene_selection_method == 'lasso':
                    model = MultiTaskLasso(alpha=alpha, max_iter=10000)
                else:
                    model = MultiTaskElasticNet(alpha=alpha, max_iter=10000)
                model.fit(transcriptomes, labels_expanded)
                with open(model_file, 'wb') as file:
                    pickle.dump(model, file, protocol=5)
            max_weight_per_gene = (model.coef_ != 0).max(axis=0)

            # (alternative torch-based lasso training path, kept from the source)
            # model = nn.Sequential(
            #     # nn.BatchNorm1d(self.n_var),
            #     nn.Linear(self.n_var, num_labels)
            # )
            # model_file = f'{model_cache_dir}_{label}.pt'
            # if model_cache_dir is None or not os.path.exists(model_file):
            #     print(f'\nTraining lasso on {label}.\n')
            #     # Create the dataset and dataloader
            #     ds = SparseDataSet(species_data, label)
            #     train_size = int(train_split * len(ds))
            #     val_size = len(ds) - train_size
            #     train_ds, val_ds = torch.utils.data.random_split(ds, [train_size, val_size])
            #     train_dl = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE, num_workers=0)
            #     val_dl = DataLoader(val_ds, shuffle=True, batch_size=BATCH_SIZE, num_workers=0)
            #     optimizer = optim.Adam(model.parameters(), lr=self.learning_rate)
            #     # train
            #     num_nonzero_features_by_alpha = []
            #     for alpha in alpha:
            #         loss_history = self._train_model(model, train_dl, val_dl, optimizer, alpha=alpha, epochs=50)
            #         max_weight_per_gene = torch.abs(model[-1].weight).max(dim=0)[0]
            #         num_nonzero_features_by_alpha.append([(max_weight_per_gene > 1e-4).sum(), alpha])
            #     # save the model
            #     torch.save(model.state_dict(), model_file)
            #     plt.plot(loss_history[:, 0], label='train loss')
            #     plt.plot(loss_history[:, 1], label='val loss')
            #     plt.legend()
            #     plt.show()
            #     num_nonzero_features_by_alpha = np.array(num_nonzero_features_by_alpha)
            #     plt.plot(num_nonzero_features_by_alpha[:, 0], num_nonzero_features_by_alpha[:, 1])
            #     plt.savefig('num_features_selected_v_l1_weight.pdf')
            #     plt.show()
            # else:
            #     model.load_state_dict(torch.load(model_file))
            # # Get the max weight per gene to see whether it's relevant to at least one subregion
            # with torch.no_grad():
            #     max_weight_per_gene = torch.abs(model[-1].weight).max(dim=0)[0]
            # with torch.no_grad():
            #     sns.distplot(max_weight_per_gene)
            #     plt.show()

            if label == 'subregion':
                self.r_axis_mask = max_weight_per_gene != 0
            else:
                self.ct_axis_mask = max_weight_per_gene != 0

    print(f'Before removing correlated genes, found {self.r_axis_mask.sum()} region genes '
          f'and {self.ct_axis_mask.sum()} cell type genes.')
    if remove_correlated is not None:
        self._remove_r_ct_correlated(remove_correlated, species_data)
        print(f'After removing correlated genes, found {self.r_axis_mask.sum()} region genes '
              f'and {self.ct_axis_mask.sum()} cell type genes.')

    # Average transcriptomes within each cell type and put into a data frame
    # with cell types as rows and genes as columns
    ct_names = np.unique(species_data.obs['clusters'])
    ct_avg_data = [
        species_data[species_data.obs['clusters'] == ct].X.mean(axis=0)
        for ct in ct_names
    ]
    self.data = pd.concat([
        pd.DataFrame(data.reshape((1, -1)),
                     columns=species_data.var.index,
                     index=[cluster_name])
        for data, cluster_name in zip(ct_avg_data, ct_names)
    ])

    # Divide each row by its mean, as in Tosches et al.
    if normalize:
        self.data = self.data.div(self.data.mean(axis=0).to_numpy(), axis=1)  # noqa

    # Save data
    data_dict = {
        'data': self.data,
        'ct_axis_mask': self.ct_axis_mask,
        'r_axis_mask': self.r_axis_mask
    }
    with open(f'withcolors_preprocessed/{filename}.pickle',
              mode='wb') as file:
        pickle.dump(data_dict, file)
def fit(self, K, s):
    r"""Fit the model using the coordinate descent method from scikit-learn.

    Args
    ----
    K: ndarray
        The :math:`m \times n` kernel matrix, :math:`{\bf K}`. A numpy array
        of shape (m, n).
    s: ndarray or CSDM object.
        A csdm object or an equivalent numpy array holding the signal,
        :math:`{\bf s}`, as a :math:`m \times m_\text{count}` matrix.
    """
    s_, self.scale = prepare_signal(s)

    prod = np.asarray(self.f_shape).prod()
    if K.shape[1] != prod:
        raise ValueError(
            "The product of the shape, `f_shape`, must be equal to the length of "
            f"the axis 1 of kernel, K, {K.shape[1]} != {prod}.")

    alpha = s_.size * self.hyperparameters["alpha"]
    Ks, ss = _get_augmented_data(K=K, s=s_, alpha=alpha,
                                 regularizer=self.regularizer,
                                 f_shape=self.f_shape)

    # The factor 0.5 on alpha in the Lasso/LassoLars problem compensates for
    # the 1/(2 * n_samples) factor in the OLS term.
    if self.method == "multi-task":
        estimator = MultiTaskLasso(
            alpha=self.hyperparameters["lambda"] / 2.0,
            fit_intercept=False,
            copy_X=True,
            max_iter=self.max_iterations,
            tol=self.tolerance,
            warm_start=False,
            random_state=None,
            selection="random",
            # positive=self.positive,
        )
    if self.method == "gradient_decent":
        estimator = Lasso(
            alpha=self.hyperparameters["lambda"] / 2.0,
            fit_intercept=False,
            copy_X=True,
            max_iter=self.max_iterations,
            tol=self.tolerance,
            warm_start=False,
            random_state=None,
            selection="random",
            positive=self.positive,
        )
    if self.method == "lars":
        estimator = LassoLars(
            alpha=self.hyperparameters["lambda"] / 2.0,
            fit_intercept=False,
            verbose=True,
            # normalize=False,
            precompute=True,
            max_iter=self.max_iterations,
            eps=2.220446049250313e-16,
            copy_X=True,
            fit_path=False,
            positive=True,
            jitter=None,
            random_state=None,
        )

    estimator.fit(Ks, ss)
    f = estimator.coef_.copy()
    if s_.shape[1] > 1 and len(self.f_shape) == 2:
        f.shape = (s_.shape[1], ) + self.f_shape
        f[:, :, 0] /= 2.0
        f[:, 0, :] /= 2.0
    elif s_.shape[1] == 1 and len(self.f_shape) == 2:
        f.shape = self.f_shape
        f[:, 0] /= 2.0
        f[0, :] /= 2.0
    f *= self.scale

    self.estimator = estimator
    self.f = f
    self.n_iter = estimator.n_iter_
    self._sol_to_csdm(s)
print "测试集得分:", ompCV.score(test_X, test_Y) print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred) print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)) print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, ompCV.predict(X)) print "TSS(Total Sum of Squares): ", tss print "RSS(Residual Sum of Squares): ", rss print "ESS(Explained Sum of Squares): ", ess print "R^2: ", r2 print "\n**********测试MultiTaskLasso类**********" # 在初始化MultiTaskLasso类时, 指定参数alpha, 默认值是1.0. multiTaskLasso = MultiTaskLasso(alpha=1.0) # 拟合训练集 multiTaskLasso.fit(train_X, train_Y) # 打印模型的系数 print "系数:", multiTaskLasso.coef_ print "截距:", multiTaskLasso.intercept_ print '训练集R2: ', r2_score(train_Y, multiTaskLasso.predict(train_X)) # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者 # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏. test_Y_pred = multiTaskLasso.predict(test_X) print "测试集得分:", multiTaskLasso.score(test_X, test_Y) print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred) print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)) print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, multiTaskLasso.predict(X)) print "TSS(Total Sum of Squares): ", tss
# sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.model_selection import train_test_split

# Split the dataset to get the necessary sub-datasets
features_train, features_test, labels_train, labels_test = train_test_split(
    features_sc, label_scm, test_size=0.33, random_state=42)

# Pre-process: dimensionality reduction (SVD)
svd1 = TruncatedSVD(n_components=9, random_state=1).fit(features_train)
features_train = svd1.transform(features_train)
svd2 = TruncatedSVD(n_components=9, random_state=1).fit(features_test)
features_test = svd2.transform(features_test)

# Do regression
mtl = MultiTaskLasso(alpha=0.000000001, random_state=1)
mtl.fit(features_train, labels_train)
print("MultiTaskLasso", mtl.score(features_test, labels_test))

######################################################################
# This part calculates the Multi-Task Elastic-Net's score when the
# hyper-parameter is optimal.
# Load the necessary libs
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.model_selection import train_test_split

# Split the dataset to get the necessary sub-datasets
features_train, features_test, labels_train, labels_test = train_test_split(
    features_sc, label_scm, test_size=0.33, random_state=42)
def crossval(labels, features, algorithm):
    features = np.nan_to_num(features)
    # features = getPCA(np.nan_to_num(features))

    # Plot a scatter of the first column of the features array
    # (assuming the PCA has been done):
    plt.figure(figsize=(20, 10))
    plt.scatter(x=labels, y=features[:, 0])
    plt.xlabel('Gestational age')
    plt.ylabel('PCA score')
    plt.title('Gestational age vs principal component one')
    plt.show()

    alpha_num = (0.01, 0.1, 0.5, 1.0, 2.0, 4.0, 5.0, 10.0, 50.0, 100.0)
    DATA_train, DATA_test, LABELS_train, LABELS_test = train_test_split(
        features, labels, test_size=0.1, random_state=42)
    if algorithm == 1:
        model = MultiTaskLasso()
        param_grid = {
            'alpha': np.random.uniform(0.1, 100, 1000),
            'fit_intercept': [True, False],
            'normalize': [True, False],
            'max_iter': np.linspace(100, 10000, num=50, dtype=int),
            'tol': np.linspace(0.000001, 0.001, num=50)
        }
        clf = RandomizedSearchCV(model, param_distributions=param_grid,
                                 cv=10, scoring='r2', random_state=42,
                                 n_iter=1000)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 2:
        model = ElasticNet()
        param_grid = {
            'alpha': np.random.uniform(0.1, 100, 1000),
            'l1_ratio': np.random.uniform(0, 1, 1000)
        }
        clf = RandomizedSearchCV(model, param_distributions=param_grid,
                                 cv=10, scoring='r2', random_state=42,
                                 n_iter=100)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 3:
        model = BayesianRidge()
        param_grid = {
            'alpha_1': np.random.uniform(0.000001, 10, 1000),
            'alpha_2': np.random.uniform(0.000001, 10, 1000),
            'lambda_1': np.random.uniform(0.000001, 10, 1000),
            'lambda_2': np.random.uniform(0.000001, 10, 1000),
            'fit_intercept': [True, False],
            'normalize': [True, False],
            'max_iter': np.linspace(100, 10000, num=50, dtype=int),
            'tol': np.linspace(0.000001, 0.001, num=50)
        }
        clf = RandomizedSearchCV(model, param_distributions=param_grid,
                                 cv=10, scoring='r2', random_state=42,
                                 n_iter=100)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 4:
        model = Lasso()
        param_grid = {
            'alpha': np.random.uniform(0.1, 100, 1000),
            'fit_intercept': [True, False],
            'normalize': [True, False],
            'max_iter': np.linspace(100, 10000, num=50, dtype=int),
            'tol': np.linspace(0.000001, 0.001, num=50)
        }
        clf = RandomizedSearchCV(model, param_distributions=param_grid,
                                 cv=10, scoring='r2', random_state=42,
                                 n_iter=1000)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 5:
        model = SVR()
        param_grid = {
            'C': np.random.uniform(0.1, 10000, 100),
            'kernel': ['rbf', 'poly', 'sigmoid'],
            # 'normalize' is not an SVR parameter and would make the search fail
            'max_iter': np.arange(100, 10000, 100, dtype=int),
            'tol': np.arange(0.000001, 0.001, 0.000001)
        }
        clf = RandomizedSearchCV(model, param_distributions=param_grid,
                                 cv=10, scoring='r2', random_state=42,
                                 n_iter=100)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 6:
        model = RandomForestRegressor()
        model.fit(DATA_train, LABELS_train)
        depth_range = np.arange(2, model.n_features_,
                                int(0.1 * model.n_features_) + 1)
        estimator_range = np.arange(20, 2020, 80)
        # num must be an integer for np.linspace
        feature_range = np.linspace(1, model.n_features_,
                                    num=int(0.4 * model.n_features_),
                                    dtype=int)
        param_grid = dict(max_depth=depth_range,
                          n_estimators=estimator_range,
                          max_features=feature_range)
        clf = RandomizedSearchCV(model, param_distributions=param_grid,
                                 cv=10, scoring='r2', random_state=42,
                                 n_iter=20)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 7:
        model = Ridge()
        param_grid = {
            'alpha': np.random.uniform(0.1, 100, 1000),
        }
        clf = GridSearchCV(model, param_grid=param_grid, cv=10,
                           scoring='r2', return_train_score=True)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 8:
        clf = LinearRegression()
        clf.fit(DATA_train, LABELS_train)
        clf.predict(DATA_test)
        return clf.score(DATA_test, LABELS_test)
    elif algorithm == 9:
        model = SVR()
        param_grid = {
            'C': np.random.uniform(0.1, 2000, 100),
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'max_iter': np.arange(100, 2000, 100, dtype=int),
            'tol': np.linspace(0.000001, 0.001, num=50)
        }
        clf = RandomizedSearchCV(model, param_distributions=param_grid,
                                 cv=10, scoring='r2', random_state=42,
                                 n_iter=100)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    else:
        print('Algorithm not recognised')
precedent[4:7, :, :, :] = block[i - 337:i - 334, :, :, :]  # the previous week
precedent_frames.append(precedent)

# regr = RandomForestRegressor(max_depth=8, random_state=0, n_estimators=1000)
model = MultiTaskLasso(alpha=1)
X_train, X_val, y_train, y_val = train_test_split(precedent_frames,
                                                  label_frames,
                                                  test_size=0.2,
                                                  random_state=4)
# Convert to 5D numpy arrays: training set (920, 7, 64, 64, 2),
# test set (231, 1, 64, 64, 2)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)
# Reshape the 5D data into the 2D layout the regressor expects
X_train = X_train.reshape((920, 7 * 64 * 64 * 2))
X_val = X_val.reshape((231, 7 * 64 * 64 * 2))
y_train = y_train.reshape((920, 1 * 64 * 64 * 2))
y_val = y_val.reshape((231, 1 * 64 * 64 * 2))
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_val, y_pred))
X, y = make_regression(noise=4, random_state=0)
print("Get dataset X with prediction y:\n", X[:1], y[:5])
reg = LassoCV(cv=5, random_state=0).fit(X, y)
print("Train the model LassoCV with 5-fold CV:\n", reg)
print("Get the score:\n", reg.score(X, y))
print("Get the prediction using the dataset:\n", reg.predict(X[:1, ]))
print("For datasets with more samples than features, LassoLarsIC is preferable")
print("-" * 200)

print("\t" * 1 + "1.1.4 Multi-task Lasso")
print("It uses a mixed L1/L2 norm penalty")
print("The Multi-Task Lasso objective is "
      "(1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21"
      "\n\twith ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}")
import numpy as np
from sklearn.linear_model import MultiTaskLasso

clf = MultiTaskLasso(alpha=0.1)
print("Create Multi-Task Lasso model:\n", clf)
print("Train Multi-Task Lasso model with 3 samples of 2 features and 3 predictions:\n",
      clf.fit([[0, 0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]]))
print("Get the coef W:\n", clf.coef_)
print("Get the intercept alpha:\n", clf.intercept_)
print("It estimates sparse coefficients for multiple regression problems jointly")
print("-" * 200)

print("\t" * 1 + "1.1.5 Elastic-Net")
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

X, y = make_regression(n_features=2, random_state=0)
print("Create dataset X with prediction y:\n", X[:1], y[:1])
regr = ElasticNet(random_state=0)
print("Create ElasticNet model:\n", regr)
print("Train ElasticNet model:\n", regr.fit(X, y))
print("Get the coef W:\n", regr.coef_)
print("Get the intercept alpha:\n", regr.intercept_)
# predict expects a 2D array, one row per sample
print("Get the prediction:\n", regr.predict([[0, 0]]))
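# The joint-sparsity property stated above can be checked directly: with
# MultiTaskLasso, a feature's coefficients are zero or nonzero for all tasks
# at once, unlike independent Lasso fits. A small sketch with made-up data:
import numpy as np
from sklearn.linear_model import MultiTaskLasso

rng = np.random.RandomState(0)
X_demo = rng.randn(60, 10)
W_true = np.zeros((3, 10))
W_true[:, :3] = rng.randn(3, 3)  # only the first 3 features matter
Y_demo = X_demo @ W_true.T + 0.01 * rng.randn(60, 3)
mtl = MultiTaskLasso(alpha=0.1).fit(X_demo, Y_demo)
row_support = np.any(mtl.coef_ != 0, axis=0)
print(row_support)  # whole features are selected or dropped together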
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
import sys
sys.path.insert(0, 'C:\\r workspace\\MultiSconES\\py')
from load_data import load_dataset

dataset = load_dataset()
X = dataset["data"]
Y = dataset["labels"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33,
                                                    random_state=42)
clf = MultiTaskLasso(alpha=1)
print("train start")
clf.fit(X_train, Y_train)
print("train end")
print("coef start")
coef_multi_task_lasso_ = clf.coef_
print("coef end")
plot_coef(coef_multi_task_lasso_)
zero_coefs = get_stats(coef_multi_task_lasso_)
print(len(zero_coefs))
Y_pred = clf.predict(X_test)
clf_score = clf.score(X_test, Y_test)
score = r2_score(Y_test[:, 5], Y_pred[:, 5])
import numpy as np
import pandas as pd
from src.common.my_data import Data
from sklearn.linear_model import LassoCV
from sklearn.linear_model import MultiTaskLasso

data = Data()
agg_train_have_log = pd.read_table(
    data.output.sorted_train_agg_have_log_usr).drop('USRID', axis=1)
print('agg_train_have_log : ', agg_train_have_log.shape)
agg_test_have_log = pd.read_table(
    data.output.sorted_test_agg_have_log_usr).drop('USRID', axis=1)
print('agg_test_have_log : ', agg_test_have_log.shape)
agg_all_have_log = pd.concat([agg_train_have_log, agg_test_have_log], axis=0)
print('agg_all_have_log : ', agg_all_have_log.shape)

tf_idf_all_have_log = pd.read_table(data.feature.tf_idf_have_log_usr_evt_all)
tf_idf_all_have_log_name = tf_idf_all_have_log.head(0)
print(tf_idf_all_have_log_name)
print('tf_idf_all_have_log : ', tf_idf_all_have_log.shape)
# print(tf_idf_all)

agg_no_have_log = pd.read_table(
    data.output.sorted_test_agg_no_have_log_usr).drop('USRID', axis=1)
print('agg_no_have_log : ', agg_no_have_log.shape)

lasso = MultiTaskLasso()
lasso.fit(agg_all_have_log, tf_idf_all_have_log)
result_lasso = lasso.predict(agg_no_have_log)
print(result_lasso)
# result_csv = pd.DataFrame(result_lasso)
# result_csv.to_csv(data.output.prediction_test_no_log_tf_idf, index=False, sep='\t')
class classSparser(object):
    def __init__(self, mapperType='PIMP', support=150,
                 projectOnSubspace=False):
        # Options are:
        #   'PIMP' for the Moore-Penrose pseudo-inverse
        #   'Regressor' for using a regression task on each dimension
        self.mapperType = mapperType
        self.sparsed_X = None
        self.transformation_matrix = None
        self.Regressor = None
        self.support = support
        self.projectOnSubspace = projectOnSubspace

    def fit(self, X, Y):
        self.sparsed_X = list()
        # First, translate points to the origin
        main_centroid = [np.mean(x) for x in np.transpose(X)]
        print('Main centroid:', main_centroid)
        X = X - main_centroid
        byClassDict = defaultdict(list)
        for i in range(len(Y)):
            byClassDict[Y[i]].append(X[i])
        class_centroids = dict()
        centroids_matrix = list()
        kindexmap = dict()
        _i = 0
        for k in byClassDict:
            class_centroid = [
                np.mean(x) for x in np.transpose(byClassDict[k])
            ]  # np.mean(byClassDict[k])
            _norm = np.linalg.norm(class_centroid)
            _scaling_factor = _norm**2  # (i+1)**2 #+ (i+_norm)
            # Play with this using _norm, i and any other function/constant
            _centroid = np.array(class_centroid)  # *(_scaling_factor)
            print('*** Class centroid:', _centroid)
            class_centroids[k] = _centroid
            centroids_matrix.append(_centroid)
            kindexmap[k] = _i
            _i += 1
        centroids_matrix = np.array(centroids_matrix)
        ortho_centroids_matrix = np.array(gram_schmidt.gs(centroids_matrix))
        ortho_centroids_matrix = normalize(ortho_centroids_matrix)
        print('*Centroids matrix', centroids_matrix)
        print('*Ortho centroids matrix', ortho_centroids_matrix)
        newX, newY = list(), list()
        ks = list()
        for k in byClassDict:
            # byClassDict[k] = np.array(byClassDict[k]) - centroids_matrix[kindexmap[k]] + np.array(ortho_centroids_matrix[kindexmap[k]])  # class_centroids[k]
            # This is the basis vector corresponding to the current class
            classvector = np.array(ortho_centroids_matrix[kindexmap[k]])
            kScalingFactor = self.support
            # This section tries to get a good scaling factor for each
            # orthonormal vector
            maxks = list()
            for _k in ks:
                projs = [
                    scalarProjection(x, classvector) for x in byClassDict[_k]
                ]
                maxk = max(projs)
                maxks.append(maxk)
            maxownk = max(
                [scalarProjection(x, classvector) for x in byClassDict[k]])
            if len(ks):
                kScalingFactor = max(maxks) + abs(maxownk) + self.support
            for v in byClassDict[k]:
                vv = np.array(v) - centroids_matrix[
                    kindexmap[k]] + classvector * kScalingFactor
                self.sparsed_X.append(vv)
                newX.append(v)
                newY.append(k)
            ks.append(k)
        self.sparsed_X = np.array(self.sparsed_X)
        if self.projectOnSubspace:
            # Project onto the new subspace spanned by the class vectors
            self.sparsed_X = np.dot(self.sparsed_X,
                                    np.transpose(centroids_matrix))
        if self.mapperType == 'PIMP':
            # self.scaler = preprocessing.StandardScaler().fit(self.sparsed_X)
            # self.sparsed_X = self.scaler.transform(self.sparsed_X)
            self.transformation_matrix = self.sparsed_X * (np.transpose(
                np.linalg.pinv(X)))
            # self.transformation_matrix = X*(np.transpose(np.linalg.pinv(self.sparsed_X)))
        if self.mapperType == 'Regressor':
            self.Regressor = MultiTaskLasso(alpha=0.00000001, max_iter=2000)
            self.Regressor.fit(newX, self.sparsed_X)
        return self.sparsed_X, newY

    def transform(self, X):
        Xs = X  # self.scaler.transform(X)
        if self.mapperType == 'PIMP':
            transformed_data = self.transformation_matrix * Xs
            # transformed_data = Xs * self.transformation_matrix
        if self.mapperType == 'Regressor':
            transformed_data = self.Regressor.predict(Xs)
        return transformed_data
n_samples = 100
n_features = 40
n_tasks = 12
rel_f = 7

# rr is assumed to be a seeded random generator defined earlier in the script,
# e.g. rr = np.random.RandomState(0)
coef = np.zeros((n_tasks, n_features))
times = np.linspace(0, 2 * np.pi, n_tasks)
for k in range(rel_f):
    coef[:, k] = np.sin((1.0 + rr.randn(1)) * times + 3 * rr.randn(1))

X = rr.randn(n_samples, n_features)
y = np.dot(X, coef.T) + rr.randn(n_samples, n_tasks)

X_train = X[:-20]
y_train = y[:-20]
X_test = X[-20:]
y_test = y[-20:]

print("Fitting LASSO model...")
ll = Lasso(alpha=0.45)
ll.fit(X_train, y_train)
print("R2 score: {0}".format(r2_score(y_test, ll.predict(X_test))))

print("Fitting Multitask LASSO model...")
ml = MultiTaskLasso(alpha=0.45)
ml.fit(X_train, y_train)
print("R2 score: {0}".format(r2_score(y_test, ml.predict(X_test))))

print("Plotting predictions...")
plt.scatter(X[:, 1], y[:, 1])
plt.scatter(X[:, 1], ll.predict(X)[:, 1], color="blue")
plt.scatter(X[:, 1], ml.predict(X)[:, 1], color="red")
plt.show()
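# Follow-up check on the comparison above: the multitask fit selects features
# jointly across the 12 tasks, so its support is made of whole columns, while
# plain Lasso may scatter nonzeros. Assumes ll and ml from the snippet above.
support_lasso = (ll.coef_ != 0)
support_mtl = (ml.coef_ != 0)
print("Lasso nonzeros:", support_lasso.sum(),
      "MultiTaskLasso nonzeros:", support_mtl.sum())
print("MTL selects whole features:",
      bool((support_mtl.any(axis=0) == support_mtl.all(axis=0)).all()))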
path_test = 'data_test.txt'
X, Y = get_data_own(path_train)
print(X.shape)
print(Y.shape)

print("Split data for CV")
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=1)
lasso = MultiTaskLasso(max_iter=max_iter, normalize=True)
print("Init train with multitasklassocv")
lassocv = MultiTaskLassoCV(alphas=None, cv=10, max_iter=max_iter,
                           verbose=True, normalize=True)
lassocv.fit(X_train, y_train)
print("Fit multitasklasso with alpha from cv lasso")
lasso.set_params(alpha=lassocv.alpha_)
lasso.fit(X_train, y_train)
print("get error metrics")
mae = mean_absolute_error(y_test, lasso.predict(X_test))
print("mae: {}".format(mae))
# RMSLE is the square root of the mean squared log error
rmsle = np.sqrt(mean_squared_log_error(y_test, lasso.predict(X_test)))
print("rmsle: {}".format(rmsle))
mape = mean_absolute_percentage_error(y_test, lasso.predict(X_test))
print("mape: {}".format(mape))
class SparseRegression:
    def __init__(self, v, delta_v, f, q, lin_args=(), force_args=(),
                 split='shuffle', split_kargs={}):
        """
        v.shape = (n_steps, n_variables), delta_v.shape = (n_steps, n_variables)
        q (in [1, n_variables]) number of first variables to fit the linear
        model to; the remaining n_variables - q are used as forcing
        f: (n_steps, n_variables) -> (n_steps, n_features)
        f will be called with f(..., *lin_args) when fitting the linear model
        and with f(..., *force_args) when fitting the force
        """
        if v.shape == delta_v.shape and type(q) == int and q > 0 \
                and q <= v.shape[1]:
            self.v, self.delta_v = self._check_reduce(v, delta_v)
            self.params = [*self.v.shape, q]  # [n_steps, n_vars, q]
            # derivatives used for the model
            self.delta_v = self.delta_v[:, :q]
            # calculate features based on first q variables for linear model
            self.features_lin_model = f(self.v[:, :q], *lin_args)
            # calculate features based on remaining variables for forcing term
            self.features_forcing = f(self.v[:, q:], *force_args)
            # two different types of splitting
            split_dict = {
                'shuffle': self._shuffle_split,
                'lorenz': self._lobes_split
            }
            # split the timesteps into two parts:
            # first is used for fitting linear model, second for forcing
            self.mask_l_m, self.mask_f = split_dict[split](**split_kargs)
            # self.mask_l_m, self.mask_f = self._split_lobes(self.v[:, 0])
            self.feature_generation = f
            self.feature_generation_args = {
                'linear': lin_args,
                'forcing': force_args
            }
        else:
            raise Exception('Error: invalid init parameter')

    def _shuffle_split(self, fraction=0.5):
        """
        creates two masks to split n_steps elements into two disjoint sets
        where the first has length = fraction * n_steps
        """
        assert fraction > 0 and fraction < 1
        n_steps = self.params[0]
        n_1 = int(n_steps * fraction)
        shuffled_ind = np.random.permutation(n_steps)
        ind_1 = shuffled_ind[:n_1]
        mask_1 = np.zeros(n_steps, dtype=bool)
        mask_1[ind_1] = True
        ind_2 = shuffled_ind[n_1:]
        mask_2 = np.zeros(n_steps, dtype=bool)
        mask_2[ind_2] = True
        # each element is part of either one or the other mask
        assert np.all(mask_1 ^ mask_2)
        return mask_1, mask_2

    def _lobes_split(self, window_pos=200, window_neg=400):
        """
        use regions in which trajectories are on the lobes to fit the linear
        model and the remaining steps for modeling the force
        """
        v_1 = self.v[:, 0]
        n_steps = self.params[0]
        # find lobe switches
        m_pos = v_1 > 0
        m_neg = v_1 < 0
        mask_switch = (m_pos[:-1] & m_neg[1:]) | (m_neg[:-1] & m_pos[1:])
        switch_ind = np.nonzero(mask_switch)[0]
        print('no. of lobe switches detected in v_1: {:d}'.format(
            len(switch_ind)))
        force_ind_list = []
        for switch in switch_ind:
            if switch + 1 - window_neg < 0:
                l_neg = switch
            else:
                l_neg = window_neg
            if switch + 1 + window_pos > n_steps:
                l_pos = n_steps - switch
            else:
                l_pos = window_pos
            force_ind_list.append(np.arange(switch - l_neg, switch + l_pos))
        force_ind = np.concatenate(force_ind_list)
        assert np.all(force_ind >= 0) and np.all(force_ind < n_steps)
        mask_lobes = np.ones(n_steps, dtype=bool)
        mask_lobes[force_ind] = False
        mask_switch = np.zeros(n_steps, dtype=bool)
        mask_switch[force_ind] = True
        assert np.all(mask_lobes ^ mask_switch)
        return mask_lobes, mask_switch

    def _check_reduce(self, v, delta_v):
        """
        check both matrices for rows containing nan and exclude them
        """
        invalid_v = np.any(np.isnan(v), axis=1)
        if np.any(invalid_v):
            print('Warning: v matrix contains NaNs')
        invalid_delta_v = np.any(np.isnan(delta_v), axis=1)
        if np.any(invalid_delta_v):
            print('Warning: delta_v matrix contains NaNs')
        valid_steps = (~invalid_v) & (~invalid_delta_v)
        valid_fraction = np.sum(valid_steps) / len(valid_steps)
        if not np.isclose(valid_fraction, 1):
            print('Warning: only {:.1%} of time steps are valid'.format(
                valid_fraction))
        if valid_fraction < 0.95:
            raise Exception('Error: less than 95% of time steps are valid')
        return v[valid_steps], delta_v[valid_steps]

    def fit_lin_model(self, alpha=None):
        """
        fit sparse linear regression on first q variables
        alpha is the penalization parameter; None triggers cross validation
        """
        if alpha is None:
            # do cross validation
            self.lin_model = \
                MultiTaskLassoCV(eps=1e-3, n_alphas=50, cv=10, n_jobs=-1,
                                 fit_intercept=False, normalize=False,
                                 max_iter=3500)
        else:
            self.lin_model = \
                MultiTaskLasso(alpha=alpha, fit_intercept=False,
                               normalize=False)
        self.lin_model.fit(self.features_lin_model[self.mask_l_m],
                           self.delta_v[self.mask_l_m])

    def pred_lin_model(self):
        """
        calculate prediction of the linear model on the data set not used
        for training it
        """
        pred_d_v = self.lin_model.predict(
            self.features_lin_model[self.mask_f])
        d_v = self.delta_v[self.mask_f]
        # calculate correlation for each variable
        n_variables = d_v.shape[1]
        print('corr. of prediction and true delta_v:')
        for i in range(n_variables):
            r, p = pearsonr(pred_d_v[:, i], d_v[:, i])
            print('{:d}th variable: r={:.2f} (p={:.2f})'.format(i + 1, r, p))
        self.eps = d_v - pred_d_v  # d_v - Af(v)

    def fit_force_params(self, alpha=None):
        """
        fit sparse linear regression on remaining n_variables - q variables
        alpha is the penalization parameter; None triggers cross validation
        """
        if alpha is None:
            # do cross validation
            self.force_model = \
                MultiTaskLassoCV(eps=1e-3, n_alphas=50, cv=10, n_jobs=-1,
                                 fit_intercept=False, normalize=False)
        else:
            self.force_model = \
                MultiTaskLasso(alpha=alpha, fit_intercept=False,
                               normalize=False)
        self.force_model.fit(self.features_forcing[self.mask_f], self.eps)

    def fit(self, alpha_lin=None, alpha_force=None):
        self.fit_lin_model(alpha=alpha_lin)
        self.pred_lin_model()
        self.fit_force_params(alpha=alpha_force)

    def plot_coefs(self, f_descr=None):
        """
        plot coef matrix of linear and force model
        f_descr(n_vars, offset, *args) -> n_features
        """
        n_f_lin_model = self.features_lin_model.shape[1]
        n_f_forcing = self.features_forcing.shape[1]
        q = self.params[-1]
        if f_descr is not None:
            # get names of the features
            f_lin_model_str = f_descr(
                q, 0, *self.feature_generation_args['linear'])
            f_forcing_str = f_descr(
                self.v.shape[1] - q, q,
                *self.feature_generation_args['forcing'])
            assert len(f_lin_model_str) == n_f_lin_model
            assert len(f_forcing_str) == n_f_forcing
        else:
            f_lin_model_str = [str(i) for i in range(n_f_lin_model)]
            f_forcing_str = [str(i) for i in range(n_f_forcing)]
        n_f = n_f_lin_model + n_f_forcing
        fractions = (n_f_lin_model / n_f, n_f_forcing / n_f)
        fig, axes = plt.subplots(ncols=2, sharey=True,
                                 gridspec_kw={'width_ratios': fractions})
        plt.subplots_adjust(wspace=0.2)
        a = self.lin_model.coef_
        b = self.force_model.coef_
        assert a.shape[0] == b.shape[0]
        n_vars = a.shape[0]
        max_abs_coef = max(abs(a.min()), abs(b.min()), a.max(), b.max())
        titles = ['A', 'B']
        matrices = [a, b]
        ticklabels = [f_lin_model_str, f_forcing_str]
        for i, ax in enumerate(axes):
            ax.set_title(titles[i])
            im = ax.imshow(matrices[i], vmin=-max_abs_coef,
                           vmax=max_abs_coef, origin='upper', cmap='seismic')
            ax.set_xticks(np.arange(len(ticklabels[i])))
            ax.set_xticklabels(ticklabels[i], rotation=45)
            ax.set_xlabel('features')
            ax.set_yticks(np.arange(n_vars))
            ax.set_yticklabels(
                ['$v_{:d}$'.format(i + 1) for i in range(n_vars)])
        axes[0].set_ylabel('variables')
        plt.colorbar(im, ax=axes, fraction=0.05, shrink=0.75)

    def _dv(self, t, v, force):
        """
        v.shape = (q,)
        force(t)
        """
        # linear part
        lin_args = self.feature_generation_args['linear']
        features_lin = \
            self.feature_generation(v.reshape(1, -1), *lin_args).squeeze()
        lin_contr = np.dot(self.lin_model.coef_, features_lin)
        # forcing part
        force_args = self.feature_generation_args['forcing']
        features_force = \
            self.feature_generation(force(t).reshape(1, -1),
                                    *force_args).squeeze()
        force_contr = np.dot(self.force_model.coef_, features_force)
        dv = lin_contr + force_contr
        return dv

    def solve_model(self, dt, ind_v_init, force=None):
        """
        use the time series of the force variables and simulate the system
        from ind_v_init
        """
        n_steps, n_vars, q = self.params
        v_init = self.v[ind_v_init, :q]
        # resemble the timesteps at which the original data was evaluated
        n_remaining = n_steps - ind_v_init
        t_remaining = dt * (n_remaining - 1)
        t_eval = np.linspace(0, t_remaining, num=n_remaining)
        if force is None:
            def f_dummy(t):
                return np.zeros(n_vars - q)
            dv = partial(self._dv, force=f_dummy)
        elif force.shape == (n_remaining, n_vars - q):
            f_interp = interp1d(t_eval, force, axis=0, kind='quadratic')
            dv = partial(self._dv, force=f_interp)
        else:
            raise Exception('invalid force')
        result = solve_ivp(dv, [0, t_remaining], v_init, t_eval=t_eval,
                           method='RK45', rtol=1e-6, atol=1e-12)
        print(result.message)
        return result
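# A hedged construction sketch for SparseRegression, assuming v/delta_v are
# trajectory and finite-difference arrays and f builds a feature library; the
# polynomial library below is illustrative, not from the original code.
import numpy as np

def poly_features(v, degree=2):
    # columns: each variable, then all pairwise products (a toy SINDy-style library)
    cols = [v]
    if degree >= 2:
        n = v.shape[1]
        cols += [(v[:, i] * v[:, j])[:, None]
                 for i in range(n) for j in range(i, n)]
    return np.hstack(cols)

# v, delta_v = ...  # (n_steps, n_variables) arrays from a simulated system
# sr = SparseRegression(v, delta_v, f=poly_features, q=3, split='shuffle')
# sr.fit(alpha_lin=0.1, alpha_force=0.1)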