Example #1
def findLassoAlpha(alpha, y, X, returnPred=False):
    X_train, X_test = X.loc['2013-10-01':'2015-04-01'], X.loc[
        '2015-05-01':'2016-04-01']
    y_train, y_test = y.loc['2013-10-01':'2015-04-01'], y.loc[
        '2015-05-01':'2016-04-01']
    datestotest = y_test.index
    dt = datestotest[0]
    lassoreg2 = MultiTaskLasso(alpha=alpha, max_iter=int(1e5))
    lassoreg2.fit(X_train, y_train)
    y_pred2 = lassoreg2.predict(X_test.loc[dt].to_numpy().reshape(1, -1))
    y_pred2 = pd.DataFrame(y_pred2)
    y_pred2.columns = y.columns
    prediction = y_pred2
    X_train = X.loc['2013-10-01':dt]
    y_train = y.loc['2013-10-01':dt]
    for dt in datestotest[1:]:
        lassoreg2 = MultiTaskLasso(alpha=alpha, max_iter=int(1e5))
        lassoreg2.fit(X_train, y_train)
        y_pred2 = lassoreg2.predict(X_test.loc[dt].to_numpy().reshape(1, -1))
        y_pred2 = pd.DataFrame(y_pred2)
        y_pred2.columns = y.columns
        prediction = pd.concat([prediction, y_pred2])
        X_train = X.loc['2013-10-01':dt]
        y_train = y.loc['2013-10-01':dt]
    prediction.index = y_test.index
    if returnPred:
        return y_test, prediction
    else:
        return mean_squared_error(y_test, prediction)
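A minimal usage sketch, not from the original source: it assumes X and y are DataFrames sharing a monthly DatetimeIndex that covers 2013-10 through 2016-04, and sweeps alpha by walk-forward MSE.

import numpy as np

alphas = np.logspace(-4, 1, 12)
errors = [findLassoAlpha(a, y, X) for a in alphas]
best_alpha = alphas[int(np.argmin(errors))]  # alpha with the lowest walk-forward MSE
y_true, y_hat = findLassoAlpha(best_alpha, y, X, returnPred=True)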
Example #2
def constrained_multiclass_solve(w, psi, alpha=1.0, **lasso_kws):
    """
    Solve

    .. math::

        \\text{argmin}_s \\|s\\|_0 \\\\
        \\text{subject to} \\|w - \\psi s\\|_2^2 \\leq tol
    """
    model = MultiTaskLasso(alpha=alpha, **lasso_kws)
    model.fit(psi, w)
    return model.coef_.T
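A small self-contained call, sketched for context; the shapes and alpha below are illustrative assumptions, not part of the original source. Despite the ||s||_0 in the docstring, the routine relaxes the problem to a MultiTaskLasso fit, so rows of the returned matrix are zeroed jointly across all columns.

import numpy as np

rng = np.random.default_rng(0)
psi = rng.standard_normal((100, 30))  # library matrix, shape (n_samples, n_atoms)
w = rng.standard_normal((100, 4))     # multiclass targets, shape (n_samples, n_classes)
s = constrained_multiclass_solve(w, psi, alpha=0.5, max_iter=10000)
print(s.shape)  # (30, 4): one coefficient column per class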
Example #3
class MultiTaskLassoImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
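A short usage sketch, assuming Op is bound to sklearn.linear_model.MultiTaskLasso in the same module (the snippet above does not show that binding):

import numpy as np
from sklearn.linear_model import MultiTaskLasso as Op

X = np.random.randn(50, 8)
y = np.random.randn(50, 3)
impl = MultiTaskLassoImpl(alpha=0.1, max_iter=5000)
print(impl.fit(X, y).predict(X).shape)  # (50, 3)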
Example #4
    def mtlasso_model(self, X_train, y_train, X_test, y_test):

        mtlasso_model = MultiTaskLasso(alpha=.005)

        mtlasso_model.fit(X_train, y_train)

        y_train_pred = mtlasso_model.predict(X_train)
        y_test_pred = mtlasso_model.predict(X_test)

        # Scoring the model
        print(mtlasso_model.score(X_train, y_train))
        print(mtlasso_model.score(X_test, y_test))
        print('MSE train: %.6f, MSE test: %.6f' % (mean_squared_error(
            y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.6f, R^2 test: %.6f' %
              (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
Example #5
def run_one_configuration(
    full_train_covariate_matrix,
    complete_target,
    new_valid_covariate_data_frames,
    new_valid_target_data_frame,
    std_data_frame,
    target_clusters,
    featurizer,
    model_name,
    parameters,
    log_file,
):
    model_baseline = dict()
    model_baseline["type"] = model_name
    model_baseline["target_clusters"] = target_clusters

    if model_name == "multi_task_lasso":
        model = MultiTaskLasso(max_iter=5000, **parameters)
    elif model_name == "xgboost":
        model = MultiOutputRegressor(
            XGBRegressor(n_jobs=10,
                         objective="reg:squarederror",
                         verbosity=0,
                         **parameters))
    else:
        raise ValueError(f"Unknown model_name: {model_name}")

    model.fit(featurizer(full_train_covariate_matrix),
              complete_target.to_numpy(copy=True))
    model_baseline["model"] = lambda x: model.predict(featurizer(x))

    skill, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        model_baseline,
        "skill",
    )
    cos_sim, _, _, _ = location_wise_metric(
        new_valid_target_data_frame,
        new_valid_covariate_data_frames,
        std_data_frame,
        model_baseline,
        "cosine-sim",
    )
    with open(log_file, "a") as f:
        f.write(f"{len(target_clusters)} {parameters} {skill} {cos_sim}\n")
Example #6
def constrained_multiclass_solve(w, psi, alpha=1.0, quiet=False, **lasso_kws):
    """
    Solve

    .. math::

        \\text{argmin}_s \\|s\\|_0 \\\\
        \\text{subject to} \\|w - \\psi s\\|_2^2 \\leq tol
    """
    model = MultiTaskLasso(alpha=alpha, **lasso_kws)

    if quiet:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            warnings.filterwarnings("ignore", category=UserWarning)
            model.fit(psi, w)
    else:
        model.fit(psi, w)

    return model.coef_.T
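The quiet flag only wraps the fit in a warnings filter; here is a quick sketch of its effect under assumed shapes. max_iter=1 would normally trigger a ConvergenceWarning (a UserWarning subclass), which quiet=True silences:

import numpy as np

rng = np.random.default_rng(1)
psi = rng.standard_normal((40, 60))
w = rng.standard_normal((40, 3))
s = constrained_multiclass_solve(w, psi, alpha=1e-6, quiet=True, max_iter=1)
print(s.shape)  # (60, 3), computed without warning output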
Example #7
def test_multitasklasso(gaussian_data, fit_intercept, normalize, alpha):

    X, y = gaussian_data
    X = [X[0], X[0]]
    n_samples = y.shape[1]

    Xty = np.array([xx.T.dot(yy) for xx, yy in zip(X, y)])
    alpha_max = np.linalg.norm(Xty, axis=0).max()
    alpha *= alpha_max / n_samples
    est = GroupLasso(alpha=alpha,
                     fit_intercept=fit_intercept,
                     normalize=normalize)
    est.fit(X, y)
    assert hasattr(est, 'is_fitted_')

    mtlasso = MultiTaskLasso(alpha=alpha,
                             fit_intercept=fit_intercept,
                             normalize=normalize)
    mtlasso.fit(X[0], y.T)
    assert_allclose(est.coef_, mtlasso.coef_.T, rtol=1e-2)
Example #8
def get_signature_genes(X, n, lda=10):
    W = np.zeros((X.shape[0], X.shape[0]))
    # coarse search from the bottom
    while (abs(W).sum(1) > 0).sum() < n:
        lda /= 10.
        model = MultiTaskLasso(alpha=lda,
                               max_iter=100,
                               tol=.001,
                               selection='random',
                               warm_start=True)
        model.fit(X.T, X.T)
        W = model.coef_.T
        #print(len(np.nonzero(abs(W).sum(1))[0]), model.score(X.T, X.T))
    # fine search from the top
    while (abs(W).sum(1) > 0).sum() > n * 1.2:
        lda *= 2.
        model.set_params(alpha=lda)
        model.fit(X.T, X.T)
        W = model.coef_.T
        #print(len(np.nonzero(abs(W).sum(1))[0]), model.score(X.T, X.T))
    # finer search
    while (abs(W).sum(1) > 0).sum() > n:
        lda *= 1.1
        model.set_params(alpha=lda)
        model.fit(X.T, X.T)
        W = model.coef_.T
        #print(len(np.nonzero(abs(W).sum(1))[0]), model.score(X.T, X.T))
    return np.nonzero(abs(W).sum(1))[0]
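A hypothetical call, with a made-up matrix; in the original, X is a genes-by-samples expression matrix. The routine shrinks and then grows alpha until roughly n genes keep a nonzero self-regression weight, and returns their row indices:

import numpy as np

X = np.random.rand(200, 50)  # 200 genes x 50 samples
idx = get_signature_genes(X, n=25)
print(len(idx))  # close to 25 selected genes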
Example #9
def make_dictionary(X,
                    n_components=20,
                    alpha=5.,
                    write_dir='/tmp/',
                    contrasts=[],
                    method='multitask',
                    l1_ratio=.5,
                    n_subjects=13):
    """Create dictionary + encoding"""
    from sklearn.decomposition import dict_learning_online, sparse_encode
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import MultiTaskLasso, MultiTaskElasticNet

    mem = Memory(write_dir, verbose=0)
    dictionary = mem.cache(initial_dictionary)(n_components, X)
    np.savez(os.path.join(write_dir, 'dictionary.npz'),
             loadings=dictionary,
             contrasts=contrasts)
    if method == 'online':
        components, dictionary = dict_learning_online(X.T,
                                                      n_components,
                                                      alpha=alpha,
                                                      dict_init=dictionary,
                                                      batch_size=200,
                                                      method='cd',
                                                      return_code=True,
                                                      shuffle=True,
                                                      n_jobs=1,
                                                      positive_code=True)
        np.savez(os.path.join(write_dir, 'dictionary.npz'),
                 loadings=dictionary,
                 contrasts=contrasts)
    elif method == 'sparse':
        components = sparse_encode(X.T,
                                   dictionary,
                                   alpha=alpha,
                                   max_iter=10,
                                   n_jobs=1,
                                   check_input=True,
                                   verbose=0,
                                   positive=True)
    elif method == 'multitask':
        # too many hard-typed parameters !!!
        n_voxels = X.shape[1] // n_subjects
        components = np.zeros((X.shape[1], n_components))
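        # NOTE: the MultiTaskLasso below is immediately overwritten by the
        # MultiTaskElasticNet, so only the elastic-net model is actually used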
        clf = MultiTaskLasso(alpha=alpha)
        clf = MultiTaskElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        for i in range(n_voxels):
            x = X[:, i:i + n_subjects * n_voxels:n_voxels]
            components[i: i + n_subjects * n_voxels: n_voxels] =\
                clf.fit(dictionary.T, x).coef_
    return dictionary, components
Example #10
#print(pca.explained_variance_ratio_) 
X_train_reduced = combined_features.transform(X_train_scaled)
X_test_reduced = combined_features.transform(X_test_scaled)

## Create K folds
k_fold = KFold(n_splits=10)
for train, test in k_fold.split(X_train_reduced):
    X1 = X_train_reduced[train]
    Y1 = Y_train_raw[train]
    
    X2 = X_train_reduced[test]
    Y2 = Y_train_raw[test]    

    ## Train Classifiers on fold
    mcl_clf = MultiTaskLasso(alpha=.3)
    mcl_clf.fit(X1, Y1)


    ## Score Classifiers on fold

    mcl_clf_score = mcl_clf.score(X2, Y2)

    print "MultiTaskLasso:  ", mcl_clf_score



## Lasso CV for parameter optimization
t1 = time.time()
clf = MultiTaskLasso(alpha=.3).fit(X_train_reduced, Y_train_raw)
t_lasso_cv = time.time() - t1
print('time to train', t_lasso_cv)
Example #11
    def __init__(
            self,
            species: str,
            reprocess: Optional[bool] = False,
            gene_selection_method: Optional[Literal['deg', 'lasso',
                                                    'elastic-net']] = 'deg',
            model_cache_dir: Optional[str] = None,
            alpha: Optional[Union[float, Sequence[float]]] = 1e-2,
            learning_rate: Optional[float] = 1e-3,
            equal_weight: Optional[bool] = True,
            train_split: Optional[float] = 0.8,
            n_jobs: Optional[int] = 15,
            remove_correlated: Optional[Literal['both', 'ct',
                                                'region']] = None,
            normalize: Optional[bool] = False,
            dim_reduction: Optional[str] = None,
            n_components: Optional[int] = None):
        super().__init__()
        torch.set_num_threads(n_jobs)

        filename = f'{species}_ex_colors'

        self.learning_rate = learning_rate
        self.device = 'cpu'

        # Use saved data if possible
        if not reprocess and os.path.exists(
                f'withcolors_preprocessed/{filename}.pickle'):
            with open(f'withcolors_preprocessed/{filename}.pickle',
                      mode='rb') as file:
                data_dict = pickle.load(file)
                self.data = data_dict['data']
                self.ct_axis_mask = data_dict['ct_axis_mask']
                self.r_axis_mask = data_dict['r_axis_mask']
                # No need to do anything else
                return

        species_data = sc.read(f'withcolors/{filename}.h5ad')
        if dim_reduction is not None:
            sc.pp.pca(species_data, n_comps=n_components)
            sc.pp.highly_variable_genes(species_data)
            sc.pp.neighbors(species_data, n_pcs=n_components)
            if dim_reduction == 'pca':
                sc.tl.pca(species_data, n_comps=n_components)
            elif dim_reduction == 'umap':
                sc.tl.umap(species_data, n_components=n_components)
            elif dim_reduction == 'tsne':
                sc.tl.tsne(species_data, n_pcs=n_components)
            species_data = AnnData(species_data.obsm[f'X_{dim_reduction}'],
                                   obs=species_data.obs)
            species_data.var.index = pd.Index([
                f'{dim_reduction}{x}'
                for x in range(len(species_data.var.index))
            ])

        # Label each observation with its subregion and species
        species_data.obs['clusters'] = species_data.obs['clusters'].apply(
            lambda s: species[0].upper() + '_' + s)
        species_data.obs['subregion'] = species_data.obs['clusters'].apply(
            lambda s: s.split('.')[0])
        self.n_var = len(species_data.var.index)
        self.n_subregions = len(np.unique(species_data.obs['subregion']))
        self.n_clusters = len(np.unique(species_data.obs['clusters']))
        self.n_obs = len(species_data.obs.index)

        if gene_selection_method == 'deg':
            self._deg_select(dim_reduction, species_data)
        elif gene_selection_method in ['lasso', 'elastic-net']:
            # if isinstance(alpha, float):
            #     alpha = [alpha]
            for label in ['subregion', 'clusters']:
                if equal_weight:
                    # get count of number of occurrences of each label
                    label_to_count = species_data.obs[label].value_counts(
                        normalize=True).to_dict()
                    # Map each observation to its appropriate label appearance frequency
                    w = species_data.obs[label].map(label_to_count)
                    # Diagonalize and take square root to appropriately normalize data
                    w = np.diag(np.sqrt(w))
                    # normalize data
                    transcriptomes = np.matmul(w, species_data.X.toarray())
                else:
                    transcriptomes = species_data.X.toarray()
                model_file = f'{model_cache_dir}/{gene_selection_method}/' \
                             f'{species[0].upper()}_normalized-{equal_weight}_{label}_a-{alpha}.pt'
                if model_cache_dir is not None and os.path.exists(model_file):
                    with open(model_file, 'rb') as file:
                        model = pickle.load(file)
                else:
                    # Create one-hot encoding of labels
                    num_labels = self.n_subregions if label == 'subregion' else self.n_clusters
                    label_to_id = {
                        r: i
                        for i, r in enumerate(
                            np.unique(species_data.obs[label]))
                    }
                    labels = species_data.obs[label].map(label_to_id)
                    labels_expanded = np.zeros((self.n_obs, num_labels))
                    labels_expanded[np.arange(self.n_obs), labels] = 1
                    if gene_selection_method == 'lasso':
                        model = MultiTaskLasso(alpha=alpha, max_iter=10000)
                    else:
                        model = MultiTaskElasticNet(alpha=alpha,
                                                    max_iter=10000)
                    model.fit(transcriptomes, labels_expanded)
                    with open(model_file, 'wb') as file:
                        pickle.dump(model, file, protocol=5)
                max_weight_per_gene = (model.coef_ != 0).max(axis=0)
                # # define the model
                # model = nn.Sequential(
                #     # nn.BatchNorm1d(self.n_var),
                #     nn.Linear(self.n_var, num_labels)
                # )
                # model_file = f'{model_cache_dir}_{label}.pt'
                # if model_cache_dir is None or not os.path.exists(model_file):
                #     print(f'\nTraining lasso on {label}.\n')
                #     # Create the dataset and dataloader
                #     ds = SparseDataSet(species_data, label)
                #     train_size = int(train_split * len(ds))
                #     val_size = len(ds) - train_size
                #     train_ds, val_ds = torch.utils.data.random_split(ds, [train_size, val_size])
                #     train_dl = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE, num_workers=0)
                #     val_dl = DataLoader(val_ds, shuffle=True, batch_size=BATCH_SIZE, num_workers=0)
                #     optimizer = optim.Adam(model.parameters(), lr=self.learning_rate)
                #     # train
                #     num_nonzero_features_by_alpha = []
                #     for alpha in alpha:
                #         loss_history = self._train_model(model, train_dl, val_dl, optimizer, alpha=alpha, epochs=50)
                #         max_weight_per_gene = torch.abs(model[-1].weight).max(dim=0)[0]
                #         num_nonzero_features_by_alpha.append([(max_weight_per_gene > 1e-4).sum(), alpha])
                #         # save the model
                #         torch.save(model.state_dict(), model_file)
                #         plt.plot(loss_history[:, 0], label='train loss')
                #         plt.plot(loss_history[:, 1], label='val loss')
                #         plt.legend()
                #         plt.show()
                #     num_nonzero_features_by_alpha = np.array(num_nonzero_features_by_alpha)
                #     plt.plot(num_nonzero_features_by_alpha[:, 0], num_nonzero_features_by_alpha[:, 1])
                #     plt.savefig('num_features_selected_v_l1_weight.pdf')
                #     plt.show()
                # else:
                #     model.load_state_dict(torch.load(model_file))
                # # Get the max weight per gene to see whether it's relevant to at least one subregion
                # with torch.no_grad():
                #     max_weight_per_gene = torch.abs(model[-1].weight).max(dim=0)[0]
                #     with torch.no_grad():
                #         sns.distplot(max_weight_per_gene)
                #         plt.show()
                if label == 'subregion':
                    self.r_axis_mask = max_weight_per_gene != 0
                else:
                    self.ct_axis_mask = max_weight_per_gene != 0
        print(
            f'Before removing correlated genes, found {self.r_axis_mask.sum()} region genes '
            f'and {self.ct_axis_mask.sum()} cell type genes.')

        if remove_correlated is not None:
            self._remove_r_ct_correlated(remove_correlated, species_data)
            print(
                f'After removing correlated genes, found {self.r_axis_mask.sum()} region genes '
                f'and {self.ct_axis_mask.sum()} cell type genes.')

        # Average transcriptomes within each cell type and put into data frame with cell types as rows and genes as cols
        ct_names = np.unique(species_data.obs['clusters'])
        ct_avg_data = [
            species_data[species_data.obs['clusters'] == ct].X.mean(axis=0)
            for ct in ct_names
        ]
        self.data = pd.concat([
            pd.DataFrame(data.reshape((1, -1)),
                         columns=species_data.var.index,
                         index=[cluster_name])
            for data, cluster_name in zip(ct_avg_data, ct_names)
        ])
        # Divide each gene column by its mean, as in Tosches et al., so that
        # expression levels are comparable across cell types
        if normalize:
            self.data = self.data.div(self.data.mean(axis=0).to_numpy(),
                                      axis=1)  # noqa

        # Save data
        data_dict = {
            'data': self.data,
            'ct_axis_mask': self.ct_axis_mask,
            'r_axis_mask': self.r_axis_mask
        }
        with open(f'withcolors_preprocessed/{filename}.pickle',
                  mode='wb') as file:
            pickle.dump(data_dict, file)
Example #12
class classSparser(object):
    def __init__(self, mapperType='PIMP', support=150, projectOnSubspace=False):
        #options are
        #'PIMP' for Moore Penrose Pseudo Inverse
        #'Regressor' for using a regression task on each dimension
        self.mapperType = mapperType
        self.sparsed_X = None
        self.transformation_matrix = None
        self.Regressor = None
        self.support = support
        self.projectOnSubspace = projectOnSubspace

    def fit(self,X,Y):
        self.sparsed_X = list()
        #First, translate points to the origin
        main_centroid = [ np.mean(x) for x in np.transpose(X) ]
        print('Main centroid:', main_centroid)
        X = X - main_centroid

        byClassDict = defaultdict(list)
        for i in range(len(Y)):
            byClassDict[Y[i]].append(X[i])


        class_centroids = dict()

        centroids_matrix = list()
        kindexmap = dict()

        _i = 0
        for k in byClassDict:
            class_centroid = [ np.mean(x) for x in np.transpose(byClassDict[k]) ] #np.mean(byClassDict[k])
            _norm = np.linalg.norm(class_centroid)
            _scaling_factor = _norm**2 #(i+1)**2 #+ (i+_norm)  #Play with this using _norm, i and any other function/constant
            _centroid = np.array(class_centroid) #*(_scaling_factor)
            print('*** Class centroid:', _centroid)
            class_centroids[k] = _centroid
            centroids_matrix.append(_centroid)
            kindexmap[k] = _i
            _i+=1

        centroids_matrix = np.array(centroids_matrix)
        ortho_centroids_matrix = np.array(gram_schmidt.gs(centroids_matrix))
        ortho_centroids_matrix = normalize(ortho_centroids_matrix)

        print('*Centroids matrix', centroids_matrix)
        print('*Ortho centroids matrix', ortho_centroids_matrix)


        newX, newY = list(), list()
        ks = list()
        for k in byClassDict:
            #byClassDict[k] = np.array(byClassDict[k]) - centroids_matrix[kindexmap[k]] + np.array(ortho_centroids_matrix[kindexmap[k]]) #class_centroids[k]

            #this is the basis vector corresponding to current class
            classvector = np.array(ortho_centroids_matrix[kindexmap[k]])
            kScalingFactor = self.support

            #This section tries to get a good scaling factor for each orthonormal vector
            maxks = list()
            for _k in ks:
                projs = [scalarProjection(x,classvector) for x in byClassDict[_k]]
                maxk = max(projs)
                maxks.append(maxk)

                maxownk = max([scalarProjection(x,classvector) for x in byClassDict[k]])

            if len(ks):
                kScalingFactor = max(maxks) + abs(maxownk) + self.support


            for v in byClassDict[k]:
                vv = np.array(v) - centroids_matrix[kindexmap[k]] + classvector*kScalingFactor
                self.sparsed_X.append(vv)
                newX.append(v)
                newY.append(k)
                ks.append(k)

        self.sparsed_X = np.array(self.sparsed_X)

        if self.projectOnSubspace:
            #Project onto the new subspace spanned by the class vectors
            self.sparsed_X = np.dot(self.sparsed_X,np.transpose(centroids_matrix) )


        if self.mapperType == 'PIMP':
            #self.scaler = preprocessing.StandardScaler().fit(self.sparsed_X)
            #self.sparsed_X = self.scaler.transform(self.sparsed_X)

            self.transformation_matrix = self.sparsed_X*(np.transpose(np.linalg.pinv(X) ) )
            #self.transformation_matrix = X*(np.transpose(np.linalg.pinv(self.sparsed_X) ) )

        if self.mapperType == 'Regressor':
            self.Regressor = MultiTaskLasso(alpha=0.00000001,max_iter=2000)
            self.Regressor.fit(newX,self.sparsed_X)

        return self.sparsed_X, newY


    def transform(self,X):
        Xs = X#self.scaler.transform(X)
        if self.mapperType == 'PIMP':
            transformed_data = self.transformation_matrix*Xs
            #transformed_data = Xs*self.transformation_matrix
        if self.mapperType == 'Regressor':
            transformed_data = self.Regressor.predict(Xs)

        return transformed_data
Example #13
    def fit(self, K, s):
        r"""Fit the model using the coordinate descent method from scikit-learn.

        Args
        ----

        K: ndarray
            The :math:`m \times n` kernel matrix, :math:`{\bf K}`. A numpy array of
            shape (m, n).
        s: ndarray or CSDM object.
            A csdm object or an equivalent numpy array holding the signal,
            :math:`{\bf s}`, as a :math:`m \times m_\text{count}` matrix.
        """
        s_, self.scale = prepare_signal(s)

        prod = np.asarray(self.f_shape).prod()
        if K.shape[1] != prod:
            raise ValueError(
                "The product of the shape, `f_shape`, must be equal to the length of "
                f"the axis 1 of kernel, K, {K.shape[1]} != {prod}.")

        alpha = s_.size * self.hyperparameters["alpha"]
        Ks, ss = _get_augmented_data(K=K,
                                     s=s_,
                                     alpha=alpha,
                                     regularizer=self.regularizer,
                                     f_shape=self.f_shape)

        # The factor 0.5 on alpha in the Lasso/LassoLars problem compensates
        # for the 1/(2 * n_samples) factor in the OLS term
        if self.method == "multi-task":
            estimator = MultiTaskLasso(
                alpha=self.hyperparameters["lambda"] / 2.0,
                fit_intercept=False,
                copy_X=True,
                max_iter=self.max_iterations,
                tol=self.tolerance,
                warm_start=False,
                random_state=None,
                selection="random",
                # positive=self.positive,
            )

        if self.method == "gradient_decent":
            estimator = Lasso(
                alpha=self.hyperparameters["lambda"] / 2.0,
                fit_intercept=False,
                copy_X=True,
                max_iter=self.max_iterations,
                tol=self.tolerance,
                warm_start=False,
                random_state=None,
                selection="random",
                positive=self.positive,
            )

        if self.method == "lars":
            estimator = LassoLars(
                alpha=self.hyperparameters["lambda"] / 2.0,
                fit_intercept=False,
                verbose=True,
                # normalize=False,
                precompute=True,
                max_iter=self.max_iterations,
                eps=2.220446049250313e-16,
                copy_X=True,
                fit_path=False,
                positive=True,
                jitter=None,
                random_state=None,
            )

        estimator.fit(Ks, ss)
        f = estimator.coef_.copy()
        if s_.shape[1] > 1 and len(self.f_shape) == 2:
            f.shape = (s_.shape[1], ) + self.f_shape
            f[:, :, 0] /= 2.0
            f[:, 0, :] /= 2.0
        elif s_.shape[1] == 1 and len(self.f_shape) == 2:
            f.shape = self.f_shape
            f[:, 0] /= 2.0
            f[0, :] /= 2.0

        f *= self.scale
        self.estimator = estimator
        self.f = f
        self.n_iter = estimator.n_iter_
        self._sol_to_csdm(s)
Example #14
    print "测试集得分:", ompCV.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    tss, rss, ess, r2 = xss(Y, ompCV.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试MultiTaskLasso类**********"
    # 在初始化MultiTaskLasso类时, 指定参数alpha, 默认值是1.0.
    multiTaskLasso = MultiTaskLasso(alpha=1.0)
    # 拟合训练集
    multiTaskLasso.fit(train_X, train_Y)
    # 打印模型的系数
    print "系数:", multiTaskLasso.coef_
    print "截距:", multiTaskLasso.intercept_
    print '训练集R2: ', r2_score(train_Y, multiTaskLasso.predict(train_X))

    # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者
    # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏.
    test_Y_pred = multiTaskLasso.predict(test_X)
    print "测试集得分:", multiTaskLasso.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    tss, rss, ess, r2 = xss(Y, multiTaskLasso.predict(X))
    print "TSS(Total Sum of Squares): ", tss
Example #15
from sklearn.model_selection import train_test_split

#split dataset to get necessary sub-datasets
features_train, features_test, labels_train, labels_test = train_test_split(
    features_sc, label_scm, test_size=0.33, random_state=42)

#pre-process: dimensional reduction (SVD); reuse the SVD fitted on the
#training set so that train and test share the same components
svd1 = TruncatedSVD(n_components=9, random_state=1).fit(features_train)
features_train = svd1.transform(features_train)
features_test = svd1.transform(features_test)

#do regression
mtl = MultiTaskLasso(alpha=0.000000001, random_state=1)
mtl.fit(features_train, labels_train)
print "MultiTaskLasso", mtl.score(features_test, labels_test)

######################################################################
#this part is used to calculate the Multi-Task Elastic-net's score when the hyper-parameter is optimal

#load necessary libs
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.model_selection import train_test_split

#split dataset to get necessary sub-datasets
features_train, features_test, labels_train, labels_test = train_test_split(
    features_sc, label_scm, test_size=0.33, random_state=42)
Example #16
def crossval(labels, features, algorithm):
    features = np.nan_to_num(features)
    #features = getPCA(np.nan_to_num(features))
    #plot scatter of the first column of the features array (assuming the pca has been done):
    #'''
    plt.figure(figsize=(20, 10))
    plt.scatter(x=labels, y=features[:, 0])
    plt.xlabel('Gestational age')
    plt.ylabel('PCA score')
    plt.title('Gestational age vs principal component one')
    plt.show()
    #'''
    alpha_num = (0.01, 0.1, 0.5, 1.0, 2.0, 4.0, 5.0, 10.0, 50.0, 100.0)
    DATA_train, DATA_test, LABELS_train, LABELS_test = train_test_split(
        features, labels, test_size=0.1, random_state=42)

    if algorithm == 1:
        model = MultiTaskLasso()
        param_grid = {
            'alpha': np.random.uniform(0.1, 100, 1000),
            'fit_intercept': [True, False],
            'normalize': [True, False],
            'max_iter': np.linspace(100, 10000, num=50, dtype=int),
            'tol': np.linspace(0.000001, 0.001, num=50)
        }
        clf = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 cv=10,
                                 scoring='r2',
                                 random_state=42,
                                 n_iter=1000)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 2:
        model = ElasticNet()
        param_grid = {
            'alpha': np.random.uniform(0.1, 100, 1000),
            'l1_ratio': np.random.uniform(0, 1, 1000)
        }
        clf = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 cv=10,
                                 scoring='r2',
                                 random_state=42,
                                 n_iter=100)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 3:
        model = BayesianRidge()
        param_grid = {
            'alpha_1': np.random.uniform(0.000001, 10, 1000),
            'alpha_2': np.random.uniform(0.000001, 10, 1000),
            'lambda_1': np.random.uniform(0.000001, 10, 1000),
            'lambda_2': np.random.uniform(0.000001, 10, 1000),
            'fit_intercept': [True, False],
            'normalize': [True, False],
            'max_iter': np.linspace(100, 10000, num=50, dtype=int),
            'tol': np.linspace(0.000001, 0.001, num=50)
        }
        clf = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 cv=10,
                                 scoring='r2',
                                 random_state=42,
                                 n_iter=100)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 4:
        model = Lasso()
        param_grid = {
            'alpha': np.random.uniform(0.1, 100, 1000),
            'fit_intercept': [True, False],
            'normalize': [True, False],
            'max_iter': np.linspace(100, 10000, num=50, dtype=int),
            'tol': np.linspace(0.000001, 0.001, num=50)
        }
        clf = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 cv=10,
                                 scoring='r2',
                                 random_state=42,
                                 n_iter=1000)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 5:
        model = SVR()
        param_grid = {
            'C': np.random.uniform(0.1, 10000, 100),
            'kernel': ['rbf', 'poly', 'sigmoid'],
            'max_iter': np.arange(100, 10000, 100, dtype=int),
            'tol': np.arange(0.000001, 0.001, 0.000001)
        }
        clf = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 cv=10,
                                 scoring='r2',
                                 random_state=42,
                                 n_iter=100)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid

    elif algorithm == 6:
        model = RandomForestRegressor()
        model.fit(DATA_train, LABELS_train)
        depth_range = np.arange(2, model.n_features_in_,
                                int(0.1 * model.n_features_in_) + 1)
        estimator_range = np.arange(20, 2020, 80)
        feature_range = np.linspace(1,
                                    model.n_features_in_,
                                    num=int(0.4 * model.n_features_in_),
                                    dtype=int)
        param_grid = dict(max_depth=depth_range,
                          n_estimators=estimator_range,
                          max_features=feature_range)
        clf = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 cv=10,
                                 scoring='r2',
                                 random_state=42,
                                 n_iter=20)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 7:
        model = Ridge()
        param_grid = {
            'alpha': np.random.uniform(0.1, 100, 1000),
        }
        clf = GridSearchCV(model,
                           param_grid=param_grid,
                           cv=10,
                           scoring='r2',
                           return_train_score=True)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    elif algorithm == 8:
        clf = LinearRegression()
        clf.fit(DATA_train, LABELS_train)
        clf.predict(DATA_test)
        return clf.score(DATA_test, LABELS_test)
    elif algorithm == 9:
        model = SVR()
        param_grid = {
            'C': np.random.uniform(0.1, 2000, 100),
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'max_iter': np.arange(100, 2000, 100, dtype=int),
            'tol': np.linspace(0.000001, 0.001, num=50)
        }
        clf = RandomizedSearchCV(model,
                                 param_distributions=param_grid,
                                 cv=10,
                                 scoring='r2',
                                 random_state=42,
                                 n_iter=100)
        clf.fit(DATA_train, LABELS_train)
        paramopti = clf.best_params_
        score = clf.best_score_
        score_grid = clf.cv_results_
        return paramopti, score, score_grid
    else:
        print('Algorithm not recognised')
Example #17
    precedent[4:7, :, :, :] = block[i - 337:i - 334, :, :, :]  # previous week
    precedent_frames.append(precedent)

#regr = RandomForestRegressor(max_depth=8, random_state=0, n_estimators=1000)
model = MultiTaskLasso(alpha=1)

X_train, X_val, y_train, y_val = train_test_split(precedent_frames,
                                                  label_frames,
                                                  test_size=0.2,
                                                  random_state=4)
# Convert to 5-D numpy arrays: train (920, 7, 64, 64, 2), validation (231, 1, 64, 64, 2)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)

print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)
# Flatten the 5-D data into the 2-D input the regressor expects
X_train = X_train.reshape((920, 7 * 64 * 64 * 2))
X_val = X_val.reshape((231, 7 * 64 * 64 * 2))
y_train = y_train.reshape((920, 1 * 64 * 64 * 2))
y_val = y_val.reshape((231, 1 * 64 * 64 * 2))

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_val, y_pred))
Example #18
X, y = make_regression(noise=4, random_state=0)
print("Get dataset X with prediction y:\n", X[:1], y[:5])
reg = LassoCV(cv=5, random_state=0).fit(X, y)
print("Train the model LassoCV with 5-fold CV:\n", reg)
print("Get the score:\n", reg.score(X, y))
print("Get the prediction using the dataset:\n", reg.predict(X[:1, ]))
print("For dataset with more sample than features LassoLarsIC is preferable")
print("-" * 200)
print("\t"*1 + "1.1.4 Multi-task Lasso")
print("It use with L1 and L2 norm")
print("Multi-Task Lasso formula is (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21\n\tWith ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2}")
import numpy as np
from sklearn.linear_model import MultiTaskLasso
clf = MultiTaskLasso(alpha=0.1)
print("Create Multi-Task Lasso model:\n", clf)
print("Train Multi-Task Lasso model with 3 sample of 2 features and 3 predictions:\n", clf.fit([[0, 0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]]))
print("Get the coef W:\n", clf.coef_)
print("Get the intercept alpha:\n", clf.intercept_)
print("It estimate sparse coefficients for multiple regression problems")
print("-" * 200)
print("\t"*1 + "1.1.5 Elastic-Net")
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
X, y = make_regression(n_features=2, random_state=0)
print("Create dataset X with prediction y:\n", X[:1], y[:1])
regr = ElasticNet(random_state=0)
print("Create ElasticNet model:\n", regr)
print("Train ElasticNet model:\n", regr.fit(X, y))
print("Get the coef W:\n", regr.coef_)
print("Get the intercept alpha:\n", regr.intercept_)
print("Get the score:\n", regr.predict([0, 0]))
Example #19
    #
    #    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.4, random_state=0)

    import sys
    sys.path.insert(0, 'C:\\r workspace\\MultiSconES\\py')
    from load_data import load_dataset

    dataset = load_dataset()
    X = dataset["data"]
    Y = dataset["labels"]

    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.33,
                                                        random_state=42)

    clf = MultiTaskLasso(alpha=1)
    print("train start")
    clf.fit(X_train, Y_train)
    print("train end")
    print("coef start")
    coef_multi_task_lasso_ = clf.coef_
    print("coef end")
    plot_coef(coef_multi_task_lasso_)
    zero_coefs = get_stats(coef_multi_task_lasso_)
    print(len(zero_coefs))

    Y_pred = clf.predict(X_test)
    clf_score = clf.score(X_test, Y_test)
    score = r2_score(Y_test[:, 5], Y_pred[:, 5])
Example #20
import numpy as np
import pandas as pd
from src.common.my_data import Data
from sklearn.linear_model import LassoCV
from sklearn.linear_model import MultiTaskLasso

data = Data()

agg_train_have_log = pd.read_table(data.output.sorted_train_agg_have_log_usr).drop('USRID', axis=1)
print('agg_train_have_log : ', agg_train_have_log.shape)
agg_test_have_log = pd.read_table(data.output.sorted_test_agg_have_log_usr).drop('USRID', axis=1)
print('agg_test_have_log : ', agg_test_have_log.shape)
agg_all_have_log = pd.concat([agg_train_have_log, agg_test_have_log], axis=0)
print('agg_all_have_log : ', agg_all_have_log.shape)

tf_idf_all_have_log = pd.read_table(data.feature.tf_idf_have_log_usr_evt_all)
tf_idf_all_have_log_name = tf_idf_all_have_log.head(0)
print(tf_idf_all_have_log_name)
print('tf_idf_all_have_log : ', tf_idf_all_have_log.shape)
# print(tf_idf_all)

agg_no_have_log = pd.read_table(data.output.sorted_test_agg_no_have_log_usr).drop('USRID', axis=1)

print('agg_no_have_log : ', agg_no_have_log.shape)

lasso = MultiTaskLasso()
lasso.fit(agg_all_have_log, tf_idf_all_have_log)
result_lasso = lasso.predict(agg_no_have_log)
print(result_lasso)
# result_csv = pd.DataFrame(result_lasso)
# data.to_csv(data.output.prediction_test_no_log_tf_idf, index=False, sep='\t')
Example #23
    rr = np.random.RandomState(0)  # assumed: the source snippet uses rr without showing its definition
    n_samples = 100
    n_features = 40
    n_tasks = 12
    rel_f = 7
    coef = np.zeros((n_tasks, n_features))
    times = np.linspace(0, 2 * np.pi, n_tasks)
    for k in range(rel_f):
        coef[:, k] = np.sin((1.0 + rr.randn(1)) * times + 3 * rr.randn(1))
    X = rr.randn(n_samples, n_features)
    y = np.dot(X, coef.T) + rr.randn(n_samples, n_tasks)
    X_train = X[:-20]
    y_train = y[:-20]
    X_test = X[-20:]
    y_test = y[-20:]

    print("Fitting LASSO model...")
    ll = Lasso(alpha=0.45)
    ll.fit(X_train, y_train)
    print("R2 score: {0}".format(r2_score(y_test, ll.predict(X_test))))

    print("Fitting Multitask LASSO model...")
    ml = MultiTaskLasso(alpha=0.45)
    ml.fit(X_train, y_train)
    print("R2 score: {0}".format(r2_score(y_test, ml.predict(X_test))))

    print("Plotting predictions...")
    plt.scatter(X[:, 1], y[:, 1])
    plt.scatter(X[:, 1], ll.predict(X)[:, 1], color="blue")
    plt.scatter(X[:, 1], ml.predict(X)[:, 1], color="red")
    plt.show()
Example #24
path_test = 'data_test.txt'

X, Y = get_data_own(path_train)

print(X.shape)
print(Y.shape)

print("Split data for CV")
X_train, X_test , y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

lasso = MultiTaskLasso(max_iter=max_iter, normalize=True)

print("Init train with multitasklassocv")
lassocv = MultiTaskLassoCV(alphas=None, cv=10, max_iter=max_iter, verbose=True, normalize=True)
lassocv.fit(X_train, y_train)

print("Fit multitasklasso with alpha from cv lasso")
lasso.set_params(alpha=lassocv.alpha_)
lasso.fit(X_train, y_train)

print("get mean square error")
mae = mean_absolute_error(y_test, lasso.predict(X_test))
print("mae: {}".format(mae))
rmsle = mean_squared_log_error(y_test, lasso.predict(X_test)) ** 0.5
print("rmsle: {}".format(rmsle))
mape = mean_absolute_percentage_error(y_test, lasso.predict(X_test))
print("mape: {}".format(mape))



Example #25
class SparseRegression:
    def __init__(self,
                 v,
                 delta_v,
                 f,
                 q,
                 lin_args=(),
                 force_args=(),
                 split='shuffle',
                 split_kargs={}):
        """
        v.shape = (n_steps, n_variables),
        delta_v.shape = (n_steps, n_variables)

        q (in [1, n_variables]) number of first variables
        to fit the linear model to,
        remaining n_variables-q are used as forcing

        f: (n_steps, n_variables) -> (n_steps, n_features)
        f will be called with f(..., *lin_args) when fitting the linear model
        and with f(..., *force_args) when fitting the force
        """
        if v.shape == delta_v.shape and type(q) == int and q > 0 \
                and q <= v.shape[1]:
            self.v, self.delta_v = self._check_reduce(v, delta_v)
            self.params = [*self.v.shape, q]  # [n_steps, n_vars, q]
            # derivatives used for the model
            self.delta_v = self.delta_v[:, :q]
            # calculate features based on first q variables for linear model
            self.features_lin_model = f(self.v[:, :q], *lin_args)
            # calculate features based on remaining variables for forcing term
            self.features_forcing = f(self.v[:, q:], *force_args)
            # two different types of splitting
            split_dict = {
                'shuffle': self._shuffle_split,
                'lorenz': self._lobes_split
            }
            # split the timesteps into two parts:
            # first is used for fitting linear model, second for forcing
            self.mask_l_m, self.mask_f = split_dict[split](**split_kargs)
            # self.mask_l_m, self.mask_f = self._split_lobes(self.v[:, 0])
            self.feature_generation = f
            self.feature_generation_args = {
                'linear': lin_args,
                'forcing': force_args
            }
        else:
            raise Exception('Error: invalid init parameter')

    def _shuffle_split(self, fraction=0.5):
        """
        creates two masks that split n_steps elements into two disjoint sets,
        where the first has length fraction * n_steps
        """
        assert fraction > 0 and fraction < 1
        n_steps = self.params[0]
        n_1 = int(n_steps * fraction)
        shuffled_ind = np.random.permutation(n_steps)

        ind_1 = shuffled_ind[:n_1]
        mask_1 = np.zeros(n_steps, dtype=bool)
        mask_1[ind_1] = True

        ind_2 = shuffled_ind[n_1:]
        mask_2 = np.zeros(n_steps, dtype=bool)
        mask_2[ind_2] = True

        # each element is part of either one or the other mask
        assert np.all(mask_1 ^ mask_2)
        return mask_1, mask_2

    def _lobes_split(self, window_pos=200, window_neg=400):
        """
        use regions in which trajectories are on the lobes to fit the
        linear model and the remaining steps for modeling the force
        """
        v_1 = self.v[:, 0]
        n_steps = self.params[0]
        # find lobe switches
        m_pos = v_1 > 0
        m_neg = v_1 < 0
        mask_switch = (m_pos[:-1] & m_neg[1:]) | (m_neg[:-1] & m_pos[1:])
        switch_ind = np.nonzero(mask_switch)[0]
        print('no. of lobe switches detected in v_1: {:d}'.format(
            len(switch_ind)))
        force_ind_list = []
        for switch in switch_ind:
            if switch + 1 - window_neg < 0:
                l_neg = switch
            else:
                l_neg = window_neg
            if switch + 1 + window_pos > n_steps:
                l_pos = n_steps - switch
            else:
                l_pos = window_pos
            force_ind_list.append(np.arange(switch - l_neg, switch + l_pos))
        force_ind = np.concatenate(force_ind_list)
        assert np.all(force_ind >= 0) and np.all(force_ind < n_steps)

        mask_lobes = np.ones(n_steps, dtype=bool)
        mask_lobes[force_ind] = False
        mask_switch = np.zeros(n_steps, dtype=bool)
        mask_switch[force_ind] = True
        assert np.all(mask_lobes ^ mask_switch)
        return mask_lobes, mask_switch

    def _check_reduce(self, v, delta_v):
        """
        check both matrices for time steps (rows) containing NaNs and exclude them
        """
        invalid_v = np.any(np.isnan(v), axis=1)
        if np.any(invalid_v):
            print('Warning: v matrix contains NaNs')
        invalid_delta_v = np.any(np.isnan(delta_v), axis=1)
        if np.any(invalid_delta_v):
            print('Warning: delta_v matrix contains NaNs')
        valid_steps = (~invalid_v) & (~invalid_delta_v)
        valid_fraction = np.sum(valid_steps) / len(valid_steps)
        if not np.isclose(valid_fraction, 1):
            print('Warning: only {:.1%} of time steps are valid'.format(
                valid_fraction))
        if valid_fraction < 0.95:
            raise Exception('Error: less than 95% of time steps are valid')
        return v[valid_steps], delta_v[valid_steps]

    def fit_lin_model(self, alpha=None):
        """
        fit sparse linear regression on the first q variables;
        alpha is the penalization parameter, None triggers cross-validation
        """
        if alpha is None:  # do cross validation
            self.lin_model = \
                MultiTaskLassoCV(eps=1e-3, n_alphas=50, cv=10, n_jobs=-1,
                                 fit_intercept=False, normalize=False,
                                 max_iter=3500)
        else:
            self.lin_model = \
                MultiTaskLasso(alpha=alpha, fit_intercept=False,
                               normalize=False)
        self.lin_model.fit(self.features_lin_model[self.mask_l_m],
                           self.delta_v[self.mask_l_m])

    def pred_lin_model(self):
        """
        calculate prediction of the linear model on the data set not used for
        training it
        """
        pred_d_v = self.lin_model.predict(self.features_lin_model[self.mask_f])
        d_v = self.delta_v[self.mask_f]
        # calculate correlation for each variable
        n_variables = d_v.shape[1]
        print('corr. of prediction and true delta_v:')
        for i in range(n_variables):
            r, p = pearsonr(pred_d_v[:, i], d_v[:, i])
            print('{:d}th variable: r={:.2f} (p={:.2f})'.format(i + 1, r, p))
        self.eps = d_v - pred_d_v  # d_v - Af(v)

    def fit_force_params(self, alpha=None):
        """
        fit sparse linear regression on the remaining n_variables - q variables;
        alpha is the penalization parameter, None triggers cross-validation
        """
        if alpha is None:  # do cross validation
            self.force_model = \
                MultiTaskLassoCV(eps=1e-3, n_alphas=50, cv=10, n_jobs=-1,
                                 fit_intercept=False, normalize=False)
        else:
            self.force_model = \
                MultiTaskLasso(alpha=alpha, fit_intercept=False,
                               normalize=False)
        self.force_model.fit(self.features_forcing[self.mask_f], self.eps)

    def fit(self, alpha_lin=None, alpha_force=None):
        self.fit_lin_model(alpha=alpha_lin)
        self.pred_lin_model()
        self.fit_force_params(alpha=alpha_force)

    def plot_coefs(self, f_descr=None):
        """
        plot coef matrix of linear and force model
        f_descr(n_vars, offset, *args) -> n_features
        """
        n_f_lin_model = self.features_lin_model.shape[1]
        n_f_forcing = self.features_forcing.shape[1]
        q = self.params[-1]
        if f_descr is not None:
            # get names of the features
            f_lin_model_str = f_descr(q, 0,
                                      *self.feature_generation_args['linear'])
            f_forcing_str = f_descr(self.v.shape[1] - q, q,
                                    *self.feature_generation_args['forcing'])
            assert len(f_lin_model_str) == n_f_lin_model
            assert len(f_forcing_str) == n_f_forcing
        else:
            f_lin_model_str = \
                [str(i) for i in range(n_f_lin_model)]
            f_forcing_str = \
                [str(i) for i in range(n_f_forcing)]

        n_f = n_f_lin_model + n_f_forcing
        fractions = (n_f_lin_model / n_f, n_f_forcing / n_f)
        fig, axes = plt.subplots(ncols=2,
                                 sharey=True,
                                 gridspec_kw={'width_ratios': fractions})
        plt.subplots_adjust(wspace=0.2)
        a = self.lin_model.coef_
        b = self.force_model.coef_
        assert a.shape[0] == b.shape[0]
        n_vars = a.shape[0]
        max_abs_coef = max(abs(a.min()), abs(b.min()), a.max(), b.max())

        titles = ['A', 'B']
        matrices = [a, b]
        ticklabels = [f_lin_model_str, f_forcing_str]
        for i, ax in enumerate(axes):
            ax.set_title(titles[i])
            im = ax.imshow(matrices[i],
                           vmin=-max_abs_coef,
                           vmax=max_abs_coef,
                           origin='upper',
                           cmap='seismic')
            ax.set_xticks(np.arange(len(ticklabels[i])))
            ax.set_xticklabels(ticklabels[i], rotation=45)
            ax.set_xlabel('features')
            ax.set_yticks(np.arange(n_vars))
            ax.set_yticklabels(
                ['$v_{:d}$'.format(i + 1) for i in range(n_vars)])
        axes[0].set_ylabel('variables')
        plt.colorbar(im, ax=axes, fraction=0.05, shrink=0.75)

    def _dv(self, t, v, force):
        """
        v.shape = (q,)
        force(t)
        """
        # linear part
        lin_args = self.feature_generation_args['linear']
        features_lin = \
            self.feature_generation(v.reshape(1, -1), *lin_args).squeeze()
        lin_contr = np.dot(self.lin_model.coef_, features_lin)
        # forcing part
        force_args = self.feature_generation_args['forcing']
        features_force = \
            self.feature_generation(force(t).reshape(1, -1),
                                    *force_args).squeeze()
        force_contr = np.dot(self.force_model.coef_, features_force)
        dv = lin_contr + force_contr
        return dv

    def solve_model(self, dt, ind_v_init, force=None):
        """
        use the time series of the force variables and simulate the system
        from ind_v_init
        """
        n_steps, n_vars, q = self.params
        v_init = self.v[ind_v_init, :q]
        # reproduce the timesteps at which the original data was evaluated
        n_remaining = n_steps - ind_v_init
        t_remaining = dt * (n_remaining - 1)
        t_eval = np.linspace(0, t_remaining, num=n_remaining)
        if force is None:

            def f_dummy(t):
                return np.zeros(n_vars - q)

            dv = partial(self._dv, force=f_dummy)
        elif force.shape == (n_remaining, n_vars - q):
            f_interp = interp1d(t_eval, force, axis=0, kind='quadratic')
            dv = partial(self._dv, force=f_interp)
        else:
            raise Exception('invalid force')

        result = solve_ivp(dv, [0, t_remaining],
                           v_init,
                           t_eval=t_eval,
                           method='RK45',
                           rtol=1e-6,
                           atol=1e-12)
        print(result.message)
        return result