def transform(self, X):
    """Apply every transformer to X independently and combine the outputs.

    Parameters
    ----------
    X : iterable or array-like, depending on transformers
        Input data to be transformed.

    Returns
    -------
    X_t : ndarray, sparse matrix, pandas object or sequence
        ``np.zeros((X.shape[0], 0))`` when ``self._iter()`` yields no
        transformer. Otherwise the per-transformer results combined as
        follows: ``sparse.vstack(...).tocsr()`` if any result is sparse,
        ``np.vstack`` if the first result is an ndarray, and
        ``pd.concat(..., axis=1)`` if the first result is a pandas
        Series or DataFrame. Results of any other type are returned
        uncombined, as produced by the parallel call.

    Notes
    -----
    NOTE(review): sparse and ndarray results are stacked row-wise
    (``vstack``) whereas pandas results are joined column-wise
    (``axis=1``); confirm that this asymmetry is intended.
    """
    runner = Parallel(n_jobs=self.n_jobs)
    outputs = runner(
        delayed(_transform_one)(trans, X, None, weight)
        for name, trans, weight in self._iter())

    if not outputs:
        # All transformers are None
        return np.zeros((X.shape[0], 0))

    if any(sparse.issparse(out) for out in outputs):
        return sparse.vstack(outputs).tocsr()

    # Only the first result decides how dense outputs are combined.
    head = outputs[0]
    if isinstance(head, np.ndarray):
        return np.vstack(outputs)
    if isinstance(head, (pd.Series, pd.DataFrame)):
        return pd.concat(outputs, axis=1)
    return outputs
def _resample_model(estimator_func, X, y, scaling=.5, n_resampling=200,
                    n_jobs=None, verbose=False, pre_dispatch='3*n_jobs',
                    random_state=None, sample_fraction=.75, **params):
    """Average ``estimator_func`` outputs over randomised resamplings.

    For each of the ``n_resampling`` rounds a random per-feature weight
    vector and a random boolean sample mask are drawn, and
    ``estimator_func(X, y, weights=..., mask=..., verbose=..., **params)``
    is scheduled through joblib. The returned values are summed and
    divided by ``n_resampling``.

    Parameters
    ----------
    estimator_func : callable
        Called once per resampling with the keyword arguments above.
    X : array with a two-element ``shape`` (n_samples, n_features)
    y : passed through to ``estimator_func`` unchanged
    scaling : float
        Must satisfy ``0 < scaling < 1``. Each feature weight is either
        ``0`` or ``1 - scaling``.
    n_resampling : int
        Number of randomised calls.
    n_jobs, verbose, pre_dispatch
        Forwarded to ``Parallel``; ``estimator_func`` receives
        ``max(0, verbose - 1)`` as its verbosity.
    random_state : anything accepted by ``check_random_state``
    sample_fraction : float
        A sample is kept in the mask when a uniform draw is below it.
    **params
        Extra keyword arguments for ``estimator_func``.

    Returns
    -------
    scores_ : the mean of the values returned by ``estimator_func``.

    Raises
    ------
    ValueError
        If ``scaling`` is not strictly between 0 and 1.
    """
    rng = check_random_state(random_state)
    n_samples, n_features = X.shape

    if not 0 < scaling < 1:
        raise ValueError(
            "'scaling' should be between 0 and 1. Got %r instead." % scaling)

    # We are generating 1 - weights, and not weights
    complement = 1. - scaling
    inner_verbose = max(0, verbose - 1)

    def _jobs():
        for _ in range(n_resampling):
            # Weights are drawn before the mask so that the random stream
            # is consumed in a fixed order.
            feature_weights = complement * rng.randint(
                0, 2, size=(n_features,))
            sample_mask = rng.rand(n_samples) < sample_fraction
            yield delayed(estimator_func)(
                X, y, weights=feature_weights, mask=sample_mask,
                verbose=inner_verbose, **params)

    runner = Parallel(n_jobs=n_jobs, verbose=verbose,
                      pre_dispatch=pre_dispatch)
    scores_ = 0.0
    for active_set in runner(_jobs()):
        scores_ += active_set

    scores_ /= n_resampling
    return scores_
def fit(self, X, y, categories="auto"):
    """Fit one clone of ``self.estimator`` per category except the last.

    Parameters
    ----------
    X : training data, forwarded to ``_parallel_fit_estimator``.
    y : target values
        A pandas Series exposing the ``.cat`` accessor is replaced by its
        integer category codes before fitting.
    categories : ignored
        The argument is always overwritten with
        ``list(range(self.n_classes_))``.

    Returns
    -------
    self
        With ``n_classes_`` set to ``np.max(y) + 1`` and ``estimators_``
        holding the results of the parallel fits, in category order.
    """
    # this is hard-coded for categorical variables
    is_categorical_series = isinstance(y, pd.Series) and hasattr(y, "cat")
    if is_categorical_series:
        y = y.cat.codes

    self.n_classes_ = np.max(y) + 1
    categories = list(range(self.n_classes_))

    # order of estimators
    runner = Parallel(n_jobs=self.n_jobs)
    self.estimators_ = runner(
        delayed(_parallel_fit_estimator)(clone(self.estimator), X, y, cat)
        for cat in categories[:-1])
    return self
def validation_curve(estimator, X, y, param_name, param_range, groups=None,
                     cv=None, scoring=None, n_jobs=None, pre_dispatch="all",
                     verbose=0, error_score=np.nan):
    """Fit and score an estimator for every CV split and parameter value.

    A clone of ``estimator`` is fitted through ``_fit_and_score`` for each
    (split, value) pair, with ``param_name`` set to the value.

    Parameters
    ----------
    estimator : estimator object, cloned for every fit
    X, y, groups : data, targets and optional group labels
    param_name : str
        Name of the parameter that is varied.
    param_range : sequence
        Values tried for ``param_name``; its ``len`` must be defined.
    cv, scoring : forwarded to ``check_cv`` / ``check_scoring``
    n_jobs, pre_dispatch, verbose : forwarded to ``Parallel``
    error_score : forwarded to ``_fit_and_score``

    Returns
    -------
    estimators : object array, column 4 of the collected results
    scores_0, scores_1 : float64 arrays of shape (n_params, n_cv_folds)
        Columns 0 and 1 of the collected results (presumably train and
        test scores, following ``_fit_and_score``'s return order --
        confirm).
    fit_time, score_time : float64 arrays
        Columns 2 and 3 of the collected results, one entry per fit.
    """
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)

    runner = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                      verbose=verbose)
    raw = runner(
        delayed(_fit_and_score)(
            clone(estimator), X, y, scorer, train, test, verbose,
            parameters={param_name: value}, fit_params=None,
            return_train_score=True, error_score=error_score,
            return_estimator=True, return_times=True)
        # NOTE do not change order of iteration to allow one time cv splitters
        for train, test in cv.split(X, y, groups)
        for value in param_range)

    table = np.asarray(raw)
    estimators = table[:, 4]
    fit_time = table[:, 2]
    score_time = table[:, 3]

    # Rows are ordered split-major, so they reshape to (fold, param, kind)
    # before being transposed to (kind, param, fold).
    scores = np.asarray(table[:, :2])
    n_params = len(param_range)
    n_cv_folds = scores.shape[0] // n_params
    scores = scores.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0))

    return (estimators,
            np.float64(scores[0]),
            np.float64(scores[1]),
            np.float64(fit_time),
            np.float64(score_time))
def fit(self, X, y):
    """Fit one binary perceptron per class and store the stacked results.

    Parameters
    ----------
    X : training data; passed through ``self._augment`` before fitting.
    y : class labels; encoded with a ``LabelEncoder`` so that the class
        indices handed to the binary fits start at zero.

    Returns
    -------
    self
        With ``classes_`` set to the encoder's classes and ``weights_``
        set to ``np.array`` of the per-class results of
        ``_fit_binary_perceptron``.
    """
    augmented = self._augment(X)

    # Encode the labels so that class indices start from zero.
    encoder = LabelEncoder()
    y_codes = encoder.fit_transform(y)
    self.classes_ = encoder.classes_
    class_indices = range(len(self.classes_))

    # One binary problem per class, dispatched through joblib with a
    # preference for threads.
    runner = Parallel(n_jobs=self.n_jobs, prefer='threads',
                      verbose=self.verbose)
    per_class = runner(
        delayed(_fit_binary_perceptron)(
            augmented, y_codes, c, self.eta0, self.decay,
            self.max_iterations)
        for c in class_indices)

    # Store final result for prediction
    self.weights_ = np.array(per_class)
    return self
def transform(self, X):
    """Transform X with every transformer.

    Parameters
    ----------
    X : iterable or array-like, depending on transformers
        Input data to be transformed.

    Returns
    -------
    result
        When ``self.use_in_model`` is truthy, whatever the parent class's
        ``transform`` returns for ``X``. Otherwise the value of
        ``self.get_result_as_dictionary`` applied to the list of
        per-transformer outputs.
    """
    if not self.use_in_model:
        runner = Parallel(n_jobs=self.n_jobs)
        outputs = runner(
            delayed(_transform_one)(trans, X, None, weight)
            for _, trans, weight in self._iter())
        return self.get_result_as_dictionary(outputs)

    # Inside a model, defer entirely to the parent implementation.
    return super(CustomFeatureUnion, self).transform(X)
def fit(self, X, y=None):
    """Fit every transformer on X.

    Parameters
    ----------
    X : iterable or array-like, depending on transformers
        Input data, used to fit transformers.

    y : array-like, shape (n_samples, ...), optional
        Targets for supervised learning.

    Returns
    -------
    self : FeatureUnion
        This estimator, with its transformer list updated from the
        results of the parallel fits.
    """
    # Work on a list copy of the transformer list before validating it.
    self.transformer_list = list(self.transformer_list)
    self._validate_transformers()

    runner = Parallel(n_jobs=self.n_jobs)
    fitted = runner(
        delayed(_fit_one_transformer)(trans, X, y)
        for _name, trans, _weight in self._iter())

    self._update_transformer_list(fitted)
    return self
def fit_transform(self, X, y=None, **fit_params):
    """Fit every transformer, transform X with each, combine the outputs.

    Parameters
    ----------
    X : iterable or array-like, depending on transformers
        Input data to be transformed.

    y : array-like, shape (n_samples, ...), optional
        Targets for supervised learning.

    **fit_params
        Forwarded to ``_fit_transform_one`` for every transformer.

    Returns
    -------
    X_t : ndarray, sparse matrix, pandas object or tuple
        ``np.zeros((X.shape[0], 0))`` when ``self._iter()`` yields no
        transformer. Otherwise the per-transformer results combined as
        follows: ``sparse.vstack(...).tocsr()`` if any result is sparse,
        ``np.vstack`` if the first result is an ndarray, and
        ``pd.concat(..., axis=1)`` if the first result is a pandas
        Series or DataFrame. Results of any other type are returned
        uncombined as a tuple.

    Notes
    -----
    NOTE(review): sparse and ndarray results are stacked row-wise
    (``vstack``) whereas pandas results are joined column-wise
    (``axis=1``); confirm that this asymmetry is intended.
    """
    self._validate_transformers()

    runner = Parallel(n_jobs=self.n_jobs)
    fitted = runner(
        delayed(_fit_transform_one)(trans, X, y, weight, **fit_params)
        for name, trans, weight in self._iter())

    if not fitted:
        # All transformers are None
        return np.zeros((X.shape[0], 0))

    # Each job returns a (transformed data, fitted transformer) pair.
    outputs, transformers = zip(*fitted)
    self._update_transformer_list(transformers)

    if any(sparse.issparse(out) for out in outputs):
        return sparse.vstack(outputs).tocsr()

    # Only the first result decides how dense outputs are combined.
    head = outputs[0]
    if isinstance(head, np.ndarray):
        return np.vstack(outputs)
    if isinstance(head, (pd.Series, pd.DataFrame)):
        return pd.concat(outputs, axis=1)
    return outputs
def fit(self, X, y, sample_weight=None):
    """Fit the estimators.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted.
        Note that this is supported only if all underlying estimators
        that are not None support sample weights.

    Returns
    -------
    self : object
        With ``le_``, ``classes_``, ``estimators_`` (the fitted clones of
        every non-None estimator, in order) and ``named_estimators_``
        (name -> fitted clone, for non-None estimators only) set.

    Raises
    ------
    NotImplementedError
        If ``y`` is an ndarray with more than one column.
    ValueError
        If ``voting`` is neither 'soft' nor 'hard', if the number of
        weights differs from the number of estimators, if
        ``sample_weight`` is given but a non-None estimator does not
        accept it, or if every estimator is None.
    AttributeError
        If ``estimators`` is None or empty.
    """
    if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
        raise NotImplementedError('Multilabel and multi-output'
                                  ' classification is not supported.')

    if self.voting not in ('soft', 'hard'):
        raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                         % self.voting)

    if self.estimators is None or len(self.estimators) == 0:
        raise AttributeError('Invalid `estimators` attribute, `estimators`'
                             ' should be a list of (string, estimator)'
                             ' tuples')

    if (self.weights is not None and
            len(self.weights) != len(self.estimators)):
        raise ValueError('Number of classifiers and weights must be equal'
                         '; got %d weights, %d estimators'
                         % (len(self.weights), len(self.estimators)))

    if sample_weight is not None:
        for name, step in self.estimators:
            # Estimators set to None are never fitted, so they must not
            # be inspected for sample-weight support.
            if step is None:
                continue
            if not has_fit_parameter(step, 'sample_weight'):
                raise ValueError('Underlying estimator \'%s\' does not'
                                 ' support sample weights.' % name)

    names, clfs = zip(*self.estimators)
    self._validate_names(names)

    if all(clf is None for clf in clfs):
        raise ValueError('All estimators are None. At least one is '
                         'required to be a classifier!')

    self.le_ = LabelEncoder().fit(y)
    self.classes_ = self.le_.classes_
    transformed_y = self.le_.transform(y)

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
                                         sample_weight=sample_weight)
        for clf in clfs if clf is not None)

    # ``estimators_`` only contains the non-None estimators, so walk it in
    # step with the non-None entries to keep names and fits aligned.
    self.named_estimators_ = Bunch()
    fitted = iter(self.estimators_)
    for name, clf in zip(names, clfs):
        if clf is not None:
            self.named_estimators_[name] = next(fitted)
    return self
def fit(self, X, y=None):
    """Select ``eta`` and ``mu`` by cross-validation, then fit the model.

    Every ``(eta, mu)`` pair from the product of the two grids is scored
    on each cross-validation split through ``flgl_path``. The pair with
    the best mean score is then used for a final fit on the empirical
    covariance of the whole data with ``two_layers_fixed_links_GL``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data from which to compute the covariance estimate
    y : (ignored)
        Only forwarded to the cross-validation splitter.

    Returns
    -------
    self
        With ``eta_``, ``mu_``, ``cv_parameters_`` (all pairs tried),
        ``covariance_``, ``precision_``, ``hidden_``, ``R_``, ``n_iter_``,
        ``location_`` and ``X_train`` set. ``self.random_state`` is
        replaced by the object returned from ``check_random_state``.
    """
    # Covariance does not make sense for a single feature
    self.random_state = check_random_state(self.random_state)
    X = check_array(X, ensure_min_features=2, ensure_min_samples=2,
                    estimator=self)
    self.X_train = X

    if self.assume_centered:
        # NOTE(review): this has shape (n_samples, n_features) whereas the
        # branch below yields one value per feature -- confirm intended.
        self.location_ = np.zeros((X.shape[0], X.shape[1]))
    else:
        self.location_ = X.mean(0)

    emp_cov = empirical_covariance(
        X, assume_centered=self.assume_centered)

    X = check_array(X, ensure_min_features=2, estimator=self)
    cv = check_cv(self.cv, y, classifier=False)

    # ``self.etas`` / ``self.mus`` are either explicit sequences of values
    # or the number of points of a log-spaced, descending grid that spans
    # two decades below ``par_max(emp_cov)``.
    n_etas = self.etas
    if isinstance(n_etas, Sequence):
        etas = self.etas
    else:
        eta_1 = par_max(emp_cov)
        eta_0 = 1e-2 * eta_1
        etas = np.logspace(np.log10(eta_0), np.log10(eta_1), n_etas)[::-1]

    n_mus = self.mus
    if isinstance(n_mus, Sequence):
        mus = self.mus
    else:
        mu_1 = par_max(emp_cov)  # not sure is the best strategy
        mu_0 = 1e-2 * mu_1
        mus = np.logspace(np.log10(mu_0), np.log10(mu_1), n_mus)[::-1]

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ConvergenceWarning)
        this_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(flgl_path)(
                X[train], links=self.links, etas=etas, mus=mus,
                X_test=X[test], tol=self.tol,
                max_iter=int(.1 * self.max_iter),
                update_rho=self.update_rho, verbose=0,
                random_state=self.random_state)
            for train, test in cv.split(X, y))

    # Little dance to transform the per-fold lists into per-combination
    # tuples: after the transposition, element i gathers the values of
    # combination i across all folds.
    covs, _precs, _hidds, scores = zip(*this_path)
    covs = zip(*covs)
    scores = zip(*scores)

    combinations = list(product(etas, mus))
    # List of (combination, scores, covs), largest combination first.
    path = sorted(zip(combinations, scores, covs),
                  key=operator.itemgetter(0), reverse=True)

    # Find the maximum (avoid using built in 'max' function to have a
    # fully-reproducible selection of the smallest combination in case
    # of equality): ``>=`` keeps the last, i.e. smallest, of tied entries.
    # If every mean score is NaN, ``best_index`` is never bound.
    best_score = -np.inf
    for index, (_, fold_scores, _) in enumerate(path):
        this_score = np.mean(fold_scores)
        if this_score >= .1 / np.finfo(np.float64).eps:
            this_score = np.nan
        if this_score >= best_score:
            best_score = this_score
            best_index = index

    # ``best_index`` refers to the sorted path, so the winning pair must
    # be read from it; ``combinations`` keeps the unsorted product order,
    # which differs when user-supplied grids are not descending.
    best_eta, best_mu = path[best_index][0]
    self.eta_ = best_eta
    self.mu_ = best_mu
    self.cv_parameters_ = combinations

    # Finally fit the model with the selected parameters
    (self.covariance_, self.precision_, self.hidden_, self.R_,
     self.n_iter_) = two_layers_fixed_links_GL(
        emp_cov, self.links, eta=best_eta, mu=best_mu, tol=self.tol,
        max_iter=self.max_iter, verbose=self.verbose,
        random_state=self.random_state, compute_objective=True,
        return_n_iter=True)
    return self
def __call__(self, value):
    """Call every step on ``value`` through joblib and aggregate the results.

    Each callable in ``self.steps`` receives ``value`` as its only
    argument; the list of their return values is handed to
    ``self.aggregate``, whose result is returned.
    """
    runner = Parallel(n_jobs=self.n_jobs)
    outputs = runner(delayed(step)(value) for step in self.steps)
    return self.aggregate(outputs)
def kneighbors(self, X, n_neighbors=None, return_distance=True):
    """Finds the K-neighbors of a point.

    Returns indices of and distances to the neighbors of each point.

    Parameters
    ----------
    X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]),
        or numpy ndarray with shape([n_cases,n_readings,n_dimensions])
        The query points. When ``X`` is None the fitted data
        (``self._fit_X``) is queried against itself and each sample is
        excluded from its own neighbor list.

    n_neighbors : int
        Number of neighbors to get (default is the value
        passed to the constructor).

    return_distance : boolean, optional. Defaults to True.
        If False, distances will not be returned

    Returns
    -------
    dist : array
        Array representing the lengths to points, only present if
        return_distance=True

    ind : array
        Indices of the nearest points in the population matrix.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is not positive, if it exceeds the number of
        fitted samples, if a tree-based fit method receives sparse input,
        or if ``self._fit_method`` is not recognized.
    TypeError
        If ``n_neighbors`` is not an integer type.

    Examples
    --------
    NOTE(review): the example below uses scikit-learn's NearestNeighbors
    and presumably was carried over from it; confirm it applies here.

    In the following example, we construct a NeighborsClassifier
    class from an array representing our data set and ask who's
    the closest point to [1,1,1]

    >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
    >>> from sklearn.neighbors import NearestNeighbors
    >>> neigh = NearestNeighbors(n_neighbors=1)
    >>> neigh.fit(samples) # doctest: +ELLIPSIS
    NearestNeighbors(algorithm='auto', leaf_size=30, ...)
    >>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS
    (array([[0.5]]), array([[2]]))

    As you can see, it returns [[0.5]], and [[2]], which means that the
    element is at distance 0.5 and is the third element of samples
    (indexes start at 0). You can also query for multiple points:

    >>> X = [[0., 1., 0.], [1., 0., 1.]]
    >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS
    array([[1],
           [2]]...)

    """
    # NOTE(review): this runs before the ``X is None`` branch below;
    # confirm the helper accepts None.
    check_data_sktime_tsc(X)
    check_is_fitted(self, "_fit_method")

    if n_neighbors is None:
        n_neighbors = self.n_neighbors
    elif n_neighbors <= 0:
        raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
    else:
        if not np.issubdtype(type(n_neighbors), np.integer):
            raise TypeError("n_neighbors does not take %s value, "
                            "enter integer value" % type(n_neighbors))

    if X is not None:
        query_is_train = False
        X = check_array(X, accept_sparse='csr', allow_nd=True)
    else:
        query_is_train = True
        X = self._fit_X
        # Include an extra neighbor to account for the sample itself being
        # returned, which is removed later
        n_neighbors += 1

    train_size = self._fit_X.shape[0]
    if n_neighbors > train_size:
        raise ValueError("Expected n_neighbors <= n_samples, "
                         " but n_samples = %d, n_neighbors = %d" %
                         (train_size, n_neighbors))
    n_samples = X.shape[0]
    # Column vector of row indices; compared against the neighbor indices
    # below to locate each sample in its own neighbor list.
    sample_range = np.arange(n_samples)[:, None]

    n_jobs = effective_n_jobs(self.n_jobs)
    if self._fit_method == 'brute':

        reduce_func = partial(self._kneighbors_reduce_func,
                              n_neighbors=n_neighbors,
                              return_distance=return_distance)

        # for efficiency, use squared euclidean distances
        kwds = ({
            'squared': True
        } if self.effective_metric_ == 'euclidean' else
                self.effective_metric_params_)

        result = pairwise_distances_chunked(X,
                                            self._fit_X,
                                            reduce_func=reduce_func,
                                            metric=self.effective_metric_,
                                            n_jobs=n_jobs,
                                            **kwds)

    elif self._fit_method in ['ball_tree', 'kd_tree']:
        if issparse(X):
            raise ValueError(
                "%s does not work with sparse matrices. Densify the data, "
                "or set algorithm='brute'" % self._fit_method)
        if LooseVersion(joblib_version) < LooseVersion('0.12'):
            # Deal with change of API in joblib
            delayed_query = delayed(self._tree.query, check_pickle=False)
            parallel_kwargs = {"backend": "threading"}
        else:
            delayed_query = delayed(self._tree.query)
            parallel_kwargs = {"prefer": "threads"}
        # One tree query per slice of X, with slices produced by
        # gen_even_slices for the effective number of jobs.
        result = Parallel(n_jobs, **parallel_kwargs)(
            delayed_query(X[s], n_neighbors, return_distance)
            for s in gen_even_slices(X.shape[0], n_jobs))
    else:
        raise ValueError("internal: _fit_method not recognized")

    # Stitch the per-chunk results back into full arrays.
    if return_distance:
        dist, neigh_ind = zip(*result)
        result = np.vstack(dist), np.vstack(neigh_ind)
    else:
        result = np.vstack(result)

    if not query_is_train:
        return result
    else:
        # If the query data is the same as the indexed data, we would like
        # to ignore the first nearest neighbor of every sample, i.e
        # the sample itself.
        if return_distance:
            dist, neigh_ind = result
        else:
            neigh_ind = result

        sample_mask = neigh_ind != sample_range

        # Corner case: When the number of duplicates are more
        # than the number of neighbors, the first NN will not
        # be the sample, but a duplicate.
        # In that case mask the first duplicate.
        dup_gr_nbrs = np.all(sample_mask, axis=1)
        sample_mask[:, 0][dup_gr_nbrs] = False

        # Exactly one entry per row is masked out, so the remaining
        # entries reshape to n_neighbors - 1 columns.
        neigh_ind = np.reshape(neigh_ind[sample_mask],
                               (n_samples, n_neighbors - 1))

        if return_distance:
            dist = np.reshape(dist[sample_mask],
                              (n_samples, n_neighbors - 1))
            return dist, neigh_ind
        return neigh_ind
def fit(self, X, y=None, groups=None, **fit_params):
    """Run fit with all sets of parameters.

    Every candidate parameter setting handed over by ``self._run_search``
    is evaluated on every cross-validation split. Each evaluation works
    on a fresh clone of the estimator whose ``cachedir`` (or the
    ``cachedir`` of a Pipeline's final estimator) is pointed at a
    per-fold sub-directory of ``self.cachedir``.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    **fit_params : dict of string -> object
        Parameters passed to the ``fit`` method of the estimator

    Returns
    -------
    self
        With ``cv_results_``, ``scorer_`` and ``n_splits_`` set. The
        ``best_index_``, ``best_params_`` and ``best_score_`` attributes
        are set unless scoring is multi-metric with ``refit`` disabled;
        ``best_estimator_`` and ``refit_time_`` are set when ``refit`` is
        enabled; ``cv_estimators`` is set when ``return_estimator`` is
        enabled.

    Raises
    ------
    ValueError
        For multi-metric scoring when ``refit`` is neither False nor the
        name of one of the scorers.
    """
    if self.fit_params is not None:
        warnings.warn('"fit_params" as a constructor argument was '
                      'deprecated in version 0.19 and will be removed '
                      'in version 0.21. Pass fit parameters to the '
                      '"fit" method instead.', DeprecationWarning)
        if fit_params:
            warnings.warn('Ignoring fit_params passed as a constructor '
                          'argument in favor of keyword arguments to '
                          'the "fit" method.', RuntimeWarning)
        else:
            fit_params = self.fit_params
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

    scorers, self.multimetric_ = _check_multimetric_scoring(
        self.estimator, scoring=self.scoring)

    if self.multimetric_:
        if self.refit is not False and (
                not isinstance(self.refit, six.string_types) or
                # This will work for both dict / list (tuple)
                self.refit not in scorers):
            raise ValueError("For multi-metric scoring, the parameter "
                             "refit must be set to a scorer key "
                             "to refit an estimator with the best "
                             "parameter setting on the whole data and "
                             "make the best_* attributes "
                             "available for that metric. If this is not "
                             "needed, refit should be set to False "
                             "explicitly. %r was passed." % self.refit)
        else:
            refit_metric = self.refit
    else:
        refit_metric = 'score'

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)

    base_estimator = clone(self.estimator)

    parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        pre_dispatch=self.pre_dispatch)

    fit_and_score_kwargs = dict(scorer=scorers,
                                fit_params=fit_params,
                                return_train_score=self.return_train_score,
                                return_n_test_samples=True,
                                return_times=True,
                                return_parameters=False,
                                return_estimator=self.return_estimator,
                                error_score=self.error_score,
                                verbose=self.verbose)
    results_container = [{}]
    with parallel:
        all_candidate_params = []
        all_out = []
        all_estimators = []

        def evaluate_candidates(candidate_params):
            candidate_params = list(candidate_params)
            n_candidates = len(candidate_params)

            def _fit_and_score_recv(i_fold, X, y, train, test, parameters):
                # Build the joblib task for one (fold, parameters) pair on
                # a fresh clone pointed at the fold's cache directory.
                current_estimator = clone(base_estimator)
                if isinstance(current_estimator, Pipeline):
                    if hasattr(current_estimator._final_estimator,
                               'cachedir'):
                        current_estimator._final_estimator.cachedir = \
                            os.path.join(self.cachedir,
                                         '%i_fold ' % i_fold)
                    else:
                        warnings.warn('Final estimator does not have recovery'
                                      ' or saving capabilities')
                elif hasattr(current_estimator, 'cachedir'):
                    current_estimator.cachedir = os.path.join(
                        self.cachedir, '%i_fold ' % i_fold)
                else:
                    warnings.warn('Estimator does not have recovery '
                                  ' or saving capabilities')
                # Call syntax prints the same single value on Python 2 and
                # is also valid on Python 3, unlike the print statement.
                print(parameters)
                print(i_fold)
                return delayed(_fit_and_score)(current_estimator,
                                               X, y,
                                               train=train, test=test,
                                               parameters=parameters,
                                               **fit_and_score_kwargs)

            # Materialise the splits so each fold keeps a stable index.
            list_split = list(enumerate(cv.split(X, y, groups)))
            if self.verbose > 0:
                print("Fitting {0} folds for each of {1} candidates,"
                      " totalling {2} fits".format(
                          n_splits, n_candidates, n_candidates * n_splits))

            if self.client is None:
                out = parallel(
                    _fit_and_score_recv(i_fold, X, y, train, test,
                                        parameters)
                    for (parameters, (i_fold, (train, test)))
                    in product(candidate_params, list_split))
            else:
                # NOTE(review): presumably an ipyparallel client. The
                # four-argument lambda is mapped over a single list of
                # 4-tuples and returns the delayed task rather than its
                # result -- confirm this branch works as intended.
                self.client[:].use_dill()
                dview = self.client[:]
                out = dview.map(
                    lambda parameters, i_fold, train, test:
                    _fit_and_score_recv(i_fold, X, y, train, test,
                                        parameters),
                    [(parameters, i_fold, train, test)
                     for (parameters, (i_fold, (train, test)))
                     in product(candidate_params, list_split)])

            if self.return_estimator:
                # The fitted estimator is the last element of each result.
                all_estimators.extend([out_set[-1] for out_set in out])
                out = [out_set[:-1] for out_set in out]

            all_candidate_params.extend(candidate_params)
            all_out.extend(out)

            # XXX: When we drop Python 2 support, we can use nonlocal
            # instead of results_container
            results_container[0] = self._format_results(
                all_candidate_params, scorers, n_splits, all_out)
            return results_container[0]

        self._run_search(evaluate_candidates)

    results = results_container[0]

    # For multi-metric evaluation, store the best_index_, best_params_ and
    # best_score_ iff refit is one of the scorer names
    # In single metric evaluation, refit_metric is "score"
    if self.refit or not self.multimetric_:
        self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
        self.best_params_ = results["params"][self.best_index_]
        self.best_score_ = results["mean_test_%s" % refit_metric][
            self.best_index_]

    if self.refit:
        self.best_estimator_ = clone(base_estimator).set_params(
            **self.best_params_)
        refit_start_time = time.time()
        if y is not None:
            self.best_estimator_.fit(X, y, **fit_params)
        else:
            self.best_estimator_.fit(X, **fit_params)
        refit_end_time = time.time()
        self.refit_time_ = refit_end_time - refit_start_time

    # Store the only scorer not as a dict for single metric evaluation
    self.scorer_ = scorers if self.multimetric_ else scorers['score']

    self.cv_results_ = results
    self.n_splits_ = n_splits
    if self.return_estimator:
        self.cv_estimators = all_estimators

    return self
def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    """Benchmark every (solver, penalty) pair and dump the results as JSON.

    The selected dataset is loaded, optionally turned into a binary
    problem, truncated to ``n_samples`` rows and fitted once per
    (solver, penalty) pair through a joblib-cached ``fit_single``. The
    collected results are written to ``bench_saga.json`` in the current
    working directory.

    Parameters
    ----------
    solvers, penalties : sequences
        Their product defines the fits that are run. Both are iterated
        twice, so they must be re-iterable.
    single_target : bool
        When true, the labels of 'rcv1', 'digits' and '20newspaper' are
        binarised; 'iris' is left unchanged.
    n_samples : int
        Number of leading rows kept from the dataset.
    max_iter : int
        Forwarded to ``fit_single``.
    dataset : {'rcv1', 'digits', 'iris', '20newspaper'}
    n_jobs : int
        Forwarded to ``Parallel``.
    skip_slow : bool
        Forwarded to ``fit_single``; when true, the result slot of the
        ('lightning', 'l1') pair is not unpacked or reported.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    known_datasets = ('rcv1', 'digits', 'iris', '20newspaper')
    if dataset not in known_datasets:
        # Fail early with a clear message instead of reaching the slicing
        # below with ``X`` and ``y`` unbound.
        raise ValueError("Unknown dataset %r; expected one of %r"
                         % (dataset, known_datasets))

    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            # NOTE(review): the two thresholds differ (4 vs 16). As
            # written, the second assignment overrides the first for
            # labels 5..16, so only labels above 16 end up positive --
            # confirm which threshold is intended.
            y_n = y.copy()
            y_n[y > 4] = 1
            y_n[y <= 16] = 0
            y = y_n

    X = X[:n_samples]
    y = y[:n_samples]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for penalty in penalties)

    res = []
    # ``out`` holds one slot per (solver, penalty) pair in iteration
    # order, so the index advances even for pairs that are skipped.
    idx = 0
    for solver in solvers:
        for penalty in penalties:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                # The fitted model comes first and is not serialised.
                _, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)
def lasso_stability_path(X, y, scaling=0.5, random_state=None,
                         n_resampling=200, n_grid=100,
                         sample_fraction=0.75,
                         eps=4 * np.finfo(float).eps, n_jobs=None,
                         verbose=False):
    """Stability path based on randomized Lasso estimates

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        training data.

    y : array-like, shape = [n_samples]
        target values.

    scaling : float, optional, default=0.5
        The alpha parameter in the stability selection article used to
        randomly scale the features. Should be between 0 and 1.

    random_state : int, RandomState instance or None, optional, default=None
        The generator used to randomize the design.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    n_resampling : int, optional, default=200
        Number of randomized models.

    n_grid : int, optional, default=100
        Number of grid points. The path is linearly reinterpolated
        on a grid between 0 and 1 before computing the scores.

    sample_fraction : float, optional, default=0.75
        The fraction of samples to be used in each randomized design.
        Should be between 0 and 1. If 1, all samples are used.

    eps : float, optional
        Smallest value of alpha / alpha_max considered

    n_jobs : int or None, optional (default=None)
        Number of CPUs to use during the resampling.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : boolean or integer, optional
        Sets the verbosity amount

    Returns
    -------
    alphas_grid : array, shape ~ [n_grid]
        The grid points between 0 and 1: alpha/alpha_max

    scores_path : array, shape = [n_features, n_grid]
        The scores for each feature along the path.

    Raises
    ------
    ValueError
        If ``scaling`` is not strictly between 0 and 1.
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'])
    rng = check_random_state(random_state)

    if not (0 < scaling < 1):
        raise ValueError("Parameter 'scaling' should be between 0 and 1."
                         " Got %r instead." % scaling)

    n_samples, n_features = X.shape

    # The mask is drawn before the weights in every round, which fixes
    # the order in which the random stream is consumed.
    paths = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_lasso_stability_path)(
            X, y, mask=rng.rand(n_samples) < sample_fraction,
            weights=1. - scaling * rng.randint(0, 2, size=(n_features,)),
            eps=eps)
        for k in range(n_resampling))

    all_alphas = sorted(set(itertools.chain.from_iterable(
        p[0] for p in paths)))
    # Take approximately n_grid values
    stride = max(1, int(len(all_alphas) / float(n_grid)))
    all_alphas = all_alphas[::stride]
    if all_alphas[-1] != 1:
        all_alphas.append(1.)
    all_alphas = np.array(all_alphas)
    scores_path = np.zeros((n_features, len(all_alphas)))

    for alphas, coefs in paths:
        # Pad each path so that it starts at alpha 0 (coefficients padded
        # with ones) and ends at the last grid point (padded with zeros).
        if alphas[0] != 0:
            alphas = np.r_[0, alphas]
            coefs = np.c_[np.ones((n_features, 1)), coefs]
        if alphas[-1] != all_alphas[-1]:
            alphas = np.r_[alphas, all_alphas[-1]]
            coefs = np.c_[coefs, np.zeros((n_features, 1))]
        # Count, per grid point, the features with a non-zero coefficient.
        scores_path += (interp1d(alphas, coefs,
                                 kind='nearest', bounds_error=False,
                                 fill_value=0, axis=-1)(all_alphas) != 0)

    scores_path /= n_resampling
    return all_alphas, scores_path