Example #1
    def predict(self, X):
        """Predict multi-output variable using a model
		 trained for each target variable.

		Parameters
		----------
		X : (sparse) array-like, shape (n_samples, n_features)
			Data.

		Returns
		-------
		y : (sparse) array-like, shape (n_samples, n_outputs)
			Multi-output targets predicted across multiple predictors.
			Note: Separate models are generated for each predictor.
		"""
        check_is_fitted(self, 'estimators_')
        for i, e in enumerate(self.estimators):
            if not hasattr(e, "predict"):
                raise ValueError(f"The base estimator {i} should implement"
                                 " a predict method")

        X = check_array(X, accept_sparse=True)

        y = Parallel(n_jobs=self.n_jobs)(delayed(e.predict)(X)
                                         for e in self.estimators_)

        return np.asarray(y).T
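
All of these snippets follow the same joblib pattern: wrap the per-estimator call with delayed and let Parallel dispatch one task per estimator, collecting the results in submission order. A minimal, self-contained sketch of the idea behind the method above, using hypothetical toy data and plain LinearRegression models rather than the class itself:

import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression

# Toy multi-output setup: one independently fitted model per target column.
X = np.random.randn(50, 3)
Y = np.random.randn(50, 2)
estimators_ = [LinearRegression().fit(X, Y[:, i]) for i in range(Y.shape[1])]

# One delayed predict call per estimator; joblib returns the results in order.
y = Parallel(n_jobs=2)(delayed(e.predict)(X) for e in estimators_)
print(np.asarray(y).T.shape)  # (50, 2): the transpose puts one column per target
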
Example #2
def cross_val_predict(estimator, X, y=None, groups=None, cv='warn',
                      n_jobs=None, verbose=0, fit_params=None,
                      pre_dispatch='2*n_jobs', method='predict'):

    """
    Minor modifications and simplications brought to the sklearn function in order to allow
    for application with non-partition CV scheme. 
    """

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
        for train, test in cv.split(X, y, groups))


    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _ in prediction_blocks]
    predictions = np.concatenate(predictions)

    test_indices = np.concatenate([indices_i
                                   for _, indices_i in prediction_blocks])
    test_index = [y.index[i] for i in test_indices]

    if y.ndim == 1:
        return pd.Series(predictions, index=test_index)
    elif y.ndim > 1:
        return pd.DataFrame(predictions, index=test_index)
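
A minimal usage sketch for this variant (assuming the sklearn helpers it relies on, such as _fit_and_predict, check_cv and indexable, are imported in the same module): because y keeps its pandas index, the return value is a Series aligned with the original rows.

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

X = pd.DataFrame(np.random.randn(100, 3))
y = pd.Series(np.random.randn(100))

# Out-of-fold predictions, indexed like y
preds = cross_val_predict(Ridge(), X, y, cv=KFold(n_splits=5))
print(preds.sort_index().head())
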
Example #3
    def _predict(self, predict_fn, X):
        check_is_fitted(self, 'estimators_')
        # Check data
        X = self._validate_X_predict(X)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # avoid storing the output of every estimator by summing them here
        if predict_fn == "predict":
            y_hat = np.zeros((X.shape[0]), dtype=np.float64)
        else:
            y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)

        # Parallel loop
        lock = threading.Lock()
        Parallel(n_jobs=n_jobs,
                 verbose=self.verbose,
                 **_joblib_parallel_args(require="sharedmem"))(
                     delayed(_accumulate_prediction)(getattr(e, predict_fn), X,
                                                     [y_hat], lock)
                     for e in self.estimators_)

        y_hat /= len(self.estimators_)

        return y_hat
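
The helper _accumulate_prediction is not shown in this snippet; in scikit-learn's forest code it looks roughly like the sketch below (the version used here may differ slightly). Each worker adds its estimator's prediction into the shared buffer under the lock, which is why the sharedmem/threading arguments are requested above.

def _accumulate_prediction(predict, X, out, lock):
    # Run one estimator's predict and add the result into the shared buffer(s).
    prediction = predict(X, check_input=False)
    with lock:
        if len(out) == 1:
            out[0] += prediction
        else:
            for i in range(len(out)):
                out[i] += prediction[i]
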
Example #4
    def fit_transform(self, X, y=None, **fit_params):
        """Fit all transformers, transform the data and concatenate results.
        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data to be transformed.
        y : array-like, shape (n_samples, ...), optional
            Targets for supervised learning.
        Returns
        -------
        X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers.
        """
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, X, y, weight,
                                        **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        Xs = block_diag(*Xs)
        return Xs
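
Note that, unlike scikit-learn's FeatureUnion, this variant combines the transformer outputs block-diagonally (block_diag, presumably scipy's) instead of side by side. A tiny illustration of the difference with made-up arrays:

import numpy as np
from scipy.linalg import block_diag

A = np.ones((2, 2))
B = 2 * np.ones((2, 3))

print(np.hstack([A, B]).shape)  # (2, 5): standard FeatureUnion-style concatenation
print(block_diag(A, B).shape)   # (4, 5): each transformer's output gets its own rows and columns
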
Example #5
    def predict_proba(self, X):
        """Predict class probabilities for X.
        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the trees in the forest. The
        class probability of a single tree is the fraction of samples of the same
        class in a leaf.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            such arrays if n_outputs > 1.
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        check_is_fitted(self, 'estimators_')

        # Check data
        validate_X(X)
        check_X_is_univariate(X)
        X = self._validate_X_predict(X)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        all_proba = Parallel(n_jobs=n_jobs,
                             verbose=self.verbose)(delayed(e.predict_proba)(X)
                                                   for e in self.estimators_)

        return np.sum(all_proba, axis=0) / len(self.estimators_)
Example #6
    def predict(self, X):
        """Predict regression target for X.
        The predicted regression target of an input sample is computed as the
        mean predicted regression targets of the trees in the forest.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted values.
        """
        check_is_fitted(self, 'estimators_')
        # Check data
        validate_X(X)
        X = self._validate_X_predict(X)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # Parallel loop
        y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(e.predict)(X, check_input=True) for e in self.estimators_)

        return np.sum(y_hat, axis=0) / len(self.estimators_)
Example #7
    def predict(self, X):
        """Predict multi-output variable using a model
         trained for each target variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        Returns
        -------
        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets predicted across multiple predictors.
            Note: Separate models are generated for each predictor.
        """
        check_is_fitted(self, 'estimators_')
        if not hasattr(self.estimator, "predict"):
            raise ValueError(
                "The base estimator should implement a predict method")

        X = check_array(X,
                        accept_sparse=True,
                        force_all_finite=False,
                        dtype="object")

        y = Parallel(n_jobs=self.n_jobs)(
            delayed(parallel_helper)(e, 'predict', X)
            for e in self.estimators_)

        return np.asarray(y).T
Example #8
    def fit_mix(self, u_feats, l_feats, l_targets):
        random_state = check_random_state(self.random_state)
        best_inertia = None
        if effective_n_jobs(self.n_jobs) == 1:
            for it in range(self.n_init):
                labels, inertia, centers, n_iters = self.fit_mix_once(
                    u_feats, l_feats, l_targets, random_state)
                if best_inertia is None or inertia < best_inertia:
                    self.labels_ = labels.clone()
                    self.cluster_centers_ = centers.clone()
                    best_inertia = inertia
                    self.inertia_ = inertia
                    self.n_iter_ = n_iters
        else:
            # parallelisation of k-means runs
            seeds = random_state.randint(np.iinfo(np.int32).max,
                                         size=self.n_init)
            results = Parallel(n_jobs=self.n_jobs, verbose=0)(
                delayed(self.fit_mix_once)(u_feats, l_feats, l_targets, seed)
                for seed in seeds)
            # Get results with the lowest inertia
            labels, inertia, centers, n_iters = zip(*results)
            best = np.argmin(inertia)
            self.labels_ = labels[best]
            self.inertia_ = inertia[best]
            self.cluster_centers_ = centers[best]
            self.n_iter_ = n_iters[best]
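
The per-run seeds are drawn once from the master random state before dispatching, so each restart is reproducible and independent of how joblib schedules the tasks. A small sketch of the same restart-and-keep-best pattern with a hypothetical one-run function:

import numpy as np
from joblib import Parallel, delayed
from sklearn.utils import check_random_state

def one_run(seed):
    rng = np.random.RandomState(seed)
    x = rng.rand(3)        # stand-in for one k-means-style restart
    return x, x.sum()      # (solution, inertia-like score)

rng = check_random_state(0)
seeds = rng.randint(np.iinfo(np.int32).max, size=4)
results = Parallel(n_jobs=2)(delayed(one_run)(s) for s in seeds)
solutions, scores = zip(*results)
best = int(np.argmin(scores))  # keep the run with the lowest score
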
Example #9
    def fit_transform(self, X, y=None, **fit_params):
        """Fit all transformers, transform the data and concatenate results.
        Parameters
        ----------
        X : pandas DataFrame
            Input data to be transformed.
        y : pandas Series, shape (n_samples, ...), optional
            Targets for supervised learning.

        Returns
        -------
        Xt : pandas DataFrame
            Horizontal concatenation (hstack) of the transformer results; the
            number of columns is the sum of the output dimensions over transformers.
        """
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, X, y, weight, **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)

        return self._hstack(list(Xs))
Example #10
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform X separately by each transformer, concatenate results.

        Parameters
        ----------
        X : pd.DataFrame or array-like
            Input data to be transformed.
        Returns
        -------
        X_t : pd.DataFrame, shape (n_samples, sum_n_components)
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers.
        """
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, X, None, weight)
            for name, trans, weight in self._iter())

        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        if any(sparse.issparse(f) for f in Xs):
            raise SparseNotAllowedError(
                "sparse results are not allowed, check transformers")
        else:
            if not all(isinstance(x, pd.DataFrame) for x in Xs):
                raise TypeError(
                    "one of the results is not a DataFrame, check your transformers"
                )
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs
Example #11
def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, max_iter=300,
               n_jobs=None):
    """Pipeline of mean-shift clustering.

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
    bandwidth : the radius of the sphere
    seeds : initial seed points; generated with the bin-seeding algorithm
        when bin_seeding is True
    bin_seeding : whether to use the bin-seeding algorithm (bin_size = bandwidth)
    min_bin_freq : minimum number of points a bin must contain to be kept as a seed

    Returns
    -------
    cluster_centers : numpy.ndarray, shape=[n_cluster, n_features]
    labels : list, len = n_samples
    """
    nbrs = NearestNeighbors(radius = bandwidth, n_jobs = 1).fit(X)

    if bin_seeding:
        seeds = get_bin_seeds(X, bandwidth, min_bin_freq)

    all_res = Parallel(n_jobs = n_jobs)(
        delayed(_mean_shift_single_seed)
        (seed, X, nbrs, max_iter) for seed in seeds)
    cluster_centers = np.array(list(set(all_res))).tolist()

    distances = np.zeros((len(X), len(cluster_centers)))
    for i in range(len(cluster_centers)):
        distances[:, i] = np.linalg.norm(X - cluster_centers[i], axis = 1)
    labels = np.argmin(distances, axis = 1)

    return cluster_centers, labels
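
The worker _mean_shift_single_seed is not included in the snippet; scikit-learn's version is roughly the following (a sketch for orientation, assuming numpy as np and the fitted NearestNeighbors passed in by the caller; the implementation used here may differ). Starting from a seed, the mean is repeatedly shifted to the centroid of its radius neighbourhood until it converges or max_iter is reached.

def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
    bandwidth = nbrs.get_params()['radius']
    stop_thresh = 1e-3 * bandwidth        # convergence threshold
    completed_iterations = 0
    while True:
        # points within the current sphere
        i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
                                       return_distance=False)[0]
        points_within = X[i_nbrs]
        if len(points_within) == 0:
            break                         # empty neighbourhood: drop this seed
        my_old_mean = my_mean
        my_mean = np.mean(points_within, axis=0)
        if (np.linalg.norm(my_mean - my_old_mean) < stop_thresh or
                completed_iterations == max_iter):
            break
        completed_iterations += 1
    return tuple(my_mean), len(points_within)
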
Example #12
    def fit(self, X, y):
        """Fit underlying estimators.
        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.
        y : (sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
            Multi-class targets. An indicator matrix turns on multilabel
            classification.
        Returns
        -------
        self
        """
        # A sparse LabelBinarizer, with sparse_output=True, has been shown to
        # outperform or match a dense label binarizer in all cases and has also
        # resulted in less or equal memory consumption in the fit_ovr function
        # overall.
        self.label_binarizer_ = LabelBinarizer(sparse_output=True)
        Y = self.label_binarizer_.fit_transform(y)
        Y = Y.tocsc()
        self.classes_ = self.label_binarizer_.classes_
        columns = (col.toarray().ravel() for col in Y.T)
        # In cases where individual estimators are very fast to train, setting
        # n_jobs > 1 can result in slower performance due to the overhead
        # of spawning threads.  See joblib issue #112.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_binary)(
            self.estimator, X, column, classes=[
                "not %s" % self.label_binarizer_.classes_[i],
                self.label_binarizer_.classes_[i]])
            for i, column in enumerate(columns))

        return self.estimators_
Example #13
    def fit(self, X, y):
        """Fit underlying estimators.
        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.
        y : array-like, shape = [n_samples]
            Multi-class targets.
        Returns
        -------
        self
        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        check_classification_targets(y)

        self.classes_ = np.unique(y)
        if len(self.classes_) == 1:
            raise ValueError("OneVsOneClassifier can not be fit when only one"
                             " class is present.")
        n_classes = self.classes_.shape[0]
        estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_ovo_binary)
            (self.estimator, X, y, self.classes_[i], self.classes_[j])
            for i in range(n_classes) for j in range(i + 1, n_classes)))))

        self.estimators_ = estimators_indices[0]
        # try:
        #     self.pairwise_indices_ = (
        #         estimators_indices[1] if self._pairwise else None)
        # except AttributeError:
        #     self.pairwise_indices_ = None

        return estimators_indices
Example #14
    def _mean_fn(self, X, fn, acc, slice=None):
        # Helper method that accumulates an arbitrary function over the trees in parallel, using
        # the accumulator acc, and returns the mean output. The function fn should take a tree e
        # and return another function g_e, which takes as input X, check_input.
        # If slice is not None, but rather a tuple (start, end), only the trees from index start
        # to index end are used. The returned result is essentially:
        # (mean over e in slice)(g_e(X)).
        check_is_fitted(self, 'estimators_')
        # Check data
        X = self._validate_X_predict(X)

        if slice is None:
            estimator_slice = self.estimators_
        else:
            estimator_slice = self.estimators_[slice[0]:slice[1]]

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(len(estimator_slice), self.n_jobs)
        lock = threading.Lock()
        Parallel(n_jobs=n_jobs,
                 verbose=self.verbose,
                 **_joblib_parallel_args(require="sharedmem"))(
                     delayed(_accumulate_prediction)(fn(e), X, [acc], lock)
                     for e in estimator_slice)
        acc /= len(estimator_slice)
        return acc
Example #15
def mean_shift(X,
               bandwidth=None,
               seeds=None,
               bin_seeding=False,
               min_bin_freq=1,
               cluster_all=True,
               max_iter=300,
               n_jobs=None):
    """pipline of mean shift clustering
    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
    bandwidth: the radius of the sphere
    seeds: whether use the bin seed algorithm to generate the initial seeds
    bin_size:    bin_size = bandwidth.
    min_bin_freq: for each bin_seed, the minimize of the points should cover
    return:
        cluster_centers <class 'numpy.ndarray'> shape=[n_cluster, n_features] ,labels <class 'list'>, len = n_samples
    """
    print(get_bin_seeds(X, bin_seeding))
    # find the points within the sphere
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)
    ##########################################parallel computing############################
    center_intensity_dict = {}
    all_res = Parallel(n_jobs=n_jobs)(delayed(_mean_shift_single_seed)(
        seed, X, nbrs, max_iter) for seed in seeds)  #
    ##########################################parallel computing############################

    return cluster_centers, labels
Example #16
    def fit(self, X, y, sample_weight=None):
        """ Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

        Returns
        -------
        self : object
        """
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if sample_weight is not None:
            for name, step in self.estimators:
                if not has_fit_parameter(step, 'sample_weight'):
                    raise ValueError('Underlying estimator \'%s\' does not'
                                     ' support sample weights.' % name)
        names, clfs = zip(*self.estimators)
        self._validate_names(names)

        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is '
                             'required to be a classifier!')

        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_parallel_fit_estimator)(
                clone(clf), X, transformed_y, sample_weight=sample_weight)
            for clf in clfs if clf is not None)

        self.named_estimators_ = Bunch()
        for k, e in zip(self.estimators, self.estimators_):
            self.named_estimators_[k[0]] = e
        return self
Example #17
    def fit(self, X, y, sample_weight=None):
        """
        Trains the binner and an estimator on every
        bucket.

        Parameters
        ----------
        X: features, *X* is converted into an array if *X* is a dataframe

        y: target

        sample_weight: sample weights

        Returns
        -------
        self: returns an instance of self.

        Attributes
        ----------

        binner_ : binner

        estimators_ : dictionary of estimators, each of them
            mapped to a leaf of the tree

        mean_estimator_ : estimator trained on the whole
            dataset, used in case the binner cannot find a bucket for
            a new observation

        dim_: dimension of the output
        mean_: average targets
        """
        self.estimators_ = []
        estimators = [clone(self.estimator) for i in range(self.n_estimators)]

        loop = tqdm(range(
            len(estimators))) if self.verbose == 'tqdm' else range(
                len(estimators))
        verbose = 1 if self.verbose == 'tqdm' else (1 if self.verbose else 0)

        def _fit_piecewise_estimator(i, est, X, y, sample_weight, alpha):
            new_size = int(X.shape[0] * alpha + 0.5)
            rnd = numpy.random.randint(0, X.shape[0] - 1, new_size)
            Xr = X[rnd]
            yr = y[rnd]
            sr = sample_weight[rnd] if sample_weight is not None else None
            return est.fit(Xr, yr, sr)

        self.estimators_ = \
            Parallel(n_jobs=self.n_jobs, verbose=verbose,
                     **_joblib_parallel_args(prefer='threads'))(
                delayed(_fit_piecewise_estimator)(
                    i, estimators[i], X, y, sample_weight, self.alpha)
                for i in loop)

        return self
Example #18
def mean_shift(X,
               bandwidth=None,
               seeds=None,
               bin_seeding=False,
               min_bin_freq=1,
               cluster_all=True,
               max_iter=300,
               n_jobs=None):
    """pipline of mean shift clustering
    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
    bandwidth: the radius of the sphere
    seeds: whether use the bin seed algorithm to generate the initial seeds
    bin_size:    bin_size = bandwidth.
    min_bin_freq: for each bin_seed, the minimize of the points should cover
    return:
        cluster_centers <class 'numpy.ndarray'> shape=[n_cluster, n_features] ,labels <class 'list'>, len = n_samples
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    if bin_seeding:
        seeds = get_bin_seeds(X, bandwidth)

    # find the points within the sphere
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)

    ##########################################parallel computing############################
    center_intensity_dict = {}
    all_res = Parallel(n_jobs=n_jobs)(delayed(_mean_shift_single_seed)(
        seed, X, nbrs, max_iter) for seed in seeds)  #
    ##########################################parallel computing############################

    cluster_centers = np.zeros((len(all_res), n_features))
    for i in range(len(all_res)):
        cluster_centers[i] = np.asarray(all_res[i][0])

    labels = [0] * n_samples
    neighborhoods = nbrs.radius_neighbors(cluster_centers,
                                          return_distance=False)
    for i in range(len(neighborhoods)):
        for neighbor in neighborhoods[i]:
            labels[neighbor] = i

    # Replace the original label numbers with unique indices starting from 0
    i = 0
    unique_labels_map = {}
    unique_labels = np.unique(labels)
    for unique_label in unique_labels:
        unique_labels_map[unique_label] = i
        i += 1
    for i in range(len(labels)):
        labels[i] = unique_labels_map[labels[i]]

    return cluster_centers, labels
Example #19
    def _build_estimators(self, X, columns):
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_binary)(
                self.svc, X, column,
                classes=["not %s" % self.label_binarizer_.classes_[i],
                         self.label_binarizer_.classes_[i]])
            for i, column in enumerate(columns))
Example #20
    def predict_log_proba(self, X):
        """Predict class log-probabilities for X.
        The predicted class log-probabilities of an input sample are computed as
        the log of the mean predicted class probabilities of the base
        estimators in the ensemble.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        Returns
        -------
        p : array of shape = [n_samples, n_classes]
            The class log-probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        check_is_fitted(self, "classes_")
        if hasattr(self.base_estimator_, "predict_log_proba"):
            # Check data
            X = check_array(
                X, accept_sparse=['csr', 'csc'], dtype=None,
                force_all_finite=False
            )

            if self.n_features_ != X.shape[1]:
                raise ValueError("Number of features of the model must "
                                 "match the input. Model n_features is {0} "
                                 "and input n_features is {1} "
                                 "".format(self.n_features_, X.shape[1]))

            # Parallel loop
            n_jobs, n_estimators, starts = _partition_estimators(
                self.n_estimators, self.n_jobs)

            all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
                delayed(_parallel_predict_log_proba)(
                    self.estimators_[starts[i]:starts[i + 1]],
                    self.estimators_features_[starts[i]:starts[i + 1]],
                    X,
                    self.n_classes_)
                for i in range(n_jobs))

            # Reduce
            log_proba = all_log_proba[0]

            for j in range(1, len(all_log_proba)):
                log_proba = np.logaddexp(log_proba, all_log_proba[j])

            log_proba -= np.log(self.n_estimators)

            return log_proba

        else:
            return np.log(self.predict_proba(X))
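
The reduction above works entirely in log space: np.logaddexp accumulates log(sum of per-chunk probabilities), and subtracting log(n_estimators) turns the sum into a mean. A quick numeric check with made-up values:

import numpy as np

log_p1 = np.log(np.array([[0.2, 0.8]]))  # log-probabilities from chunk 1
log_p2 = np.log(np.array([[0.6, 0.4]]))  # log-probabilities from chunk 2

log_mean = np.logaddexp(log_p1, log_p2) - np.log(2)
print(np.exp(log_mean))                  # [[0.4 0.6]] == element-wise mean of the probabilities
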
Example #21
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        n_classes = self.classes_.shape[0]
        estimators_indices = list(
            zip(*(Parallel(n_jobs=self.n_jobs)(
                delayed(_fit_ovo_binary)(self.svc, X, y, self.classes_[i],
                                         self.classes_[j])
                for i in range(n_classes) for j in range(i + 1, n_classes)))))

        self.estimators_ = estimators_indices[0]
        return self
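
One binary problem is fitted per unordered pair of classes, so the number of parallel tasks grows quadratically with the number of classes. A quick check with a hypothetical class count:

n_classes = 5
pairs = [(i, j) for i in range(n_classes) for j in range(i + 1, n_classes)]
assert len(pairs) == n_classes * (n_classes - 1) // 2  # 10 binary classifiers
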
Example #22
    def predict_proba(self, X):
        """ Predict classes probabilities for the test data.

        Parameters
        ----------
        X : array, shape = [n_samples, n_features]
            The test input samples.

        Returns
        -------
        p : array, shape = [n_samples, n_classes]
            The predicted class probabilities for the test input samples.
        """

        # Check that fit has been called
        check_is_fitted(self, ['estimators_'])

        # Check X
        if self.missing_values == 'NMAR':
            X = check_array(X, dtype=np.float64, order="C", force_all_finite='allow-nan')
        else:
            X = check_array(X, dtype=np.float64, order="C")

        n_features = X.shape[1]
        if self.n_features_ != n_features:
            raise ValueError("X: number of features %s != number of features of the model %s, "
                             "must match."
                             % (n_features, self.n_features_))

        # Predict class probabilities for all decision trees
        # (embarrassingly parallel: one predict_proba call per tree)
        ps = Parallel(n_jobs=self.n_jobs)(
            delayed(estimator.predict_proba)(X) for estimator in self.estimators_)

        # Predict classes probabilities for the decision forest
        # as average of the class probabilities from all decision trees

        proba = sum(ps) / len(self.estimators_)  # reduce

        # Handle single-output and multi-outputs formatting
        n_classes_max = max(self.n_classes_)
        if self.n_outputs_ == 1:
            proba = np.reshape(proba, (-1, self.n_classes_[0]))
        else:
            proba = np.reshape(proba, (-1, self.n_outputs_, n_classes_max))

        return proba
Example #23
def grouped_mean(dists, group):
    n_jobs = 5
    mm1, mm2 = group.min(), group.max()
    aa = np.arange(mm1, mm2 + 2, (mm2 - mm1 + 1) / n_jobs).astype(int)
    aa = [(aa[i], aa[i + 1]) for i in range(aa.shape[0] - 1)]

    # print(aa)
    Parallel(n_jobs=n_jobs, backend='threading')(
        delayed(_jit_grouped_mean)(dists, m1, m2, group) for (m1, m2) \
            in aa)

    return dists
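
This relies on the threading backend: the workers share the caller's memory, so _jit_grouped_mean can write into dists in place and nothing needs to be returned. A minimal sketch of the same idea with a hypothetical in-place worker:

import numpy as np
from joblib import Parallel, delayed

def _fill_range(out, lo, hi):
    # Writes directly into the shared array; only safe with the threading backend.
    out[lo:hi] = np.arange(lo, hi)

out = np.zeros(10)
Parallel(n_jobs=2, backend='threading')(
    delayed(_fill_range)(out, lo, hi) for lo, hi in [(0, 5), (5, 10)])
assert np.array_equal(out, np.arange(10))
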
Example #24
def _decision_path(isolation_forest, X, n_jobs):
    # code from sklearn RandomForest.
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')
    indicators = Parallel(n_jobs=n_jobs,
                          **_joblib_parallel_args(prefer='threads'))(
                              delayed(parallel_helper)(
                                  tree, 'decision_path', X, check_input=False)
                              for tree in isolation_forest.estimators_)
    n_nodes = [0]
    n_nodes.extend([i.shape[1] for i in indicators])
    n_nodes_ptr = np.array(n_nodes).cumsum()
    indicators = sparse_hstack(indicators).tocsr()
    return indicators, n_nodes_ptr
Example #25
    def multiliear_extension(self, x) -> np.float128:
        n_iterations = int(1 / (self.me_eps**2))

        def make_sample_from_dist(x):
            res = list()
            for i in x:
                res.append(np.argmax(np.random.multinomial(1, [1 - i, i], 1)))
            return np.atleast_1d(np.squeeze(
                np.argwhere(np.array(res) > 0))).tolist()

        def sample_submodular(loss_func):
            return loss_func(make_sample_from_dist(x))

        # if x is deterministic then return score on deterministic subset
        if len(set(np.unique(x).tolist()).difference([1, 0])) == 0:
            return self.score(np.atleast_1d(np.argwhere(x).squeeze()).tolist())

        x_a = np.array(x)

        # Statistically, only about `expected_samples` of the n_iterations draws will exclude the
        # least likely element, i.e. differ from the full positive-probability subset; if that is
        # below 1, score the full positive-probability subset directly.
        expected_samples = (1 - np.min(x_a[x_a > 0])) * n_iterations

        if expected_samples < 1:
            return self.score(
                np.atleast_1d(np.argwhere(x_a > 0).squeeze()).tolist())

        if self.n_jobs > 1:
            sampled_losses = Parallel(self.n_jobs)(delayed(
                partial(lambda loss_func: loss_func(make_sample_from_dist(x)),
                        loss_func=self.score))() for _ in range(n_iterations))
        else:
            sampled_losses = list()
            for _ in range(n_iterations):
                sampled_losses.append(sample_submodular(self.score))

        mean_losses = np.mean(sampled_losses)
        assert (mean_losses > 0)

        return mean_losses
Example #26
    def partial_fit(self, X, y, classes=None):
        """Partially fit underlying estimators.
        Should be used when there is not enough memory to train on all the data at once.
        Chunks of data can be passed over several iterations.
        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.
        y : (sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
            Multi-class targets. An indicator matrix turns on multilabel
            classification.
        classes : array, shape (n_classes, )
            Classes across all calls to partial_fit.
            Can be obtained via `np.unique(y_all)`, where y_all is the
            target vector of the entire dataset.
            This argument is only required in the first call of partial_fit
            and can be omitted in the subsequent calls.
        Returns
        -------
        self
        """
        if _check_partial_fit_first_call(self, classes):
            if not hasattr(self.estimator, "partial_fit"):
                raise ValueError(("Base estimator {0}, doesn't have "
                                  "partial_fit method").format(self.estimator))
            self.estimators_ = [
                clone(self.estimator) for _ in range(self.n_classes_)
            ]

            # A sparse LabelBinarizer, with sparse_output=True, has been
            # shown to outperform or match a dense label binarizer in all
            # cases and has also resulted in less or equal memory consumption
            # in the fit_ovr function overall.
            self.label_binarizer_ = LabelBinarizer(sparse_output=True)
            self.label_binarizer_.fit(self.classes_)

        if len(np.setdiff1d(y, self.classes_)):
            raise ValueError(
                ("Mini-batch contains {0} while classes " +
                 "must be subset of {1}").format(np.unique(y), self.classes_))

        Y = self.label_binarizer_.transform(y)
        Y = Y.tocsc()
        columns = (col.toarray().ravel() for col in Y.T)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit_binary)(estimator, X, column)
            for estimator, column in izip(self.estimators_, columns))

        return self
Example #27
    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Incrementally fit the model to data.
		Fit a separate model for each output variable.

		Parameters
		----------
		X : (sparse) array-like, shape (n_samples, n_features)
			Data.

		y : (sparse) array-like, shape (n_samples, n_outputs)
			Multi-output targets.

		classes : list of numpy arrays, shape (n_outputs)
			Each array is unique classes for one output in str/int
			Can be obtained by via
			``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where y is the
			target matrix of the entire dataset.
			This argument is required for the first call to partial_fit
			and can be omitted in the subsequent calls.
			Note that y doesn't need to contain all labels in `classes`.

		sample_weight : array-like, shape = (n_samples) or None
			Sample weights. If None, then samples are equally weighted.
			Only supported if the underlying regressor supports sample
			weights.

		Returns
		-------
		self : object
		"""
        X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi-output regression but has only one.")

        for i in range(y.shape[1]):
            if (sample_weight is not None and not has_fit_parameter(
                    self.estimators[i], 'sample_weight')):
                raise ValueError(f"Underlying estimator {i} does not support"
                                 " sample weights.")

        first_time = not hasattr(self, 'estimators_')

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit_estimator)
            (self.estimators_[i] if not first_time else self.estimators[i], X,
             y[:, i], classes[i] if classes is not None else None,
             sample_weight, first_time) for i in range(y.shape[1]))
        return self
Example #28
    def fit(self, X, y, sample_weight=None):
        """ Fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets. An indicator matrix turns on multilabel
            estimation.

        sample_weight : array-like, shape = (n_samples) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying regressor supports sample
            weights.

        Returns
        -------
        self : object
        """

        if not hasattr(self.estimator, "fit"):
            raise ValueError(
                "The base estimator should implement a fit method")

        X, y = check_X_y(X,
                         y,
                         multi_output=True,
                         accept_sparse=True,
                         dtype="object")

        if is_classifier(self):
            check_classification_targets(y)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi-output regression but has only one.")

        if (sample_weight is not None
                and not has_fit_parameter(self.estimator, 'sample_weight')):
            raise ValueError("Underlying estimator does not support"
                             " sample weights.")

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(self.estimator, X, y[:, i], sample_weight)
            for i in range(y.shape[1]))
        return self
Example #29
def __build_features(X, pathwat_dict, ec_dict, labels_components, node2idx_pathway2ec, path2vec_features, file_name,
                     dspath, batch_size=100, num_jobs=1):
    tmp = lil_matrix.copy(X)
    print('\t>> Build abundance and coverage features...')
    list_batches = np.arange(start=0, stop=tmp.shape[0], step=batch_size)
    total_progress = len(list_batches) * len(pathwat_dict.keys())
    parallel = Parallel(n_jobs=num_jobs, verbose=0)
    results = parallel(delayed(compute_abd_cov)(tmp[batch:batch + batch_size],
                                                labels_components, pathwat_dict,
                                                None, batch_idx, total_progress)
                       for batch_idx, batch in enumerate(list_batches))
    desc = '\t\t--> Building {0:.4f}%...'.format((100))
    print(desc)
    abd, cov = zip(*results)
    abd = np.vstack(abd)
    cov = np.vstack(cov)
    del results
    abd = preprocessing.normalize(abd)
    print('\t>> Use pathway2vec EC features...')
    path2vec_features = path2vec_features[path2vec_features.files[0]]
    path2vec_features = path2vec_features / np.linalg.norm(path2vec_features, axis=1)[:, np.newaxis]
    ec_features = [idx for idx, v in ec_dict.items() if v in node2idx_pathway2ec]
    path2vec_features = path2vec_features[ec_features, :]
    ec_features = [np.mean(path2vec_features[row.rows[0]] * np.array(row.data[0])[:, None], axis=0)
                   for idx, row in enumerate(X)]
    save_data(data=lil_matrix(ec_features), file_name=file_name + "_Xp.pkl", save_path=dspath, mode="wb",
              tag="transformed instances to ec features")
    X = lil_matrix(hstack((tmp, ec_features)))
    save_data(data=X, file_name=file_name + "_Xe.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec features with instances")
    X = lil_matrix(hstack((tmp, abd)))
    save_data(data=X, file_name=file_name + "_Xa.pkl", save_path=dspath, mode="wb",
              tag="concatenated abundance features with instances")
    X = lil_matrix(hstack((tmp, cov)))
    save_data(data=X, file_name=file_name + "_Xc.pkl", save_path=dspath, mode="wb",
              tag="concatenated coverage features with instances")
    X = lil_matrix(hstack((tmp, ec_features)))
    X = lil_matrix(hstack((X, abd)))
    save_data(data=X, file_name=file_name + "_Xea.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec and abundance features with instances")
    X = lil_matrix(hstack((tmp, ec_features)))
    X = lil_matrix(hstack((X, cov)))
    save_data(data=X, file_name=file_name + "_Xec.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec and coverage features with instances")
    X = lil_matrix(hstack((tmp, ec_features)))
    X = lil_matrix(hstack((X, abd)))
    X = lil_matrix(hstack((X, cov)))
    save_data(data=X, file_name=file_name + "_Xm.pkl", save_path=dspath, mode="wb",
              tag="concatenated ec, abundance, and coverage features features with instances")
Example #30
    def predict_var(self, X):
        """Predict class probabilities for X.
        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the base estimators in the
        ensemble. If base estimators do not implement a ``predict_proba``
        method, then it resorts to voting and the predicted class probabilities
        of an input sample represent the proportion of estimators predicting
        each class.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        Returns
        -------
        p : array of shape = [n_samples, n_classes]
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        check_is_fitted(self, "classes_")
        # Check data
        X = check_array(
            X, accept_sparse=['csr', 'csc'], dtype=None,
            force_all_finite=False
        )

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} and "
                             "input n_features is {1}."
                             "".format(self.n_features_, X.shape[1]))

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators,
                                                             self.n_jobs)

        all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_predict_var)(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]],
                X,
                self.n_classes_)
            for i in range(n_jobs))

        # Reduce
        proba = sum(all_proba) / self.n_estimators

        return proba