Example #1
def synthesize_report(X, sample_ids, y_pred, y_dict_ids, y_common_name, component_dict, labels_components,
                      y_pred_score=None,
                      batch_size=30, num_jobs=1, rsfolder="Results", rspath="../.", dspath="../.", file_name='labels'):
    if y_pred is None:
        raise Exception("Please provide two matrices as numpy matrix format: "
                        "(num_samples, num_labels), representing pathway scores "
                        "and the status of prediction as binary values.")

    num_samples = len(sample_ids)
    main_folder_path = os.path.join(rspath, rsfolder)
    list_batches = np.arange(start=0, stop=num_samples, step=batch_size)
    parallel = Parallel(n_jobs=num_jobs, verbose=0)

    # Delete the previous main folder and recreate a new one
    create_remove_dir(folder_path=main_folder_path)
    if y_pred_score is not None:
        results = parallel(delayed(__synthesize_report)(X[batch:batch + batch_size],
                                                        sample_ids[batch:batch + batch_size],
                                                        y_pred_score[batch:batch + batch_size],
                                                        y_pred[batch:batch + batch_size],
                                                        y_dict_ids, y_common_name, component_dict,
                                                        labels_components, main_folder_path, batch_idx,
                                                        len(list_batches))
                           for batch_idx, batch in enumerate(list_batches))
    else:
        results = parallel(delayed(__synthesize_report)(X[batch:batch + batch_size],
                                                        sample_ids[batch:batch + batch_size],
                                                        y_pred_score, y_pred[batch:batch + batch_size],
                                                        y_dict_ids, y_common_name, component_dict,
                                                        labels_components, main_folder_path, batch_idx,
                                                        len(list_batches))
                           for batch_idx, batch in enumerate(list_batches))
    desc = '\t\t--> Synthesizing pathway reports {0:.4f}%...'.format(100)
    print(desc)
    y = list(zip(*results))
    y = [item for lst in y for item in lst]
    print('\t\t--> Storing predictions (label) to: {0:s}'.format(file_name + '_labels.pkl'))
    save_data(data=y, file_name=file_name + '_labels.pkl', save_path=dspath, mode="wb",
              print_tag=False)
    y_dict_ids = dict((y_id, y_idx) for y_idx, y_id in y_dict_ids.items())
    y_csr = np.zeros((len(y), len(y_dict_ids.keys())))
    for idx, lst in enumerate(y):
        for item in lst:
            if item in y_dict_ids:
                y_csr[idx, y_dict_ids[item]] = 1
    print('\t\t--> Storing predictions (label index) to: {0:s}'.format(file_name + '_y.pkl'))
    save_data(data=lil_matrix(y_csr), file_name=file_name + "_y.pkl", save_path=dspath, mode="wb",
              print_tag=False)
Example #2
def cross_val_predict(estimator, X, y=None, groups=None, cv='warn',
                      n_jobs=None, verbose=0, fit_params=None,
                      pre_dispatch='2*n_jobs', method='predict'):

    """
    Minor modifications and simplications brought to the sklearn function in order to allow
    for application with non-partition CV scheme. 
    """

    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    prediction_blocks = parallel(delayed(_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
        for train, test in cv.split(X, y, groups))


    # Concatenate the predictions
    predictions = [pred_block_i for pred_block_i, _ in prediction_blocks]
    predictions = np.concatenate(predictions)

    test_indices = np.concatenate([indices_i
                                   for _, indices_i in prediction_blocks])
    test_index = [y.index[i] for i in test_indices]

    if y.ndim == 1:
        return pd.Series(predictions, index = test_index)
    elif y.ndim>1:
        return pd.DataFrame(predictions, index = test_index)
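
A minimal usage sketch of the modified function above, assuming it lives in the current module. The estimator and CV splitter are ordinary scikit-learn objects, and y is passed as a pandas Series so that y.index is available as the function expects:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# toy data: y is a pandas Series so y.index exists, as the function above requires
X = pd.DataFrame(np.random.rand(100, 3), columns=["a", "b", "c"])
y = pd.Series(np.random.rand(100), name="target")

# out-of-fold predictions, returned as a Series aligned on y's index
preds = cross_val_predict(LinearRegression(), X, y,
                          cv=KFold(n_splits=5, shuffle=True, random_state=0),
                          n_jobs=2)
print(preds.head())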
Example #3
    def predict(self, X):
        """Predict multi-output variable using a model
        trained for each target variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        Returns
        -------
        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets predicted across multiple predictors.
            Note: Separate models are generated for each predictor.
        """
        check_is_fitted(self, 'estimators_')
        for i, e in enumerate(self.estimators):
            if not hasattr(e, "predict"):
                raise ValueError(f"The base estimator {i} should implement"
                                 " a predict method")

        X = check_array(X, accept_sparse=True)

        y = Parallel(n_jobs=self.n_jobs)(delayed(e.predict)(X)
                                         for e in self.estimators_)

        return np.asarray(y).T
Example #4
    def _predict(self, predict_fn, X):
        check_is_fitted(self, 'estimators_')
        # Check data
        X = self._validate_X_predict(X)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # avoid storing the output of every estimator by summing them here
        if predict_fn == "predict":
            y_hat = np.zeros((X.shape[0]), dtype=np.float64)
        else:
            y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)

        # Parallel loop
        lock = threading.Lock()
        Parallel(n_jobs=n_jobs,
                 verbose=self.verbose,
                 **_joblib_parallel_args(require="sharedmem"))(
                     delayed(_accumulate_prediction)(getattr(e, predict_fn), X,
                                                     [y_hat], lock)
                     for e in self.estimators_)

        y_hat /= len(self.estimators_)

        return y_hat
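
The sharedmem pattern above relies on an _accumulate_prediction helper that adds each estimator's output into a shared buffer under a lock. A minimal sketch of what such a helper typically looks like, modeled on scikit-learn's forest code; the exact signature used in this repository is an assumption:

import threading
import numpy as np

def _accumulate_prediction(predict, X, out, lock):
    # Run one estimator's predict function and add the result into the
    # shared output buffer(s); the lock serialises the in-place addition.
    prediction = predict(X, check_input=False)
    with lock:
        if len(out) == 1:
            out[0] += prediction
        else:
            for i in range(len(out)):
                out[i] += prediction[i]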
Example #5
    def fit_transform(self, X, y=None, **fit_params):
        """Fit all transformers, transform the data and concatenate results.
        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data to be transformed.
        y : array-like, shape (n_samples, ...), optional
            Targets for supervised learning.
        Returns
        -------
        X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers.
        """
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, X, y, weight,
                                        **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        Xs = block_diag(*Xs)
        return Xs
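
The _fit_transform_one worker dispatched to joblib above is not shown. A plausible sketch, following the older scikit-learn FeatureUnion convention of returning the transformed block together with the fitted transformer and applying an optional weight; the argument order matches the call in this example, but the body is an assumption:

def _fit_transform_one(transformer, X, y, weight, **fit_params):
    # Fit and transform in one step when the transformer supports it,
    # otherwise fall back to fit followed by transform.
    if hasattr(transformer, 'fit_transform'):
        res = transformer.fit_transform(X, y, **fit_params)
    else:
        res = transformer.fit(X, y, **fit_params).transform(X)
    # An optional weight scales the transformer's output block.
    if weight is None:
        return res, transformer
    return res * weight, transformer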
Example #6
    def predict(self, X):
        """Predict regression target for X.
        The predicted regression target of an input sample is computed as the
        mean predicted regression targets of the trees in the forest.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted values.
        """
        check_is_fitted(self, 'estimators_')
        # Check data
        validate_X(X)
        X = self._validate_X_predict(X)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # Parallel loop
        y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(e.predict)(X, check_input=True) for e in self.estimators_)

        return np.sum(y_hat, axis=0) / len(self.estimators_)
Example #7
    def fit_mix(self, u_feats, l_feats, l_targets):
        random_state = check_random_state(self.random_state)
        best_inertia = None
        if effective_n_jobs(self.n_jobs) == 1:
            for it in range(self.n_init):
                labels, inertia, centers, n_iters = self.fit_mix_once(
                    u_feats, l_feats, l_targets, random_state)
                if best_inertia is None or inertia < best_inertia:
                    self.labels_ = labels.clone()
                    self.cluster_centers_ = centers.clone()
                    best_inertia = inertia
                    self.inertia_ = inertia
                    self.n_iter_ = n_iters
        else:
            # parallelisation of k-means runs
            seeds = random_state.randint(np.iinfo(np.int32).max,
                                         size=self.n_init)
            results = Parallel(n_jobs=self.n_jobs, verbose=0)(
                delayed(self.fit_mix_once)(u_feats, l_feats, l_targets, seed)
                for seed in seeds)
            # Get results with the lowest inertia
            labels, inertia, centers, n_iters = zip(*results)
            best = np.argmin(inertia)
            self.labels_ = labels[best]
            self.inertia_ = inertia[best]
            self.cluster_centers_ = centers[best]
            self.n_iter_ = n_iters[best]
Example #8
            def evaluate_candidates(candidate_params):
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                out = parallel(
                    delayed(_fit_and_score)(clone(base_estimator),
                                            X,
                                            y,
                                            train=train,
                                            test=test,
                                            parameters=parameters,
                                            **fit_and_score_kwargs)
                    for parameters, (train, test) in product(
                        candidate_params, cv.split(X, y, groups)))

                self._validate_out(out, n_candidates, n_splits)

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                nonlocal results
                results = self._format_results(all_candidate_params, scorers,
                                               n_splits, all_out)
                return results
Example #9
    def predict(self, X):
        """Predict multi-output variable using a model
         trained for each target variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        Returns
        -------
        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets predicted across multiple predictors.
            Note: Separate models are generated for each predictor.
        """
        check_is_fitted(self, 'estimators_')
        if not hasattr(self.estimator, "predict"):
            raise ValueError(
                "The base estimator should implement a predict method")

        X = check_array(X,
                        accept_sparse=True,
                        force_all_finite=False,
                        dtype="object")

        y = Parallel(n_jobs=self.n_jobs)(
            delayed(parallel_helper)(e, 'predict', X)
            for e in self.estimators_)

        return np.asarray(y).T
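
parallel_helper here is the small indirection that older scikit-learn versions used so that a bound method could be dispatched through delayed(); a one-line sketch of it:

def parallel_helper(obj, methodname, *args, **kwargs):
    # Call obj.methodname(*args, **kwargs); useful because joblib's delayed()
    # historically pickled plain functions more reliably than bound methods.
    return getattr(obj, methodname)(*args, **kwargs)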
Example #10
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform X separately by each transformer, concatenate results.

        Parameters
        ----------
        X : pd.DataFrame or array-like
            Input data to be transformed.
        Returns
        -------
        X_t : pd.DataFrame, shape (n_samples, sum_n_components)
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers.
        """
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, X, None, weight)
            for name, trans, weight in self._iter())

        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        if any(sparse.issparse(f) for f in Xs):
            raise SparseNotAllowedError(
                "sparse results are not allowed, check transformers")
        else:
            if not all(isinstance(x, pd.DataFrame) for x in Xs):
                raise TypeError(
                    "one of the results is not a DataFrame, check your transformers"
                )
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs
Example #11
def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, max_iter=300,
               n_jobs=None):
    """pipeline of mean shift clustering
    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
    bandwidth: the radius of the sphere
    seeds: whether use the bin seed algorithm to generate the initial seeds
    bin_size:    bin_size = bandwidth.
    min_bin_freq: for each bin_seed, the minimize of the points should cover
    return:
        cluster_centers <class 'numpy.ndarray'> shape=[n_cluster, n_features] ,labels <class 'list'>, len = n_samples
    """
    nbrs = NearestNeighbors(radius = bandwidth, n_jobs = 1).fit(X)

    if bin_seeding:
        seeds = get_bin_seeds(X, bandwidth, min_bin_freq)

    all_res = Parallel(n_jobs = n_jobs)(
        delayed(_mean_shift_single_seed)
        (seed, X, nbrs, max_iter) for seed in seeds)
    cluster_centers = np.array(list(set(all_res))).tolist()

    distances = np.zeros((len(X), len(cluster_centers)))
    for i in range(len(cluster_centers)):
        distances[:, i] = np.linalg.norm(X - cluster_centers[i], axis = 1)
    labels = np.argmin(distances, axis = 1)

    return cluster_centers, labels
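
The _mean_shift_single_seed worker used in the mean-shift examples is not shown. Below is a sketch modeled on scikit-learn's helper, which walks one seed uphill until the shift is below a small threshold or max_iter is reached. The snippets in this listing disagree on whether the helper returns only the converged center or a (center, n_points) tuple, so the return value here is an assumption:

import numpy as np

def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):
    # Convergence threshold relative to the neighbourhood radius.
    bandwidth = nbrs.get_params()['radius']
    stop_thresh = 1e-3 * bandwidth
    completed_iterations = 0
    while True:
        # Points falling inside the sphere centred at the current mean.
        i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
                                       return_distance=False)[0]
        points_within = X[i_nbrs]
        if len(points_within) == 0:
            break  # the seed has no neighbours; stop early
        my_old_mean = my_mean
        my_mean = np.mean(points_within, axis=0)
        # Stop when the mean barely moves or the iteration budget is spent.
        if (np.linalg.norm(my_mean - my_old_mean) < stop_thresh
                or completed_iterations == max_iter):
            break
        completed_iterations += 1
    return tuple(my_mean), len(points_within)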
Example #12
    def predict_proba(self, X):
        """Predict class probabilities for X.
        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the trees in the forest. The
        class probability of a single tree is the fraction of samples of the same
        class in a leaf.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.
        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            such arrays if n_outputs > 1.
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        check_is_fitted(self, 'estimators_')

        # Check data
        validate_X(X)
        check_X_is_univariate(X)
        X = self._validate_X_predict(X)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        all_proba = Parallel(n_jobs=n_jobs,
                             verbose=self.verbose)(delayed(e.predict_proba)(X)
                                                   for e in self.estimators_)

        return np.sum(all_proba, axis=0) / len(self.estimators_)
Example #13
    def _mean_fn(self, X, fn, acc, slice=None):
        # Helper method that accumulates an arbitrary function in parallel on the accumulator acc:
        # it calls the function fn on each tree e and returns the mean output. The function fn
        # should take as input a tree e and return another function g_e, which takes as input X, check_input.
        # If slice is not None, but rather a tuple (start, end), then only the subset of trees from
        # index start to index end is used. The returned result is essentially:
        # (mean over e in slice)(g_e(X)).
        check_is_fitted(self, 'estimators_')
        # Check data
        X = self._validate_X_predict(X)

        if slice is None:
            estimator_slice = self.estimators_
        else:
            estimator_slice = self.estimators_[slice[0]:slice[1]]

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(len(estimator_slice), self.n_jobs)
        lock = threading.Lock()
        Parallel(n_jobs=n_jobs,
                 verbose=self.verbose,
                 **_joblib_parallel_args(require="sharedmem"))(
                     delayed(_accumulate_prediction)(fn(e), X, [acc], lock)
                     for e in estimator_slice)
        acc /= len(estimator_slice)
        return acc
Example #14
    def fit_transform(self, X, y=None, **fit_params):
        """Fit all transformers, transform the data and concatenate results.
        Parameters
        ----------
        X : pandas DataFrame
            Input data to be transformed.
        y : pandas Series, shape (n_samples, ...), optional
            Targets for supervised learning.

        Returns
        -------
        Xt : pandas DataFrame
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers.
        """
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, X, y, weight, **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))

        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)

        return self._hstack(list(Xs))
Example #15
    def fit(self, X, y):
        """Fit underlying estimators.
        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.
        y : (sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
            Multi-class targets. An indicator matrix turns on multilabel
            classification.
        Returns
        -------
        self
        """
        # A sparse LabelBinarizer, with sparse_output=True, has been shown to
        # outperform or match a dense label binarizer in all cases and has also
        # resulted in less or equal memory consumption in the fit_ovr function
        # overall.
        self.label_binarizer_ = LabelBinarizer(sparse_output=True)
        Y = self.label_binarizer_.fit_transform(y)
        Y = Y.tocsc()
        self.classes_ = self.label_binarizer_.classes_
        columns = (col.toarray().ravel() for col in Y.T)
        # In cases where individual estimators are very fast to train, setting
        # n_jobs > 1 can result in slower performance due to the overhead
        # of spawning threads.  See joblib issue #112.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_binary)(
            self.estimator, X, column, classes=[
                "not %s" % self.label_binarizer_.classes_[i],
                self.label_binarizer_.classes_[i]])
            for i, column in enumerate(columns))

        return self
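
The _fit_binary worker above fits one clone of the estimator per binarised label column. A simplified sketch of a typical implementation (the full scikit-learn helper also substitutes a constant predictor for degenerate columns; this version only warns, and the body is an assumption):

import warnings
import numpy as np
from sklearn.base import clone

def _fit_binary(estimator, X, y, classes=None):
    # Fit a clone of `estimator` on one binarised label column `y`.
    unique_y = np.unique(y)
    if len(unique_y) == 1 and classes is not None:
        # Degenerate column: every sample carries the same label; a full
        # implementation would substitute a constant predictor here.
        warnings.warn("Label %s is present in all training examples."
                      % str(classes[int(unique_y[0])]))
    estimator = clone(estimator)
    estimator.fit(X, y)
    return estimator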
Example #16
    def fit(self, X, y):
        """Fit underlying estimators.
        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.
        y : array-like, shape = [n_samples]
            Multi-class targets.
        Returns
        -------
        self
        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
        check_classification_targets(y)

        self.classes_ = np.unique(y)
        if len(self.classes_) == 1:
            raise ValueError("OneVsOneClassifier can not be fit when only one"
                             " class is present.")
        n_classes = self.classes_.shape[0]
        estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_ovo_binary)
            (self.estimator, X, y, self.classes_[i], self.classes_[j])
            for i in range(n_classes) for j in range(i + 1, n_classes)))))

        self.estimators_ = estimators_indices[0]
        # try:
        #     self.pairwise_indices_ = (
        #         estimators_indices[1] if self._pairwise else None)
        # except AttributeError:
        #     self.pairwise_indices_ = None

        return self
Example #17
            def evaluate_candidates(candidate_params):
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                out = parallel(
                    delayed(_fit_and_score)(clone(base_estimator),
                                            X,
                                            y,
                                            train=train,
                                            test=test,
                                            parameters=parameters,
                                            **fit_and_score_kwargs)
                    for parameters, (train, test) in product(
                        candidate_params, cv.split(X, y, groups)))

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                # XXX: When we drop Python 2 support, we can use nonlocal
                # instead of results_container
                results_container[0] = self._format_results(
                    all_candidate_params, scorers, n_splits, all_out)
                return results_container[0]
Example #18
def mean_shift(X,
               bandwidth=None,
               seeds=None,
               bin_seeding=False,
               min_bin_freq=1,
               cluster_all=True,
               max_iter=300,
               n_jobs=None):
    """pipline of mean shift clustering
    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
    bandwidth: the radius of the sphere
    seeds: whether use the bin seed algorithm to generate the initial seeds
    bin_size:    bin_size = bandwidth.
    min_bin_freq: for each bin_seed, the minimize of the points should cover
    return:
        cluster_centers <class 'numpy.ndarray'> shape=[n_cluster, n_features] ,labels <class 'list'>, len = n_samples
    """
    if bin_seeding:
        seeds = get_bin_seeds(X, bandwidth, min_bin_freq)

    # find the points within the sphere
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)

    # run one mean-shift iteration per seed in parallel
    all_res = Parallel(n_jobs=n_jobs)(delayed(_mean_shift_single_seed)(
        seed, X, nbrs, max_iter) for seed in seeds)

    # collect the converged centers and assign each sample to its nearest center
    cluster_centers = np.asarray([res[0] for res in all_res])
    labels = [0] * X.shape[0]
    neighborhoods = nbrs.radius_neighbors(cluster_centers,
                                          return_distance=False)
    for i, neighborhood in enumerate(neighborhoods):
        for neighbor in neighborhood:
            labels[neighbor] = i

    return cluster_centers, labels
Example #19
    def fit(self, X, y, sample_weight=None):
        """ Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

        Returns
        -------
        self : object
        """
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if sample_weight is not None:
            for name, step in self.estimators:
                if not has_fit_parameter(step, 'sample_weight'):
                    raise ValueError('Underlying estimator \'%s\' does not'
                                     ' support sample weights.' % name)
        names, clfs = zip(*self.estimators)
        self._validate_names(names)

        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is '
                             'required to be a classifier!')

        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_parallel_fit_estimator)(
                clone(clf), X, transformed_y, sample_weight=sample_weight)
            for clf in clfs if clf is not None)

        self.named_estimators_ = Bunch(**dict())
        for k, e in zip(self.estimators, self.estimators_):
            self.named_estimators_[k[0]] = e
        return self
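
The _parallel_fit_estimator worker simply fits one already-cloned classifier, forwarding sample weights only when they are given; a minimal sketch:

def _parallel_fit_estimator(estimator, X, y, sample_weight=None):
    # Fit one estimator; forward sample_weight only when it was provided,
    # since not every estimator accepts it.
    if sample_weight is not None:
        estimator.fit(X, y, sample_weight=sample_weight)
    else:
        estimator.fit(X, y)
    return estimator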
Example #20
def mean_shift(X,
               bandwidth=None,
               seeds=None,
               bin_seeding=False,
               min_bin_freq=1,
               cluster_all=True,
               max_iter=300,
               n_jobs=None):
    """pipline of mean shift clustering
    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
    bandwidth: the radius of the sphere
    seeds: whether use the bin seed algorithm to generate the initial seeds
    bin_size:    bin_size = bandwidth.
    min_bin_freq: for each bin_seed, the minimize of the points should cover
    return:
        cluster_centers <class 'numpy.ndarray'> shape=[n_cluster, n_features] ,labels <class 'list'>, len = n_samples
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    if bin_seeding:
        seeds = get_bin_seeds(X, bandwidth)

    # find the points within the sphere
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(X)

    # run one mean-shift iteration per seed in parallel
    all_res = Parallel(n_jobs=n_jobs)(delayed(_mean_shift_single_seed)(
        seed, X, nbrs, max_iter) for seed in seeds)

    cluster_centers = np.zeros((len(all_res), n_features))
    for i in range(len(all_res)):
        cluster_centers[i] = np.asarray(all_res[i][0])

    labels = [0] * n_samples
    neighborhoods = nbrs.radius_neighbors(cluster_centers,
                                          return_distance=False)
    for i in range(len(neighborhoods)):
        for neighbor in neighborhoods[i]:
            labels[neighbor] = i

    # Replace the original label numbers with unique indices starting from 0
    i = 0
    unique_labels_map = {}
    unique_labels = np.unique(labels)
    for unique_label in unique_labels:
        unique_labels_map[unique_label] = i
        i += 1
    for i in range(len(labels)):
        labels[i] = unique_labels_map[labels[i]]

    return cluster_centers, labels
Example #21
    def fit(self, X, y, sample_weight=None):
        """
        Trains the binner and an estimator on every
        bucket.

        Parameters
        ----------
        X: features, *X* is converted into an array if *X* is a dataframe

        y: target

        sample_weight: sample weights

        Returns
        -------
        self: returns an instance of self.

        Attributes
        ----------

        binner_ : binner

        estimators_ : dictionary of estimators, each of them
            mapped to a leaf of the tree

        mean_estimator_ : estimator trained on the whole
            dataset, used in case the binner cannot find a bucket for
            a new observation

        dim_: dimension of the output
        mean_: average targets
        """
        self.estimators_ = []
        estimators = [clone(self.estimator) for i in range(self.n_estimators)]

        loop = tqdm(range(
            len(estimators))) if self.verbose == 'tqdm' else range(
                len(estimators))
        verbose = 1 if self.verbose == 'tqdm' else (1 if self.verbose else 0)

        def _fit_piecewise_estimator(i, est, X, y, sample_weight, alpha):
            new_size = int(X.shape[0] * alpha + 0.5)
            rnd = numpy.random.randint(0, X.shape[0] - 1, new_size)
            Xr = X[rnd]
            yr = y[rnd]
            sr = sample_weight[rnd] if sample_weight is not None else None
            return est.fit(Xr, yr, sr)

        self.estimators_ = \
            Parallel(n_jobs=self.n_jobs, verbose=verbose,
                     **_joblib_parallel_args(prefer='threads'))(
                delayed(_fit_piecewise_estimator)(
                    i, estimators[i], X, y, sample_weight, self.alpha)
                for i in loop)

        return self
Example #22
    def _build_estimators(self, X, columns):
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_binary)(
                self.svc, X, column,
                classes=["not %s" % self.label_binarizer_.classes_[i],
                         self.label_binarizer_.classes_[i]])
            for i, column in enumerate(columns))
Example #23
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        n_classes = self.classes_.shape[0]
        estimators_indices = list(
            zip(*(Parallel(n_jobs=self.n_jobs)(
                delayed(_fit_ovo_binary)(self.svc, X, y, self.classes_[i],
                                         self.classes_[j])
                for i in range(n_classes) for j in range(i + 1, n_classes)))))

        self.estimators_ = estimators_indices[0]
        return self
Example #24
    def predict_log_proba(self, X):
        """Predict class log-probabilities for X.
        The predicted class log-probabilities of an input sample are computed as
        the log of the mean predicted class probabilities of the base
        estimators in the ensemble.
        Parameters
        ----------
        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.
        Returns
        -------
        p : array of shape = [n_samples, n_classes]
            The class log-probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        check_is_fitted(self, "classes_")
        if hasattr(self.base_estimator_, "predict_log_proba"):
            # Check data
            X = check_array(
                X, accept_sparse=['csr', 'csc'], dtype=None,
                force_all_finite=False
            )

            if self.n_features_ != X.shape[1]:
                raise ValueError("Number of features of the model must "
                                 "match the input. Model n_features is {0} "
                                 "and input n_features is {1} "
                                 "".format(self.n_features_, X.shape[1]))

            # Parallel loop
            n_jobs, n_estimators, starts = _partition_estimators(
                self.n_estimators, self.n_jobs)

            all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
                delayed(_parallel_predict_log_proba)(
                    self.estimators_[starts[i]:starts[i + 1]],
                    self.estimators_features_[starts[i]:starts[i + 1]],
                    X,
                    self.n_classes_)
                for i in range(n_jobs))

            # Reduce
            log_proba = all_log_proba[0]

            for j in range(1, len(all_log_proba)):
                log_proba = np.logaddexp(log_proba, all_log_proba[j])

            log_proba -= np.log(self.n_estimators)

            return log_proba

        else:
            return np.log(self.predict_proba(X))
Example #25
    def predict_proba(self, X):
        """ Predict classes probabilities for the test data.

        Parameters
        ----------
        X : array, shape = [n_samples, n_features]
            The test input samples.

        Returns
        -------
        p : array, shape = [n_samples, n_classes]
            The predicted class probabilities for the test input samples.
        """

        # Check that fit has been called
        check_is_fitted(self, ['estimators_'])

        # Check X
        if self.missing_values == 'NMAR':
            X = check_array(X, dtype=np.float64, order="C", force_all_finite='allow-nan')
        else:
            X = check_array(X, dtype=np.float64, order="C")

        n_features = X.shape[1]
        if self.n_features_ != n_features:
            raise ValueError("X: number of features %s != number of features of the model %s, "
                             "must match."
                             % (n_features, self.n_features_))

        # Predict class probabilities for all decision trees
        # (embarrassingly parallel)
        ps = Parallel(n_jobs=self.n_jobs)(
            delayed(estimator.predict_proba)(X) for estimator in self.estimators_)

        # Predict class probabilities for the decision forest
        # as average of the class probabilities from all decision trees

        proba = sum(ps) / len(self.estimators_)  # reduce

        # Handle single-output and multi-outputs formatting
        n_classes_max = max(self.n_classes_)
        if self.n_outputs_ == 1:
            proba = np.reshape(proba, (-1, self.n_classes_[0]))
        else:
            proba = np.reshape(proba, (-1, self.n_outputs_, n_classes_max))

        return proba
Example #26
def grouped_mean(dists, group):
    n_jobs = 5
    mm1, mm2 = group.min(), group.max()
    aa = np.arange(mm1, mm2 + 2, (mm2 - mm1 + 1) / n_jobs).astype(int)
    aa = [(aa[i], aa[i + 1]) for i in range(aa.shape[0] - 1)]

    # print(aa)
    Parallel(n_jobs=n_jobs, backend='threading')(
        delayed(_jit_grouped_mean)(dists, m1, m2, group) for (m1, m2) \
            in aa)

    return dists
Example #27
def _decision_path(isolation_forest, X, n_jobs):
    # code from sklearn RandomForest.
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')
    indicators = Parallel(n_jobs=n_jobs,
                          **_joblib_parallel_args(prefer='threads'))(
                              delayed(parallel_helper)(
                                  tree, 'decision_path', X, check_input=False)
                              for tree in isolation_forest.estimators_)
    n_nodes = [0]
    n_nodes.extend([i.shape[1] for i in indicators])
    n_nodes_ptr = np.array(n_nodes).cumsum()
    indicators = sparse_hstack(indicators).tocsr()
    return indicators, n_nodes_ptr
Example #28
    def partial_fit(self, X, y, classes=None):
        """Partially fit underlying estimators
        Should be used when memory is inefficient to train all data.
        Chunks of data can be passed in several iteration.
        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Data.
        y : (sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
            Multi-class targets. An indicator matrix turns on multilabel
            classification.
        classes : array, shape (n_classes, )
            Classes across all calls to partial_fit.
            Can be obtained via `np.unique(y_all)`, where y_all is the
            target vector of the entire dataset.
            This argument is only required in the first call of partial_fit
            and can be omitted in the subsequent calls.
        Returns
        -------
        self
        """
        if _check_partial_fit_first_call(self, classes):
            if not hasattr(self.estimator, "partial_fit"):
                raise ValueError(("Base estimator {0}, doesn't have "
                                  "partial_fit method").format(self.estimator))
            self.estimators_ = [
                clone(self.estimator) for _ in range(self.n_classes_)
            ]

            # A sparse LabelBinarizer, with sparse_output=True, has been
            # shown to outperform or match a dense label binarizer in all
            # cases and has also resulted in less or equal memory consumption
            # in the fit_ovr function overall.
            self.label_binarizer_ = LabelBinarizer(sparse_output=True)
            self.label_binarizer_.fit(self.classes_)

        if len(np.setdiff1d(y, self.classes_)):
            raise ValueError(
                ("Mini-batch contains {0} while classes " +
                 "must be subset of {1}").format(np.unique(y), self.classes_))

        Y = self.label_binarizer_.transform(y)
        Y = Y.tocsc()
        columns = (col.toarray().ravel() for col in Y.T)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit_binary)(estimator, X, column)
            for estimator, column in zip(self.estimators_, columns))

        return self
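
The _partial_fit_binary worker above performs one incremental update of a per-class estimator on its binarised column; a sketch of the usual implementation (the fixed 0/1 class array reflects the binarised targets):

import numpy as np

def _partial_fit_binary(estimator, X, y):
    # One incremental update on a binarised column; the classes are always
    # 0/1 after label binarisation, so they can be passed as a constant.
    estimator.partial_fit(X, y, classes=np.array((0, 1)))
    return estimator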
Example #29
    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Incrementally fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets.

        classes : list of numpy arrays, shape (n_outputs)
            Each array holds the unique classes for one output in str/int.
            Can be obtained via
            ``[np.unique(y[:, i]) for i in range(y.shape[1])]``, where y is the
            target matrix of the entire dataset.
            This argument is required for the first call to partial_fit
            and can be omitted in the subsequent calls.
            Note that y doesn't need to contain all labels in `classes`.

        sample_weight : array-like, shape = (n_samples) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying regressor supports sample
            weights.

        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi-output regression but has only one.")

        for i in range(y.shape[1]):
            if (sample_weight is not None and not has_fit_parameter(
                    self.estimators[i], 'sample_weight')):
                raise ValueError(f"Underlying estimator {i} does not support"
                                 " sample weights.")

        first_time = not hasattr(self, 'estimators_')

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_partial_fit_estimator)
            (self.estimators_[i] if not first_time else self.estimators[i], X,
             y[:, i], classes[i] if classes is not None else None,
             sample_weight, first_time) for i in range(y.shape[1]))
        return self
Example #30
    def fit(self, X, y, sample_weight=None):
        """ Fit the model to data.
        Fit a separate model for each output variable.

        Parameters
        ----------
        X : (sparse) array-like, shape (n_samples, n_features)
            Data.

        y : (sparse) array-like, shape (n_samples, n_outputs)
            Multi-output targets. An indicator matrix turns on multilabel
            estimation.

        sample_weight : array-like, shape = (n_samples) or None
            Sample weights. If None, then samples are equally weighted.
            Only supported if the underlying regressor supports sample
            weights.

        Returns
        -------
        self : object
        """

        if not hasattr(self.estimator, "fit"):
            raise ValueError(
                "The base estimator should implement a fit method")

        X, y = check_X_y(X,
                         y,
                         multi_output=True,
                         accept_sparse=True,
                         dtype="object")

        if is_classifier(self):
            check_classification_targets(y)

        if y.ndim == 1:
            raise ValueError("y must have at least two dimensions for "
                             "multi-output regression but has only one.")

        if (sample_weight is not None
                and not has_fit_parameter(self.estimator, 'sample_weight')):
            raise ValueError("Underlying estimator does not support"
                             " sample weights.")

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_estimator)(self.estimator, X, y[:, i], sample_weight)
            for i in range(y.shape[1]))
        return self
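
The _fit_estimator worker clones the shared base estimator and fits it on a single output column; a minimal sketch:

from sklearn.base import clone

def _fit_estimator(estimator, X, y, sample_weight=None):
    # Clone so that each output column gets an independent model.
    estimator = clone(estimator)
    if sample_weight is not None:
        estimator.fit(X, y, sample_weight=sample_weight)
    else:
        estimator.fit(X, y)
    return estimator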
Example #31
    def fit(self, X, y, groups=None):
        """Fit the RFE model and automatically tune the number of selected
           features.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the total number of features.
        y : array-like, shape = [n_samples]
            Target values (integers for classification, real numbers for
            regression).
        groups : array-like, shape = [n_samples], optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """
        if type(self.step) is not list:
            return super(DyRFECV, self).fit(X, y, groups)

        X, y = check_X_y(X, y, "csr")

        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        step = []
        for s in self.step:
            if 0.0 < s < 1.0:
                step.append(int(max(1, s * n_features)))
            else:
                step.append(int(s))
            if s <= 0:
                raise ValueError("Step must be >0")

        # Build an RFE object, which will evaluate and score each possible
        # feature count, down to self.min_features_to_select
        rfe = DyRFE(estimator=self.estimator,
                    n_features_to_select=self.min_features_to_select,
                    step=self.step, verbose=self.verbose)

        # Determine the number of subsets of features by fitting across
        # the train folds and choosing the "features_to_select" parameter
        # that gives the least averaged error across all folds.

        # Note that joblib raises a non-picklable error for bound methods
        # even if n_jobs is set to 1 with the default multiprocessing
        # backend.
        # This branching is done to make sure that user code that sets
        # n_jobs to 1 and provides bound methods as scorers is not broken
        # by the addition of the n_jobs parameter in version 0.18.

        if effective_n_jobs(self.n_jobs) == 1:
            parallel, func = list, _rfe_single_fit
        else:
            parallel = Parallel(n_jobs=self.n_jobs)
            func = delayed(_rfe_single_fit)

        scores = parallel(
            func(rfe, self.estimator, X, y, train, test, scorer)
            for train, test in cv.split(X, y, groups))

        scores = np.sum(scores, axis=0)
        diff = int(scores.shape[0]) - len(step)
        if diff > 0:
            step = np.r_[step, [step[-1]] * diff]
        scores_rev = scores[::-1]
        argmax_idx = len(scores) - np.argmax(scores_rev) - 1
        n_features_to_select = max(
            n_features - sum(step[:argmax_idx]),
            self.min_features_to_select)

        # Re-execute an elimination with best_k over the whole set
        rfe = DyRFE(estimator=self.estimator,
                    n_features_to_select=n_features_to_select, step=self.step,
                    verbose=self.verbose)

        rfe.fit(X, y)

        # Set final attributes
        self.support_ = rfe.support_
        self.n_features_ = rfe.n_features_
        self.ranking_ = rfe.ranking_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(self.transform(X), y)

        # Fix a normalization error: n is equal to get_n_splits(X, y) - 1;
        # here, the scores are normalized by get_n_splits(X, y)
        self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y, groups)
        return self
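
All of the examples above follow the same joblib idiom: build a Parallel runner, wrap the worker in delayed, and pass it a generator of calls. A self-contained toy version of the pattern, independent of any of the classes above:

import numpy as np
from joblib import Parallel, delayed

def _square_sum(chunk):
    # Worker: any picklable function operating on one chunk of the data.
    return float(np.sum(chunk ** 2))

data = np.arange(1_000_000, dtype=np.float64)
chunks = np.array_split(data, 8)

# n_jobs=-1 uses all cores; the generator expression yields one delayed call per chunk.
partial_sums = Parallel(n_jobs=-1)(delayed(_square_sum)(chunk) for chunk in chunks)
total = sum(partial_sums)
print(total)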