Example #1
    def inverse_transform(self, y, delayed=True):
        """
        Convert the data back to the original representation.
        If unknown categories are encountered, ``None`` is used to
        represent them.

        Parameters
        ----------
        y : dask_cudf.Series
            The label codes to be converted back to the original categories.
        delayed : bool (default = True)
            Whether to execute as a delayed task or eagerly.

        Returns
        -------
        X_tr : dask_cudf.Series
            Distributed object containing the inverse transformed array.
        """
        if self._get_internal_model() is not None:
            return self._inverse_transform(y,
                                           delayed=delayed,
                                           output_collection_type='cudf')
        else:
            msg = ("This LabelEncoder instance is not fitted yet. Call 'fit' "
                   "with appropriate arguments before using this estimator.")
            raise NotFittedError(msg)
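
A minimal round-trip sketch (illustrative only: assumes a running Dask-CUDA
cluster and that `LabelEncoder` is imported from `cuml.dask.preprocessing`):

    # Hypothetical data; encode string labels, then recover the originals.
    import cudf
    import dask_cudf
    from cuml.dask.preprocessing import LabelEncoder

    labels = dask_cudf.from_cudf(cudf.Series(["a", "b", "a", "c"]),
                                 npartitions=2)
    le = LabelEncoder().fit(labels)
    codes = le.transform(labels)            # dask_cudf.Series of integer codes
    restored = le.inverse_transform(codes)  # back to the original strings
    print(restored.compute())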
Example #2
    def transform(self, y, delayed=True):
        """
        Transform an input into its categorical keys.

        This is intended for use with small inputs relative to the size of the
        dataset. For fitting and transforming an entire dataset, prefer
        `fit_transform`.

        Parameters
        ----------
        y : dask_cudf.Series
            Input keys to be transformed. Its values should match the
            categories given to `fit`.
        delayed : bool (default = True)
            Whether to execute as a delayed task or eagerly.

        Returns
        -------
        encoded : dask_cudf.Series
            The ordinally encoded input series.

        Raises
        ------
        KeyError
            If a category appears that was not seen in `fit`.
        """
        if self._get_internal_model() is not None:
            return self._transform(y,
                                   delayed=delayed,
                                   output_dtype='int32',
                                   output_collection_type='cudf')
        else:
            msg = ("This LabelEncoder instance is not fitted yet. Call 'fit' "
                   "with appropriate arguments before using this estimator.")
            raise NotFittedError(msg)
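
As the docstring warns, categories unseen at `fit` time surface as a
KeyError; a hedged sketch (same assumed Dask-CUDA setup as above):

    import cudf
    import dask_cudf
    from cuml.dask.preprocessing import LabelEncoder

    le = LabelEncoder().fit(
        dask_cudf.from_cudf(cudf.Series(["a", "b", "c"]), npartitions=1))
    try:
        le.transform(
            dask_cudf.from_cudf(cudf.Series(["z"]), npartitions=1)).compute()
    except KeyError:
        print("'z' was not seen during fit")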
Example #3
    def sample(self, n_samples=1, random_state=None):
        """
        Generate random samples from the model.
        Currently, this is implemented only for gaussian and tophat kernels,
        and the Euclidean metric.

        Parameters
        ----------
        n_samples : int, default=1
            Number of samples to generate.
        random_state : int, cupy RandomState instance or None, default=None
            Determines random number generation used to generate
            random samples.

        Returns
        -------
        X : cupy array of shape (n_samples, n_features)
            List of samples.
        """
        if not hasattr(self, "X_"):
            raise NotFittedError()

        supported_kernels = ["gaussian", "tophat"]
        if (self.kernel not in supported_kernels
                or self.metric != "euclidean"):
            raise NotImplementedError(
                "Only {} kernels, and the euclidean"
                " metric are supported.".format(supported_kernels))

        if isinstance(random_state, cp.random.RandomState):
            rng = random_state
        else:
            rng = cp.random.RandomState(random_state)

        u = rng.uniform(0, 1, size=n_samples)
        if self.sample_weight_ is None:
            i = (u * self.X_.shape[0]).astype(np.int64)
        else:
            cumsum_weight = cp.cumsum(self.sample_weight_)
            sum_weight = cumsum_weight[-1]
            i = cp.searchsorted(cumsum_weight, u * sum_weight)
        if self.kernel == "gaussian":
            return cp.atleast_2d(rng.normal(self.X_[i], self.bandwidth))

        elif self.kernel == "tophat":
            # we first draw points from a d-dimensional normal distribution,
            # then use an incomplete gamma function to map them to a uniform
            # d-dimensional tophat distribution.
            has_scipy(raise_if_unavailable=True)
            dim = self.X_.shape[1]
            X = rng.normal(size=(n_samples, dim))
            s_sq = cp.einsum("ij,ij->i", X, X).get()

            # do this on the CPU because we don't have
            # a gammainc function readily available
            correction = cp.array(
                gammainc(0.5 * dim, 0.5 * s_sq)**(1.0 / dim) * self.bandwidth /
                np.sqrt(s_sq))
            return self.X_[i] + X * correction[:, np.newaxis]
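
The tophat branch maps Gaussian draws onto a uniform ball: gammainc(d/2,
r^2/2) is the CDF of the squared radius of a d-dimensional standard normal,
so raising it to the power 1/d and rescaling by the bandwidth yields points
uniform within the bandwidth ball. A CPU-only NumPy/SciPy sketch of the same
mapping (the function name is made up for illustration):

    import numpy as np
    from scipy.special import gammainc

    def tophat_sample(center, bandwidth, n_samples, rng):
        dim = center.shape[0]
        X = rng.normal(size=(n_samples, dim))
        s_sq = np.einsum("ij,ij->i", X, X)
        # gammainc(dim/2, r^2/2) is uniform on (0, 1), so this correction
        # rescales each point to lie uniformly inside the bandwidth ball.
        correction = (gammainc(0.5 * dim, 0.5 * s_sq) ** (1.0 / dim)
                      * bandwidth / np.sqrt(s_sq))
        return center + X * correction[:, np.newaxis]

    print(tophat_sample(np.zeros(2), 1.0, 5, np.random.RandomState(0)))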
Example #4
    def transform(self, raw_documents):
        """
        Transform documents to document-term matrix.

        Extract token counts out of raw text documents using the vocabulary
        fitted with fit or the one provided to the constructor.

        Parameters
        ----------
        raw_documents : cudf.Series
           A Series of string documents

        Returns
        -------
        X : cupy csr array of shape (n_samples, n_features)
            Document-term matrix.
        """
        if not hasattr(self, "vocabulary_"):
            if self.vocabulary is not None:
                self.vocabulary_ = self.vocabulary
            else:
                raise NotFittedError()

        docs = self._preprocess(raw_documents)
        n_doc = len(docs)
        tokenized_df = self._create_tokenized_df(docs)
        count_df = self._count_vocab(tokenized_df)
        empty_doc_ids = self._compute_empty_doc_ids(count_df, n_doc)
        X = create_csr_matrix_from_count_df(count_df,
                                            empty_doc_ids,
                                            n_doc,
                                            len(self.vocabulary_),
                                            dtype=self.dtype)
        if self.binary:
            X.data.fill(1)
        return X
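
A short usage sketch (the documents are made up; assumes cuml exposes
CountVectorizer under cuml.feature_extraction.text):

    import cudf
    from cuml.feature_extraction.text import CountVectorizer

    docs = cudf.Series(["gpu text mining", "text mining on gpu"])
    cv = CountVectorizer().fit(docs)
    X = cv.transform(docs)  # cupy CSR matrix of shape (2, n_features)
    print(X.shape)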
Example #5
File: _tfidf.py  Project: vinaydes/cuml
 def _check_is_idf_fitted(self):
     if not hasattr(self, 'idf_'):
         msg = ("This TfidfTransformer instance is not fitted or the "
                "value of use_idf is not consistant between "
                ".fit() and .transform().")
         raise NotFittedError(msg)
Example #6
 def _check_is_fitted(self):
     if not self._fitted or self.train is None:
         msg = ("This LabelEncoder instance is not fitted yet. Call 'fit' "
                "with appropriate arguments before using this estimator.")
         raise NotFittedError(msg)
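
Examples #5 and #6 are the same guard-clause pattern: test a fit-time
attribute or flag and raise NotFittedError before doing any work. A
self-contained toy version of the pattern (the class name is made up;
sklearn's NotFittedError stands in for cuml's here):

    from sklearn.exceptions import NotFittedError

    class MiniEstimator:
        """Toy estimator illustrating the fitted-check guard clause."""

        def fit(self, X):
            self.classes_ = sorted(set(X))  # attribute marks the fitted state
            return self

        def transform(self, X):
            if not hasattr(self, "classes_"):
                raise NotFittedError(
                    "This MiniEstimator instance is not fitted yet. Call "
                    "'fit' with appropriate arguments before using this "
                    "estimator.")
            return [self.classes_.index(x) for x in X]

    print(MiniEstimator().fit(["b", "a"]).transform(["a", "b"]))  # [0, 1]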
Example #7
    def score_samples(self, X):
        """Compute the log-likelihood of each sample under the model.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features)
            An array of points to query.  Last dimension should match dimension
            of training data (n_features).

        Returns
        -------

        density : ndarray of shape (n_samples,)
            Log-likelihood of each sample in `X`. These are normalized to be
            probability densities, so values will be low for high-dimensional
            data.
        """
        if not hasattr(self, "X_"):
            raise NotFittedError()
        X_cuml = input_to_cuml_array(X)
        if self.metric_params:
            if len(self.metric_params) != 1:
                raise ValueError(
                    "Cuml only supports metrics with a single arg.")
            metric_arg = list(self.metric_params.values())[0]
            distances = pairwise_distances(X_cuml.array,
                                           self.X_,
                                           metric=self.metric,
                                           metric_arg=metric_arg)
        else:
            distances = pairwise_distances(X_cuml.array,
                                           self.X_,
                                           metric=self.metric)

        distances = cp.asarray(distances)

        h = self.bandwidth
        if self.kernel in log_probability_kernels_:
            distances = log_probability_kernels_[self.kernel](distances, h)
        else:
            raise ValueError("Unsupported kernel.")

        log_probabilities = cp.zeros(distances.shape[0])
        if self.sample_weight_ is not None:
            distances += cp.log(self.sample_weight_)

        logsumexp_kernel.forall(log_probabilities.size)(distances,
                                                        log_probabilities)
        # Note that sklearn's user guide is wrong:
        # it says the (unnormalised) probability output for
        # the kernel density is sum(K(x,h)).
        # In fact what they implement is (1/n)*sum(K(x,h)).
        # Here we divide by n in normal probability space,
        # which becomes -log(n) in log probability space.
        sum_weights = (cp.sum(self.sample_weight_) if self.sample_weight_
                       is not None else distances.shape[1])
        log_probabilities -= np.log(sum_weights)

        # norm
        if len(X_cuml.array.shape) == 1:
            # if X is one dimensional, we have 1 feature
            dimension = 1
        else:
            dimension = X_cuml.array.shape[1]
        log_probabilities = norm_log_probabilities(log_probabilities,
                                                   self.kernel, h, dimension)

        return log_probabilities
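
A hedged usage sketch (the data is synthetic; assumes KernelDensity is
importable from cuml.neighbors, as in recent cuml releases):

    import cupy as cp
    from cuml.neighbors import KernelDensity

    X_train = cp.random.RandomState(0).normal(size=(100, 2))
    kde = KernelDensity(kernel="gaussian", bandwidth=0.5).fit(X_train)
    print(kde.score_samples(X_train[:5]))  # log-densities, shape (5,)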