Example #1
0
def test_reduce_futures(n_parts, client):
    """reduce() over a list of futures should equal the plain local sum."""
    def passthrough(value):
        return value

    futures = [client.submit(passthrough, idx) for idx in range(n_parts)]
    reduced = reduce(futures, sum)
    result = client.compute(reduced, sync=True)

    # Testing this gets the correct result for now.
    assert result == sum(range(n_parts))
Example #2
0
    def fit(self, X, y, classes=None):
        """
        Fit distributed Naive Bayes classifier model

        Parameters
        ----------

        X : dask.Array with blocks containing dense or sparse cupy arrays
        y : dask.Array with blocks containing cupy.ndarray
        classes : array-like containing unique class labels

        Returns
        -------

        cuml.dask.naive_bayes.MultinomialNB current model instance
        """

        # Only Dask.Array supported for now
        if not isinstance(X, dask.array.core.Array):
            raise ValueError("Only dask.Array is supported for X")

        if not isinstance(y, dask.array.core.Array):
            raise ValueError("Only dask.Array is supported for y")

        if len(X.chunks[1]) != 1:
            raise ValueError("X must be chunked by row only. "
                             "Multi-dimensional chunking is not supported")

        # Co-partition X and y so matching blocks end up on the same worker.
        data = DistributedDataHandler.create([X, y], self.client)

        # Derive the unique class labels from y when not supplied by caller.
        if classes is None:
            classes = self._unique(
                y.map_blocks(MultinomialNB._unique).compute())

        # Launch one local fit per GPU partition.
        counts = [
            self.client.submit(self._fit,
                               part,
                               classes,
                               self.kwargs,
                               pure=False)
            for _, part in data.gpu_futures
        ]

        # Combine the per-partition counts into a single model future.
        merged = reduce(counts,
                        self._merge_counts_to_model,
                        client=self.client)

        # Finalize log probabilities on the merged model.
        model = self.client.submit(self._update_log_probs, merged, pure=False)

        wait_and_raise_from_futures([model])

        self._set_internal_model(model)

        return self
Example #3
0
    def fit(self, X):

        """
        Fit distributed TFIDF Transformer

        Parameters
        ----------

        X : dask.Array with blocks containing dense or sparse cupy arrays

        Returns
        -------

        cuml.dask.feature_extraction.text.TfidfTransformer instance
        """

        # Only Dask.Array supported for now
        if not isinstance(X, dask.array.core.Array):
            raise ValueError("Only dask.Array is supported for X")

        if len(X.chunks[1]) != 1:
            raise ValueError(
                "X must be chunked by row only. "
                "Multi-dimensional chunking is not supported"
            )

        # Nothing to fit unless idf weighting is enabled.
        if not self.internal_model.use_idf:
            return self

        data = DistributedDataHandler.create(X, self.client)

        # Gather per-partition document statistics on each worker.
        stats = [
            self.client.submit(
                self._set_doc_stats, part, self.kwargs, pure=False
            )
            for _, part in data.gpu_futures
        ]

        # Merge the partition statistics into a single model future.
        merged = reduce(stats, self._merge_stats_to_model, client=self.client)

        wait_and_raise_from_futures([merged])

        # Compute the idf diagonal from the merged statistics.
        model = self.client.submit(self._set_idf_diag, merged, pure=False)

        wait_and_raise_from_futures([model])

        self._set_internal_model(model)

        return self
Example #4
0
def test_reduce_futures(n_parts, cluster):
    """reduce() over submitted futures should match a plain local sum."""
    def passthrough(value):
        return value

    client = Client(cluster)

    try:
        futures = [client.submit(passthrough, idx) for idx in range(n_parts)]
        reduced = reduce(futures, sum)
        result = client.compute(reduced, sync=True)

        # Testing this gets the correct result for now.
        assert result == sum(range(n_parts))
    finally:
        # Always release the client, even when the assertion fails.
        client.close()