Example No. 1
    def _query_models(self, n_neighbors,
                      comms, nn_models,
                      index_futures, query_futures):
        """
        Internal helper for a distributed kneighbors query: submits the
        worker-local kneighbors function to every worker holding index or
        query partitions, then regroups the resulting distance and index
        partitions.
        """

        worker_info = comms.worker_info(comms.worker_addresses)

        index_worker_to_parts = workers_to_parts(index_futures)
        query_worker_to_parts = workers_to_parts(query_futures)

        """
        Build inputs and outputs
        """
        idx_parts_to_ranks, idx_M = parts_to_ranks(self.client,
                                                   worker_info,
                                                   index_futures)

        query_parts_to_ranks, query_M = parts_to_ranks(self.client,
                                                       worker_info,
                                                       query_futures)

        """
        Invoke kneighbors on Dask workers to perform distributed query
        """

        key = uuid1()
        nn_fit = dict([(worker_info[worker]["r"], self.client.submit(
                        NearestNeighbors._func_kneighbors,
                        nn_models[worker],
                        index_worker_to_parts[worker] if
                        worker in index_worker_to_parts else [],
                        idx_M,
                        self.n_cols,
                        idx_parts_to_ranks,
                        query_worker_to_parts[worker] if
                        worker in query_worker_to_parts else [],
                        query_M,
                        query_parts_to_ranks,
                        worker_info[worker]["r"],
                        n_neighbors,
                        key="%s-%s" % (key, idx),
                        workers=[worker]))
                       for idx, worker in enumerate(comms.worker_addresses)])

        wait(list(nn_fit.values()))
        raise_exception_from_futures(list(nn_fit.values()))

        """
        Gather resulting partitions and return dask_cudfs
        """
        out_d_futures = flatten_grouped_results(self.client,
                                                query_parts_to_ranks,
                                                nn_fit,
                                                getter_func=_func_get_d)

        out_i_futures = flatten_grouped_results(self.client,
                                                query_parts_to_ranks,
                                                nn_fit,
                                                getter_func=_func_get_i)

        return nn_fit, out_d_futures, out_i_futures
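
The `_query_models` helper above is internal to the multi-node multi-GPU NearestNeighbors estimator: it submits the worker-local kneighbors function to every worker that holds index or query partitions and then regroups the distance and index partitions. A minimal, hedged sketch of how the public estimator built on top of it might be driven is shown below; the module path `cuml.dask.neighbors.NearestNeighbors` and the exact `kneighbors` signature are assumptions modeled on the single-GPU cuML API.

    # Hedged usage sketch; module path and call signatures are assumptions.
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    import cudf
    import dask_cudf
    from cuml.dask.neighbors import NearestNeighbors

    cluster = LocalCUDACluster()      # one Dask worker per local GPU
    client = Client(cluster)

    df = cudf.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                         "b": [4.0, 3.0, 2.0, 1.0]})
    X = dask_cudf.from_cudf(df, npartitions=2)

    nn = NearestNeighbors()
    nn.fit(X)
    # distances/indices come back as distributed collections,
    # one row per query row
    distances, indices = nn.kneighbors(X, n_neighbors=2)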
Example No. 2
    def _parallel_func(self, X, func):
        """
        Internal function that predicts the labels using a distributed
        KMeans model.

        Parameters
        ----------
        X : dask_cudf.Dataframe
            Dataframe to predict

        Returns
        -------
        result: dask_cudf.Dataframe
            Dataframe containing label predictions
        """

        key = uuid1()
        gpu_futures = self.client.sync(extract_ddf_partitions, X)
        worker_to_parts = workers_to_parts(gpu_futures)

        kmeans_predict = [
            self.client.submit(func,
                               self.local_model,
                               wf[1],
                               workers=[wf[0]],
                               key="%s-%s" % (key, idx))
            for idx, wf in enumerate(worker_to_parts.items())
        ]
        self.raise_exception_from_futures(kmeans_predict)

        return to_dask_cudf(kmeans_predict)
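
Because `_parallel_func` simply maps a worker-local function over the partitions of X and stitches the results back into a dask_cudf collection, public methods such as label prediction can be thin wrappers around it. The sketch below is illustrative only; `KMeans._func_predict` is a hypothetical per-worker helper name, not confirmed by the snippet above.

    def predict(self, X):
        # Sketch: delegate per-partition label prediction to _parallel_func.
        # KMeans._func_predict is a hypothetical worker-local helper.
        return self._parallel_func(X, KMeans._func_predict)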
Example No. 3
    def score(self, X):
        """
        Compute the score of the fitted distributed KMeans model on X by
        summing the scores returned for each worker's partitions.
        """

        key = uuid1()
        gpu_futures = self.client.sync(extract_ddf_partitions, X)
        worker_to_parts = workers_to_parts(gpu_futures)

        # Submit the worker-local scoring function once per worker and
        # block on each result.
        scores = [
            self.client.submit(KMeans._func_score,
                               self.local_model,
                               wf[1],
                               workers=[wf[0]],
                               key="%s-%s" % (key, idx)).result()
            for idx, wf in enumerate(worker_to_parts.items())
        ]

        # The two negations cancel; this is simply the sum of the
        # per-partition scores.
        return -1 * np.sum(np.array(scores) * -1)
Example No. 4
    def _inverse_transform(self, X):
        """
        Internal helper that applies the distributed PCA inverse transform
        to the partitions of X and returns the result as a dask_cudf
        Dataframe.
        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        worker_to_parts = workers_to_parts(gpu_futures)

        key = uuid1()
        partsToRanks = [(self.rnks[wf[0]],
                         self.client.submit(PCA._func_get_size,
                                            wf[1],
                                            workers=[wf[0]],
                                            key="%s-%s" % (key, idx)).result())
                        for idx, wf in enumerate(gpu_futures)]

        N = X.shape[1]
        M = reduce(lambda a, b: a + b, map(lambda x: x[1], partsToRanks))

        key = uuid1()
        pca_inverse_transform = dict([
            (self.rnks[wf[0]],
             self.client.submit(PCA._func_inverse_transform,
                                wf[1],
                                worker_to_parts[wf[0]],
                                M,
                                N,
                                partsToRanks,
                                self.rnks[wf[0]],
                                key="%s-%s" % (key, idx),
                                workers=[wf[0]]))
            for idx, wf in enumerate(self.pca_models)
        ])

        wait(list(pca_inverse_transform.values()))
        raise_exception_from_futures(list(pca_inverse_transform.values()))

        out_futures = []
        completed_part_map = {}
        for rank, size in partsToRanks:
            if rank not in completed_part_map:
                completed_part_map[rank] = 0

            f = pca_inverse_transform[rank]
            out_futures.append(
                self.client.submit(PCA._func_get_idx, f,
                                   completed_part_map[rank]))

            completed_part_map[rank] += 1

        return to_dask_cudf(out_futures)
Example No. 5
    def fit(self, X):
        """
        Fit a multi-node multi-GPU KMeans model

        Parameters
        ----------
        X : dask_cudf.Dataframe
            Dataframe containing the training samples

        Returns
        -------
        self: KMeans model
        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        worker_to_parts = workers_to_parts(gpu_futures)

        workers = list(map(lambda x: x[0], worker_to_parts.items()))

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=workers)

        key = uuid1()
        kmeans_fit = [
            self.client.submit(KMeans._func_fit,
                               comms.sessionId,
                               wf[1],
                               **self.kwargs,
                               workers=[wf[0]],
                               key="%s-%s" % (key, idx))
            for idx, wf in enumerate(worker_to_parts.items())
        ]

        wait(kmeans_fit)
        self.raise_exception_from_futures(kmeans_fit)

        comms.destroy()

        self.local_model = kmeans_fit[0].result()
        self.cluster_centers_ = self.local_model.cluster_centers_

        return self
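
A short end-to-end sketch tying this fit method together with the score method from Example No. 3; the module path `cuml.dask.cluster.KMeans` and the `n_clusters` keyword are assumptions carried over from the single-GPU estimator.

    # Hedged usage sketch; module path and constructor kwargs are assumptions.
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    import cudf
    import dask_cudf
    from cuml.dask.cluster import KMeans

    client = Client(LocalCUDACluster())

    df = cudf.DataFrame({"x": [1.0, 1.1, 5.0, 5.1],
                         "y": [1.0, 0.9, 5.0, 5.2]})
    X = dask_cudf.from_cudf(df, npartitions=2)

    km = KMeans(n_clusters=2)
    km.fit(X)
    print(km.cluster_centers_)   # gathered from the rank-0 local model
    print(km.score(X))           # sum of per-partition scores (Example No. 3)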
Example No. 6
    def fit(self, X, y, convert_dtype=False):
        """
        Fit the input data with a Random Forest classifier

        IMPORTANT: X is expected to be partitioned with at least one partition
        on each Dask worker being used by the forest (self.workers).

        If a worker has multiple data partitions, they will be concatenated
        before fitting, which will lead to additional memory usage. To minimize
        memory consumption, ensure that each worker has exactly one partition.

        When persisting data, you can use
        cuml.dask.common.utils.persist_across_workers to simplify this::

            X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
            y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
            X_dask_cudf, y_dask_cudf = persist_across_workers(dask_client,
                                                              [X_dask_cudf,
                                                               y_dask_cudf])

        (this is equivalent to calling `persist` with the data and workers)::

            X_dask_cudf, y_dask_cudf = dask_client.persist([X_dask_cudf,
                                                            y_dask_cudf],
                                                           workers={
                                                               X_dask_cudf: workers,
                                                               y_dask_cudf: workers
                                                           })

        Parameters
        ----------
        X : dask_cudf.Dataframe
            Dense matrix (floats or doubles) of shape (n_samples, n_features).
            Features of training examples.
        y : dask_cudf.Dataframe
            Dense matrix (floats or doubles) of shape (n_samples, 1).
            Labels of training examples.
            **y must be partitioned the same way as X**
        convert_dtype : bool, optional (default = False)
            When set to True, the fit method will, when necessary, convert
            y to be the same data type as X if they differ. This
            will increase memory used for the method.

        """

        c = default_client()

        self.num_classes = len(y.unique())
        X_futures = workers_to_parts(c.sync(extract_ddf_partitions, X))
        y_futures = workers_to_parts(c.sync(extract_ddf_partitions, y))

        X_partition_workers = [w for w, xc in X_futures.items()]
        y_partition_workers = [w for w, xc in y_futures.items()]

        if set(X_partition_workers) != set(self.workers) or \
           set(y_partition_workers) != set(self.workers):
            raise ValueError("""
              X is not partitioned on the same workers expected by RF\n
              X workers: %s\n
              y workers: %s\n
              RF workers: %s
            """ % (str(X_partition_workers),
                   str(y_partition_workers),
                   str(self.workers)))

        futures = list()
        for w, xc in X_futures.items():
            futures.append(
                c.submit(
                    RandomForestClassifier._fit,
                    self.rfs[w],
                    xc,
                    y_futures[w],
                    convert_dtype,
                    random.random(),
                    workers=[w],
                )
            )

        wait(futures)
        raise_exception_from_futures(futures)
        return self
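
Following the partitioning advice in the docstring, the sketch below prepares one partition per worker, persists the data with persist_across_workers, and fits the distributed forest. The module paths `cuml.dask.ensemble.RandomForestClassifier` and `cuml.dask.common.utils.persist_across_workers` are taken from the docstring; the `n_estimators` keyword and dtype choices are assumptions.

    # Hedged usage sketch; constructor kwargs and dtypes are assumptions.
    import numpy as np
    import cudf
    import dask_cudf
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from cuml.dask.ensemble import RandomForestClassifier
    from cuml.dask.common.utils import persist_across_workers

    client = Client(LocalCUDACluster())
    n_workers = len(client.scheduler_info()["workers"])

    X_cudf = cudf.DataFrame({"f0": np.random.rand(100).astype(np.float32),
                             "f1": np.random.rand(100).astype(np.float32)})
    y_cudf = cudf.Series(np.random.randint(0, 2, 100).astype(np.int32))

    # one partition per worker, as recommended above
    X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
    y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
    X_dask_cudf, y_dask_cudf = persist_across_workers(client,
                                                      [X_dask_cudf,
                                                       y_dask_cudf])

    rf = RandomForestClassifier(n_estimators=10)
    rf.fit(X_dask_cudf, y_dask_cudf)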
Example No. 7
    def fit(self, X, y):
        """
        Fit the input data with a Random Forest regression model

        IMPORTANT: X is expected to be partitioned with at least one partition
        on each Dask worker being used by the forest (self.workers).

        When persisting data, you can use
        cuml.dask.common.utils.persist_across_workers to simplify this::

            X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
            y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
            X_dask_cudf, y_dask_cudf = persist_across_workers(dask_client,
                                                              [X_dask_cudf,
                                                               y_dask_cudf])

        (this is equivalent to calling `persist` with the data and workers)::

            X_dask_cudf, y_dask_cudf = dask_client.persist([X_dask_cudf,
                                                            y_dask_cudf],
                                                           workers={
                                                               X_dask_cudf: workers,
                                                               y_dask_cudf: workers
                                                           })

        Parameters
        ----------
        X : dask_cudf.Dataframe
            Dense matrix (floats or doubles) of shape (n_samples, n_features).
            Features of training examples.

        y : dask_cudf.Dataframe
            Dense matrix (floats or doubles) of shape (n_samples, 1)
            Labels of training examples.
            y must be partitioned the same way as X
        """
        c = default_client()

        X_futures = workers_to_parts(c.sync(extract_ddf_partitions, X))
        y_futures = workers_to_parts(c.sync(extract_ddf_partitions, y))

        X_partition_workers = [w for w, xc in X_futures.items()]
        y_partition_workers = [w for w, xc in y_futures.items()]

        if set(X_partition_workers) != set(self.workers) or \
           set(y_partition_workers) != set(self.workers):
            raise ValueError("""
              X is not partitioned on the same workers expected by RF\n
              X workers: %s\n
              y workers: %s\n
              RF workers: %s
            """ % (str(X_partition_workers), str(y_partition_workers),
                   str(self.workers)))

        futures = list()
        for w, xc in X_futures.items():
            futures.append(
                c.submit(
                    RandomForestRegressor._fit,
                    self.rfs[w],
                    xc,
                    y_futures[w],
                    random.random(),
                    workers=[w],
                ))

        wait(futures)
        raise_exception_from_futures(futures)

        return self
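
The regression variant is driven the same way; this sketch uses `dask_client.persist` directly, as in the docstring's equivalent form, and assumes the `cuml.dask.ensemble.RandomForestRegressor` module path plus a `predict` method analogous to the single-GPU API.

    # Hedged usage sketch; module path, kwargs and predict() are assumptions.
    import numpy as np
    import cudf
    import dask_cudf
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from cuml.dask.ensemble import RandomForestRegressor

    client = Client(LocalCUDACluster())
    n_workers = len(client.scheduler_info()["workers"])

    X_cudf = cudf.DataFrame({"f0": np.random.rand(100).astype(np.float32)})
    y_cudf = cudf.Series(np.random.rand(100).astype(np.float32))

    X_dask_cudf = dask_cudf.from_cudf(X_cudf, npartitions=n_workers)
    y_dask_cudf = dask_cudf.from_cudf(y_cudf, npartitions=n_workers)
    X_dask_cudf, y_dask_cudf = client.persist([X_dask_cudf, y_dask_cudf])

    rfr = RandomForestRegressor(n_estimators=10)
    rfr.fit(X_dask_cudf, y_dask_cudf)
    predictions = rfr.predict(X_dask_cudf)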
Example No. 8
    def fit(self, X, _transform=False):
        """
        Fit the model with X.

        Parameters
        ----------
        X : dask_cudf.Dataframe
            Dataframe containing the data to fit
        _transform : bool, optional (default = False)
            When True, the fitted transformation of X is also computed and
            returned as a dask_cudf Dataframe

        """
        gpu_futures = self.client.sync(extract_ddf_partitions, X)

        worker_to_parts = workers_to_parts(gpu_futures)

        workers = list(map(lambda x: x[0], gpu_futures))

        comms = CommsContext(comms_p2p=False)
        comms.init(workers=workers)

        worker_info = comms.worker_info(comms.worker_addresses)

        self.rnks = {w: worker_info[w]["r"] for w in workers}

        key = uuid1()
        partsToRanks = [(worker_info[wf[0]]["r"],
                         self.client.submit(PCA._func_get_size,
                                            wf[1],
                                            workers=[wf[0]],
                                            key="%s-%s" % (key, idx)).result())
                        for idx, wf in enumerate(gpu_futures)]

        N = X.shape[1]
        M = reduce(lambda a, b: a + b, map(lambda x: x[1], partsToRanks))

        key = uuid1()
        self.pca_models = [(wf[0],
                            self.client.submit(PCA._func_create_model,
                                               comms.sessionId,
                                               wf[1],
                                               **self.kwargs,
                                               workers=[wf[0]],
                                               key="%s-%s" % (key, idx)))
                           for idx, wf in enumerate(worker_to_parts.items())]

        key = uuid1()
        pca_fit = dict([(worker_info[wf[0]]["r"],
                         self.client.submit(PCA._func_fit,
                                            wf[1],
                                            M,
                                            N,
                                            partsToRanks,
                                            worker_info[wf[0]]["r"],
                                            _transform,
                                            key="%s-%s" % (key, idx),
                                            workers=[wf[0]]))
                        for idx, wf in enumerate(self.pca_models)])

        wait(list(pca_fit.values()))
        raise_exception_from_futures(list(pca_fit.values()))

        comms.destroy()

        self.local_model = self.client.submit(PCA._func_get_first,
                                              self.pca_models[0][1]).result()

        self.components_ = self.local_model.components_
        self.explained_variance_ = self.local_model.explained_variance_
        self.explained_variance_ratio_ = \
            self.local_model.explained_variance_ratio_
        self.singular_values_ = self.local_model.singular_values_
        self.noise_variance = self.local_model.noise_variance_

        out_futures = []
        if _transform:
            completed_part_map = {}
            for rank, size in partsToRanks:
                if rank not in completed_part_map:
                    completed_part_map[rank] = 0

                f = pca_fit[rank]
                out_futures.append(
                    self.client.submit(PCA._func_get_idx, f,
                                       completed_part_map[rank]))

                completed_part_map[rank] += 1

            return to_dask_cudf(out_futures)
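
Finally, a hedged usage sketch for the distributed PCA estimator above, also exercising the inverse transform from Example No. 4. The module path `cuml.dask.decomposition.PCA`, the `n_components` keyword, and the public `fit_transform`/`inverse_transform` wrappers around the private methods shown in these examples are assumptions.

    # Hedged usage sketch; module path, kwargs and the public method names
    # wrapping the private fit/_inverse_transform shown above are assumptions.
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    import cudf
    import dask_cudf
    from cuml.dask.decomposition import PCA

    client = Client(LocalCUDACluster())

    df = cudf.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                         "b": [2.0, 4.0, 6.0, 8.0],
                         "c": [1.0, 1.0, 1.0, 1.0]})
    X = dask_cudf.from_cudf(df, npartitions=2)

    pca = PCA(n_components=2)
    X_t = pca.fit_transform(X)           # distributed fit + transform
    X_back = pca.inverse_transform(X_t)  # map back to the original space
    print(pca.explained_variance_ratio_)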