Example #1
    def _get_mg_info(self, ddf):
        """
        Given a Dask cuDF, extract the number of dimensions (columns) and
        convert the pieces of the Dask cuDF into Numba device arrays, which
        can be passed into the kNN algorithm.
        :param ddf: Dask cuDF to distribute across the workers
        :return: list of (worker, gpu_data future) pairs and the column count
        """

        client = default_client()

        if isinstance(ddf, dd.DataFrame):
            cols = len(ddf.columns)
            parts = ddf.to_delayed()
            parts = client.compute(parts)
            yield wait(parts)
        else:
            raise Exception("Input should be a Dask DataFrame")

        key_to_part_dict = dict([(str(part.key), part) for part in parts])
        who_has = yield client.who_has(parts)

        worker_map = []
        for key, workers in who_has.items():
            worker = parse_host_port(first(workers))
            worker_map.append((worker, key_to_part_dict[key]))

        gpu_data = [(worker, client.submit(to_gpu_matrix, part,
                                           workers=[worker]))
                    for worker, part in worker_map]

        yield wait(gpu_data)

        raise gen.Return((gpu_data, cols))
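
The to_gpu_matrix helper submitted above is not part of this listing. A minimal, hypothetical sketch of what such a conversion might look like; it stages through host memory via pandas for simplicity, whereas the real helper presumably stays on-device:

import numpy as np
from numba import cuda


def to_gpu_matrix(df):
    # Hypothetical sketch: convert one cuDF partition into a contiguous
    # Numba device array, staging through host memory for simplicity.
    host = np.ascontiguousarray(df.to_pandas().values, dtype=np.float32)
    return cuda.to_device(host)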
Example #2
File: test_ucx.py  Project: isVoid/cuml
        def get_endpoints(addr_ports):
            # Create endpoints to all other workers
            ucx = get_worker()._ucx

            for address, port in addr_ports:
                if address != get_worker().address:
                    host, p = parse_host_port(address)
                    ucx.get_endpoint(host, port)
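
A hedged sketch of how get_endpoints might be driven from the client side; the ucx_ports mapping below is hypothetical and stands in for however the test publishes each worker's UCX listener port:

from dask.distributed import default_client

# Hypothetical driver: ucx_ports is assumed to map each worker address to
# the UCX listener port it published elsewhere in the test.
client = default_client()
addr_ports = [(addr, ucx_ports[addr])
              for addr in client.scheduler_info()["workers"]]

# Run get_endpoints on every worker so each one connects to all of its peers.
client.run(get_endpoints, addr_ports)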
Example #3
def hosts_to_parts(futures):
    """
    Builds an ordered dict mapping each (host, port) pair to its list
    of parts.
    :param futures: list of (worker, part) tuples
    :return: OrderedDict of {(host, port): [parts]}
    """
    w_to_p_map = OrderedDict()
    for w, p in futures:
        host, port = parse_host_port(w)
        host_key = (host, port)
        if host_key not in w_to_p_map:
            w_to_p_map[host_key] = []
        w_to_p_map[host_key].append(p)
    return w_to_p_map
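
A toy usage example for hosts_to_parts, with plain host:port strings as worker addresses and integers standing in for part futures:

futures = [("10.0.0.1:8786", 0),
           ("10.0.0.1:8786", 1),
           ("10.0.0.2:8786", 2)]

hosts_to_parts(futures)
# Roughly: OrderedDict([(("10.0.0.1", 8786), [0, 1]),
#                       (("10.0.0.2", 8786), [2])])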
Example #4
    def _build_host_dict(self, gpu_futures, client):

        who_has = client.who_has(gpu_futures)

        key_to_host_dict = {}
        for key in who_has:
            key_to_host_dict[key] = parse_host_port(who_has[key][0])

        hosts_to_key_dict = {}
        for key, host in key_to_host_dict.items():
            if host not in hosts_to_key_dict:
                hosts_to_key_dict[host] = set([key])
            else:
                hosts_to_key_dict[host].add(key)

        workers = [key[0] for key in list(who_has.values())]
        return build_host_dict(workers)
Example #5
    def _build_host_dict(gpu_futures, client):
        """
        Helper function to build a dictionary mapping each worker to the
        parts of the given futures that it currently holds.
        :param gpu_futures: futures whose locations should be looked up
        :param client: Dask client
        :return: host dictionary built from the workers holding the futures
        """
        who_has = client.who_has(gpu_futures)

        key_to_host_dict = {}
        for key in who_has:
            key_to_host_dict[key] = parse_host_port(who_has[key][0])

        hosts_to_key_dict = {}
        for key, host in key_to_host_dict.items():
            if host not in hosts_to_key_dict:
                hosts_to_key_dict[host] = set([key])
            else:
                hosts_to_key_dict[host].add(key)

        workers = [key[0] for key in list(who_has.values())]
        return build_host_dict(workers)
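
build_host_dict is called above but not shown in this listing. A minimal, hypothetical sketch of what such a helper might look like, assuming it simply groups the worker addresses by host using the same parse_host_port helper used throughout these examples:

from collections import OrderedDict


def build_host_dict(workers):
    # Hypothetical sketch: group worker addresses by host, collecting the
    # set of ports seen for each host.
    hosts = OrderedDict()
    for worker in workers:
        host, port = parse_host_port(worker)
        hosts.setdefault(host, set()).add(port)
    return hosts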
Example #6
    def _do_fit(self, X_df, y_df, dtype):

        client = default_client()

        # Find the locations of the parts of y_df so that the columns of
        # X_df can be distributed to match them
        loc_dict = {}
        yield wait(y_df)
        tt = yield client.who_has(y_df)
        location = tuple(tt.values())
        for i in range(X_df.npartitions):
            part_number = eval(list(tt.keys())[i])[1]
            loc_dict[part_number] = parse_host_port(str(location[i])[:-3])

        # Let's divide the columns evenly, matching the order of the labels
        part_size = ceil(X_df.shape[1] / X_df.npartitions)

        # We scatter delayed operations to gather columns on the workers
        scattered = []
        coefs = []
        for i in range(X_df.npartitions):
            up_limit = min((i + 1) * part_size, X_df.shape[1])
            cols = X_df.columns.values[i * part_size:up_limit]
            loc_cudf = X_df[cols]
            yield wait(loc_cudf)
            scattered.append(
                client.submit(preprocess_on_worker,
                              loc_cudf,
                              workers=[loc_dict[i]]))
            yield wait(scattered)
            coefs.append(
                client.submit(dev_array_on_worker,
                              up_limit - i * part_size,
                              dtype=dtype,
                              unique=np.random.randint(0, 1e6),
                              workers=[loc_dict[i]]))
            yield wait(coefs)
            del loc_cudf

        # Break apart Dask.array/dataframe into chunks/parts
        # data_parts = map(delayed, scattered)
        data_parts = scattered
        label_parts = y_df.to_delayed()
        coef_parts = coefs

        # Arrange parts into pairs.  This enforces co-locality
        parts = list(map(delayed, zip(data_parts, label_parts, coef_parts)))
        parts = client.compute(parts)  # Start computation in the background
        yield wait(parts)

        for part in parts:
            if part.status == 'error':
                yield part  # trigger error locally

        # A dict in the form of { part_key: part }
        key_to_part_dict = dict([(str(part.key), part) for part in parts])

        who_has = yield client.who_has(parts)

        worker_parts = {}
        for key, workers in who_has.items():
            worker = parse_host_port(first(workers))
            if worker not in worker_parts:
                worker_parts[worker] = []
            worker_parts[worker].append(key_to_part_dict[key])
        """
        Create IPC handles on each worker hosting input data
        """

        # Format of input_devarrays = ([(X, y)..], dev)
        input_devarrays = [(worker,
                            client.submit(fit_to_device_arrays,
                                          part,
                                          workers=[worker]))
                           for worker, part in worker_parts.items()]

        yield wait(input_devarrays)
        """
        Gather IPC handles for each worker and call _fit() on each worker
        containing data.
        """

        # The last worker is the only one that can have fewer items.
        exec_node = loc_dict[X_df.npartitions - 1]

        # Need to fetch parts on worker
        on_worker = list(filter(lambda x: x[0] == exec_node, input_devarrays))
        not_on_worker = list(
            filter(lambda x: x[0] != exec_node, input_devarrays))

        ipc_handles = [
            client.submit(get_input_ipc_handles, future, workers=[a_worker])
            for a_worker, future in not_on_worker
        ]

        raw_arrays = [future for a_worker, future in on_worker]

        # IPC handles are loaded in separate threads on the worker so they can
        # be used to make calls through Cython.
        # Calls _fit_on_worker, defined at the bottom of the module.
        intercept = client.submit(_fit_on_worker, (ipc_handles, raw_arrays),
                                  self._build_params_map(),
                                  workers=[exec_node])

        yield wait(intercept)

        coef_series = [
            client.submit(coef_on_worker,
                          coefs[i],
                          i,
                          X_df.shape[1],
                          X_df.npartitions,
                          loc_dict[i],
                          workers=[loc_dict[i]]) for i in range(len(loc_dict))
        ]

        # coef_on_worker(self, coef, locations, ncols, nparts, worker):

        raise gen.Return((coef_series, intercept, loc_dict))
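
_do_fit is a tornado-style generator coroutine, so it has to be driven by the event loop rather than called directly. A hedged sketch of the kind of public wrapper that might drive it; the fit method, attribute names, and default dtype below are assumptions:

    def fit(self, X_df, y_df):
        # Hypothetical public wrapper: _do_fit is assumed to be decorated with
        # @gen.coroutine in the full source, so it is driven through the
        # client's event loop rather than called directly.
        client = default_client()
        dtype = np.float32  # assumed default dtype
        self.coefs, self.intercept, self.loc_dict = client.sync(
            self._do_fit, X_df, y_df, dtype)
        return self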
Example #7
    def _do_predict(self, X_df, coefs, loc_dict, intercept, dtype):
        client = default_client()

        part_size = ceil(X_df.shape[1] / X_df.npartitions)

        # We scatter delayed operations to gather columns on the workers
        scattered = []
        for i in range(X_df.npartitions):
            up_limit = min((i + 1) * part_size, X_df.shape[1])
            cols = X_df.columns.values[i * part_size:up_limit]
            loc_cudf = X_df[cols]
            yield wait(loc_cudf)
            scattered.append(
                client.submit(preprocess_predict,
                              loc_cudf,
                              workers=[loc_dict[i]]))
            yield wait(scattered)
            del loc_cudf

        # Break apart Dask.array/dataframe into chunks/parts
        data_parts = scattered
        coef_parts = coefs.to_delayed()

        # Arrange parts into pairs.  This enforces co-locality
        parts = list(map(delayed, zip(data_parts, coef_parts)))
        parts = client.compute(parts)  # Start computation in the background
        yield wait(parts)

        for part in parts:
            if part.status == 'error':
                yield part  # trigger error locally

        # A dict in the form of { part_key: part }
        key_to_part_dict = dict([(str(part.key), part) for part in parts])

        who_has = yield client.who_has(parts)

        worker_parts = {}
        for key, workers in who_has.items():
            worker = parse_host_port(first(workers))
            if worker not in worker_parts:
                worker_parts[worker] = []
            worker_parts[worker].append(key_to_part_dict[key])
        """
        Create IPC handles on each worker hosting input data
        """

        # Format of input_devarrays = ([(X, y)..], dev)
        input_devarrays = [(worker,
                            client.submit(predict_to_device_arrays,
                                          part,
                                          worker,
                                          loc_dict,
                                          X_df.npartitions,
                                          dtype=dtype,
                                          workers=[worker]))
                           for worker, part in worker_parts.items()]

        yield wait(input_devarrays)
        """
        Gather IPC handles for each worker and call _predict() on each worker
        containing data.
        """
        exec_node = loc_dict[X_df.npartitions - 1]

        # Need to fetch parts on worker
        on_worker = list(filter(lambda x: x[0] == exec_node, input_devarrays))
        not_on_worker = list(
            filter(lambda x: x[0] != exec_node, input_devarrays))

        ipc_handles = [
            client.submit(get_input_ipc_handles,
                          future,
                          unique=np.random.randint(0, 1e6),
                          workers=[a_worker])
            for a_worker, future in not_on_worker
        ]

        raw_arrays = [future for a_worker, future in on_worker]

        # IPC handles are loaded in separate threads on the worker so they can
        # be used to make calls through Cython.
        # Calls _predict_on_worker, defined at the bottom of the module.
        ret = client.submit(_predict_on_worker, (ipc_handles, raw_arrays),
                            self.intercept,
                            self._build_params_map(),
                            workers=[exec_node])

        yield wait(ret)

        dfs = [
            client.submit(series_on_worker,
                          f,
                          worker,
                          loc_dict,
                          X_df.npartitions,
                          X_df,
                          workers=[worker]) for worker, f in input_devarrays
        ]

        return dfs
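
The dfs returned above are per-worker futures of cuDF series. A minimal sketch of one way to materialize them locally on the client; a production path would more likely keep them distributed:

import cudf

# Hypothetical collection step: pull each per-worker prediction series back
# to the client and stitch them into a single local result.
local_parts = client.gather(dfs)
predictions = cudf.concat(local_parts)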
Example #8
    def _kneighbors(self, X, k):
        """
        Internal function to query the kNN model.
        :param X: Dask DataFrame of query vectors
        :param k: number of nearest neighbors to return (defaults to
                  self.n_neighbors when None)
        :return: list of futures of the per-worker result DataFrames
        """
        client = default_client()
        if k is None:
            k = self.n_neighbors

        # Break apart Dask.array/dataframe into chunks/parts
        data_parts = X.to_delayed()

        parts = list(map(delayed, data_parts))
        parts = client.compute(parts)  # Start computation in the background
        yield wait(parts)
        for part in parts:
            if part.status == 'error':
                yield part  # trigger error locally

        # A dict in the form of { part_key: part }
        key_to_part_dict = dict([(str(part.key), part) for part in parts])

        who_has = yield client.who_has(parts)

        worker_parts = {}
        for key, workers in who_has.items():
            worker = parse_host_port(first(workers))
            if worker not in worker_parts:
                worker_parts[worker] = []
            worker_parts[worker].append(key_to_part_dict[key])

        """
        Create IPC handles on each worker hosting input data
        """
        # Format of input_devarrays = ([(X, y)..], dev)
        input_devarrays = [(worker, client.submit(input_to_device_arrays, part,
                                                  {"k": k}, workers=[worker]))
                           for worker, part in worker_parts.items()]

        yield wait(input_devarrays)

        """
        Gather IPC handles for each worker and run the query on the worker
        holding the trained model.
        """
        exec_node, model = self.model

        # Need to fetch coefficient parts on worker
        on_worker = list(filter(lambda x: x[0] == exec_node, input_devarrays))
        not_on_worker = list(filter(lambda x: x[0] != exec_node,
                                    input_devarrays))

        ipc_handles = [client.submit(get_input_ipc_handles, future,
                                     workers=[a_worker])
                       for a_worker, future in not_on_worker]

        raw_arrays = [future for a_worker, future in on_worker]

        # IPC handles are loaded in separate threads on the worker so they can
        # be used to make calls through Cython.

        run = client.submit(_kneighbors_on_worker, (ipc_handles, raw_arrays),
                            model, {"k": k}, workers=[exec_node])
        yield wait(run)

        dfs = [client.submit(build_dask_dfs, f, {"k": k}, workers=[worker])
               for worker, f in input_devarrays]
        yield wait(dfs)

        raise gen.Return(dfs)
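
The get_input_ipc_handles / *_on_worker pattern in examples 6 through 8 relies on CUDA IPC so that one worker can read device arrays owned by peer processes on the same node. A rough sketch of the underlying Numba primitives, with hypothetical wrapper names; the real helpers are not shown in this listing:

from numba import cuda


def export_ipc_handle(device_array):
    # On the worker that owns the data: export a CUDA IPC handle for a
    # Numba device array.
    return device_array.get_ipc_handle()


def open_ipc_handle(handle):
    # On a peer process on the same node: open the handle to obtain a
    # device array view backed by the owner's GPU memory.
    return handle.open()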