Example #1
0
File: dask.py  Project: vcarpani/xgboost
def _create_device_quantile_dmatrix(feature_names, feature_types, meta_names,
                                    missing, worker_map, max_bin):
    '''Build a ``DeviceQuantileDMatrix`` from the partitions on this worker.

    A worker that owns no partition of the distributed matrix receives an
    empty placeholder matrix so it can still take part in training.
    '''
    worker = distributed_get_worker()
    workers = set(worker_map.keys())
    if worker.address not in workers:
        LOGGER.warning(
            'worker {address} has an empty DMatrix.  '
            'All workers associated with this DMatrix: {workers}'.format(
                address=worker.address, workers=workers))
        import cupy  # pylint: disable=import-error
        # Zero-row placeholder keeps this worker in the collective calls.
        return DeviceQuantileDMatrix(cupy.zeros((0, 0)),
                                     feature_names=feature_names,
                                     feature_types=feature_types,
                                     max_bin=max_bin)

    (data, labels, weights, base_margin, label_lower_bound,
     label_upper_bound) = _get_worker_parts(worker_map, meta_names, worker)
    # Feed partitions lazily through an iterator instead of concatenating.
    part_iter = DaskPartitionIter(data=data,
                                  label=labels,
                                  weight=weights,
                                  base_margin=base_margin,
                                  label_lower_bound=label_lower_bound,
                                  label_upper_bound=label_upper_bound)
    return DeviceQuantileDMatrix(part_iter,
                                 missing=missing,
                                 feature_names=feature_names,
                                 feature_types=feature_types,
                                 nthread=worker.nthreads,
                                 max_bin=max_bin)
Example #2
0
    def dispatched_predict(worker_id):
        '''Run prediction over every data partition held by one worker.'''
        LOGGER.info('Predicting on %d', worker_id)

        worker = distributed_get_worker()
        booster.set_param({'nthread': worker.nthreads})
        parts = _get_worker_parts_ordered(
            has_margin, worker_map, partition_order, worker)
        results = []
        for part_data, part_margin, order in parts:
            local_part = DMatrix(
                part_data,
                base_margin=part_margin,
                feature_names=feature_names,
                feature_types=feature_types,
                missing=missing,
                nthread=worker.nthreads
            )
            # Empty shards skip feature-name validation.
            predt = booster.predict(
                data=local_part,
                validate_features=local_part.num_row() != 0,
                **kwargs)
            n_columns = predt.shape[1] if len(predt.shape) > 1 else 1
            results.append(((delayed(predt), n_columns), order))
        return results
Example #3
0
File: dask.py  Project: vcarpani/xgboost
 def dispatched_get_shape(worker_id):
     '''Return (shape, partition-order) pairs for this worker's data parts.'''
     LOGGER.info('Get shape on %d', worker_id)
     worker = distributed_get_worker()
     parts = _get_worker_parts_ordered(False, worker_map,
                                       partition_order, worker)
     shapes = []
     for part, _, order in parts:
         shapes.append((part.shape, order))
     return shapes
Example #4
0
File: dask.py  Project: vcarpani/xgboost
 def mapped_predict(partition, is_df):
     '''Predict on one data partition, returning the same frame flavour.

     When ``is_df`` is true the raw prediction array is wrapped in a
     cudf or pandas ``DataFrame`` matching the partition's origin.
     '''
     worker = distributed_get_worker()
     booster.set_param({'nthread': worker.nthreads})
     m = DMatrix(partition, missing=missing, nthread=worker.nthreads)
     predt = booster.predict(m, validate_features=False, **kwargs)
     if is_df:
         # BUG FIX: the module/class split was ('cudf',
         # 'core.dataframe.DataFrame'), which never matches; the sibling
         # in-place-predict path uses ('cudf.core.dataframe', 'DataFrame').
         if lazy_isinstance(partition, 'cudf.core.dataframe', 'DataFrame'):
             import cudf  # pylint: disable=import-error
             predt = cudf.DataFrame(predt, columns=['prediction'])
         else:
             predt = DataFrame(predt, columns=['prediction'])
     return predt
Example #5
0
File: dask.py  Project: vcarpani/xgboost
def _create_dmatrix(feature_names, feature_types, meta_names, missing,
                    worker_map):
    '''Assemble a DMatrix from the partitions that live on this worker.

      Returns
      -------
      A DMatrix object.

    '''
    worker = distributed_get_worker()
    workers = set(worker_map.keys())
    if worker.address not in workers:
        LOGGER.warning(
            'worker {address} has an empty DMatrix.  '
            'All workers associated with this DMatrix: {workers}'.format(
                address=worker.address, workers=workers))
        # Zero-row placeholder keeps this worker in the collective calls.
        return DMatrix(numpy.empty((0, 0)),
                       feature_names=feature_names,
                       feature_types=feature_types)

    def maybe_concat(parts):
        # Meta fields are optional; leave a missing one as None.
        return concat(parts) if parts is not None else None

    (data, labels, weights, base_margin, label_lower_bound,
     label_upper_bound) = _get_worker_parts(worker_map, meta_names, worker)

    dmatrix = DMatrix(concat(data),
                      maybe_concat(labels),
                      missing=missing,
                      feature_names=feature_names,
                      feature_types=feature_types,
                      nthread=worker.nthreads)
    dmatrix.set_info(base_margin=maybe_concat(base_margin),
                     weight=maybe_concat(weights),
                     label_lower_bound=maybe_concat(label_lower_bound),
                     label_upper_bound=maybe_concat(label_upper_bound))
    return dmatrix
Example #6
0
File: dask.py  Project: vcarpani/xgboost
 def mapped_predict(data, is_df):
     '''Run in-place prediction on one partition, keeping the frame type.'''
     worker = distributed_get_worker()
     booster.set_param({'nthread': worker.nthreads})
     prediction = booster.inplace_predict(data,
                                          iteration_range=iteration_range,
                                          predict_type=predict_type,
                                          missing=missing)
     if not is_df:
         return prediction
     if lazy_isinstance(data, 'cudf.core.dataframe', 'DataFrame'):
         import cudf  # pylint: disable=import-error
         return cudf.DataFrame({'prediction': prediction},
                               dtype=numpy.float32)
     # A partition coming from pandas arrives here as a numpy array.
     return DataFrame(prediction,
                      columns=['prediction'],
                      dtype=numpy.float32)
Example #7
0
File: dask.py  Project: vcarpani/xgboost
    def dispatched_train(worker_addr, rabit_args, dtrain_ref, evals_ref):
        '''Perform training on a single worker.  A local function prevents pickling.

        Returns a dict with the trained booster and its evals history, or
        ``None`` when this worker holds no training rows.
        '''
        LOGGER.info('Training on %s', str(worker_addr))
        worker = distributed_get_worker()
        with RabitContext(rabit_args):
            local_dtrain = _dmatrix_from_worker_map(**dtrain_ref)
            local_evals = []
            if evals_ref:
                for ref, name in evals_ref:
                    # Reuse the training matrix when an eval set shares its data.
                    if ref['worker_map'] == dtrain_ref['worker_map']:
                        local_evals.append((local_dtrain, name))
                        continue
                    local_evals.append((_dmatrix_from_worker_map(**ref), name))

            local_history = {}
            local_param = params.copy()  # just to be consistent
            msg = 'Overriding `nthreads` defined in dask worker.'
            if 'nthread' in local_param.keys() and \
               local_param['nthread'] is not None and \
               local_param['nthread'] != worker.nthreads:
                # BUG FIX: was `msg += '...' + msg`, which duplicated the
                # warning text; mirror the `n_jobs` branch below instead.
                msg = '`nthread` is specified.  ' + msg
                LOGGER.warning(msg)
            elif 'n_jobs' in local_param.keys() and \
                 local_param['n_jobs'] is not None and \
                 local_param['n_jobs'] != worker.nthreads:
                msg = '`n_jobs` is specified.  ' + msg
                LOGGER.warning(msg)
            else:
                local_param['nthread'] = worker.nthreads
            bst = worker_train(params=local_param,
                               dtrain=local_dtrain,
                               *args,
                               evals_result=local_history,
                               evals=local_evals,
                               early_stopping_rounds=early_stopping_rounds,
                               **kwargs)
            ret = {'booster': bst, 'history': local_history}
            # Workers without any training rows contribute no result.
            if local_dtrain.num_row() == 0:
                ret = None
            return ret
Example #8
0
File: dask.py  Project: vcarpani/xgboost
 def __init__(self, args):
     '''Store rabit args and append this worker's DMLC task id.

     NOTE(review): appends to the caller's ``args`` list in place — callers
     may observe the mutation; confirm before changing to a copy.
     '''
     worker = distributed_get_worker()
     task_id = 'DMLC_TASK_ID=[xgboost.dask]:' + str(worker.address)
     self.args = args
     self.args.append(task_id.encode())