def _create_device_quantile_dmatrix(feature_names, feature_types,
                                    meta_names, missing, worker_map,
                                    max_bin):
    '''Build a ``DeviceQuantileDMatrix`` from the partitions stored on the
    current dask worker.  Workers that hold no partition get an empty
    placeholder matrix so they can still participate in training.'''
    worker = distributed_get_worker()
    mapped_workers = set(worker_map.keys())
    if worker.address not in mapped_workers:
        msg = 'worker {address} has an empty DMatrix. ' \
            'All workers associated with this DMatrix: {workers}'.format(
                address=worker.address, workers=mapped_workers)
        LOGGER.warning(msg)
        import cupy                 # pylint: disable=import-error
        # Zero-row placeholder keeps the collective protocol consistent.
        return DeviceQuantileDMatrix(cupy.zeros((0, 0)),
                                     feature_names=feature_names,
                                     feature_types=feature_types,
                                     max_bin=max_bin)

    (data, labels, weights, base_margin,
     label_lower_bound, label_upper_bound) = _get_worker_parts(
         worker_map, meta_names, worker)
    part_iter = DaskPartitionIter(data=data, label=labels, weight=weights,
                                  base_margin=base_margin,
                                  label_lower_bound=label_lower_bound,
                                  label_upper_bound=label_upper_bound)

    return DeviceQuantileDMatrix(part_iter,
                                 missing=missing,
                                 feature_names=feature_names,
                                 feature_types=feature_types,
                                 nthread=worker.nthreads,
                                 max_bin=max_bin)
def dispatched_predict(worker_id):
    '''Perform prediction on each worker.'''
    LOGGER.info('Predicting on %d', worker_id)
    worker = distributed_get_worker()
    parts = _get_worker_parts_ordered(
        has_margin, worker_map, partition_order, worker)
    booster.set_param({'nthread': worker.nthreads})

    results = []
    for data, base_margin, order in parts:
        local_part = DMatrix(
            data,
            base_margin=base_margin,
            feature_names=feature_names,
            feature_types=feature_types,
            missing=missing,
            nthread=worker.nthreads
        )
        # Feature validation is skipped for empty shards.
        predt = booster.predict(
            data=local_part,
            validate_features=local_part.num_row() != 0,
            **kwargs)
        n_columns = predt.shape[1] if len(predt.shape) > 1 else 1
        # Keep the partition order so results can be reassembled.
        results.append(((delayed(predt), n_columns), order))
    return results
def dispatched_get_shape(worker_id):
    '''Get shape of data in each worker.'''
    LOGGER.info('Get shape on %d', worker_id)
    worker = distributed_get_worker()
    parts = _get_worker_parts_ordered(
        False, worker_map, partition_order, worker)
    # Pair each partition's shape with its original ordering index.
    return [(part.shape, order) for part, _, order in parts]
def mapped_predict(partition, is_df):
    '''Run prediction for a single data partition on the local worker.

    Returns the raw prediction array, or a single-column ``prediction``
    DataFrame matching the partition's frame type when ``is_df`` is true.
    '''
    worker = distributed_get_worker()
    booster.set_param({'nthread': worker.nthreads})
    m = DMatrix(partition, missing=missing, nthread=worker.nthreads)
    predt = booster.predict(m, validate_features=False, **kwargs)
    if is_df:
        # FIX: module/name split now matches the other `mapped_predict`
        # in this file — cuDF's DataFrame lives in module
        # 'cudf.core.dataframe' with class name 'DataFrame'; the old
        # ('cudf', 'core.dataframe.DataFrame') split relied on the two
        # arguments being concatenated by `lazy_isinstance`.
        if lazy_isinstance(partition, 'cudf.core.dataframe', 'DataFrame'):
            import cudf  # pylint: disable=import-error
            predt = cudf.DataFrame(predt, columns=['prediction'])
        else:
            predt = DataFrame(predt, columns=['prediction'])
    return predt
def _create_dmatrix(feature_names, feature_types, meta_names, missing,
                    worker_map):
    '''Get data that local to worker from DaskDMatrix.

    Returns
    -------
    A DMatrix object.

    '''
    worker = distributed_get_worker()
    addresses = set(worker_map.keys())
    if worker.address not in addresses:
        msg = 'worker {address} has an empty DMatrix. ' \
            'All workers associated with this DMatrix: {workers}'.format(
                address=worker.address, workers=addresses)
        LOGGER.warning(msg)
        # Zero-row placeholder so this worker still joins training.
        return DMatrix(numpy.empty((0, 0)),
                       feature_names=feature_names,
                       feature_types=feature_types)

    def concat_or_none(parts):
        # Meta fields are optional; concatenate only when present.
        return concat(parts) if parts is not None else None

    (data, labels, weights, base_margin,
     label_lower_bound, label_upper_bound) = _get_worker_parts(
         worker_map, meta_names, worker)

    dmatrix = DMatrix(concat(data),
                      concat_or_none(labels),
                      missing=missing,
                      feature_names=feature_names,
                      feature_types=feature_types,
                      nthread=worker.nthreads)
    dmatrix.set_info(base_margin=concat_or_none(base_margin),
                     weight=concat_or_none(weights),
                     label_lower_bound=concat_or_none(label_lower_bound),
                     label_upper_bound=concat_or_none(label_upper_bound))
    return dmatrix
def mapped_predict(data, is_df):
    '''In-place prediction over one partition on the local worker.'''
    worker = distributed_get_worker()
    booster.set_param({'nthread': worker.nthreads})
    prediction = booster.inplace_predict(
        data,
        iteration_range=iteration_range,
        predict_type=predict_type,
        missing=missing)
    if not is_df:
        return prediction
    if lazy_isinstance(data, 'cudf.core.dataframe', 'DataFrame'):
        import cudf  # pylint: disable=import-error
        return cudf.DataFrame({'prediction': prediction},
                              dtype=numpy.float32)
    # If it's from pandas, the partition is a numpy array
    return DataFrame(prediction, columns=['prediction'],
                     dtype=numpy.float32)
def dispatched_train(worker_addr, rabit_args, dtrain_ref, evals_ref):
    '''Perform training on a single worker.  A local function prevents pickling.

    Returns a ``{'booster': ..., 'history': ...}`` dict, or ``None`` when
    this worker holds no training rows.
    '''
    LOGGER.info('Training on %s', str(worker_addr))
    worker = distributed_get_worker()
    with RabitContext(rabit_args):
        local_dtrain = _dmatrix_from_worker_map(**dtrain_ref)
        local_evals = []
        if evals_ref:
            for ref, name in evals_ref:
                # Reuse the training DMatrix when an eval set refers to
                # the same worker map instead of rebuilding it.
                if ref['worker_map'] == dtrain_ref['worker_map']:
                    local_evals.append((local_dtrain, name))
                    continue
                local_evals.append((_dmatrix_from_worker_map(**ref), name))

        local_history = {}
        local_param = params.copy()  # just to be consistent
        msg = 'Overriding `nthreads` defined in dask worker.'
        if 'nthread' in local_param.keys() and \
           local_param['nthread'] is not None and \
           local_param['nthread'] != worker.nthreads:
            # BUG FIX: previously `msg += '...' + msg`, which duplicated
            # the base message inside the warning.  Mirror the `n_jobs`
            # branch below: prepend the qualifier once.
            msg = '`nthread` is specified. ' + msg
            LOGGER.warning(msg)
        elif 'n_jobs' in local_param.keys() and \
             local_param['n_jobs'] is not None and \
             local_param['n_jobs'] != worker.nthreads:
            msg = '`n_jobs` is specified. ' + msg
            LOGGER.warning(msg)
        else:
            local_param['nthread'] = worker.nthreads
        bst = worker_train(params=local_param,
                           dtrain=local_dtrain,
                           *args,
                           evals_result=local_history,
                           evals=local_evals,
                           early_stopping_rounds=early_stopping_rounds,
                           **kwargs)
        ret = {'booster': bst, 'history': local_history}
        if local_dtrain.num_row() == 0:
            ret = None
        return ret
def __init__(self, args):
    # Copy the argument list: the original code aliased the caller's
    # list and then appended to it, mutating the caller's object as a
    # side effect of construction.
    self.args = list(args)
    worker = distributed_get_worker()
    # Tag this rabit participant with the dask worker address.
    self.args.append(
        ('DMLC_TASK_ID=[xgboost.dask]:' + str(worker.address)).encode())