def test_from_partitions(axis, index, columns, row_lengths, column_widths):
    """Round-trip two pandas frames through ``from_partitions`` with explicit
    index/columns/lengths metadata and compare against a plain pandas concat.

    The string sentinels "index"/"columns" select real labels; anything else
    passes ``None`` so ``from_partitions`` must infer the labels itself.
    """
    n_rows, n_cols = 2**16, 2**8
    values = np.random.randint(0, 100, size=(n_rows, n_cols))
    df1, df2 = pandas.DataFrame(values), pandas.DataFrame(values)
    expected_df = pandas.concat([df1, df2], axis=1 if axis is None else axis)

    # Replace the string sentinels with actual metadata (or None).
    index = expected_df.index if index == "index" else None
    columns = expected_df.columns if columns == "columns" else None
    if row_lengths is not None:
        # Two row chunks when concatenating along rows, one otherwise.
        row_lengths = [n_rows, n_rows] if axis == 0 else [n_rows]
    if column_widths is not None:
        # One column chunk when concatenating along rows, two otherwise.
        column_widths = [n_cols] if axis == 0 else [n_cols, n_cols]

    if Engine.get() == "Ray":
        parts = [ray.put(df1), ray.put(df2)]
        # axis=None expects a 2D layout of partitions.
        futures = [parts] if axis is None else parts
    if Engine.get() == "Dask":
        client = default_client()
        scattered = client.scatter([df1, df2], hash=False)
        futures = [scattered] if axis is None else scattered

    actual_df = from_partitions(
        futures,
        axis,
        index=index,
        columns=columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
    )
    df_equals(expected_df, actual_df)
def main(cpus_per_actor, num_actors):
    """Train a small xgboost_ray classifier on a Modin DataFrame built
    from Ray object-store partitions, then save the model to disk."""
    if not MODIN_INSTALLED:
        print("Modin is not installed or installed in a version that is not "
              "compatible with xgboost_ray (< 0.9.0).")
        return

    # Import modin after initializing Ray
    from modin.distributed.dataframe.pandas import from_partitions

    # Generate dataset: 32 rows of 4 features each.
    features = np.repeat(range(8), 16).reshape((32, 4))
    # Even numbers --> 0, odd numbers --> 1
    labels = np.tile(np.repeat(range(2), 4), 4)
    # Flip some bits to reduce max accuracy
    flip_idx = np.random.choice(32, size=6, replace=False)
    labels[flip_idx] = 1 - labels[flip_idx]

    frame = pd.DataFrame(features)
    frame["label"] = labels

    # Split into 4 partitions, put each in the object store, and wrap
    # them into a single Modin DataFrame.
    partitions = [ray.put(chunk) for chunk in np.split(frame, 4)]
    modin_df = from_partitions(partitions, axis=0)

    train_set = RayDMatrix(modin_df, "label")

    evals_result = {}
    # Set XGBoost config.
    xgboost_params = {
        "tree_method": "approx",
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    }

    # Train the classifier
    bst = train(
        params=xgboost_params,
        dtrain=train_set,
        evals=[(train_set, "train")],
        evals_result=evals_result,
        ray_params=RayParams(
            max_actor_restarts=0,
            gpus_per_actor=0,
            cpus_per_actor=cpus_per_actor,
            num_actors=num_actors),
        verbose_eval=False,
        num_boost_round=10)

    model_path = "modin.xgb"
    bst.save_model(model_path)
    print("Final training error: {:.4f}".format(
        evals_result["train"]["error"][-1]))
def _predict(
    booster,
    data,
    nthread: Optional[int] = cpu_count(),
    **kwargs,
):
    """Distributed prediction: ship row partitions of X to per-node actors,
    collect per-partition predictions, and rebuild a Modin DataFrame in the
    original partition order."""
    start = time.time()

    X, _ = data
    X_row_parts = unwrap_partitions(X, axis=0, get_ip=True)

    # Create remote actors; every actor must get at least one partition.
    actors = create_actors(nthread=nthread)
    assert len(actors) <= len(
        X_row_parts
    ), f"{len(X_row_parts)} row partitions couldn't be distributed between {len(actors)} nodes."

    # Split data across workers; remember which partition went where.
    order_of_parts = _split_data_across_actors(
        actors,
        lambda actor, *X: actor.set_predict_data.remote(*X),
        X_row_parts,
    )
    LOGGER.info(f"Data preparation time: {time.time() - start} s")

    start = time.time()
    # Predict on every actor. NOTE(review): when num_returns == 1 the result
    # is wrapped in a list — presumably ``_remote`` returns a bare ref in
    # that case rather than a list; confirm against the Ray API in use.
    predictions = []
    for ip, actor in actors.items():
        n_parts = len(order_of_parts[ip])
        refs = actor.predict._remote(
            args=(booster, ), kwargs=kwargs, num_returns=n_parts)
        predictions.append(refs if n_parts > 1 else [refs])

    # Pair each result ref with its original partition position, then
    # restore the global row order.
    indexed_results = []
    for ip, part_res in zip(actors, predictions):
        indexed_results.extend(zip(part_res, order_of_parts[ip]))
    indexed_results.sort(key=lambda pair: pair[1])
    results = [res for res, _ in indexed_results]

    result = from_partitions(results, 0).reset_index(drop=True)
    LOGGER.info(f"Prediction time: {time.time() - start} s")
    return result
def test_from_partitions(axis):
    """Rebuild a DataFrame from engine futures via ``from_partitions`` and
    compare it with a plain pandas concat of the same two frames."""
    values = np.random.randint(0, 100, size=(2**16, 2**8))
    df1, df2 = pandas.DataFrame(values), pandas.DataFrame(values)
    expected_df = pandas.concat([df1, df2], axis=1 if axis is None else axis)

    if Engine.get() == "Ray":
        parts = [ray.put(df1), ray.put(df2)]
        # axis=None expects a 2D layout of partitions.
        futures = [parts] if axis is None else parts
    if Engine.get() == "Dask":
        client = get_client()
        scattered = client.scatter([df1, df2], hash=False)
        futures = [scattered] if axis is None else scattered

    actual_df = from_partitions(futures, axis)
    df_equals(expected_df, actual_df)
def _predict(
    booster,
    data,
    num_actors,
    **kwargs,
):
    # Distributed prediction: spread row partitions over a pool of actors,
    # run predict on each, and assemble the results into a Modin DataFrame.
    start = time.time()

    X_row_parts, _ = data
    if isinstance(num_actors, int):
        num_actors = _get_num_actors(num_actors)
    else:
        num_actors = _get_num_actors("default_predict")
    # Never create more actors than there are partitions to serve.
    num_actors = min(num_actors, len(X_row_parts))

    # Create remote actors (backed by a placement group).
    actors, pg = create_actors(num_actors)

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X: actor.set_predict_data.remote(*X),
        X_row_parts,
        is_predict=True,
    )
    LOGGER.info(f"Data preparation time: {time.time() - start} s")

    start = time.time()
    booster = ray.put(booster)

    # Each actor returns a (ip, partition) pair of refs.
    predictions = [
        tuple(actor.predict._remote(args=(booster,), kwargs=kwargs,
                                    num_returns=2)) for actor in actors
    ]
    # Block until every partition is materialized before tearing down.
    ray.wait([part for _, part in predictions], num_returns=len(predictions))
    remove_placement_group(pg)

    result = from_partitions(predictions, 0)
    LOGGER.info(f"Prediction time: {time.time() - start} s")
    return result
def test_from_partitions_mismatched_labels(axis, index, columns):
    """``from_partitions`` must apply caller-supplied labels even when they
    differ from the labels stored inside the unwrapped partitions."""
    shape = (2**16, 2**8)
    expected_df = pd.DataFrame(np.random.randint(0, 100, size=shape))
    partitions = unwrap_partitions(expected_df, axis=axis)

    # Sentinel strings keep the original labels; anything else swaps in
    # fresh, mismatching labels.
    if index == "original_idx":
        index = expected_df.index
    else:
        index = [f"row{i}" for i in expected_df.index]
    if columns == "original_col":
        columns = expected_df.columns
    else:
        columns = [f"col{i}" for i in expected_df.columns]

    expected_df.index = index
    expected_df.columns = columns
    actual_df = from_partitions(
        partitions, axis=axis, index=index, columns=columns)
    df_equals(expected_df, actual_df)
def _predict(
    booster,
    data,
    **kwargs,
):
    """
    Run distributed prediction with a trained booster on Ray engine.

    During execution it runs ``xgb.predict`` on each worker for subset of
    `data` and creates Modin DataFrame with prediction results.

    Parameters
    ----------
    booster : xgboost.Booster
        A trained booster.
    data : modin.experimental.xgboost.DMatrix
        Input data used for prediction.
    **kwargs : dict
        Other parameters are the same as for ``xgboost.Booster.predict``.

    Returns
    -------
    modin.pandas.DataFrame
        Modin DataFrame with prediction results.
    """
    start = time.time()

    dmatrix_kwargs = data.get_dmatrix_params()

    # Get metadata from DMatrix
    input_index, input_columns, row_lengths = data.metadata

    # Probe the booster with one random row to learn how many columns a
    # prediction produces (1 when the output is a flat vector).
    def _infer_result_width(model, n_features, **predict_kwargs):
        rng = np.random.RandomState(777)
        probe = rng.randn(1, n_features)
        probe_preds = model.predict(
            xgb.DMatrix(probe), validate_features=False, **predict_kwargs)
        if len(probe_preds.shape) > 1:
            return probe_preds.shape[1]
        return 1

    width = _infer_result_width(booster, len(input_columns), **kwargs)
    new_columns = list(range(width))

    # Put common data in object store
    booster = ray.put(booster)
    new_columns_ref = ray.put(new_columns)

    prediction_refs = [
        _map_predict.remote(booster, part, new_columns_ref, dmatrix_kwargs,
                            **kwargs) for _, part in data.data
    ]
    predictions = from_partitions(
        prediction_refs,
        0,
        index=input_index,
        columns=new_columns,
        row_lengths=row_lengths,
        column_widths=[len(new_columns)],
    )
    LOGGER.info(f"Prediction time: {time.time() - start} s")
    return predictions
def _testModinAssignment(self, part_nodes, actor_nodes,
                         expected_actor_parts):
    """
    Check that Modin partitions created on specific nodes are assigned
    to the expected co-located actors.

    Parameters
    ----------
    part_nodes : sequence of int
        For each data partition, the index (into the alive-node list)
        of the node it is created on.
    actor_nodes : sequence of int
        For each actor, the index of the node it is placed on.
    expected_actor_parts : dict
        Maps actor rank to the list of partition IDs that actor must
        receive, in order.
    """
    node_ips = [
        node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]
    ]
    # Need at least as many alive nodes as the highest referenced index.
    if len(node_ips) < max(max(actor_nodes), max(part_nodes)) + 1:
        print("Not running on cluster, skipping rest of this test.")
        return

    actor_node_ips = [node_ips[nid] for nid in actor_nodes]
    part_node_ips = [node_ips[nid] for nid in part_nodes]

    # Initialize data frames on remote nodes
    # This way we can control which partition is on which node
    @ray.remote(num_cpus=0.1)
    def create_remote_df(arr):
        return ray.put(pd.DataFrame(arr))

    partitions = np.array_split(self.x, len(part_nodes))
    node_dfs: Sequence[ObjectRef] = ray.get([
        create_remote_df.options(resources={
            f"node:{pip}": 0.1
        }).remote(partitions[pid]) for pid, pip in enumerate(part_node_ips)
    ])
    node_ip_dfs = [(ray.put(part_node_ips[pid]), node_df)
                   for pid, node_df in enumerate(node_dfs)]

    # Create modin dataframe from distributed partitions
    from modin.distributed.dataframe.pandas import (from_partitions,
                                                    unwrap_partitions)

    modin_df = from_partitions(node_ip_dfs, axis=0)

    # Sanity check
    unwrapped = unwrap_partitions(modin_df, axis=0, get_ip=True)
    ip_objs, df_objs = zip(*unwrapped)

    try:
        self.assertSequenceEqual(
            [df[0][0] for df in partitions],
            [df[0][0] for df in ray.get(list(df_objs))],
            msg="Modin mixed up the partition order")
        self.assertSequenceEqual(
            part_node_ips,
            ray.get(list(ip_objs)),
            msg="Modin moved partitions to different IPs")
    except AssertionError as exc:
        print(f"Modin part of the test failed: {exc}")
        # Fix: this literal was broken by a raw newline inside the string
        # (a SyntaxError); rejoined as an implicit concatenation.
        print("This is a stochastic test failure. Ignoring the rest "
              "of this test.")
        return

    # Create ray actors
    actors = [
        RayXGBoostActor.options(resources={
            f"node:{nip}": 0.1
        }).remote(rank=rank, num_actors=len(actor_nodes))
        for rank, nip in enumerate(actor_node_ips)
    ]

    # Calculate shards
    _, actor_to_parts = Modin.get_actor_shards(modin_df, actors)

    for actor_rank, part_ids in expected_actor_parts.items():
        for i, part_id in enumerate(part_ids):
            assigned_df = ray.get(actor_to_parts[actor_rank][i])
            part_df = pd.DataFrame(partitions[part_id])
            self.assertTrue(
                assigned_df.equals(part_df),
                msg=f"Assignment failed: Actor rank {actor_rank}, "
                f"partition {i} is not partition with ID {part_id}.")
def _predict(
    booster,
    data,
    num_actors,
    **kwargs,
):
    """
    Run distributed prediction with a trained booster on Ray backend.

    During work it evenly distributes `data` between workers, runs
    xgb.predict on each worker for subset of `data` and creates Modin
    DataFrame with prediction results.

    Parameters
    ----------
    booster : xgboost.Booster
        A trained booster.
    data : modin.experimental.xgboost.DMatrix
        Input data used for prediction.
    num_actors : int, optional
        Number of actors for prediction. If unspecified, this value will
        be computed automatically.
    **kwargs : dist
        Other parameters are the same as `xgboost.Booster.predict`.

    Returns
    -------
    modin.pandas.DataFrame
        Modin DataFrame with prediction results.
    """
    start = time.time()

    X_row_parts, _ = data
    if isinstance(num_actors, int):
        num_actors = _get_num_actors(num_actors)
    else:
        num_actors = _get_num_actors("default_predict")
    # Never create more actors than there are partitions to serve.
    num_actors = min(num_actors, len(X_row_parts))

    # Create remote actors (backed by a placement group).
    actors, pg = create_actors(num_actors)

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X: actor.set_predict_data.remote(*X),
        X_row_parts,
        is_predict=True,
    )
    LOGGER.info(f"Data preparation time: {time.time() - start} s")

    start = time.time()
    booster = ray.put(booster)

    # Each actor yields a (ip, partition) pair of refs.
    predictions = [
        tuple(actor.predict.options(num_returns=2).remote(booster, **kwargs))
        for actor in actors
    ]
    # Block until every partition is materialized before tearing down.
    ray.wait([part for _, part in predictions], num_returns=len(predictions))
    remove_placement_group(pg)

    result = from_partitions(predictions, 0)
    LOGGER.info(f"Prediction time: {time.time() - start} s")
    return result