def test_unwrap_partitions(axis):
    """Check that ``unwrap_partitions`` returns the frame's own partitions.

    Parameters
    ----------
    axis : None, 0 or 1
        With ``None`` the unwrapped result is compared element-wise against
        the internal partition grid of the frame. Otherwise the result is
        compared against materialized axis partitions built over ``axis ^ 1``.
    """
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df = pd.DataFrame(data)
    modin_frame = df._query_compiler._modin_frame

    if axis is None:
        expected_partitions = modin_frame._partitions
        actual_partitions = np.array(unwrap_partitions(df, axis=axis))
        # Both dimensions must be compared against the *actual* grid;
        # comparing expected with itself would always succeed.
        assert (
            expected_partitions.shape[0] == actual_partitions.shape[0]
            and expected_partitions.shape[1] == actual_partitions.shape[1]
        )
        engine = Engine.get()
        for row_idx in range(expected_partitions.shape[0]):
            for col_idx in range(expected_partitions.shape[1]):
                # The attribute holding the remote reference depends on
                # the execution engine.
                if engine == "Ray":
                    assert (
                        expected_partitions[row_idx][col_idx].oid
                        == actual_partitions[row_idx][col_idx]
                    )
                if engine == "Dask":
                    assert (
                        expected_partitions[row_idx][col_idx].future
                        == actual_partitions[row_idx][col_idx]
                    )
    else:
        expected_axis_partitions = modin_frame._partition_mgr_cls.axis_partition(
            modin_frame._partitions, axis ^ 1
        )
        expected_axis_partitions = [
            axis_partition.force_materialization().unwrap(squeeze=True)
            for axis_partition in expected_axis_partitions
        ]
        actual_axis_partitions = unwrap_partitions(df, axis=axis)
        assert len(expected_axis_partitions) == len(actual_axis_partitions)
        engine = Engine.get()
        for expected_part, actual_part in zip(
            expected_axis_partitions, actual_axis_partitions
        ):
            if engine in ["Ray", "Dask"]:
                df_equals(get_func(expected_part), get_func(actual_part))
def __init__(self, data, label):
    """Store the features and labels as unwrapped axis-0 partitions.

    Parameters
    ----------
    data : pd.DataFrame
        Feature data; unwrapped along axis 0 with ``get_ip=True``.
    label : pd.DataFrame or pd.Series
        Label data; unwrapped along axis 0.

    Raises
    ------
    AssertionError
        If ``data`` or ``label`` is not of an expected type.
    """
    assert isinstance(
        data, pd.DataFrame
    ), f"Type of `data` is {type(data)}, but expected {pd.DataFrame}."
    # The message must name `label`, the argument actually being checked.
    assert isinstance(
        label, (pd.DataFrame, pd.Series)
    ), f"Type of `label` is {type(label)}, but expected {pd.DataFrame} or {pd.Series}."

    self.data = unwrap_partitions(data, axis=0, get_ip=True)
    self.label = unwrap_partitions(label, axis=0)
def __init__(
    self,
    data,
    label=None,
    missing=None,
    silent=False,
    feature_names=None,
    feature_types=None,
    feature_weights=None,
    enable_categorical=None,
):
    """Store features, optional labels and matrix options.

    Parameters
    ----------
    data : pd.DataFrame
        Feature data; unwrapped along axis 0 with ``get_ip=True``.
        Columns of dtype ``object`` are rejected.
    label : pd.DataFrame or pd.Series, optional
        Label data; unwrapped along axis 0 when given.
    missing, silent, feature_names, feature_types, feature_weights, enable_categorical
        Stored unchanged on the instance under the same names.

    Raises
    ------
    AssertionError
        If ``data`` or a non-None ``label`` is not of an expected type.
    ValueError
        If any column of ``data`` has dtype ``object``.
    """
    assert isinstance(
        data, pd.DataFrame
    ), f"Type of `data` is {type(data)}, but expected {pd.DataFrame}."

    if label is not None:
        # The message must name `label`, the argument actually being checked.
        assert isinstance(
            label, (pd.DataFrame, pd.Series)
        ), f"Type of `label` is {type(label)}, but expected {pd.DataFrame} or {pd.Series}."
        self.label = unwrap_partitions(label, axis=0)
    else:
        self.label = None

    self.data = unwrap_partitions(data, axis=0, get_ip=True)

    self._n_rows = data.shape[0]
    self._n_cols = data.shape[1]

    for i, dtype in enumerate(data.dtypes):
        if dtype == "object":
            raise ValueError(f"Column {i} has unsupported data type {dtype}.")

    self.feature_names = feature_names
    self.feature_types = feature_types
    self.missing = missing
    self.silent = silent
    self.feature_weights = feature_weights
    self.enable_categorical = enable_categorical

    # Labels and partition row lengths of the source frame.
    self.metadata = (
        data.index,
        data.columns,
        data._query_compiler._modin_frame._row_lengths,
    )
def __init__(self, data, label=None):
    """Store features and optional labels as unwrapped axis-0 partitions.

    Parameters
    ----------
    data : pd.DataFrame
        Feature data; unwrapped along axis 0 with ``get_ip=True``.
    label : pd.DataFrame or pd.Series, optional
        Label data; unwrapped along axis 0 when given, otherwise
        ``self.label`` is ``None``.

    Raises
    ------
    AssertionError
        If ``data`` or a non-None ``label`` is not of an expected type.
    """
    assert isinstance(
        data, pd.DataFrame
    ), f"Type of `data` is {type(data)}, but expected {pd.DataFrame}."

    if label is not None:
        # The message must name `label`, the argument actually being checked.
        assert isinstance(
            label, (pd.DataFrame, pd.Series)
        ), f"Type of `label` is {type(label)}, but expected {pd.DataFrame} or {pd.Series}."
        self.label = unwrap_partitions(label, axis=0)
    else:
        self.label = None

    self.data = unwrap_partitions(data, axis=0, get_ip=True)

    # Labels and partition row lengths of the source frame.
    self.metadata = (
        data.index,
        data.columns,
        data._query_compiler._modin_frame._row_lengths,
    )
def _predict(
    booster,
    data,
    nthread: Optional[int] = cpu_count(),
    **kwargs,
):
    """Run a distributed prediction and return it as a single frame.

    Parameters
    ----------
    booster
        Model passed to every actor's ``predict`` call.
    data
        Two-element sequence; only the first element (the features) is used.
    nthread : int, optional
        Forwarded to ``create_actors``.
    **kwargs
        Forwarded to every actor's ``predict`` call.

    Returns
    -------
    Frame assembled with ``from_partitions`` from the per-partition results,
    ordered by the partition positions reported by
    ``_split_data_across_actors``, with the index reset.
    """
    started = time.time()

    X, _ = data
    X_row_parts = unwrap_partitions(X, axis=0, get_ip=True)

    # Create remote actors
    actors = create_actors(nthread=nthread)

    assert len(actors) <= len(
        X_row_parts
    ), f"{len(X_row_parts)} row partitions couldn't be distributed between {len(actors)} nodes."

    def _send_predict_data(actor, *parts):
        return actor.set_predict_data.remote(*parts)

    # Split data across workers
    order_of_parts = _split_data_across_actors(
        actors,
        _send_predict_data,
        X_row_parts,
    )

    LOGGER.info(f"Data preparation time: {time.time() - started} s")
    started = time.time()

    # Predict
    predictions = []
    for ip, actor in actors.items():
        n_parts = len(order_of_parts[ip])
        refs = actor.predict._remote(
            args=(booster,), kwargs=kwargs, num_returns=n_parts
        )
        # Normalize to a list so every entry can be zipped with its
        # partition positions below.
        predictions.append(refs if n_parts > 1 else [refs])

    # Pair every result with the original position of its partition.
    tagged_results = []
    for ip, part_res in zip(actors, predictions):
        tagged_results.extend(zip(part_res, order_of_parts[ip]))
    tagged_results.sort(key=lambda pair: pair[1])

    ordered_parts = [part_res for part_res, _ in tagged_results]
    result = from_partitions(ordered_parts, 0).reset_index(drop=True)
    LOGGER.info(f"Prediction time: {time.time() - started} s")

    return result
def test_from_partitions_mismatched_labels(axis, index, columns):
    """Check ``from_partitions`` when explicit index/column labels are given.

    ``index`` and ``columns`` select between the frame's original labels
    (``"original_idx"`` / ``"original_col"``) and generated string labels.
    """
    num_rows = 2**16
    num_cols = 2**8
    values = np.random.randint(0, 100, size=(num_rows, num_cols))
    expected_df = pd.DataFrame(values)
    partitions = unwrap_partitions(expected_df, axis=axis)

    if index == "original_idx":
        index = expected_df.index
    else:
        index = [f"row{i}" for i in expected_df.index]

    if columns == "original_col":
        columns = expected_df.columns
    else:
        columns = [f"col{i}" for i in expected_df.columns]

    # Relabel the reference frame the same way the rebuilt one is labelled.
    expected_df.index = index
    expected_df.columns = columns

    actual_df = from_partitions(
        partitions, axis=axis, index=index, columns=columns
    )
    df_equals(expected_df, actual_df)
def get_actor_shards(
        data: Any,  # modin.pandas.DataFrame
        actors: Sequence[ActorHandle]) -> \
        Tuple[Any, Optional[Dict[int, Any]]]:
    """Group the partitions of ``data`` by IP and assign them to actors.

    Returns a tuple whose first element is always ``None`` and whose second
    element is the result of ``assign_partitions_to_actors``.
    """
    _assert_modin_installed()

    from modin.distributed.dataframe.pandas import unwrap_partitions

    actor_rank_ips = get_actor_rank_ips(actors)

    # Each unwrapped entry is an (IP reference, partition reference) pair.
    ip_refs, part_refs = zip(*unwrap_partitions(data, axis=0, get_ip=True))
    resolved_ips = ray.get(list(ip_refs))

    # Table mapping each IP to the partitions it holds.
    ip_to_parts = defaultdict(list)
    for node_ip, part_ref in zip(resolved_ips, part_refs):
        ip_to_parts[node_ip].append(part_ref)

    # Modin dataframes are not serializable, so the first return value
    # is None.
    shards = assign_partitions_to_actors(ip_to_parts, actor_rank_ips)
    return None, shards
def _predict(
    booster,
    data,
    nthread: Optional[int] = cpu_count(),
    evenly_data_distribution: Optional[bool] = True,
    **kwargs,
):
    """Run a distributed prediction and concatenate the actors' results.

    Parameters
    ----------
    booster
        Model passed to every actor's ``predict`` call.
    data
        Two-element sequence; only the first element (the features) is used.
    nthread : int, optional
        Forwarded to ``create_actors``.
    evenly_data_distribution : bool, optional
        Forwarded to ``_split_data_across_actors``; its negation is passed
        as ``bind_ip`` when unwrapping the partitions.
    **kwargs
        Forwarded to every actor's ``predict`` call.

    Returns
    -------
    ``np.concatenate`` of the results fetched from all actors.
    """
    started = time.time()

    X, _ = data
    bind_ip = not evenly_data_distribution
    X_row_parts = unwrap_partitions(X, axis=0, bind_ip=bind_ip)

    # Create remote actors
    actors = create_actors(nthread=nthread)

    def _send_predict_data(actor, *parts):
        return actor.set_predict_data.remote(*parts)

    # Split data across workers
    _split_data_across_actors(
        actors,
        _send_predict_data,
        X_row_parts,
        evenly_data_distribution=evenly_data_distribution,
    )

    LOGGER.info(f"Data preparation time: {time.time() - started} s")
    started = time.time()

    # Predict
    pending = []
    for _, actor in actors.items():
        pending.append(actor.predict.remote(booster, **kwargs))
    fetched = ray.get(pending)
    LOGGER.info(f"Prediction time: {time.time() - started} s")

    return np.concatenate(fetched)
def assign_partitions_to_actors(data: Any, actor_rank_ips: Dict[int, str]) \
        -> Dict[int, Sequence[ObjectRef]]:
    """Distribute the partitions of a Modin dataframe across actors.

    Partitions are spread evenly over the actors while preferring
    co-located assignments to keep data transfer low. Partition sizes are
    not considered: every partition is treated as having roughly the same
    length, so only the partition count per actor is balanced. For
    example, 8 partitions and 3 actors yield 2 or 3 partitions per actor;
    which ones depends on where the data lives.

    Steps:

    1. Unwrap the dataframe into Ray object references together with the
       IP address each partition currently lives on.
    2. Compute the minimum and maximum number of partitions per actor
       (they differ by at most 1) and how many actors may hold the
       maximum.
    3. Give each actor up to ``max_parts_per_actor`` partitions from its
       own IP. At most ``num_actors_with_max_parts`` actors may reach the
       maximum; the others stop at the minimum.
    4. Hand the remaining partitions, which could not be placed on a
       co-located actor, to actors that still have capacity.

    Raises ``RuntimeError`` if partitions remain that no actor can take.
    """
    from modin.distributed.dataframe.pandas import unwrap_partitions

    ip_refs, part_refs = zip(*unwrap_partitions(data, axis=0, get_ip=True))

    # Table mapping each IP to the partitions it holds.
    ip_to_parts = defaultdict(list)
    for node_ip, part_ref in zip(ray.get(list(ip_refs)), part_refs):
        ip_to_parts[node_ip].append(part_ref)

    num_partitions = len(part_refs)
    num_actors = len(actor_rank_ips)
    parts_per_actor = num_partitions / num_actors
    min_parts_per_actor = max(0, math.floor(parts_per_actor))
    max_parts_per_actor = max(1, math.ceil(parts_per_actor))
    num_actors_with_max_parts = num_partitions % num_actors

    # Result: maps each actor rank to its list of partitions.
    actor_to_partitions = defaultdict(list)

    # Pass 1: repeatedly sweep over the actors, giving each one partition
    # from its own IP per sweep, until a full sweep assigns nothing.
    progressed = True
    while progressed:
        progressed = False

        for rank, actor_ip in actor_rank_ips.items():
            local_parts = ip_to_parts[actor_ip]
            owned = actor_to_partitions[rank]

            if not local_parts or len(owned) >= max_parts_per_actor:
                continue

            if len(owned) >= min_parts_per_actor:
                # Only `num_actors_with_max_parts` actors may exceed the
                # minimum and reach the maximum.
                if num_actors_with_max_parts <= 0:
                    continue
                num_actors_with_max_parts -= 1

            owned.append(local_parts.pop(0))
            progressed = True

    # Pass 2: whatever is left could not be placed on a co-located actor,
    # so it goes to any actor that still has capacity.
    rest_parts = list(itertools.chain(*ip_to_parts.values()))
    progressed = True
    while rest_parts and progressed:
        progressed = False
        for rank in actor_rank_ips:
            owned = actor_to_partitions[rank]
            if len(owned) < max_parts_per_actor:
                if len(owned) >= min_parts_per_actor:
                    if num_actors_with_max_parts <= 0:
                        continue
                    num_actors_with_max_parts -= 1
                owned.append(rest_parts.pop(0))
                progressed = True
            if not rest_parts:
                break

    if rest_parts:
        raise RuntimeError(
            "There are still partitions left to assign, but no actor "
            "has capacity for more. This is probably a bug. Please go "
            "to https://github.com/ray-project/xgboost_ray to report it.")

    return actor_to_partitions
def _train(
    dtrain,
    nthread,
    evenly_data_distribution,
    params: Dict,
    *args,
    evals=(),
    **kwargs,
):
    """Distribute the training data to actors and run training on them.

    Parameters
    ----------
    dtrain
        Two-element sequence ``(X, y)`` of equal length.
    nthread
        Forwarded to ``create_actors``.
    evenly_data_distribution
        Forwarded to ``_split_data_across_actors``; its negation is passed
        as ``bind_ip`` when unwrapping partitions.
    params : dict
        Forwarded to every actor's ``train`` call.
    *args, **kwargs
        Forwarded to every actor's ``train`` call.
    evals : sequence of ((X, y), method) pairs, optional
        Evaluation sets. An entry whose data *is* ``dtrain`` is not sent
        separately; its method is passed as ``add_as_eval_method`` along
        with the training data. The sequence is not modified.

    Returns
    -------
    The result fetched from the first actor's ``train`` call.

    Raises
    ------
    AssertionError
        If ``X`` and ``y`` differ in length or in number of row partitions.
    """
    s = time.time()

    X, y = dtrain
    assert len(X) == len(y)

    bind_ip = not evenly_data_distribution
    X_row_parts = unwrap_partitions(X, axis=0, bind_ip=bind_ip)
    y_row_parts = unwrap_partitions(y, axis=0, bind_ip=bind_ip)
    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    # Create remote actors
    actors = create_actors(nthread=nthread)

    add_as_eval_method = None
    if evals:
        # Filter by identity into a new list. `evals.remove(...)` would
        # fail on tuples, compare dataframes by equality, and mutate the
        # caller's sequence.
        separate_evals = []
        for eval_data, method in evals:
            if eval_data is dtrain:
                add_as_eval_method = method
            else:
                separate_evals.append((eval_data, method))

        for (eval_X, eval_y), eval_method in separate_evals:
            # Split data across workers
            _split_data_across_actors(
                actors,
                lambda actor, *X_y: actor.add_eval_data.remote(
                    *X_y, eval_method=eval_method),
                unwrap_partitions(eval_X, axis=0, bind_ip=bind_ip),
                unwrap_partitions(eval_y, axis=0, bind_ip=bind_ip),
                evenly_data_distribution=evenly_data_distribution,
            )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method),
        X_row_parts,
        y_row_parts,
        evenly_data_distribution=evenly_data_distribution,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    # Training runs inside the context manager so that the environment
    # it provides is available for the whole run.
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for _, actor in actors.items()
        ]

        # All results should be the same because of Rabit tracking. So we
        # just return the first one.
        result = ray.get(fut[0])
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result
def _testModinAssignment(self, part_nodes, actor_nodes, expected_actor_parts):
    """Check which partitions each actor receives for a given placement.

    ``part_nodes`` and ``actor_nodes`` are lists of indices into the list
    of alive cluster nodes; ``expected_actor_parts`` maps an actor rank to
    the partition IDs it is expected to be assigned, in order. The test
    returns early when the cluster has too few nodes or when the Modin
    sanity check fails.
    """
    alive_ips = [
        node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]
    ]
    highest_node_id = max(max(actor_nodes), max(part_nodes))
    if len(alive_ips) < highest_node_id + 1:
        print("Not running on cluster, skipping rest of this test.")
        return

    actor_node_ips = [alive_ips[nid] for nid in actor_nodes]
    part_node_ips = [alive_ips[nid] for nid in part_nodes]

    # Create the data frames on specific remote nodes so that the
    # location of every partition is known.
    @ray.remote(num_cpus=0.1)
    def create_remote_df(arr):
        return ray.put(pd.DataFrame(arr))

    partitions = np.array_split(self.x, len(part_nodes))
    pending_dfs = []
    for part_ip, chunk in zip(part_node_ips, partitions):
        pinned_task = create_remote_df.options(
            resources={f"node:{part_ip}": 0.1})
        pending_dfs.append(pinned_task.remote(chunk))
    node_dfs: Sequence[ObjectRef] = ray.get(pending_dfs)

    node_ip_dfs = [(ray.put(part_ip), node_df)
                   for part_ip, node_df in zip(part_node_ips, node_dfs)]

    # Build a Modin dataframe from the distributed partitions
    from modin.distributed.dataframe.pandas import (from_partitions,
                                                    unwrap_partitions)
    modin_df = from_partitions(node_ip_dfs, axis=0)

    # Sanity check: Modin must keep partition order and placement
    ip_refs, df_refs = zip(*unwrap_partitions(modin_df, axis=0, get_ip=True))

    try:
        self.assertSequenceEqual(
            [chunk[0][0] for chunk in partitions],
            [frame[0][0] for frame in ray.get(list(df_refs))],
            msg="Modin mixed up the partition order")

        self.assertSequenceEqual(
            part_node_ips,
            ray.get(list(ip_refs)),
            msg="Modin moved partitions to different IPs")
    except AssertionError as exc:
        print(f"Modin part of the test failed: {exc}")
        print("This is a stochastic test failure. Ignoring the rest "
              "of this test.")
        return

    # Create one ray actor per requested node
    actors = []
    for rank, actor_ip in enumerate(actor_node_ips):
        pinned_actor = RayXGBoostActor.options(
            resources={f"node:{actor_ip}": 0.1})
        actors.append(
            pinned_actor.remote(rank=rank, num_actors=len(actor_nodes)))

    # Calculate shards
    _, actor_to_parts = Modin.get_actor_shards(modin_df, actors)

    for actor_rank, part_ids in expected_actor_parts.items():
        assigned_refs = actor_to_parts[actor_rank]
        for position, part_id in enumerate(part_ids):
            assigned_df = ray.get(assigned_refs[position])
            expected_df = pd.DataFrame(partitions[part_id])

            self.assertTrue(
                assigned_df.equals(expected_df),
                msg=f"Assignment failed: Actor rank {actor_rank}, "
                f"partition {position} is not partition with ID {part_id}.")