Python unwrap_partitions 예제들, modin.distributed.dataframe.pandas.unwrap_partitions Python 예제들

예제 #1

0

파일 보기

파일: test_partition_api.py 프로젝트: prutskov/modin

def test_unwrap_partitions(axis):
    """Check that ``unwrap_partitions`` exposes the frame's partitions.

    With ``axis=None`` each block partition of the frame is compared with
    the corresponding unwrapped object (``.oid`` on Ray, ``.future`` on
    Dask). With a concrete ``axis`` the materialized axis partitions are
    compared with the unwrapped ones via ``df_equals``.

    Parameters
    ----------
    axis : None, 0 or 1
        Axis passed through to ``unwrap_partitions``.
    """
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df = pd.DataFrame(data)

    if axis is None:
        expected_partitions = df._query_compiler._modin_frame._partitions
        actual_partitions = np.array(unwrap_partitions(df, axis=axis))
        # Both dimensions must be compared expected-vs-actual; comparing
        # ``expected`` with itself would make the column check vacuous.
        assert (expected_partitions.shape[0] == actual_partitions.shape[0] and
                expected_partitions.shape[1] == actual_partitions.shape[1])
        for row_idx in range(expected_partitions.shape[0]):
            for col_idx in range(expected_partitions.shape[1]):
                if Engine.get() == "Ray":
                    assert (expected_partitions[row_idx][col_idx].oid ==
                            actual_partitions[row_idx][col_idx])
                if Engine.get() == "Dask":
                    assert (expected_partitions[row_idx][col_idx].future ==
                            actual_partitions[row_idx][col_idx])
    else:
        expected_axis_partitions = (
            df._query_compiler._modin_frame._partition_mgr_cls.axis_partition(
                df._query_compiler._modin_frame._partitions, axis ^ 1))
        expected_axis_partitions = [
            axis_partition.force_materialization().unwrap(squeeze=True)
            for axis_partition in expected_axis_partitions
        ]
        actual_axis_partitions = unwrap_partitions(df, axis=axis)
        assert len(expected_axis_partitions) == len(actual_axis_partitions)
        # Lengths were asserted equal above, so zip pairs every partition.
        for expected_part, actual_part in zip(expected_axis_partitions,
                                              actual_axis_partitions):
            if Engine.get() in ["Ray", "Dask"]:
                df_equals(
                    get_func(expected_part),
                    get_func(actual_part),
                )

예제 #2

0

파일 보기

    def __init__(self, data, label):
        """Validate the inputs and store their unwrapped row partitions.

        Parameters
        ----------
        data : pd.DataFrame
            Feature frame; unwrapped along axis 0 with ``get_ip=True``.
        label : pd.DataFrame or pd.Series
            Labels; unwrapped along axis 0.

        Raises
        ------
        AssertionError
            If ``data`` or ``label`` has an unexpected type.
        """
        assert isinstance(
            data, pd.DataFrame
        ), f"Type of `data` is {type(data)}, but expected {pd.DataFrame}."
        # The message must name `label`, the argument actually being checked.
        assert isinstance(
            label, (pd.DataFrame, pd.Series)
        ), f"Type of `label` is {type(label)}, but expected {pd.DataFrame} or {pd.Series}."

        self.data = unwrap_partitions(data, axis=0, get_ip=True)
        self.label = unwrap_partitions(label, axis=0)

예제 #3

0

파일 보기

파일: xgboost.py 프로젝트: prutskov/modin

    def __init__(
        self,
        data,
        label=None,
        missing=None,
        silent=False,
        feature_names=None,
        feature_types=None,
        feature_weights=None,
        enable_categorical=None,
    ):
        assert isinstance(
            data, pd.DataFrame
        ), f"Type of `data` is {type(data)}, but expected {pd.DataFrame}."

        if label is not None:
            assert isinstance(
                label, (pd.DataFrame, pd.Series)
            ), f"Type of `data` is {type(label)}, but expected {pd.DataFrame} or {pd.Series}."
            self.label = unwrap_partitions(label, axis=0)
        else:
            self.label = None

        self.data = unwrap_partitions(data, axis=0, get_ip=True)

        self._n_rows = data.shape[0]
        self._n_cols = data.shape[1]

        for i, dtype in enumerate(data.dtypes):
            if dtype == "object":
                raise ValueError(f"Column {i} has unsupported data type {dtype}.")

        self.feature_names = feature_names
        self.feature_types = feature_types

        self.missing = missing
        self.silent = silent
        self.feature_weights = feature_weights
        self.enable_categorical = enable_categorical

        self.metadata = (
            data.index,
            data.columns,
            data._query_compiler._modin_frame._row_lengths,
        )

예제 #4

0

파일 보기

    def __init__(self, data, label=None):
        """Validate the inputs and store partitions plus frame metadata.

        Parameters
        ----------
        data : pd.DataFrame
            Feature frame; unwrapped along axis 0 with ``get_ip=True``.
        label : pd.DataFrame or pd.Series, optional
            Labels; unwrapped along axis 0 when given, else stored as None.

        Raises
        ------
        AssertionError
            If ``data`` or ``label`` has an unexpected type.
        """
        assert isinstance(
            data, pd.DataFrame
        ), f"Type of `data` is {type(data)}, but expected {pd.DataFrame}."

        if label is not None:
            # The message must name `label`, the argument being checked.
            assert isinstance(
                label, (pd.DataFrame, pd.Series)
            ), f"Type of `label` is {type(label)}, but expected {pd.DataFrame} or {pd.Series}."
            self.label = unwrap_partitions(label, axis=0)
        else:
            self.label = None

        self.data = unwrap_partitions(data, axis=0, get_ip=True)

        # Index, columns and per-partition row lengths of the source frame.
        self.metadata = (
            data.index,
            data.columns,
            data._query_compiler._modin_frame._row_lengths,
        )

예제 #5

0

파일 보기

파일: xgboost_ray.py 프로젝트: yangl235/modin

def _predict(
        booster,
        data,
        nthread: Optional[int] = cpu_count(),
        **kwargs,
):
    """Run distributed prediction and return the result as one frame.

    The feature frame (first element of ``data``) is unwrapped into row
    partitions, the partitions are handed to remote actors, each actor is
    asked to predict, and the per-partition outputs are reassembled in the
    original partition order via ``from_partitions``.

    Parameters
    ----------
    booster
        Model forwarded to every actor's ``predict`` call.
    data : tuple
        Two-element tuple; only the first element is used.
    nthread : int, optional
        Forwarded to ``create_actors``.
    **kwargs
        Forwarded to every actor's ``predict`` call.
    """
    s = time.time()

    X, _ = data
    X_row_parts = unwrap_partitions(X, axis=0, get_ip=True)

    # Create remote actors
    actors = create_actors(nthread=nthread)

    assert len(actors) <= len(
        X_row_parts
    ), f"{len(X_row_parts)} row partitions couldn't be distributed between {len(actors)} nodes."

    def _push_predict_data(actor, *X):
        return actor.set_predict_data.remote(*X)

    # Split data across workers
    order_of_parts = _split_data_across_actors(
        actors,
        _push_predict_data,
        X_row_parts,
    )

    LOGGER.info(f"Data preparation time: {time.time() - s} s")
    s = time.time()

    # Predict: one remote call per actor, always stored as a list of refs.
    predictions = []
    for ip, actor in actors.items():
        num_parts = len(order_of_parts[ip])
        refs = actor.predict._remote(args=(booster, ),
                                     kwargs=kwargs,
                                     num_returns=num_parts)
        # With more than one return the call result is used as-is;
        # otherwise it is wrapped so it can be zipped below.
        predictions.append(refs if num_parts > 1 else [refs])

    # Pair every prediction ref with its original partition position.
    results_to_sort = []
    for ip, part_res in zip(actors, predictions):
        results_to_sort.extend(zip(part_res, order_of_parts[ip]))

    ordered = sorted(results_to_sort, key=lambda pair: pair[1])
    results = [part_res for part_res, _ in ordered]

    result = from_partitions(results, 0).reset_index(drop=True)
    LOGGER.info(f"Prediction time: {time.time() - s} s")

    return result

예제 #6

0

파일 보기

def test_from_partitions_mismatched_labels(axis, index, columns):
    """Round-trip a frame through unwrap/from_partitions with given labels.

    ``index`` / ``columns`` select either the frame's original labels
    (``"original_idx"`` / ``"original_col"``) or freshly generated string
    labels; the rebuilt frame must equal the relabelled original.
    """
    shape = (2**16, 2**8)
    expected_df = pd.DataFrame(np.random.randint(0, 100, size=shape))
    partitions = unwrap_partitions(expected_df, axis=axis)

    if index == "original_idx":
        index = expected_df.index
    else:
        index = [f"row{i}" for i in expected_df.index]

    if columns == "original_col":
        columns = expected_df.columns
    else:
        columns = [f"col{i}" for i in expected_df.columns]

    expected_df.index = index
    expected_df.columns = columns

    actual_df = from_partitions(
        partitions, axis=axis, index=index, columns=columns)
    df_equals(expected_df, actual_df)

예제 #7

0

파일 보기

파일: modin.py 프로젝트: ray-project/xgboost_ray

    def get_actor_shards(
            data: Any,  # modin.pandas.DataFrame
            actors: Sequence[ActorHandle]) -> \
            Tuple[Any, Optional[Dict[int, Any]]]:
        """Group the frame's row partitions by IP and assign them to actors.

        Returns a tuple whose first element is always ``None`` and whose
        second element is the result of ``assign_partitions_to_actors``.
        """
        _assert_modin_installed()

        from modin.distributed.dataframe.pandas import unwrap_partitions

        rank_to_ip = get_actor_rank_ips(actors)

        # Each unwrapped item is an (IP reference, partition reference) pair.
        ip_refs, partition_refs = zip(
            *unwrap_partitions(data, axis=0, get_ip=True))
        resolved_ips = ray.get(list(ip_refs))

        # IP -> partitions reported for that IP
        parts_by_ip = defaultdict(list)
        for node_ip, partition_ref in zip(resolved_ips, partition_refs):
            parts_by_ip[node_ip].append(partition_ref)

        # Modin dataframes are not serializable, so the first return
        # value is None.
        shards = assign_partitions_to_actors(parts_by_ip, rank_to_ip)
        return None, shards

예제 #8

0

파일 보기

def _predict(
    booster,
    data,
    nthread: Optional[int] = cpu_count(),
    evenly_data_distribution: Optional[bool] = True,
    **kwargs,
):
    """Run distributed prediction and return the concatenated output.

    The feature frame (first element of ``data``) is unwrapped into row
    partitions, the partitions are handed to remote actors, every actor is
    asked to predict, and the fetched results are joined with
    ``np.concatenate``.

    Parameters
    ----------
    booster
        Model forwarded to every actor's ``predict`` call.
    data : tuple
        Two-element tuple; only the first element is used.
    nthread : int, optional
        Forwarded to ``create_actors``.
    evenly_data_distribution : bool, optional
        Forwarded to ``_split_data_across_actors``; its negation is passed
        to ``unwrap_partitions`` as ``bind_ip``.
    **kwargs
        Forwarded to every actor's ``predict`` call.
    """
    s = time.time()

    X, _ = data
    bind_ip = not evenly_data_distribution
    X_row_parts = unwrap_partitions(X, axis=0, bind_ip=bind_ip)

    # Create remote actors
    actors = create_actors(nthread=nthread)

    def _push_predict_data(actor, *X):
        return actor.set_predict_data.remote(*X)

    # Split data across workers
    _split_data_across_actors(
        actors,
        _push_predict_data,
        X_row_parts,
        evenly_data_distribution=evenly_data_distribution,
    )

    LOGGER.info(f"Data preparation time: {time.time() - s} s")
    s = time.time()

    # Predict: launch every actor first, then block on all results at once.
    pending = []
    for _, actor in actors.items():
        pending.append(actor.predict.remote(booster, **kwargs))
    result = ray.get(pending)
    LOGGER.info(f"Prediction time: {time.time() - s} s")

    return np.concatenate(result)

예제 #9

0

파일 보기

파일: modin.py 프로젝트: ijrsvt/xgboost_ray

def assign_partitions_to_actors(data: Any, actor_rank_ips: Dict[int, str]) \
        -> Dict[int, Sequence[ObjectRef]]:
    """Assign partitions from a Modin dataframe to actors.

    This function collects the Modin partitions and evenly distributes
    them to actors, trying to minimize data transfer by respecting
    co-locality.

    This function currently does _not_ take partition sizes into account
    for distributing data. It assumes that all partitions have (more or less)
    the same length.

    Instead, partitions are evenly distributed. E.g. for 8 partitions and 3
    actors, each actor gets assigned 2 or 3 partitions. Which partitions are
    assigned depends on the data locality.

    The algorithm is as follows: For any number of data partitions, get the
    Ray object references to the shards and the IP addresses where they
    currently live.

    Calculate the minimum and maximum amount of partitions per actor. These
    numbers should differ by at most 1. Also calculate how many actors will
    get more partitions assigned than the other actors.

    First, each actor gets assigned up to ``max_parts_per_actor`` co-located
    partitions. Only up to ``num_actors_with_max_parts`` actors get the
    maximum number of partitions, the rest try to fill the minimum.

    The rest of the partitions (all of which cannot be assigned to a
    co-located actor) are assigned to actors until there are none left.

    Args:
        data: Object passed to ``unwrap_partitions(..., axis=0,
            get_ip=True)``; each unwrapped item is unpacked as an
            (IP reference, partition reference) pair.
        actor_rank_ips: Mapping from actor rank to the IP address of that
            actor.

    Returns:
        A ``defaultdict(list)`` mapping actor rank to the list of partition
        references assigned to that actor.

    Raises:
        RuntimeError: If partitions remain after the second pass because no
            actor has capacity left.
    """
    from modin.distributed.dataframe.pandas import unwrap_partitions

    unwrapped = unwrap_partitions(data, axis=0, get_ip=True)

    # Transpose the (ip, partition) pairs into two parallel tuples.
    ip_objs, part_objs = zip(*unwrapped)

    # Build a table mapping from IP to list of partitions
    ip_to_parts = defaultdict(list)
    for ip, part_obj in zip(ray.get(list(ip_objs)), part_objs):
        ip_to_parts[ip].append(part_obj)

    num_partitions = len(part_objs)
    num_actors = len(actor_rank_ips)
    # NOTE(review): both divisions raise ZeroDivisionError when
    # `actor_rank_ips` is empty.
    min_parts_per_actor = max(0, math.floor(num_partitions / num_actors))
    max_parts_per_actor = max(1, math.ceil(num_partitions / num_actors))
    # Number of actors allowed to go past `min_parts_per_actor`; this counter
    # is shared by, and decremented in, both assignment passes below.
    num_actors_with_max_parts = num_partitions % num_actors

    # This is our result dict that maps actor ranks to a list of partitions
    actor_to_partitions = defaultdict(list)

    # First we loop through the actors and assign them partitions from their
    # own IPs, one partition per actor per pass. Passes repeat until a full
    # pass over all actors assigns nothing.
    partition_assigned = True
    while partition_assigned:
        partition_assigned = False

        # Loop through each actor once, assigning at most one partition
        for rank, actor_ip in actor_rank_ips.items():
            num_parts_left_on_ip = len(ip_to_parts[actor_ip])
            num_actor_parts = len(actor_to_partitions[rank])

            if num_parts_left_on_ip > 0 and \
               num_actor_parts < max_parts_per_actor:
                if num_actor_parts >= min_parts_per_actor:
                    # Only allow up to `num_actors_with_max_parts` actors to
                    # have the maximum number of partitions assigned.
                    if num_actors_with_max_parts <= 0:
                        continue
                    num_actors_with_max_parts -= 1
                actor_to_partitions[rank].append(ip_to_parts[actor_ip].pop(0))
                partition_assigned = True

    # The rest of the partitions, no matter where they are located, could not
    # be assigned to co-located actors. Thus, we assign them
    # to actors who still need partitions.
    rest_parts = list(itertools.chain(*ip_to_parts.values()))
    partition_assigned = True
    while len(rest_parts) > 0 and partition_assigned:
        partition_assigned = False
        for rank in actor_rank_ips:
            num_actor_parts = len(actor_to_partitions[rank])
            if num_actor_parts < max_parts_per_actor:
                if num_actor_parts >= min_parts_per_actor:
                    # Same cap as in the first pass, using the same counter.
                    if num_actors_with_max_parts <= 0:
                        continue
                    num_actors_with_max_parts -= 1
                actor_to_partitions[rank].append(rest_parts.pop(0))
                partition_assigned = True
            # Stop mid-pass as soon as everything has been handed out.
            if len(rest_parts) <= 0:
                break

    # A pass that assigned nothing while partitions remain means no actor
    # could take more.
    if len(rest_parts) != 0:
        raise RuntimeError(
            f"There are still partitions left to assign, but no actor "
            f"has capacity for more. This is probably a bug. Please go "
            f"to https://github.com/ray-project/xgboost_ray to report it.")

    return actor_to_partitions

예제 #10

0

파일 보기

def _train(
        dtrain,
        nthread,
        evenly_data_distribution,
        params: Dict,
        *args,
        evals=(),
        **kwargs,
):
    """Distribute the training data to remote actors and train on them.

    Parameters
    ----------
    dtrain : tuple
        ``(X, y)`` pair of equal length; both are unwrapped along axis 0.
    nthread
        Forwarded to ``create_actors``.
    evenly_data_distribution : bool
        Forwarded to ``_split_data_across_actors``; its negation is passed
        to ``unwrap_partitions`` as ``bind_ip``.
    params : dict
        Forwarded to every actor's ``train`` call.
    *args, **kwargs
        Forwarded to every actor's ``train`` call.
    evals : sequence of ((X, y), method) pairs, optional
        Evaluation sets. An entry whose data *is* ``dtrain`` is not sent
        separately; its method is passed along with the training data as
        ``add_as_eval_method``. The sequence itself is never modified.

    Returns
    -------
    The result fetched from the first actor's ``train`` call.
    """
    s = time.time()

    X, y = dtrain
    assert len(X) == len(y)

    X_row_parts = unwrap_partitions(X,
                                    axis=0,
                                    bind_ip=not evenly_data_distribution)
    y_row_parts = unwrap_partitions(y,
                                    axis=0,
                                    bind_ip=not evenly_data_distribution)
    assert len(X_row_parts) == len(y_row_parts), "Unaligned train data"

    # Create remote actors
    actors = create_actors(nthread=nthread)

    # Separate the training set from the other evaluation sets using
    # identity only. Filtering into a new list avoids mutating the caller's
    # `evals`, works when `evals` is a tuple, and avoids the `==`
    # comparison between entries that `list.remove` would perform.
    add_as_eval_method = None
    remaining_evals = []
    for eval_data, method in (evals or ()):
        if eval_data is dtrain:
            add_as_eval_method = method
        else:
            remaining_evals.append((eval_data, method))

    for ((eval_X, eval_y), eval_method) in remaining_evals:
        # Split data across workers
        _split_data_across_actors(
            actors,
            lambda actor, *X_y: actor.add_eval_data.remote(
                *X_y, eval_method=eval_method),
            unwrap_partitions(eval_X,
                              axis=0,
                              bind_ip=not evenly_data_distribution),
            unwrap_partitions(eval_y,
                              axis=0,
                              bind_ip=not evenly_data_distribution),
            evenly_data_distribution=evenly_data_distribution,
        )

    # Split data across workers
    _split_data_across_actors(
        actors,
        lambda actor, *X_y: actor.set_train_data.remote(
            *X_y, add_as_eval_method=add_as_eval_method),
        X_row_parts,
        y_row_parts,
        evenly_data_distribution=evenly_data_distribution,
    )
    LOGGER.info(f"Data preparation time: {time.time() - s} s")

    s = time.time()
    with RabitContextManager(len(actors), get_node_ip_address()) as env:
        rabit_args = [("%s=%s" % item).encode() for item in env.items()]

        # Train
        fut = [
            actor.train.remote(rabit_args, params, *args, **kwargs)
            for _, actor in actors.items()
        ]

        # All results should be the same because of Rabit tracking. So we just
        # return the first one.
        result = ray.get(fut[0])
        LOGGER.info(f"Training time: {time.time() - s} s")
        return result

예제 #11

0

파일 보기

    def _testModinAssignment(self, part_nodes, actor_nodes,
                             expected_actor_parts):
        """Place partitions and actors on chosen nodes and check the shards.

        ``part_nodes`` and ``actor_nodes`` are lists of indices into the
        list of alive Ray node IPs. ``expected_actor_parts`` maps an actor
        rank to the partition ids it is expected to receive, in order.

        The method returns early without asserting when there are fewer
        alive nodes than the indices require, or when the sanity check on
        partition order / placement fails.
        """
        node_ips = [
            info["NodeManagerAddress"] for info in ray.nodes() if info["Alive"]
        ]
        required_nodes = max(max(actor_nodes), max(part_nodes)) + 1
        if len(node_ips) < required_nodes:
            print("Not running on cluster, skipping rest of this test.")
            return

        actor_node_ips = [node_ips[nid] for nid in actor_nodes]
        part_node_ips = [node_ips[nid] for nid in part_nodes]

        # Initialize data frames on remote nodes
        # This way we can control which partition is on which node
        @ray.remote(num_cpus=0.1)
        def create_remote_df(arr):
            return ray.put(pd.DataFrame(arr))

        partitions = np.array_split(self.x, len(part_nodes))

        # One creation task per partition, pinned to its target node.
        pending_dfs = []
        for pid, pip in enumerate(part_node_ips):
            pinned = create_remote_df.options(resources={f"node:{pip}": 0.1})
            pending_dfs.append(pinned.remote(partitions[pid]))
        node_dfs: Sequence[ObjectRef] = ray.get(pending_dfs)

        # Pair each partition reference with a reference to its node IP.
        node_ip_dfs = [(ray.put(part_ip), node_df)
                       for part_ip, node_df in zip(part_node_ips, node_dfs)]

        # Create modin dataframe from distributed partitions
        from modin.distributed.dataframe.pandas import (from_partitions,
                                                        unwrap_partitions)
        modin_df = from_partitions(node_ip_dfs, axis=0)

        # Sanity check
        ip_objs, df_objs = zip(
            *unwrap_partitions(modin_df, axis=0, get_ip=True))

        try:
            self.assertSequenceEqual(
                [part[0][0] for part in partitions],
                [part[0][0] for part in ray.get(list(df_objs))],
                msg="Modin mixed up the partition order")

            self.assertSequenceEqual(
                part_node_ips,
                ray.get(list(ip_objs)),
                msg="Modin moved partitions to different IPs")
        except AssertionError as exc:
            print(f"Modin part of the test failed: {exc}")
            print("This is a stochastic test failure. Ignoring the rest "
                  "of this test.")
            return

        # Create ray actors
        num_actors = len(actor_nodes)
        actors = []
        for rank, nip in enumerate(actor_node_ips):
            pinned_actor = RayXGBoostActor.options(
                resources={f"node:{nip}": 0.1})
            actors.append(
                pinned_actor.remote(rank=rank, num_actors=num_actors))

        # Calculate shards
        _, actor_to_parts = Modin.get_actor_shards(modin_df, actors)

        for actor_rank, part_ids in expected_actor_parts.items():
            assigned_refs = actor_to_parts[actor_rank]
            for i, part_id in enumerate(part_ids):
                actual = ray.get(assigned_refs[i])
                wanted = pd.DataFrame(partitions[part_id])

                self.assertTrue(
                    actual.equals(wanted),
                    msg=f"Assignment failed: Actor rank {actor_rank}, "
                    f"partition {i} is not partition with ID {part_id}.")