def _update_engine(publisher: Publisher):
    global DEFAULT_NPARTITIONS, dask_client

    num_cpus = DEFAULT_NPARTITIONS
    if publisher.get() == "Ray":
        import ray

        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        if threading.current_thread().name == "MainThread" and _is_first_update.get(
            "Dask", True
        ):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")
        try:
            dask_client = get_client()
        except ValueError:
            from distributed import Client

            num_cpus = os.environ.get("MODIN_CPUS", None) or multiprocessing.cpu_count()
            dask_client = Client(n_workers=int(num_cpus))
    elif publisher.get() != "Python":
        raise ImportError("Unrecognized execution engine: {}.".format(publisher.get()))
    _is_first_update[publisher.get()] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
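# Hedged aside: the MODIN_CPUS lookup above means the Dask worker count can be
# pinned from the environment before Modin first initializes, e.g.:
import os

os.environ["MODIN_CPUS"] = "8"  # the fallback Client(...) above then gets n_workers=8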
def initialize_dask():
    from distributed.client import get_client

    try:
        get_client()
    except ValueError:
        from distributed import Client

        # The indentation here is intentional, we want the code to be indented.
        ErrorMessage.not_initialized(
            "Dask",
            """
    from distributed import Client

    client = Client()
""",
        )
        Client(n_workers=CpuCount.get())
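# Hedged usage sketch: starting a Dask client yourself before Modin runs
# initialize_dask(), so the get_client() call above finds it instead of
# falling through to the warning/fallback path; n_workers=4 is arbitrary.
from distributed import Client

client = Client(n_workers=4)
import modin.pandas as pd  # Modin now reuses the already-running client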
def put(cls, obj):
    """A factory classmethod to format a given object.

    Args:
        obj: An object.

    Returns:
        A `RemotePartitions` object.
    """
    client = get_client()
    return cls(client.scatter(obj, hash=False))
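# Hypothetical usage sketch of the factory above: `PandasOnDaskFramePartition`
# stands in for the concrete `cls`, and a distributed.Client must already be
# running for get_client() to succeed.
import pandas

part = PandasOnDaskFramePartition.put(pandas.DataFrame({"a": [1, 2, 3]}))
# hash=False scatters under a fresh key on every call, so equal objects are
# not deduplicated into one shared future.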
def apply(self, func, **kwargs):
    """Apply some callable function to the data in this partition.

    Note: It is up to the implementation how kwargs are handled. They are
        an important part of many implementations. As of right now, they
        are not serialized.

    Args:
        func: The lambda to apply (may already be correctly formatted)

    Returns:
        A new `BaseFramePartition` containing the object that has had `func`
        applied to it.
    """
    func = pkl.dumps(func)
    call_queue = self.call_queue + [[func, kwargs]]
    future = get_client().submit(
        apply_list_of_funcs, call_queue, self.future, pure=False
    )
    futures = [get_client().submit(lambda l: l[i], future) for i in range(2)]
    return PandasOnDaskFramePartition(futures[0], ip=futures[1])
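# Hedged caller-side sketch: apply() pickles the callable itself, so callers
# pass a plain function. The two indexing tasks reflect apply_list_of_funcs
# returning a (result, worker_ip) pair, per the `ip=futures[1]` above.
new_part = part.apply(lambda df: df * 2)  # `part` is a hypothetical partition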
def extract_ddf_partitions(ddf):
    """ Returns the mapping: worker -> [list of futures]"""
    client = get_client()

    delayed_ddf = ddf.to_delayed()
    parts = client.compute(delayed_ddf)
    wait(parts)

    key_to_part = dict([(str(part.key), part) for part in parts])
    ret = defaultdict(list)  # Map worker -> [list of futures]
    for key, workers in client.who_has(parts).items():
        worker = first(
            workers
        )  # If multiple workers have the part, we pick the first worker
        ret[worker].append(key_to_part[key])
    return ret
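# Usage sketch, assuming a running distributed.Client in scope:
import pandas
import dask.dataframe as dd

ddf = dd.from_pandas(pandas.DataFrame({"x": range(100)}), npartitions=8)
for worker, futures in extract_ddf_partitions(ddf).items():
    print(worker, len(futures))  # this worker's locally-held partition futures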
def test_from_partitions(axis):
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df1, df2 = pandas.DataFrame(data), pandas.DataFrame(data)
    expected_df = pandas.concat([df1, df2], axis=1 if axis is None else axis)
    if Engine.get() == "Ray":
        if axis is None:
            futures = [[ray.put(df1), ray.put(df2)]]
        else:
            futures = [ray.put(df1), ray.put(df2)]
    if Engine.get() == "Dask":
        client = get_client()
        if axis is None:
            futures = [client.scatter([df1, df2], hash=False)]
        else:
            futures = client.scatter([df1, df2], hash=False)
    actual_df = from_partitions(futures, axis)
    df_equals(expected_df, actual_df)
def deploy_axis_func(
    cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions
):
    client = get_client()
    axis_result = client.submit(
        PandasFrameAxisPartition.deploy_axis_func,
        axis,
        func,
        num_splits,
        kwargs,
        maintain_partitioning,
        *partitions,
        pure=False,
    )
    if num_splits == 1:
        return axis_result
    # We have to do this to split it back up. It is already split, but we need to
    # get futures for each.
    return [
        client.submit(lambda l: l[i], axis_result, pure=False)
        for i in range(num_splits)
    ]
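# Standalone sketch of the fan-out idiom above: one task returns a list, then a
# cheap indexing task per element turns it into per-element Futures. Binding
# i=i as a default argument avoids late-binding surprises, and pure=False stops
# Dask from collapsing the otherwise-identical indexing tasks into one key.
from distributed import Client

client = Client(processes=False)
whole = client.submit(lambda: [10, 20, 30], pure=False)
pieces = [client.submit(lambda l, i=i: l[i], whole, pure=False) for i in range(3)]
print(client.gather(pieces))  # [10, 20, 30]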
def initialize_dask():
    from distributed.client import get_client

    try:
        client = get_client()
    except ValueError:
        from distributed import Client

        # The indentation here is intentional, we want the code to be indented.
        ErrorMessage.not_initialized(
            "Dask",
            """
    from distributed import Client

    client = Client()
""",
        )
        client = Client(n_workers=CpuCount.get())

    num_cpus = len(client.ncores())
    NPartitions.put_if_default(num_cpus)
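# Note, as a hedged aside: client.ncores() maps worker address -> thread count,
# so len(client.ncores()) above counts *workers*, not total threads.
from distributed import Client

client = Client(n_workers=4, threads_per_worker=2)
print(len(client.ncores()))           # 4 workers
print(sum(client.ncores().values()))  # 8 threads in total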
def deploy_func_between_two_axis_partitions(
    cls, axis, func, num_splits, len_of_left, other_shape, kwargs, *partitions
):
    client = get_client()
    axis_result = client.submit(
        PandasFrameAxisPartition.deploy_func_between_two_axis_partitions,
        axis,
        func,
        num_splits,
        len_of_left,
        other_shape,
        kwargs,
        *partitions,
        pure=False,
    )
    # We have to do this to split it back up. It is already split, but we need to
    # get futures for each.
    return [
        client.submit(lambda l: l[i], axis_result, pure=False)
        for i in range(num_splits)
    ]
def sample(
    self,
    label: str | None = None,
    *,
    client: Client | None = None,
    measure: str = "process",
    interval: float = 0.5,
):
    """Context manager that records memory usage in the cluster.

    This is synchronous if the client is synchronous and
    asynchronous if the client is asynchronous.

    The samples are recorded in ``self.samples[<label>]``.

    Parameters
    ==========
    label: str, optional
        Tag to record the samples under in the self.samples dict.
        Default: automatically generate a random label
    client: Client, optional
        client used to connect to the scheduler.
        Default: use the global client
    measure: str, optional
        One of the measures from :class:`distributed.scheduler.MemoryState`.
        Default: sample process memory
    interval: float, optional
        sampling interval, in seconds.
        Default: 0.5
    """
    if not client:
        from distributed.client import get_client

        client = get_client()

    if client.asynchronous:
        return self._sample_async(label, client, measure, interval)
    else:
        return self._sample_sync(label, client, measure, interval)
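# Usage sketch following distributed's MemorySampler documentation (the import
# path assumes a recent distributed release; the workload is a stand-in):
from distributed import Client
from distributed.diagnostics import MemorySampler

client = Client()
ms = MemorySampler()
with ms.sample("run 1"):
    client.submit(sum, range(10_000_000)).result()
ms.plot()  # requires pandas/matplotlib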
def deploy_axis_func(
    cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions
):
    client = get_client()
    axis_result = client.submit(
        PandasFrameAxisPartition.deploy_axis_func,
        axis,
        func,
        num_splits,
        kwargs,
        maintain_partitioning,
        *partitions,
        pure=False,
    )
    lengths = kwargs.get("_lengths", None)
    result_num_splits = len(lengths) if lengths else num_splits
    # We have to do this to split it back up. It is already split, but we need to
    # get futures for each.
    return [
        client.submit(lambda l: l[i], axis_result, pure=False)
        for i in range(result_num_splits)
    ]
def _update_engine(publisher: Parameter):
    global DEFAULT_NPARTITIONS, dask_client, num_cpus
    from modin.config import Backend, CpuCount

    if publisher.get() == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        # With OmniSci backend there is only a single worker per node
        # and we allow it to work on all cores.
        if Backend.get() == "Omnisci":
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        if threading.current_thread().name == "MainThread" and _is_first_update.get(
            "Dask", True
        ):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")
        try:
            dask_client = get_client()
        except ValueError:
            from distributed import Client

            dask_client = Client(n_workers=CpuCount.get())
    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        remote_ray = conn.modules["ray"]
        if _is_first_update.get("Cloudray", True):

            @conn.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.factories.dispatcher  # noqa: F401
        else:
            get_connection().modules["modin"].set_backends("Ray", Backend.get())
        num_cpus = remote_ray.cluster_resources()["CPU"]
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")
    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(publisher.get()))
    _is_first_update[publisher.get()] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
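# Hedged sketch: these branches fire via Modin's config subscriptions; picking
# an engine up front looks roughly like this (the MODIN_ENGINE / MODIN_BACKEND
# environment variables achieve the same effect):
import modin.config as cfg

cfg.Engine.put("Dask")  # the first DataFrame operation then runs the Dask branch
import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})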
    move_stdlib_ahead_of_site_packages)

if execution_engine == "Ray":
    import ray

    initialize_ray()
    num_cpus = ray.cluster_resources()["CPU"]
elif execution_engine == "Dask":  # pragma: no cover
    from distributed.client import get_client
    import warnings

    if threading.current_thread().name == "MainThread":
        warnings.warn("The Dask Engine for Modin is experimental.")
    try:
        client = get_client()
    except ValueError:
        from distributed import Client
        import multiprocessing

        num_cpus = multiprocessing.cpu_count()
        client = Client(n_workers=num_cpus)
elif execution_engine != "Python":
    raise ImportError("Unrecognized execution engine: {}.".format(execution_engine))

DEFAULT_NPARTITIONS = max(4, int(num_cpus))

__all__ = [
    "DataFrame",
    "Series",
def _update_engine(publisher: Publisher):
    global DEFAULT_NPARTITIONS, dask_client, num_cpus

    if publisher.get() == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        if threading.current_thread().name == "MainThread" and _is_first_update.get(
            "Dask", True
        ):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")
        try:
            dask_client = get_client()
        except ValueError:
            from distributed import Client

            num_cpus = (
                os.environ.get("MODIN_CPUS", None) or multiprocessing.cpu_count()
            )
            dask_client = Client(n_workers=int(num_cpus))
    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection
        import rpyc

        conn: rpyc.ClassicService = get_connection()
        remote_ray = conn.modules["ray"]
        if _is_first_update.get("Cloudray", True):

            @conn.teleport
            def init_remote_ray():
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray")
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray()
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.dispatcher  # noqa: F401

        num_cpus = remote_ray.cluster_resources()["CPU"]
    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(publisher.get()))
    _is_first_update[publisher.get()] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))