def simulate_cloud(request):
    """Optionally run the test session inside a simulated local "cloud" cluster.

    Reads the ``--simulate-cloud`` pytest option: ``off`` disables simulation
    entirely; ``normal``/``experimental`` spin up a local cluster, propagate the
    mode to the remote side, and patch Modin's test assertion helpers so they
    run through the connection. Any other value raises ``ValueError``.
    """
    mode = request.config.getoption("--simulate-cloud").lower()
    if mode == "off":
        # Simulation disabled: yield control straight back to the tests.
        yield
        return
    if mode not in ("normal", "experimental"):
        raise ValueError(f"Unsupported --simulate-cloud mode: {mode}")
    assert IsExperimental.get(), "Simulated cloud must be started in experimental mode"

    from modin.experimental.cloud import create_cluster, get_connection
    import modin.pandas.test.utils

    # Assertion helpers that must be patched to work across the connection.
    patched = [
        (modin.pandas.test.utils, helper_name)
        for helper_name in (
            "assert_index_equal",
            "assert_series_equal",
            "assert_frame_equal",
            "assert_extension_array_equal",
            "assert_empty_frame_equal",
        )
    ]
    with create_cluster("local", cluster_type="local"):
        get_connection().teleport(set_experimental_env)(mode)
        with Patcher(get_connection(), *patched):
            yield
def _update_engine(publisher: Parameter):
    """Initialize (or re-initialize) the execution engine named by ``publisher``.

    Dispatches on ``publisher.get()``:

    * ``"Ray"`` -- initialize Ray locally (once per process). With the OmniSci
      backend Modin keeps a single worker but lets OpenMP use all cores.
    * ``"Dask"`` -- initialize Dask locally (once per process).
    * ``"Cloudray"`` -- on the first call, teleport a Ray initializer to the
      remote cluster; afterwards only re-select the remote backends.
    * ``"Cloudpython"`` -- select the Python backend on the remote side.
    * anything else not listed in ``_NOINIT_ENGINES`` raises ``ImportError``.

    Side effects: may mutate ``CpuCount``, ``os.environ["OMP_NUM_THREADS"]``
    and the module-level ``_is_first_update`` bookkeeping dict.
    """
    global dask_client
    from modin.config import Backend, CpuCount

    if publisher.get() == "Ray":
        from modin.engines.ray.utils import initialize_ray

        # With OmniSci backend there is only a single worker per node
        # and we allow it to work on all cores.
        if Backend.get() == "Omnisci":
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        if _is_first_update.get("Ray", True):
            initialize_ray()
    elif publisher.get() == "Dask":
        if _is_first_update.get("Dask", True):
            from modin.engines.dask.utils import initialize_dask

            initialize_dask()
    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        if _is_first_update.get("Cloudray", True):

            @conn.teleport
            def init_remote_ray(partition):
                # Executed on the remote side: select the Ray backend with the
                # given partition format, then bring Ray up against the head
                # node's redis.
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # import FactoryDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.factories.dispatcher  # noqa: F401
        else:
            get_connection().modules["modin"].set_backends("Ray", Backend.get())
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")
    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(publisher.get()))

    # Remember that this engine's one-time setup has run.
    _is_first_update[publisher.get()] = False
def __update_engine(self, _):
    """Re-bind the wrapped numpy module to match the current engine.

    For remote engines the numpy module is taken from the cluster connection;
    otherwise the local default is restored.
    """
    if Engine.get() not in REMOTE_ENGINES:
        # Local engine: fall back to the default (local) numpy.
        self.__swap_numpy()
        return
    from modin.experimental.cloud import get_connection

    self.__swap_numpy(get_connection().modules["numpy"])
def __update_engine(self, _):
    """Re-bind the wrapped numpy module to match the current engine.

    Under the "Cloudray" engine the numpy module is fetched from the remote
    connection; any other engine restores the local default.
    """
    if execution_engine.get() != "Cloudray":
        # Not remote: restore the default (local) numpy.
        self.__swap_numpy()
        return
    from modin.experimental.cloud import get_connection

    remote_numpy = get_connection().modules["numpy"]
    self.__swap_numpy(remote_numpy)
def simulate_cloud(request):
    """Optionally run the test session inside a simulated local "cloud" cluster.

    Reads the ``--simulate-cloud`` pytest option: ``off`` disables simulation;
    ``normal``/``experimental`` start a local cluster, propagate the mode to
    the remote side, and patch the pandas comparison helpers so they work
    across the connection. Any other value raises ``ValueError``.
    """
    mode = request.config.getoption("--simulate-cloud").lower()
    if mode == "off":
        # Simulation disabled: yield control straight back to the tests.
        yield
        return
    if mode not in ("normal", "experimental"):
        raise ValueError(f"Unsupported --simulate-cloud mode: {mode}")
    assert IsExperimental.get(), "Simulated cloud must be started in experimental mode"

    from modin.experimental.cloud import create_cluster, get_connection
    import pandas._testing
    import pandas._libs.testing as cyx_testing

    # pandas helpers that must be patched to run through the connection.
    patch_targets = (
        (pandas._testing, "assert_class_equal"),
        (cyx_testing, "assert_almost_equal"),
    )
    with create_cluster("local", cluster_type="local"):
        get_connection().teleport(set_experimental_env)(mode)
        with Patcher(get_connection(), *patch_targets):
            yield
def simulate_cloud(request):
    """Optionally run the test session inside a simulated local "cloud" cluster.

    Reads the ``--simulate-cloud`` pytest option: ``off`` disables simulation;
    ``normal``/``experimental`` force experimental mode locally, start a local
    cluster, and mirror the requested mode into the remote environment. Any
    other value raises ``ValueError``.
    """
    mode = request.config.getoption("--simulate-cloud").lower()
    if mode == "off":
        # Simulation disabled: yield control straight back to the tests.
        yield
        return
    if mode not in ("normal", "experimental"):
        raise ValueError(f"Unsupported --simulate-cloud mode: {mode}")

    # The local process always runs in experimental mode while simulating.
    os.environ["MODIN_EXPERIMENTAL"] = "True"
    from modin.experimental.cloud import create_cluster, get_connection

    local_cluster = create_cluster("local", __spawner__="local")
    with local_cluster:

        def set_env(remote_mode):
            # Executed on the remote side: mirror the requested mode there.
            import os

            os.environ["MODIN_EXPERIMENTAL"] = (
                "True" if remote_mode == "experimental" else "False"
            )

        get_connection().teleport(set_env)(mode)
        yield
def prepare(cls):
    """Prepare the factory's IO class by wrapping the remote IO class.

    Builds a ``WrappedIO`` facade around the remote factory's ``io_cls`` so
    that ``read_*`` calls first deliver their arguments over the connection,
    and installs it as ``cls.io_cls``.
    """
    # query_compiler import is needed so remote PandasQueryCompiler
    # has an imported local counterpart;
    # if there isn't such counterpart rpyc generates some bogus
    # class type which raises TypeError()
    # upon checking its isinstance() or issubclass()
    import modin.backends.pandas.query_compiler  # noqa: F401
    from modin.experimental.cloud import get_connection

    # import a numpy overrider if it wasn't already imported
    import modin.experimental.pandas.numpy_wrap  # noqa: F401

    class WrappedIO:
        """Facade over the remote IO class that delivers args for read_* calls."""

        def __init__(self, conn, factory):
            self.__conn = conn
            # Resolve the factory's counterpart on the remote side and let it
            # prepare its own io_cls there.
            remote_factory = getattr(
                conn.modules[factory.__module__], factory.__name__
            )
            remote_factory.prepare()
            self.__io_cls = remote_factory.io_cls
            # Names of reader methods, taken from the local BaseIO contract.
            self.__reads = {
                name for name in BaseIO.__dict__ if name.startswith("read_")
            }
            # Cache of per-name wrapper functions built lazily in __getattr__.
            self.__wrappers = {}

        def __getattr__(self, name):
            if name in self.__reads:
                try:
                    wrap = self.__wrappers[name]
                except KeyError:
                    # Build (and cache) a wrapper that ships args/kwargs over
                    # the connection before invoking the remote reader.
                    # NOTE: _original is bound at definition time on purpose.
                    def wrap(*a, _original=getattr(self.__io_cls, name), **kw):
                        a, kw = self.__conn.deliver(a, kw)
                        return _original(*a, **kw)

                    self.__wrappers[name] = wrap
            else:
                # Non-reader attributes pass straight through to the remote IO class.
                wrap = getattr(self.__io_cls, name)
            return wrap

    cls.io_cls = WrappedIO(get_connection(), cls.wrapped_factory)
def _update_engine(publisher: Parameter):
    """Initialize the selected execution engine and refresh ``DEFAULT_NPARTITIONS``.

    Dispatches on ``publisher.get()``:

    * ``"Ray"`` -- initialize Ray locally (once) and read its CPU count.
      With the OmniSci backend Modin keeps one worker but lets OpenMP use all
      cores.
    * ``"Dask"`` -- reuse an existing Dask client or start a new one.
    * ``"Cloudray"`` -- on the first call, teleport a Ray initializer to the
      remote cluster; afterwards only re-select backends; CPU count is read
      from the remote Ray.
    * ``"Cloudpython"`` -- select the Python backend remotely.
    * anything else not in ``_NOINIT_ENGINES`` raises ``ImportError``.
    """
    global DEFAULT_NPARTITIONS, dask_client, num_cpus
    from modin.config import Backend, CpuCount

    if publisher.get() == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        # With OmniSci backend there is only a single worker per node
        # and we allow it to work on all cores.
        if Backend.get() == "Omnisci":
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        # Warn only once and only from the main thread.
        if threading.current_thread().name == "MainThread" and _is_first_update.get(
            "Dask", True
        ):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")
        try:
            # Reuse an already-running client in this process if there is one.
            dask_client = get_client()
        except ValueError:
            from distributed import Client

            dask_client = Client(n_workers=CpuCount.get())
        # NOTE(review): this branch never assigns num_cpus, so the final
        # DEFAULT_NPARTITIONS computation relies on the module-level value —
        # verify num_cpus has a sane module-level default.
    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        remote_ray = conn.modules["ray"]
        if _is_first_update.get("Cloudray", True):

            @conn.teleport
            def init_remote_ray(partition):
                # Executed on the remote side: select the Ray backend with the
                # given partition format and bring Ray up against the head
                # node's redis.
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.factories.dispatcher  # noqa: F401
        else:
            get_connection().modules["modin"].set_backends("Ray", Backend.get())
        num_cpus = remote_ray.cluster_resources()["CPU"]
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")
    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(publisher.get()))

    # Remember that this engine's one-time setup has run, then size the
    # default partitioning from the (possibly remote) CPU count, floor 4.
    _is_first_update[publisher.get()] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
def _update_engine(publisher: Publisher):
    """Initialize the selected execution engine and refresh ``DEFAULT_NPARTITIONS``.

    Dispatches on ``publisher.get()``:

    * ``"Ray"`` -- initialize Ray locally (once) and read its CPU count.
    * ``"Dask"`` -- reuse an existing Dask client or start a new one sized by
      ``MODIN_CPUS`` (falling back to the machine's CPU count).
    * ``"Cloudray"`` -- on the first call, teleport a Ray initializer to the
      remote cluster; CPU count is read from the remote Ray.
    * anything else not in ``_NOINIT_ENGINES`` raises ``ImportError``.
    """
    global DEFAULT_NPARTITIONS, dask_client, num_cpus

    if publisher.get() == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        # Warn only once and only from the main thread.
        if threading.current_thread().name == "MainThread" and _is_first_update.get(
            "Dask", True
        ):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")
        try:
            # Reuse an already-running client in this process if there is one.
            dask_client = get_client()
        except ValueError:
            from distributed import Client

            num_cpus = (
                os.environ.get("MODIN_CPUS", None) or multiprocessing.cpu_count()
            )
            dask_client = Client(n_workers=int(num_cpus))
        # NOTE(review): when get_client() succeeds, num_cpus is not assigned
        # here — the final DEFAULT_NPARTITIONS computation relies on the
        # module-level value; verify it has a sane default.
    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection
        import rpyc

        conn: rpyc.ClassicService = get_connection()
        remote_ray = conn.modules["ray"]
        if _is_first_update.get("Cloudray", True):

            @conn.teleport
            def init_remote_ray():
                # Executed on the remote side: select the Ray backend and
                # bring Ray up against the head node's redis.
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray")
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray()
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.dispatcher  # noqa: F401
        num_cpus = remote_ray.cluster_resources()["CPU"]
    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(publisher.get()))

    # Remember that this engine's one-time setup has run, then size the
    # default partitioning from the (possibly remote) CPU count, floor 4.
    _is_first_update[publisher.get()] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
"aws_credentials", cluster_name="rayscale-test", region="eu-north-1", zone="eu-north-1b", image="ami-00e1e82d7d4ca80d3", **cluster_params, ) with test_cluster: data_file = "https://modin-datasets.s3.amazonaws.com/trips_data.csv" if USE_OMNISCI: # Workaround for GH#2099 from modin.experimental.cloud import get_connection data_file, remote_data_file = "/tmp/trips_data.csv", data_file get_connection().modules["subprocess"].check_call( ["wget", remote_data_file, "-O", data_file] ) # Omniscripts check for files being present when given local file paths, # so replace "glob" there with a remote one import utils.utils utils.utils.glob = get_connection().modules["glob"] parameters = { "data_file": data_file, # "data_file": "s3://modin-datasets/trips_data.csv", "dfiles_num": 1, "validation": False, "no_ibis": True, "no_pandas": False,
import modin.experimental.pandas as pd # noqa: F401 from modin.experimental.cloud import create_cluster, get_connection from mortgage import run_benchmark from mortgage.mortgage_pandas import etl_pandas test_cluster = create_cluster( "aws", "aws_credentials", cluster_name="rayscale-test", region="eu-north-1", zone="eu-north-1b", image="ami-00e1e82d7d4ca80d3", ) with test_cluster: conn = get_connection() np = conn.modules["numpy"] etl_pandas.__globals__["np"] = np parameters = { "data_file": "https://modin-datasets.s3.amazonaws.com/mortgage", # "data_file": "s3://modin-datasets/mortgage", "dfiles_num": 1, "no_ml": True, "validation": False, "no_ibis": True, "no_pandas": False, "pandas_mode": "Modin_on_ray", "ray_tmpdir": "/tmp", "ray_memory": 1024 * 1024 * 1024, }
cluster_name="rayscale-test", region="eu-central-1", zone="eu-central-1b", image="ami-05f7491af5eef733a", **cluster_params, ) with test_cluster: if USE_OMNISCI: from modin.experimental.cloud import get_connection # We should move omniscripts trigger in remote conext # https://github.com/intel-ai/omniscripts/blob/7d4599bcacf51de876952c658048571d32275ac1/taxi/taxibench_pandas_ibis.py#L482 import modin.experimental.engines.omnisci_on_native.frame.omnisci_worker OmnisciServer = (get_connection().modules[ "modin.experimental.engines.omnisci_on_native.frame.omnisci_worker"] .OmnisciServer) modin.experimental.engines.omnisci_on_native.frame.omnisci_worker.OmnisciServer = ( OmnisciServer) # Omniscripts check for files being present when given local file paths, # so replace "glob" there with a remote one import utils.utils utils.utils.glob = get_connection().modules["glob"] parameters = { "data_file": "s3://modin-datasets/cloud/taxi/trips_xaa.csv", "dfiles_num": 1, "validation": False, "no_ibis": True,
def _update_engine(publisher: Parameter):
    """Initialize (or re-initialize) the execution engine named by ``publisher``.

    First reconciles engine and storage format when only one was configured
    explicitly: OmniSci storage implies the Native engine and vice versa, and
    either implies experimental mode. Then dispatches on ``publisher.get()``:

    * ``"Ray"``/``"Dask"`` -- one-time local initialization.
    * ``"Native"`` -- requires OmniSci storage; sets OMP_NUM_THREADS.
    * ``"Cloudray"`` -- teleport a Ray initializer to the remote cluster on
      first call; later calls only re-select the remote execution.
    * ``"Cloudpython"``/``"Cloudnative"`` -- select execution on the remote side.
    * anything else not in ``_NOINIT_ENGINES`` raises ``ImportError``.
    """
    global dask_client
    from modin.config import StorageFormat, CpuCount
    from modin.config.envvars import IsExperimental
    from modin.config.pubsub import ValueSource

    # Reconcile engine/storage defaults: explicit OmniSci storage pulls in the
    # Native engine, and an explicit Native engine pulls in OmniSci storage.
    if (
        StorageFormat.get() == "Omnisci"
        and publisher.get_value_source() == ValueSource.DEFAULT
    ):
        publisher.put("Native")
        IsExperimental.put(True)
    elif (
        publisher.get() == "Native"
        and StorageFormat.get_value_source() == ValueSource.DEFAULT
    ):
        StorageFormat.put("Omnisci")
        IsExperimental.put(True)
    elif publisher.get() == "Ray":
        if _is_first_update.get("Ray", True):
            from modin.core.execution.ray.common.utils import initialize_ray

            initialize_ray()
    elif publisher.get() == "Native":
        # With OmniSci storage format there is only a single worker per node
        # and we allow it to work on all cores.
        if StorageFormat.get() == "Omnisci":
            os.environ["OMP_NUM_THREADS"] = str(CpuCount.get())
        else:
            raise ValueError(
                f"Storage format should be 'Omnisci' with 'Native' engine, but provided {StorageFormat.get()}."
            )
    elif publisher.get() == "Dask":
        if _is_first_update.get("Dask", True):
            from modin.core.execution.dask.common.utils import initialize_dask

            initialize_dask()
    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        if _is_first_update.get("Cloudray", True):

            @conn.teleport
            def init_remote_ray(partition):
                # Executed on the remote side: select Ray execution with the
                # given storage format and bring Ray up against the head
                # node's redis.
                from ray import ray_constants
                import modin
                from modin.core.execution.ray.common.utils import initialize_ray

                modin.set_execution("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(StorageFormat.get())
            # import FactoryDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.core.execution.dispatching.factories.dispatcher  # noqa: F401
        else:
            get_connection().modules["modin"].set_execution("Ray", StorageFormat.get())
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_execution("Python")
    elif publisher.get() == "Cloudnative":
        from modin.experimental.cloud import get_connection

        assert (
            StorageFormat.get() == "Omnisci"
        ), f"Storage format should be 'Omnisci' with 'Cloudnative' engine, but provided {StorageFormat.get()}."
        get_connection().modules["modin"].set_execution("Native", "OmniSci")
    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(publisher.get()))

    # Remember that this engine's one-time setup has run.
    _is_first_update[publisher.get()] = False