def _update_engine(cls, _):
    factory_name = get_current_backend() + "Factory"
    try:
        cls.__engine = getattr(factories, factory_name)
    except AttributeError:
        if not IsExperimental.get():
            # allow missing factories in experimental mode only
            if hasattr(factories, "Experimental" + factory_name):
                msg = (
                    "{0} on {1} is only accessible through the experimental API.\nRun "
                    "`import modin.experimental.pandas as pd` to use {0} on {1}."
                )
            else:
                msg = (
                    "Cannot find a factory for partition '{}' and execution engine '{}'. "
                    "Potential reason might be incorrect environment variable value for "
                    f"{Backend.varname} or {Engine.varname}"
                )
            raise FactoryNotFoundError(msg.format(Backend.get(), Engine.get()))
        cls.__engine = StubFactory.set_failing_name(factory_name)
    else:
        cls.__engine.prepare()
def predict(
    self,
    data: DMatrix,
    nthread: Optional[int] = cpu_count(),
    **kwargs,
):
    """
    Run prediction with a trained booster.

    Parameters
    ----------
    data : DMatrix
        Input data used for prediction.
    nthread : int, default: number of threads on the master node
        Number of threads to use on each node.
    **kwargs : dict
        Other parameters are the same as for `xgboost.Booster.predict`.

    Returns
    -------
    ``modin.pandas.DataFrame``
        Modin DataFrame with prediction results.
    """
    LOGGER.info("Prediction started")

    if Engine.get() == "Ray":
        from .xgboost_ray import _predict
    else:
        raise ValueError("Current version supports only Ray engine.")

    assert isinstance(
        data, DMatrix
    ), f"Type of `data` is {type(data)}, but expected {DMatrix}."

    result = _predict(self.copy(), data, nthread, **kwargs)
    LOGGER.info("Prediction finished")

    return result
def predict(
    self,
    data: DMatrix,
    **kwargs,
):
    """
    Run distributed prediction with a trained booster.

    During execution it runs ``xgb.predict`` on each worker for a subset of
    `data` and creates a Modin DataFrame with prediction results.

    Parameters
    ----------
    data : modin.experimental.xgboost.DMatrix
        Input data used for prediction.
    **kwargs : dict
        Other parameters are the same as for ``xgboost.Booster.predict``.

    Returns
    -------
    modin.pandas.DataFrame
        Modin DataFrame with prediction results.
    """
    LOGGER.info("Prediction started")

    if Engine.get() == "Ray":
        from .xgboost_ray import _predict
    else:
        raise ValueError("Current version supports only Ray engine.")

    assert isinstance(
        data, DMatrix
    ), f"Type of `data` is {type(data)}, but expected {DMatrix}."

    result = _predict(self.copy(), data, **kwargs)
    LOGGER.info("Prediction finished")

    return result
def _update_factory(cls, _):
    """
    Update and prepare factory with a new one specified via Modin config.

    Parameters
    ----------
    _ : object
        This parameter serves a compatibility purpose; it does not affect the result.
    """
    factory_name = get_current_execution() + "Factory"
    try:
        cls.__factory = getattr(factories, factory_name)
    except AttributeError:
        if factory_name == "ExperimentalOmnisciOnRayFactory":
            msg = (
                "OmniSci storage format no longer needs Ray engine; "
                + "please specify MODIN_ENGINE='native'"
            )
            raise FactoryNotFoundError(msg)
        if not IsExperimental.get():
            # allow missing factories in experimental mode only
            if hasattr(factories, "Experimental" + factory_name):
                msg = (
                    "{0} on {1} is only accessible through the experimental API.\nRun "
                    + "`import modin.experimental.pandas as pd` to use {0} on {1}."
                )
            else:
                msg = (
                    "Cannot find a factory for partition '{}' and execution engine '{}'. "
                    + "Potential reason might be incorrect environment variable value for "
                    + f"{StorageFormat.varname} or {Engine.varname}"
                )
            raise FactoryNotFoundError(msg.format(StorageFormat.get(), Engine.get()))
        cls.__factory = StubFactory.set_failing_name(factory_name)
    else:
        cls.__factory.prepare()
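
# --- Illustration (not Modin source) -----------------------------------------------
# A minimal sketch of the publish/subscribe pattern that `Engine.subscribe(...)` and
# `_update_factory` above rely on: a config parameter keeps a list of callbacks and
# re-invokes them whenever its value changes via `put`. All names here
# (`SimpleParameter`, `on_change`) are hypothetical and exist only for the sketch.
class SimpleParameter:
    def __init__(self, value):
        self._value = value
        self._subscribers = []

    def subscribe(self, callback):
        # remember the callback and call it once with the current state
        self._subscribers.append(callback)
        callback(self)

    def get(self):
        return self._value

    def put(self, value):
        # update the value and notify every subscriber, mirroring how changing
        # the engine/storage-format config triggers a factory re-selection
        self._value = value
        for callback in self._subscribers:
            callback(self)


def on_change(param):
    print(f"(re)preparing factory for: {param.get()}")


engine = SimpleParameter("Ray")
engine.subscribe(on_change)  # fires immediately with "Ray"
engine.put("Dask")           # fires again with "Dask"
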
def train(
    params: Dict,
    dtrain: ModinDMatrix,
    *args,
    evals=(),
    nthread: Optional[int] = cpu_count(),
    evenly_data_distribution: Optional[bool] = True,
    **kwargs,
):
    """
    Train XGBoost model.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : ModinDMatrix
        Data to be trained against.
    evals : list of pairs (ModinDMatrix, str)
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics will help us track the performance of the model.
    nthread : int
        Number of threads to use on each node. By default it is equal to the
        number of threads on the master node.
    evenly_data_distribution : bool, default: True
        Whether to distribute partitions evenly between nodes or not. If `False`,
        data transfer between nodes is minimized, but the data may not be evenly
        distributed.
    **kwargs : dict
        Other parameters are the same as for `xgboost.train` except for
        `evals_result`, which is returned as part of the function return value
        instead of being passed as an argument.

    Returns
    -------
    dict
        A dictionary containing the trained booster and evaluation history.
        The `history` field is the same as `eval_result` from `xgboost.train`.

        .. code-block:: python

            {'booster': xgboost.Booster,
             'history': {'train': {'logloss': ['0.48253', '0.35953']},
                         'eval': {'logloss': ['0.480385', '0.357756']}}}
    """
    LOGGER.info("Training started")

    if Engine.get() == "Ray":
        from .xgboost_ray import _train
    else:
        raise ValueError("Current version supports only Ray engine.")

    result = _train(
        dtrain, nthread, evenly_data_distribution, params, *args, evals=evals, **kwargs
    )
    LOGGER.info("Training finished")

    return result
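
# --- Usage sketch (not Modin source) -------------------------------------------------
# A hedged example of calling the `train` signature above, assuming the older
# `modin.experimental.xgboost` API with `ModinDMatrix` and a Ray engine already
# running. Column names, labels, and booster params are made up for illustration.
import modin.pandas as mpd
import modin.experimental.xgboost as xgb

X = mpd.DataFrame({"f0": range(100), "f1": range(100)})
y = mpd.DataFrame({"label": [i % 2 for i in range(100)]})

dtrain = xgb.ModinDMatrix(X, y)
result = xgb.train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    dtrain,
    num_boost_round=10,
    evals=[(dtrain, "train")],
)
booster = result["booster"]  # a plain xgboost.Booster
history = result["history"]  # e.g. {"train": {"logloss": [...]}}
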
RAND_HIGH = 100
random_state = np.random.RandomState(seed=42)

try:
    from modin.config import NPartitions

    NPARTITIONS = NPartitions.get()
except ImportError:
    NPARTITIONS = pd.DEFAULT_NPARTITIONS

try:
    from modin.config import TestDatasetSize, AsvImplementation, Engine

    ASV_USE_IMPL = AsvImplementation.get()
    ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
    ASV_USE_ENGINE = Engine.get()
except ImportError:
    # The same benchmarking code can be run for different versions of Modin, so in
    # case of an error importing these config variables, we'll just use predefined values
    ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
    ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")
    ASV_USE_ENGINE = os.environ.get("MODIN_ENGINE", "Ray")

ASV_USE_IMPL = ASV_USE_IMPL.lower()
ASV_DATASET_SIZE = ASV_DATASET_SIZE.lower()
ASV_USE_ENGINE = ASV_USE_ENGINE.lower()

assert ASV_USE_IMPL in ("modin", "pandas")
assert ASV_DATASET_SIZE in ("big", "small")
assert ASV_USE_ENGINE in ("ray", "dask", "python")
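
# --- Illustration (not part of the original file) ------------------------------------
# A minimal sketch of how the lower-cased ASV flags above might be consumed. The
# `IMPL` mapping and `gen_data` helper are hypothetical, but they show the intent of
# `ASV_USE_IMPL`: the same benchmark body runs against either Modin or pandas.
import pandas
import modin.pandas

IMPL = {
    "modin": modin.pandas,
    "pandas": pandas,
}


def gen_data(nrows, ncols):
    # reuse the module-level RNG and upper bound defined above
    data = random_state.randint(0, RAND_HIGH, size=(nrows, ncols))
    return IMPL[ASV_USE_IMPL].DataFrame(data)
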
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    partition_column: Optional[str] = None,
    lower_bound: Optional[int] = None,
    upper_bound: Optional[int] = None,
    max_sessions: Optional[int] = None,
) -> DataFrame:
    """
    General documentation is available in `modin.pandas.read_sql`.

    This experimental feature provides distributed reading from a SQL source.

    Parameters
    ----------
    sql : str or SQLAlchemy Selectable (select or text object)
        SQL query to be executed or a table name.
    con : SQLAlchemy connectable, str, or sqlite3 connection
        Using SQLAlchemy makes it possible to use any DB supported by that library.
        If a DBAPI2 object, only sqlite3 is supported. The user is responsible for
        engine disposal and connection closure for the SQLAlchemy connectable; str
        connections are closed automatically. See
        `here <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
    index_col : str or list of str, optional
        Column(s) to set as index (MultiIndex).
    coerce_float : bool, default: True
        Attempts to convert values of non-string, non-numeric objects
        (like decimal.Decimal) to floating point, useful for SQL result sets.
    params : list, tuple or dict, optional
        List of parameters to pass to the execute method. The syntax used to pass
        parameters is database driver dependent. Check your database driver
        documentation for which of the five syntax styles, described in PEP 249's
        paramstyle, is supported. E.g. for psycopg2, uses %(name)s so use
        params={'name' : 'value'}.
    parse_dates : list or dict, optional
        - List of column names to parse as dates.
        - Dict of ``{column_name: format string}`` where format string is
          strftime compatible in case of parsing string times, or is one of
          (D, s, ns, ms, us) in case of parsing integer timestamps.
        - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
          to the keyword arguments of :func:`pandas.to_datetime`.
          Especially useful with databases without native Datetime support,
          such as SQLite.
    columns : list, optional
        List of column names to select from SQL table (only used when reading a table).
    chunksize : int, optional
        If specified, return an iterator where `chunksize` is the number of rows
        to include in each chunk.
    partition_column : str, optional
        Column used to share the data between the workers (MUST be an INTEGER column).
    lower_bound : int, optional
        The minimum value to be requested from the `partition_column`.
    upper_bound : int, optional
        The maximum value to be requested from the `partition_column`.
    max_sessions : int, optional
        The maximum number of simultaneous connections allowed to be used.

    Returns
    -------
    modin.DataFrame
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=FactoryDispatcher.read_sql(**kwargs))
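
# --- Usage sketch (not Modin source) -------------------------------------------------
# A hedged example of the experimental `read_sql` above: reading a table in parallel
# by splitting on an integer column. The connection string, table, and column names
# are hypothetical; experimental mode must be active
# (e.g. via `import modin.experimental.pandas as pd`).
import modin.experimental.pandas as pd

df = pd.read_sql(
    "SELECT * FROM events",        # query or table name
    "sqlite:///events.db",         # any SQLAlchemy-compatible connection string
    partition_column="event_id",   # MUST be an INTEGER column
    lower_bound=0,                 # smallest `partition_column` value to request
    upper_bound=1_000_000,         # largest `partition_column` value to request
    max_sessions=8,                # cap on simultaneous connections
)
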
def __init__(self):
    self.__own_attrs__ = set(type(self).__dict__.keys())
    Engine.subscribe(self.__update_engine)
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import numpy as np
import pandas
import pytest

import modin.pandas as pd
from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions
from modin.config import Engine, NPartitions
from modin.pandas.test.utils import df_equals

if Engine.get() == "Ray":
    import ray
if Engine.get() == "Dask":
    from distributed.client import get_client

NPartitions.put(4)


@pytest.mark.parametrize("axis", [None, 0, 1])
def test_unwrap_partitions(axis):
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df = pd.DataFrame(data)

    if axis is None:
        expected_partitions = df._query_compiler._modin_frame._partitions
        actual_partitions = np.array(unwrap_partitions(df, axis=axis))
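
# --- Illustration (not part of the original test file) --------------------------------
# A hedged sketch of the round trip the test above exercises: `unwrap_partitions`
# exposes the engine-level partition objects of a Modin DataFrame, and
# `from_partitions` rebuilds a DataFrame from them. It reuses the imports above; the
# shape and axis are arbitrary, and the exact axis semantics should be checked against
# the Modin docs.
def roundtrip_example():
    data = np.random.randint(0, 100, size=(256, 16))
    df = pd.DataFrame(data)

    # axis=0 yields one object per row partition (futures on Ray/Dask)
    row_parts = unwrap_partitions(df, axis=0)

    # rebuild a Modin DataFrame from those row partitions and compare
    rebuilt = from_partitions(row_parts, axis=0)
    df_equals(rebuilt, df)
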
import pandas
import pytest

import modin.experimental.pandas as pd
from modin.config import Engine
from modin.pandas.test.test_io import (  # noqa: F401
    df_equals,
    eval_io,
    make_sql_connection,
    _make_csv_file,
    teardown_test_files,
)
from modin.pandas.test.utils import get_unique_filename


@pytest.mark.skipif(
    Engine.get() == "Dask",
    reason="Dask does not have experimental API",
)
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    if Engine.get() == "Ray":
        pytest.xfail("Distributed read_sql is broken, see GH#2194")
    filename = "test_from_sql_distributed.db"
    table = "test_from_sql_distributed"
    conn = make_sql_connection(filename, table)
    query = "select * from {0}".format(table)

    pandas_df = pandas.read_sql(query, conn)
    modin_df_from_query = pd.read_sql(
        query, conn, partition_column="col1", lower_bound=0, upper_bound=6
    )
    modin_df_from_table = pd.read_sql(
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pytest

from modin.config import Engine
import modin.experimental.xgboost as xgb
import modin.pandas as pd


@pytest.mark.skipif(
    Engine.get() == "Ray",
    reason="This test doesn't make sense on Ray backend.",
)
@pytest.mark.skipif(
    Engine.get() == "Python",
    reason="This test doesn't make sense on a non-distributed backend (see issue #2938).",
)
def test_backend():
    try:
        xgb.train({}, xgb.DMatrix(pd.DataFrame([0]), pd.DataFrame([0])))
    except ValueError:
        pass
def test_engine_wrong_factory():
    with pytest.raises(FactoryNotFoundError):
        Engine.put("BadEngine")
    Engine.put("Python")  # revert engine to default
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import os

import pandas
import numpy as np
import pyarrow
import pytest

from modin.config import IsExperimental, Engine, Backend

IsExperimental.put(True)
Engine.put("ray")
Backend.put("omnisci")

import modin.pandas as pd
from modin.pandas.test.utils import (
    df_equals,
    bool_arg_values,
    to_pandas,
    test_data_values,
    test_data_keys,
    generate_multiindex,
    eval_general,
)


def set_execution_mode(frame, mode, recursive=False):
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pytest

from modin.config import Engine
import modin.experimental.xgboost as xgb


@pytest.mark.skipif(
    Engine.get() == "Ray",
    reason="This test doesn't make sense on Ray backend.",
)
@pytest.mark.parametrize("func", ["train", "predict"])
def test_backend(func):
    try:
        getattr(xgb, func)({}, xgb.ModinDMatrix(None, None))
    except ValueError:
        pass
def train(
    params: Dict,
    dtrain: DMatrix,
    *args,
    evals=(),
    num_actors: Optional[int] = None,
    evals_result: Optional[Dict] = None,
    **kwargs,
):
    """
    Run distributed training of XGBoost model.

    During execution it evenly distributes `dtrain` between workers according to the
    IP addresses of its partitions (if `dtrain` is not distributed evenly over the
    nodes, some partitions will be re-distributed between nodes), runs ``xgb.train``
    on each worker for a subset of `dtrain`, and reduces the training results of the
    workers using Rabit Context.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : modin.experimental.xgboost.DMatrix
        Data to be trained against.
    *args : iterable
        Other parameters for `xgboost.train`.
    evals : list of pairs (modin.experimental.xgboost.DMatrix, str), default: empty
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics will help us track the performance of the model.
    num_actors : int, optional
        Number of actors for training. If unspecified, this value will be
        computed automatically.
    evals_result : dict, optional
        Dict to store evaluation results in.
    **kwargs : dict
        Other parameters are the same as for `xgboost.train`.

    Returns
    -------
    modin.experimental.xgboost.Booster
        A trained booster.
    """
    LOGGER.info("Training started")

    if Engine.get() == "Ray":
        from .xgboost_ray import _train
    else:
        raise ValueError("Current version supports only Ray engine.")

    assert isinstance(
        dtrain, DMatrix
    ), f"Type of `dtrain` is {type(dtrain)}, but expected {DMatrix}."
    result = _train(
        dtrain, params, *args, num_actors=num_actors, evals=evals, **kwargs
    )
    if isinstance(evals_result, dict):
        evals_result.update(result["history"])

    LOGGER.info("Training finished")
    return Booster(model_file=result["booster"])
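
# --- Usage sketch (not Modin source) -------------------------------------------------
# A hedged example of the distributed `train` above, assuming the newer
# `modin.experimental.xgboost` API with `DMatrix` and a running Ray engine.
# Column names, the 80/20 split, and booster params are made up for illustration.
import modin.pandas as mpd
import modin.experimental.xgboost as xgb

X = mpd.DataFrame({"f0": range(1000), "f1": range(1000)})
y = mpd.DataFrame({"label": [i % 2 for i in range(1000)]})

dtrain = xgb.DMatrix(X[:800], y[:800])
dvalid = xgb.DMatrix(X[800:], y[800:])

evals_result = {}
booster = xgb.train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    dtrain,
    num_boost_round=10,
    evals=[(dvalid, "valid")],
    evals_result=evals_result,  # filled in place with the training history
)
predictions = booster.predict(dvalid)  # a Modin DataFrame
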
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pandas
import pytest

import modin.experimental.pandas as pd
from modin.config import Engine
from modin.pandas.test.test_io import (  # noqa: F401
    df_equals,
    make_sql_connection,
)


@pytest.mark.skipif(
    Engine.get() == "Dask",
    reason="Dask does not have experimental API",
)
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    if Engine.get() == "Ray":
        pytest.xfail("Distributed read_sql is broken, see GH#2194")
    filename = "test_from_sql_distributed.db"
    table = "test_from_sql_distributed"
    conn = make_sql_connection(filename, table)
    query = "select * from {0}".format(table)

    pandas_df = pandas.read_sql(query, conn)
    modin_df_from_query = pd.read_sql(
        query, conn, partition_column="col1", lower_bound=0, upper_bound=6
    )
    modin_df_from_table = pd.read_sql(
def predict(
    self,
    data: DMatrix,
    **kwargs,
):
    """
    Run distributed prediction with a trained booster.

    During execution it runs ``xgb.predict`` on each worker for a subset of
    `data` and creates a Modin DataFrame with prediction results.

    Parameters
    ----------
    data : modin.experimental.xgboost.DMatrix
        Input data used for prediction.
    **kwargs : dict
        Other parameters are the same as for ``xgboost.Booster.predict``.

    Returns
    -------
    modin.pandas.DataFrame
        Modin DataFrame with prediction results.
    """
    LOGGER.info("Prediction started")

    if Engine.get() == "Ray":
        from .xgboost_ray import _predict
    else:
        raise ValueError("Current version supports only Ray engine.")

    assert isinstance(
        data, DMatrix
    ), f"Type of `data` is {type(data)}, but expected {DMatrix}."

    if (
        self.feature_names is not None
        and data.feature_names is not None
        and self.feature_names != data.feature_names
    ):
        data_missing = set(self.feature_names) - set(data.feature_names)
        self_missing = set(data.feature_names) - set(self.feature_names)

        msg = "feature_names mismatch: {0} {1}"

        if data_missing:
            msg += (
                "\nexpected "
                + ", ".join(str(s) for s in data_missing)
                + " in input data"
            )

        if self_missing:
            msg += (
                "\ntraining data did not have the following fields: "
                + ", ".join(str(s) for s in self_missing)
            )

        raise ValueError(msg.format(self.feature_names, data.feature_names))

    result = _predict(self.copy(), data, **kwargs)
    LOGGER.info("Prediction finished")

    return result
import pytest

import modin.experimental.pandas as pd
from modin.config import Engine
from modin.utils import get_current_execution
from modin.pandas.test.utils import (
    df_equals,
    get_unique_filename,
    teardown_test_files,
    test_data,
)
from modin.test.test_utils import warns_that_defaulting_to_pandas
from modin.pandas.test.utils import parse_dates_values_by_id, time_parsing_csv_path


@pytest.mark.skipif(
    Engine.get() == "Dask",
    reason="Dask does not have experimental API",
)
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    if Engine.get() == "Ray":
        filename = "test_from_sql_distributed.db"
        table = "test_from_sql_distributed"
        conn = make_sql_connection(filename, table)
        query = "select * from {0}".format(table)

        pandas_df = pandas.read_sql(query, conn)
        modin_df_from_query = pd.read_sql(
            query,
            conn,
            partition_column="col1",
            lower_bound=0,
        return cls.__factory._read_sql_table(**kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._read_sql_query)
    def read_sql_query(cls, **kwargs):
        return cls.__factory._read_sql_query(**kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._read_spss)
    def read_spss(cls, **kwargs):
        return cls.__factory._read_spss(**kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._to_sql)
    def to_sql(cls, *args, **kwargs):
        return cls.__factory._to_sql(*args, **kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._to_pickle)
    def to_pickle(cls, *args, **kwargs):
        return cls.__factory._to_pickle(*args, **kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._to_csv)
    def to_csv(cls, *args, **kwargs):
        return cls.__factory._to_csv(*args, **kwargs)


Engine.subscribe(FactoryDispatcher._update_factory)
Backend.subscribe(FactoryDispatcher._update_factory)
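
# --- Illustration (not Modin source) -----------------------------------------------
# A minimal sketch of the dispatch pattern used by the classmethods above: a
# dispatcher holds a reference to the currently selected factory, and each public
# `read_*` / `to_*` entry point simply forwards to the factory's private counterpart.
# `TinyDispatcher` and `CsvOnlyFactory` are hypothetical names for the sketch.
class CsvOnlyFactory:
    @classmethod
    def _read_csv(cls, **kwargs):
        return f"reading csv with {kwargs}"


class TinyDispatcher:
    __factory = None

    @classmethod
    def _update_factory(cls, _):
        # real code looks the factory up by name from the active Engine/Backend config
        cls.__factory = CsvOnlyFactory

    @classmethod
    def read_csv(cls, **kwargs):
        return cls.__factory._read_csv(**kwargs)


TinyDispatcher._update_factory(None)
print(TinyDispatcher.read_csv(filepath_or_buffer="data.csv"))
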
        num_cpus = remote_ray.cluster_resources()["CPU"]
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")
    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError(
            "Unrecognized execution engine: {}.".format(publisher.get())
        )

    _is_first_update[publisher.get()] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))


Engine.subscribe(_update_engine)

from .. import __version__
from .dataframe import DataFrame
from .io import (
    read_csv,
    read_parquet,
    read_json,
    read_html,
    read_clipboard,
    read_excel,
    read_hdf,
    read_feather,
    read_stata,
    read_sas,
    read_pickle,
def make_wrapped_class(local_cls: type, rpyc_wrapper_name: str):
    """
    Replace the given local class in its module with a replacement class
    which has __new__ defined (a dual-nature class).

    This new class is instantiated differently depending on whether this is done
    in a remote or a local context.

    In a local context we effectively get the same behaviour, but in a remote
    context the created class is actually of a separate type which proxies most
    requests to the remote end.

    Parameters
    ----------
    local_cls : class
        The class to replace with a dual-nature class.
    rpyc_wrapper_name : str
        The function *name* to make a proxy class type. Note that this is
        specifically taken as a string so as not to import the "rpyc_proxy" module
        at top level, as it requires RPyC to be installed, and not all users of
        Modin (even in experimental mode) need a remote context.
    """
    # get a copy of local_cls attributes' dict but skip _very_ special attributes,
    # because copying them to a different type leads to them not working.
    # Python should create new descriptors automatically for us instead.
    namespace = {
        name: value
        for name, value in local_cls.__dict__.items()
        if not isinstance(value, types.GetSetDescriptorType)
    }
    namespace["__real_cls__"] = None
    namespace["__new__"] = None
    # define a new class the same way original was defined but with replaced
    # metaclass and a few more attributes in namespace
    result = RemoteMeta(local_cls.__name__, local_cls.__bases__, namespace)

    def make_new(__class__):
        """
        Define a __new__() with a __class__ that is closure-bound, needed for
        super() to work.
        """
        # update '__class__' magic closure value - used by super()
        for attr in __class__.__dict__.values():
            if not callable(attr):
                continue
            cells = getattr(attr, "__closure__", None) or ()
            for cell in cells:
                if cell.cell_contents is local_cls:
                    cell.cell_contents = __class__

        def __new__(cls, *a, **kw):
            if cls is result and cls.__real_cls__ is not result:
                return cls.__real_cls__(*a, **kw)
            return super().__new__(cls)

        __class__.__new__ = __new__

    make_new(result)
    setattr(sys.modules[local_cls.__module__], local_cls.__name__, result)
    _KNOWN_DUALS[local_cls] = result

    def update_class(_):
        if Engine.get() in REMOTE_ENGINES:
            from . import rpyc_proxy

            result.__real_cls__ = getattr(rpyc_proxy, rpyc_wrapper_name)(result)
        else:
            result.__real_cls__ = result

    Engine.subscribe(update_class)
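
# --- Illustration (not Modin source) -----------------------------------------------
# A standalone sketch of the closure-cell trick `make_new` above relies on:
# zero-argument `super()` (and bare `__class__`) work through an implicit `__class__`
# cell captured by each method, and when methods are copied onto a freshly built
# class that cell still points at the original class until it is patched.
import types


class Original:
    def greet(self):
        return f"greet from {__class__.__name__}"


# rebuild a replacement type reusing Original's methods, skipping the special
# getset descriptors just like make_wrapped_class does
namespace = {
    name: value
    for name, value in Original.__dict__.items()
    if not isinstance(value, types.GetSetDescriptorType)
}
Replacement = type("Replacement", Original.__bases__, namespace)

print(Replacement().greet())  # -> "greet from Original": the old cell is still bound

# patch every closure cell that currently holds Original to hold Replacement instead
for attr in Replacement.__dict__.values():
    for cell in getattr(attr, "__closure__", None) or ():
        if cell.cell_contents is Original:
            cell.cell_contents = Replacement

print(Replacement().greet())  # -> "greet from Replacement"
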
    def read_sql(cls, **kwargs):
        return cls.__engine._read_sql(**kwargs)

    @classmethod
    def read_fwf(cls, **kwargs):
        return cls.__engine._read_fwf(**kwargs)

    @classmethod
    def read_sql_table(cls, **kwargs):
        return cls.__engine._read_sql_table(**kwargs)

    @classmethod
    def read_sql_query(cls, **kwargs):
        return cls.__engine._read_sql_query(**kwargs)

    @classmethod
    def read_spss(cls, **kwargs):
        return cls.__engine._read_spss(**kwargs)

    @classmethod
    def to_sql(cls, *args, **kwargs):
        return cls.__engine._to_sql(*args, **kwargs)

    @classmethod
    def to_pickle(cls, *args, **kwargs):
        return cls.__engine._to_pickle(*args, **kwargs)


Engine.subscribe(EngineDispatcher._update_engine)
Backend.subscribe(EngineDispatcher._update_engine)
import numpy as np
import pandas
import pytest

import modin.pandas as pd
from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions
from modin.config import Engine, NPartitions
from modin.pandas.test.utils import df_equals
from modin.pandas.indexing import compute_sliced_len
from modin.data_management.factories.dispatcher import FactoryDispatcher

PartitionClass = (
    FactoryDispatcher.get_factory().io_cls.frame_cls._partition_mgr_cls._partition_class
)

if Engine.get() == "Ray":
    import ray

    put_func = ray.put
    get_func = ray.get
    FutureType = ray.ObjectRef
elif Engine.get() == "Dask":
    from distributed.client import default_client
    from distributed import Future

    put_func = lambda x: default_client().scatter(x)  # noqa: E731
    get_func = lambda x: x.result()  # noqa: E731
    FutureType = Future
elif Engine.get() == "Python":
    put_func = lambda x: x  # noqa: E731
    get_func = lambda x: x  # noqa: E731
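
# --- Illustration (not part of the original test file) --------------------------------
# A hedged sketch of how the engine-specific `put_func` / `get_func` selected above
# might be used to keep test helpers engine-agnostic. The helper names are made up.
def roundtrip(pandas_obj):
    # ship an object to the engine's store and pull it back: an object-store round
    # trip on Ray/Dask, a no-op on the Python engine
    return get_func(put_func(pandas_obj))


def assert_partition_equals(partition, expected_pandas_df):
    # `partition` is whatever the engine hands out (an ObjectRef on Ray, a Future on
    # Dask, a plain pandas object on Python); get_func normalizes it in every case
    df_equals(get_func(partition), expected_pandas_df)
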