def _read(**kwargs) -> DataFrame:
    """
    General documentation is available in `modin.pandas.read_csv`.

    This experimental feature provides parallel reading from multiple
    CSV files defined by a glob pattern. Works for local files only!

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments in `modin.pandas.read_csv`.

    Returns
    -------
    modin.DataFrame
    """
    Engine.subscribe(_update_engine)
    try:
        pd_obj = FactoryDispatcher.read_csv_glob(**kwargs)
    except AttributeError:
        raise AttributeError("read_csv_glob() is only implemented for pandas on Ray.")

    # This happens when `read_csv` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(
            query_compiler=reader(*args, **kwargs)
        )
        return pd_obj

    return DataFrame(query_compiler=pd_obj)
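# Usage sketch (the file names below are hypothetical): reading a set of local
# CSV shards through the experimental API. Per the function above, this reads
# all matched files in parallel on the pandas-on-Ray execution and works for
# local files only.
import modin.experimental.pandas as pd

# Every file matching the glob pattern is read in parallel and the results are
# concatenated into a single Modin DataFrame.
df = pd.read_csv_glob("data/sales_2021_part*.csv")
print(df.shape)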
def _read(**kwargs): """ Read csv file from local disk. Parameters ---------- filepath_or_buffer: The filepath of the csv file. We only support local files for now. kwargs: Keyword arguments in pandas.read_csv """ from modin.data_management.factories.dispatcher import EngineDispatcher Engine.subscribe(_update_engine) try: pd_obj = EngineDispatcher.read_csv_glob(**kwargs) except AttributeError: raise AttributeError( "read_csv_glob() is only implemented for pandas on Ray.") # This happens when `read_csv` returns a TextFileReader object for iterating through if isinstance(pd_obj, pandas.io.parsers.TextFileReader): reader = pd_obj.read pd_obj.read = lambda *args, **kwargs: DataFrame(query_compiler=reader( *args, **kwargs)) return pd_obj return DataFrame(query_compiler=pd_obj)
def to_pickle_distributed(
    self,
    filepath_or_buffer: FilePathOrBuffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):
    """
    Pickle (serialize) object to file.

    If `*` is in the filename, all partitions are written to their own separate
    files; otherwise the default pandas implementation is used.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in the specified path.
        Compression mode may be any of the following possible values:
        {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is
        'infer' and path_or_buf is path-like, then detect compression mode
        from the following extensions: '.gz', '.bz2', '.zip' or '.xz'
        (otherwise no compression). If dict given and mode is 'zip' or
        inferred as 'zip', other entries are passed as additional
        compression options.
    protocol : int, default: pickle.HIGHEST_PROTOCOL
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
        values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
        parameter is equivalent to setting its value to HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection,
        e.g. host, port, username, password, etc., if using a URL that will
        be parsed by fsspec, e.g., starting "s3://", "gcs://". An error will
        be raised if providing this argument with a non-fsspec URL. See the
        fsspec and backend storage implementation docs for the set of
        allowed keys and values.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    obj = self
    Engine.subscribe(_update_engine)
    if isinstance(self, DataFrame):
        obj = self._query_compiler
    FactoryDispatcher.to_pickle_distributed(
        obj,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )
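# Usage sketch (file names are hypothetical): per the docstring above, a `*`
# in the path makes every partition go to its own pickle file, while a plain
# path falls back to the default pandas implementation. Assumes the
# experimental API is imported.
import modin.experimental.pandas as pd

df = pd.DataFrame({"a": range(10), "b": range(10)})

# One file per partition; the exact file names derived from the pattern are an
# implementation detail of the distributed writer.
df.to_pickle_distributed("frame_*.pkl")

# No `*` in the name -> regular single-file pandas `to_pickle` behaviour.
df.to_pickle_distributed("frame_all.pkl")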
def _read(**kwargs) -> DataFrame:
    """
    General documentation is available in `modin.pandas.read_csv`.

    This experimental feature provides parallel reading from multiple
    CSV files defined by a glob pattern.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments in `modin.pandas.read_csv`.

    Returns
    -------
    modin.DataFrame

    Examples
    --------
    >>> import modin.experimental.pandas as pd
    >>> df = pd.read_csv_glob("s3://nyc-tlc/trip data/yellow_tripdata_2020-1*")
    UserWarning: `read_*` implementation has mismatches with pandas:
    Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.
             VendorID tpep_pickup_datetime  ... total_amount  congestion_surcharge
    0             1.0  2020-10-01 00:09:08  ...         4.30                   0.0
    1             1.0  2020-10-01 00:09:19  ...        13.30                   2.5
    2             1.0  2020-10-01 00:30:00  ...        15.36                   2.5
    3             2.0  2020-10-01 00:56:46  ...        -3.80                   0.0
    4             2.0  2020-10-01 00:56:46  ...         3.80                   0.0
    ...           ...                  ...  ...          ...                   ...
    4652008       NaN  2020-12-31 23:44:35  ...        43.95                   2.5
    4652009       NaN  2020-12-31 23:41:36  ...        20.17                   2.5
    4652010       NaN  2020-12-31 23:01:17  ...        78.98                   0.0
    4652011       NaN  2020-12-31 23:31:29  ...        39.50                   0.0
    4652012       NaN  2020-12-31 23:12:48  ...        20.64                   0.0

    [4652013 rows x 18 columns]
    """
    Engine.subscribe(_update_engine)
    try:
        pd_obj = FactoryDispatcher.read_csv_glob(**kwargs)
    except AttributeError:
        raise AttributeError("read_csv_glob() is only implemented for pandas on Ray.")

    # This happens when `read_csv` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(
            query_compiler=reader(*args, **kwargs)
        )
        return pd_obj

    return DataFrame(query_compiler=pd_obj)
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    partition_column=None,
    lower_bound=None,
    upper_bound=None,
    max_sessions=None,
):
    """Read SQL query or database table into a DataFrame.

    Args:
        sql: string or SQLAlchemy Selectable (select or text object) SQL query to be
            executed or a table name.
        con: SQLAlchemy connectable (engine/connection) or database string URI or
            DBAPI2 connection (fallback mode).
        index_col: Column(s) to set as index (MultiIndex).
        coerce_float: Attempts to convert values of non-string, non-numeric objects
            (like decimal.Decimal) to floating point, useful for SQL result sets.
        params: List of parameters to pass to execute method. The syntax used to pass
            parameters is database driver dependent. Check your database driver
            documentation for which of the five syntax styles, described in PEP 249's
            paramstyle, is supported.
        parse_dates:
            - List of column names to parse as dates.
            - Dict of ``{column_name: format string}`` where format string is strftime
              compatible in case of parsing string times, or is one of (D, s, ns, ms, us)
              in case of parsing integer timestamps.
            - Dict of ``{column_name: arg dict}``, where the arg dict corresponds to the
              keyword arguments of :func:`pandas.to_datetime`. Especially useful with
              databases without native Datetime support, such as SQLite.
        columns: List of column names to select from SQL table (only used when reading
            a table).
        chunksize: If specified, return an iterator where `chunksize` is the number of
            rows to include in each chunk.
        partition_column: Column used to share the data between the workers
            (MUST be an INTEGER column).
        lower_bound: The minimum value to be requested from the partition_column.
        upper_bound: The maximum value to be requested from the partition_column.
        max_sessions: The maximum number of simultaneous connections allowed to use.

    Returns:
        Modin DataFrame.
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs))
def read_pickle_distributed(
    filepath_or_buffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    """
    Load pickled pandas object from files.

    This experimental feature provides parallel reading from multiple pickle
    files defined by a glob pattern. The files must contain parts of one
    dataframe, which can be obtained, for example, by the
    `to_pickle_distributed` function.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        File path, URL, or buffer where the pickled object will be loaded from.
        URLs are accepted and are not limited to S3 and GCS.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        If 'infer' and 'path_or_url' is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        compression). If 'infer' and 'path_or_url' is not path-like, then use
        None (= no decompression).
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will be parsed
        by fsspec, e.g., starting "s3://", "gcs://". An error will be raised if
        providing this argument with a non-fsspec URL. See the fsspec and backend
        storage implementation docs for the set of allowed keys and values.

    Returns
    -------
    unpickled : same type as object stored in file

    Notes
    -----
    The number of partitions is equal to the number of input files.
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(
        query_compiler=FactoryDispatcher.read_pickle_distributed(**kwargs)
    )
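# Usage sketch (hypothetical path): reading back the per-partition pickles
# written by `to_pickle_distributed`. Per the Notes above, the resulting frame
# gets one partition per matched file.
import modin.experimental.pandas as pd

df = pd.read_pickle_distributed("frame_*.pkl")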
def read_custom_text(
    filepath_or_buffer,
    columns,
    custom_parser,
    compression="infer",
    nrows: Optional[int] = None,
    is_quoting=True,
):
    """
    Load custom text data from file.

    Parameters
    ----------
    filepath_or_buffer : str
        File path where the custom text data will be loaded from.
    columns : list or callable(file-like object, **kwargs) -> list
        Column names as a list, or a callable that creates column names from the
        opened file and the passed `kwargs`.
    custom_parser : callable(file-like object, **kwargs) -> pandas.DataFrame
        Function that takes as input a part of the `filepath_or_buffer` file
        loaded into memory in file-like object form.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        If 'infer' and 'path_or_url' is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        compression). If 'infer' and 'path_or_url' is not path-like, then use
        None (= no decompression).
    nrows : int, optional
        Number of rows to read.
    is_quoting : bool, default: True
        Whether or not to consider quotes.

    Returns
    -------
    modin.DataFrame
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=FactoryDispatcher.read_custom_text(**kwargs))
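# Usage sketch, assuming a newline-delimited JSON input file (the file name,
# column names, and parser below are illustrative, not from the source):
# `custom_parser` receives parts of the file as file-like objects and must
# return a pandas.DataFrame with the declared columns.
import pandas
import modin.experimental.pandas as pd


def parse_ndjson(file_like, **kwargs):
    # Each chunk handed to the parser is parsed independently into a pandas frame.
    return pandas.read_json(file_like, lines=True)


df = pd.read_custom_text(
    "records.ndjson",
    columns=["id", "value"],
    custom_parser=parse_ndjson,
)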
    def read_sql(cls, **kwargs):
        return cls.__engine._read_sql(**kwargs)

    @classmethod
    def read_fwf(cls, **kwargs):
        return cls.__engine._read_fwf(**kwargs)

    @classmethod
    def read_sql_table(cls, **kwargs):
        return cls.__engine._read_sql_table(**kwargs)

    @classmethod
    def read_sql_query(cls, **kwargs):
        return cls.__engine._read_sql_query(**kwargs)

    @classmethod
    def read_spss(cls, **kwargs):
        return cls.__engine._read_spss(**kwargs)

    @classmethod
    def to_sql(cls, *args, **kwargs):
        return cls.__engine._to_sql(*args, **kwargs)

    @classmethod
    def to_pickle(cls, *args, **kwargs):
        return cls.__engine._to_pickle(*args, **kwargs)


Engine.subscribe(EngineDispatcher._update_engine)
Backend.subscribe(EngineDispatcher._update_engine)
        num_cpus = remote_ray.cluster_resources()["CPU"]
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")
    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(publisher.get()))

    _is_first_update[publisher.get()] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))


Engine.subscribe(_update_engine)

from .. import __version__
from .dataframe import DataFrame
from .io import (
    read_csv,
    read_parquet,
    read_json,
    read_html,
    read_clipboard,
    read_excel,
    read_hdf,
    read_feather,
    read_stata,
    read_sas,
    read_pickle,
    def __init__(self):
        self.__own_attrs__ = set(type(self).__dict__.keys())
        Engine.subscribe(self.__update_engine)
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    partition_column: Optional[str] = None,
    lower_bound: Optional[int] = None,
    upper_bound: Optional[int] = None,
    max_sessions: Optional[int] = None,
) -> DataFrame:
    """
    General documentation is available in `modin.pandas.read_sql`.

    This experimental feature provides distributed reading from a SQL source.

    Parameters
    ----------
    sql : str or SQLAlchemy Selectable (select or text object)
        SQL query to be executed or a table name.
    con : SQLAlchemy connectable, str, or sqlite3 connection
        Using SQLAlchemy makes it possible to use any DB supported by that
        library. If a DBAPI2 object, only sqlite3 is supported. The user is
        responsible for engine disposal and connection closure for the
        SQLAlchemy connectable; str connections are closed automatically. See
        `here <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
    index_col : str or list of str, optional
        Column(s) to set as index (MultiIndex).
    coerce_float : bool, default: True
        Attempts to convert values of non-string, non-numeric objects (like
        decimal.Decimal) to floating point, useful for SQL result sets.
    params : list, tuple or dict, optional
        List of parameters to pass to execute method. The syntax used to pass
        parameters is database driver dependent. Check your database driver
        documentation for which of the five syntax styles, described in PEP
        249's paramstyle, is supported. E.g. for psycopg2, uses %(name)s so
        use params={'name' : 'value'}.
    parse_dates : list or dict, optional
        - List of column names to parse as dates.
        - Dict of ``{column_name: format string}`` where format string is
          strftime compatible in case of parsing string times, or is one of
          (D, s, ns, ms, us) in case of parsing integer timestamps.
        - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
          to the keyword arguments of :func:`pandas.to_datetime`. Especially
          useful with databases without native Datetime support, such as SQLite.
    columns : list, optional
        List of column names to select from SQL table (only used when reading
        a table).
    chunksize : int, optional
        If specified, return an iterator where `chunksize` is the number of
        rows to include in each chunk.
    partition_column : str, optional
        Column used to share the data between the workers (MUST be an INTEGER
        column).
    lower_bound : int, optional
        The minimum value to be requested from the partition_column.
    upper_bound : int, optional
        The maximum value to be requested from the partition_column.
    max_sessions : int, optional
        The maximum number of simultaneous connections allowed to use.

    Returns
    -------
    modin.DataFrame
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=FactoryDispatcher.read_sql(**kwargs))
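# Usage sketch (the connection string, table, and bounds are hypothetical): a
# partitioned read that splits the query between workers on an integer column,
# using the `partition_column`/`lower_bound`/`upper_bound` knobs described above.
import modin.experimental.pandas as pd

df = pd.read_sql(
    "SELECT * FROM trips",                      # query or table name
    "postgresql://user:password@localhost/db",  # SQLAlchemy-style URI
    partition_column="trip_id",                 # MUST be an INTEGER column
    lower_bound=0,
    upper_bound=1_000_000,
    max_sessions=8,                             # cap on simultaneous connections
)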
def make_wrapped_class(local_cls: type, rpyc_wrapper_name: str):
    """
    Replace the given local class in its module with a replacement class
    which has __new__ defined (a dual-nature class).

    This new class is instantiated differently depending on whether this is
    done in a remote or local context. In a local context we effectively get
    the same behaviour, but in a remote context the created class is actually
    of a separate type which proxies most requests to a remote end.

    Parameters
    ----------
    local_cls : class
        The class to replace with a dual-nature class.
    rpyc_wrapper_name : str
        The function *name* to make a proxy class type. Note that this is
        specifically taken as a string to avoid importing the "rpyc_proxy"
        module at top level, as it requires RPyC to be installed, and not all
        users of Modin (even in experimental mode) need a remote context.
    """
    # get a copy of local_cls attributes' dict but skip _very_ special attributes,
    # because copying them to a different type leads to them not working.
    # Python should create new descriptors automatically for us instead.
    namespace = {
        name: value
        for name, value in local_cls.__dict__.items()
        if not isinstance(value, types.GetSetDescriptorType)
    }
    namespace["__real_cls__"] = None
    namespace["__new__"] = None
    # define a new class the same way original was defined but with replaced
    # metaclass and a few more attributes in namespace
    result = RemoteMeta(local_cls.__name__, local_cls.__bases__, namespace)

    def make_new(__class__):
        """
        Define a __new__() with a __class__ that is closure-bound, needed for super() to work.
        """
        # update '__class__' magic closure value - used by super()
        for attr in __class__.__dict__.values():
            if not callable(attr):
                continue
            cells = getattr(attr, "__closure__", None) or ()
            for cell in cells:
                if cell.cell_contents is local_cls:
                    cell.cell_contents = __class__

        def __new__(cls, *a, **kw):
            if cls is result and cls.__real_cls__ is not result:
                return cls.__real_cls__(*a, **kw)
            return super().__new__(cls)

        __class__.__new__ = __new__

    make_new(result)
    setattr(sys.modules[local_cls.__module__], local_cls.__name__, result)
    _KNOWN_DUALS[local_cls] = result

    def update_class(_):
        if Engine.get() in REMOTE_ENGINES:
            from . import rpyc_proxy

            result.__real_cls__ = getattr(rpyc_proxy, rpyc_wrapper_name)(result)
        else:
            result.__real_cls__ = result

    Engine.subscribe(update_class)
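# Simplified, self-contained sketch (illustrative only, not Modin code) of the
# "dual-nature" idea implemented above: __new__ checks whether a replacement
# class has been registered and, if so, builds an instance of that class
# instead of the local one.
class _DualNatureExample:
    __real_cls__ = None  # filled in later, e.g. by an Engine.subscribe callback

    def __new__(cls, *args, **kwargs):
        real = cls.__real_cls__
        if real is not None and real is not cls:
            # Remote context: delegate construction to the registered proxy class.
            return real(*args, **kwargs)
        # Local context: behave like a normal class.
        return super().__new__(cls)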
        return cls.__factory._read_sql_table(**kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._read_sql_query)
    def read_sql_query(cls, **kwargs):
        return cls.__factory._read_sql_query(**kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._read_spss)
    def read_spss(cls, **kwargs):
        return cls.__factory._read_spss(**kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._to_sql)
    def to_sql(cls, *args, **kwargs):
        return cls.__factory._to_sql(*args, **kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._to_pickle)
    def to_pickle(cls, *args, **kwargs):
        return cls.__factory._to_pickle(*args, **kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._to_csv)
    def to_csv(cls, *args, **kwargs):
        return cls.__factory._to_csv(*args, **kwargs)


Engine.subscribe(FactoryDispatcher._update_factory)
Backend.subscribe(FactoryDispatcher._update_factory)