def test_factory_switch():
    Engine.put("Test")
    assert FactoryDispatcher.get_factory() == PandasOnTestFactory
    assert FactoryDispatcher.get_factory().io_cls == "Foo"
    Engine.put("Python")  # revert engine to default

    StorageFormat.put("Test")
    assert FactoryDispatcher.get_factory() == TestOnPythonFactory
    assert FactoryDispatcher.get_factory().io_cls == "Bar"
    StorageFormat.put("Pandas")  # revert storage format to default

def to_pickle_distributed(
    self,
    filepath_or_buffer: FilePathOrBuffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):
    """
    Pickle (serialize) object to file.

    This experimental feature provides parallel writing into multiple pickle files
    which are defined by a glob pattern; otherwise (without a glob pattern) the
    default pandas implementation is used.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in the specified path.
        If compression mode is 'infer' and `path_or_buf` is path-like, then
        detect compression mode from the following extensions: '.gz', '.bz2',
        '.zip' or '.xz' (otherwise no compression). If a dict is given and
        mode is 'zip' or inferred as 'zip', other entries are passed as
        additional compression options.
    protocol : int, default: pickle.HIGHEST_PROTOCOL
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
        values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
        parameter is equivalent to setting its value to HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection,
        e.g. host, port, username, password, etc., if using a URL that will
        be parsed by fsspec, e.g., starting "s3://", "gcs://". An error will
        be raised if providing this argument with a non-fsspec URL. See the
        fsspec and backend storage implementation docs for the set of
        allowed keys and values.
    """
    obj = self
    Engine.subscribe(_update_engine)
    if isinstance(self, DataFrame):
        obj = self._query_compiler
    FactoryDispatcher.to_pickle_distributed(
        obj,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )

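# Usage sketch for the glob-pattern write path above (illustrative, not part
# of the source). Assumes Modin's experimental mode is enabled; the file name
# pattern is hypothetical.
import modin.experimental.pandas as pd

example_df = pd.DataFrame({"a": range(10)})
# A "*" in the path routes the call to the parallel multi-file writer;
# without a glob pattern the default pandas implementation is used.
example_df.to_pickle_distributed("example_part*.pkl")
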
def _read(**kwargs):
    """
    Read a csv file from local disk.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments in `pandas.read_csv`.

    Returns
    -------
    modin.pandas.DataFrame
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    squeeze = kwargs.pop("squeeze", False)
    pd_obj = FactoryDispatcher.read_csv(**kwargs)
    # This happens when `read_csv` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(
            query_compiler=reader(*args, **kwargs)
        )
        return pd_obj
    result = DataFrame(query_compiler=pd_obj)
    if squeeze:
        return result.squeeze(axis=1)
    return result

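# Usage sketch for the TextFileReader branch above (illustrative). With
# `chunksize` set, `read_csv` returns a reader whose patched `read` wraps each
# chunk into a Modin DataFrame; iteration goes through the same patched
# method. The file name is hypothetical.
import modin.pandas as pd

reader = pd.read_csv("example.csv", chunksize=1000)
for chunk in reader:
    print(type(chunk), len(chunk))  # each chunk is a modin.pandas.DataFrame
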
def read_fwf(
    filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):  # noqa: PR01, RT01, D200
    """
    Read a table of fixed-width formatted lines into DataFrame.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
    from pandas.io.parsers.base_parser import parser_defaults

    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwds", {}))
    target_kwargs = parser_defaults.copy()
    target_kwargs.update(kwargs)
    pd_obj = FactoryDispatcher.read_fwf(**target_kwargs)
    # When `read_fwf` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(
            query_compiler=reader(*args, **kwargs)
        )
        return pd_obj
    return DataFrame(query_compiler=pd_obj)

def from_non_pandas(df, index, columns, dtype):
    """
    Convert a non-pandas DataFrame into a Modin DataFrame.

    Parameters
    ----------
    df : object
        Non-pandas DataFrame.
    index : object
        Index for non-pandas DataFrame.
    columns : object
        Columns for non-pandas DataFrame.
    dtype : type
        Data type to force.

    Returns
    -------
    modin.pandas.DataFrame
        Converted DataFrame.
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    new_qc = FactoryDispatcher.from_non_pandas(df, index, columns, dtype)
    if new_qc is not None:
        from .dataframe import DataFrame

        return DataFrame(query_compiler=new_qc)
    return new_qc

def read_json(
    path_or_buf=None,
    orient=None,
    typ="frame",
    dtype=None,
    convert_axes=None,
    convert_dates=True,
    keep_default_dates=True,
    numpy=False,
    precise_float=False,
    date_unit=None,
    encoding=None,
    encoding_errors="strict",
    lines=False,
    chunksize=None,
    compression="infer",
    nrows: Optional[int] = None,
    storage_options: StorageOptions = None,
):  # noqa: PR01, RT01, D200
    """
    Convert a JSON string to a Modin DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_json(**kwargs))

def read_html(
    io,
    match=".+",
    flavor=None,
    header=None,
    index_col=None,
    skiprows=None,
    attrs=None,
    parse_dates=False,
    thousands=",",
    encoding=None,
    decimal=".",
    converters=None,
    na_values=None,
    keep_default_na=True,
    displayed_only=True,
):  # noqa: PR01, RT01, D200
    """
    Read HTML tables into a ``DataFrame`` object.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_html(**kwargs))

def read_gbq(
    query: str,
    project_id: Optional[str] = None,
    index_col: Optional[str] = None,
    col_order: Optional[List[str]] = None,
    reauth: bool = False,
    auth_local_webserver: bool = False,
    dialect: Optional[str] = None,
    location: Optional[str] = None,
    configuration: Optional[Dict[str, Any]] = None,
    credentials=None,
    use_bqstorage_api: Optional[bool] = None,
    progress_bar_type: Optional[str] = None,
    max_results: Optional[int] = None,
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Load data from Google BigQuery.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwargs", {}))

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_gbq(**kwargs))

def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
    """
    Read text from clipboard and pass to ``read_csv``.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwargs", {}))

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_clipboard(**kwargs))

def read_spss(
    path: Union[str, pathlib.Path],
    usecols: Union[Sequence[str], type(None)] = None,
    convert_categoricals: bool = True,
):
    """
    Load an SPSS file from the file path, returning a DataFrame.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(
        query_compiler=FactoryDispatcher.read_spss(path, usecols, convert_categoricals)
    )

def read_pickle(
    filepath_or_buffer: FilePathOrBuffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    """
    Load pickled Modin object (or any object) from file.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_pickle(**kwargs))

def to_pickle(
    obj: Any,
    filepath_or_buffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):  # noqa: RT01
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing `os.PathLike[str]`), or file-like
        object implementing a binary ``write()`` function.
    compression : str or dict, default: 'infer'
        For on-the-fly compression of the output data. If 'infer' and
        `filepath_or_buffer` is path-like, then detect compression from the
        following extensions: '.gz', '.bz2', '.zip', '.xz', or '.zst'
        (otherwise no compression). Set to `None` for no compression.
        Can also be a dict with key `method` set to one of
        {`zip`, `gzip`, `bz2`, `zstd`}, whose other key-value pairs are
        forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, ``bz2.BZ2File``,
        or ``zstandard.ZstdCompressor``, respectively. As an example, the
        following could be passed for faster compression and to create a
        reproducible gzip archive:
        `compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}`.
    protocol : int, default: pickle.HIGHEST_PROTOCOL
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL. The possible values for this parameter
        depend on the version of Python. For Python 2.x, possible values are
        0, 1, 2. For Python >= 3.0, 3 is a valid value. For Python >= 3.4, 4
        is a valid value. A negative value for the protocol parameter is
        equivalent to setting its value to HIGHEST_PROTOCOL.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection,
        e.g. host, port, username, password, etc. For HTTP(S) URLs the
        key-value pairs are forwarded to ``urllib`` as header options. For
        other URLs (e.g. starting with "s3://" and "gcs://") the key-value
        pairs are forwarded to ``fsspec``. Please see ``fsspec`` and
        ``urllib`` for more details.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    if isinstance(obj, DataFrame):
        obj = obj._query_compiler
    return FactoryDispatcher.to_pickle(
        obj,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )

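# Usage sketch for the `compression` dict described above (illustrative). The
# extra keys are forwarded to gzip.GzipFile; the path is hypothetical.
import modin.pandas as pd

frame = pd.DataFrame({"a": [1, 2, 3]})
pd.to_pickle(
    frame,
    "frame.pkl.gz",
    # compresslevel=1 trades ratio for speed; mtime=1 makes the archive reproducible.
    compression={"method": "gzip", "compresslevel": 1, "mtime": 1},
)
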
def read_feather(
    path,
    columns=None,
    use_threads: bool = True,
    storage_options: StorageOptions = None,
):
    """
    Load a feather-format object from the file path into a DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_feather(**kwargs))

def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    """
    Read SQL query or database table into a DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    if kwargs.get("chunksize") is not None:
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        df_gen = pandas.read_sql(**kwargs)
        return (
            DataFrame(query_compiler=FactoryDispatcher.from_pandas(df))
            for df in df_gen
        )
    return DataFrame(query_compiler=FactoryDispatcher.read_sql(**kwargs))

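# Usage sketch for the `chunksize` branch above (illustrative): the call
# defaults to pandas and yields Modin DataFrames one chunk at a time. The
# connection string and query are hypothetical.
import modin.pandas as pd

for chunk in pd.read_sql(
    "SELECT * FROM example", "sqlite:///example.db", chunksize=10_000
):
    print(len(chunk))  # each chunk is a modin.pandas.DataFrame
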
def _read(**kwargs) -> DataFrame:
    """
    Read csv files matching a glob pattern into a DataFrame.

    General documentation is available in `modin.pandas.read_csv`.

    This experimental feature provides parallel reading from multiple csv
    files which are defined by a glob pattern.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments in `modin.pandas.read_csv`.

    Returns
    -------
    modin.DataFrame

    Examples
    --------
    >>> import modin.experimental.pandas as pd
    >>> df = pd.read_csv_glob("s3://nyc-tlc/trip data/yellow_tripdata_2020-1*")
    UserWarning: `read_*` implementation has mismatches with pandas:
    Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.
             VendorID tpep_pickup_datetime  ... total_amount congestion_surcharge
    0             1.0  2020-10-01 00:09:08  ...         4.30                  0.0
    1             1.0  2020-10-01 00:09:19  ...        13.30                  2.5
    2             1.0  2020-10-01 00:30:00  ...        15.36                  2.5
    3             2.0  2020-10-01 00:56:46  ...        -3.80                  0.0
    4             2.0  2020-10-01 00:56:46  ...         3.80                  0.0
    ...           ...                  ...  ...          ...                  ...
    4652008       NaN  2020-12-31 23:44:35  ...        43.95                  2.5
    4652009       NaN  2020-12-31 23:41:36  ...        20.17                  2.5
    4652010       NaN  2020-12-31 23:01:17  ...        78.98                  0.0
    4652011       NaN  2020-12-31 23:31:29  ...        39.50                  0.0
    4652012       NaN  2020-12-31 23:12:48  ...        20.64                  0.0

    [4652013 rows x 18 columns]
    """
    Engine.subscribe(_update_engine)
    try:
        pd_obj = FactoryDispatcher.read_csv_glob(**kwargs)
    except AttributeError:
        raise AttributeError("read_csv_glob() is only implemented for pandas on Ray.")
    # This happens when `read_csv` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(
            query_compiler=reader(*args, **kwargs)
        )
        return pd_obj
    return DataFrame(query_compiler=pd_obj)

def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):  # pragma: no cover
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_sas(**kwargs))

def read_pickle(
    filepath_or_buffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):  # noqa: PR01, RT01, D200
    """
    Load pickled Modin object (or any object) from file.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_pickle(**kwargs))

def read_sql_query(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    chunksize=None,
    dtype=None,
):
    """
    Read SQL query into a DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_sql_query(**kwargs))

def read_excel(
    io,
    sheet_name: "str | int | list[IntStrT] | None" = 0,
    header: "int | Sequence[int] | None" = 0,
    names=None,
    index_col: "int | Sequence[int] | None" = None,
    usecols=None,
    squeeze: "bool | None" = None,
    dtype: "DtypeArg | None" = None,
    engine: "Literal[('xlrd', 'openpyxl', 'odf', 'pyxlsb')] | None" = None,
    converters=None,
    true_values: "Iterable[Hashable] | None" = None,
    false_values: "Iterable[Hashable] | None" = None,
    skiprows: "Sequence[int] | int | Callable[[int], object] | None" = None,
    nrows: "int | None" = None,
    na_values=None,
    keep_default_na: "bool" = True,
    na_filter: "bool" = True,
    verbose: "bool" = False,
    parse_dates=False,
    date_parser=None,
    thousands: "str | None" = None,
    decimal: "str" = ".",
    comment: "str | None" = None,
    skipfooter: "int" = 0,
    convert_float: "bool | None" = None,
    mangle_dupe_cols: "bool" = True,
    storage_options: "StorageOptions" = None,
) -> "DataFrame | dict[IntStrT, DataFrame]":  # noqa: PR01, RT01, D200
    """
    Read an Excel file into a DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    intermediate = FactoryDispatcher.read_excel(**kwargs)
    # Multiple sheets come back as a mapping of sheet name to query compiler;
    # wrap each value into a Modin DataFrame.
    if isinstance(intermediate, (OrderedDict, dict)):
        parsed = type(intermediate)()
        for key in intermediate.keys():
            parsed[key] = DataFrame(query_compiler=intermediate.get(key))
        return parsed
    else:
        return DataFrame(query_compiler=intermediate)

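# Usage sketch for the dict branch above (illustrative): `sheet_name=None`
# reads every sheet, and the result is a mapping of sheet name to Modin
# DataFrame. The file name is hypothetical.
import modin.pandas as pd

sheets = pd.read_excel("workbook.xlsx", sheet_name=None)
for name, frame in sheets.items():
    print(name, frame.shape)
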
def from_arrow(at):
    """
    Convert an Arrow Table to a Modin DataFrame.

    Parameters
    ----------
    at : Arrow Table
        The Arrow Table to convert from.

    Returns
    -------
    DataFrame
        A new Modin DataFrame object.
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    from .dataframe import DataFrame

    return DataFrame(query_compiler=FactoryDispatcher.from_arrow(at))

def from_pandas(df):
    """
    Convert a pandas DataFrame to a Modin DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The pandas DataFrame to convert.

    Returns
    -------
    modin.pandas.DataFrame
        A new Modin DataFrame object.
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    from .dataframe import DataFrame

    return DataFrame(query_compiler=FactoryDispatcher.from_pandas(df))

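# Usage sketch (illustrative): distributing an existing pandas frame. Assumes
# this helper is importable as `modin.pandas.utils.from_pandas`.
import pandas
from modin.pandas.utils import from_pandas

modin_df = from_pandas(pandas.DataFrame({"a": [1, 2, 3]}))
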
def read_sql_table(
    table_name,
    con,
    schema=None,
    index_col=None,
    coerce_float=True,
    parse_dates=None,
    columns=None,
    chunksize=None,
):  # noqa: PR01, RT01, D200
    """
    Read SQL database table into a DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_sql_table(**kwargs))

def to_pickle(
    obj: Any,
    filepath_or_buffer: FilePathOrBuffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):
    """
    Pickle (serialize) object to file.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    if isinstance(obj, DataFrame):
        obj = obj._query_compiler
    return FactoryDispatcher.to_pickle(
        obj,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )

def read_parquet(
    path,
    engine: str = "auto",
    columns=None,
    storage_options: StorageOptions = None,
    use_nullable_dtypes: bool = False,
    **kwargs,
):
    """
    Load a parquet object from the file path, returning a DataFrame.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(
        query_compiler=FactoryDispatcher.read_parquet(
            path=path,
            engine=engine,
            columns=columns,
            storage_options=storage_options,
            use_nullable_dtypes=use_nullable_dtypes,
            **kwargs,
        )
    )

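# Usage sketch (illustrative): column pruning is forwarded to the underlying
# parquet reader, so only the requested columns are loaded. The path is
# hypothetical.
import modin.pandas as pd

df = pd.read_parquet("dataset.parquet", columns=["a", "b"])
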
def read_pickle_distributed(
    filepath_or_buffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    """
    Load pickled pandas object from files.

    This experimental feature provides parallel reading from multiple pickle
    files which are defined by a glob pattern. The files must contain parts of
    one dataframe, which can be obtained, for example, by the
    `to_pickle_distributed` function.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        File path, URL, or buffer where the pickled object will be loaded
        from. URLs are accepted and are not limited to S3 and GCS.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        If 'infer' and `filepath_or_buffer` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip', or
        '.xz' (otherwise no compression). If 'infer' and `filepath_or_buffer`
        is not path-like, then use None (= no decompression).
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection,
        e.g. host, port, username, password, etc., if using a URL that will
        be parsed by fsspec, e.g., starting "s3://", "gcs://". An error will
        be raised if providing this argument with a non-fsspec URL. See the
        fsspec and backend storage implementation docs for the set of
        allowed keys and values.

    Returns
    -------
    unpickled : same type as object stored in file

    Notes
    -----
    The number of partitions is equal to the number of input files.
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=FactoryDispatcher.read_pickle_distributed(**kwargs))

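# Usage sketch (illustrative): round-tripping a frame through the distributed
# pickle writer and reader. Assumes experimental mode; the pattern is
# hypothetical. One file is written per partition, and reading the same glob
# gives one partition per file.
import modin.experimental.pandas as pd

df = pd.DataFrame({"a": range(100)})
df.to_pickle_distributed("round_trip*.pkl")
restored = pd.read_pickle_distributed("round_trip*.pkl")
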
def from_dataframe(df):
    """
    Convert a DataFrame implementing the dataframe exchange protocol to a Modin DataFrame.

    See more about the protocol in
    https://data-apis.org/dataframe-protocol/latest/index.html.

    Parameters
    ----------
    df : DataFrame
        The DataFrame object supporting the dataframe exchange protocol.

    Returns
    -------
    DataFrame
        A new Modin DataFrame object.
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    from .dataframe import DataFrame

    return DataFrame(query_compiler=FactoryDispatcher.from_dataframe(df))

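# Usage sketch (illustrative): any object exposing `__dataframe__` can be
# converted. pyarrow is used here only as an example producer and needs a
# version that implements the exchange protocol; the import path of the
# helper is an assumption.
import pyarrow as pa
from modin.pandas.utils import from_dataframe

modin_df = from_dataframe(pa.table({"a": [1, 2, 3]}))
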
def read_hdf(
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start: Optional[int] = None,
    stop: Optional[int] = None,
    columns=None,
    iterator=False,
    chunksize: Optional[int] = None,
    **kwargs,
):
    """
    Read data from an HDF5 store into a DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwargs", {}))

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_hdf(**kwargs))

def read_stata(
    filepath_or_buffer,
    convert_dates=True,
    convert_categoricals=True,
    index_col=None,
    convert_missing=False,
    preserve_dtypes=True,
    columns=None,
    order_categoricals=True,
    chunksize=None,
    iterator=False,
    compression="infer",
    storage_options: StorageOptions = None,
):
    """
    Read Stata file into a DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_stata(**kwargs))

def read_excel(
    io,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=None,
    mangle_dupe_cols=True,
    storage_options: StorageOptions = None,
):
    """
    Read an Excel file into a DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    intermediate = FactoryDispatcher.read_excel(**kwargs)
    # Multiple sheets come back as a mapping of sheet name to query compiler;
    # wrap each value into a Modin DataFrame.
    if isinstance(intermediate, (OrderedDict, dict)):
        parsed = type(intermediate)()
        for key in intermediate.keys():
            parsed[key] = DataFrame(query_compiler=intermediate.get(key))
        return parsed
    else:
        return DataFrame(query_compiler=intermediate)

def read_custom_text(
    filepath_or_buffer,
    columns,
    custom_parser,
    compression="infer",
    nrows: Optional[int] = None,
    is_quoting=True,
):
    """
    Load custom text data from file.

    Parameters
    ----------
    filepath_or_buffer : str
        File path where the custom text data will be loaded from.
    columns : list or callable(file-like object, **kwargs) -> list
        Column names of list type or callable that creates column names from
        the opened file and passed `kwargs`.
    custom_parser : callable(file-like object, **kwargs) -> pandas.DataFrame
        Function that takes as input a part of the `filepath_or_buffer` file
        loaded into memory in file-like object form.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        If 'infer' and `filepath_or_buffer` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip', or
        '.xz' (otherwise no compression). If 'infer' and `filepath_or_buffer`
        is not path-like, then use None (= no decompression).
    nrows : int, optional
        Number of rows to read.
    is_quoting : bool, default: True
        Whether or not to consider quotes.

    Returns
    -------
    modin.DataFrame
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=FactoryDispatcher.read_custom_text(**kwargs))

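# Usage sketch (illustrative): `custom_parser` receives a file-like chunk of
# the input and must return a pandas DataFrame for that chunk. Assumes
# experimental mode; the file layout ("name=value" per line) and all names
# are hypothetical.
import pandas
import modin.experimental.pandas as pd

def parse_kv(chunk, **kwargs):
    # Decode the in-memory chunk and split each "name=value" line in two.
    lines = chunk.read().decode().splitlines()
    return pandas.DataFrame(
        [line.split("=", 1) for line in lines], columns=["name", "value"]
    )

df = pd.read_custom_text(
    "example.txt", columns=["name", "value"], custom_parser=parse_kv
)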