def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover # noqa: D103
    # Assemble the dispatcher arguments explicitly: `sep` plus any extra
    # keyword arguments, forwarded verbatim.
    clipboard_kwargs = {"sep": sep, **kwargs}
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    compiler = EngineDispatcher.read_clipboard(**clipboard_kwargs)
    return DataFrame(query_compiler=compiler)
def read_pickle(
    filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer"
):
    """Load a pickled object from `filepath_or_buffer` via the selected engine."""
    pickle_kwargs = {
        "filepath_or_buffer": filepath_or_buffer,
        "compression": compression,
    }
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_pickle(**pickle_kwargs))
def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
    # Forward `sep` together with every extra keyword argument.
    clipboard_kwargs = {"sep": sep, **kwargs}
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    compiler = FactoryDispatcher.read_clipboard(**clipboard_kwargs)
    return DataFrame(query_compiler=compiler)
def read_orc(
    path, columns: Optional[List[str]] = None, **kwargs
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Load an ORC object from the file path, returning a DataFrame.
    """
    # No distributed implementation exists for ORC; warn and fall back to pandas.
    ErrorMessage.default_to_pandas("read_orc")
    Engine.subscribe(_update_engine)
    pandas_frame = pandas.read_orc(path, columns, **kwargs)
    return DataFrame(pandas_frame)
def read_spss(
    path: Union[str, pathlib.Path],
    usecols: Union[Sequence[str], type(None)] = None,
    convert_categoricals: bool = True,
):
    """Read an SPSS file through the configured execution engine."""
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    compiler = EngineDispatcher.read_spss(path, usecols, convert_categoricals)
    return DataFrame(query_compiler=compiler)
def read_spss(
    path: Union[str, pathlib.Path],
    usecols: Union[Sequence[str], type(None)] = None,
    convert_categoricals: bool = True,
):
    """Read an SPSS file, dispatching through the active execution factory."""
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    compiler = FactoryDispatcher.read_spss(path, usecols, convert_categoricals)
    return DataFrame(query_compiler=compiler)
def read_pickle(
    filepath_or_buffer: FilePathOrBuffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    """Load a pickled object, dispatching to the active execution factory."""
    pickle_kwargs = {
        "filepath_or_buffer": filepath_or_buffer,
        "compression": compression,
        "storage_options": storage_options,
    }
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_pickle(**pickle_kwargs))
def to_pickle(
    obj: Any,
    filepath_or_buffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):  # noqa: RT01
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing `os.PathLike[str]`), or file-like
        object implementing a binary ``write()`` function.
    compression : str or dict, default: 'infer'
        For on-the-fly compression of the output data. If `infer` and
        `filepath_or_buffer` is path-like, detect compression from the
        extensions '.gz', '.bz2', '.zip', '.xz', or '.zst' (otherwise no
        compression). Set to `None` for no compression. Can also be a dict
        with key `method` set to one of {`zip`, `gzip`, `bz2`, `zstd`} whose
        remaining key-value pairs are forwarded to ``zipfile.ZipFile``,
        ``gzip.GzipFile``, ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``,
        respectively. For example, for faster compression and a reproducible
        gzip archive:
        `compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}`.
    protocol : int, default: pickle.HIGHEST_PROTOCOL
        Int which indicates which protocol should be used by the pickler.
        The possible values depend on the Python version; a negative value
        is equivalent to HIGHEST_PROTOCOL.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection,
        e.g. host, port, username, password, etc. For HTTP(S) URLs the
        key-value pairs are forwarded to ``urllib`` as header options; for
        other URLs (e.g. starting with "s3://" or "gcs://") they are
        forwarded to ``fsspec``.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    # Unwrap Modin DataFrames: the dispatcher serializes the underlying
    # query compiler, not the frontend wrapper.
    payload = obj._query_compiler if isinstance(obj, DataFrame) else obj
    return FactoryDispatcher.to_pickle(
        payload,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )
def read_feather(
    path,
    columns=None,
    use_threads: bool = True,
    storage_options: StorageOptions = None,
):
    """Load a feather-format object via the selected execution engine."""
    feather_kwargs = {
        "path": path,
        "columns": columns,
        "use_threads": use_threads,
        "storage_options": storage_options,
    }
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_feather(**feather_kwargs))
def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):  # pragma: no cover
    """Read SAS files (XPORT or SAS7BDAT) via the selected execution engine."""
    sas_kwargs = {
        "filepath_or_buffer": filepath_or_buffer,
        "format": format,
        "index": index,
        "encoding": encoding,
        "chunksize": chunksize,
        "iterator": iterator,
    }
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_sas(**sas_kwargs))
def read_pickle(
    filepath_or_buffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):  # noqa: PR01, RT01, D200
    """
    Load pickled Modin object (or any object) from file.
    """
    pickle_kwargs = {
        "filepath_or_buffer": filepath_or_buffer,
        "compression": compression,
        "storage_options": storage_options,
    }
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_pickle(**pickle_kwargs))
def read_sql_query(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    chunksize=None,
):
    """Read a SQL query into a DataFrame via the selected execution engine."""
    query_kwargs = {
        "sql": sql,
        "con": con,
        "index_col": index_col,
        "coerce_float": coerce_float,
        "params": params,
        "parse_dates": parse_dates,
        "chunksize": chunksize,
    }
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_sql_query(**query_kwargs))
def to_pickle(
    obj: Any,
    filepath_or_buffer: Union[str, pathlib.Path],
    compression: Optional[str] = "infer",
    protocol: int = 4,
):
    """Serialize `obj` to a pickle file, unwrapping Modin DataFrames first."""
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    # The dispatcher serializes the query compiler, not the frontend wrapper.
    payload = obj._query_compiler if isinstance(obj, DataFrame) else obj
    return EngineDispatcher.to_pickle(
        payload, filepath_or_buffer, compression=compression, protocol=protocol
    )
def json_normalize(
    data: Union[Dict, List[Dict]],
    record_path: Optional[Union[str, List]] = None,
    meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
    meta_prefix: Optional[str] = None,
    record_prefix: Optional[str] = None,
    errors: Optional[str] = "raise",
    sep: str = ".",
    max_level: Optional[int] = None,
) -> DataFrame:
    """Normalize semi-structured JSON into a flat table (pandas fallback)."""
    ErrorMessage.default_to_pandas("json_normalize")
    Engine.subscribe(_update_engine)
    flattened = pandas.json_normalize(
        data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level
    )
    return DataFrame(flattened)
def read_feather(
    path,
    columns=None,
    use_threads: bool = True,
    storage_options: StorageOptions = None,
):  # noqa: PR01, RT01, D200
    """
    Load a feather-format object from the file path.
    """
    feather_kwargs = {
        "path": path,
        "columns": columns,
        "use_threads": use_threads,
        "storage_options": storage_options,
    }
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_feather(**feather_kwargs))
def read_sql_query(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    chunksize=None,
    dtype=None,
):
    """Read a SQL query into a DataFrame, dispatching to the active factory."""
    query_kwargs = {
        "sql": sql,
        "con": con,
        "index_col": index_col,
        "coerce_float": coerce_float,
        "params": params,
        "parse_dates": parse_dates,
        "chunksize": chunksize,
        "dtype": dtype,
    }
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_sql_query(**query_kwargs))
def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):  # pragma: no cover # noqa: PR01, RT01, D200
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.
    """
    sas_kwargs = {
        "filepath_or_buffer": filepath_or_buffer,
        "format": format,
        "index": index,
        "encoding": encoding,
        "chunksize": chunksize,
        "iterator": iterator,
    }
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_sas(**sas_kwargs))
def read_parquet(
    path,
    engine: str = "auto",
    columns=None,
    use_nullable_dtypes: bool = False,
    **kwargs,
):
    """Load a parquet object via the selected execution engine."""
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    compiler = EngineDispatcher.read_parquet(
        path=path,
        columns=columns,
        engine=engine,
        use_nullable_dtypes=use_nullable_dtypes,
        **kwargs,
    )
    return DataFrame(query_compiler=compiler)
def read_excel(
    io,
    sheet_name: "str | int | list[IntStrT] | None" = 0,
    header: "int | Sequence[int] | None" = 0,
    names=None,
    index_col: "int | Sequence[int] | None" = None,
    usecols=None,
    squeeze: "bool | None" = None,
    dtype: "DtypeArg | None" = None,
    engine: "Literal[('xlrd', 'openpyxl', 'odf', 'pyxlsb')] | None" = None,
    converters=None,
    true_values: "Iterable[Hashable] | None" = None,
    false_values: "Iterable[Hashable] | None" = None,
    skiprows: "Sequence[int] | int | Callable[[int], object] | None" = None,
    nrows: "int | None" = None,
    na_values=None,
    keep_default_na: "bool" = True,
    na_filter: "bool" = True,
    verbose: "bool" = False,
    parse_dates=False,
    date_parser=None,
    thousands: "str | None" = None,
    decimal: "str" = ".",
    comment: "str | None" = None,
    skipfooter: "int" = 0,
    convert_float: "bool | None" = None,
    mangle_dupe_cols: "bool" = True,
    storage_options: "StorageOptions" = None,
) -> "DataFrame | dict[IntStrT, DataFrame]":  # noqa: PR01, RT01, D200
    """
    Read an Excel file into a DataFrame.
    """
    # Snapshot the declared parameters; must be the first statement so no
    # other local names leak into the dispatcher call.
    excel_kwargs = inspect.getargvalues(inspect.currentframe()).locals
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    intermediate = FactoryDispatcher.read_excel(**excel_kwargs)
    if isinstance(intermediate, (OrderedDict, dict)):
        # Multiple sheets: wrap each sheet's query compiler, preserving the
        # mapping type (and therefore the sheet order) of the result.
        return type(intermediate)(
            (sheet, DataFrame(query_compiler=compiler))
            for sheet, compiler in intermediate.items()
        )
    return DataFrame(query_compiler=intermediate)
def json_normalize(
    data: Union[Dict, List[Dict]],
    record_path: Optional[Union[str, List]] = None,
    meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
    meta_prefix: Optional[str] = None,
    record_prefix: Optional[str] = None,
    errors: Optional[str] = "raise",
    sep: str = ".",
    max_level: Optional[int] = None,
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Normalize semi-structured JSON data into a flat table.
    """
    # No distributed implementation; warn and delegate to pandas.
    ErrorMessage.default_to_pandas("json_normalize")
    Engine.subscribe(_update_engine)
    flattened = pandas.json_normalize(
        data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level
    )
    return DataFrame(flattened)
def read_stata(
    filepath_or_buffer,
    convert_dates=True,
    convert_categoricals=True,
    index_col=None,
    convert_missing=False,
    preserve_dtypes=True,
    columns=None,
    order_categoricals=True,
    chunksize=None,
    iterator=False,
):
    """Read a Stata file via the selected execution engine."""
    # Snapshot the declared parameters; first statement so only the
    # signature's names are captured.
    stata_kwargs = inspect.getargvalues(inspect.currentframe()).locals
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_stata(**stata_kwargs))
def read_sql_table(
    table_name,
    con,
    schema=None,
    index_col=None,
    coerce_float=True,
    parse_dates=None,
    columns=None,
    chunksize=None,
):  # noqa: PR01, RT01, D200
    """
    Read SQL database table into a DataFrame.
    """
    table_kwargs = {
        "table_name": table_name,
        "con": con,
        "schema": schema,
        "index_col": index_col,
        "coerce_float": coerce_float,
        "parse_dates": parse_dates,
        "columns": columns,
        "chunksize": chunksize,
    }
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_sql_table(**table_kwargs))
def read_parquet(
    path,
    engine: str = "auto",
    columns=None,
    storage_options: StorageOptions = None,
    use_nullable_dtypes: bool = False,
    **kwargs,
):
    """Load a parquet object from the file path via the active factory."""
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    compiler = FactoryDispatcher.read_parquet(
        path=path,
        engine=engine,
        columns=columns,
        storage_options=storage_options,
        use_nullable_dtypes=use_nullable_dtypes,
        **kwargs,
    )
    return DataFrame(query_compiler=compiler)
def to_pickle(
    obj: Any,
    filepath_or_buffer: FilePathOrBuffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):
    """Serialize `obj` to a pickle file, unwrapping Modin DataFrames first."""
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    # The dispatcher serializes the query compiler, not the frontend wrapper.
    payload = obj._query_compiler if isinstance(obj, DataFrame) else obj
    return FactoryDispatcher.to_pickle(
        payload,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )
def read_fwf(
    filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):
    """
    Read a table of fixed-width formatted lines into a DataFrame.

    Parameters mirror ``pandas.read_fwf``; any extra keyword arguments in
    ``kwds`` are forwarded to the parser. Returns a DataFrame, or a
    ``TextFileReader`` whose ``read`` yields DataFrames when iterating.
    """
    # BUG FIX: snapshot the frame locals BEFORE importing the dispatcher.
    # `inspect.getargvalues` captures every local name bound so far, so the
    # original ordering (import first) leaked the local `EngineDispatcher`
    # into `kwargs` and forwarded it to `read_fwf` as a spurious keyword
    # argument. Every other wrapper in this module captures first.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwds", {}))
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    pd_obj = EngineDispatcher.read_fwf(**kwargs)
    # When `read_fwf` returns a TextFileReader object for iterating through,
    # patch its `read` so every chunk comes back as a Modin DataFrame.
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(
            query_compiler=reader(*args, **kwargs)
        )
        return pd_obj
    return DataFrame(query_compiler=pd_obj)
def read_stata(
    filepath_or_buffer,
    convert_dates=True,
    convert_categoricals=True,
    index_col=None,
    convert_missing=False,
    preserve_dtypes=True,
    columns=None,
    order_categoricals=True,
    chunksize=None,
    iterator=False,
    compression="infer",
    storage_options: StorageOptions = None,
):
    """Read a Stata file, dispatching through the active execution factory."""
    # Snapshot the declared parameters; first statement so only the
    # signature's names are captured.
    stata_kwargs = inspect.getargvalues(inspect.currentframe()).locals
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_stata(**stata_kwargs))
def read_hdf(
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start: Optional[int] = None,
    stop: Optional[int] = None,
    columns=None,
    iterator=False,
    chunksize: Optional[int] = None,
    **kwargs,
):
    """Read from an HDF5 store via the selected execution engine."""
    # Snapshot the declared parameters (first statement, so only signature
    # names are captured), then fold the catch-all `kwargs` mapping in.
    hdf_kwargs = inspect.getargvalues(inspect.currentframe()).locals
    hdf_kwargs.update(hdf_kwargs.pop("kwargs", {}))
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_hdf(**hdf_kwargs))
def read_excel(
    io,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=True,
    mangle_dupe_cols=True,
    storage_options: StorageOptions = None,
):
    """Read an Excel file into one DataFrame or a mapping of sheet -> DataFrame."""
    # Snapshot the declared parameters; first statement so only the
    # signature's names are captured.
    excel_kwargs = inspect.getargvalues(inspect.currentframe()).locals
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    intermediate = EngineDispatcher.read_excel(**excel_kwargs)
    if isinstance(intermediate, (OrderedDict, dict)):
        # Multiple sheets: wrap each sheet's query compiler, preserving the
        # mapping type (and therefore the sheet order) of the result.
        return type(intermediate)(
            (sheet, DataFrame(query_compiler=compiler))
            for sheet, compiler in intermediate.items()
        )
    return DataFrame(query_compiler=intermediate)
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    """Read a SQL query or table into a DataFrame (or generator when chunked)."""
    sql_kwargs = {
        "sql": sql,
        "con": con,
        "index_col": index_col,
        "coerce_float": coerce_float,
        "params": params,
        "parse_dates": parse_dates,
        "columns": columns,
        "chunksize": chunksize,
    }
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    if chunksize is not None:
        # Chunked reads have no distributed implementation: default to pandas
        # and lazily wrap each produced chunk as a Modin DataFrame.
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        chunk_iter = pandas.read_sql(**sql_kwargs)
        return (
            DataFrame(query_compiler=EngineDispatcher.from_pandas(chunk))
            for chunk in chunk_iter
        )
    return DataFrame(query_compiler=EngineDispatcher.read_sql(**sql_kwargs))
def _read(**kwargs):
    """
    Read csv file from local disk.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments of ``pandas.read_csv``, including
        `filepath_or_buffer` (only local files are supported for now).
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    pd_obj = EngineDispatcher.read_csv(**kwargs)
    # `read_csv` may hand back a TextFileReader for chunked iteration; patch
    # its `read` so every chunk comes back wrapped as a Modin DataFrame.
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        original_read = pd_obj.read

        def _wrapped_read(*args, **read_kwargs):
            return DataFrame(query_compiler=original_read(*args, **read_kwargs))

        pd_obj.read = _wrapped_read
        return pd_obj
    return DataFrame(query_compiler=pd_obj)