示例#1
0
文件: io.py 项目: prutskov/modin
def read_html(
    io,
    match=".+",
    flavor=None,
    header=None,
    index_col=None,
    skiprows=None,
    attrs=None,
    parse_dates=False,
    thousands=",",
    encoding=None,
    decimal=".",
    converters=None,
    na_values=None,
    keep_default_na=True,
    displayed_only=True,
):  # noqa: PR01, RT01, D200
    """
    Read HTML tables and wrap the dispatcher result in a ``DataFrame``.

    Every argument is forwarded by name to ``FactoryDispatcher.read_html``.
    """
    # Capture the call arguments first, while the frame's locals hold nothing
    # but the parameters of this function.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    query_compiler = FactoryDispatcher.read_html(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#2
0
文件: io.py 项目: totemtang/modin
def read_json(
    path_or_buf=None,
    orient=None,
    typ="frame",
    dtype=None,
    convert_axes=None,
    convert_dates=True,
    keep_default_dates=True,
    numpy=False,
    precise_float=False,
    date_unit=None,
    encoding=None,
    encoding_errors="strict",
    lines=False,
    chunksize=None,
    compression="infer",
    nrows: Optional[int] = None,
    storage_options: StorageOptions = None,
):
    """
    Read JSON input and wrap the dispatcher result in a ``DataFrame``.

    All arguments are forwarded by name to ``FactoryDispatcher.read_json``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    query_compiler = FactoryDispatcher.read_json(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#3
0
文件: io.py 项目: totemtang/modin
def _read(**kwargs):
    """
    Read a csv file through ``FactoryDispatcher.read_csv``.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments in pandas.read_csv.

    Returns
    -------
    modin.pandas.DataFrame or pandas.io.parsers.TextFileReader
        A DataFrame built from the dispatcher result, or the reader itself
        (with its ``read`` method patched) when the dispatcher returns a
        ``TextFileReader``.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    parsed = FactoryDispatcher.read_csv(**kwargs)

    if not isinstance(parsed, pandas.io.parsers.TextFileReader):
        return DataFrame(query_compiler=parsed)

    # `read_csv` handed back a TextFileReader for iterating through: patch its
    # `read` so that every call wraps the original result in a DataFrame.
    original_read = parsed.read

    def _read_as_frame(*args, **kwargs):
        return DataFrame(query_compiler=original_read(*args, **kwargs))

    parsed.read = _read_as_frame
    return parsed
示例#4
0
文件: io.py 项目: prutskov/modin
def read_fwf(
    filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):  # noqa: PR01, RT01, D200
    """
    Read a table of fixed-width formatted lines into DataFrame.

    The named parameters and everything in ``**kwds`` are layered over a copy
    of pandas' ``parser_defaults`` and passed to ``FactoryDispatcher.read_fwf``.

    Returns
    -------
    DataFrame or pandas.io.parsers.TextFileReader
        A DataFrame built from the dispatcher result, or the reader itself
        (with its ``read`` method patched) when the dispatcher returns a
        ``TextFileReader``.
    """
    # Snapshot the call arguments before anything else is bound in this frame.
    # The function-level imports below create local names; capturing the frame
    # locals after them would forward ``FactoryDispatcher`` and
    # ``parser_defaults`` to the reader as bogus keyword arguments. The copy
    # also detaches the snapshot from the frame's own locals mapping.
    kwargs = dict(inspect.getargvalues(inspect.currentframe()).locals)
    # Flatten the variadic keyword arguments into the same mapping.
    kwargs.update(kwargs.pop("kwds", {}))

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
    from pandas.io.parsers.base_parser import parser_defaults

    # Start from the parser defaults so explicitly supplied values win.
    target_kwargs = parser_defaults.copy()
    target_kwargs.update(kwargs)
    pd_obj = FactoryDispatcher.read_fwf(**target_kwargs)
    # When `read_fwf` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(
            query_compiler=reader(*args, **kwargs)
        )
        return pd_obj
    return DataFrame(query_compiler=pd_obj)
示例#5
0
文件: io.py 项目: totemtang/modin
def read_xml(
    path_or_buffer,
    xpath="./*",
    namespaces=None,
    elems_only=False,
    attrs_only=False,
    names=None,
    encoding="utf-8",
    parser="lxml",
    stylesheet=None,
    compression="infer",
    storage_options=None,
) -> DataFrame:
    """
    Read XML with ``pandas.read_xml`` and wrap the result in a ``DataFrame``.

    Reports a default-to-pandas event via ``ErrorMessage`` before reading.
    """
    ErrorMessage.default_to_pandas("read_xml")
    Engine.subscribe(_update_engine)
    # Everything except the source itself is passed to pandas by keyword.
    options = {
        "xpath": xpath,
        "namespaces": namespaces,
        "elems_only": elems_only,
        "attrs_only": attrs_only,
        "names": names,
        "encoding": encoding,
        "parser": parser,
        "stylesheet": stylesheet,
        "compression": compression,
        "storage_options": storage_options,
    }
    pandas_frame = pandas.read_xml(path_or_buffer, **options)
    return DataFrame(pandas_frame)
示例#6
0
文件: io.py 项目: yangw1234/modin
def read_feather(path, columns=None, use_threads: bool = True):
    """
    Read a feather file and wrap the dispatcher result in a ``DataFrame``.

    All arguments are forwarded by name to ``EngineDispatcher.read_feather``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    query_compiler = EngineDispatcher.read_feather(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#7
0
文件: io.py 项目: RehanSD/modin
def _read(**kwargs):
    """
    Read a csv file through ``FactoryDispatcher.read_csv``.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments in pandas.read_csv. ``squeeze`` is removed here
        before dispatching and applied to the resulting DataFrame instead.

    Returns
    -------
    modin.pandas.DataFrame or pandas.io.parsers.TextFileReader
        The reader itself (with its ``read`` method patched) when the
        dispatcher returns a ``TextFileReader``; otherwise a DataFrame, or the
        result of ``squeeze(axis=1)`` on it when ``squeeze`` is truthy.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    squeeze = kwargs.pop("squeeze", False)
    parsed = FactoryDispatcher.read_csv(**kwargs)

    if isinstance(parsed, pandas.io.parsers.TextFileReader):
        # `read_csv` handed back a TextFileReader for iterating through: patch
        # its `read` so that every call wraps the result in a DataFrame.
        original_read = parsed.read

        def _read_as_frame(*args, **kwargs):
            return DataFrame(query_compiler=original_read(*args, **kwargs))

        parsed.read = _read_as_frame
        return parsed

    frame = DataFrame(query_compiler=parsed)
    return frame.squeeze(axis=1) if squeeze else frame
示例#8
0
文件: io.py 项目: prutskov/modin
def read_gbq(
    query: str,
    project_id: Optional[str] = None,
    index_col: Optional[str] = None,
    col_order: Optional[List[str]] = None,
    reauth: bool = False,
    auth_local_webserver: bool = False,
    dialect: Optional[str] = None,
    location: Optional[str] = None,
    configuration: Optional[Dict[str, Any]] = None,
    credentials=None,
    use_bqstorage_api: Optional[bool] = None,
    progress_bar_type: Optional[str] = None,
    max_results: Optional[int] = None,
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Load data from Google BigQuery.

    Every argument is forwarded by name to ``FactoryDispatcher.read_gbq`` and
    the result is wrapped in a ``DataFrame``.
    """
    # Capture the call arguments while the frame's locals hold only the
    # parameters. The signature has no ``**kwargs``, so there is no nested
    # "kwargs" entry to flatten into the captured mapping.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    return DataFrame(query_compiler=FactoryDispatcher.read_gbq(**kwargs))
示例#9
0
文件: io.py 项目: RehanSD/modin
def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
    """
    Read clipboard text and wrap the dispatcher result in a ``DataFrame``.

    ``sep`` and the contents of ``**kwargs`` are forwarded as keyword arguments
    to ``FactoryDispatcher.read_clipboard``.
    """
    # The captured locals hold `sep` plus a nested "kwargs" dict; flatten the
    # nested dict into the same mapping before dispatching.
    call_args = inspect.getargvalues(inspect.currentframe()).locals
    extra_args = call_args.pop("kwargs", {})
    call_args.update(extra_args)

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    query_compiler = FactoryDispatcher.read_clipboard(**call_args)
    return DataFrame(query_compiler=query_compiler)
示例#10
0
文件: io.py 项目: yangw1234/modin
def read_pickle(
    filepath_or_buffer: FilePathOrBuffer,
    compression: Optional[str] = "infer",
):
    """
    Read a pickled object and wrap the dispatcher result in a ``DataFrame``.

    Both arguments are forwarded by name to ``EngineDispatcher.read_pickle``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    query_compiler = EngineDispatcher.read_pickle(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#11
0
文件: io.py 项目: yangl235/modin
def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
    """
    Read clipboard text and wrap the dispatcher result in a ``DataFrame``.

    ``sep`` and the contents of ``**kwargs`` are forwarded as keyword arguments
    to ``EngineDispatcher.read_clipboard``.
    """
    # The captured locals hold `sep` plus a nested "kwargs" dict; flatten the
    # nested dict into the same mapping before dispatching.
    call_args = inspect.getargvalues(inspect.currentframe()).locals
    extra_args = call_args.pop("kwargs", {})
    call_args.update(extra_args)

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    query_compiler = EngineDispatcher.read_clipboard(**call_args)
    return DataFrame(query_compiler=query_compiler)
示例#12
0
文件: io.py 项目: prutskov/modin
def read_orc(
    path, columns: Optional[List[str]] = None, **kwargs
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Load an ORC object with ``pandas.read_orc`` and return it as a DataFrame.

    Reports a default-to-pandas event via ``ErrorMessage`` before reading.
    """
    ErrorMessage.default_to_pandas("read_orc")
    Engine.subscribe(_update_engine)
    pandas_frame = pandas.read_orc(path, columns, **kwargs)
    return DataFrame(pandas_frame)
示例#13
0
文件: io.py 项目: RehanSD/modin
def read_spss(
    path: Union[str, pathlib.Path],
    usecols: Union[Sequence[str], type(None)] = None,
    convert_categoricals: bool = True,
):
    """
    Read an SPSS file and wrap the dispatcher result in a ``DataFrame``.

    The three arguments are passed positionally, in signature order, to
    ``FactoryDispatcher.read_spss``.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    query_compiler = FactoryDispatcher.read_spss(path, usecols, convert_categoricals)
    return DataFrame(query_compiler=query_compiler)
示例#14
0
文件: io.py 项目: yangl235/modin
def read_spss(
    path: Union[str, pathlib.Path],
    usecols: Union[Sequence[str], type(None)] = None,
    convert_categoricals: bool = True,
):
    """
    Read an SPSS file and wrap the dispatcher result in a ``DataFrame``.

    The three arguments are passed positionally, in signature order, to
    ``EngineDispatcher.read_spss``.
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    query_compiler = EngineDispatcher.read_spss(path, usecols, convert_categoricals)
    return DataFrame(query_compiler=query_compiler)
示例#15
0
文件: io.py 项目: prutskov/modin
def to_pickle(
    obj: Any,
    filepath_or_buffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):  # noqa: RT01
    """
    Pickle (serialize) object to file.

    A ``DataFrame`` is replaced by its ``_query_compiler`` before the call is
    handed to ``FactoryDispatcher.to_pickle``; any other object is passed
    through unchanged.

    Parameters
    ----------
    obj : any object
        Any python object.
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing `os.PathLike[str]`), or file-like
        object implementing a binary ``write()`` function.
    compression : str or dict, default: 'infer'
        On-the-fly compression of the output data. With `infer` and a
        path-like `filepath_or_buffer`, compression is detected from the
        extensions '.gz', '.bz2', '.zip', '.xz', or '.zst' (otherwise no
        compression). `None` means no compression. A dict may also be given:
        its `method` key is one of {`zip`, `gzip`, `bz2`, `zstd`} and the
        remaining key-value pairs are forwarded to ``zipfile.ZipFile``,
        ``gzip.GzipFile``, ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``,
        respectively. For example, faster compression producing a reproducible
        gzip archive:
        `compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}`.
    protocol : int, default: pickle.HIGHEST_PROTOCOL
        Protocol used by the pickler. The possible values depend on the
        version of Python: 0, 1, 2 for Python 2.x; 3 is also valid for
        Python>=3.0 and 4 for Python >= 3.4. A negative value is equivalent to
        HIGHEST_PROTOCOL.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc. For HTTP(S) URLs the key-value
        pairs are forwarded to ``urllib`` as header options. For other URLs
        (e.g. starting with "s3://", and "gcs://") they are forwarded to
        ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    # The dispatcher is given the query compiler rather than the DataFrame.
    payload = obj._query_compiler if isinstance(obj, DataFrame) else obj
    return FactoryDispatcher.to_pickle(
        payload,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )
示例#16
0
文件: io.py 项目: RehanSD/modin
def read_pickle(
    filepath_or_buffer: FilePathOrBuffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    """
    Read a pickled object and wrap the dispatcher result in a ``DataFrame``.

    All arguments are forwarded by name to ``FactoryDispatcher.read_pickle``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    query_compiler = FactoryDispatcher.read_pickle(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#17
0
文件: io.py 项目: RehanSD/modin
def read_feather(
    path,
    columns=None,
    use_threads: bool = True,
    storage_options: StorageOptions = None,
):
    """
    Read a feather file and wrap the dispatcher result in a ``DataFrame``.

    All arguments are forwarded by name to ``FactoryDispatcher.read_feather``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    query_compiler = FactoryDispatcher.read_feather(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#18
0
文件: io.py 项目: prutskov/modin
def read_pickle(
    filepath_or_buffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):  # noqa: PR01, RT01, D200
    """
    Load pickled Modin object (or any object) from file.

    All arguments are forwarded by name to ``FactoryDispatcher.read_pickle``
    and the result is wrapped in a ``DataFrame``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    query_compiler = FactoryDispatcher.read_pickle(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#19
0
文件: io.py 项目: yangl235/modin
def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):  # pragma: no cover
    """
    Read a SAS file and wrap the dispatcher result in a ``DataFrame``.

    All arguments are forwarded by name to ``EngineDispatcher.read_sas``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    query_compiler = EngineDispatcher.read_sas(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#20
0
文件: io.py 项目: yangl235/modin
def json_normalize(
    data: Union[Dict, List[Dict]],
    record_path: Optional[Union[str, List]] = None,
    meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
    meta_prefix: Optional[str] = None,
    record_prefix: Optional[str] = None,
    errors: Optional[str] = "raise",
    sep: str = ".",
    max_level: Optional[int] = None,
) -> DataFrame:
    """
    Run ``pandas.json_normalize`` and wrap the result in a ``DataFrame``.

    Reports a default-to-pandas event via ``ErrorMessage`` first; the arguments
    are passed to pandas positionally, in signature order.
    """
    ErrorMessage.default_to_pandas("json_normalize")
    Engine.subscribe(_update_engine)
    positional = (
        data,
        record_path,
        meta,
        meta_prefix,
        record_prefix,
        errors,
        sep,
        max_level,
    )
    return DataFrame(pandas.json_normalize(*positional))
示例#21
0
文件: io.py 项目: yangl235/modin
def read_sql_query(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    chunksize=None,
):
    """
    Run a SQL query and wrap the dispatcher result in a ``DataFrame``.

    All arguments are forwarded by name to ``EngineDispatcher.read_sql_query``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    query_compiler = EngineDispatcher.read_sql_query(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#22
0
文件: io.py 项目: yangw1234/modin
def to_pickle(
    obj: Any,
    filepath_or_buffer: Union[str, pathlib.Path],
    compression: Optional[str] = "infer",
    protocol: int = 4,
):
    """
    Pickle ``obj`` through ``EngineDispatcher.to_pickle``.

    A ``DataFrame`` is replaced by its ``_query_compiler`` before dispatching;
    any other object is passed through unchanged. The dispatcher's return
    value is returned as is.
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    # The dispatcher is given the query compiler rather than the DataFrame.
    payload = obj._query_compiler if isinstance(obj, DataFrame) else obj
    return EngineDispatcher.to_pickle(
        payload,
        filepath_or_buffer,
        compression=compression,
        protocol=protocol,
    )
示例#23
0
文件: io.py 项目: RehanSD/modin
def read_sql_query(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    chunksize=None,
    dtype=None,
):
    """
    Run a SQL query and wrap the dispatcher result in a ``DataFrame``.

    All arguments are forwarded by name to ``FactoryDispatcher.read_sql_query``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    query_compiler = FactoryDispatcher.read_sql_query(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#24
0
文件: io.py 项目: yangl235/modin
def read_parquet(
    path,
    engine: str = "auto",
    columns=None,
    use_nullable_dtypes: bool = False,
    **kwargs,
):
    """
    Read a parquet source and wrap the dispatcher result in a ``DataFrame``.

    The named parameters and everything in ``**kwargs`` are passed by keyword
    to ``EngineDispatcher.read_parquet``.
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    query_compiler = EngineDispatcher.read_parquet(
        path=path,
        columns=columns,
        engine=engine,
        use_nullable_dtypes=use_nullable_dtypes,
        **kwargs,
    )
    return DataFrame(query_compiler=query_compiler)
示例#25
0
文件: io.py 项目: prutskov/modin
def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):  # pragma: no cover  # noqa: PR01, RT01, D200
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    All arguments are forwarded by name to ``FactoryDispatcher.read_sas`` and
    the result is wrapped in a ``DataFrame``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    query_compiler = FactoryDispatcher.read_sas(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#26
0
文件: io.py 项目: prutskov/modin
def read_excel(
    io,
    sheet_name: "str | int | list[IntStrT] | None" = 0,
    header: "int | Sequence[int] | None" = 0,
    names=None,
    index_col: "int | Sequence[int] | None" = None,
    usecols=None,
    squeeze: "bool | None" = None,
    dtype: "DtypeArg | None" = None,
    engine: "Literal[('xlrd', 'openpyxl', 'odf', 'pyxlsb')] | None" = None,
    converters=None,
    true_values: "Iterable[Hashable] | None" = None,
    false_values: "Iterable[Hashable] | None" = None,
    skiprows: "Sequence[int] | int | Callable[[int], object] | None" = None,
    nrows: "int | None" = None,
    na_values=None,
    keep_default_na: "bool" = True,
    na_filter: "bool" = True,
    verbose: "bool" = False,
    parse_dates=False,
    date_parser=None,
    thousands: "str | None" = None,
    decimal: "str" = ".",
    comment: "str | None" = None,
    skipfooter: "int" = 0,
    convert_float: "bool | None" = None,
    mangle_dupe_cols: "bool" = True,
    storage_options: "StorageOptions" = None,
) -> "DataFrame | dict[IntStrT, DataFrame]":  # noqa: PR01, RT01, D200
    """
    Read an Excel file into a DataFrame.

    All arguments are forwarded by name to ``FactoryDispatcher.read_excel``.
    A mapping result is rebuilt as the same mapping type with each value
    wrapped in a ``DataFrame``; anything else is wrapped directly.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    intermediate = FactoryDispatcher.read_excel(**kwargs)
    if not isinstance(intermediate, (OrderedDict, dict)):
        return DataFrame(query_compiler=intermediate)

    # Preserve the concrete mapping type returned by the dispatcher.
    frames = type(intermediate)()
    for key, compiler in intermediate.items():
        frames[key] = DataFrame(query_compiler=compiler)
    return frames
示例#27
0
文件: io.py 项目: yangw1234/modin
def read_stata(
    filepath_or_buffer,
    convert_dates=True,
    convert_categoricals=True,
    index_col=None,
    convert_missing=False,
    preserve_dtypes=True,
    columns=None,
    order_categoricals=True,
    chunksize=None,
    iterator=False,
):
    """
    Read a Stata file and wrap the dispatcher result in a ``DataFrame``.

    All arguments are forwarded by name to ``EngineDispatcher.read_stata``.
    """
    # Grab the arguments before any other local name is bound in this frame.
    kwargs = inspect.getargvalues(inspect.currentframe()).locals

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    query_compiler = EngineDispatcher.read_stata(**kwargs)
    return DataFrame(query_compiler=query_compiler)
示例#28
0
文件: io.py 项目: prutskov/modin
def json_normalize(
    data: Union[Dict, List[Dict]],
    record_path: Optional[Union[str, List]] = None,
    meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
    meta_prefix: Optional[str] = None,
    record_prefix: Optional[str] = None,
    errors: Optional[str] = "raise",
    sep: str = ".",
    max_level: Optional[int] = None,
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Normalize semi-structured JSON data into a flat table.

    Reports a default-to-pandas event via ``ErrorMessage``, then passes the
    arguments positionally, in signature order, to ``pandas.json_normalize``
    and wraps the result in a ``DataFrame``.
    """
    ErrorMessage.default_to_pandas("json_normalize")
    Engine.subscribe(_update_engine)
    positional = (
        data,
        record_path,
        meta,
        meta_prefix,
        record_prefix,
        errors,
        sep,
        max_level,
    )
    return DataFrame(pandas.json_normalize(*positional))
示例#29
0
文件: io.py 项目: RehanSD/modin
def read_parquet(
    path,
    engine: str = "auto",
    columns=None,
    storage_options: StorageOptions = None,
    use_nullable_dtypes: bool = False,
    **kwargs,
):
    """
    Read a parquet source and wrap the dispatcher result in a ``DataFrame``.

    The named parameters and everything in ``**kwargs`` are passed by keyword
    to ``FactoryDispatcher.read_parquet``.
    """
    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    query_compiler = FactoryDispatcher.read_parquet(
        path=path,
        engine=engine,
        columns=columns,
        storage_options=storage_options,
        use_nullable_dtypes=use_nullable_dtypes,
        **kwargs,
    )
    return DataFrame(query_compiler=query_compiler)
示例#30
0
文件: io.py 项目: totemtang/modin
def to_pickle(
    obj: Any,
    filepath_or_buffer: FilePathOrBuffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):
    """
    Pickle ``obj`` through ``FactoryDispatcher.to_pickle``.

    A ``DataFrame`` is replaced by its ``_query_compiler`` before dispatching;
    any other object is passed through unchanged. The dispatcher's return
    value is returned as is.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    # The dispatcher is given the query compiler rather than the DataFrame.
    payload = obj._query_compiler if isinstance(obj, DataFrame) else obj
    return FactoryDispatcher.to_pickle(
        payload,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )