Example #1
def test_engine_switch():
    execution_engine.put("Test")
    assert EngineDispatcher.get_engine() == PandasOnTestFactory
    assert EngineDispatcher.get_engine().io_cls == "Foo"
    execution_engine.put("Python")  # revert engine to default

    partition_format.put("Test")
    assert EngineDispatcher.get_engine() == TestOnPythonFactory
    assert EngineDispatcher.get_engine().io_cls == "Bar"
    partition_format.put("Pandas")  # revert partition format to default
Example #2
def test_engine_switch():
    Engine.put("Test")
    assert EngineDispatcher.get_engine() == PandasOnTestFactory
    assert EngineDispatcher.get_engine().io_cls == "Foo"
    Engine.put("Python")  # revert engine to default

    Backend.put("Test")
    assert EngineDispatcher.get_engine() == TestOnPythonFactory
    assert EngineDispatcher.get_engine().io_cls == "Bar"
    Backend.put("Pandas")  # revert backend to default
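
A minimal usage sketch of the same switching mechanism from user code, assuming the public modin.config API (Engine.put, plus Backend.put on releases that still expose it) and that the Dask engine is installed:

# Sketch only, not one of the collected examples: pick the execution engine
# before any Modin I/O so EngineDispatcher resolves to the matching factory.
import modin.config as cfg
import modin.pandas as pd

cfg.Engine.put("Dask")           # EngineDispatcher.get_engine() now resolves to the Dask factory
df = pd.read_csv("data.csv")     # hypothetical file; the read is dispatched through EngineDispatcher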
Example #3
File: io.py Project: rolveb/modin
def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwargs", {}))

    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_clipboard(**kwargs))
Example #4
File: io.py Project: yangw1234/modin
def read_feather(path, columns=None, use_threads: bool = True):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_feather(**kwargs))
Example #5
File: io_exp.py Project: yyz940922/modin
def _read(**kwargs):
    """
    Read csv file from local disk.

    Parameters
    ----------
    filepath_or_buffer:
        The filepath of the csv file.
        We only support local files for now.
    kwargs: Keyword arguments in pandas.read_csv
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)

    try:
        pd_obj = EngineDispatcher.read_csv_glob(**kwargs)
    except AttributeError:
        raise AttributeError(
            "read_csv_glob() is only implemented for pandas on Ray.")

    # This happens when `read_csv` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(query_compiler=reader(
            *args, **kwargs))
        return pd_obj

    return DataFrame(query_compiler=pd_obj)
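
A hedged usage sketch for the experimental entry point above; it assumes read_csv_glob is exposed through modin.experimental.pandas and that the Ray engine is active, since the AttributeError branch fires for other engines:

# Sketch only: read a set of CSV files matching a glob pattern on the Ray engine.
import modin.config as cfg
import modin.experimental.pandas as pd   # experimental namespace that exposes read_csv_glob

cfg.Engine.put("Ray")                      # read_csv_glob is only implemented for pandas on Ray
df = pd.read_csv_glob("logs/part-*.csv")   # hypothetical glob pattern
print(df.shape)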
Example #6
def from_non_pandas(df, index, columns, dtype):
    """
    Implement [METHOD_NAME].

    TODO: Add more details for this docstring template.

    Parameters
    ----------
    What arguments does this function have.
    [
    PARAMETER_NAME : PARAMETERS TYPES
        Description.
    ]

    Returns
    -------
    What this returns (if anything)
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher

    new_qc = EngineDispatcher.from_non_pandas(df, index, columns, dtype)
    if new_qc is not None:
        from .dataframe import DataFrame

        return DataFrame(query_compiler=new_qc)
    return new_qc
Example #7
File: io.py Project: rolveb/modin
def read_pickle(filepath_or_buffer: FilePathOrBuffer,
                compression: Optional[str] = "infer"):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_pickle(**kwargs))
Example #8
File: io.py Project: rolveb/modin
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    """Read SQL query or database table into a DataFrame.

    Args:
        sql: string or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name.
        con: SQLAlchemy connectable (engine/connection) or database string URI or DBAPI2 connection (fallback mode)
        index_col: Column(s) to set as index (MultiIndex).
        coerce_float: Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to
                      floating point, useful for SQL result sets.
        params: List of parameters to pass to execute method. The syntax used
                to pass parameters is database driver dependent. Check your
                database driver documentation for which of the five syntax styles,
                described in PEP 249's paramstyle, is supported.
        parse_dates:
                     - List of column names to parse as dates.
                     - Dict of ``{column_name: format string}`` where format string is
                       strftime compatible in case of parsing string times, or is one of
                       (D, s, ns, ms, us) in case of parsing integer timestamps.
                     - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
                       to the keyword arguments of :func:`pandas.to_datetime`
                       Especially useful with databases without native Datetime support,
                       such as SQLite.
        columns: List of column names to select from SQL table (only used when reading a table).
        chunksize: If specified, return an iterator where `chunksize` is the number of rows to include in each chunk.

    Returns:
        Modin DataFrame
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    if kwargs.get("chunksize") is not None:
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        df_gen = pandas.read_sql(**kwargs)
        return (DataFrame(query_compiler=EngineDispatcher.from_pandas(df))
                for df in df_gen)
    return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs))
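
The docstring above spells out the read_sql parameters; a short hedged sketch of a call, using an in-memory SQLite connection (table and column names are illustrative):

# Sketch only: read a query result through Modin's read_sql, parsing a timestamp column.
import sqlite3
import modin.pandas as pd

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE events (id INTEGER, created_at TEXT)")
con.execute("INSERT INTO events VALUES (1, '2021-01-01 00:00:00')")
con.commit()

df = pd.read_sql("SELECT * FROM events", con, parse_dates=["created_at"])
# Passing chunksize instead would take the pandas fallback above and yield Modin DataFrames lazily.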
Example #9
File: io.py Project: rolveb/modin
def read_spss(
    path: Union[str, pathlib.Path],
    usecols: Union[Sequence[str], type(None)] = None,
    convert_categoricals: bool = True,
):
    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_spss(
        path, usecols, convert_categoricals))
Example #10
File: utils.py Project: rolveb/modin
def from_non_pandas(df, index, columns, dtype):
    from modin.data_management.factories.dispatcher import EngineDispatcher

    new_qc = EngineDispatcher.from_non_pandas(df, index, columns, dtype)
    if new_qc is not None:
        from .dataframe import DataFrame

        return DataFrame(query_compiler=new_qc)
    return new_qc
Example #11
File: io.py Project: yangl235/modin
def read_pickle(
    filepath_or_buffer: FilePathOrBuffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_pickle(**kwargs))
Example #12
File: io.py Project: yangl235/modin
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    if kwargs.get("chunksize") is not None:
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        df_gen = pandas.read_sql(**kwargs)
        return (DataFrame(query_compiler=EngineDispatcher.from_pandas(df))
                for df in df_gen)
    return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs))
Example #13
File: utils.py Project: rolveb/modin
def from_pandas(df):
    """Converts a pandas DataFrame to a Modin DataFrame.
    Args:
        df (pandas.DataFrame): The pandas DataFrame to convert.

    Returns:
        A new Modin DataFrame object.
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher
    from .dataframe import DataFrame

    return DataFrame(query_compiler=EngineDispatcher.from_pandas(df))
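
A hedged sketch of the round trip this helper enables, assuming from_pandas is importable from modin.pandas.utils as in recent Modin releases:

# Sketch only: convert an existing pandas DataFrame into a Modin DataFrame.
import pandas
from modin.pandas.utils import from_pandas

pandas_df = pandas.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
modin_df = from_pandas(pandas_df)   # dispatched through EngineDispatcher.from_pandas
print(type(modin_df))               # modin.pandas.dataframe.DataFrame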
Example #14
File: io.py Project: rolveb/modin
def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):  # pragma: no cover
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_sas(**kwargs))
Example #15
File: io.py Project: rolveb/modin
def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.

    Args:
        path: The filepath of the parquet file.
              We only support local files for now.
        engine: This argument doesn't do anything for now.
        columns: The columns to read from the file.
        kwargs: Passed through to the parquet engine's read_pandas function.
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_parquet(
        path=path, columns=columns, engine=engine, **kwargs))
Example #16
File: io_exp.py Project: rolveb/modin
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    partition_column=None,
    lower_bound=None,
    upper_bound=None,
    max_sessions=None,
):
    """Read SQL query or database table into a DataFrame.

    Args:
        sql: string or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name.
        con: SQLAlchemy connectable (engine/connection) or database string URI or DBAPI2 connection (fallback mode)
        index_col: Column(s) to set as index (MultiIndex).
        coerce_float: Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to
                      floating point, useful for SQL result sets.
        params: List of parameters to pass to execute method. The syntax used
                to pass parameters is database driver dependent. Check your
                database driver documentation for which of the five syntax styles,
                described in PEP 249's paramstyle, is supported.
        parse_dates:
                     - List of column names to parse as dates.
                     - Dict of ``{column_name: format string}`` where format string is
                       strftime compatible in case of parsing string times, or is one of
                       (D, s, ns, ms, us) in case of parsing integer timestamps.
                     - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
                       to the keyword arguments of :func:`pandas.to_datetime`
                       Especially useful with databases without native Datetime support,
                       such as SQLite.
        columns: List of column names to select from SQL table (only used when reading a table).
        chunksize: If specified, return an iterator where `chunksize` is the number of rows to include in each chunk.
        partition_column: column used to split the data between the workers (MUST be an INTEGER column)
        lower_bound: the minimum value to be requested from the partition_column
        upper_bound: the maximum value to be requested from the partition_column
        max_sessions: the maximum number of simultaneous connections to use

    Returns:
        Modin DataFrame
    """
    assert (os.environ.get(
        "MODIN_EXPERIMENTAL",
        "").title() == "True"), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs))
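
The extra parameters documented above drive the partitioned read. A hedged sketch of such a call, assuming modin.experimental.pandas re-exports this read_sql and that MODIN_EXPERIMENTAL is set (connection string, table, and column names are illustrative):

# Sketch only: split the query across workers on an integer column.
import os
os.environ["MODIN_EXPERIMENTAL"] = "True"   # required by the assert above

import modin.experimental.pandas as pd

df = pd.read_sql(
    "SELECT * FROM orders",                  # hypothetical table
    "postgresql://user:pass@host/db",        # hypothetical connection string
    partition_column="order_id",             # must be an integer column
    lower_bound=0,
    upper_bound=1_000_000,
    max_sessions=4,
)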
Example #17
File: io.py Project: rolveb/modin
def to_pickle(
    obj: Any,
    filepath_or_buffer: Union[str, pathlib.Path],
    compression: Optional[str] = "infer",
    protocol: int = 4,
):
    from modin.data_management.factories.dispatcher import EngineDispatcher

    if isinstance(obj, DataFrame):
        obj = obj._query_compiler
    return EngineDispatcher.to_pickle(obj,
                                      filepath_or_buffer,
                                      compression=compression,
                                      protocol=protocol)
Example #18
File: io.py Project: rolveb/modin
def read_sql_query(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    chunksize=None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_sql_query(**kwargs))
Example #19
File: io.py Project: yangl235/modin
def read_sql_table(
    table_name,
    con,
    schema=None,
    index_col=None,
    coerce_float=True,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_sql_table(**kwargs))
Example #20
File: io.py Project: yangl235/modin
def to_pickle(
    obj: Any,
    filepath_or_buffer: Union[str, pathlib.Path],
    compression: Optional[str] = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    if isinstance(obj, DataFrame):
        obj = obj._query_compiler
    return EngineDispatcher.to_pickle(obj,
                                      filepath_or_buffer,
                                      compression=compression,
                                      protocol=protocol)
Example #21
File: io.py Project: yangl235/modin
def read_parquet(
    path,
    engine: str = "auto",
    columns=None,
    use_nullable_dtypes: bool = False,
    **kwargs,
):
    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_parquet(
        path=path,
        columns=columns,
        engine=engine,
        use_nullable_dtypes=use_nullable_dtypes,
        **kwargs,
    ))
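
A short hedged usage sketch for the parquet reader above (file and column names are illustrative):

# Sketch only: read selected columns of a parquet file through Modin.
import modin.pandas as pd

df = pd.read_parquet("events.parquet", columns=["user_id", "ts"])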
Example #22
File: io.py Project: rolveb/modin
def read_stata(
    filepath_or_buffer,
    convert_dates=True,
    convert_categoricals=True,
    index_col=None,
    convert_missing=False,
    preserve_dtypes=True,
    columns=None,
    order_categoricals=True,
    chunksize=None,
    iterator=False,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_stata(**kwargs))
Example #23
File: utils.py Project: rolveb/modin
def from_arrow(at):
    """Converts an Arrow Table to a Modin DataFrame.

    Parameters
    ----------
        at : Arrow Table
            The Arrow Table to convert from.

    Returns
    -------
    DataFrame
        A new Modin DataFrame object.
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher
    from .dataframe import DataFrame

    return DataFrame(query_compiler=EngineDispatcher.from_arrow(at))
Example #24
File: io.py Project: rolveb/modin
def _read(**kwargs):
    """Read csv file from local disk.
    Args:
        filepath_or_buffer:
              The filepath of the csv file.
              We only support local files for now.
        kwargs: Keyword arguments in pandas.read_csv
    """
    from modin.data_management.factories.dispatcher import EngineDispatcher

    pd_obj = EngineDispatcher.read_csv(**kwargs)
    # This happens when `read_csv` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(query_compiler=reader(
            *args, **kwargs))
        return pd_obj
    return DataFrame(query_compiler=pd_obj)
Example #25
File: io.py Project: rolveb/modin
def read_hdf(
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start: Optional[int] = None,
    stop: Optional[int] = None,
    columns=None,
    iterator=False,
    chunksize: Optional[int] = None,
    **kwargs,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwargs", {}))

    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_hdf(**kwargs))
Example #26
File: io.py Project: rolveb/modin
def read_fwf(
    filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):
    from modin.data_management.factories.dispatcher import EngineDispatcher

    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwds", {}))
    pd_obj = EngineDispatcher.read_fwf(**kwargs)
    # When `read_fwf` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(query_compiler=reader(
            *args, **kwargs))
        return pd_obj
    return DataFrame(query_compiler=pd_obj)
Example #27
File: io.py Project: yangl235/modin
def read_excel(
    io,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=True,
    mangle_dupe_cols=True,
    storage_options: StorageOptions = None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    intermediate = EngineDispatcher.read_excel(**kwargs)
    if isinstance(intermediate, (OrderedDict, dict)):
        parsed = type(intermediate)()
        for key in intermediate.keys():
            parsed[key] = DataFrame(query_compiler=intermediate.get(key))
        return parsed
    else:
        return DataFrame(query_compiler=intermediate)
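
The dict branch above covers multi-sheet reads. A hedged usage sketch, assuming a workbook with several sheets (file name is illustrative):

# Sketch only: sheet_name=None requests every sheet, so read_excel returns a dict
# mapping sheet names to Modin DataFrames (the OrderedDict/dict branch above).
import modin.pandas as pd

sheets = pd.read_excel("report.xlsx", sheet_name=None)
for name, frame in sheets.items():
    print(name, frame.shape)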
Example #28
File: io.py Project: rolveb/modin
def read_json(
    path_or_buf=None,
    orient=None,
    typ="frame",
    dtype=None,
    convert_axes=None,
    convert_dates=True,
    keep_default_dates=True,
    numpy=False,
    precise_float=False,
    date_unit=None,
    encoding=None,
    lines=False,
    chunksize=None,
    compression="infer",
    nrows: Optional[int] = None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_json(**kwargs))
Example #29
File: io.py Project: rolveb/modin
def read_html(
    io,
    match=".+",
    flavor=None,
    header=None,
    index_col=None,
    skiprows=None,
    attrs=None,
    parse_dates=False,
    thousands=",",
    encoding=None,
    decimal=".",
    converters=None,
    na_values=None,
    keep_default_na=True,
    displayed_only=True,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    return DataFrame(query_compiler=EngineDispatcher.read_html(**kwargs))
Example #30
File: io.py Project: yangl235/modin
def read_gbq(
    query: str,
    project_id: Optional[str] = None,
    index_col: Optional[str] = None,
    col_order: Optional[List[str]] = None,
    reauth: bool = False,
    auth_local_webserver: bool = False,
    dialect: Optional[str] = None,
    location: Optional[str] = None,
    configuration: Optional[Dict[str, Any]] = None,
    credentials=None,
    use_bqstorage_api: Optional[bool] = None,
    progress_bar_type: Optional[str] = None,
    max_results: Optional[int] = None,
) -> DataFrame:
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwargs", {}))

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=EngineDispatcher.read_gbq(**kwargs))