Example #1
def read_avro(
    file: Union[str, Path, BytesIO, BinaryIO],
    columns: Optional[Union[List[int], List[str]]] = None,
    n_rows: Optional[int] = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Read into a DataFrame from Apache Avro format.

    Parameters
    ----------
    file
        Path to a file or a file-like object.
    columns
        Columns to select. Accepts a list of column indices (starting at zero) or a list of column names.
    n_rows
        Stop reading from Apache Avro file after reading ``n_rows``.

    Returns
    -------
    DataFrame
    """
    if isinstance(file, (str, Path)):
        file = format_path(file)
    if columns is None:
        columns = kwargs.pop("projection", None)

    return DataFrame._read_avro(file, n_rows=n_rows, columns=columns)
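
A minimal usage sketch for the wrapper above; the file name "data.avro" and the column names are hypothetical.

import polars as pl

# Read two columns by name and stop after 1_000 rows (maps to `columns` and `n_rows` above).
df = pl.read_avro("data.avro", columns=["a", "b"], n_rows=1_000)
print(df.shape)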
Example #2
def _prepare_file_arg(
    file: Union[str, List[str], TextIO, Path, BinaryIO, bytes], **kwargs: Any
) -> ContextManager[Union[str, BinaryIO, List[str], List[BinaryIO]]]:
    """
    Utility for read_[csv, parquet] (not to be used by scan_[csv, parquet]).
    Returned value is always usable as a context.

    A `StringIO` or `BytesIO` file is returned as a `BytesIO`.
    A local path is returned as a string.
    An http URL is read into a buffer and returned as a `BytesIO`.

    When fsspec is installed, remote files are opened with
    `fsspec.open(file, **kwargs)` or `fsspec.open_files(file, **kwargs)`.
    """

    # Small helper to use a variable as context
    @contextmanager
    def managed_file(file: Any) -> Iterator[Any]:
        try:
            yield file
        finally:
            pass

    if isinstance(file, StringIO):
        return BytesIO(file.read().encode("utf8"))
    if isinstance(file, BytesIO):
        return managed_file(file)
    if isinstance(file, Path):
        return managed_file(format_path(file))
    if isinstance(file, str):
        if _WITH_FSSPEC:
            if infer_storage_options(file)["protocol"] == "file":
                return managed_file(format_path(file))
            return fsspec.open(file, **kwargs)
        if file.startswith("http"):
            return _process_http_file(file)
    if isinstance(file, list) and bool(file) and all(
            isinstance(f, str) for f in file):
        if _WITH_FSSPEC:
            if all(
                    infer_storage_options(f)["protocol"] == "file"
                    for f in file):
                return managed_file([format_path(f) for f in file])
            return fsspec.open_files(file, **kwargs)
    if isinstance(file, str):
        file = format_path(file)
    return managed_file(file)
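
A hedged sketch of possible call sites for this private helper, assuming it is in scope in the same module; everything it returns is used as a context, as its docstring requires.

from io import StringIO

# A StringIO input comes back as a BytesIO, which is itself usable as a context.
with _prepare_file_arg(StringIO("a,b\n1,2\n")) as data:
    print(type(data))  # <class '_io.BytesIO'>

# A local path (str or Path) comes back as a plain string; the file is never opened here.
with _prepare_file_arg("data.csv") as data:
    print(data)  # "data.csv"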
Example #3
def scan_parquet(
    file: str | Path,
    n_rows: int | None = None,
    cache: bool = True,
    parallel: str = "auto",
    rechunk: bool = True,
    row_count_name: str | None = None,
    row_count_offset: int = 0,
    storage_options: dict | None = None,
    low_memory: bool = False,
    **kwargs: Any,
) -> LazyFrame:
    """
    Lazily read from a parquet file or multiple files via glob patterns.

    This allows the query optimizer to push down predicates and projections to the scan level,
    thereby potentially reducing memory overhead.

    Parameters
    ----------
    file
        Path to a file.
    n_rows
        Stop reading from parquet file after reading ``n_rows``.
    cache
        Cache the result after reading.
    parallel : {'auto', 'columns', 'row_groups', 'none'}
        This determines the direction of parallelism. 'auto' will try to determine the optimal direction.
    rechunk
        In case of reading multiple files via a glob pattern rechunk the final DataFrame into contiguous memory chunks.
    row_count_name
        If not None, this will insert a row count column with the given name into the DataFrame.
    row_count_offset
        Offset to start the row_count column (only used if the name is set).
    storage_options
        Extra options that make sense for ``fsspec.open()`` or a
        particular storage connection, e.g. host, port, username, password, etc.
    low_memory : bool
        Reduce memory pressure at the expense of performance.
    """

    # Map legacy arguments to current ones and remove them from kwargs.
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if isinstance(file, (str, Path)):
        file = format_path(file)

    return LazyFrame.scan_parquet(
        file=file,
        n_rows=n_rows,
        cache=cache,
        parallel=parallel,
        rechunk=rechunk,
        row_count_name=row_count_name,
        row_count_offset=row_count_offset,
        storage_options=storage_options,
        low_memory=low_memory,
    )
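
A usage sketch of the lazy API this function feeds; the glob pattern and column names are hypothetical, and the pushdown comments restate what the docstring describes.

import polars as pl

lf = pl.scan_parquet("data/part-*.parquet", low_memory=False)
df = (
    lf.select(["id", "value"])      # projection is pushed down to the scan
    .filter(pl.col("value") > 0)    # predicate is pushed down to the scan
    .collect()
)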
Example #4
def scan_parquet(
    file: Union[str, Path],
    n_rows: Optional[int] = None,
    cache: bool = True,
    parallel: bool = True,
    rechunk: bool = True,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    **kwargs: Any,
) -> LazyFrame:
    """
    Lazily read from a parquet file or multiple files via glob patterns.

    This allows the query optimizer to push down predicates and projections to the scan level,
    thereby potentially reducing memory overhead.

    Parameters
    ----------
    file
        Path to a file.
    n_rows
        Stop reading from parquet file after reading ``n_rows``.
    cache
        Cache the result after reading.
    parallel
        Read the parquet file in parallel. The single threaded reader consumes less memory.
    rechunk
        In case of reading multiple files via a glob pattern rechunk the final DataFrame into contiguous memory chunks.
    row_count_name
        If not None, this will insert a row count column with the given name into the DataFrame.
    row_count_offset
        Offset to start the row_count column (only used if the name is set).
    """

    # Map legacy arguments to current ones and remove them from kwargs.
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if isinstance(file, (str, Path)):
        file = format_path(file)

    return LazyFrame.scan_parquet(
        file=file,
        n_rows=n_rows,
        cache=cache,
        parallel=parallel,
        rechunk=rechunk,
        row_count_name=row_count_name,
        row_count_offset=row_count_offset,
    )
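
For this older signature, ``parallel`` is a plain boolean; a short sketch with a hypothetical file name.

import polars as pl

# Single-threaded read: per the docstring, this consumes less memory.
df = pl.scan_parquet("events.parquet", parallel=False, n_rows=10_000).collect()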
Example #5
def scan_ipc(
    file: Union[str, Path],
    n_rows: Optional[int] = None,
    cache: bool = True,
    rechunk: bool = True,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    **kwargs: Any,
) -> LazyFrame:
    """
    Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.

    This allows the query optimizer to push down predicates and projections to the scan level,
    thereby potentially reducing memory overhead.

    Parameters
    ----------
    file
        Path to an IPC file.
    n_rows
        Stop reading from IPC file after reading ``n_rows``.
    cache
        Cache the result after reading.
    rechunk
        Reallocate to contiguous memory when all chunks/files are parsed.
    row_count_name
        If not None, this will insert a row count column with the given name into the DataFrame.
    row_count_offset
        Offset to start the row_count column (only used if the name is set).
    """

    # Map legacy arguments to current ones and remove them from kwargs.
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if isinstance(file, (str, Path)):
        file = format_path(file)

    return LazyFrame.scan_ipc(
        file=file,
        n_rows=n_rows,
        cache=cache,
        rechunk=rechunk,
        row_count_name=row_count_name,
        row_count_offset=row_count_offset,
    )
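
A usage sketch with the row-count options; the file name and column name are hypothetical.

import polars as pl

lf = pl.scan_ipc("table.arrow", row_count_name="row_nr", row_count_offset=1)
df = lf.filter(pl.col("row_nr") <= 100).collect()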
Example #6
def read_ipc_schema(
        file: Union[str, BinaryIO, Path, bytes]) -> Dict[str, Type[DataType]]:
    """
    Get a schema of the IPC file without reading data.

    Parameters
    ----------
    file
        Path to a file or a file-like object.

    Returns
    -------
    Dictionary mapping column names to datatypes
    """
    if isinstance(file, (str, Path)):
        file = format_path(file)

    return _ipc_schema(file)
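
A usage sketch; the file name is hypothetical.

import polars as pl

schema = pl.read_ipc_schema("table.arrow")
for name, dtype in schema.items():
    print(name, dtype)  # dtype is a polars DataType class, e.g. pl.Int64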
Example #7
def read_parquet_schema(
    file: str | BinaryIO | Path | bytes,
) -> dict[str, type[DataType]]:
    """
    Get a schema of the Parquet file without reading data.

    Parameters
    ----------
    file
        Path to a file or a file-like object.

    Returns
    -------
    Dictionary mapping column names to datatypes
    """
    if isinstance(file, (str, Path)):
        file = format_path(file)

    return _parquet_schema(file)
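
A usage sketch showing the schema driving a projection; the file name and column-name prefix are hypothetical.

import polars as pl

schema = pl.read_parquet_schema("data.parquet")
wanted = [name for name in schema if name.startswith("feature_")]
# Read only the columns selected from the schema.
df = pl.read_parquet("data.parquet", columns=wanted)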
Example #8
def scan_csv(
    file: Union[str, Path],
    has_header: bool = True,
    sep: str = ",",
    comment_char: Optional[str] = None,
    quote_char: Optional[str] = r'"',
    skip_rows: int = 0,
    dtypes: Optional[Dict[str, Type[DataType]]] = None,
    null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
    ignore_errors: bool = False,
    cache: bool = True,
    with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
    infer_schema_length: Optional[int] = 100,
    n_rows: Optional[int] = None,
    encoding: str = "utf8",
    low_memory: bool = False,
    rechunk: bool = True,
    skip_rows_after_header: int = 0,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    parse_dates: bool = False,
    **kwargs: Any,
) -> LazyFrame:
    """
    Lazily read from a CSV file or multiple files via glob patterns.

    This allows the query optimizer to push down predicates and
    projections to the scan level, thereby potentially reducing
    memory overhead.

    Parameters
    ----------
    file
        Path to a file.
    has_header
        Indicate if the first row of dataset is a header or not.
        If set to False, column names will be autogenerated in the
        following format: ``column_x``, with ``x`` being an
        enumeration over every column in the dataset starting at 1.
    sep
        Single byte character to use as delimiter in the file.
    comment_char
        Single byte character that indicates the start of a comment line,
        for instance ``#``.
    quote_char
        Single byte character used for csv quoting, default = ``"``.
        Set to None to turn off special handling and escaping of quotes.
    skip_rows
        Start reading after ``skip_rows`` lines. The header will be parsed at this offset.
    dtypes
        Overwrite dtypes during inference.
    null_values
        Values to interpret as null values. You can provide a:
          - ``str``: All values equal to this string will be null.
          - ``List[str]``: A null value per column.
          - ``Dict[str, str]``: A dictionary that maps column name to a
                                null value string.
    ignore_errors
        Try to keep reading lines if some lines yield errors.
        First try ``infer_schema_length=0`` to read all columns as
        ``pl.Utf8`` to check which values might cause an issue.
    cache
        Cache the result after reading.
    with_column_names
        Apply a function over the column names.
        This can be used to update a schema just in time, thus before
        scanning.
    infer_schema_length
        Maximum number of lines to read to infer schema.
        If set to 0, all columns will be read as ``pl.Utf8``.
        If set to ``None``, a full table scan will be done (slow).
    n_rows
        Stop reading from CSV file after reading ``n_rows``.
    encoding
        Allowed encodings: ``utf8`` or ``utf8-lossy``.
        Lossy means that invalid utf8 values are replaced with ``�``
        characters.
    low_memory
        Reduce memory usage at the expense of performance.
    rechunk
        Reallocate to contiguous memory when all chunks/files are parsed.
    skip_rows_after_header
        Skip this number of rows after the header is parsed.
    row_count_name
        If not None, this will insert a row count column with the given name into the DataFrame.
    row_count_offset
        Offset to start the row_count column (only used if the name is set).
    parse_dates
        Try to automatically parse dates. If this does not succeed,
        the column remains of data type ``pl.Utf8``.

    Examples
    --------
    >>> (
    ...     pl.scan_csv("my_long_file.csv")  # lazy, doesn't do a thing
    ...     .select(
    ...         ["a", "c"]
    ...     )  # select only 2 columns (other columns will not be read)
    ...     .filter(
    ...         pl.col("a") > 10
    ...     )  # the filter is pushed down to the scan, so less data is read into memory
    ...     .fetch(100)  # pushed a limit of 100 rows to the scan level
    ... )  # doctest: +SKIP

    We can use `with_column_names` to modify the header before scanning:

    >>> df = pl.DataFrame(
    ...     {"BrEeZaH": [1, 2, 3, 4], "LaNgUaGe": ["is", "terrible", "to", "read"]}
    ... )
    >>> df.to_csv("mydf.csv")
    >>> pl.scan_csv(
    ...     "mydf.csv", with_column_names=lambda cols: [col.lower() for col in cols]
    ... ).fetch()
    shape: (4, 2)
    ┌─────────┬──────────┐
    │ breezah ┆ language │
    │ ---     ┆ ---      │
    │ i64     ┆ str      │
    ╞═════════╪══════════╡
    │ 1       ┆ is       │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
    │ 2       ┆ terrible │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
    │ 3       ┆ to       │
    ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
    │ 4       ┆ read     │
    └─────────┴──────────┘


    """

    # Map legacy arguments to current ones and remove them from kwargs.
    has_header = kwargs.pop("has_headers", has_header)
    dtypes = kwargs.pop("dtype", dtypes)
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    _check_arg_is_1byte("sep", sep, False)
    _check_arg_is_1byte("comment_char", comment_char, False)
    _check_arg_is_1byte("quote_char", quote_char, True)

    if isinstance(file, (str, Path)):
        file = format_path(file)

    return LazyFrame.scan_csv(
        file=file,
        has_header=has_header,
        sep=sep,
        comment_char=comment_char,
        quote_char=quote_char,
        skip_rows=skip_rows,
        dtypes=dtypes,
        null_values=null_values,
        ignore_errors=ignore_errors,
        cache=cache,
        with_column_names=with_column_names,
        infer_schema_length=infer_schema_length,
        n_rows=n_rows,
        low_memory=low_memory,
        rechunk=rechunk,
        skip_rows_after_header=skip_rows_after_header,
        encoding=encoding,
        row_count_name=row_count_name,
        row_count_offset=row_count_offset,
        parse_dates=parse_dates,
    )
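
A further usage sketch exercising ``dtypes``, ``null_values`` and ``parse_dates``; the file name and column names are hypothetical.

import polars as pl

lf = pl.scan_csv(
    "measurements.csv",
    dtypes={"sensor_id": pl.Utf8, "reading": pl.Float64},
    null_values={"reading": "NA"},  # per-column null marker (Dict[str, str] form)
    parse_dates=True,
)
df = lf.filter(pl.col("reading").is_not_null()).collect()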
Example #9
def read_excel(
    file: Union[str, BytesIO, Path, BinaryIO, bytes],
    sheet_id: Optional[int] = 1,
    sheet_name: Optional[str] = None,
    xlsx2csv_options: Optional[dict] = None,
    read_csv_options: Optional[dict] = None,
) -> DataFrame:
    """
    Read an Excel (XLSX) sheet into a DataFrame by converting the sheet
    with ``xlsx2csv.Xlsx2csv().convert()`` to CSV and parsing the CSV
    output with ``pl.read_csv()``.

    Parameters
    ----------
    file
        Path to a file or a file-like object.
        By file-like object, we refer to objects with a ``read()``
        method, such as a file handler (e.g. via builtin ``open``
        function) or ``BytesIO``.
    sheet_id
        Sheet number to convert (0 for all sheets).
    sheet_name
        Sheet name to convert.
    xlsx2csv_options
        Extra options passed to ``xlsx2csv.Xlsx2csv()``.
        e.g.: ``{"skip_empty_lines": True}``
    read_csv_options
        Extra options passed to ``read_csv()`` for parsing
        the CSV file returned by ``xlsx2csv.Xlsx2csv().convert()``,
        e.g.: ``{"has_header": False, "new_columns": ["a", "b", "c"], "infer_schema_length": None}``

    Returns
    -------
    DataFrame

    Examples
    --------

    Read "My Datasheet" sheet from Excel sheet file to a DataFrame.

    >>> excel_file = "test.xlsx"
    >>> sheet_name = "My Datasheet"
    >>> pl.read_excel(
    ...     file=excel_file,
    ...     sheet_name=sheet_name,
    ... )  # doctest: +SKIP

    Read sheet 3 from the Excel file into a DataFrame while skipping
    empty lines in the sheet. As sheet 3 does not have a header row,
    pass the needed settings to ``read_csv()``.

    >>> excel_file = "test.xlsx"
    >>> pl.read_excel(
    ...     file=excel_file,
    ...     sheet_id=3,
    ...     xlsx2csv_options={"skip_empty_lines": True},
    ...     read_csv_options={"has_header": False, "new_columns": ["a", "b", "c"]},
    ... )  # doctest: +SKIP

    If the correct datatypes can't be determined by polars, look
    at the ``read_csv()`` documentation to see which options you can pass
    to fix this issue. For example ``"infer_schema_length": None``
    can be used to read the data twice, once to infer the
    correct output types and once to actually convert the input to
    the correct types. With ``"infer_schema_length": 1000``, only
    the first 1000 lines are read twice.

    >>> excel_file = "test.xlsx"
    >>> pl.read_excel(
    ...     file=excel_file,
    ...     read_csv_options={"infer_schema_length": None},
    ... )  # doctest: +SKIP

    Alternative
    -----------

    If ``read_excel()`` does not work or you need to read other types
    of spreadsheet files, you can try pandas ``pd.read_excel()``
    (supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt`).

    >>> excel_file = "test.xlsx"
    >>> pl.from_pandas(pd.read_excel(excel_file))  # doctest: +SKIP
    """

    try:
        import xlsx2csv  # type: ignore
    except ImportError:
        raise ImportError(
            "xlsx2csv is not installed. Please run `pip install xlsx2csv`.")

    if isinstance(file, (str, Path)):
        file = format_path(file)

    if not xlsx2csv_options:
        xlsx2csv_options = {}

    if not read_csv_options:
        read_csv_options = {}

    # Override xlsx2csv eprint function so in case an error occurs
    # it raises an exception instead of writing to stderr.
    def _eprint(*args: Any, **kwargs: Any) -> None:
        raise xlsx2csv.XlsxException(format(*args))

    xlsx2csv.eprint = _eprint

    # Create Xlsx2csv instance.
    xlsx2csv_instance = xlsx2csv.Xlsx2csv(file, **xlsx2csv_options)

    if sheet_name:
        sheet_id = xlsx2csv_instance.getSheetIdByName(sheet_name)

        if not sheet_id:
            raise xlsx2csv.XlsxException(f"Sheet '{sheet_name}' not found.")

    csv_buffer = StringIO()

    # Convert sheet from XSLX document to CSV.
    xlsx2csv_instance.convert(outfile=csv_buffer, sheetid=sheet_id)

    # Rewind buffer to start.
    csv_buffer.seek(0)

    # Parse CSV output.
    return read_csv(csv_buffer, **read_csv_options)
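
A usage sketch reading from an in-memory buffer, which the ``file`` parameter also accepts; the file name is taken from the docstring examples.

from io import BytesIO

import polars as pl

with open("test.xlsx", "rb") as f:
    buf = BytesIO(f.read())

df = pl.read_excel(buf, sheet_id=1)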