def read_csv(
    file: Union[str, TextIO, BytesIO, Path, BinaryIO, bytes],
    has_header: bool = True,
    columns: Optional[Union[List[int], List[str]]] = None,
    new_columns: Optional[List[str]] = None,
    sep: str = ",",
    comment_char: Optional[str] = None,
    quote_char: Optional[str] = r'"',
    skip_rows: int = 0,
    dtypes: Optional[Union[Mapping[str, Type[DataType]], List[Type[DataType]]]] = None,
    null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
    ignore_errors: bool = False,
    parse_dates: bool = False,
    n_threads: Optional[int] = None,
    infer_schema_length: Optional[int] = 100,
    batch_size: int = 8192,
    n_rows: Optional[int] = None,
    encoding: str = "utf8",
    low_memory: bool = False,
    rechunk: bool = True,
    use_pyarrow: bool = False,
    storage_options: Optional[Dict] = None,
    skip_rows_after_header: int = 0,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    sample_size: int = 1024,
    **kwargs: Any,
) -> DataFrame:
    """
    Read a CSV file into a DataFrame.

    Parameters
    ----------
    file
        Path to a file or a file-like object. By file-like object, we refer to
        objects with a ``read()`` method, such as a file handler (e.g. via the
        builtin ``open`` function) or ``StringIO`` or ``BytesIO``.
        If ``fsspec`` is installed, it will be used to open remote files.
    has_header
        Indicate if the first row of the dataset is a header or not. If set to
        False, column names will be autogenerated in the following format:
        ``column_x``, with ``x`` being an enumeration over every column in the
        dataset starting at 1.
    columns
        Columns to select. Accepts a list of column indices (starting at zero) or
        a list of column names.
    new_columns
        Rename columns right after parsing the CSV file. If the given list is
        shorter than the width of the DataFrame the remaining columns will have
        their original name.
    sep
        Single byte character to use as delimiter in the file.
    comment_char
        Single byte character that indicates the start of a comment line, for
        instance ``#``.
    quote_char
        Single byte character used for csv quoting, default = ``"``. Set to None
        to turn off special handling and escaping of quotes.
    skip_rows
        Start reading after ``skip_rows`` lines.
    dtypes
        Overwrite dtypes during inference.
    null_values
        Values to interpret as null values. You can provide a:

        - ``str``: All values equal to this string will be null.
        - ``List[str]``: A null value per column.
        - ``Dict[str, str]``: A dictionary that maps column name to a null value
          string.
    ignore_errors
        Try to keep reading lines if some lines yield errors. First try
        ``infer_schema_length=0`` to read all columns as ``pl.Utf8`` to check
        which values might cause an issue.
    parse_dates
        Try to automatically parse dates. If this does not succeed, the column
        remains of data type ``pl.Utf8``.
    n_threads
        Number of threads to use in csv parsing. Defaults to the number of
        physical CPUs of your system.
    infer_schema_length
        Maximum number of lines to read to infer schema. If set to 0, all columns
        will be read as ``pl.Utf8``. If set to ``None``, a full table scan will be
        done (slow).
    batch_size
        Number of lines to read into the buffer at once. Modify this to change
        performance.
    n_rows
        Stop reading from CSV file after reading ``n_rows``. During
        multi-threaded parsing, an upper bound of ``n_rows`` rows cannot be
        guaranteed.
    encoding
        Allowed encodings: ``utf8`` or ``utf8-lossy``. Lossy means that invalid
        utf8 values are replaced with ``�`` characters.
    low_memory
        Reduce memory usage at the expense of performance.
    rechunk
        Make sure that all columns are contiguous in memory by aggregating the
        chunks into a single array.
    use_pyarrow
        Try to use pyarrow's native CSV parser. This is not always possible. The
        set of arguments given to this function determines if it is possible to
        use pyarrow's native parser. Note that pyarrow and polars may have a
        different strategy regarding type inference.
    storage_options
        Extra options that make sense for ``fsspec.open()`` or a particular
        storage connection, e.g. host, port, username, password, etc.
    skip_rows_after_header
        Skip this number of rows when the header is parsed.
    row_count_name
        If not None, this will insert a row count column with the given name into
        the DataFrame.
    row_count_offset
        Offset to start the row_count column (only used if the name is set).
    sample_size
        Set the sample size. This is used to sample statistics to estimate the
        allocation needed.

    Returns
    -------
    DataFrame

    """
    # Map legacy arguments to current ones and remove them from kwargs.
    has_header = kwargs.pop("has_headers", has_header)
    dtypes = kwargs.pop("dtype", dtypes)
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if columns is None:
        columns = kwargs.pop("projection", None)

    _check_arg_is_1byte("sep", sep, False)
    _check_arg_is_1byte("comment_char", comment_char, False)
    _check_arg_is_1byte("quote_char", quote_char, True)

    projection, columns = handle_projection_columns(columns)

    if isinstance(file, bytes) and len(file) == 0:
        raise ValueError("Empty bytes data provided.")

    storage_options = storage_options or {}

    if columns and not has_header:
        for column in columns:
            if isinstance(column, str) and not column.startswith("column_"):
                raise ValueError(
                    'Specified column names do not start with "column_", '
                    "but autogenerated header names were requested."
                )

    if use_pyarrow and not _PYARROW_AVAILABLE:
        raise ImportError(
            "'pyarrow' is required when using 'read_csv(..., use_pyarrow=True)'."
        )

    if (
        use_pyarrow
        and dtypes is None
        and n_rows is None
        and n_threads is None
        and encoding == "utf8"
        and not low_memory
        and null_values is None
        and parse_dates
    ):
        include_columns = None

        if columns:
            if not has_header:
                # Convert 'column_1', 'column_2', ... column names to 'f0', 'f1', ...
                # column names for pyarrow, if the CSV file does not contain a header.
                include_columns = [f"f{int(column[7:]) - 1}" for column in columns]
            else:
                include_columns = columns

        if not columns and projection:
            # Convert column indices from projection to 'f0', 'f1', ... column names
            # for pyarrow.
            include_columns = [f"f{column_idx}" for column_idx in projection]

        with _prepare_file_arg(file, **storage_options) as data:
            tbl = pa.csv.read_csv(
                data,
                pa.csv.ReadOptions(
                    skip_rows=skip_rows, autogenerate_column_names=not has_header
                ),
                pa.csv.ParseOptions(delimiter=sep),
                pa.csv.ConvertOptions(
                    column_types=None,
                    include_columns=include_columns,
                    include_missing_columns=ignore_errors,
                ),
            )

        if not has_header:
            # Rename the 'f0', 'f1', ... column names autogenerated by pyarrow to
            # 'column_1', 'column_2', ...
            tbl = tbl.rename_columns(
                [f"column_{int(column[1:]) + 1}" for column in tbl.column_names]
            )

        df = cast(DataFrame, from_arrow(tbl, rechunk))
        if new_columns:
            return update_columns(df, new_columns)
        return df

    if new_columns and dtypes and isinstance(dtypes, dict):
        current_columns = None

        # As the new column names are not available yet while parsing the CSV file,
        # rename the column names in dtypes to the old names (if possible) so they
        # can be used during CSV parsing.
        if columns:
            if len(columns) < len(new_columns):
                raise ValueError(
                    "More new column names are specified than there are selected columns."
                )
            # Get column names of requested columns.
            current_columns = columns[0:len(new_columns)]
        elif not has_header:
            # When there is no header, column names are autogenerated (and known).
            if projection:
                if columns and len(columns) < len(new_columns):
                    raise ValueError(
                        "More new column names are specified than there are selected columns."
                    )
                # Convert column indices from projection to 'column_1', 'column_2', ...
                # column names.
                current_columns = [
                    f"column_{column_idx + 1}" for column_idx in projection
                ]
            else:
                # Generate the autogenerated 'column_1', 'column_2', ... column names
                # for the new column names.
                current_columns = [
                    f"column_{column_idx}"
                    for column_idx in range(1, len(new_columns) + 1)
                ]
        else:
            # When a header is present, column names are not known yet.
            if len(dtypes) <= len(new_columns):
                # If the dtypes dictionary contains fewer or the same number of entries
                # as there are new column names, a list of dtypes can be created if all
                # column names listed in the dtypes dictionary appear in the first
                # consecutive new column names.
                dtype_list = [
                    dtypes[new_column_name]
                    for new_column_name in new_columns[0:len(dtypes)]
                    if new_column_name in dtypes
                ]

                if len(dtype_list) == len(dtypes):
                    dtypes = dtype_list

        if current_columns and isinstance(dtypes, dict):
            new_to_current = {
                new_column: current_column
                for new_column, current_column in zip(new_columns, current_columns)
            }
            # Change new column names to current column names in dtypes.
            dtypes = {
                new_to_current.get(column_name, column_name): column_dtype
                for column_name, column_dtype in dtypes.items()
            }

    with _prepare_file_arg(file, **storage_options) as data:
        df = DataFrame._read_csv(
            file=data,
            has_header=has_header,
            columns=columns if columns else projection,
            sep=sep,
            comment_char=comment_char,
            quote_char=quote_char,
            skip_rows=skip_rows,
            dtypes=dtypes,
            null_values=null_values,
            ignore_errors=ignore_errors,
            parse_dates=parse_dates,
            n_threads=n_threads,
            infer_schema_length=infer_schema_length,
            batch_size=batch_size,
            n_rows=n_rows,
            encoding=encoding,
            low_memory=low_memory,
            rechunk=rechunk,
            skip_rows_after_header=skip_rows_after_header,
            row_count_name=row_count_name,
            row_count_offset=row_count_offset,
            sample_size=sample_size,
        )

    if new_columns:
        return update_columns(df, new_columns)
    return df
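# A minimal usage sketch for ``read_csv`` (illustrative only; the file name,
# separator, and column name below are hypothetical and not part of this module):
#
# >>> df = pl.read_csv(  # doctest: +SKIP
# ...     "data.csv",
# ...     sep=";",
# ...     dtypes={"id": pl.Int64},
# ...     null_values="NA",
# ...     parse_dates=True,
# ... )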
def read_parquet(
    source: Union[str, Path, BinaryIO, BytesIO, bytes],
    columns: Optional[Union[List[int], List[str]]] = None,
    n_rows: Optional[int] = None,
    use_pyarrow: bool = False,
    memory_map: bool = True,
    storage_options: Optional[Dict] = None,
    parallel: bool = True,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    **kwargs: Any,
) -> DataFrame:
    """
    Read into a DataFrame from a parquet file.

    Parameters
    ----------
    source
        Path to a file, or a file-like object. If the path is a directory, that
        directory will be read as a partition-aware scan.
        If ``fsspec`` is installed, it will be used to open remote files.
    columns
        Columns to select. Accepts a list of column indices (starting at zero) or
        a list of column names.
    n_rows
        Stop reading from parquet file after reading ``n_rows``. Only valid when
        ``use_pyarrow=False``.
    use_pyarrow
        Use pyarrow instead of the rust native parquet reader. The pyarrow reader
        is more stable.
    memory_map
        Memory map underlying file. This will likely increase performance. Only
        used when ``use_pyarrow=True``.
    storage_options
        Extra options that make sense for ``fsspec.open()`` or a particular
        storage connection, e.g. host, port, username, password, etc.
    parallel
        Read the parquet file in parallel. The single threaded reader consumes
        less memory.
    row_count_name
        If not None, this will insert a row count column with the given name into
        the DataFrame.
    row_count_offset
        Offset to start the row_count column (only used if the name is set).
    **kwargs
        kwargs for `pyarrow.parquet.read_table
        <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html>`_.

    Returns
    -------
    DataFrame

    """
    # Map legacy arguments to current ones and remove them from kwargs.
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if columns is None:
        columns = kwargs.pop("projection", None)

    if use_pyarrow:
        if n_rows:
            raise ValueError("``n_rows`` cannot be used with ``use_pyarrow=True``.")

    storage_options = storage_options or {}
    with _prepare_file_arg(source, **storage_options) as source_prep:
        if use_pyarrow:
            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    "'pyarrow' is required when using 'read_parquet(..., use_pyarrow=True)'."
                )
            return from_arrow(  # type: ignore[return-value]
                pa.parquet.read_table(
                    source_prep,
                    memory_map=memory_map,
                    columns=columns,
                    **kwargs,
                )
            )
        return DataFrame._read_parquet(
            source_prep,
            columns=columns,
            n_rows=n_rows,
            parallel=parallel,
            row_count_name=row_count_name,
            row_count_offset=row_count_offset,
        )
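# A minimal usage sketch for ``read_parquet`` (illustrative only; the file path
# and column names are hypothetical):
#
# >>> df = pl.read_parquet(  # doctest: +SKIP
# ...     "data.parquet",
# ...     columns=["id", "value"],
# ...     row_count_name="row_nr",
# ... )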
def read_sql(
    sql: list[str] | str,
    connection_uri: str,
    partition_on: str | None = None,
    partition_range: tuple[int, int] | None = None,
    partition_num: int | None = None,
    protocol: str | None = None,
) -> DataFrame:
    """
    Read a SQL query into a DataFrame.

    .. note::
        Make sure to install connectorx>=0.2.2. Read the documentation
        `here <https://sfu-db.github.io/connector-x/intro.html>`_.

    Reading a SQL query from the following data sources is supported:

    * Postgres
    * Mysql
    * Sqlite
    * Redshift (through postgres protocol)
    * Clickhouse (through mysql protocol)

    If a database source is not supported, an alternative solution is to first use
    pandas to load the SQL query, then convert the result into a polars DataFrame:

    >>> import pandas as pd
    >>> df = pl.from_pandas(pd.read_sql(sql, engine))  # doctest: +SKIP

    Parameters
    ----------
    sql
        Raw SQL query / queries.
    connection_uri
        Connectorx connection uri, for example

        * "postgresql://*****:*****@server:port/database"
    partition_on
        The column on which to partition the result.
    partition_range
        The value range of the partition column.
    partition_num
        How many partitions to generate.
    protocol
        Backend-specific transfer protocol directive; see connectorx documentation
        for details.

    Examples
    --------
    Read a DataFrame from a SQL query using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)  # doctest: +SKIP

    Read a DataFrame in parallel using 10 threads by automatically partitioning the
    provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(
    ...     query, uri, partition_on="partition_col", partition_num=10
    ... )  # doctest: +SKIP

    Read a DataFrame in parallel using 2 threads by explicitly providing two SQL
    queries:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> pl.read_sql(queries, uri)  # doctest: +SKIP

    """
    if _WITH_CX:
        tbl = cx.read_sql(
            conn=connection_uri,
            query=sql,
            return_type="arrow2",
            partition_on=partition_on,
            partition_range=partition_range,
            partition_num=partition_num,
            protocol=protocol,
        )
        return cast(DataFrame, from_arrow(tbl))
    else:
        raise ImportError(
            "connectorx is not installed. Please run `pip install connectorx>=0.2.2`."
        )
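# A sketch of explicit range partitioning with ``read_sql`` (illustrative only;
# the uri, table, and column names are hypothetical, and ``partition_range`` is
# simply forwarded to connectorx):
#
# >>> pl.read_sql(  # doctest: +SKIP
# ...     "SELECT * FROM lineitem",
# ...     "postgresql://*****:*****@server:port/database",
# ...     partition_on="partition_col",
# ...     partition_range=(0, 100),
# ...     partition_num=4,
# ... )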