def read_csv(
    file: Union[str, TextIO, BytesIO, Path, BinaryIO, bytes],
    has_header: bool = True,
    columns: Optional[Union[List[int], List[str]]] = None,
    new_columns: Optional[List[str]] = None,
    sep: str = ",",
    comment_char: Optional[str] = None,
    quote_char: Optional[str] = r'"',
    skip_rows: int = 0,
    dtypes: Optional[Union[Mapping[str, Type[DataType]], List[Type[DataType]]]] = None,
    null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
    ignore_errors: bool = False,
    parse_dates: bool = False,
    n_threads: Optional[int] = None,
    infer_schema_length: Optional[int] = 100,
    batch_size: int = 8192,
    n_rows: Optional[int] = None,
    encoding: str = "utf8",
    low_memory: bool = False,
    rechunk: bool = True,
    use_pyarrow: bool = False,
    storage_options: Optional[Dict] = None,
    skip_rows_after_header: int = 0,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    sample_size: int = 1024,
    **kwargs: Any,
) -> DataFrame:
    """
    Read a CSV file into a DataFrame.

    Parameters
    ----------
    file
        Path to a file or a file-like object. By file-like object, we refer to
        objects with a ``read()`` method, such as a file handler (e.g. via the
        builtin ``open`` function) or ``StringIO`` or ``BytesIO``.
        If ``fsspec`` is installed, it will be used to open remote files.
    has_header
        Indicate if the first row of the dataset is a header or not. If set to
        False, column names will be autogenerated in the following format:
        ``column_x``, with ``x`` being an enumeration over every column in the
        dataset starting at 1.
    columns
        Columns to select. Accepts a list of column indices (starting at zero) or
        a list of column names.
    new_columns
        Rename columns right after parsing the CSV file. If the given list is
        shorter than the width of the DataFrame the remaining columns will have
        their original name.
    sep
        Single byte character to use as delimiter in the file.
    comment_char
        Single byte character that indicates the start of a comment line, for
        instance ``#``.
    quote_char
        Single byte character used for csv quoting, default = ``"``. Set to None
        to turn off special handling and escaping of quotes.
    skip_rows
        Start reading after ``skip_rows`` lines.
    dtypes
        Overwrite dtypes during inference.
    null_values
        Values to interpret as null values. You can provide a:

        - ``str``: All values equal to this string will be null.
        - ``List[str]``: A null value per column.
        - ``Dict[str, str]``: A dictionary that maps column name to a null value
          string.
    ignore_errors
        Try to keep reading lines if some lines yield errors. First try
        ``infer_schema_length=0`` to read all columns as ``pl.Utf8`` to check
        which values might cause an issue.
    parse_dates
        Try to automatically parse dates. If this does not succeed, the column
        remains of data type ``pl.Utf8``.
    n_threads
        Number of threads to use in csv parsing. Defaults to the number of
        physical CPUs of your system.
    infer_schema_length
        Maximum number of lines to read to infer schema. If set to 0, all columns
        will be read as ``pl.Utf8``. If set to ``None``, a full table scan will be
        done (slow).
    batch_size
        Number of lines to read into the buffer at once. Modify this to change
        performance.
    n_rows
        Stop reading from CSV file after reading ``n_rows``. During
        multi-threaded parsing, an upper bound of ``n_rows`` rows cannot be
        guaranteed.
    encoding
        Allowed encodings: ``utf8`` or ``utf8-lossy``. Lossy means that invalid
        utf8 values are replaced with ``�`` characters.
    low_memory
        Reduce memory usage at the expense of performance.
    rechunk
        Make sure that all columns are contiguous in memory by aggregating the
        chunks into a single array.
    use_pyarrow
        Try to use pyarrow's native CSV parser. This is not always possible. The
        set of arguments given to this function determines if it is possible to
        use pyarrow's native parser. Note that pyarrow and polars may have a
        different strategy regarding type inference.
    storage_options
        Extra options that make sense for ``fsspec.open()`` or a particular
        storage connection, e.g. host, port, username, password, etc.
    skip_rows_after_header
        Skip this number of rows when the header is parsed.
    row_count_name
        If not None, this will insert a row count column with the given name into
        the DataFrame.
    row_count_offset
        Offset to start the row_count column (only used if the name is set).
    sample_size
        Set the sample size. This is used to sample statistics to estimate the
        allocation needed.

    Returns
    -------
    DataFrame

    """
    # Map legacy arguments to current ones and remove them from kwargs.
    has_header = kwargs.pop("has_headers", has_header)
    dtypes = kwargs.pop("dtype", dtypes)
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if columns is None:
        columns = kwargs.pop("projection", None)

    _check_arg_is_1byte("sep", sep, False)
    _check_arg_is_1byte("comment_char", comment_char, False)
    _check_arg_is_1byte("quote_char", quote_char, True)

    projection, columns = handle_projection_columns(columns)

    if isinstance(file, bytes) and len(file) == 0:
        raise ValueError("Empty bytes data provided.")

    storage_options = storage_options or {}

    if columns and not has_header:
        for column in columns:
            if isinstance(column, str) and not column.startswith("column_"):
                raise ValueError(
                    'Specified column names do not start with "column_", '
                    "but autogenerated header names were requested."
                )

    if use_pyarrow and not _PYARROW_AVAILABLE:
        raise ImportError(
            "'pyarrow' is required when using 'read_csv(..., use_pyarrow=True)'."
        )

    if (
        use_pyarrow
        and dtypes is None
        and n_rows is None
        and n_threads is None
        and encoding == "utf8"
        and not low_memory
        and null_values is None
        and parse_dates
    ):
        include_columns = None

        if columns:
            if not has_header:
                # Convert 'column_1', 'column_2', ... column names to 'f0', 'f1', ...
                # column names for pyarrow, if the CSV file does not contain a header.
                include_columns = [f"f{int(column[7:]) - 1}" for column in columns]
            else:
                include_columns = columns

        if not columns and projection:
            # Convert column indices from projection to 'f0', 'f1', ... column names
            # for pyarrow.
            include_columns = [f"f{column_idx}" for column_idx in projection]

        with _prepare_file_arg(file, **storage_options) as data:
            tbl = pa.csv.read_csv(
                data,
                pa.csv.ReadOptions(
                    skip_rows=skip_rows, autogenerate_column_names=not has_header
                ),
                pa.csv.ParseOptions(delimiter=sep),
                pa.csv.ConvertOptions(
                    column_types=None,
                    include_columns=include_columns,
                    include_missing_columns=ignore_errors,
                ),
            )

        if not has_header:
            # Rename the 'f0', 'f1', ... column names autogenerated by pyarrow to
            # 'column_1', 'column_2', ...
            tbl = tbl.rename_columns(
                [f"column_{int(column[1:]) + 1}" for column in tbl.column_names]
            )

        df = cast(DataFrame, from_arrow(tbl, rechunk))
        if new_columns:
            return update_columns(df, new_columns)
        return df

    if new_columns and dtypes and isinstance(dtypes, dict):
        current_columns = None

        # As the new column names are not available yet while parsing the CSV file,
        # rename the column names in dtypes to the old names (if possible) so they
        # can be used during CSV parsing.
        if columns:
            if len(columns) < len(new_columns):
                raise ValueError(
                    "More new column names are specified than there are selected columns."
                )
            # Get column names of requested columns.
            current_columns = columns[0:len(new_columns)]
        elif not has_header:
            # When there is no header, column names are autogenerated (and known).
            if projection:
                if columns and len(columns) < len(new_columns):
                    raise ValueError(
                        "More new column names are specified than there are selected columns."
                    )
                # Convert column indices from projection to 'column_1', 'column_2', ...
                # column names.
                current_columns = [
                    f"column_{column_idx + 1}" for column_idx in projection
                ]
            else:
                # Generate the autogenerated 'column_1', 'column_2', ... column names
                # for the new column names.
                current_columns = [
                    f"column_{column_idx}"
                    for column_idx in range(1, len(new_columns) + 1)
                ]
        else:
            # When a header is present, column names are not known yet.
            if len(dtypes) <= len(new_columns):
                # If the dtypes dictionary contains fewer or the same number of entries
                # as there are new column names, a list of dtypes can be created if all
                # column names listed in the dtypes dictionary appear in the first
                # consecutive new column names.
                dtype_list = [
                    dtypes[new_column_name]
                    for new_column_name in new_columns[0:len(dtypes)]
                    if new_column_name in dtypes
                ]

                if len(dtype_list) == len(dtypes):
                    dtypes = dtype_list

        if current_columns and isinstance(dtypes, dict):
            new_to_current = {
                new_column: current_column
                for new_column, current_column in zip(new_columns, current_columns)
            }
            # Change new column names to current column names in dtypes.
            dtypes = {
                new_to_current.get(column_name, column_name): column_dtype
                for column_name, column_dtype in dtypes.items()
            }

    with _prepare_file_arg(file, **storage_options) as data:
        df = DataFrame._read_csv(
            file=data,
            has_header=has_header,
            columns=columns if columns else projection,
            sep=sep,
            comment_char=comment_char,
            quote_char=quote_char,
            skip_rows=skip_rows,
            dtypes=dtypes,
            null_values=null_values,
            ignore_errors=ignore_errors,
            parse_dates=parse_dates,
            n_threads=n_threads,
            infer_schema_length=infer_schema_length,
            batch_size=batch_size,
            n_rows=n_rows,
            encoding=encoding,
            low_memory=low_memory,
            rechunk=rechunk,
            skip_rows_after_header=skip_rows_after_header,
            row_count_name=row_count_name,
            row_count_offset=row_count_offset,
            sample_size=sample_size,
        )

    if new_columns:
        return update_columns(df, new_columns)
    return df
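# A minimal usage sketch for ``read_csv`` (illustrative only; the file name,
# separator, and column name below are hypothetical and not part of this module):
#
# >>> df = pl.read_csv(  # doctest: +SKIP
# ...     "data.csv",
# ...     sep=";",
# ...     dtypes={"id": pl.Int64},
# ...     null_values="NA",
# ...     parse_dates=True,
# ... )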
def read_parquet(
    source: Union[str, Path, BinaryIO, BytesIO, bytes],
    columns: Optional[Union[List[int], List[str]]] = None,
    n_rows: Optional[int] = None,
    use_pyarrow: bool = False,
    memory_map: bool = True,
    storage_options: Optional[Dict] = None,
    parallel: bool = True,
    row_count_name: Optional[str] = None,
    row_count_offset: int = 0,
    **kwargs: Any,
) -> DataFrame:
    """
    Read into a DataFrame from a parquet file.

    Parameters
    ----------
    source
        Path to a file, or a file-like object. If the path is a directory, that
        directory will be read as a partition-aware scan.
        If ``fsspec`` is installed, it will be used to open remote files.
    columns
        Columns to select. Accepts a list of column indices (starting at zero) or
        a list of column names.
    n_rows
        Stop reading from parquet file after reading ``n_rows``. Only valid when
        ``use_pyarrow=False``.
    use_pyarrow
        Use pyarrow instead of the rust native parquet reader. The pyarrow reader
        is more stable.
    memory_map
        Memory map underlying file. This will likely increase performance. Only
        used when ``use_pyarrow=True``.
    storage_options
        Extra options that make sense for ``fsspec.open()`` or a particular
        storage connection, e.g. host, port, username, password, etc.
    parallel
        Read the parquet file in parallel. The single threaded reader consumes
        less memory.
    row_count_name
        If not None, this will insert a row count column with the given name into
        the DataFrame.
    row_count_offset
        Offset to start the row_count column (only used if the name is set).
    **kwargs
        kwargs for `pyarrow.parquet.read_table
        <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html>`_.

    Returns
    -------
    DataFrame

    """
    # Map legacy arguments to current ones and remove them from kwargs.
    n_rows = kwargs.pop("stop_after_n_rows", n_rows)

    if columns is None:
        columns = kwargs.pop("projection", None)

    if use_pyarrow:
        if n_rows:
            raise ValueError("``n_rows`` cannot be used with ``use_pyarrow=True``.")

    storage_options = storage_options or {}
    with _prepare_file_arg(source, **storage_options) as source_prep:
        if use_pyarrow:
            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    "'pyarrow' is required when using 'read_parquet(..., use_pyarrow=True)'."
                )
            return from_arrow(  # type: ignore[return-value]
                pa.parquet.read_table(
                    source_prep,
                    memory_map=memory_map,
                    columns=columns,
                    **kwargs,
                )
            )
        return DataFrame._read_parquet(
            source_prep,
            columns=columns,
            n_rows=n_rows,
            parallel=parallel,
            row_count_name=row_count_name,
            row_count_offset=row_count_offset,
        )
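# A minimal usage sketch for ``read_parquet`` (illustrative only; the file path
# and column names are hypothetical):
#
# >>> df = pl.read_parquet(  # doctest: +SKIP
# ...     "data.parquet",
# ...     columns=["id", "value"],
# ...     row_count_name="row_nr",
# ... )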
def read_sql(
    sql: list[str] | str,
    connection_uri: str,
    partition_on: str | None = None,
    partition_range: tuple[int, int] | None = None,
    partition_num: int | None = None,
    protocol: str | None = None,
) -> DataFrame:
    """
    Read a SQL query into a DataFrame.

    .. note::
        Make sure to install connectorx>=0.2.2. Read the documentation
        `here <https://sfu-db.github.io/connector-x/intro.html>`_.

    Reading a SQL query from the following data sources is supported:

    * Postgres
    * Mysql
    * Sqlite
    * Redshift (through postgres protocol)
    * Clickhouse (through mysql protocol)

    If a database source is not supported, an alternative solution is to first use
    pandas to load the SQL query, then convert the result into a polars DataFrame:

    >>> import pandas as pd
    >>> df = pl.from_pandas(pd.read_sql(sql, engine))  # doctest: +SKIP

    Parameters
    ----------
    sql
        Raw SQL query / queries.
    connection_uri
        Connectorx connection uri, for example

        * "postgresql://*****:*****@server:port/database"
    partition_on
        The column on which to partition the result.
    partition_range
        The value range of the partition column.
    partition_num
        How many partitions to generate.
    protocol
        Backend-specific transfer protocol directive; see connectorx documentation
        for details.

    Examples
    --------
    Read a DataFrame from a SQL query using a single thread:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(query, uri)  # doctest: +SKIP

    Read a DataFrame in parallel using 10 threads by automatically partitioning the
    provided SQL on the partition column:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> query = "SELECT * FROM lineitem"
    >>> pl.read_sql(
    ...     query, uri, partition_on="partition_col", partition_num=10
    ... )  # doctest: +SKIP

    Read a DataFrame in parallel using 2 threads by explicitly providing two SQL
    queries:

    >>> uri = "postgresql://*****:*****@server:port/database"
    >>> queries = [
    ...     "SELECT * FROM lineitem WHERE partition_col <= 10",
    ...     "SELECT * FROM lineitem WHERE partition_col > 10",
    ... ]
    >>> pl.read_sql(queries, uri)  # doctest: +SKIP

    """
    if _WITH_CX:
        tbl = cx.read_sql(
            conn=connection_uri,
            query=sql,
            return_type="arrow2",
            partition_on=partition_on,
            partition_range=partition_range,
            partition_num=partition_num,
            protocol=protocol,
        )
        return cast(DataFrame, from_arrow(tbl))
    else:
        raise ImportError(
            "connectorx is not installed. Please run `pip install connectorx>=0.2.2`."
        )
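# A sketch of explicit range partitioning with ``read_sql`` (illustrative only;
# the uri, table, and column names are hypothetical, and ``partition_range`` is
# simply forwarded to connectorx):
#
# >>> pl.read_sql(  # doctest: +SKIP
# ...     "SELECT * FROM lineitem",
# ...     "postgresql://*****:*****@server:port/database",
# ...     partition_on="partition_col",
# ...     partition_range=(0, 100),
# ...     partition_num=4,
# ... )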