def sequence_to_pydf(
    data: Sequence[Any],
    columns: Optional[ColumnsType] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a sequence.

    Dispatches on the type of the first element:
    - empty sequence      -> delegates to ``dict_to_pydf`` (columns still applied)
    - sequence of Series  -> each Series becomes a column (renamed/cast as needed)
    - sequence of dicts   -> row-wise read via ``PyDataFrame.read_dicts``
    - sequence of seqs    -> row- or column-oriented, inferred from ``orient``/``columns``
    - anything else       -> a single column

    Parameters
    ----------
    data
        The sequence to build the frame from.
    columns
        Optional column names, possibly carrying dtype overrides
        (unpacked by ``_unpack_columns``).
    orient
        ``"row"`` or ``"col"``; inferred when None and ``columns`` is given.
    """
    data_series: List["PySeries"]
    if len(data) == 0:
        # empty input: the dict path handles column names/dtypes uniformly
        return dict_to_pydf({}, columns=columns)
    elif isinstance(data[0], pli.Series):
        series_names = [s.name for s in data]
        columns, dtypes = _unpack_columns(columns or series_names, n_expected=len(data))
        data_series = []
        for i, s in enumerate(data):
            if not s.name:  # TODO: Replace by `if s.name is None` once allowed
                # NOTE(review): renames the caller's Series in place — side
                # effect on input data; presumably intentional, but confirm.
                s.rename(columns[i], in_place=True)
            new_dtype = dtypes.get(columns[i])
            if new_dtype and new_dtype != s.dtype:
                # dtype override from the columns arg wins over the Series dtype
                s = s.cast(new_dtype)
            data_series.append(s.inner())
    elif isinstance(data[0], dict):
        pydf = PyDataFrame.read_dicts(data)
        if columns:
            # names/dtypes cannot be passed to read_dicts, so fix up afterwards
            pydf = _post_apply_columns(pydf, columns)
        return pydf
    elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
        # Infer orientation: if the number of column names matches the number
        # of sequences, treat each sequence as a column, otherwise as a row.
        if orient is None and columns is not None:
            orient = "col" if len(columns) == len(data) else "row"
        if orient == "row":
            pydf = PyDataFrame.read_rows(data)
            if columns:
                pydf = _post_apply_columns(pydf, columns)
            return pydf
        else:
            # column orientation: one Series per inner sequence
            columns, dtypes = _unpack_columns(columns, n_expected=len(data))
            data_series = [
                pli.Series(columns[i], data[i], dtypes.get(columns[i])).inner()
                for i in range(len(data))
            ]
    else:
        # scalar-like elements: the whole sequence is a single column
        columns, dtypes = _unpack_columns(columns, n_expected=1)
        data_series = [pli.Series(columns[0], data, dtypes.get(columns[0])).inner()]
    data_series = _handle_columns_arg(data_series, columns=columns)
    return PyDataFrame(data_series)
def dict_to_pydf(
    data: dict[str, Sequence[Any]], columns: ColumnsType | None = None
) -> PyDataFrame:
    """
    Construct a PyDataFrame from a dictionary of sequences.

    Parameters
    ----------
    data
        Mapping of column name -> column values.
    columns
        Optional column specification; may also carry dtype overrides
        (unpacked by ``_unpack_columns``).

    Returns
    -------
    PyDataFrame
    """
    if columns is not None:
        # the columns arg may also set the dtype of the series
        columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())
        if not data and dtypes:
            # no data, but dtypes were requested: build typed empty columns
            data_series = [
                pli.Series(name, [], dtypes.get(name)).inner() for name in columns
            ]
        else:
            data_series = [
                pli.Series(name, values, dtypes.get(name)).inner()
                for name, values in data.items()
            ]
        data_series = _handle_columns_arg(data_series, columns=columns)
        return PyDataFrame(data_series)

    if _NUMPY_AVAILABLE:
        # only start a thread pool from a reasonable size.
        all_numpy = all(
            isinstance(val, np.ndarray) and len(val) > 1000 for val in data.values()
        )
        if all_numpy:
            # yes, multi-threading was easier in python here
            # we cannot run multiple threads that run python code
            # and release the gil in pyo3
            # it will deadlock.

            # dummy is threaded
            import multiprocessing.dummy

            pool_size = threadpool_size()
            # use the pool as a context manager so its worker threads are
            # always terminated (the previous version leaked the pool)
            with multiprocessing.dummy.Pool(pool_size) as pool:
                data_series = pool.map(
                    lambda t: pli.Series(t[0], t[1]).inner(),
                    list(data.items()),
                )
            return PyDataFrame(data_series)

    # fast path
    return PyDataFrame.read_dict(data)
def sequence_to_pydf(
    data: Sequence[Any],
    columns: Optional[Sequence[str]] = None,
    orient: Optional[str] = None,
    nullable: bool = True,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a sequence.

    The first element decides how the sequence is interpreted: Series become
    columns, dicts become rows, nested sequences follow ``orient``, and
    anything else is treated as a single column of values.
    """
    data_series: List["PySeries"] = []
    if len(data) > 0:
        first = data[0]
        if isinstance(first, pl.Series):
            for idx, srs in enumerate(data):
                if not srs.name:  # TODO: Replace by `if srs.name is None` once allowed
                    srs.rename(f"column_{idx}", in_place=True)
                data_series.append(srs.inner())
        elif isinstance(first, dict):
            pydf = PyDataFrame.read_dicts(data)
            if columns is not None:
                pydf.set_column_names(columns)
            return pydf
        elif isinstance(first, Sequence) and not isinstance(first, str):
            # infer orientation when not given: matching lengths -> columns
            if orient is None and columns is not None:
                orient = "col" if len(columns) == len(data) else "row"
            if orient == "row":
                pydf = PyDataFrame.read_rows(data)
                if columns is not None:
                    pydf.set_column_names(columns)
                return pydf
            # column orientation: each inner sequence is one column
            data_series = [
                pl.Series(f"column_{idx}", values, nullable=nullable).inner()
                for idx, values in enumerate(data)
            ]
        else:
            # scalar-like elements form a single column
            data_series = [pl.Series("column_0", data, nullable=nullable).inner()]
    data_series = _handle_columns_arg(data_series, columns=columns, nullable=nullable)
    return PyDataFrame(data_series)
def arrow_to_pydf(
    data: pa.Table,
    columns: Optional[Sequence[str]] = None,
    rechunk: bool = True,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from an Arrow Table.

    When ``columns`` is given, the table columns are renamed first; a length
    mismatch surfaces as a ValueError.
    """
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from e

    data_dict = {}
    for idx, col in enumerate(data):
        # grab the name before coercion, which may replace the column object
        name = f"column_{idx}" if col._name is None else col._name
        data_dict[name] = coerce_arrow(col)

    batches = pa.table(data_dict).to_batches()
    pydf = PyDataFrame.from_arrow_record_batches(batches)
    return pydf.rechunk() if rechunk else pydf
def _post_apply_columns(pydf: PyDataFrame, columns: ColumnsType) -> PyDataFrame:
    """
    Apply 'columns' param _after_ PyDataFrame creation (if no alternative).

    Renames columns when the requested names differ, then lazily casts any
    column whose requested dtype differs from its current dtype.
    """
    current_names, current_dtypes = pydf.columns(), pydf.dtypes()
    columns, dtypes = _unpack_columns(columns or current_names)
    if columns != current_names:
        pydf.set_column_names(columns)

    # collect cast expressions for columns whose dtype must change
    casts = []
    for idx, name in enumerate(columns):
        if name in dtypes and dtypes[name] != current_dtypes[idx]:
            casts.append(pli.col(name).cast(dtypes[name])._pyexpr)
    if casts:
        pydf = pydf.lazy().with_columns(casts).collect()
    return pydf
def numpy_to_pydf(
    data: np.ndarray,
    columns: Optional[ColumnsType] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a numpy ndarray.

    Supports empty, 1-D and 2-D arrays; for 2-D input the ``orient`` argument
    decides whether the first axis is rows or columns (defaulting to rows,
    with a deprecation warning).
    """
    shape = data.shape

    # number of resulting columns, given the shape and orientation
    if shape == (0,):
        n_columns = 0
    elif len(shape) == 1:
        n_columns = 1
    elif orient in ("row", None):
        n_columns = shape[1]
    else:
        n_columns = shape[0]

    columns, dtypes = _unpack_columns(columns, n_expected=n_columns)
    if columns and len(columns) != n_columns:
        raise ValueError(
            "Dimensions of columns arg must match data dimensions.")

    if shape == (0,):
        data_series = []
    elif len(shape) == 1:
        data_series = [
            pli.Series(columns[0], data, dtypes.get(columns[0])).inner()
        ]
    elif len(shape) == 2:
        # Infer orientation
        if orient is None:
            warnings.warn(
                "Default orientation for constructing DataFrame from numpy "
                'array will change from "row" to "column" in a future version. '
                "Specify orientation explicitly to silence this warning.",
                DeprecationWarning,
                stacklevel=2,
            )
            orient = "row"
        # Exchange if-block above for block below when removing warning
        # if orientation is None and columns is not None:
        #     orientation = "col" if len(columns) == shape[0] else "row"
        if orient == "row":
            data_series = [
                pli.Series(columns[j], data[:, j], dtypes.get(columns[j])).inner()
                for j in range(n_columns)
            ]
        else:
            data_series = [
                pli.Series(columns[j], data[j], dtypes.get(columns[j])).inner()
                for j in range(n_columns)
            ]
    else:
        raise ValueError(
            "A numpy array should not have more than two dimensions.")

    return PyDataFrame(_handle_columns_arg(data_series, columns=columns))
def arrow_to_pydf(
    data: "pa.Table",
    columns: Optional[Sequence[str]] = None,
    rechunk: bool = True,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from an Arrow Table.

    Dictionary-encoded (categorical) columns are split out and built
    separately, then re-inserted after the main table is converted; the
    original column order is restored at the end.

    Parameters
    ----------
    data
        The Arrow Table to convert.
    columns
        Optional replacement column names (must match the table width).
    rechunk
        Make sure the resulting frame is contiguous in memory.
    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from an Arrow Table."
        )
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from e
    data_dict = {}
    # dictionaries cannot be built in different batches (categorical does not
    # allow that) so we rechunk them and create them separately.
    dictionary_cols = {}
    names = []  # original column order, used to reorder at the end
    for i, column in enumerate(data):
        # extract the name before casting
        if column._name is None:
            name = f"column_{i}"
        else:
            name = column._name
        names.append(name)
        column = coerce_arrow(column)
        if pa.types.is_dictionary(column.type):
            # categorical: convert via the pyseries path, keyed by position
            ps = arrow_to_pyseries(name, column, rechunk)
            dictionary_cols[i] = pli.wrap_s(ps)
        else:
            data_dict[name] = column
    if len(data_dict) > 0:
        tbl = pa.table(data_dict)
        # path for table without rows that keeps datatype
        if tbl.shape[0] == 0:
            pydf = pli.DataFrame._from_pandas(tbl.to_pandas())._df
        else:
            pydf = PyDataFrame.from_arrow_record_batches(tbl.to_batches())
    else:
        # all columns were dictionary-encoded; start from an empty frame
        pydf = pli.DataFrame([])._df
    if rechunk:
        pydf = pydf.rechunk()
    if len(dictionary_cols) > 0:
        # re-insert the categorical columns, then restore original order
        df = pli.wrap_df(pydf)
        for i, s in dictionary_cols.items():
            df[s.name] = s
        df = df[names]
        pydf = df._df
    return pydf
def series_to_pydf(
    data: "pli.Series",
    columns: Optional[Sequence[str]] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame wrapping a single Polars Series as its only column.
    """
    series = _handle_columns_arg([data.inner()], columns=columns)
    return PyDataFrame(series)
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[Sequence[str]] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a dictionary of sequences.

    Each key/value pair becomes one named column.
    """
    series = []
    for name, values in data.items():
        series.append(pli.Series(name, values).inner())
    series = _handle_columns_arg(series, columns=columns)
    return PyDataFrame(series)
def numpy_to_pydf(
    data: np.ndarray,
    columns: Optional[Sequence[str]] = None,
    orient: Optional[str] = None,
    nullable: bool = True,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a numpy ndarray.

    Handles empty, 1-D and 2-D arrays; 2-D arrays are read row- or
    column-wise according to ``orient`` (defaulting to rows, with a
    deprecation warning).

    NOTE(review): the Series constructors below hard-code ``nullable=False``;
    the ``nullable`` argument only reaches ``_handle_columns_arg`` —
    presumably deliberate for numpy input, but confirm.
    """
    shape = data.shape
    if shape == (0,):
        data_series = []
    elif len(shape) == 1:
        # 1-D array -> a single column
        data_series = [pl.Series("column_0", data, nullable=False).inner()]
    elif len(shape) == 2:
        # Infer orientation
        if orient is None:
            warnings.warn(
                "Default orientation for constructing DataFrame from numpy "
                'array will change from "row" to "column" in a future version. '
                "Specify orientation explicitly to silence this warning.",
                DeprecationWarning,
                stacklevel=2,
            )
            orient = "row"
        # Exchange if-block above for block below when removing warning
        # if orientation is None and columns is not None:
        #     orientation = "col" if len(columns) == shape[0] else "row"
        if orient == "row":
            data_series = [
                pl.Series(f"column_{j}", data[:, j], nullable=False).inner()
                for j in range(shape[1])
            ]
        else:
            data_series = [
                pl.Series(f"column_{j}", data[j], nullable=False).inner()
                for j in range(shape[0])
            ]
    else:
        raise ValueError(
            "A numpy array should not have more than two dimensions.")
    data_series = _handle_columns_arg(data_series, columns=columns, nullable=nullable)
    return PyDataFrame(data_series)
def series_to_pydf(data: pli.Series, columns: ColumnsType | None = None) -> PyDataFrame:
    """
    Construct a PyDataFrame from a Polars Series.

    The columns arg may rename the single column and/or request a dtype
    cast for it.
    """
    inner = data.inner()
    columns, dtypes = _unpack_columns(columns or [inner.name()], n_expected=1)
    data_series = [inner]
    if dtypes:
        # at most one dtype override is possible for a single column
        new_dtype = next(iter(dtypes.values()))
        if new_dtype != data.dtype:
            data_series[0] = inner.cast(new_dtype, True)
    data_series = _handle_columns_arg(data_series, columns=columns)
    return PyDataFrame(data_series)
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[ColumnsType] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a dictionary of sequences.

    Without a columns arg this goes straight through the native reader;
    otherwise the columns arg may rename columns and set per-column dtypes.
    """
    if columns is None:
        # fast path
        return PyDataFrame.read_dict(data)

    # the columns arg may also set the dtype of the series
    columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())
    if not data and dtypes:
        # no data but dtypes requested: build typed empty columns
        data_series = [
            pli.Series(name, [], dtypes.get(name)).inner() for name in columns
        ]
    else:
        data_series = [
            pli.Series(name, values, dtypes.get(name)).inner()
            for name, values in data.items()
        ]
    data_series = _handle_columns_arg(data_series, columns=columns)
    return PyDataFrame(data_series)