def numpy_to_pydf(
    data: np.ndarray,
    columns: Optional[ColumnsType] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a numpy ndarray.

    Parameters
    ----------
    data
        1-D or 2-D numpy array (an empty ``(0,)`` array is also accepted).
    columns
        Optional column specification; may also carry per-column dtypes
        (unpacked via ``_unpack_columns``).
    orient
        Either "row" or "column" for 2-D input; when None the orientation is
        currently defaulted to "row" with a deprecation warning.

    Raises
    ------
    ValueError
        If the column count does not match the data, or the array has more
        than two dimensions.
    """
    shape = data.shape
    # Expected number of output columns: 0 for an empty array, 1 for a 1-D
    # array, and for 2-D input either the array width ("row" orientation,
    # also the current default when orient is None) or its height.
    n_columns = (0 if shape == (0, ) else
                 (1 if len(shape) == 1 else
                  (shape[1] if orient in ("row", None) else shape[0])))
    columns, dtypes = _unpack_columns(columns, n_expected=n_columns)
    if columns and len(columns) != n_columns:
        raise ValueError(
            "Dimensions of columns arg must match data dimensions.")
    if shape == (0, ):
        data_series = []
    elif len(shape) == 1:
        # Single column built from the whole 1-D array.
        data_series = [
            pli.Series(columns[0], data, dtypes.get(columns[0])).inner()
        ]
    elif len(shape) == 2:
        # Infer orientation
        if orient is None:
            warnings.warn(
                "Default orientation for constructing DataFrame from numpy "
                'array will change from "row" to "column" in a future version. '
                "Specify orientation explicitly to silence this warning.",
                DeprecationWarning,
                stacklevel=2,
            )
            orient = "row"
        # Exchange if-block above for block below when removing warning
        # if orientation is None and columns is not None:
        #     orientation = "col" if len(columns) == shape[0] else "row"
        if orient == "row":
            # "row" orientation: each numpy column (data[:, i]) becomes a Series.
            data_series = [
                pli.Series(columns[i], data[:, i],
                           dtypes.get(columns[i])).inner()
                for i in range(n_columns)
            ]
        else:
            # column orientation: each numpy row (data[i]) becomes a Series.
            data_series = [
                pli.Series(columns[i], data[i],
                           dtypes.get(columns[i])).inner()
                for i in range(n_columns)
            ]
    else:
        raise ValueError(
            "A numpy array should not have more than two dimensions.")
    # Apply any remaining renames from the columns argument.
    data_series = _handle_columns_arg(data_series, columns=columns)
    return PyDataFrame(data_series)
def sequence_to_pydf(
    data: Sequence[Any],
    columns: Optional[ColumnsType] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a sequence.

    The first element decides the interpretation of the whole sequence:
    a sequence of Series, of dicts, of nested sequences (rows or columns,
    per ``orient``), or a flat sequence of scalars (one single column).

    Parameters
    ----------
    data
        Input sequence; assumed homogeneous — only ``data[0]`` is inspected.
    columns
        Optional column specification; may also carry per-column dtypes.
    orient
        "row" or "col" for nested-sequence input; inferred from the shape of
        ``columns`` when not given.
    """
    data_series: List["PySeries"]
    if len(data) == 0:
        # Empty input: delegate to the dict constructor so the columns arg
        # (names/dtypes) is still honoured.
        return dict_to_pydf({}, columns=columns)
    elif isinstance(data[0], pli.Series):
        series_names = [s.name for s in data]
        # Existing series names act as the default column spec.
        columns, dtypes = _unpack_columns(columns or series_names,
                                          n_expected=len(data))
        data_series = []
        for i, s in enumerate(data):
            if not s.name:  # TODO: Replace by `if s.name is None` once allowed
                # NOTE(review): renames the caller's Series object in place —
                # a visible side effect on the input; confirm this is intended.
                s.rename(columns[i], in_place=True)
            new_dtype = dtypes.get(columns[i])
            # Cast only when the columns arg requests a different dtype.
            if new_dtype and new_dtype != s.dtype:
                s = s.cast(new_dtype)
            data_series.append(s.inner())
    elif isinstance(data[0], dict):
        pydf = PyDataFrame.read_dicts(data)
        if columns:
            pydf = _post_apply_columns(pydf, columns)
        return pydf
    elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
        # Infer orientation
        if orient is None and columns is not None:
            # Heuristic: if there are as many column names as outer elements,
            # treat each element as one column, otherwise as one row.
            orient = "col" if len(columns) == len(data) else "row"
        if orient == "row":
            pydf = PyDataFrame.read_rows(data)
            if columns:
                pydf = _post_apply_columns(pydf, columns)
            return pydf
        else:
            columns, dtypes = _unpack_columns(columns, n_expected=len(data))
            data_series = [
                pli.Series(columns[i], data[i], dtypes.get(columns[i])).inner()
                for i in range(len(data))
            ]
    else:
        # Flat sequence of scalars: a single column.
        columns, dtypes = _unpack_columns(columns, n_expected=1)
        data_series = [
            pli.Series(columns[0], data, dtypes.get(columns[0])).inner()
        ]
    data_series = _handle_columns_arg(data_series, columns=columns)
    return PyDataFrame(data_series)
def dict_to_pydf(data: dict[str, Sequence[Any]],
                 columns: ColumnsType | None = None) -> PyDataFrame:
    """
    Construct a PyDataFrame from a dictionary of sequences.

    Parameters
    ----------
    data
        Mapping of column name -> column values.
    columns
        Optional column specification; may also carry per-column dtypes
        (unpacked via ``_unpack_columns``).

    Returns
    -------
    PyDataFrame
    """
    if columns is not None:
        # the columns arg may also set the dtype of the series
        columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())
        if not data and dtypes:
            # Empty data but dtype hints present: build typed empty columns.
            data_series = [
                pli.Series(name, [], dtypes.get(name)).inner()
                for name in columns
            ]
        else:
            data_series = [
                pli.Series(name, values, dtypes.get(name)).inner()
                for name, values in data.items()
            ]
        data_series = _handle_columns_arg(data_series, columns=columns)
        return PyDataFrame(data_series)

    if _NUMPY_AVAILABLE:
        # Only parallelize when every column is a reasonably large ndarray.
        all_numpy = all(
            isinstance(val, np.ndarray) and len(val) > 1000
            for val in data.values()
        )
        if all_numpy:
            # yes, multi-threading was easier in python here
            # we cannot run multiple threads that run python code
            # and release the gil in pyo3
            # it will deadlock.

            # dummy is threaded
            import multiprocessing.dummy

            pool_size = threadpool_size()
            # Use the pool as a context manager so its worker threads are
            # always cleaned up (the previous code leaked the pool).
            with multiprocessing.dummy.Pool(pool_size) as pool:
                data_series = pool.map(
                    lambda t: pli.Series(t[0], t[1]).inner(),
                    [(k, v) for k, v in data.items()],
                )
            return PyDataFrame(data_series)

    # fast path
    return PyDataFrame.read_dict(data)
def numpy_to_pydf(
    data: np.ndarray,
    columns: Optional[Sequence[str]] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a numpy ndarray.

    Accepts an empty ``(0,)`` array, a 1-D array (single column) or a 2-D
    array whose interpretation is controlled by ``orient``; anything with
    more dimensions raises a ValueError.
    """
    shape = data.shape
    if shape == (0,):
        series_list = []
    elif len(shape) == 1:
        # 1-D input: a single auto-named column.
        series_list = [pli.Series("column_0", data).inner()]
    elif len(shape) == 2:
        if orient is None:
            # No orientation given: keep the historical "row" default for
            # now, but warn that it will flip in a future release.
            warnings.warn(
                "Default orientation for constructing DataFrame from numpy "
                'array will change from "row" to "column" in a future version. '
                "Specify orientation explicitly to silence this warning.",
                DeprecationWarning,
                stacklevel=2,
            )
            orient = "row"
        # "row": each numpy column becomes a Series, i.e. iterate the rows of
        # the transpose; otherwise each numpy row becomes a Series.
        slices = data.T if orient == "row" else data
        series_list = [
            pli.Series(f"column_{idx}", values).inner()
            for idx, values in enumerate(slices)
        ]
    else:
        raise ValueError(
            "A numpy array should not have more than two dimensions.")
    series_list = _handle_columns_arg(series_list, columns=columns)
    return PyDataFrame(series_list)
def sequence_to_pydf(
    data: Sequence[Any],
    columns: Optional[Sequence[str]] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a sequence.

    The type of the first element decides how the whole (assumed homogeneous)
    sequence is interpreted: Series, dicts, nested sequences, or scalars.
    """
    series_cols: List["PySeries"]
    if not data:
        series_cols = []
    else:
        head = data[0]
        if isinstance(head, pli.Series):
            series_cols = []
            for idx, srs in enumerate(data):
                if not srs.name:
                    # Unnamed series get a positional default name (in place).
                    srs.rename(f"column_{idx}", in_place=True)
                series_cols.append(srs.inner())
        elif isinstance(head, dict):
            pydf = PyDataFrame.read_dicts(data)
            if columns is not None:
                pydf.set_column_names(columns)
            return pydf
        elif isinstance(head, Sequence) and not isinstance(head, str):
            # Infer orientation from the columns arg when not specified.
            if orient is None and columns is not None:
                orient = "col" if len(columns) == len(data) else "row"
            if orient == "row":
                pydf = PyDataFrame.read_rows(data)
                if columns is not None:
                    pydf.set_column_names(columns)
                return pydf
            # Column orientation: one Series per nested sequence.
            series_cols = [
                pli.Series(f"column_{idx}", seq).inner()
                for idx, seq in enumerate(data)
            ]
        else:
            # Flat sequence of scalars: a single column.
            series_cols = [pli.Series("column_0", data).inner()]
    series_cols = _handle_columns_arg(series_cols, columns=columns)
    return PyDataFrame(series_cols)
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[ColumnsType] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a dictionary of sequences.

    The columns argument may rename columns and/or attach dtype hints; when
    the data mapping is empty but dtypes were supplied, typed empty columns
    are produced instead.
    """
    columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())
    # Choose the (name, values) pairs to materialise: typed empty columns for
    # empty-data-with-dtypes, otherwise the dict entries themselves.
    if not data and dtypes:
        pairs = [(name, []) for name in columns]
    else:
        pairs = list(data.items())
    series_cols = [
        pli.Series(name, values, dtypes.get(name)).inner()
        for name, values in pairs
    ]
    series_cols = _handle_columns_arg(series_cols, columns=columns)
    return PyDataFrame(series_cols)
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[Sequence[str]] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a dictionary of sequences.

    One Series is created per dict entry (insertion order preserved); the
    optional columns argument may then rename the resulting columns.
    """
    series_cols = [
        pli.Series(key, vals).inner() for key, vals in data.items()
    ]
    return PyDataFrame(_handle_columns_arg(series_cols, columns=columns))
def _handle_columns_arg(
    data: list[PySeries], columns: Sequence[str] | None = None
) -> list[PySeries]:
    """
    Rename data according to columns argument.

    With no columns the data passes through untouched; with columns but no
    data, one empty (null) series is created per name; otherwise the counts
    must match and each series is renamed in order.
    """
    if not columns:
        return data
    if not data:
        # Names without data: materialise empty null columns.
        return [pli.Series(col_name, None).inner() for col_name in columns]
    if len(data) != len(columns):
        raise ValueError(
            "Dimensions of columns arg must match data dimensions.")
    for series, col_name in zip(data, columns):
        series.rename(col_name)
    return data
def draw_series(draw: Callable) -> pli.Series:
    """
    Hypothesis draw-callback producing one strategy-driven polars Series.

    NOTE(review): this is a closure — ``dtype``, ``strategy``, ``name``,
    ``size``, ``min_size``, ``max_size``, ``unique`` and ``null_probability``
    are free variables from the enclosing scope (presumably a user-facing
    ``series(...)`` strategy factory; confirm against the surrounding file).
    """
    # create/assign series dtype and retrieve matching strategy;
    # an explicit dtype wins, otherwise one is drawn from the selectable set
    series_dtype = (draw(sampled_from(selectable_dtypes))
                    if dtype is None else dtype)
    dtype_strategy = strategy or dtype_strategy_mapping[series_dtype]

    # create/assign series size: drawn between the (defaulted) bounds unless
    # an explicit size was given
    series_size = (between(draw,
                           int,
                           min_=(min_size or 0),
                           max_=(max_size or MAX_DATA_SIZE))
                   if size is None else size)

    # assign series name (a str/None is used as-is; otherwise `name` is
    # itself a strategy to draw from)
    series_name = name if isinstance(name, (str, type(None))) else draw(name)

    # create series using dtype-specific strategy to generate values;
    # null_probability == 1 short-circuits to an all-null column
    series_values = ([None] * series_size if null_probability == 1 else
                     (draw(
                         lists(
                             dtype_strategy,
                             min_size=series_size,
                             max_size=series_size,
                             unique=unique,
                         )) if (series_size > 0) else []))

    # optionally apply null values (custom frequency)
    # NOTE(review): uses the stdlib `random` module, so null placement is not
    # controlled by hypothesis' draw/shrink machinery — confirm intended
    if 0.0 < null_probability < 1.0:
        for idx in range(series_size):
            if random.random() < null_probability:
                series_values[idx] = None

    # init series with strategy-generated data
    s = pli.Series(
        name=series_name,
        dtype=series_dtype,
        values=series_values,
    )
    # cast only when the *requested* dtype was categorical (strategies
    # generate the underlying values as plain data)
    if is_categorical_dtype(dtype):
        s = s.cast(Categorical)
    return s
def cut(
    s: pli.Series,
    bins: list[float],
    labels: Optional[list[str]] = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
) -> pli.DataFrame:
    """
    Bin values into discrete values

    .. warning::
        This function is experimental and might change without it being
        considered a breaking change.

    Parameters
    ----------
    s
        Series to bin.
    bins
        Bins to create.
    labels
        Labels to assign to the bins. If given the length of labels must be
        len(bins) + 1.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
    >>> pl.cut(a, bins=[-1, 1])
    shape: (12, 3)
    ┌──────┬─────────────┬──────────────┐
    │ a    ┆ break_point ┆ category     │
    │ ---  ┆ ---         ┆ ---          │
    │ f64  ┆ f64         ┆ cat          │
    ╞══════╪═════════════╪══════════════╡
    │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ...  ┆ ...         ┆ ...          │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.5  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.0  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.5  ┆ inf         ┆ (1.0, inf]   │
    └──────┴─────────────┴──────────────┘
    """
    var_nm = s.name
    # Breakpoint column: the user bins plus a final +inf sentinel, so every
    # value in `s` finds a forward match in the as-of join below.
    cuts_df = pli.DataFrame([
        pli.Series(name=break_point_label, values=bins,
                   dtype=Float64).extend_constant(float("inf"), 1)
    ])
    if labels:
        # One label per interval: len(bins) breakpoints create len(bins) + 1
        # intervals. The old message ("expected more labels") was wrong when
        # too MANY labels were passed and gave no counts.
        if len(labels) != len(bins) + 1:
            raise ValueError(
                f"expected {len(bins) + 1} labels, got {len(labels)}")
        cuts_df = cuts_df.with_column(
            pli.Series(name=category_label, values=labels))
    else:
        # Auto-generate half-open interval labels like "(-inf, -1.0]".
        cuts_df = cuts_df.with_column(
            pli.format(
                "({}, {}]",
                pli.col(break_point_label).shift_and_fill(1, float("-inf")),
                pli.col(break_point_label),
            ).alias(category_label))
    cuts_df = cuts_df.with_column(pli.col(category_label).cast(Categorical))
    # Sort the input (as-of join requires sorted keys) and attach the first
    # breakpoint >= each value via a forward as-of join.
    result = (s.sort().to_frame().join_asof(
        cuts_df,
        left_on=var_nm,
        right_on=break_point_label,
        strategy="forward",
    ))
    return result
def lit(
    value: None
    | (float | int | str | date | datetime | pli.Series | np.ndarray | Any),
    dtype: type[DataType] | None = None,
) -> pli.Expr:
    """
    A literal value.

    Parameters
    ----------
    value
        Value that should be used as a `literal`.
    dtype
        Optionally define a dtype.

    Examples
    --------
    Literal integer:

    >>> pl.lit(1)  # doctest: +IGNORE_RESULT

    Literal str:

    >>> pl.lit("foo")  # doctest: +IGNORE_RESULT

    Literal datetime:

    >>> from datetime import datetime
    >>> pl.lit(datetime(2021, 1, 20))  # doctest: +IGNORE_RESULT

    Literal Null:

    >>> pl.lit(None)  # doctest: +IGNORE_RESULT

    Literal eager Series:

    >>> pl.lit(pl.Series("a", [1, 2, 3]))  # doctest: +IGNORE_RESULT
    """
    # NOTE: the datetime branch must precede the date branch below, because
    # datetime is a subclass of date.
    if isinstance(value, datetime):
        # Use nanosecond precision when the value fits the ns-representable
        # window, otherwise fall back to milliseconds.
        if in_nanoseconds_window(value):
            tu = "ns"
        else:
            tu = "ms"
        return (lit(_datetime_to_pl_timestamp(
            value, tu)).cast(Datetime).dt.and_time_unit(tu))
    if isinstance(value, timedelta):
        if timedelta_in_nanoseconds_window(value):
            tu = "ns"
        else:
            tu = "ms"
        return (lit(_timedelta_to_pl_timedelta(
            value, tu)).cast(Duration).dt.and_time_unit(tu, dtype=Duration))

    if isinstance(value, date):
        # Plain dates are routed through the datetime branch, then cast down.
        return lit(datetime(value.year, value.month, value.day)).cast(Date)

    if isinstance(value, pli.Series):
        name = value.name
        value = value._s
        e = pli.wrap_expr(pylit(value))
        # Preserve the series name as an alias, except for the empty name.
        if name == "":
            return e
        return e.alias(name)

    if _NUMPY_AVAILABLE and isinstance(value, np.ndarray):
        # Wrap arrays in an (unnamed) Series literal.
        return lit(pli.Series("", value))

    if dtype:
        return pli.wrap_expr(pylit(value)).cast(dtype)

    # numpy literals like np.float32(0)
    # have an item
    if hasattr(value, "item"):
        value = value.item()  # type: ignore[union-attr]
    return pli.wrap_expr(pylit(value))
def lit(
    value: Optional[Union[float, int, str, date, datetime, "pli.Series"]],
    dtype: Optional[Type[DataType]] = None,
) -> "pli.Expr":
    """
    A literal value.

    Parameters
    ----------
    value
        Value that should be used as a `literal`.
    dtype
        Optionally define a dtype.

    Examples
    --------
    Literal integer:

    >>> pl.lit(1)  # doctest: +IGNORE_RESULT

    Literal str:

    >>> pl.lit("foo")  # doctest: +IGNORE_RESULT

    Literal datetime:

    >>> from datetime import datetime
    >>> pl.lit(datetime(2021, 1, 20))  # doctest: +IGNORE_RESULT

    Literal Null:

    >>> pl.lit(None)  # doctest: +IGNORE_RESULT

    Literal eager Series:

    >>> pl.lit(pl.Series("a", [1, 2, 3]))  # doctest: +IGNORE_RESULT
    """
    # The datetime branch intentionally precedes the date branch: datetime is
    # a subclass of date.
    if isinstance(value, datetime):
        tu = "ns" if in_nanoseconds_window(value) else "ms"
        return (lit(_datetime_to_pl_timestamp(
            value, tu)).cast(Datetime).dt.and_time_unit(tu))

    if isinstance(value, timedelta):
        tu = "ns" if timedelta_in_nanoseconds_window(value) else "ms"
        return (lit(_timedelta_to_pl_timedelta(
            value, tu)).cast(Duration).dt.and_time_unit(tu, dtype=Duration))

    if isinstance(value, date):
        # Route plain dates through the datetime branch, then cast to Date.
        return lit(datetime(value.year, value.month, value.day)).cast(Date)

    if isinstance(value, pli.Series):
        # Keep the series name as the literal's alias.
        series_name = value.name
        return pli.wrap_expr(pylit(value._s)).alias(series_name)

    if isinstance(value, np.ndarray):
        # Arrays become (unnamed) Series literals.
        return lit(pli.Series("", value))

    expr = pli.wrap_expr(pylit(value))
    return expr.cast(dtype) if dtype else expr