Пример #1
0
def numpy_to_pydf(
    data: np.ndarray,
    columns: Optional[ColumnsType] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a numpy ndarray.

    Parameters
    ----------
    data
        One- or two-dimensional numpy array.
    columns
        Optional column specification; it is passed to ``_unpack_columns``,
        which yields the column names and a name -> dtype lookup.
    orient
        For two-dimensional input: ``"row"`` builds one column per
        ``data[:, i]``; any other non-None value builds one column per
        ``data[i]``. ``None`` currently behaves like ``"row"`` and emits a
        DeprecationWarning.

    Returns
    -------
    PyDataFrame

    Raises
    ------
    ValueError
        If ``data`` has zero or more than two dimensions, or if the number of
        names in ``columns`` does not match the number of columns in ``data``.
    """
    shape = data.shape
    n_dims = len(shape)

    # Validate the rank before anything indexes into ``shape``: a 0-d array
    # would otherwise fail with a bare IndexError, and a >2-d array could be
    # reported as a column-count mismatch instead of the real problem.
    if n_dims == 0:
        raise ValueError("A numpy array should have at least one dimension.")
    if n_dims > 2:
        raise ValueError(
            "A numpy array should not have more than two dimensions.")

    if shape == (0, ):
        n_columns = 0
    elif n_dims == 1:
        n_columns = 1
    elif orient in ("row", None):
        # ``None`` is treated as "row" (see the deprecation warning below)
        n_columns = shape[1]
    else:
        n_columns = shape[0]

    columns, dtypes = _unpack_columns(columns, n_expected=n_columns)
    if columns and len(columns) != n_columns:
        raise ValueError(
            "Dimensions of columns arg must match data dimensions.")

    if shape == (0, ):
        data_series = []

    elif n_dims == 1:
        data_series = [
            pli.Series(columns[0], data, dtypes.get(columns[0])).inner()
        ]

    else:
        # Two-dimensional input: settle the orientation first.
        if orient is None:
            warnings.warn(
                "Default orientation for constructing DataFrame from numpy "
                'array will change from "row" to "column" in a future version. '
                "Specify orientation explicitly to silence this warning.",
                DeprecationWarning,
                stacklevel=2,
            )
            orient = "row"
        # TODO: when the warning is removed, infer the orientation from
        # ``columns`` instead ("col" if len(columns) == shape[0] else "row").
        if orient == "row":
            data_series = [
                pli.Series(columns[i], data[:, i],
                           dtypes.get(columns[i])).inner()
                for i in range(n_columns)
            ]
        else:
            data_series = [
                pli.Series(columns[i], data[i],
                           dtypes.get(columns[i])).inner()
                for i in range(n_columns)
            ]

    data_series = _handle_columns_arg(data_series, columns=columns)
    return PyDataFrame(data_series)
Пример #2
0
def sequence_to_pydf(
    data: Sequence[Any],
    columns: Optional[ColumnsType] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Build a PyDataFrame out of a generic sequence.

    The type of the first element decides how the input is interpreted:
    Series objects, dicts, nested (non-string) sequences, or plain values
    forming a single column. An empty sequence is delegated to
    ``dict_to_pydf`` with an empty dict.

    Parameters
    ----------
    data
        The sequence to convert.
    columns
        Optional column specification (names and, via ``_unpack_columns``,
        dtypes).
    orient
        Only used for nested sequences. When None and ``columns`` is given,
        it is inferred: "col" if ``len(columns) == len(data)``, else "row".
    """
    series_list: List["PySeries"]

    if len(data) == 0:
        return dict_to_pydf({}, columns=columns)

    first = data[0]

    if isinstance(first, pli.Series):
        fallback_names = [ser.name for ser in data]
        columns, dtypes = _unpack_columns(
            columns or fallback_names, n_expected=len(data)
        )
        series_list = []
        for idx, ser in enumerate(data):
            col_name = columns[idx]
            # TODO: Replace by `if ser.name is None` once allowed
            if not ser.name:
                ser.rename(col_name, in_place=True)

            wanted_dtype = dtypes.get(col_name)
            if wanted_dtype and wanted_dtype != ser.dtype:
                ser = ser.cast(wanted_dtype)

            series_list.append(ser.inner())

    elif isinstance(first, dict):
        frame = PyDataFrame.read_dicts(data)
        return _post_apply_columns(frame, columns) if columns else frame

    elif isinstance(first, Sequence) and not isinstance(first, str):
        # No explicit orientation: guess it from the number of column names.
        if orient is None and columns is not None:
            orient = "col" if len(columns) == len(data) else "row"

        if orient == "row":
            frame = PyDataFrame.read_rows(data)
            return _post_apply_columns(frame, columns) if columns else frame

        columns, dtypes = _unpack_columns(columns, n_expected=len(data))
        series_list = [
            pli.Series(columns[idx], values, dtypes.get(columns[idx])).inner()
            for idx, values in enumerate(data)
        ]

    else:
        # Flat sequence of scalars: a single column.
        columns, dtypes = _unpack_columns(columns, n_expected=1)
        only_name = columns[0]
        series_list = [pli.Series(only_name, data, dtypes.get(only_name)).inner()]

    return PyDataFrame(_handle_columns_arg(series_list, columns=columns))
Пример #3
0
def dict_to_pydf(data: dict[str, Sequence[Any]],
                 columns: ColumnsType | None = None) -> PyDataFrame:
    """
    Construct a PyDataFrame from a dictionary of sequences.

    Parameters
    ----------
    data
        Mapping of column name to column values.
    columns
        Optional column specification. When given, it is unpacked with
        ``_unpack_columns`` into names and a name -> dtype lookup, and the
        series are built one by one with those dtypes.

    Returns
    -------
    PyDataFrame

    Notes
    -----
    Without ``columns``, and when numpy is available and every value is an
    ndarray with more than 1000 elements, the series are built on a thread
    pool; otherwise ``PyDataFrame.read_dict`` is used.
    """
    if columns is not None:
        # the columns arg may also set the dtype of the series
        columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())

        if not data and dtypes:
            # no data, but dtypes were requested: create empty typed series
            data_series = [
                pli.Series(name, [], dtypes.get(name)).inner()
                for name in columns
            ]
        else:
            data_series = [
                pli.Series(name, values, dtypes.get(name)).inner()
                for name, values in data.items()
            ]
        data_series = _handle_columns_arg(data_series, columns=columns)
        return PyDataFrame(data_series)

    if _NUMPY_AVAILABLE:
        # only start a thread pool from a reasonable size.
        all_numpy = all(
            isinstance(val, np.ndarray) and len(val) > 1000
            for val in data.values()
        )

        if all_numpy:
            # yes, multi-threading was easier in python here
            # we cannot run multiple threads that run python code
            # and release the gil in pyo3
            # it will deadlock.

            # dummy is threaded
            import multiprocessing.dummy

            # Use the pool as a context manager so its worker threads are
            # shut down once the (blocking) map has returned, instead of
            # leaking a pool on every call.
            with multiprocessing.dummy.Pool(threadpool_size()) as pool:
                data_series = pool.map(
                    lambda t: pli.Series(t[0], t[1]).inner(),
                    list(data.items()),
                )
            return PyDataFrame(data_series)

    # fast path
    return PyDataFrame.read_dict(data)
Пример #4
0
def numpy_to_pydf(
    data: np.ndarray,
    columns: Optional[Sequence[str]] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Turn a numpy ndarray into a PyDataFrame.

    Series are created with the placeholder names ``column_0``,
    ``column_1``, ... and are then passed through ``_handle_columns_arg``
    together with ``columns``.

    Parameters
    ----------
    data
        Array with at most two dimensions.
    columns
        Optional column names.
    orient
        For 2-D input, "row" takes ``data[:, i]`` as column ``i``; any other
        non-None value takes ``data[i]``. None falls back to "row" and emits
        a DeprecationWarning.

    Raises
    ------
    ValueError
        If the array is neither empty-1-D, 1-D nor 2-D.
    """
    dims = data.shape
    rank = len(dims)

    if dims == (0, ):
        series = []

    elif rank == 1:
        series = [pli.Series("column_0", data).inner()]

    elif rank == 2:
        if orient is None:
            warnings.warn(
                "Default orientation for constructing DataFrame from numpy "
                'array will change from "row" to "column" in a future version. '
                "Specify orientation explicitly to silence this warning.",
                DeprecationWarning,
                stacklevel=2,
            )
            orient = "row"
        # Once the warning is dropped, the orientation should instead be
        # inferred from `columns`:
        # "col" if len(columns) == shape[0] else "row"

        if orient == "row":
            slices = (data[:, idx] for idx in range(dims[1]))
        else:
            slices = (data[idx] for idx in range(dims[0]))

        series = [
            pli.Series(f"column_{idx}", values).inner()
            for idx, values in enumerate(slices)
        ]

    else:
        raise ValueError(
            "A numpy array should not have more than two dimensions.")

    return PyDataFrame(_handle_columns_arg(series, columns=columns))
Пример #5
0
def sequence_to_pydf(
    data: Sequence[Any],
    columns: Optional[Sequence[str]] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Build a PyDataFrame out of a generic sequence.

    The first element determines the interpretation: Series objects, dicts,
    nested (non-string) sequences, or plain values forming one column.
    Series built here get placeholder names ``column_<i>``; ``columns`` is
    applied afterwards.

    Parameters
    ----------
    data
        The sequence to convert.
    columns
        Optional column names.
    orient
        Only used for nested sequences. When None and ``columns`` is given,
        it is inferred: "col" if ``len(columns) == len(data)``, else "row".
    """
    series_list: List["PySeries"]

    if len(data) == 0:
        series_list = []

    elif isinstance(data[0], pli.Series):
        series_list = []
        for idx, ser in enumerate(data):
            # TODO: Replace by `if ser.name is None` once allowed
            if not ser.name:
                ser.rename(f"column_{idx}", in_place=True)
            series_list.append(ser.inner())

    elif isinstance(data[0], dict):
        frame = PyDataFrame.read_dicts(data)
        if columns is not None:
            frame.set_column_names(columns)
        return frame

    elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
        # No explicit orientation: guess it from the number of column names.
        if orient is None and columns is not None:
            orient = "col" if len(columns) == len(data) else "row"

        if orient == "row":
            frame = PyDataFrame.read_rows(data)
            if columns is not None:
                frame.set_column_names(columns)
            return frame

        series_list = [
            pli.Series(f"column_{idx}", values).inner()
            for idx, values in enumerate(data)
        ]

    else:
        # Flat sequence of scalars: a single column.
        series_list = [pli.Series("column_0", data).inner()]

    return PyDataFrame(_handle_columns_arg(series_list, columns=columns))
Пример #6
0
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[ColumnsType] = None,
) -> "PyDataFrame":
    """
    Build a PyDataFrame from a mapping of column name to values.

    ``columns`` is unpacked with ``_unpack_columns`` into names and a
    name -> dtype lookup. If ``data`` is empty but dtypes were supplied, one
    empty series is created per column name; otherwise one series is created
    per dictionary entry, using the dtype registered for its name (if any).
    """
    columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())

    if data or not dtypes:
        named_values = data.items()
    else:
        # nothing to load, but the requested dtypes still define the schema
        named_values = ((name, []) for name in columns)

    series = [
        pli.Series(name, values, dtypes.get(name)).inner()
        for name, values in named_values
    ]
    return PyDataFrame(_handle_columns_arg(series, columns=columns))
Пример #7
0
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[Sequence[str]] = None,
) -> "PyDataFrame":
    """
    Build a PyDataFrame from a mapping of column name to values.

    One series is created per dictionary entry, in iteration order; the
    result is then passed through ``_handle_columns_arg`` with ``columns``.
    """
    series = []
    for name, values in data.items():
        series.append(pli.Series(name, values).inner())
    return PyDataFrame(_handle_columns_arg(series, columns=columns))
Пример #8
0
def _handle_columns_arg(
        data: list[PySeries],
        columns: Sequence[str] | None = None) -> list[PySeries]:
    """
    Rename data according to columns argument.
    """
    if not columns:
        return data
    else:
        if not data:
            return [pli.Series(c, None).inner() for c in columns]
        elif len(data) == len(columns):
            for i, c in enumerate(columns):
                data[i].rename(c)
            return data
        else:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.")
Пример #9
0
        def draw_series(draw: Callable) -> pli.Series:
            # Resolve the dtype (sampled unless fixed by the enclosing scope)
            # and the value strategy that goes with it.
            if dtype is None:
                chosen_dtype = draw(sampled_from(selectable_dtypes))
            else:
                chosen_dtype = dtype
            value_strategy = strategy or dtype_strategy_mapping[chosen_dtype]

            # Resolve the number of values: drawn within the configured
            # bounds unless an explicit size was given.
            if size is None:
                n_values = between(
                    draw,
                    int,
                    min_=(min_size or 0),
                    max_=(max_size or MAX_DATA_SIZE),
                )
            else:
                n_values = size

            # Resolve the name: plain str/None is used as-is, anything else
            # is treated as a strategy and drawn from.
            if isinstance(name, (str, type(None))):
                label = name
            else:
                label = draw(name)

            # Generate the values with the dtype-specific strategy.
            if null_probability == 1:
                generated = [None] * n_values
            elif n_values > 0:
                generated = draw(
                    lists(
                        value_strategy,
                        min_size=n_values,
                        max_size=n_values,
                        unique=unique,
                    ))
            else:
                generated = []

            # Optionally blank out individual values (custom null frequency).
            if 0.0 < null_probability < 1.0:
                for position in range(n_values):
                    if random.random() < null_probability:
                        generated[position] = None

            # Materialise the series from the generated data.
            result = pli.Series(
                name=label,
                dtype=chosen_dtype,
                values=generated,
            )
            return result.cast(Categorical) if is_categorical_dtype(dtype) else result
Пример #10
0
def cut(
    s: pli.Series,
    bins: list[float],
    labels: Optional[list[str]] = None,
    break_point_label: str = "break_point",
    category_label: str = "category",
) -> pli.DataFrame:
    """
    Bin values into discrete values

    .. warning::
        This function is experimental and might change without it being considered a breaking change.

    Parameters
    ----------
    s
        Series to bin.
    bins
        Bins to create.
    labels
        Labels to assign to the bins. If given the length of labels must be len(bins) + 1.
    break_point_label
        Name given to the breakpoint column.
    category_label
        Name given to the category column.

    Returns
    -------
    DataFrame
        The sorted values of `s` joined with their break point and category.

    Raises
    ------
    ValueError
        If `labels` is given and its length is not ``len(bins) + 1``.

    Examples
    --------
    >>> a = pl.Series("a", [v / 10 for v in range(-30, 30, 5)])
    >>> pl.cut(a, bins=[-1, 1])
    shape: (12, 3)
    ┌──────┬─────────────┬──────────────┐
    │ a    ┆ break_point ┆ category     │
    │ ---  ┆ ---         ┆ ---          │
    │ f64  ┆ f64         ┆ cat          │
    ╞══════╪═════════════╪══════════════╡
    │ -3.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -2.0 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ -1.5 ┆ -1.0        ┆ (-inf, -1.0] │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ ...  ┆ ...         ┆ ...          │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.0  ┆ 1.0         ┆ (-1.0, 1.0]  │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 1.5  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.0  ┆ inf         ┆ (1.0, inf]   │
    ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2.5  ┆ inf         ┆ (1.0, inf]   │
    └──────┴─────────────┴──────────────┘

    """
    # Validate before doing any work. The message states the actual
    # requirement because the check also fires when there are too many
    # labels, not only too few.
    if labels and len(labels) != len(bins) + 1:
        raise ValueError(
            f"expected {len(bins) + 1} labels (len(bins) + 1), "
            f"got {len(labels)}")

    var_nm = s.name

    # One row per bin edge, plus a trailing +inf edge for the last bin.
    cuts_df = pli.DataFrame([
        pli.Series(name=break_point_label, values=bins,
                   dtype=Float64).extend_constant(float("inf"), 1)
    ])

    if labels:
        cuts_df = cuts_df.with_column(
            pli.Series(name=category_label, values=labels))
    else:
        # Default labels: "(previous edge, edge]", starting from -inf.
        cuts_df = cuts_df.with_column(
            pli.format(
                "({}, {}]",
                pli.col(break_point_label).shift_and_fill(1, float("-inf")),
                pli.col(break_point_label),
            ).alias(category_label))

    cuts_df = cuts_df.with_column(pli.col(category_label).cast(Categorical))

    # Forward as-of join on the sorted values against the break points.
    result = (s.sort().to_frame().join_asof(
        cuts_df,
        left_on=var_nm,
        right_on=break_point_label,
        strategy="forward",
    ))
    return result
Пример #11
0
def lit(
    value: None |
    (float | int | str | date | datetime | pli.Series | np.ndarray | Any),
    dtype: type[DataType] | None = None,
) -> pli.Expr:
    """
    Create a literal expression from a Python value.

    Parameters
    ----------
    value
        The value to wrap as a `literal`. datetime, timedelta, date, Series
        and numpy arrays each get dedicated handling; everything else is
        passed to ``pylit`` directly.
    dtype
        Optional dtype the literal is cast to. Only consulted for values
        that do not fall into one of the dedicated cases above.

    Examples
    --------
    Literal integer:

    >>> pl.lit(1)  # doctest: +IGNORE_RESULT

    Literal str:

    >>> pl.lit("foo")  # doctest: +IGNORE_RESULT

    Literal datetime:

    >>> from datetime import datetime
    >>> pl.lit(datetime(2021, 1, 20))  # doctest: +IGNORE_RESULT

    Literal Null:

    >>> pl.lit(None)  # doctest: +IGNORE_RESULT

    Literal eager Series:

    >>> pl.lit(pl.Series("a", [1, 2, 3]))  # doctest: +IGNORE_RESULT

    """
    # datetime must be tested before date: every datetime is also a date.
    if isinstance(value, datetime):
        tu = "ns" if in_nanoseconds_window(value) else "ms"
        timestamp = lit(_datetime_to_pl_timestamp(value, tu))
        return timestamp.cast(Datetime).dt.and_time_unit(tu)

    if isinstance(value, timedelta):
        tu = "ns" if timedelta_in_nanoseconds_window(value) else "ms"
        duration = lit(_timedelta_to_pl_timedelta(value, tu))
        return duration.cast(Duration).dt.and_time_unit(tu, dtype=Duration)

    if isinstance(value, date):
        as_datetime = datetime(value.year, value.month, value.day)
        return lit(as_datetime).cast(Date)

    if isinstance(value, pli.Series):
        series_name = value.name
        expr = pli.wrap_expr(pylit(value._s))
        # an empty name is not carried over as an alias
        return expr if series_name == "" else expr.alias(series_name)

    if _NUMPY_AVAILABLE and isinstance(value, np.ndarray):
        return lit(pli.Series("", value))

    if dtype:
        return pli.wrap_expr(pylit(value)).cast(dtype)

    # numpy literals like np.float32(0) expose `item()`; unwrap them to the
    # plain Python value first.
    if hasattr(value, "item"):
        value = value.item()  # type: ignore[union-attr]
    return pli.wrap_expr(pylit(value))
Пример #12
0
def lit(
    value: Optional[Union[float, int, str, date, datetime, "pli.Series"]],
    dtype: Optional[Type[DataType]] = None,
) -> "pli.Expr":
    """
    Create a literal expression from a Python value.

    Parameters
    ----------
    value
        The value to wrap as a `literal`. datetime, timedelta, date, Series
        and numpy arrays each get dedicated handling; everything else is
        passed to ``pylit`` directly.
    dtype
        Optional dtype the literal is cast to. Only consulted for values
        that do not fall into one of the dedicated cases above.

    Examples
    --------

    Literal integer:

    >>> pl.lit(1)  # doctest: +IGNORE_RESULT

    Literal str:

    >>> pl.lit("foo")  # doctest: +IGNORE_RESULT

    Literal datetime:

    >>> from datetime import datetime
    >>> pl.lit(datetime(2021, 1, 20))  # doctest: +IGNORE_RESULT

    Literal Null:

    >>> pl.lit(None)  # doctest: +IGNORE_RESULT

    Literal eager Series:

    >>> pl.lit(pl.Series("a", [1, 2, 3]))  # doctest: +IGNORE_RESULT

    """
    # datetime must be tested before date: every datetime is also a date.
    if isinstance(value, datetime):
        tu = "ns" if in_nanoseconds_window(value) else "ms"
        timestamp = lit(_datetime_to_pl_timestamp(value, tu))
        return timestamp.cast(Datetime).dt.and_time_unit(tu)

    if isinstance(value, timedelta):
        tu = "ns" if timedelta_in_nanoseconds_window(value) else "ms"
        duration = lit(_timedelta_to_pl_timedelta(value, tu))
        return duration.cast(Duration).dt.and_time_unit(tu, dtype=Duration)

    if isinstance(value, date):
        as_datetime = datetime(value.year, value.month, value.day)
        return lit(as_datetime).cast(Date)

    if isinstance(value, pli.Series):
        # the literal keeps the name of the series as its alias
        series_name = value.name
        return pli.wrap_expr(pylit(value._s)).alias(series_name)

    if isinstance(value, np.ndarray):
        return lit(pli.Series("", value))

    expr = pli.wrap_expr(pylit(value))
    return expr.cast(dtype) if dtype else expr