Пример #1
0
def apply(
    exprs: List[Union[str, "pli.Expr"]],
    f: Callable[[List["pli.Series"]], Union["pli.Series", Any]],
    return_dtype: Optional[Type[DataType]] = None,
) -> "pli.Expr":
    """
    Build an expression that applies a custom Python function per group.

    The behavior depends on the context the expression is evaluated in:

    ## Context

    * Select/Project
        Not the intended use; reach for `map` instead.
    * GroupBy
        The python function `f` is applied over each group.

    Parameters
    ----------
    exprs
        Column names or expressions providing the input Series for `f`.
    f
        Function to apply over the input.
    return_dtype
        dtype of the output Series.

    Returns
    -------
    Expr
    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    # apply_groups=True is what distinguishes this from `map`.
    grouped = _map_mul(pyexprs, f, return_dtype, apply_groups=True)
    return pli.wrap_expr(grouped)
Пример #2
0
    def sort(
        self,
        by: Union[str, "pli.Expr", List[str], List["pli.Expr"]],
        reverse: Union[bool, List[bool]] = False,
    ) -> "LazyFrame":
        """
        Sort the DataFrame by:

            - A single column name
            - An expression
            - Multiple expressions

        Parameters
        ----------
        by
            Column (expressions) to sort by.
        reverse
            Whether or not to sort in reverse order.

        Returns
        -------
        LazyFrame
        """
        # A plain column name goes straight to the single-column sort.
        # isinstance (instead of an exact type comparison) also routes
        # str subclasses through this path.
        if isinstance(by, str):
            return wrap_ldf(self._ldf.sort(by, reverse))

        # The expression-based sort is given `reverse` as a list, so a lone
        # flag is wrapped.
        if isinstance(reverse, bool):
            reverse = [reverse]

        by = pli.selection_to_pyexpr_list(by)
        return wrap_ldf(self._ldf.sort_by_exprs(by, reverse))
Пример #3
0
def apply(
    exprs: list[str | pli.Expr],
    f: Callable[[list[pli.Series]], pli.Series | Any],
    return_dtype: type[DataType] | None = None,
) -> pli.Expr:
    """
    Build an expression that applies a custom Python function per group.

    The behavior depends on the context the expression is evaluated in:

    * Select
        Don't use apply here, use `map`.
    * GroupBy
        The python function `f` is applied over each group.

    Parameters
    ----------
    exprs
        Column names or expressions providing the input Series for `f`.
    f
        Function to apply over the input.
    return_dtype
        dtype of the output Series.

    Returns
    -------
    Expr
    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    # apply_groups=True is what distinguishes this from `map`.
    grouped = _map_mul(pyexprs, f, return_dtype, apply_groups=True)
    return pli.wrap_expr(grouped)
Пример #4
0
    def agg(self, aggs: Union[List["pli.Expr"], "pli.Expr"]) -> "LazyFrame":
        """
        Specify the aggregation(s) to perform on each group.

        Parameters
        ----------
        aggs
            A single aggregation expression or a list of them.

        Returns
        -------
        LazyFrame

        Examples
        --------

        >>> (
        ...     pl.scan_csv("data.csv")
        ...     .groupby("groups")
        ...     .agg(
        ...         [
        ...             pl.col("name").n_unique().alias("unique_names"),
        ...             pl.max("values"),
        ...         ]
        ...     )
        ... )  # doctest: +SKIP

        """
        pyexprs = pli.selection_to_pyexpr_list(aggs)
        aggregated = self.lgb.agg(pyexprs)
        return wrap_ldf(aggregated)
Пример #5
0
def fold(
    acc: pli.IntoExpr,
    f: Callable[[pli.Series, pli.Series], pli.Series],
    exprs: Sequence[pli.Expr | str] | pli.Expr,
) -> pli.Expr:
    """
    Accumulate over multiple columns horizontally / row wise with a left fold.

    Parameters
    ----------
    acc
        Accumulator expression. This is the value the fold starts from;
        for a sum this could for instance be lit(0).
    f
        Function to apply over the accumulator and the value.
        Fn(acc, value) -> new_value
    exprs
        Expressions to aggregate over. May also be a wildcard expression.

    Returns
    -------
    Expr
    """
    accumulator = pli.expr_to_lit_or_expr(acc, str_to_lit=True)

    # A lone expression, e.g. pl.col("*"), is wrapped in a list first.
    columns = [exprs] if isinstance(exprs, pli.Expr) else exprs

    pyexprs = pli.selection_to_pyexpr_list(columns)
    folded = pyfold(accumulator._pyexpr, f, pyexprs)
    return pli.wrap_expr(folded)
Пример #6
0
def struct(
    exprs: Union[Sequence[Union["pli.Expr", str]], "pli.Expr"]
) -> "pli.Expr":
    """
    Gather several columns into a single Series of dtype Struct.

    Parameters
    ----------
    exprs
        Columns/Expressions to collect into a Struct.

    Returns
    -------
    Expr

    Examples
    --------

    >>> pl.DataFrame(
    ...     {
    ...         "int": [1, 2],
    ...         "str": ["a", "b"],
    ...         "bool": [True, None],
    ...         "list": [[1, 2], [3]],
    ...     }
    ... ).select([pl.struct(pl.all()).alias("my_struct")])
    shape: (2, 1)
    ┌───────────────────────┐
    │ my_struct             │
    │ ---                   │
    │ struct{int, ... list} │
    ╞═══════════════════════╡
    │ {1,"a",true,[1, 2]}   │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {2,"b",null,[3]}      │
    └───────────────────────┘

    Only collect specific columns as a struct:

    >>> df = pl.DataFrame(
    ...     {"a": [1, 2, 3, 4], "b": ["one", "two", "three", "four"], "c": [9, 8, 7, 6]}
    ... )
    >>> df.with_column(pl.struct(pl.col(["a", "b"])).alias("a_and_b"))
    shape: (4, 4)
    ┌─────┬───────┬─────┬───────────────────────────────┐
    │ a   ┆ b     ┆ c   ┆ a_and_b                       │
    │ --- ┆ ---   ┆ --- ┆ ---                           │
    │ i64 ┆ str   ┆ i64 ┆ struct[2]{'a': i64, 'b': str} │
    ╞═════╪═══════╪═════╪═══════════════════════════════╡
    │ 1   ┆ one   ┆ 9   ┆ {1,"one"}                     │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2   ┆ two   ┆ 8   ┆ {2,"two"}                     │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 3   ┆ three ┆ 7   ┆ {3,"three"}                   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 4   ┆ four  ┆ 6   ┆ {4,"four"}                    │
    └─────┴───────┴─────┴───────────────────────────────┘

    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    as_struct = _as_struct(pyexprs)
    return pli.wrap_expr(as_struct)
Пример #7
0
def concat_list(
    exprs: Union[Sequence[Union[str, "pli.Expr", "pli.Series"]], "pli.Expr"]
) -> "pli.Expr":
    """
    Concatenate the arrays in a Series of dtype List in linear time.

    Parameters
    ----------
    exprs
        Columns to concat into a List Series.

    Returns
    -------
    Expr

    Examples
    --------

    Create lagged columns and collect them into a list. This mimics a rolling window.

    >>> df = pl.DataFrame(
    ...     {
    ...         "A": [1.0, 2.0, 9.0, 2.0, 13.0],
    ...     }
    ... )
    >>> (
    ...     df.with_columns(
    ...         [pl.col("A").shift(i).alias(f"A_lag_{i}") for i in range(3)]
    ...     ).select(
    ...         [
    ...             pl.concat_list([f"A_lag_{i}" for i in range(3)][::-1]).alias(
    ...                 "A_rolling"
    ...             )
    ...         ]
    ...     )
    ... )
    shape: (5, 1)
    ┌─────────────────┐
    │ A_rolling       │
    │ ---             │
    │ list [f64]      │
    ╞═════════════════╡
    │ [null, null, 1] │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ [null, 1, 2]    │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ [1, 2, 9]       │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ [2, 9, 2]       │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ [9, 2, 13]      │
    └─────────────────┘

    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    concatenated = _concat_lst(pyexprs)
    return pli.wrap_expr(concatenated)
Пример #8
0
def concat_str(
    exprs: Union[Sequence[Union["pli.Expr", str]], "pli.Expr"],
    sep: str = "",
) -> "pli.Expr":
    """
    Horizontally concat Utf8 Series in linear time. Non-Utf8 columns are cast to Utf8.

    Parameters
    ----------
    exprs
        Columns to concat into a Utf8 Series.
    sep
        String value that will be used to separate the values.

    Returns
    -------
    Expr
    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    joined = _concat_str(pyexprs, sep)
    return pli.wrap_expr(joined)
Пример #9
0
def concat_str(
    exprs: Sequence[pli.Expr | str] | pli.Expr,
    sep: str = "",
) -> pli.Expr:
    """
    Join Utf8 Series horizontally in linear time. Non-Utf8 columns are cast to Utf8.

    Parameters
    ----------
    exprs
        Columns to concat into a Utf8 Series.
    sep
        String value that will be used to separate the values.

    Returns
    -------
    Expr

    Examples
    --------
    >>> df = pl.DataFrame(
    ...     {
    ...         "a": [1, 2, 3],
    ...         "b": ["dogs", "cats", None],
    ...         "c": ["play", "swim", "walk"],
    ...     }
    ... )
    >>> df.with_columns(
    ...     [
    ...         pl.concat_str(
    ...             [
    ...                 pl.col("a") * 2,
    ...                 pl.col("b"),
    ...                 pl.col("c"),
    ...             ],
    ...             sep=" ",
    ...         ).alias("full_sentence"),
    ...     ]
    ... )
    shape: (3, 4)
    ┌─────┬──────┬──────┬───────────────┐
    │ a   ┆ b    ┆ c    ┆ full_sentence │
    │ --- ┆ ---  ┆ ---  ┆ ---           │
    │ i64 ┆ str  ┆ str  ┆ str           │
    ╞═════╪══════╪══════╪═══════════════╡
    │ 1   ┆ dogs ┆ play ┆ 2 dogs play   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2   ┆ cats ┆ swim ┆ 4 cats swim   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 3   ┆ null ┆ walk ┆ null          │
    └─────┴──────┴──────┴───────────────┘

    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    joined = _concat_str(pyexprs, sep)
    return pli.wrap_expr(joined)
Пример #10
0
    def select(
        self,
        exprs: Union[
            str, "pli.Expr", Sequence[str], Sequence["pli.Expr"], "pli.Series"
        ],
    ) -> "LazyFrame":
        """
        Select columns from this DataFrame.

        Parameters
        ----------
        exprs
            Column or columns to select.

        Returns
        -------
        LazyFrame
        """
        pyexprs = pli.selection_to_pyexpr_list(exprs)
        selected = self._ldf.select(pyexprs)
        return wrap_ldf(selected)
Пример #11
0
def min(column: str | list[pli.Expr | str] | pli.Series) -> pli.Expr | Any:
    """
    Get the minimum value.

    Parameters
    ----------
    column
        Column(s) to be used in aggregation. The behavior depends on the input:
        - Series -> the result of `column.min()` is returned directly.
        - list -> an expression aggregating the minimum value horizontally.
        - anything else (e.g. a column name) -> the expression `col(column).min()`.
    """
    if isinstance(column, pli.Series):
        return column.min()

    if isinstance(column, list):
        pyexprs = pli.selection_to_pyexpr_list(column)
        return pli.wrap_expr(_min_exprs(pyexprs))

    return col(column).min()
Пример #12
0
def argsort_by(
    exprs: Union["pli.Expr", str, List[Union["pli.Expr", str]]],
    reverse: Union[List[bool], bool] = False,
) -> "pli.Expr":
    """
    Find the indexes that would sort the columns.

    Argsort by multiple columns. The first column will be used for the ordering.
    If there are duplicates in the first column, the second column will be used to determine the ordering
    and so on.

    Parameters
    ----------
    exprs
        Column(s) used to determine the ordering. A single column name or
        expression is treated as a one-element list.
    reverse
        Default is ascending. A single flag is repeated for every column.

    Returns
    -------
    Expr
    """
    # Wrap a lone name/expression so that len() below counts columns rather
    # than the characters of a column name (or fails on an expression).
    if isinstance(exprs, (str, pli.Expr)):
        exprs = [exprs]
    if not isinstance(reverse, list):
        reverse = [reverse] * len(exprs)
    exprs = pli.selection_to_pyexpr_list(exprs)
    return pli.wrap_expr(pyargsort_by(exprs, reverse))
Пример #13
0
def max(
    column: Union[str, List[Union["pli.Expr", str]], "pli.Series"]
) -> Union["pli.Expr", Any]:
    """
    Get the maximum value. Can be used horizontally or vertically.

    Parameters
    ----------
    column
        Column(s) to be used in aggregation. The behavior depends on the input:
        - Series -> the result of `column.max()` is returned directly.
        - list -> an expression aggregating the maximum value horizontally.
        - anything else (e.g. a column name) -> the expression `col(column).max()`.
    """
    if isinstance(column, pli.Series):
        return column.max()

    if isinstance(column, list):
        pyexprs = pli.selection_to_pyexpr_list(column)
        return pli.wrap_expr(_max_exprs(pyexprs))

    return col(column).max()
Пример #14
0
def map(
    exprs: list[str] | list[pli.Expr],
    f: Callable[[list[pli.Series]], pli.Series],
    return_dtype: type[DataType] | None = None,
) -> pli.Expr:
    """
    Map a custom function over multiple columns/expressions, producing a single Series result.

    Parameters
    ----------
    exprs
        Column names or expressions providing the input Series for `f`.
    f
        Function to apply over the input.
    return_dtype
        dtype of the output Series.

    Returns
    -------
    Expr
    """
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    # apply_groups=False is what distinguishes this from `apply`.
    mapped = _map_mul(pyexprs, f, return_dtype, apply_groups=False)
    return pli.wrap_expr(mapped)
Пример #15
0
def argsort_by(
    exprs: pli.Expr | str | Sequence[pli.Expr | str],
    reverse: list[bool] | bool = False,
) -> pli.Expr:
    """
    Find the indexes that would sort the columns.

    Argsort by multiple columns. The first column determines the ordering;
    ties in the first column are broken by the second column, and so on.

    Parameters
    ----------
    exprs
        Column(s) used to determine the ordering. A single name or
        expression is treated as a one-element list.
    reverse
        Default is ascending. A single bool is repeated for every column.

    Returns
    -------
    Expr
    """
    # A str is itself a Sequence, so it has to be singled out explicitly.
    is_single = isinstance(exprs, str) or not isinstance(exprs, Sequence)
    columns = [exprs] if is_single else exprs

    flags = [reverse] * len(columns) if isinstance(reverse, bool) else reverse

    pyexprs = pli.selection_to_pyexpr_list(columns)
    return pli.wrap_expr(pyargsort_by(pyexprs, flags))
Пример #16
0
    def explode(
        self, columns: Union[str, List[str], "pli.Expr", List["pli.Expr"]]
    ) -> "LazyFrame":
        """
        Explode lists to long format.

        Parameters
        ----------
        columns
            Column name(s) or expression(s) to explode.

        Returns
        -------
        LazyFrame

        Examples
        --------

        >>> df = pl.DataFrame(
        ...     {
        ...         "letters": ["c", "c", "a", "c", "a", "b"],
        ...         "nrs": [[1, 2], [1, 3], [4, 3], [5, 5, 5], [6], [2, 1, 2]],
        ...     }
        ... )
        >>> df
        shape: (6, 2)
        ┌─────────┬────────────┐
        │ letters ┆ nrs        │
        │ ---     ┆ ---        │
        │ str     ┆ list [i64] │
        ╞═════════╪════════════╡
        │ c       ┆ [1, 2]     │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ c       ┆ [1, 3]     │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ a       ┆ [4, 3]     │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ c       ┆ [5, 5, 5]  │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ a       ┆ [6]        │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ b       ┆ [2, 1, 2]  │
        └─────────┴────────────┘
        >>> df.explode("nrs")
        shape: (13, 2)
        ┌─────────┬─────┐
        │ letters ┆ nrs │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ c       ┆ 1   │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
        │ c       ┆ 2   │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
        │ c       ┆ 1   │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
        │ c       ┆ 3   │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
        │ ...     ┆ ... │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
        │ a       ┆ 6   │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
        │ b       ┆ 2   │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
        │ b       ┆ 1   │
        ├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
        │ b       ┆ 2   │
        └─────────┴─────┘

        """
        pyexprs = pli.selection_to_pyexpr_list(columns)
        exploded = self._ldf.explode(pyexprs)
        return wrap_ldf(exploded)
Пример #17
0
def struct(
    exprs: Sequence[pli.Expr | str | pli.Series] | pli.Expr | pli.Series,
    eager: bool = False,
) -> pli.Expr | pli.Series:
    """
    Gather several columns into a single Series of dtype Struct.

    Parameters
    ----------
    exprs
        Columns/Expressions to collect into a Struct.
    eager
        Evaluate immediately. When set, the struct expression is run through
        `pli.select(...)` and the result of `.to_series()` is returned
        instead of the expression.

    Examples
    --------
    >>> pl.DataFrame(
    ...     {
    ...         "int": [1, 2],
    ...         "str": ["a", "b"],
    ...         "bool": [True, None],
    ...         "list": [[1, 2], [3]],
    ...     }
    ... ).select([pl.struct(pl.all()).alias("my_struct")])
    shape: (2, 1)
    ┌─────────────────────┐
    │ my_struct           │
    │ ---                 │
    │ struct[4]           │
    ╞═════════════════════╡
    │ {1,"a",true,[1, 2]} │
    ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ {2,"b",null,[3]}    │
    └─────────────────────┘

    Only collect specific columns as a struct:

    >>> df = pl.DataFrame(
    ...     {"a": [1, 2, 3, 4], "b": ["one", "two", "three", "four"], "c": [9, 8, 7, 6]}
    ... )
    >>> df.with_column(pl.struct(pl.col(["a", "b"])).alias("a_and_b"))
    shape: (4, 4)
    ┌─────┬───────┬─────┬─────────────┐
    │ a   ┆ b     ┆ c   ┆ a_and_b     │
    │ --- ┆ ---   ┆ --- ┆ ---         │
    │ i64 ┆ str   ┆ i64 ┆ struct[2]   │
    ╞═════╪═══════╪═════╪═════════════╡
    │ 1   ┆ one   ┆ 9   ┆ {1,"one"}   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 2   ┆ two   ┆ 8   ┆ {2,"two"}   │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 3   ┆ three ┆ 7   ┆ {3,"three"} │
    ├╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
    │ 4   ┆ four  ┆ 6   ┆ {4,"four"}  │
    └─────┴─────────────┴─────┴─────────────┘

    """
    # The lazy expression is needed on both paths, so build it once.
    pyexprs = pli.selection_to_pyexpr_list(exprs)
    struct_expr = pli.wrap_expr(_as_struct(pyexprs))

    if not eager:
        return struct_expr
    return pli.select(struct_expr).to_series()