def fill_null(self, fill_value: Union[int, str, "pli.Expr"]) -> "LazyFrame":
    """
    Fill missing values.

    Parameters
    ----------
    fill_value
        Value to fill the missing values with.
    """
    if not isinstance(fill_value, pli.Expr):
        fill_value = pli.lit(fill_value)
    return wrap_ldf(self._ldf.fill_null(fill_value._pyexpr))
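# Usage sketch (illustrative only; `_example_fill_null` is a hypothetical helper,
# not part of the source). Assumes the public `polars` package. A plain value is
# wrapped in a literal expression internally, exactly as the method above does.
def _example_fill_null() -> None:
    import polars as pl

    lf = pl.DataFrame({"a": [1, None, 3], "b": [4, 5, None]}).lazy()
    filled = lf.fill_null(0).collect()               # every null becomes 0
    filled_expr = lf.fill_null(pl.lit(-1)).collect() # an expression works too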
def _assert_series_inner(
    left: pli.Series,
    right: pli.Series,
    check_dtype: bool,
    check_exact: bool,
    nans_compare_equal: bool,
    atol: float,
    rtol: float,
    obj: str,
) -> None:
    """
    Compares Series dtype + values.
    """
    try:
        can_be_subtracted = hasattr(dtype_to_py_type(left.dtype), "__sub__")
    except NotImplementedError:
        can_be_subtracted = False

    check_exact = check_exact or not can_be_subtracted or left.dtype == Boolean
    if check_dtype:
        if left.dtype != right.dtype:
            raise_assert_detail(obj, "Dtype mismatch", left.dtype, right.dtype)

    # create mask of which (if any) values are unequal
    unequal = left != right
    if unequal.any() and nans_compare_equal and left.dtype in (Float32, Float64):
        # handle NaN values (which compare unequal to themselves)
        unequal = unequal & ~(
            (left.is_nan() & right.is_nan()).fill_null(pli.lit(False))
        )

    # assert exact, or with tolerance
    if unequal.any():
        if check_exact:
            raise_assert_detail(
                obj, "Exact value mismatch", left=list(left), right=list(right)
            )
        else:
            # apply check with tolerance, but only to the known-unequal matches
            left, right = left.filter(unequal), right.filter(unequal)
            if ((left - right).abs() > (atol + rtol * right.abs())).sum() != 0:
                raise_assert_detail(
                    obj, "Value mismatch", left=list(left), right=list(right)
                )
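# Usage sketch (illustrative only; the wrapper below is hypothetical). It drives
# `_assert_series_inner` directly with typical tolerances; in the library this
# helper is normally reached through the public assertion utilities in
# `polars.testing` (an assumption about the surrounding module, not shown here).
def _example_assert_series_inner() -> None:
    import polars as pl

    s1 = pl.Series("a", [1.0, float("nan"), 3.0])
    s2 = pl.Series("a", [1.0, float("nan"), 3.0000001])
    # Does not raise: the NaNs compare equal and the remaining difference is
    # within atol + rtol * |right|.
    _assert_series_inner(
        s1,
        s2,
        check_dtype=True,
        check_exact=False,
        nans_compare_equal=True,
        atol=1e-8,
        rtol=1e-5,
        obj="Series",
    )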
def shift_and_fill(
    self, periods: int, fill_value: Union["pli.Expr", int, str, float]
) -> "LazyFrame":
    """
    Shift the values by a given period and fill the parts that will be empty due to
    this operation with the result of the `fill_value` expression.

    Parameters
    ----------
    periods
        Number of places to shift (may be negative).
    fill_value
        Fill None values with the result of this expression.
    """
    if not isinstance(fill_value, pli.Expr):
        fill_value = pli.lit(fill_value)
    return wrap_ldf(self._ldf.shift_and_fill(periods, fill_value._pyexpr))
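# Usage sketch (illustrative only; the helper name is hypothetical). Shifting down
# by one leaves a hole at the top that `fill_value` fills; a negative period
# shifts upward and fills at the bottom instead.
def _example_shift_and_fill() -> None:
    import polars as pl

    lf = pl.DataFrame({"a": [1, 2, 3]}).lazy()
    lf.shift_and_fill(1, 0).collect()    # column "a" becomes [0, 1, 2]
    lf.shift_and_fill(-1, 99).collect()  # column "a" becomes [2, 3, 99]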
def fill_nan(self, fill_value: Union[int, str, float, "pli.Expr"]) -> "LazyFrame":
    """
    Fill floating point NaN values.

    .. warning::
        Note that floating point NaN (Not a Number) values are not missing values!
        To replace missing values, use `fill_null`.

    Parameters
    ----------
    fill_value
        Value to fill the NaN values with.
    """
    if not isinstance(fill_value, pli.Expr):
        fill_value = pli.lit(fill_value)
    return wrap_ldf(self._ldf.fill_nan(fill_value._pyexpr))
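# Usage sketch (illustrative only; the helper name is hypothetical), showing the
# distinction the warning above draws: `fill_nan` touches NaN values but leaves
# nulls alone, while `fill_null` does the opposite.
def _example_fill_nan() -> None:
    import polars as pl

    lf = pl.DataFrame({"a": [1.0, float("nan"), None]}).lazy()
    lf.fill_nan(0.0).collect()   # -> [1.0, 0.0, null]; the missing value survives
    lf.fill_null(0.0).collect()  # -> [1.0, NaN, 0.0]; the NaN survives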
def with_columns(self, exprs: Union[List["pli.Expr"], "pli.Expr"]) -> "LazyFrame":
    """
    Add or overwrite multiple columns in a DataFrame.

    Parameters
    ----------
    exprs
        List of Expressions that evaluate to columns.
    """
    if isinstance(exprs, pli.Expr):
        return self.with_column(exprs)

    pyexprs = []
    for e in exprs:
        if isinstance(e, pli.Expr):
            pyexprs.append(e._pyexpr)
        elif isinstance(e, pli.Series):
            pyexprs.append(pli.lit(e)._pyexpr)

    return wrap_ldf(self._ldf.with_columns(pyexprs))
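# Usage sketch (illustrative only; the helper name is hypothetical). A list of
# expressions adds or overwrites several columns in one pass; a single expression
# is delegated to `with_column`, and a Series is wrapped via `pl.lit` as in the
# loop above.
def _example_with_columns() -> None:
    import polars as pl

    lf = pl.DataFrame({"a": [1, 2, 3]}).lazy()
    out = lf.with_columns(
        [
            (pl.col("a") * 2).alias("a_doubled"),
            pl.col("a").cast(pl.Float64).alias("a_float"),
            pl.Series("b", [7, 8, 9]),  # accepted and wrapped as a literal
        ]
    ).collect()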
def arrow_to_pydf(
    data: "pa.Table", columns: Optional[ColumnsType] = None, rechunk: bool = True
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from an Arrow Table.
    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from an Arrow Table."
        )
    original_columns = columns
    columns, dtypes = _unpack_columns(columns)
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions."
            ) from e

    data_dict = {}
    # dictionaries cannot be built in different batches (categorical does not allow
    # that), so we rechunk them and create them separately.
    dictionary_cols = {}
    names = []
    for i, column in enumerate(data):
        # extract the name before casting
        if column._name is None:
            name = f"column_{i}"
        else:
            name = column._name
        names.append(name)

        column = coerce_arrow(column)
        if pa.types.is_dictionary(column.type):
            ps = arrow_to_pyseries(name, column, rechunk)
            dictionary_cols[i] = pli.wrap_s(ps)
        else:
            data_dict[name] = column

    if len(data_dict) > 0:
        tbl = pa.table(data_dict)

        # path for a table without rows that keeps the datatype
        if tbl.shape[0] == 0:
            pydf = pli.DataFrame._from_pandas(tbl.to_pandas())._df
        else:
            pydf = PyDataFrame.from_arrow_record_batches(tbl.to_batches())
    else:
        pydf = pli.DataFrame([])._df
    if rechunk:
        pydf = pydf.rechunk()

    if len(dictionary_cols) > 0:
        df = pli.wrap_df(pydf)
        df = df.with_columns(
            [pli.lit(s).alias(s.name) for s in dictionary_cols.values()]
        )
        df = df[names]
        pydf = df._df

    if dtypes and original_columns:
        pydf = _post_apply_columns(pydf, original_columns)
    return pydf
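# Usage sketch (illustrative only; the helper name is hypothetical). It builds an
# Arrow table that includes a dictionary-encoded column, so both the regular path
# and the separate dictionary/categorical path above are exercised, then wraps the
# resulting PyDataFrame with `pli.wrap_df` just as the function itself does.
def _example_arrow_to_pydf() -> "pli.DataFrame":
    import pyarrow as pa

    tbl = pa.table(
        {
            "vals": [1, 2, 3],
            "cats": pa.array(["x", "y", "x"]).dictionary_encode(),
        }
    )
    return pli.wrap_df(arrow_to_pydf(tbl))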