def to_validation_rules(data: Dict[str, Any]) -> Dict[str, Any]:
    """Convert raw validation settings into normalized rule values.

    Handling per key:

    * ``partitionby_has`` / ``partitionby_is``: a comma separated string is
      split and stripped, then the value is passed through ``PartitionSpec``
      and its ``partition_by`` is stored.
    * ``presort_has`` / ``presort_is``: parsed by ``parse_presort_exp`` and
      stored as a list of the resulting items.
    * ``input_has``: a string is stripped of spaces and split on commas; a
      list has spaces removed from every element.
    * ``input_is``: stored as ``str(Schema(value))``.

    :param data: mapping of rule name to its raw value
    :return: a new dict with the same keys and normalized values
    :raises SyntaxError: if ``input_has`` is neither a string nor a list, or
        if ``input_is`` raises ``SyntaxError`` while building the schema
    :raises NotImplementedError: for any key not listed above
    """
    rules: Dict[str, Any] = {}
    for key, value in data.items():
        if key in ("partitionby_has", "partitionby_is"):
            by = value
            if isinstance(by, str):
                by = [part.strip() for part in by.split(",")]
            rules[key] = PartitionSpec(by=by).partition_by
            continue

        if key in ("presort_has", "presort_is"):
            rules[key] = list(parse_presort_exp(value).items())
            continue

        if key == "input_has":
            if isinstance(value, str):
                rules[key] = value.replace(" ", "").split(",")
                continue
            assert_or_throw(
                isinstance(value, list),
                lambda: SyntaxError(f"{value} is neither a string or a list"),
            )
            rules[key] = [item.replace(" ", "") for item in value]
            continue

        if key == "input_is":
            try:
                rules[key] = str(Schema(value))
            except SyntaxError:
                # Re-raised with a message naming the rule that failed.
                raise SyntaxError(  # pylint: disable=W0707
                    f"for input_is, the input must be a schema expression {value}"
                )
            continue

        raise NotImplementedError(key)
    return rules
def take(
    self,
    df: DataFrame,
    n: int,
    presort: str,
    na_position: str = "last",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    metadata: Any = None,
) -> DataFrame:
    """Return the first ``n`` rows, overall or per partition.

    :param df: input dataframe, converted with ``self.to_df``
    :param n: number of rows to keep (per partition when partition keys are
        set); must be an ``int``
    :param presort: presort expression; when non-empty it takes precedence
        over ``partition_spec.presort``
    :param na_position: ``"last"`` puts nulls last; any other value puts
        nulls first
    :param partition_spec: supplies the partition keys and fallback presort
    :param metadata: forwarded to ``self.to_df`` for the result
    :return: ``self.to_df`` of the result, using the schema of ``df``
    :raises ValueError: if ``n`` is not an integer
    """
    assert_or_throw(
        isinstance(n, int),
        ValueError("n needs to be an integer"),
    )
    sdf = self.to_df(df).native
    nulls_last = na_position == "last"

    if presort:
        presort = parse_presort_exp(presort)
    # An explicit presort wins over the one carried by the partition spec
    sort_spec: IndexedOrderedDict = presort or partition_spec.presort

    def _order_column(name: str, ascending: bool) -> Any:
        column = col(name)
        if ascending:
            if nulls_last:
                return column.asc_nulls_last()
            return column.asc_nulls_first()
        if nulls_last:
            return column.desc_nulls_last()
        return column.desc_nulls_first()

    # Empty when there is nothing to sort by
    ordering = [_order_column(name, asc) for name, asc in sort_spec.items()]

    if len(partition_spec.partition_by) == 0:
        # No partitioning: optional global sort followed by a limit
        if ordering:
            sdf = sdf.orderBy(ordering)
        sdf = sdf.limit(n)
    else:
        window = Window.partitionBy([col(key) for key in partition_spec.partition_by])
        if ordering:
            window = window.orderBy(ordering)
        else:
            # row_number() still needs an orderBy
            window = window.orderBy(lit(1))
        numbered = sdf.select(
            col("*"), row_number().over(window).alias("__row_number__")
        )
        sdf = numbered.filter(col("__row_number__") <= n).drop("__row_number__")

    return self.to_df(sdf, df.schema, metadata)
def _select_top(self, df: DataFrame, top_n: int): if top_n > 0: if len(self.partition_spec.partition_by) > 0: p_keys = ", ".join(self.partition_spec.partition_by) if len(self.partition_spec.presort) > 0: sort_expr = f"ORDER BY {self.partition_spec.presort_expr}" else: sort_expr = "" cols = ", ".join(df.schema.names) sql = """ SELECT {cols} FROM ( SELECT *, ROW_NUMBER() OVER(PARTITION BY {p_keys} {sort_expr}) AS __top_row_number__ FROM __plot_df__) WHERE __top_row_number__ <= {top_n} """.format(cols=cols, p_keys=p_keys, sort_expr=sort_expr, top_n=top_n) df = self.execution_engine.default_sql_engine.select( DataFrames(__plot_df__=df), sql) else: order_expr = "" if "order_by" in self.params: order_by = parse_presort_exp( self.params.get_or_throw("order_by", object)) if len(order_by) > 0: order_expr = "ORDER BY " + ", ".join( k + " " + ("ASC" if v else "DESC") for k, v in order_by.items()) sql = """ SELECT * FROM __plot_df__ {order_expr} LIMIT {top_n} """.format(order_expr=order_expr, top_n=top_n) df = self.execution_engine.default_sql_engine.select( DataFrames(__plot_df__=df), sql) return df
def process(self, dfs: DataFrames) -> None:
    """Select the top rows of the first dataframe and hand them to ``_plot``.

    Params other than the reserved plotting keys are forwarded to ``_plot``
    as extra keyword arguments. The ordering passed to ``_plot`` comes from
    the ``order_by`` param when present, otherwise from the partition
    spec's presort.

    :param dfs: input dataframes; only ``dfs[0]`` is used
    """
    reserved = ("top_n", "order_by", "x", "y", "kind", "width", "height", "group")
    extra: Dict[str, Any] = {
        name: value for name, value in self.params.items() if name not in reserved
    }

    limit = self.params.get("top_n", 0)
    pdf = self._select_top(dfs[0], limit).as_pandas()

    order_by: Any = (
        parse_presort_exp(self.params.get_or_throw("order_by", object))
        if "order_by" in self.params
        else self.partition_spec.presort
    )

    # Values are read in the same order as they are passed to _plot
    partition_keys = self.partition_spec.partition_by
    x = self.params.get_or_throw("x", str)
    y = self.params.get_or_none("y", object)
    kind = self.params.get("kind", self.kind)
    width = self.params.get("width", 1.0)
    height = self.params.get("height", 0.5)
    group = self.params.get_or_none("group", object)

    self._plot(
        pdf,
        partition_keys,
        x=x,
        y=y,
        kind=kind,
        width=width,
        height=height,
        order_by=order_by,
        group=group,
        **extra,
    )
def validate_on_compile(self) -> None:
    """Check the plot parameters before execution.

    Rules enforced here:

    * when ``self.kind`` is empty, a string ``kind`` param is required;
      otherwise ``kind`` must not appear in the params
    * ``order_by`` (when given) must be accepted by ``parse_presort_exp``
    * ``x`` is required as a string
    * when ``group`` is set, ``y`` must be a string
    * ``width`` must be ``0.5`` or ``1.0``

    :raises ValueError: if ``width`` is neither ``0.5`` nor ``1.0``
    """
    if self.kind == "":
        self.params.get_or_throw("kind", str)
    else:
        assert_or_throw("kind" not in self.params, f"can't reset kind {self.kind}")

    # The results below are discarded; the lookups are made only for the
    # checks they perform (presumably type checks against the default value;
    # confirm against the params implementation).
    self.params.get("top_n", 0)
    # "a" is only a placeholder so parsing succeeds when order_by is absent
    parse_presort_exp(self.params.get("order_by", "a"))
    self.params.get_or_throw("x", str)

    y = self.params.get_or_none("y", object)
    gp = self.params.get_or_none("group", object)
    assert_or_throw(
        gp is None or isinstance(y, str),
        "when group is set, y must be set as a string",
    )

    self.params.get("height", 0.5)
    width = self.params.get("width", 1.0)
    # Name the offending parameter and value instead of raising a bare
    # ValueError with no message.
    assert_or_throw(
        width in [0.5, 1.0],
        ValueError(f"width must be 0.5 or 1.0, got {width!r}"),
    )
def take(
    self,
    df: DataFrame,
    n: int,
    presort: str,
    na_position: str = "last",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    metadata: Any = None,
) -> DataFrame:
    """Return the first ``n`` rows, overall or per partition key group.

    :param df: input dataframe, converted with ``self.to_df``
    :param n: number of rows to keep (per group when partition keys are
        set); must be an ``int``
    :param presort: presort expression; when non-empty it takes precedence
        over ``partition_spec.presort``
    :param na_position: forwarded to ``sort_values``
    :param partition_spec: supplies the partition keys and fallback presort
    :param metadata: forwarded to the resulting ``DaskDataFrame``
    :return: a ``DaskDataFrame`` with the schema of ``df``
    :raises ValueError: if ``n`` is not an integer
    """
    assert_or_throw(
        isinstance(n, int),
        ValueError("n needs to be an integer"),
    )
    d = self.to_df(df).native
    meta = [(d[x].name, d[x].dtype) for x in d.columns]

    if presort:
        presort = parse_presort_exp(presort)
    # Use presort over partition_spec.presort if possible
    _presort: IndexedOrderedDict = presort or partition_spec.presort

    def _partition_take(partition, n, presort):
        # Sort (when requested) and keep the first n rows of one chunk
        if len(presort.keys()) > 0:
            partition = partition.sort_values(
                list(presort.keys()),
                ascending=list(presort.values()),
                na_position=na_position,
            )
        return partition.head(n)

    if len(partition_spec.partition_by) == 0:
        if len(_presort.keys()) == 0:
            # Dask's head() only inspects the first partition by default
            # (npartitions=1), so it can return fewer than n rows even when
            # later partitions hold more. npartitions=-1 uses all partitions.
            d = d.head(n, npartitions=-1)
        else:
            # Take the top n of every partition first, so only a small
            # candidate set is materialized
            d = (
                d.map_partitions(_partition_take, n, _presort, meta=meta)
                .reset_index(drop=True)
                .compute()
            )
            # compute() brings this to Pandas so we can use pandas
            d = d.sort_values(
                list(_presort.keys()),
                ascending=list(_presort.values()),
                na_position=na_position,
            ).head(n)
    else:
        d = (
            d.groupby(partition_spec.partition_by, dropna=False)
            .apply(_partition_take, n=n, presort=_presort, meta=meta)
            .reset_index(drop=True)
        )

    return DaskDataFrame(d, df.schema, metadata)
def take(
    self,
    df: DataFrame,
    n: int,
    presort: str,
    na_position: str = "last",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    metadata: Any = None,
) -> DataFrame:
    """Return the first ``n`` rows, overall or per partition key group.

    The data is obtained with ``df.as_pandas()``, sorted when a presort is
    available, and then cut with ``head(n)``, per group when partition keys
    are set.

    :param df: input dataframe
    :param n: number of rows to keep (per group when partition keys are
        set); must be an ``int``
    :param presort: presort expression; when non-empty it takes precedence
        over ``partition_spec.presort``
    :param na_position: forwarded to ``sort_values``
    :param partition_spec: supplies the partition keys and fallback presort
    :param metadata: forwarded to the resulting ``PandasDataFrame``
    :return: a ``PandasDataFrame`` with the schema of ``df`` and a reset
        index
    :raises ValueError: if ``n`` is not an integer
    """
    assert_or_throw(
        isinstance(n, int),
        ValueError("n needs to be an integer"),
    )
    pdf = df.as_pandas()

    # An explicit presort wins over the one carried by the partition spec
    if presort:
        presort = parse_presort_exp(presort)
    sort_spec: IndexedOrderedDict = presort or partition_spec.presort

    if len(sort_spec.keys()) > 0:
        sort_columns = list(sort_spec.keys())
        directions = list(sort_spec.values())
        pdf = pdf.sort_values(
            sort_columns,
            ascending=directions,
            na_position=na_position,
        )

    keys = partition_spec.partition_by
    top = (
        pdf.head(n)
        if len(keys) == 0
        else pdf.groupby(by=keys, dropna=False).head(n)
    )
    return PandasDataFrame(
        top.reset_index(drop=True),
        df.schema,
        metadata,
        pandas_df_wrapper=True,
    )
def test_parse_presort_exp():
    """Check accepted and rejected inputs of ``parse_presort_exp``."""
    # (input, expected items of the resulting IndexedOrderedDict)
    accepted = [
        (None, []),
        (IndexedOrderedDict([("c", True)]), [("c", True)]),
        ("c", [("c", True)]),
        (" c", [("c", True)]),
        ("c desc", [("c", False)]),
        ("b desc, c asc", [("b", False), ("c", True)]),
        ("DESC DESC, ASC ASC", [("DESC", False), ("ASC", True)]),
        ([("b", False), ("c", True)], [("b", False), ("c", True)]),
        ("B DESC, C ASC", [("B", False), ("C", True)]),
        ("b desc, c asc", [("b", False), ("c", True)]),
    ]
    for expression, expected_items in accepted:
        assert parse_presort_exp(expression) == IndexedOrderedDict(expected_items)

    rejected = [
        "b dsc, c asc",  # misspelling of desc
        "c true",  # string format needs desc/asc
        "c true, c true",  # cannot contain duplicates
        [("b", "desc"), ("c", "asc")],  # instead of desc and asc, needs to be bool
    ]
    for expression in rejected:
        with raises(SyntaxError):
            parse_presort_exp(expression)