def assign( self, df: DataFrame, columns: List[ColumnExpr], metadata: Any = None ) -> DataFrame: """Update existing columns with new values and add new columns :param df: the dataframe to set columns :param columns: column expressions :param metadata: dict-like object to add to the result dataframe, defaults to None. It's currently not used :return: the updated dataframe .. tip:: This can be used to cast data types, alter column values or add new columns. But you can't use aggregation in columns. .. admonition:: New Since :class: hint **0.6.0** .. seealso:: Please find more expression examples in :mod:`fugue.column.sql` and :mod:`fugue.column.functions` .. admonition:: Examples .. code-block:: python # assume df has schema: a:int,b:str # add constant column x engine.assign(df, lit(1,"x")) # change column b to be a constant integer engine.assign(df, lit(1,"b")) # add new x to be a+b engine.assign(df, (col("a")+col("b")).alias("x")) # cast column a data type to double engine.assign(df, col("a").cast(float)) """ SelectColumns( *columns ).assert_no_wildcard().assert_all_with_names().assert_no_agg() cols = [col(n) for n in df.schema.names] for c in columns: if c.output_name not in df.schema: cols.append(c) else: cols[df.schema.index_of_key(c.output_name)] = c return self.select(df, SelectColumns(*cols), metadata=metadata)
def filter( self, df: DataFrame, condition: ColumnExpr, metadata: Any = None ) -> DataFrame: """Filter rows by the given condition :param df: the dataframe to be filtered :param condition: (boolean) column expression :param metadata: dict-like object to add to the result dataframe, defaults to None. It's currently not used :return: the filtered dataframe .. admonition:: New Since :class: hint **0.6.0** .. seealso:: Please find more expression examples in :mod:`fugue.column.sql` and :mod:`fugue.column.functions` .. admonition:: Examples .. code-block:: python import fugue.column.functions as f engine.filter(df, (col("a")>1) & (col("b")=="x")) engine.filter(df, f.coalesce(col("a"),col("b"))>1) """ return self.select( df, cols=SelectColumns(col("*")), where=condition, metadata=metadata )
def aggregate( self, df: DataFrame, partition_spec: Optional[PartitionSpec], agg_cols: List[ColumnExpr], metadata: Any = None, ): """Aggregate on dataframe :param df: the dataframe to aggregate on :param partition_spec: PartitionSpec to specify partition keys :param agg_cols: aggregation expressions :param metadata: dict-like object to add to the result dataframe, defaults to None. It's currently not used :return: the aggregated result as a dataframe .. admonition:: New Since :class: hint **0.6.0** .. seealso:: Please find more expression examples in :mod:`fugue.column.sql` and :mod:`fugue.column.functions` .. admonition:: Examples .. code-block:: python import fugue.column.functions as f # SELECT MAX(b) AS b FROM df engine.aggregate( df, partition_spec=None, agg_cols=[f.max(col("b"))]) # SELECT a, MAX(b) AS x FROM df GROUP BY a engine.aggregate( df, partition_spec=PartitionSpec(by=["a"]), agg_cols=[f.max(col("b")).alias("x")]) """ assert_or_throw(len(agg_cols) > 0, ValueError("agg_cols can't be empty")) assert_or_throw( all(is_agg(x) for x in agg_cols), ValueError("all agg_cols must be aggregation functions"), ) keys: List[ColumnExpr] = [] if partition_spec is not None and len(partition_spec.partition_by) > 0: keys = [col(y) for y in partition_spec.partition_by] cols = SelectColumns(*keys, *agg_cols) return self.select(df, cols=cols, metadata=metadata)
def test_correct_select_schema(): schema = Schema("a:double,b:str") gen = SQLExpressionGenerator() sc = SelectColumns(col("*"), col("c")) output = Schema("a:double,b:str,c:str") c = gen.correct_select_schema(schema, sc, output) assert c is None output = Schema("a:int,b:int,c:str") c = gen.correct_select_schema(schema, sc, output) assert c == "a:double,b:str" sc = SelectColumns(f.count(col("*")).alias("t"), col("c").alias("a")) output = Schema("t:int,a:str") c = gen.correct_select_schema(schema, sc, output) assert c is None sc = SelectColumns((col("a") + col("b")).cast(str).alias("a"), lit(1, "c")) output = Schema("a:int,c:str") c = gen.correct_select_schema(schema, sc, output) assert c == "a:str,c:long"
def test_select_columns(): # not all with names cols = SelectColumns(col("a"), lit(1, "b"), col("bb") + col("cc"), f.first(col("c"))) assert to_uuid(cols) == to_uuid(cols) raises(ValueError, lambda: cols.assert_all_with_names()) # distinct cols2 = SelectColumns( col("a"), lit(1, "b"), col("bb") + col("cc"), f.first(col("c")), arg_distinct=True, ) assert to_uuid(cols) != to_uuid(cols2) # duplicated names cols = SelectColumns(col("a").alias("b"), lit(1, "b")) assert to_uuid(cols) != to_uuid( SelectColumns(col("a").alias("b"), lit(1, "c"))) raises(ValueError, lambda: cols.assert_all_with_names()) # with *, all cols must have alias cols = SelectColumns(col("*"), col("a")).assert_no_agg() raises(ValueError, lambda: cols.assert_all_with_names()) # * can be used at most once raises(ValueError, lambda: SelectColumns(col("*"), col("*"), col("a").alias("p"))) # * can't be used with aggregation raises(ValueError, lambda: SelectColumns(col("*"), f.first(col("a")).alias("x"))) cols = SelectColumns( col("aa").alias("a").cast(int), lit(1, "b"), (col("bb") + col("cc")).alias("c"), f.first(col("c")).alias("d"), ).assert_all_with_names() raises(AssertionError, lambda: cols.assert_no_agg()) assert not cols.simple assert 1 == len(cols.simple_cols) assert "CAST(aa AS long) AS a" == str(cols.simple_cols[0]) assert cols.has_literals assert 1 == len(cols.literals) assert "1 AS b" == str(cols.literals[0]) assert cols.has_agg assert 1 == len(cols.non_agg_funcs) assert "+(bb,cc) AS c" == str(cols.non_agg_funcs[0]) assert 1 == len(cols.agg_funcs) assert "FIRST(c) AS d" == str(cols.agg_funcs[0]) assert 2 == len(cols.group_keys) # a, c assert "aa" == cols.group_keys[0].output_name assert "" == cols.group_keys[1].output_name assert isinstance(cols.group_keys[1], _BinaryOpExpr) cols = SelectColumns(col("a")).assert_no_wildcard() assert cols.simple assert not cols.has_literals assert not cols.has_agg cols = SelectColumns(col("x"), col("*"), col("y") + col("z")) cols = cols.replace_wildcard(Schema("a:int,b:int")) assert "x" == str(cols.all_cols[0])
def test_no_cast(): gen = SQLExpressionGenerator(enable_cast=False) cols = SelectColumns( f.max(col("c")).cast("long").alias("c"), col("a", "aa"), col("b")) assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == gen.select( cols, "t")
def test_select(): gen = SQLExpressionGenerator() # no aggregation cols = SelectColumns(col("*")) assert "SELECT * FROM x" == gen.select(cols, "x") cols = SelectColumns(col("a"), lit(1).alias("b"), (col("b") + col("c")).alias("x")) where = (col("a") > 5).alias("aa") assert "SELECT a, 1 AS b, b+c AS x FROM t WHERE a>5" == gen.select( cols, "t", where=where) # aggregation without literals cols = SelectColumns(f.max(col("c")).alias("c"), col("a", "aa"), col("b")) assert "SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b" == gen.select( cols, "t") where = col("a") < 10 having = (f.max(col("a")) > 5).alias("aaa") assert ( "SELECT MAX(c) AS c, a AS aa, b FROM t WHERE a<10 GROUP BY a, b HAVING MAX(a)>5" == gen.select(cols, "t", where=where, having=having)) cols = SelectColumns( f.min(col("c") + 1).alias("c"), f.avg(col("d") + col("e")).cast(int).alias("d"), ) assert "SELECT MIN(c+1) AS c, CAST(AVG(d+e) AS long) AS d FROM t" == gen.select( cols, "t") # aggregation with literals cols = SelectColumns(lit(1, "k"), f.max(col("c")).alias("c"), lit(2, "j"), col("a", "aa"), col("b")) assert ( "SELECT 1 AS k, c, 2 AS j, aa, b FROM (SELECT MAX(c) AS c, a AS aa, b FROM t GROUP BY a, b)" == gen.select(cols, "t")) cols = SelectColumns(lit(1, "k"), f.max(col("c")).alias("c"), lit(2, "j")) assert "SELECT 1 AS k, c, 2 AS j FROM (SELECT MAX(c) AS c FROM t)" == gen.select( cols, "t") cols = SelectColumns(lit(1, "k"), col("a"), f.max(col("c")).alias("c"), lit(2, "j")) assert ( "SELECT 1 AS k, a, c, 2 AS j FROM (SELECT a, MAX(c) AS c FROM t GROUP BY a)" == gen.select(cols, "t")) # cast cols = SelectColumns( col("c").cast(float), f.avg(col("d") + col("e")).cast(int).alias("d"), ) assert ( "SELECT CAST(c AS double) AS c, CAST(AVG(d+e) AS long) AS d FROM t GROUP BY c" == gen.select(cols, "t")) # infer alias cols = SelectColumns( (-col("c")).cast(float), f.max(col("e")).cast(int), f.avg(col("d") + col("e")).cast(int).alias("d"), ) assert ("SELECT CAST(-c AS double) AS c, CAST(MAX(e) AS long) AS e, " "CAST(AVG(d+e) AS long) AS d FROM t GROUP BY -c" == gen.select( cols, "t"))
def test_select(self): e = self.engine o = ArrayDataFrame( [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int", dict(a=1), ) a = e.to_df(o) # simple b = e.select( a, SelectColumns(col("b"), (col("b") + 1).alias("c").cast(str))) df_eq( b, [[2, "3"], [2, "3"], [1, "2"], [4, "5"], [4, "5"]], "b:int,c:str", throw=True, ) # with distinct b = e.select( a, SelectColumns(col("b"), (col("b") + 1).alias("c").cast(str), arg_distinct=True), ) df_eq( b, [[2, "3"], [1, "2"], [4, "5"]], "b:int,c:str", throw=True, ) # wildcard b = e.select(a, SelectColumns(col("*")), where=col("a") + col("b") == 3) df_eq(b, [[1, 2]], "a:double,b:int", throw=True) # aggregation b = e.select( a, SelectColumns(col("a"), ff.sum(col("b")).cast(float).alias("b"))) df_eq(b, [[1, 2], [3, 4], [None, 7]], "a:double,b:double", throw=True) # having # https://github.com/fugue-project/fugue/issues/222 col_b = ff.sum(col("b")) b = e.select( a, SelectColumns(col("a"), col_b.cast(float).alias("b")), having=(col_b >= 7) | (col("a") == 1), ) df_eq(b, [[1, 2], [None, 7]], "a:double,b:double", throw=True) # literal + alias inference # https://github.com/fugue-project/fugue/issues/222 col_b = ff.sum(col("b")) b = e.select( a, SelectColumns(col("a"), lit(1, "o").cast(str), col_b.cast(float)), having=(col_b >= 7) | (col("a") == 1), ) df_eq(b, [[1, "1", 2], [None, "1", 7]], "a:double,o:str,b:double", throw=True)