def test_map_with_special_values(self):
    def with_nat(cursor, data):
        df = data.as_pandas()
        df["nat"] = pd.NaT
        schema = data.schema + "nat:datetime"
        return PandasDataFrame(df, schema)

    e = self.engine
    # test with multiple partition keys containing null values
    o = ArrayDataFrame(
        [[1, None, 1], [1, None, 0], [None, None, 1]],
        "a:double,b:double,c:int",
        dict(a=1),
    )
    c = e.map(
        o, select_top, o.schema, PartitionSpec(by=["a", "b"], presort="c")
    )
    df_eq(
        c,
        [[1, None, 0], [None, None, 1]],
        "a:double,b:double,c:int",
        throw=True,
    )
    # test datetime with NaT
    dt = datetime.now()
    o = ArrayDataFrame(
        [
            [dt, 2, 1],
            [None, 2, None],
            [None, 1, None],
            [dt, 5, 1],
            [None, 4, None],
        ],
        "a:datetime,b:int,c:double",
        dict(a=1),
    )
    c = e.map(
        o, select_top, o.schema, PartitionSpec(by=["a", "c"], presort="b DESC")
    )
    df_eq(
        c,
        [[None, 4, None], [dt, 5, 1]],
        "a:datetime,b:int,c:double",
        throw=True,
    )
    d = e.map(
        c, with_nat, "a:datetime,b:int,c:double,nat:datetime", PartitionSpec()
    )
    df_eq(
        d,
        [[None, 4, None, None], [dt, 5, 1, None]],
        "a:datetime,b:int,c:double,nat:datetime",
        throw=True,
    )
    # test list column
    o = ArrayDataFrame([[dt, [1, 2]]], "a:datetime,b:[int]")
    c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
    df_eq(c, o, check_order=True, throw=True)
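# `select_top` is a helper defined elsewhere in this suite and referenced by several
# tests here. A minimal sketch of what it could look like, assuming it emits only the
# first row of each partition (the presort determines which row comes first):
def select_top(cursor, data):
    # cursor.row is the first row of the current (presorted) partition
    return ArrayDataFrame([cursor.row], cursor.row_schema)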
def test_comap_with_key(self):
    e = self.engine
    a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
    b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
    c = e.to_df([[6, 1]], "c:int,a:int")
    z1 = e.persist(e.zip(a, b, df1_name="x", df2_name="y"))
    z2 = e.persist(e.zip_all(DataFrames(x=a, y=b, z=b)))
    z3 = e.persist(
        e.zip_all(DataFrames(z=c), partition_spec=PartitionSpec(by=["a"]))
    )

    def comap(cursor, dfs):
        assert dfs.has_key
        v = ",".join([k + str(v.count()) for k, v in dfs.items()])
        keys = cursor.key_value_array
        # if len(keys) == 0:
        #     return ArrayDataFrame([[v]], "v:str")
        return ArrayDataFrame([keys + [v]], cursor.key_schema + "v:str")

    def on_init(partition_no, dfs):
        assert dfs.has_key
        assert partition_no >= 0
        assert len(dfs) > 0

    res = e.comap(
        z1,
        comap,
        "a:int,v:str",
        PartitionSpec(),
        metadata=dict(a=1),
        on_init=on_init,
    )
    df_eq(res, [[1, "x2,y1"]], "a:int,v:str", metadata=dict(a=1), throw=True)
    res = e.comap(
        z2,
        comap,
        "a:int,v:str",
        PartitionSpec(),
        metadata=dict(a=1),
        on_init=on_init,
    )
    df_eq(res, [[1, "x2,y1,z1"]], "a:int,v:str", metadata=dict(a=1), throw=True)
    res = e.comap(
        z3,
        comap,
        "a:int,v:str",
        PartitionSpec(),
        metadata=dict(a=1),
        on_init=on_init,
    )
    df_eq(res, [[1, "z1"]], "a:int,v:str", metadata=dict(a=1), throw=True)
def test_map(self):
    def noop(cursor, data):
        return data

    def on_init(partition_no, data):
        # TODO: this test is not sufficient
        assert partition_no >= 0
        data.peek_array()

    e = self.engine
    o = ArrayDataFrame(
        [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
        "a:double,b:int",
        dict(a=1),
    )
    a = e.to_df(o)
    # no partition
    c = e.map(a, noop, a.schema, PartitionSpec(), dict(a=1))
    df_eq(c, o, throw=True)
    # with key partition
    c = e.map(a, noop, a.schema, PartitionSpec(by=["a"], presort="b"), dict(a=1))
    df_eq(c, o, throw=True)
    # select top
    c = e.map(a, select_top, a.schema, PartitionSpec(by=["a"], presort="b"))
    df_eq(c, [[None, 1], [1, 2], [3, 4]], "a:double,b:int", throw=True)
    # select top with another order
    c = e.map(
        a,
        select_top,
        a.schema,
        PartitionSpec(partition_by=["a"], presort="b DESC"),
        metadata=dict(a=1),
    )
    df_eq(
        c,
        [[None, 4], [1, 2], [3, 4]],
        "a:double,b:int",
        metadata=dict(a=1),
        throw=True,
    )
    # add num_partitions; on_init should not change the result
    c = e.map(
        a,
        select_top,
        a.schema,
        PartitionSpec(partition_by=["a"], presort="b DESC", num_partitions=3),
        on_init=on_init,
    )
    df_eq(c, [[None, 4], [1, 2], [3, 4]], "a:double,b:int", throw=True)
def test__serialize_by_partition(self):
    e = self.engine
    a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
    s = e._serialize_by_partition(
        a, PartitionSpec(by=["a"], presort="b"), df_name="_0"
    )
    assert s.count() == 2
    s = e.persist(e._serialize_by_partition(a, PartitionSpec(), df_name="_0"))
    assert s.count() == 1
    s = e.persist(
        e._serialize_by_partition(a, PartitionSpec(by=["x"]), df_name="_0")
    )
    assert s.count() == 1
def test_comap(self):
    ps = PartitionSpec(presort="b,c")
    e = self.engine
    a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
    b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
    z1 = e.persist(e.zip(a, b))
    z2 = e.persist(e.zip(a, b, partition_spec=ps, how="left_outer"))
    z3 = e.persist(
        e._serialize_by_partition(a, partition_spec=ps, df_name="_x")
    )
    z4 = e.persist(e.zip(a, b, partition_spec=ps, how="cross"))

    def comap(cursor, dfs):
        assert not dfs.has_key
        v = ",".join([k + str(v.count()) for k, v in dfs.items()])
        keys = cursor.key_value_array
        if len(keys) == 0:
            return ArrayDataFrame([[v]], "v:str")
        return ArrayDataFrame([keys + [v]], cursor.key_schema + "v:str")

    def on_init(partition_no, dfs):
        assert not dfs.has_key
        assert partition_no >= 0
        assert len(dfs) > 0

    res = e.comap(
        z1,
        comap,
        "a:int,v:str",
        PartitionSpec(),
        metadata=dict(a=1),
        on_init=on_init,
    )
    df_eq(res, [[1, "_02,_11"]], "a:int,v:str", metadata=dict(a=1), throw=True)
    # for outer joins, the NULL side is filled with an empty dataframe
    res = e.comap(z2, comap, "a:int,v:str", PartitionSpec(), metadata=dict(a=1))
    df_eq(
        res,
        [[1, "_02,_11"], [3, "_01,_10"]],
        "a:int,v:str",
        metadata=dict(a=1),
        throw=True,
    )
    res = e.comap(z3, comap, "v:str", PartitionSpec(), metadata=dict(a=1))
    df_eq(res, [["_03"]], "v:str", metadata=dict(a=1), throw=True)
    res = e.comap(z4, comap, "v:str", PartitionSpec(), metadata=dict(a=1))
    df_eq(res, [["_03,_12"]], "v:str", metadata=dict(a=1), throw=True)
def test_zip(self):
    ps = PartitionSpec(by=["a"], presort="b DESC,c DESC")
    e = self.engine
    a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
    b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
    sa = e._serialize_by_partition(a, ps, df_name="_0")
    sb = e._serialize_by_partition(b, ps, df_name="_1")
    # test zip with serialized dfs
    z1 = e.persist(e.zip(sa, sb, how="inner", partition_spec=ps))
    assert 1 == z1.count()
    assert not z1.metadata.get("serialized_has_name", False)
    z2 = e.persist(e.zip(sa, sb, how="left_outer", partition_spec=ps))
    assert 2 == z2.count()
    # can't have duplicated keys
    raises(ValueError, lambda: e.zip(sa, sa, how="inner", partition_spec=ps))
    # semi and anti joins are not supported
    raises(
        InvalidOperationError,
        lambda: e.zip(sa, sa, how="anti", partition_spec=ps),
    )
    raises(
        InvalidOperationError,
        lambda: e.zip(sa, sa, how="leftsemi", partition_spec=ps),
    )
    raises(
        InvalidOperationError,
        lambda: e.zip(sa, sa, how="LEFT SEMI", partition_spec=ps),
    )
    # can't specify keys for a cross join
    raises(
        InvalidOperationError,
        lambda: e.zip(sa, sa, how="cross", partition_spec=ps),
    )
    # test zip with unserialized dfs
    z3 = e.persist(e.zip(a, b, partition_spec=ps))
    df_eq(z1, z3, throw=True, check_metadata=False)
    z3 = e.persist(e.zip(a, sb, partition_spec=ps))
    df_eq(z1, z3, throw=True, check_metadata=False)
    z3 = e.persist(e.zip(sa, b, partition_spec=ps))
    df_eq(z1, z3, throw=True, check_metadata=False)
    z4 = e.persist(e.zip(a, b, how="left_outer", partition_spec=ps))
    df_eq(z2, z4, throw=True, check_metadata=False)
    z4 = e.persist(e.zip(a, sb, how="left_outer", partition_spec=ps))
    df_eq(z2, z4, throw=True, check_metadata=False)
    z4 = e.persist(e.zip(sa, b, how="left_outer", partition_spec=ps))
    df_eq(z2, z4, throw=True, check_metadata=False)
    z5 = e.persist(e.zip(a, b, how="cross"))
    assert z5.count() == 1
    assert len(z5.schema) == 2
    z6 = e.persist(e.zip(sa, b, how="cross"))
    assert z6.count() == 2
    assert len(z6.schema) == 3
    z7 = e.zip(a, b, df1_name="x", df2_name="y")
    z7.show()
    assert z7.metadata.get("serialized_has_name", False)
def test_map_with_dict_col(self):
    e = self.engine
    dt = datetime.now()
    # test dict column
    o = ArrayDataFrame([[dt, dict(a=1)]], "a:datetime,b:{a:int}")
    c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
    df_eq(c, o, no_pandas=True, check_order=True, throw=True)
def test_aggregate(self):
    e = self.engine
    o = ArrayDataFrame(
        [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
        "a:double,b:int",
        dict(a=1),
    )
    a = e.to_df(o)
    b = e.aggregate(
        df=a,
        partition_spec=None,
        agg_cols=[
            ff.max(col("b")),
            (ff.max(col("b")) * 2).cast("int32").alias("c"),
        ],
    )
    df_eq(b, [[4, 8]], "b:int,c:int", throw=True)

    b = e.aggregate(
        df=a,
        partition_spec=PartitionSpec(by=["a"]),
        agg_cols=[
            ff.max(col("b")),
            (ff.max(col("b")) * 2).cast("int32").alias("c"),
        ],
    )
    df_eq(
        b,
        [[None, 4, 8], [1, 2, 4], [3, 4, 8]],
        "a:double,b:int,c:int",
        throw=True,
    )

    with raises(ValueError):
        e.aggregate(
            df=a,
            partition_spec=PartitionSpec(by=["a"]),
            agg_cols=[ff.max(col("b")), lit(1)],
        )

    with raises(ValueError):
        e.aggregate(
            df=a,
            partition_spec=PartitionSpec(by=["a"]),
            agg_cols=[],
        )
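# The aggregation expressions above use Fugue's column expression API. Assumed
# imports (a sketch; the actual suite may alias them differently):
# from fugue.column import col, lit
# import fugue.column.functions as ff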
def visitFugueZipTask(self, ctx: fp.FugueZipTaskContext) -> WorkflowDataFrame:
    data = self.get_dict(ctx, "dfs", "how")
    partition_spec = PartitionSpec(**self.get_dict(ctx, "by", "presort"))
    # TODO: currently SQL does not support cache to file on ZIP
    return self.workflow.zip(
        data["dfs"], how=data.get("how", "inner"), partition=partition_spec
    )
def test_persist_checkpoint_broadcast():
    dag = FugueWorkflow()
    dag.create(mock_create1).persist()
    dag.create(mock_create1).weak_checkpoint(lazy=True, level="a.b")
    dag.create(mock_create1).broadcast()
    dag.create(mock_create1).weak_checkpoint(level="a.b").broadcast()
    dag.create(mock_create1).checkpoint()
    dag.create(mock_create1).strong_checkpoint(lazy=True)
    dag.create(mock_create1).strong_checkpoint(lazy=True, x="xy z")
    dag.create(mock_create1).strong_checkpoint(
        lazy=False, partition=PartitionSpec(num=5), single=True, x="xy z"
    ).broadcast()
    dag.create(mock_create1).deterministic_checkpoint()
    dag.create(mock_create1).deterministic_checkpoint(
        lazy=False, partition=PartitionSpec(num=4), single=True, namespace="n", x=2
    )
    assert_eq(
        """
        create using mock_create1 persist
        a=create using mock_create1 lazy persist (level="a.b")
        create using mock_create1 broadcast
        a=create using mock_create1 persist(level="a.b") broadcast
        create using mock_create1 checkpoint
        a= create using mock_create1 lazy strong checkpoint
        a=create using mock_create1 lazy checkpoint(x="xy z")
        a=create using mock_create1 checkpoint prepartition 5 single (x="xy z") broadcast
        create using mock_create1 deterministic checkpoint
        create using mock_create1 deterministic checkpoint "n" prepartition 4 single params x=2
        """,
        dag,
    )
def test_zip_all(self):
    e = self.engine
    a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
    z = e.persist(e.zip_all(DataFrames(a)))
    assert 1 == z.count()
    assert z.metadata.get("serialized", False)
    assert not z.metadata.get("serialized_has_name", False)
    z = e.persist(e.zip_all(DataFrames(x=a)))
    assert 1 == z.count()
    assert z.metadata.get("serialized", False)
    assert z.metadata.get("serialized_has_name", False)
    z = e.persist(
        e.zip_all(DataFrames(x=a), partition_spec=PartitionSpec(by=["a"]))
    )
    assert 2 == z.count()
    assert z.metadata.get("serialized", False)
    assert z.metadata.get("serialized_has_name", False)

    b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
    c = e.to_df([[6, 1], [2, 7]], "d:int,a:int")
    z = e.persist(e.zip_all(DataFrames(a, b, c)))
    assert 1 == z.count()
    assert not z.metadata.get("serialized_has_name", False)
    z = e.persist(e.zip_all(DataFrames(x=a, y=b, z=c)))
    assert 1 == z.count()
    assert z.metadata.get("serialized_has_name", False)

    z = e.persist(e.zip_all(DataFrames(b, b)))
    assert 2 == z.count()
    assert not z.metadata.get("serialized_has_name", False)
    assert ["a", "c"] in z.schema
    z = e.persist(e.zip_all(DataFrames(x=b, y=b)))
    assert 2 == z.count()
    assert z.metadata.get("serialized_has_name", False)
    assert ["a", "c"] in z.schema
    z = e.persist(
        e.zip_all(DataFrames(b, b), partition_spec=PartitionSpec(by=["a"]))
    )
    assert 2 == z.count()
    assert not z.metadata.get("serialized_has_name", False)
    assert "c" not in z.schema
def test_map_with_binary(self):
    e = self.engine
    o = ArrayDataFrame(
        [[pickle.dumps(BinaryObject("a"))], [pickle.dumps(BinaryObject("b"))]],
        "a:bytes",
    )
    c = e.map(o, binary_map, o.schema, PartitionSpec())
    expected = ArrayDataFrame(
        [
            [pickle.dumps(BinaryObject("ax"))],
            [pickle.dumps(BinaryObject("bx"))],
        ],
        "a:bytes",
    )
    df_eq(expected, c, no_pandas=True, check_order=True, throw=True)
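# `BinaryObject` and `binary_map` are helpers defined elsewhere in the suite. A
# minimal sketch under the assumption that BinaryObject wraps a string and that
# binary_map unpickles each bytes cell, appends "x", and pickles it back:
class BinaryObject(object):
    def __init__(self, data=None):
        self.data = data


def binary_map(cursor, data):
    arr = data.as_array(type_safe=True)
    for row in arr:
        obj = pickle.loads(row[0])
        obj.data += "x"
        row[0] = pickle.dumps(obj)
    return ArrayDataFrame(arr, data.schema)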
def visitFugueTakeTask(self, ctx: fp.FugueTakeTaskContext):
    data = self.get_dict(ctx, "partition", "presort", "df")
    if "df" in data:
        df = data["df"]
    else:
        df = self.last
    params: Dict[str, Any] = {}
    params["n"] = int(self.ctxToStr(ctx.rows)) or 20  # default is 20
    params["na_position"] = "first" if ctx.FIRST() is not None else "last"
    if data.get("partition"):
        _partition_spec = PartitionSpec(data.get("partition"))
        return df.partition(
            by=_partition_spec.partition_by, presort=_partition_spec.presort
        ).take(**params)
    else:
        if data.get("presort"):
            params["presort"] = data.get("presort")
        return df.take(**params)
def test_save_and_load_json(self):
    e = self.engine
    b = ArrayDataFrame([[6, 1], [3, 4], [2, 7], [4, 8], [6, 7]], "c:int,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    e.save_df(
        e.repartition(e.to_df(b), PartitionSpec(num=2)),
        path,
        format_hint="json",
    )
    c = e.load_df(
        path,
        format_hint="json",
        columns=["a", "c"],
    )
    df_eq(c, [[1, 6], [7, 2], [4, 3], [8, 4], [7, 6]], "a:long,c:long", throw=True)
def visitFuguePrepartition(
    self, ctx: fp.FuguePrepartitionContext
) -> PartitionSpec:
    params = self.get_dict(ctx, "algo", "num", "by", "presort")
    return PartitionSpec(**params)
def test_take(self):
    e = self.engine
    ps = PartitionSpec(by=["a"], presort="b DESC,c DESC")
    ps2 = PartitionSpec(by=["c"], presort="b ASC")
    a = e.to_df(
        [
            [1, 2, 3],
            [1, 3, 4],
            [2, 1, 2],
            [2, 2, 2],
            [None, 4, 2],
            [None, 2, 1],
        ],
        "a:double,b:double,c:double",
    )
    b = e.take(a, n=1, presort="b desc", metadata=dict(a=1))
    c = e.take(a, n=2, presort="a desc", na_position="first")
    d = e.take(a, n=1, presort="a asc, b desc", partition_spec=ps)
    f = e.take(a, n=1, presort=None, partition_spec=ps2)
    g = e.take(a, n=2, presort="a desc", na_position="last")
    h = e.take(a, n=2, presort="a", na_position="first")
    df_eq(
        b,
        [[None, 4, 2]],
        "a:double,b:double,c:double",
        metadata=dict(a=1),
        throw=True,
    )
    df_eq(
        c,
        [[None, 4, 2], [None, 2, 1]],
        "a:double,b:double,c:double",
        throw=True,
    )
    df_eq(
        d,
        [[1, 3, 4], [2, 2, 2], [None, 4, 2]],
        "a:double,b:double,c:double",
        throw=True,
    )
    df_eq(
        f,
        [[1, 2, 3], [1, 3, 4], [2, 1, 2], [None, 2, 1]],
        "a:double,b:double,c:double",
        throw=True,
    )
    df_eq(
        g,
        [[2, 1, 2], [2, 2, 2]],
        "a:double,b:double,c:double",
        throw=True,
    )
    df_eq(
        h,
        [
            [None, 4, 2],
            [None, 2, 1],
        ],
        "a:double,b:double,c:double",
        throw=True,
    )
    raises(ValueError, lambda: e.take(a, n=0.5, presort=None))