Example #1
        def test_map_with_special_values(self):
            def with_nat(cursor, data):
                df = data.as_pandas()
                df["nat"] = pd.NaT
                schema = data.schema + "nat:datetime"
                return PandasDataFrame(df, schema)

            e = self.engine
            # test with multiple key with null values
            o = ArrayDataFrame(
                [[1, None, 1], [1, None, 0], [None, None, 1]],
                "a:double,b:double,c:int",
                dict(a=1),
            )
            c = e.map(
                o, select_top, o.schema, PartitionSpec(by=["a", "b"], presort="c")
            )
            df_eq(
                c,
                [[1, None, 0], [None, None, 1]],
                "a:double,b:double,c:int",
                throw=True,
            )
            # test datetime with nat
            dt = datetime.now()
            o = ArrayDataFrame(
                [
                    [dt, 2, 1],
                    [None, 2, None],
                    [None, 1, None],
                    [dt, 5, 1],
                    [None, 4, None],
                ],
                "a:datetime,b:int,c:double",
                dict(a=1),
            )
            c = e.map(
                o, select_top, o.schema, PartitionSpec(by=["a", "c"], presort="b DESC")
            )
            df_eq(
                c,
                [[None, 4, None], [dt, 5, 1]],
                "a:datetime,b:int,c:double",
                throw=True,
            )
            d = e.map(
                c, with_nat, "a:datetime,b:int,c:double,nat:datetime", PartitionSpec()
            )
            df_eq(
                d,
                [[None, 4, None, None], [dt, 5, 1, None]],
                "a:datetime,b:int,c:double,nat:datetime",
                throw=True,
            )
            # test list
            o = ArrayDataFrame([[dt, [1, 2]]], "a:datetime,b:[int]")
            c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
            df_eq(c, o, check_order=True, throw=True)
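This and several of the examples below call a select_top helper that is not reproduced in this listing. Below is a minimal sketch of such a helper, together with the imports these snippets appear to assume; the module paths and the df_eq alias are assumptions, not taken from the original suite.

from datetime import datetime
import pandas as pd

from fugue import ArrayDataFrame, PandasDataFrame, PartitionSpec
from fugue.dataframe.utils import _df_eq as df_eq  # assumed alias for the comparison helper


def select_top(cursor, data):
    # Sketch only: keep the row the partition cursor points to; after a presort
    # this is the "top" row of each partition.
    return ArrayDataFrame([cursor.row], cursor.row_schema)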
Example #2
        def test_comap_with_key(self):
            e = self.engine
            a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
            b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
            c = e.to_df([[6, 1]], "c:int,a:int")
            z1 = e.persist(e.zip(a, b, df1_name="x", df2_name="y"))
            z2 = e.persist(e.zip_all(DataFrames(x=a, y=b, z=b)))
            z3 = e.persist(
                e.zip_all(DataFrames(z=c), partition_spec=PartitionSpec(by=["a"]))
            )

            def comap(cursor, dfs):
                assert dfs.has_key
                v = ",".join([k + str(v.count()) for k, v in dfs.items()])
                keys = cursor.key_value_array
                # if len(keys) == 0:
                #    return ArrayDataFrame([[v]], "v:str")
                return ArrayDataFrame([keys + [v]], cursor.key_schema + "v:str")

            def on_init(partition_no, dfs):
                assert dfs.has_key
                assert partition_no >= 0
                assert len(dfs) > 0

            res = e.comap(
                z1,
                comap,
                "a:int,v:str",
                PartitionSpec(),
                metadata=dict(a=1),
                on_init=on_init,
            )
            df_eq(res, [[1, "x2,y1"]], "a:int,v:str", metadata=dict(a=1), throw=True)

            res = e.comap(
                z2,
                comap,
                "a:int,v:str",
                PartitionSpec(),
                metadata=dict(a=1),
                on_init=on_init,
            )
            df_eq(res, [[1, "x2,y1,z1"]], "a:int,v:str", metadata=dict(a=1), throw=True)

            res = e.comap(
                z3,
                comap,
                "a:int,v:str",
                PartitionSpec(),
                metadata=dict(a=1),
                on_init=on_init,
            )
            df_eq(res, [[1, "z1"]], "a:int,v:str", metadata=dict(a=1), throw=True)
Example #3
        def test_map(self):
            def noop(cursor, data):
                return data

            def on_init(partition_no, data):
                # TODO: this test is not sufficient
                assert partition_no >= 0
                data.peek_array()

            e = self.engine
            o = ArrayDataFrame(
                [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                "a:double,b:int",
                dict(a=1),
            )
            a = e.to_df(o)
            # no partition
            c = e.map(a, noop, a.schema, PartitionSpec(), dict(a=1))
            df_eq(c, o, throw=True)
            # with key partition
            c = e.map(a, noop, a.schema, PartitionSpec(by=["a"], presort="b"),
                      dict(a=1))
            df_eq(c, o, throw=True)
            # select top
            c = e.map(a, select_top, a.schema,
                      PartitionSpec(by=["a"], presort="b"))
            df_eq(c, [[None, 1], [1, 2], [3, 4]], "a:double,b:int", throw=True)
            # select top with another order
            c = e.map(
                a,
                select_top,
                a.schema,
                PartitionSpec(partition_by=["a"], presort="b DESC"),
                metadata=dict(a=1),
            )
            df_eq(
                c,
                [[None, 4], [1, 2], [3, 4]],
                "a:double,b:int",
                metadata=dict(a=1),
                throw=True,
            )
            # add num_partitions, on_init should not matter
            c = e.map(
                a,
                select_top,
                a.schema,
                PartitionSpec(partition_by=["a"],
                              presort="b DESC",
                              num_partitions=3),
                on_init=on_init,
            )
            df_eq(c, [[None, 4], [1, 2], [3, 4]], "a:double,b:int", throw=True)
Example #4
 def test__serialize_by_partition(self):
     e = self.engine
     a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
     s = e._serialize_by_partition(
         a, PartitionSpec(by=["a"], presort="b"), df_name="_0"
     )
     assert s.count() == 2
     s = e.persist(e._serialize_by_partition(a, PartitionSpec(), df_name="_0"))
     assert s.count() == 1
     s = e.persist(
         e._serialize_by_partition(a, PartitionSpec(by=["x"]), df_name="_0")
     )
     assert s.count() == 1
Example #5
        def test_comap(self):
            ps = PartitionSpec(presort="b,c")
            e = self.engine
            a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
            b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
            z1 = e.persist(e.zip(a, b))
            z2 = e.persist(e.zip(a, b, partition_spec=ps, how="left_outer"))
            z3 = e.persist(
                e._serialize_by_partition(a, partition_spec=ps, df_name="_x")
            )
            z4 = e.persist(e.zip(a, b, partition_spec=ps, how="cross"))

            def comap(cursor, dfs):
                assert not dfs.has_key
                v = ",".join([k + str(v.count()) for k, v in dfs.items()])
                keys = cursor.key_value_array
                if len(keys) == 0:
                    return ArrayDataFrame([[v]], "v:str")
                return ArrayDataFrame([keys + [v]], cursor.key_schema + "v:str")

            def on_init(partition_no, dfs):
                assert not dfs.has_key
                assert partition_no >= 0
                assert len(dfs) > 0

            res = e.comap(
                z1,
                comap,
                "a:int,v:str",
                PartitionSpec(),
                metadata=dict(a=1),
                on_init=on_init,
            )
            df_eq(res, [[1, "_02,_11"]], "a:int,v:str", metadata=dict(a=1), throw=True)

            # for outer joins, the NULL will be filled with empty dataframe
            res = e.comap(z2, comap, "a:int,v:str", PartitionSpec(), metadata=dict(a=1))
            df_eq(
                res,
                [[1, "_02,_11"], [3, "_01,_10"]],
                "a:int,v:str",
                metadata=dict(a=1),
                throw=True,
            )

            res = e.comap(z3, comap, "v:str", PartitionSpec(), metadata=dict(a=1))
            df_eq(res, [["_03"]], "v:str", metadata=dict(a=1), throw=True)

            res = e.comap(z4, comap, "v:str", PartitionSpec(), metadata=dict(a=1))
            df_eq(res, [["_03,_12"]], "v:str", metadata=dict(a=1), throw=True)
Example #6
        def test_zip(self):
            ps = PartitionSpec(by=["a"], presort="b DESC,c DESC")
            e = self.engine
            a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
            b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
            sa = e._serialize_by_partition(a, ps, df_name="_0")
            sb = e._serialize_by_partition(b, ps, df_name="_1")
            # test zip with serialized dfs
            z1 = e.persist(e.zip(sa, sb, how="inner", partition_spec=ps))
            assert 1 == z1.count()
            assert not z1.metadata.get("serialized_has_name", False)
            z2 = e.persist(e.zip(sa, sb, how="left_outer", partition_spec=ps))
            assert 2 == z2.count()

            # can't have duplicated keys
            raises(ValueError, lambda: e.zip(sa, sa, how="inner", partition_spec=ps))
            # not support semi or anti
            raises(
                InvalidOperationError,
                lambda: e.zip(sa, sa, how="anti", partition_spec=ps),
            )
            raises(
                InvalidOperationError,
                lambda: e.zip(sa, sa, how="leftsemi", partition_spec=ps),
            )
            raises(
                InvalidOperationError,
                lambda: e.zip(sa, sa, how="LEFT SEMI", partition_spec=ps),
            )
            # can't specify keys for cross join
            raises(
                InvalidOperationError,
                lambda: e.zip(sa, sa, how="cross", partition_spec=ps),
            )

            # test zip with unserialized dfs
            z3 = e.persist(e.zip(a, b, partition_spec=ps))
            df_eq(z1, z3, throw=True, check_metadata=False)
            z3 = e.persist(e.zip(a, sb, partition_spec=ps))
            df_eq(z1, z3, throw=True, check_metadata=False)
            z3 = e.persist(e.zip(sa, b, partition_spec=ps))
            df_eq(z1, z3, throw=True, check_metadata=False)

            z4 = e.persist(e.zip(a, b, how="left_outer", partition_spec=ps))
            df_eq(z2, z4, throw=True, check_metadata=False)
            z4 = e.persist(e.zip(a, sb, how="left_outer", partition_spec=ps))
            df_eq(z2, z4, throw=True, check_metadata=False)
            z4 = e.persist(e.zip(sa, b, how="left_outer", partition_spec=ps))
            df_eq(z2, z4, throw=True, check_metadata=False)

            z5 = e.persist(e.zip(a, b, how="cross"))
            assert z5.count() == 1
            assert len(z5.schema) == 2
            z6 = e.persist(e.zip(sa, b, how="cross"))
            assert z6.count() == 2
            assert len(z6.schema) == 3

            z7 = e.zip(a, b, df1_name="x", df2_name="y")
            z7.show()
            assert z7.metadata.get("serialized_has_name", False)
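The negative tests above use raises and InvalidOperationError. The imports below are a plausible guess; the exact module path of the exception is an assumption.

from pytest import raises
from triad.exceptions import InvalidOperationError  # assumed location of the exception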
Example #7
 def test_map_with_dict_col(self):
     e = self.engine
     dt = datetime.now()
     # test dict
     o = ArrayDataFrame([[dt, dict(a=1)]], "a:datetime,b:{a:int}")
     c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
     df_eq(c, o, no_pandas=True, check_order=True, throw=True)
Example #8
        def test_aggregate(self):
            e = self.engine
            o = ArrayDataFrame(
                [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                "a:double,b:int",
                dict(a=1),
            )
            a = e.to_df(o)

            b = e.aggregate(
                df=a,
                partition_spec=None,
                agg_cols=[
                    ff.max(col("b")),
                    (ff.max(col("b")) * 2).cast("int32").alias("c"),
                ],
            )
            df_eq(b, [[4, 8]], "b:int,c:int", throw=True)

            b = e.aggregate(
                df=a,
                partition_spec=PartitionSpec(by=["a"]),
                agg_cols=[
                    ff.max(col("b")),
                    (ff.max(col("b")) * 2).cast("int32").alias("c"),
                ],
            )
            df_eq(
                b,
                [[None, 4, 8], [1, 2, 4], [3, 4, 8]],
                "a:double,b:int,c:int",
                throw=True,
            )

            with raises(ValueError):
                e.aggregate(
                    df=a,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[ff.max(col("b")), lit(1)],
                )

            with raises(ValueError):
                e.aggregate(
                    df=a,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[],
                )
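The aggregation example builds its agg_cols with Fugue's column-expression API; ff is just a local alias for fugue.column.functions. A sketch of the imports this likely assumes:

from fugue.column import col, functions as ff, lit

# Example expression: max(b) * 2, cast to int32 and exposed as column "c"
expr = (ff.max(col("b")) * 2).cast("int32").alias("c")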
Example #9
 def visitFugueZipTask(self,
                       ctx: fp.FugueZipTaskContext) -> WorkflowDataFrame:
     data = self.get_dict(ctx, "dfs", "how")
     partition_spec = PartitionSpec(**self.get_dict(ctx, "by", "presort"))
     # TODO: currently SQL does not support cache to file on ZIP
     return self.workflow.zip(data["dfs"],
                              how=data.get("how", "inner"),
                              partition=partition_spec)
Example #10
def test_persist_checkpoint_broadcast():
    dag = FugueWorkflow()
    dag.create(mock_create1).persist()
    dag.create(mock_create1).weak_checkpoint(lazy=True, level="a.b")

    dag.create(mock_create1).broadcast()
    dag.create(mock_create1).weak_checkpoint(level="a.b").broadcast()

    dag.create(mock_create1).checkpoint()
    dag.create(mock_create1).strong_checkpoint(lazy=True)
    dag.create(mock_create1).strong_checkpoint(lazy=True, x="xy z")
    dag.create(mock_create1).strong_checkpoint(lazy=False,
                                               partition=PartitionSpec(num=5),
                                               single=True,
                                               x="xy z").broadcast()

    dag.create(mock_create1).deterministic_checkpoint()
    dag.create(mock_create1).deterministic_checkpoint(
        lazy=False,
        partition=PartitionSpec(num=4),
        single=True,
        namespace="n",
        x=2)
    assert_eq(
        """
    create using mock_create1 persist
    a=create using mock_create1 lazy persist (level="a.b")

    create using mock_create1 broadcast
    a=create using mock_create1 persist(level="a.b") broadcast

    create using mock_create1 checkpoint
    a= create using mock_create1 lazy strong checkpoint
    a=create using mock_create1 lazy checkpoint(x="xy z")
    a=create using mock_create1 checkpoint prepartition 5 single (x="xy z") broadcast

    create using mock_create1 deterministic checkpoint
    create using mock_create1 deterministic checkpoint "n"
        prepartition 4 single params x=2
    """,
        dag,
    )
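mock_create1 and assert_eq are test fixtures not shown in this listing. Presumably assert_eq parses the Fugue SQL text into a workflow and compares it with dag; mock_create1 only needs to be a creator that produces a small DataFrame. A hypothetical stand-in, not the original definition:

from fugue import ArrayDataFrame, DataFrame, ExecutionEngine, FugueWorkflow, PartitionSpec


def mock_create1(ctx: ExecutionEngine, n: int = 2) -> DataFrame:
    # Hypothetical creator: returns a one-row DataFrame so the checkpoint and
    # broadcast calls in the workflow above have something to operate on.
    return ArrayDataFrame([[n]], "a:int")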
Example #11
        def test_zip_all(self):
            e = self.engine
            a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
            z = e.persist(e.zip_all(DataFrames(a)))
            assert 1 == z.count()
            assert z.metadata.get("serialized", False)
            assert not z.metadata.get("serialized_has_name", False)
            z = e.persist(e.zip_all(DataFrames(x=a)))
            assert 1 == z.count()
            assert z.metadata.get("serialized", False)
            assert z.metadata.get("serialized_has_name", False)
            z = e.persist(
                e.zip_all(DataFrames(x=a), partition_spec=PartitionSpec(by=["a"]))
            )
            assert 2 == z.count()
            assert z.metadata.get("serialized", False)
            assert z.metadata.get("serialized_has_name", False)

            b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
            c = e.to_df([[6, 1], [2, 7]], "d:int,a:int")
            z = e.persist(e.zip_all(DataFrames(a, b, c)))
            assert 1 == z.count()
            assert not z.metadata.get("serialized_has_name", False)
            z = e.persist(e.zip_all(DataFrames(x=a, y=b, z=c)))
            assert 1 == z.count()
            assert z.metadata.get("serialized_has_name", False)

            z = e.persist(e.zip_all(DataFrames(b, b)))
            assert 2 == z.count()
            assert not z.metadata.get("serialized_has_name", False)
            assert ["a", "c"] in z.schema
            z = e.persist(e.zip_all(DataFrames(x=b, y=b)))
            assert 2 == z.count()
            assert z.metadata.get("serialized_has_name", False)
            assert ["a", "c"] in z.schema

            z = e.persist(
                e.zip_all(DataFrames(b, b), partition_spec=PartitionSpec(by=["a"]))
            )
            assert 2 == z.count()
            assert not z.metadata.get("serialized_has_name", False)
            assert "c" not in z.schema
Example #12
 def test_map_with_binary(self):
     e = self.engine
     o = ArrayDataFrame(
         [[pickle.dumps(BinaryObject("a"))], [pickle.dumps(BinaryObject("b"))]],
         "a:bytes",
     )
     c = e.map(o, binary_map, o.schema, PartitionSpec())
     expected = ArrayDataFrame(
         [
             [pickle.dumps(BinaryObject("ax"))],
             [pickle.dumps(BinaryObject("bx"))],
         ],
         "a:bytes",
     )
     df_eq(expected, c, no_pandas=True, check_order=True, throw=True)
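BinaryObject and binary_map are suite helpers not shown here. A minimal sketch consistent with the expected output (each pickled object comes back with "x" appended to its payload) could look like the following; the names and details are assumptions.

import pickle

from fugue import ArrayDataFrame


class BinaryObject:
    def __init__(self, data=None):
        self.data = data


def binary_map(cursor, data):
    # Unpickle each cell, append "x" to its payload, and re-pickle it.
    arr = data.as_array(type_safe=True)
    for row in arr:
        obj = pickle.loads(row[0])
        obj.data += "x"
        row[0] = pickle.dumps(obj)
    return ArrayDataFrame(arr, data.schema)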
Example #13
 def visitFugueTakeTask(self, ctx: fp.FugueTakeTaskContext):
     data = self.get_dict(ctx, "partition", "presort", "df")
     if "df" in data:
         df = data["df"]
     else:
         df = self.last
     params: Dict[str, Any] = {}
     params["n"] = int(self.ctxToStr(ctx.rows)) or 20  # default is 20
     params["na_position"] = "first" if ctx.FIRST() is not None else "last"
     if data.get("partition"):
         _partition_spec = PartitionSpec(data.get("partition"))
         return df.partition(by=_partition_spec.partition_by,
                             presort=_partition_spec.presort).take(**params)
     else:
         if data.get("presort"):
             params["presort"] = data.get("presort")
         return df.take(**params)
Example #14
 def test_save_and_load_json(self):
     e = self.engine
     b = ArrayDataFrame([[6, 1], [3, 4], [2, 7], [4, 8], [6, 7]],
                        "c:int,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     e.save_df(
         e.repartition(e.to_df(b), PartitionSpec(num=2)),
         path,
         format_hint="json",
     )
     c = e.load_df(
         path,
         format_hint="json",
         columns=["a", "c"],
     )
     df_eq(c, [[1, 6], [7, 2], [4, 3], [8, 4], [7, 6]],
           "a:long,c:long",
           throw=True)
Example #15
 def visitFuguePrepartition(
         self, ctx: fp.FuguePrepartitionContext) -> PartitionSpec:
     params = self.get_dict(ctx, "algo", "num", "by", "presort")
     return PartitionSpec(**params)
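This visitor simply forwards the parsed prepartition clause to PartitionSpec, which accepts algo, num, by and presort keyword arguments. A rough illustration of the kind of spec it produces (the values below are made up):

from fugue import PartitionSpec

# Roughly what a "PREPARTITION BY a PRESORT b DESC" style clause with 4 partitions yields.
spec = PartitionSpec(algo="hash", num=4, by=["a"], presort="b DESC")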
Example #16
 def test_take(self):
     e = self.engine
     ps = PartitionSpec(by=["a"], presort="b DESC,c DESC")
     ps2 = PartitionSpec(by=["c"], presort="b ASC")
     a = e.to_df(
         [
             [1, 2, 3],
             [1, 3, 4],
             [2, 1, 2],
             [2, 2, 2],
             [None, 4, 2],
             [None, 2, 1],
         ],
         "a:double,b:double,c:double",
     )
     b = e.take(a, n=1, presort="b desc", metadata=dict(a=1))
     c = e.take(a, n=2, presort="a desc", na_position="first")
     d = e.take(a, n=1, presort="a asc, b desc", partition_spec=ps)
     f = e.take(a, n=1, presort=None, partition_spec=ps2)
     g = e.take(a, n=2, presort="a desc", na_position="last")
     h = e.take(a, n=2, presort="a", na_position="first")
     df_eq(
         b,
         [[None, 4, 2]],
         "a:double,b:double,c:double",
         metadata=dict(a=1),
         throw=True,
     )
     df_eq(
         c,
         [[None, 4, 2], [None, 2, 1]],
         "a:double,b:double,c:double",
         throw=True,
     )
     df_eq(
         d,
         [[1, 3, 4], [2, 2, 2], [None, 4, 2]],
         "a:double,b:double,c:double",
         throw=True,
     )
     df_eq(
         f,
         [[1, 2, 3], [1, 3, 4], [2, 1, 2], [None, 2, 1]],
         "a:double,b:double,c:double",
         throw=True,
     )
     df_eq(
         g,
         [[2, 1, 2], [2, 2, 2]],
         "a:double,b:double,c:double",
         throw=True,
     )
     df_eq(
         h,
         [
             [None, 4, 2],
             [None, 2, 1],
         ],
         "a:double,b:double,c:double",
         throw=True,
     )
     raises(ValueError, lambda: e.take(a, n=0.5, presort=None))