Example #1
 def comap(cursor, dfs):
     assert not dfs.has_key
     v = ",".join([k + str(v.count()) for k, v in dfs.items()])
     keys = cursor.key_value_array
     if len(keys) == 0:
         return ArrayDataFrame([[v]], "v:str")
     return ArrayDataFrame([keys + [v]], cursor.key_schema + "v:str")
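The snippets on this page are taken from the Fugue test suites and related projects, so their surrounding imports are omitted. Below is a minimal sketch of the imports most of them assume; the module paths follow the public Fugue API and may differ between versions, so treat it as a guide rather than a verbatim header.

# Hedged sketch of the common imports assumed by the examples on this page.
import copy
import os
import pickle
from datetime import datetime
from typing import Any, Iterable, List
from uuid import uuid4

import pandas as pd
from pytest import raises
from triad.collections.fs import FileSystem

from fugue import (
    ArrayDataFrame,
    DataFrame,
    ExecutionEngine,
    FugueWorkflow,
    LocalDataFrame,
    NativeExecutionEngine,
    PandasDataFrame,
    PartitionSpec,
    WorkflowDataFrames,
)
from fugue.dataframe.utils import _df_eq as df_eq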
Example #2
        def test_to_df_general(self):
            e = self.engine
            o = ArrayDataFrame(
                [[1.1, 2.2], [3.3, 4.4]],
                "a:double,b:double",
                dict(a=1),
            )
            # all engines should accept these types of inputs
            # should take fugue.DataFrame
            df_eq(o, e.to_df(o), throw=True)
            # should take array, schema and metadata
            df_eq(
                o,
                e.to_df([[1.1, 2.2], [3.3, 4.4]], "a:double,b:double",
                        dict(a=1)),
                throw=True,
            )
            # should take pandas dataframe
            pdf = pd.DataFrame([[1.1, 2.2], [3.3, 4.4]], columns=["a", "b"])
            df_eq(o, e.to_df(pdf, metadata=dict(a=1)), throw=True)

            # should convert string to datetime in to_df
            df_eq(
                e.to_df([["2020-01-01"]], "a:datetime"),
                [[datetime(2020, 1, 1)]],
                "a:datetime",
                throw=True,
            )

            # should handle empty pandas dataframe
            o = ArrayDataFrame([], "a:double,b:str")
            pdf = pd.DataFrame([[0.1, "a"]], columns=["a", "b"])
            pdf = pdf[pdf.a < 0]
            df_eq(o, e.to_df(pdf), throw=True)
Example #3
 def test_load_csv_folder(self):
     e = self.engine
     native = NativeExecutionEngine()
     a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double")
     b = ArrayDataFrame([[2.1, 7.1], [4.1, 8.1]], "c:double,a:double")
     path = os.path.join(self.tmpdir, "a", "b")
     native.save_df(a,
                    os.path.join(path, "a.csv"),
                    format_hint="csv",
                    header=True)
     native.save_df(b,
                    os.path.join(path, "b.csv"),
                    format_hint="csv",
                    header=True)
     FileSystem().touch(os.path.join(path, "_SUCCESS"))
     c = e.load_df(
         path,
         format_hint="csv",
         header=True,
         infer_schema=True,
         columns=["a", "c"],
     )
     df_eq(c, [[1.1, 6.1], [7.1, 2.1], [8.1, 4.1]],
           "a:double,c:double",
           throw=True)
Example #4
        def test_map_with_special_values(self):
            def with_nat(cursor, data):
                df = data.as_pandas()
                df["nat"] = pd.NaT
                schema = data.schema + "nat:datetime"
                return PandasDataFrame(df, schema)

            e = self.engine
            # test with multiple key with null values
            o = ArrayDataFrame(
                [[1, None, 1], [1, None, 0], [None, None, 1]],
                "a:double,b:double,c:int",
                dict(a=1),
            )
            c = e.map(
                o, select_top, o.schema, PartitionSpec(by=["a", "b"], presort="c")
            )
            df_eq(
                c,
                [[1, None, 0], [None, None, 1]],
                "a:double,b:double,c:int",
                throw=True,
            )
            # test datetime with nat
            dt = datetime.now()
            o = ArrayDataFrame(
                [
                    [dt, 2, 1],
                    [None, 2, None],
                    [None, 1, None],
                    [dt, 5, 1],
                    [None, 4, None],
                ],
                "a:datetime,b:int,c:double",
                dict(a=1),
            )
            c = e.map(
                o, select_top, o.schema, PartitionSpec(by=["a", "c"], presort="b DESC")
            )
            df_eq(
                c,
                [[None, 4, None], [dt, 5, 1]],
                "a:datetime,b:int,c:double",
                throw=True,
            )
            d = e.map(
                c, with_nat, "a:datetime,b:int,c:double,nat:datetime", PartitionSpec()
            )
            df_eq(
                d,
                [[None, 4, None, None], [dt, 5, 1, None]],
                "a:datetime,b:int,c:double,nat:datetime",
                throw=True,
            )
            # test list
            o = ArrayDataFrame([[dt, [1, 2]]], "a:datetime,b:[int]")
            c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
            df_eq(c, o, check_order=True, throw=True)
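The map tests above (and Examples 14 and 22 below) pass a select_top helper that lives in the Fugue test suite and is not shown on this page. A minimal sketch, assuming it simply keeps the first row of the presorted partition the cursor points at:

def select_top(cursor, data):
    # Keep only the first row of the current (presorted) partition;
    # cursor.row and cursor.row_schema come from Fugue's partition cursor.
    return ArrayDataFrame([cursor.row], cursor.row_schema)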
Example #5
 def test_load_parquet_folder(self):
     e = self.engine
     native = NativeExecutionEngine()
     a = ArrayDataFrame([[6, 1]], "c:int,a:long")
     b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     native.save_df(a, os.path.join(path, "a.parquet"))
     native.save_df(b, os.path.join(path, "b.parquet"))
     FileSystem().touch(os.path.join(path, "_SUCCESS"))
     c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)
Example #6
 def test_load_avro_folder(self):
     # TODO: switch to c:int,a:long when we can preserve schema to avro
     e = self.engine
     native = NativeExecutionEngine()
     a = ArrayDataFrame([[6, 1]], "c:long,a:long")
     b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     native.save_df(a, os.path.join(path, "a.avro"))
     native.save_df(b, os.path.join(path, "b.avro"))
     FileSystem().touch(os.path.join(path, "_SUCCESS"))
     c = e.load_df(path, format_hint="avro", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
Example #7
 def test_map_with_binary(self):
     e = self.engine
     o = ArrayDataFrame(
         [[pickle.dumps(BinaryObject("a"))], [pickle.dumps(BinaryObject("b"))]],
         "a:bytes",
     )
     c = e.map(o, binary_map, o.schema, PartitionSpec())
     expected = ArrayDataFrame(
         [
             [pickle.dumps(BinaryObject("ax"))],
             [pickle.dumps(BinaryObject("bx"))],
         ],
         "a:bytes",
     )
     df_eq(expected, c, no_pandas=True, check_order=True, throw=True)
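BinaryObject is another test-suite helper; it is pickled into the bytes column above and inside the binary_map function of Example 10 below. A hypothetical stand-in, assuming it only wraps a string data field:

class BinaryObject:
    # Hypothetical stand-in: a simple wrapper around a string payload, enough
    # for binary_map to append "x" to obj.data and pickle it back.
    def __init__(self, data: str = ""):
        self.data = data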
Example #8
 def transform(self, df: LocalDataFrame) -> LocalDataFrame:
     p = _get_temp_path(self.params.get("path", ""),
                        self.workflow_conf)
     fp = os.path.join(p, str(uuid4()) + ".parquet")
     df.as_pandas().to_parquet(fp)
     return ArrayDataFrame([self.cursor.key_value_array + [fp]],
                           self.output_schema)
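The transformer above (and Examples 11 and 17 below) calls a _get_temp_path helper from the tune project that is not reproduced here. A hypothetical sketch, assuming it prefers an explicit path and otherwise falls back to a temp directory read from the configuration (the configuration key shown is an assumption):

def _get_temp_path(path: str, conf) -> str:
    # Hypothetical sketch: use the explicit path when given, otherwise fall
    # back to a configured temp directory ("tune.temp.path" is an assumed key).
    if path is not None and path != "":
        return path
    return str(conf["tune.temp.path"])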
Example #9
def test_workflow_dataframes():
    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    df2 = dag1.df([[0]], "b:int")
    dag2 = FugueWorkflow()
    df3 = dag2.df([[0]], "a:int")

    dfs1 = WorkflowDataFrames(a=df1, b=df2)
    assert dfs1["a"] is df1
    assert dfs1["b"] is df2

    dfs2 = WorkflowDataFrames(dfs1, aa=df1, bb=df2)
    assert 4 == len(dfs2)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=df3)

    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=ArrayDataFrame([[0]], "a:int"))

    dag = FugueWorkflow()
    df = dag.df([[0, 1], [1, 1]], "a:int,b:int")
    assert df.partition_spec.empty
    df2 = df.partition(by=["a"])
    assert df.partition_spec.empty
    assert df2.partition_spec == PartitionSpec(by=["a"])
    df3 = df.partition_by("a", "b")
    assert df.partition_spec.empty
    assert df3.partition_spec == PartitionSpec(by=["a", "b"])
    df4 = df.per_partition_by("a", "b")
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec(by=["a", "b"], algo="even")
    df4 = df.per_row()
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec("per_row")
Example #10
def binary_map(cursor, df):
    arr = df.as_array(type_safe=True)
    for i in range(len(arr)):
        obj = pickle.loads(arr[i][0])
        obj.data += "x"
        arr[i][0] = pickle.dumps(obj)
    return ArrayDataFrame(arr, df.schema)
Example #11
 def save_single_file(e: ExecutionEngine,
                      _input: DataFrame) -> DataFrame:
     p = _get_temp_path(path, e.conf)
     fp = os.path.join(p, str(uuid4()) + ".parquet")
     e.save_df(_input, fp, force_single=True)
     return ArrayDataFrame([[fp]],
                           f"{TUNE_DATASET_DF_PREFIX}{name}:str")
Example #12
def test_workflow():
    builder = FugueWorkflow()

    a = builder.create_data([[0], [0], [1]], "a:int")
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()

    raises(FugueWorkflowCompileError, lambda: builder.df(123))

    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()

    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]],
          "a:int,b:int")
Example #13
 def test_save_and_load_parquet(self):
     e = self.engine
     b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     e.save_df(b, path, format_hint="parquet")
     c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)
Example #14
 def test_map_with_dict_col(self):
     e = self.engine
     dt = datetime.now()
     # test dict
     o = ArrayDataFrame([[dt, dict(a=1)]], "a:datetime,b:{a:int}")
     c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
     df_eq(c, o, no_pandas=True, check_order=True, throw=True)
Example #15
        def test_save_single_and_load_parquet(self):
            e = self.engine
            b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
            path = os.path.join(self.tmpdir, "a", "b")
            e.fs.makedirs(path, recreate=True)
            # overwrite folder with single file
            e.save_df(b, path, format_hint="parquet", force_single=True)
            assert e.fs.isfile(path)
            c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
            df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)

            # overwrite single file with folder (if applicable)
            b = ArrayDataFrame([[60, 1], [20, 7]], "c:int,a:long")
            e.save_df(b, path, format_hint="parquet", mode="overwrite")
            c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
            df_eq(c, [[1, 60], [7, 20]], "a:long,c:int", throw=True)
Example #16
        def test_assign(self):
            e = self.engine
            o = ArrayDataFrame(
                [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                "a:double,b:int",
                dict(a=1),
            )
            a = e.to_df(o)

            b = e.assign(
                a,
                [
                    lit(1, "x"),
                    col("b").cast(str), (col("b") + 1).alias("c").cast(int)
                ],
            )
            df_eq(
                b,
                [
                    [1, "2", 1, 3],
                    [None, "2", 1, 3],
                    [None, "1", 1, 2],
                    [3, "4", 1, 5],
                    [None, "4", 1, 5],
                ],
                "a:double,b:str,x:long,c:long",
                throw=True,
            )
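The assign test above, together with Examples 26 and 29 below, uses Fugue's column-expression API: col, lit and the aggregate functions referenced as ff. The imports they assume are roughly the following (hedged; paths follow the public Fugue API):

from fugue.column import col, lit
from fugue.column import functions as ff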
Example #17
 def transform(self, df: LocalDataFrame) -> LocalDataFrame:
     p = _get_temp_path(self.params.get("path", ""),
                        self.workflow_conf)
     fp = os.path.join(p, str(uuid4()) + ".parquet")
     first = df.peek_dict()
     keys = [first[x] for x in self.key_schema.names]
     df.as_pandas().to_parquet(fp)
     return ArrayDataFrame([keys + [fp]], self.output_schema)
Example #18
 def test_save_and_load_avro(self):
     # TODO: switch to c:int,a:long when we can preserve schema to avro
     e = self.engine
     b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     e.save_df(b, path, format_hint="avro")
     c = e.load_df(path, format_hint="avro", columns=["a", "c"])
     df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
Example #19
    def compute_processor(engine: ExecutionEngine, df: DataFrame) -> DataFrame:
        def get_rows() -> Iterable[Any]:
            keys = list(
                df.schema.names) + ["__fmin_value__", "__fmin_metadata__"]
            for row in compute_transformer(df.as_dict_iterable()):
                yield [row[k] for k in keys]

        t._execution_engine = engine  # type:ignore
        return ArrayDataFrame(
            get_rows(),
            df.schema + "__fmin_value__:double,__fmin_metadata__:str")
Example #20
 def get(self, key: str):
     if self.dummy:
         return True, False, ArrayDataFrame([[100]], "a:int")
     self.get_called += 1
     if key not in self.tb:
         print("not get", key)
         return False, False, None
     x = self.tb[key]
     print("get", key)
     self.hit += 1
     return True, x[0], x[1]
Example #21
        def compute_processor(engine: ExecutionEngine, df: DataFrame) -> DataFrame:
            out_schema = df.schema + TUNE_REPORT_ADD_SCHEMA

            def get_rows() -> Iterable[Any]:
                for row in compute_transformer(
                    df.as_local().as_dict_iterable(), on_report
                ):
                    yield [row[k] for k in out_schema.names]

            # TODO: need to add back execution_engine for engine aware runners
            # t._execution_engine = engine  # type:ignore
            return ArrayDataFrame(get_rows(), out_schema)
Example #22
        def test_map(self):
            def noop(cursor, data):
                return data

            def on_init(partition_no, data):
                # TODO: this test is not sufficient
                assert partition_no >= 0
                data.peek_array()

            e = self.engine
            o = ArrayDataFrame(
                [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                "a:double,b:int",
                dict(a=1),
            )
            a = e.to_df(o)
            # no partition
            c = e.map(a, noop, a.schema, PartitionSpec(), dict(a=1))
            df_eq(c, o, throw=True)
            # with key partition
            c = e.map(a, noop, a.schema, PartitionSpec(by=["a"], presort="b"),
                      dict(a=1))
            df_eq(c, o, throw=True)
            # select top
            c = e.map(a, select_top, a.schema,
                      PartitionSpec(by=["a"], presort="b"))
            df_eq(c, [[None, 1], [1, 2], [3, 4]], "a:double,b:int", throw=True)
            # select top with another order
            c = e.map(
                a,
                select_top,
                a.schema,
                PartitionSpec(partition_by=["a"], presort="b DESC"),
                metadata=dict(a=1),
            )
            df_eq(
                c,
                [[None, 4], [1, 2], [3, 4]],
                "a:double,b:int",
                metadata=dict(a=1),
                throw=True,
            )
            # add num_partitions, on_init should not matter
            c = e.map(
                a,
                select_top,
                a.schema,
                PartitionSpec(partition_by=["a"],
                              presort="b DESC",
                              num_partitions=3),
                on_init=on_init,
            )
            df_eq(c, [[None, 4], [1, 2], [3, 4]], "a:double,b:int", throw=True)
Example #23
        def test_save_single_and_load_csv(self):
            e = self.engine
            b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
            path = os.path.join(self.tmpdir, "a", "b")
            e.fs.makedirs(path, recreate=True)
            # overwrite folder with single file
            e.save_df(b,
                      path,
                      format_hint="csv",
                      header=True,
                      force_single=True)
            assert e.fs.isfile(path)
            c = e.load_df(
                path,
                format_hint="csv",
                header=True,
                infer_schema=False,
                columns=["a", "c"],
            )
            df_eq(c, [["1.1", "6.1"], ["7.1", "2.1"]],
                  "a:str,c:str",
                  throw=True)

            # overwrite single file with folder (if applicable)
            b = ArrayDataFrame([[60.1, 1.1], [20.1, 7.1]], "c:double,a:double")
            e.save_df(b,
                      path,
                      format_hint="csv",
                      header=True,
                      mode="overwrite")
            c = e.load_df(
                path,
                format_hint="csv",
                header=True,
                infer_schema=False,
                columns=["a", "c"],
            )
            df_eq(c, [["1.1", "60.1"], ["7.1", "20.1"]],
                  "a:str,c:str",
                  throw=True)
Example #24
def test_random_walk():
    """
    test Fugue func random_walk()
    """
    from node2vec.fugue import random_walk

    graph = [[0, 2, 0.41], [0, 4, 0.85], [3, 4, 0.36], [2, 0, 0.68], [4, 0, 0.1],
             [4, 3, 0.37]]
    df = ArrayDataFrame(graph, schema="src:int,dst:int,weight:double")
    n2v_params = {"num_walks": 2, "walk_length": 3, "return_param": 0.5}

    res = random_walk(NativeExecutionEngine(), df, n2v_params)
    assert res is not None
    res = random_walk(NativeExecutionEngine(), df.as_pandas(), n2v_params)
    assert res is not None
    df1 = df.rename({"src": "id"})[["id"]]
    res = random_walk(NativeExecutionEngine(), df.as_pandas(), n2v_params, df1)
    assert res is not None
    pytest.raises(
        ValueError, random_walk, NativeExecutionEngine(), df.as_pandas(),
        n2v_params, df,
    )

    spark = SparkSession.builder.config("spark.executor.cores", 4).getOrCreate()
    r = Row("src", "dst", "weight")
    df = spark.sparkContext.parallelize([r(*x) for x in graph]).toDF()
    res = random_walk(SparkExecutionEngine(spark), SparkDataFrame(df), n2v_params)
    assert res is not None
    pytest.raises(
        ValueError, random_walk, SparkExecutionEngine(spark), SparkDataFrame(df),
        n2v_params, SparkDataFrame(df),
    )
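Besides the node2vec import shown inline, the example above also depends on pytest, PySpark and Fugue's Spark backend. The extra imports it assumes are roughly (hedged):

import pytest
from pyspark.sql import Row, SparkSession

from fugue_spark import SparkDataFrame, SparkExecutionEngine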
Example #25
 def test_save_and_load_csv(self):
     e = self.engine
     b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
     path = os.path.join(self.tmpdir, "a", "b")
     e.save_df(b, path, format_hint="csv", header=True)
     c = e.load_df(
         path,
         format_hint="csv",
         header=True,
         infer_schema=True,
         columns=["a", "c"],
     )
     df_eq(c, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True)
Example #26
 def test_filter(self):
     e = self.engine
     o = ArrayDataFrame(
         [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
         "a:double,b:int",
         dict(a=1),
     )
     a = e.to_df(o)
     b = e.filter(a, col("a").not_null())
     df_eq(b, [[1, 2], [3, 4]], "a:double,b:int", throw=True)
     c = e.filter(a, col("a").not_null() & (col("b") < 3))
     df_eq(c, [[1, 2]], "a:double,b:int", throw=True)
     c = e.filter(a, col("a") + col("b") == 3)
     df_eq(c, [[1, 2]], "a:double,b:int", throw=True)
Example #27
        def compute_processor(engine: ExecutionEngine,
                              df: DataFrame) -> DataFrame:
            out_schema = df.schema + TUNE_REPORT_ADD_SCHEMA

            def get_rows() -> Iterable[Any]:
                for row in self._compute_transformer(
                        df.as_local().as_dict_iterable(),
                        entrypoint=entrypoint,
                        stop_check_interval=_interval,
                ):
                    yield [row[k] for k in out_schema.names]

            # TODO: need to add back execution_engine for engine aware optimizers
            # t._execution_engine = engine  # type:ignore
            return ArrayDataFrame(get_rows(), out_schema)
Example #28
        def test_io(self):
            e = self.engine
            b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
            path = os.path.join(self.tmpdir, "a")
            e.save_df(b, path, format_hint="parquet", force_single=True)
            assert e.fs.isfile(path)
            c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
            df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)

            path = os.path.join(self.tmpdir, "b.csv")
            e.save_df(b, path, header=True)
            c = e.load_df(path, header=True, columns="c:int,a:long")
            df_eq(c, b, throw=True)

            # reading multiple csv using wildcard from folder
            fpath = os.path.join(self.tmpdir, "f.csv")
            e.fs.makedir(fpath)
            e.save_df(b, os.path.join(fpath, "1.csv"), header=True, force_single=True)
            e.save_df(b, os.path.join(fpath, "2.csv"), header=True, force_single=True)

            r = e.load_df(os.path.join(fpath, "*.csv"), header=True, infer_schema=False)
            df_eq(
                r,
                [["6", "1"], ["2", "7"], ["6", "1"], ["2", "7"]],
                "c:str,a:str",
                throw=True,
            )

            # reading multiple csv with/without infer schema
            e.fs.touch(os.path.join(fpath, "_SUCCESS"))

            r = e.load_df(fpath, header=True, infer_schema=False)
            df_eq(
                r,
                [["6", "1"], ["2", "7"], ["6", "1"], ["2", "7"]],
                "c:str,a:str",
                throw=True,
            )

            r = e.load_df(fpath, header=True, infer_schema=True)
            assert sorted(r.as_array()) == sorted([[6, 1], [2, 7], [6, 1], [2, 7]])

            # write single file to overwrite folder
            assert e.fs.isdir(fpath)
            e.save_df(r, fpath, force_single=True, header=True)
            assert e.fs.isfile(fpath)
            r = e.load_df(fpath, header=True, infer_schema=True)
            assert sorted(r.as_array()) == sorted([[6, 1], [2, 7], [6, 1], [2, 7]])
Example #29
        def test_aggregate(self):
            e = self.engine
            o = ArrayDataFrame(
                [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
                "a:double,b:int",
                dict(a=1),
            )
            a = e.to_df(o)

            b = e.aggregate(
                df=a,
                partition_spec=None,
                agg_cols=[
                    ff.max(col("b")),
                    (ff.max(col("b")) * 2).cast("int32").alias("c"),
                ],
            )
            df_eq(b, [[4, 8]], "b:int,c:int", throw=True)

            b = e.aggregate(
                df=a,
                partition_spec=PartitionSpec(by=["a"]),
                agg_cols=[
                    ff.max(col("b")),
                    (ff.max(col("b")) * 2).cast("int32").alias("c"),
                ],
            )
            df_eq(
                b,
                [[None, 4, 8], [1, 2, 4], [3, 4, 8]],
                "a:double,b:int,c:int",
                throw=True,
            )

            with raises(ValueError):
                e.aggregate(
                    df=a,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[ff.max(col("b")), lit(1)],
                )

            with raises(ValueError):
                e.aggregate(
                    df=a,
                    partition_spec=PartitionSpec(by=["a"]),
                    agg_cols=[],
                )
Example #30
 def test_save_and_load_json(self):
     e = self.engine
     b = ArrayDataFrame([[6, 1], [3, 4], [2, 7], [4, 8], [6, 7]],
                        "c:int,a:long")
     path = os.path.join(self.tmpdir, "a", "b")
     e.save_df(
         e.repartition(e.to_df(b), PartitionSpec(num=2)),
         path,
         format_hint="json",
     )
     c = e.load_df(
         path,
         format_hint="json",
         columns=["a", "c"],
     )
     df_eq(c, [[1, 6], [7, 2], [4, 3], [8, 4], [7, 6]],
           "a:long,c:long",
           throw=True)