def comap(cursor, dfs):
    # expect an array-like (auto-named) DataFrames collection, not a keyed one
    assert not dfs.has_key
    v = ",".join([k + str(v.count()) for k, v in dfs.items()])
    keys = cursor.key_value_array
    if len(keys) == 0:
        return ArrayDataFrame([[v]], "v:str")
    return ArrayDataFrame([keys + [v]], cursor.key_schema + "v:str")
def test_to_df_general(self):
    e = self.engine
    o = ArrayDataFrame(
        [[1.1, 2.2], [3.3, 4.4]],
        "a:double,b:double",
        dict(a=1),
    )
    # all engines should accept these types of inputs

    # should take fugue.DataFrame
    df_eq(o, e.to_df(o), throw=True)
    # should take array, schema and metadata
    df_eq(
        o,
        e.to_df([[1.1, 2.2], [3.3, 4.4]], "a:double,b:double", dict(a=1)),
        throw=True,
    )
    # should take pandas dataframe
    pdf = pd.DataFrame([[1.1, 2.2], [3.3, 4.4]], columns=["a", "b"])
    df_eq(o, e.to_df(pdf, metadata=dict(a=1)), throw=True)
    # should convert string to datetime in to_df
    df_eq(
        e.to_df([["2020-01-01"]], "a:datetime"),
        [[datetime(2020, 1, 1)]],
        "a:datetime",
        throw=True,
    )
    # should handle empty pandas dataframe
    o = ArrayDataFrame([], "a:double,b:str")
    pdf = pd.DataFrame([[0.1, "a"]], columns=["a", "b"])
    pdf = pdf[pdf.a < 0]
    df_eq(o, e.to_df(pdf), throw=True)
def test_load_csv_folder(self):
    e = self.engine
    native = NativeExecutionEngine()
    a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double")
    b = ArrayDataFrame([[2.1, 7.1], [4.1, 8.1]], "c:double,a:double")
    path = os.path.join(self.tmpdir, "a", "b")
    native.save_df(a, os.path.join(path, "a.csv"), format_hint="csv", header=True)
    native.save_df(b, os.path.join(path, "b.csv"), format_hint="csv", header=True)
    FileSystem().touch(os.path.join(path, "_SUCCESS"))
    c = e.load_df(
        path,
        format_hint="csv",
        header=True,
        infer_schema=True,
        columns=["a", "c"],
    )
    df_eq(c, [[1.1, 6.1], [7.1, 2.1], [8.1, 4.1]], "a:double,c:double", throw=True)
def test_map_with_special_values(self):
    def with_nat(cursor, data):
        df = data.as_pandas()
        df["nat"] = pd.NaT
        schema = data.schema + "nat:datetime"
        return PandasDataFrame(df, schema)

    e = self.engine
    # test multiple partition keys with null values
    o = ArrayDataFrame(
        [[1, None, 1], [1, None, 0], [None, None, 1]],
        "a:double,b:double,c:int",
        dict(a=1),
    )
    c = e.map(o, select_top, o.schema, PartitionSpec(by=["a", "b"], presort="c"))
    df_eq(
        c,
        [[1, None, 0], [None, None, 1]],
        "a:double,b:double,c:int",
        throw=True,
    )
    # test datetime with nat
    dt = datetime.now()
    o = ArrayDataFrame(
        [
            [dt, 2, 1],
            [None, 2, None],
            [None, 1, None],
            [dt, 5, 1],
            [None, 4, None],
        ],
        "a:datetime,b:int,c:double",
        dict(a=1),
    )
    c = e.map(o, select_top, o.schema, PartitionSpec(by=["a", "c"], presort="b DESC"))
    df_eq(
        c,
        [[None, 4, None], [dt, 5, 1]],
        "a:datetime,b:int,c:double",
        throw=True,
    )
    d = e.map(c, with_nat, "a:datetime,b:int,c:double,nat:datetime", PartitionSpec())
    df_eq(
        d,
        [[None, 4, None, None], [dt, 5, 1, None]],
        "a:datetime,b:int,c:double,nat:datetime",
        throw=True,
    )
    # test list column
    o = ArrayDataFrame([[dt, [1, 2]]], "a:datetime,b:[int]")
    c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
    df_eq(c, o, check_order=True, throw=True)
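# `select_top` is used by several map tests in this section but is not defined
# in this excerpt. A minimal sketch, assuming it keeps only the first row of
# each partition (after the presort) via the partition cursor:
def select_top(cursor, data):
    # cursor.row is the first row of the current (presorted) partition
    return ArrayDataFrame([cursor.row], cursor.row_schema)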
def test_load_parquet_folder(self):
    e = self.engine
    native = NativeExecutionEngine()
    a = ArrayDataFrame([[6, 1]], "c:int,a:long")
    b = ArrayDataFrame([[2, 7], [4, 8]], "c:int,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    native.save_df(a, os.path.join(path, "a.parquet"))
    native.save_df(b, os.path.join(path, "b.parquet"))
    FileSystem().touch(os.path.join(path, "_SUCCESS"))
    c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)
def test_load_avro_folder(self):
    # TODO: switch to c:int,a:long when we can preserve schema to avro
    e = self.engine
    native = NativeExecutionEngine()
    a = ArrayDataFrame([[6, 1]], "c:long,a:long")
    b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    native.save_df(a, os.path.join(path, "a.avro"))
    native.save_df(b, os.path.join(path, "b.avro"))
    FileSystem().touch(os.path.join(path, "_SUCCESS"))
    c = e.load_df(path, format_hint="avro", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
def test_map_with_binary(self):
    e = self.engine
    o = ArrayDataFrame(
        [[pickle.dumps(BinaryObject("a"))], [pickle.dumps(BinaryObject("b"))]],
        "a:bytes",
    )
    c = e.map(o, binary_map, o.schema, PartitionSpec())
    expected = ArrayDataFrame(
        [
            [pickle.dumps(BinaryObject("ax"))],
            [pickle.dumps(BinaryObject("bx"))],
        ],
        "a:bytes",
    )
    df_eq(expected, c, no_pandas=True, check_order=True, throw=True)
def transform(self, df: LocalDataFrame) -> LocalDataFrame:
    # save each partition to a temp parquet file and return the partition
    # key values together with the file path
    p = _get_temp_path(self.params.get("path", ""), self.workflow_conf)
    fp = os.path.join(p, str(uuid4()) + ".parquet")
    df.as_pandas().to_parquet(fp)
    return ArrayDataFrame([self.cursor.key_value_array + [fp]], self.output_schema)
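# `_get_temp_path` is referenced by the transformers in this section but not
# defined in this excerpt. A minimal sketch, assuming it prefers an explicitly
# provided path and otherwise falls back to a temp-path entry in the given
# config; the config key below is a hypothetical placeholder, not a confirmed
# name:
def _get_temp_path(path: str, conf) -> str:
    if path is not None and path != "":
        return path
    return str(conf["tune.temp.path"])  # hypothetical key, for illustration only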
def test_workflow_dataframes():
    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    df2 = dag1.df([[0]], "b:int")
    dag2 = FugueWorkflow()
    df3 = dag2.df([[0]], "a:int")
    dfs1 = WorkflowDataFrames(a=df1, b=df2)
    assert dfs1["a"] is df1
    assert dfs1["b"] is df2
    dfs2 = WorkflowDataFrames(dfs1, aa=df1, bb=df2)
    assert 4 == len(dfs2)
    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=df3)
    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=ArrayDataFrame([[0]], "a:int"))

    dag = FugueWorkflow()
    df = dag.df([[0, 1], [1, 1]], "a:int,b:int")
    assert df.partition_spec.empty
    df2 = df.partition(by=["a"])
    assert df.partition_spec.empty
    assert df2.partition_spec == PartitionSpec(by=["a"])
    df3 = df.partition_by("a", "b")
    assert df.partition_spec.empty
    assert df3.partition_spec == PartitionSpec(by=["a", "b"])
    df4 = df.per_partition_by("a", "b")
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec(by=["a", "b"], algo="even")
    df4 = df.per_row()
    assert df.partition_spec.empty
    assert df4.partition_spec == PartitionSpec("per_row")
def binary_map(cursor, df):
    arr = df.as_array(type_safe=True)
    for i in range(len(arr)):
        obj = pickle.loads(arr[i][0])
        obj.data += "x"
        arr[i][0] = pickle.dumps(obj)
    return ArrayDataFrame(arr, df.schema)
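# `BinaryObject` (pickled in test_map_with_binary and mutated in binary_map)
# is not defined in this excerpt. A minimal sketch of what those tests assume:
# a picklable object carrying a mutable string `data` attribute.
class BinaryObject:
    def __init__(self, data: str = ""):
        self.data = data  # binary_map appends "x" to this payload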
def save_single_file(e: ExecutionEngine, _input: DataFrame) -> DataFrame:
    # `path` and `name` are captured from the enclosing scope
    p = _get_temp_path(path, e.conf)
    fp = os.path.join(p, str(uuid4()) + ".parquet")
    e.save_df(_input, fp, force_single=True)
    return ArrayDataFrame([[fp]], f"{TUNE_DATASET_DF_PREFIX}{name}:str")
def test_workflow():
    builder = FugueWorkflow()
    a = builder.create_data([[0], [0], [1]], "a:int")
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()
    raises(FugueWorkflowCompileError, lambda: builder.df(123))
    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()

    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
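# `mock_tf1` and `mock_tf2` used in test_workflow are not defined in this
# excerpt. Judging only from the expected output (each "a" partition of
# [[0], [0], [1]] gains b = its partition's row count), a plausible minimal
# sketch of mock_tf2 as a pandas-style Fugue transformer:
import pandas as pd

# schema: *,b:int
def mock_tf2(df: pd.DataFrame) -> pd.DataFrame:
    df["b"] = df.shape[0]  # row count of the current partition
    return df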
def test_save_and_load_parquet(self):
    e = self.engine
    b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    e.save_df(b, path, format_hint="parquet")
    c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)
def test_map_with_dict_col(self):
    e = self.engine
    dt = datetime.now()
    # test dict column
    o = ArrayDataFrame([[dt, dict(a=1)]], "a:datetime,b:{a:int}")
    c = e.map(o, select_top, o.schema, PartitionSpec(by=["a"]))
    df_eq(c, o, no_pandas=True, check_order=True, throw=True)
def test_save_single_and_load_parquet(self):
    e = self.engine
    b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    e.fs.makedirs(path, recreate=True)
    # overwrite folder with a single file
    e.save_df(b, path, format_hint="parquet", force_single=True)
    assert e.fs.isfile(path)
    c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)
    # overwrite single file with a folder (if applicable)
    b = ArrayDataFrame([[60, 1], [20, 7]], "c:int,a:long")
    e.save_df(b, path, format_hint="parquet", mode="overwrite")
    c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
    df_eq(c, [[1, 60], [7, 20]], "a:long,c:int", throw=True)
def test_assign(self):
    e = self.engine
    o = ArrayDataFrame(
        [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
        "a:double,b:int",
        dict(a=1),
    )
    a = e.to_df(o)
    b = e.assign(
        a,
        [
            lit(1, "x"),
            col("b").cast(str),
            (col("b") + 1).alias("c").cast(int),
        ],
    )
    df_eq(
        b,
        [
            [1, "2", 1, 3],
            [None, "2", 1, 3],
            [None, "1", 1, 2],
            [3, "4", 1, 5],
            [None, "4", 1, 5],
        ],
        "a:double,b:str,x:long,c:long",
        throw=True,
    )
def transform(self, df: LocalDataFrame) -> LocalDataFrame:
    p = _get_temp_path(self.params.get("path", ""), self.workflow_conf)
    fp = os.path.join(p, str(uuid4()) + ".parquet")
    first = df.peek_dict()
    keys = [first[x] for x in self.key_schema.names]
    df.as_pandas().to_parquet(fp)
    return ArrayDataFrame([keys + [fp]], self.output_schema)
def test_save_and_load_avro(self):
    # TODO: switch to c:int,a:long when we can preserve schema to avro
    e = self.engine
    b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    e.save_df(b, path, format_hint="avro")
    c = e.load_df(path, format_hint="avro", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
def compute_processor(engine: ExecutionEngine, df: DataFrame) -> DataFrame:
    def get_rows() -> Iterable[Any]:
        keys = list(df.schema.names) + ["__fmin_value__", "__fmin_metadata__"]
        for row in compute_transformer(df.as_dict_iterable()):
            yield [row[k] for k in keys]

    # `t` is captured from the enclosing scope
    t._execution_engine = engine  # type:ignore
    return ArrayDataFrame(
        get_rows(), df.schema + "__fmin_value__:double,__fmin_metadata__:str"
    )
def get(self, key: str):
    if self.dummy:
        # dummy mode always reports a cache hit with a fixed dataframe
        return True, False, ArrayDataFrame([[100]], "a:int")
    self.get_called += 1
    if key not in self.tb:
        print("not get", key)
        return False, False, None
    x = self.tb[key]
    print("get", key)
    self.hit += 1
    return True, x[0], x[1]
def compute_processor(engine: ExecutionEngine, df: DataFrame) -> DataFrame:
    out_schema = df.schema + TUNE_REPORT_ADD_SCHEMA

    def get_rows() -> Iterable[Any]:
        for row in compute_transformer(df.as_local().as_dict_iterable(), on_report):
            yield [row[k] for k in out_schema.names]

    # TODO: need to add back execution_engine for engine aware runners
    # t._execution_engine = engine  # type:ignore
    return ArrayDataFrame(get_rows(), out_schema)
def test_map(self):
    def noop(cursor, data):
        return data

    def on_init(partition_no, data):
        # TODO: this test is not sufficient
        assert partition_no >= 0
        data.peek_array()

    e = self.engine
    o = ArrayDataFrame(
        [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
        "a:double,b:int",
        dict(a=1),
    )
    a = e.to_df(o)
    # no partition
    c = e.map(a, noop, a.schema, PartitionSpec(), dict(a=1))
    df_eq(c, o, throw=True)
    # with key partition
    c = e.map(a, noop, a.schema, PartitionSpec(by=["a"], presort="b"), dict(a=1))
    df_eq(c, o, throw=True)
    # select top
    c = e.map(a, select_top, a.schema, PartitionSpec(by=["a"], presort="b"))
    df_eq(c, [[None, 1], [1, 2], [3, 4]], "a:double,b:int", throw=True)
    # select top with another order
    c = e.map(
        a,
        select_top,
        a.schema,
        PartitionSpec(partition_by=["a"], presort="b DESC"),
        metadata=dict(a=1),
    )
    df_eq(
        c,
        [[None, 4], [1, 2], [3, 4]],
        "a:double,b:int",
        metadata=dict(a=1),
        throw=True,
    )
    # with num_partitions; on_init should not affect the result
    c = e.map(
        a,
        select_top,
        a.schema,
        PartitionSpec(partition_by=["a"], presort="b DESC", num_partitions=3),
        on_init=on_init,
    )
    df_eq(c, [[None, 4], [1, 2], [3, 4]], "a:double,b:int", throw=True)
def test_save_single_and_load_csv(self):
    e = self.engine
    b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
    path = os.path.join(self.tmpdir, "a", "b")
    e.fs.makedirs(path, recreate=True)
    # overwrite folder with a single file
    e.save_df(b, path, format_hint="csv", header=True, force_single=True)
    assert e.fs.isfile(path)
    c = e.load_df(
        path,
        format_hint="csv",
        header=True,
        infer_schema=False,
        columns=["a", "c"],
    )
    df_eq(c, [["1.1", "6.1"], ["7.1", "2.1"]], "a:str,c:str", throw=True)
    # overwrite single file with a folder (if applicable)
    b = ArrayDataFrame([[60.1, 1.1], [20.1, 7.1]], "c:double,a:double")
    e.save_df(b, path, format_hint="csv", header=True, mode="overwrite")
    c = e.load_df(
        path,
        format_hint="csv",
        header=True,
        infer_schema=False,
        columns=["a", "c"],
    )
    df_eq(c, [["1.1", "60.1"], ["7.1", "20.1"]], "a:str,c:str", throw=True)
def test_random_walk():
    """test the Fugue random_walk() function"""
    from node2vec.fugue import random_walk

    graph = [
        [0, 2, 0.41],
        [0, 4, 0.85],
        [3, 4, 0.36],
        [2, 0, 0.68],
        [4, 0, 0.1],
        [4, 3, 0.37],
    ]
    df = ArrayDataFrame(graph, schema="src:int,dst:int,weight:double")
    n2v_params = {"num_walks": 2, "walk_length": 3, "return_param": 0.5}
    res = random_walk(NativeExecutionEngine(), df, n2v_params)
    assert res is not None
    res = random_walk(NativeExecutionEngine(), df.as_pandas(), n2v_params)
    assert res is not None
    df1 = df.rename({"src": "id"})[["id"]]
    res = random_walk(NativeExecutionEngine(), df.as_pandas(), n2v_params, df1)
    assert res is not None
    pytest.raises(
        ValueError,
        random_walk,
        NativeExecutionEngine(),
        df.as_pandas(),
        n2v_params,
        df,
    )

    spark = SparkSession.builder.config("spark.executor.cores", 4).getOrCreate()
    r = Row("src", "dst", "weight")
    df = spark.sparkContext.parallelize([r(*x) for x in graph]).toDF()
    res = random_walk(SparkExecutionEngine(spark), SparkDataFrame(df), n2v_params)
    assert res is not None
    pytest.raises(
        ValueError,
        random_walk,
        SparkExecutionEngine(spark),
        SparkDataFrame(df),
        n2v_params,
        SparkDataFrame(df),
    )
def test_save_and_load_csv(self):
    e = self.engine
    b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
    path = os.path.join(self.tmpdir, "a", "b")
    e.save_df(b, path, format_hint="csv", header=True)
    c = e.load_df(
        path,
        format_hint="csv",
        header=True,
        infer_schema=True,
        columns=["a", "c"],
    )
    df_eq(c, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True)
def test_filter(self):
    e = self.engine
    o = ArrayDataFrame(
        [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
        "a:double,b:int",
        dict(a=1),
    )
    a = e.to_df(o)
    b = e.filter(a, col("a").not_null())
    df_eq(b, [[1, 2], [3, 4]], "a:double,b:int", throw=True)
    c = e.filter(a, col("a").not_null() & (col("b") < 3))
    df_eq(c, [[1, 2]], "a:double,b:int", throw=True)
    c = e.filter(a, col("a") + col("b") == 3)
    df_eq(c, [[1, 2]], "a:double,b:int", throw=True)
def compute_processor(engine: ExecutionEngine, df: DataFrame) -> DataFrame:
    out_schema = df.schema + TUNE_REPORT_ADD_SCHEMA

    def get_rows() -> Iterable[Any]:
        for row in self._compute_transformer(
            df.as_local().as_dict_iterable(),
            entrypoint=entrypoint,
            stop_check_interval=_interval,
        ):
            yield [row[k] for k in out_schema.names]

    # TODO: need to add back execution_engine for engine aware optimizers
    # t._execution_engine = engine  # type:ignore
    return ArrayDataFrame(get_rows(), out_schema)
def test_io(self):
    e = self.engine
    b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
    path = os.path.join(self.tmpdir, "a")
    e.save_df(b, path, format_hint="parquet", force_single=True)
    assert e.fs.isfile(path)
    c = e.load_df(path, format_hint="parquet", columns=["a", "c"])
    df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)

    path = os.path.join(self.tmpdir, "b.csv")
    e.save_df(b, path, header=True)
    c = e.load_df(path, header=True, columns="c:int,a:long")
    df_eq(c, b, throw=True)

    # read multiple csv files from a folder using a wildcard
    fpath = os.path.join(self.tmpdir, "f.csv")
    e.fs.makedir(fpath)
    e.save_df(b, os.path.join(fpath, "1.csv"), header=True, force_single=True)
    e.save_df(b, os.path.join(fpath, "2.csv"), header=True, force_single=True)
    r = e.load_df(os.path.join(fpath, "*.csv"), header=True, infer_schema=False)
    df_eq(
        r,
        [["6", "1"], ["2", "7"], ["6", "1"], ["2", "7"]],
        "c:str,a:str",
        throw=True,
    )
    # read multiple csv files with/without schema inference
    e.fs.touch(os.path.join(fpath, "_SUCCESS"))
    r = e.load_df(fpath, header=True, infer_schema=False)
    df_eq(
        r,
        [["6", "1"], ["2", "7"], ["6", "1"], ["2", "7"]],
        "c:str,a:str",
        throw=True,
    )
    r = e.load_df(fpath, header=True, infer_schema=True)
    assert sorted(r.as_array()) == sorted([[6, 1], [2, 7], [6, 1], [2, 7]])
    # write a single file to overwrite the folder
    assert e.fs.isdir(fpath)
    e.save_df(r, fpath, force_single=True, header=True)
    assert e.fs.isfile(fpath)
    r = e.load_df(fpath, header=True, infer_schema=True)
    assert sorted(r.as_array()) == sorted([[6, 1], [2, 7], [6, 1], [2, 7]])
def test_aggregate(self):
    e = self.engine
    o = ArrayDataFrame(
        [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]],
        "a:double,b:int",
        dict(a=1),
    )
    a = e.to_df(o)
    b = e.aggregate(
        df=a,
        partition_spec=None,
        agg_cols=[
            ff.max(col("b")),
            (ff.max(col("b")) * 2).cast("int32").alias("c"),
        ],
    )
    df_eq(b, [[4, 8]], "b:int,c:int", throw=True)

    b = e.aggregate(
        df=a,
        partition_spec=PartitionSpec(by=["a"]),
        agg_cols=[
            ff.max(col("b")),
            (ff.max(col("b")) * 2).cast("int32").alias("c"),
        ],
    )
    df_eq(
        b,
        [[None, 4, 8], [1, 2, 4], [3, 4, 8]],
        "a:double,b:int,c:int",
        throw=True,
    )

    with raises(ValueError):
        e.aggregate(
            df=a,
            partition_spec=PartitionSpec(by=["a"]),
            agg_cols=[ff.max(col("b")), lit(1)],
        )

    with raises(ValueError):
        e.aggregate(
            df=a,
            partition_spec=PartitionSpec(by=["a"]),
            agg_cols=[],
        )
def test_save_and_load_json(self):
    e = self.engine
    b = ArrayDataFrame([[6, 1], [3, 4], [2, 7], [4, 8], [6, 7]], "c:int,a:long")
    path = os.path.join(self.tmpdir, "a", "b")
    e.save_df(
        e.repartition(e.to_df(b), PartitionSpec(num=2)),
        path,
        format_hint="json",
    )
    c = e.load_df(
        path,
        format_hint="json",
        columns=["a", "c"],
    )
    df_eq(c, [[1, 6], [7, 2], [4, 3], [8, 4], [7, 6]], "a:long,c:long", throw=True)