def test_get_join_schemas():
    a = ArrayDataFrame([], "a:int,b:int")
    b = ArrayDataFrame([], "c:int")
    c = ArrayDataFrame([], "d:str,a:int")
    i, u = get_join_schemas(a, b, how="cross", on=[])
    assert i == ""
    assert u == "a:int,b:int,c:int"
    raises(NoneArgumentError, lambda: get_join_schemas(a, b, how=None, on=[]))
    raises(ValueError, lambda: get_join_schemas(a, b, how="x", on=[]))
    raises(SchemaError, lambda: get_join_schemas(a, b, how="CROSS", on=["a"]))
    raises(SchemaError, lambda: get_join_schemas(a, c, how="CROSS", on=["a"]))
    raises(SchemaError, lambda: get_join_schemas(a, c, how="CROSS", on=[]))
    raises(SchemaError, lambda: get_join_schemas(a, b, how="inner", on=["a"]))
    raises(ValueError, lambda: get_join_schemas(a, c, how="outer", on=["a"]))
    i, u = get_join_schemas(a, c, how="inner", on=["a"])
    assert i == "a:int"
    assert u == "a:int,b:int,d:str"
    i, u = get_join_schemas(a, c, how="inner", on=[])  # infer
    assert i == "a:int"
    assert u == "a:int,b:int,d:str"
    a = ArrayDataFrame([], "a:int,b:int,c:int")
    b = ArrayDataFrame([], "c:int,b:int,x:int")
    raises(SchemaError, lambda: get_join_schemas(a, b, how="inner", on=["a"]))
    i, u = get_join_schemas(a, b, how="inner", on=["c", "b"])
    assert i == "b:int,c:int"
    assert u == "a:int,b:int,c:int,x:int"
    for how in ["SEMI", "LEFT_Semi", "Anti", "left_Anti"]:
        i, u = get_join_schemas(c, a, how=how, on=["a"])
        assert i == "a:int"
        assert u == "d:str,a:int"
def run(self, cursor: PartitionCursor, dfs: DataFrames) -> LocalDataFrame:
    self.transformer._cursor = cursor  # type: ignore
    try:
        # force full execution of the (possibly lazy) transform result;
        # the result itself is discarded, a dummy empty frame is returned
        # either way
        to_local_bounded_df(self.transformer.transform(dfs))
        return ArrayDataFrame([], self.transformer.output_schema)
    except self.ignore_errors:  # type: ignore
        return ArrayDataFrame([], self.transformer.output_schema)
def _generate_comap_empty_dfs(schemas: Any, named: bool) -> DataFrames:
    if named:
        return DataFrames({k: ArrayDataFrame([], v) for k, v in schemas.items()})
    else:
        return DataFrames([ArrayDataFrame([], v) for v in schemas.values()])
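# A minimal usage sketch (hypothetical call site; in real code the schemas
# mapping comes from the comap serialization step). It shows that named=True
# preserves keys for dict-style access, while named=False keeps only order.
def _demo_generate_comap_empty_dfs() -> None:
    schemas = {"df1": "a:int", "df2": "b:str"}
    named = _generate_comap_empty_dfs(schemas, named=True)
    assert named.has_key and named["df1"].schema == "a:int"
    positional = _generate_comap_empty_dfs(schemas, named=False)
    assert not positional.has_key and positional[1].schema == "b:str"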
def test_union(self):
    with self.dag() as dag:
        a = dag.df([[1, 10], [2, None], [2, None]], "x:long,y:double")
        b = dag.df([[2, None], [2, 20]], "x:long,y:double")
        c = dag.df([[1, 10], [2, 20]], "x:long,y:double")
        a.union().assert_eq(a)
        a.union(b, c).assert_eq(
            ArrayDataFrame(
                [[1, 10], [2, None], [2, 20]],
                "x:long,y:double",
            )
        )
        a.union(b, c, distinct=False).assert_eq(
            ArrayDataFrame(
                [
                    [1, 10],
                    [2, None],
                    [2, None],
                    [2, None],
                    [2, 20],
                    [1, 10],
                    [2, 20],
                ],
                "x:long,y:double",
            )
        )
def test_subtract(self):
    with self.dag() as dag:
        a = dag.df([[1, 10], [2, None], [2, None]], "x:long,y:double")
        b = dag.df([[2, None], [2, 20]], "x:long,y:double")
        c = dag.df([[1, 10], [2, 20]], "x:long,y:double")
        a.subtract(b).assert_eq(
            ArrayDataFrame(
                [[1, 10]],
                "x:long,y:double",
            )
        )
        a.subtract(c).assert_eq(
            ArrayDataFrame(
                [[2, None]],
                "x:long,y:double",
            )
        )
        # TODO: EXCEPT ALL is not implemented (QPD issue)
        # a.subtract(c, distinct=False).assert_eq(
        #     ArrayDataFrame(
        #         [[2, None], [2, None]],
        #         "x:long,y:double",
        #     )
        # )
        a.subtract(b, c).assert_eq(
            ArrayDataFrame(
                [],
                "x:long,y:double",
            )
        )
def test_df_eq():
    df1 = ArrayDataFrame([[0, 100.0, "a"]], "a:int,b:double,c:str", dict(a=1))
    df2 = ArrayDataFrame([[0, 100.001, "a"]], "a:int,b:double,c:str", dict(a=2))
    assert df_eq(df1, df1)
    assert df_eq(df1, df2, digits=4, check_metadata=False)
    # metadata
    assert not df_eq(df1, df2, digits=4, check_metadata=True)
    # precision
    assert not df_eq(df1, df2, digits=6, check_metadata=False)
    # no content
    assert df_eq(df1, df2, digits=6, check_metadata=False, check_content=False)
    raises(AssertionError, lambda: df_eq(df1, df2, throw=True))
    df1 = ArrayDataFrame([[100.0, "a"]], "a:double,b:str", dict(a=1))
    assert df_eq(df1, df1.as_pandas(), df1.schema, df1.metadata)
    df1 = ArrayDataFrame([[None, "a"]], "a:double,b:str", dict(a=1))
    assert df_eq(df1, df1)
    df1 = ArrayDataFrame([[None, "a"]], "a:double,b:str", dict(a=1))
    df2 = ArrayDataFrame([[np.nan, "a"]], "a:double,b:str", dict(a=1))
    assert df_eq(df1, df2)
    df1 = ArrayDataFrame([[100.0, None]], "a:double,b:str", dict(a=1))
    df2 = ArrayDataFrame([[100.0, None]], "a:double,b:str", dict(a=1))
    assert df_eq(df1, df2)
    df1 = ArrayDataFrame([[0], [1]], "a:int")
    df2 = ArrayDataFrame([[1], [0]], "a:int")
    assert df_eq(df1, df2)
    assert not df_eq(df1, df2, check_order=True)
def test_col_ops(self):
    with self.dag() as dag:
        a = dag.df([[1, 10], [2, 20]], "x:long,y:long")
        aa = dag.df([[1, 10], [2, 20]], "xx:long,y:long")
        a.rename({"x": "xx"}).assert_eq(aa)
        a[["x"]].assert_eq(ArrayDataFrame([[1], [2]], "x:long"))
        a.drop(["y", "yy"], if_exists=True).assert_eq(
            ArrayDataFrame([[1], [2]], "x:long")
        )
        a[["x"]].rename(x="xx").assert_eq(ArrayDataFrame([[1], [2]], "xx:long"))
def test_intersect(self):
    with self.dag() as dag:
        a = dag.df([[1, 10], [2, None], [2, None]], "x:long,y:double")
        b = dag.df([[2, None], [2, 20]], "x:long,y:double")
        c = dag.df([[1, 10], [2, 20]], "x:long,y:double")
        # d = dag.df([[1, 10], [2, 20], [2, None]], "x:long,y:double")
        a.intersect(b).assert_eq(
            ArrayDataFrame(
                [[2, None]],
                "x:long,y:double",
            )
        )
        a.intersect(b, c).assert_eq(
            ArrayDataFrame(
                [],
                "x:long,y:double",
            )
        )
def f30(
    e: EmptyAwareIterable[List[Any]], a: EmptyAwareIterable[Dict[str, Any]]
) -> LocalDataFrame:
    e.peek()
    a.peek()
    arr = list(e)
    arr += [[x["a"]] for x in a]
    return ArrayDataFrame(arr, "a:int")
def to_local_bounded_df(
    df: Any, schema: Any = None, metadata: Any = None
) -> LocalBoundedDataFrame:
    """Convert a data structure to
    :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame,
      or a list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
      :class:`~fugue.dataframe.dataframe.DataFrame` type
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is a
      :class:`~fugue.dataframe.dataframe.DataFrame` but ``schema`` or
      ``metadata`` is set
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's a
      :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame`, else a
      converted one

    .. admonition:: Examples

        >>> a = IterableDataFrame([[0,'a'],[1,'b']],"a:int,b:str")
        >>> assert isinstance(to_local_bounded_df(a), LocalBoundedDataFrame)
        >>> to_local_bounded_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))

    .. note::

        Compared to :func:`.to_local_df`, this function also makes sure the
        dataframe is bounded, so an
        :class:`~fugue.dataframe.iterable_dataframe.IterableDataFrame` will be
        converted even though it's already local.
    """
    df = to_local_df(df, schema, metadata)
    if isinstance(df, LocalBoundedDataFrame):
        return df
    return ArrayDataFrame(df.as_array(), df.schema, df.metadata)
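# A minimal sketch (hypothetical demo; assumes to_local_df and the dataframe
# classes above are in scope) contrasting the two converters: an
# IterableDataFrame is already local, so to_local_df returns it unchanged,
# but it is unbounded, so to_local_bounded_df materializes it.
def _demo_bounded_vs_local() -> None:
    idf = IterableDataFrame([[0, "a"], [1, "b"]], "a:int,b:str")
    assert to_local_df(idf) is idf
    assert isinstance(to_local_bounded_df(idf), ArrayDataFrame)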
def to_local_df(df: Any, schema: Any = None, metadata: Any = None) -> LocalDataFrame:
    """Convert a data structure to
    :class:`~fugue.dataframe.dataframe.LocalDataFrame`

    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFrame,
      or a list or iterable of arrays
    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
      :class:`~fugue.dataframe.dataframe.DataFrame` type
    :param metadata: dict-like object with string keys, defaults to None
    :raises ValueError: if ``df`` is a
      :class:`~fugue.dataframe.dataframe.DataFrame` but ``schema`` or
      ``metadata`` is set
    :raises TypeError: if ``df`` is not compatible
    :return: the dataframe itself if it's a
      :class:`~fugue.dataframe.dataframe.LocalDataFrame`, else a converted one

    :Examples:

        >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
        >>> assert to_local_df(a) is a
        >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
    """
    assert_arg_not_none(df, "df")
    if isinstance(df, DataFrame):
        aot(
            schema is None and metadata is None,
            ValueError("schema and metadata must be None when df is a DataFrame"),
        )
        return df.as_local()
    if isinstance(df, pd.DataFrame):
        return PandasDataFrame(df, schema, metadata)
    if isinstance(df, List):
        return ArrayDataFrame(df, schema, metadata)
    if isinstance(df, Iterable):
        return IterableDataFrame(df, schema, metadata)
    raise TypeError(f"{df} cannot convert to a LocalDataFrame")
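# Dispatch sketch (hypothetical demo mirroring the isinstance chain above):
# pandas input becomes PandasDataFrame, a list becomes ArrayDataFrame, and any
# other iterable becomes IterableDataFrame.
def _demo_to_local_df_dispatch() -> None:
    assert isinstance(to_local_df(pd.DataFrame({"a": [0]})), PandasDataFrame)
    assert isinstance(to_local_df([[0]], "a:int"), ArrayDataFrame)
    assert isinstance(to_local_df(iter([[0]]), "a:int"), IterableDataFrame)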
def test_use_special_df(tmpdir):
    # external non-workflowdataframe
    arr = ArrayDataFrame([[0], [1]], "a:int")
    fsql(
        """
        b=CREATE[[0], [1]] SCHEMA a: int
        a = SELECT * FROM a.x
        OUTPUT a, b USING assert_eq
        a = SELECT x.* FROM a.x AS x
        OUTPUT a, b USING assert_eq
        c=CREATE [[0,0],[1,1]] SCHEMA a:int,b:int
        d = SELECT x.*,y.a AS b FROM a.x x INNER JOIN a.x y ON x.a=y.a
        OUTPUT c, d USING assert_eq
        """,
        {"a.x": arr},
    ).run()

    # from yield file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")}
    )
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD FILE AS b")
        res = dag.yields["b"]
    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            a = SELECT * FROM a.x
            OUTPUT a, b USING assert_eq
            """,
            {"a.x": res},
        )
def test_workflow_dataframes():
    dag1 = FugueWorkflow()
    df1 = dag1.df([[0]], "a:int")
    df2 = dag1.df([[0]], "b:int")
    dag2 = FugueWorkflow()
    df3 = dag2.df([[0]], "a:int")
    dfs1 = WorkflowDataFrames(a=df1, b=df2)
    assert dfs1["a"] is df1
    assert dfs1["b"] is df2
    dfs2 = WorkflowDataFrames(dfs1, aa=df1, bb=df2)
    assert 4 == len(dfs2)
    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=df3)
    with raises(ValueError):
        WorkflowDataFrames(a=df1, b=ArrayDataFrame([[0]], "a:int"))
    dag = FugueWorkflow()
    df = dag.df([[0], [1]], "a:int")
    assert df.partition_spec.empty
    df2 = df.partition(by=["a"])
    assert df.partition_spec.empty
    assert df2.partition_spec == PartitionSpec(by=["a"])
def test_workflow():
    builder = FugueWorkflow()
    a = builder.create_data([[0], [0], [1]], "a:int")
    raises(InvalidOperationError, lambda: a._task.copy())
    raises(InvalidOperationError, lambda: copy.copy(a._task))
    raises(InvalidOperationError, lambda: copy.deepcopy(a._task))
    a.show()
    a.show()
    raises(FugueWorkflowCompileError, lambda: builder.df(123))
    b = a.transform(mock_tf1, "*,b:int", pre_partition=dict(by=["a"]))
    b.show()
    builder.create_data([[0], [1]], "b:int").show()
    c = ArrayDataFrame([[100]], "a:int")
    builder.show(a, b, c)
    b = a.partition(by=["a"]).transform(mock_tf2).persist().broadcast()
    b.show()
    builder.run()
    df_eq(a.result, [[0], [0], [1]], "a:int")
    raises(TypeError, lambda: builder.run("abc"))
    builder.run(FugueWorkflowContext())
    df_eq(a.result, [[0], [0], [1]], "a:int")
    builder.run("NativeExecutionEngine")
    df_eq(b.result, [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
    df_eq(b.compute(NativeExecutionEngine), [[0, 2], [0, 2], [1, 1]], "a:int,b:int")
def execute(self, ctx: TaskContext) -> None:
    self._outputter._execution_engine = self._get_execution_engine(ctx)
    if self._input_has_key:
        self._outputter.process(DataFrames(ctx.inputs))
    else:
        self._outputter.process(DataFrames(ctx.inputs.values()))
    # TODO: output dummy to force cache to work, should we fix adagio?
    ctx.outputs["_0"] = ArrayDataFrame([], "_0:int")
def transform(self, df):
    if not hasattr(self, "called"):
        self.called = 1
    else:
        self.called += 1
    n = self.params.get("n", 1)
    assert self.called <= n
    return ArrayDataFrame([[len(df.as_array())]], "c:int")
def test_distinct(self):
    with self.dag() as dag:
        a = dag.df([[1, 10], [2, None], [2, None]], "x:long,y:double")
        a.distinct().assert_eq(
            ArrayDataFrame(
                [[1, 10], [2, None]],
                "x:long,y:double",
            )
        )
def test_parquet_io(tmpdir):
    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
    df2 = ArrayDataFrame([[[1, 2]]], "a:[int]")
    # {a:int} will become {a:long} because pyarrow lib has issue
    df3 = ArrayDataFrame([[dict(a=1)]], "a:{a:long}")
    for df in [df1, df2, df3]:
        path = os.path.join(tmpdir, "a.parquet")
        save_df(df, path)
        actual = load_df(path)
        df_eq(df, actual, throw=True)

    save_df(df1, path)
    actual = load_df(path, columns=["b", "a"])
    df_eq(actual, [[2, "1"]], "b:int,a:str")
    actual = load_df(path, columns="b:str,a:int")
    df_eq(actual, [["2", 1]], "b:str,a:int")
    # can't specify wrong columns
    raises(Exception, lambda: load_df(path, columns="bb:str,a:int"))

    # load directory
    fs = FileSystem()
    folder = os.path.join(tmpdir, "folder")
    fs.makedirs(folder)
    f0 = os.path.join(folder, "_SUCCESS")
    f1 = os.path.join(folder, "1.parquet")
    f2 = os.path.join(folder, "3.parquet")
    fs.touch(f0)
    save_df(df1, f1)
    save_df(df1, f2)
    actual = load_df(folder, "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # load multiple paths
    actual = load_df([f1, f2], "parquet")
    df_eq(actual, [["1", 2, 3], ["1", 2, 3]], "a:str,b:int,c:long")

    # overwrite = False
    raises(FileExistsError, lambda: save_df(df1, f1, mode="error"))
    # can't overwrite directory
    raises(
        IsADirectoryError,
        lambda: save_df(df1, folder, format_hint="parquet", mode="overwrite"),
    )
    # wrong mode
    raises(NotImplementedError, lambda: save_df(df1, f1, mode="dummy"))
def rename(self, columns: Dict[str, str]) -> DataFrame:
    if self.empty:
        return ArrayDataFrame([], self.schema).rename(columns)

    def _transform() -> Iterable[DataFrame]:
        for df in self.native:
            yield df.rename(columns)

    return LocalDataFrameIterableDataFrame(_transform())
def _select_cols(self, keys: List[Any]) -> DataFrame:
    if self.empty:
        return ArrayDataFrame([], self.schema)[keys]

    def _transform():
        for df in self.native:
            yield df[keys]

    return LocalDataFrameIterableDataFrame(_transform())
def _drop_cols(self, cols: List[str]) -> DataFrame:
    if self.empty:
        return ArrayDataFrame([], self.schema)._drop_cols(cols)

    def _transform() -> Iterable[DataFrame]:
        for df in self.native:
            yield df._drop_cols(cols)

    return LocalDataFrameIterableDataFrame(_transform())
def test_to_local_bounded_df():
    df = ArrayDataFrame([[0, 1]], "a:int,b:int")
    idf = IterableDataFrame([[0, 1]], "a:int,b:int", dict(a=1))
    assert to_local_bounded_df(df) is df
    r = to_local_bounded_df(idf)
    assert r is not idf
    assert r.as_array() == [[0, 1]]
    assert r.schema == "a:int,b:int"
    assert r.metadata == dict(a=1)
def alter_columns(self, columns: Any) -> DataFrame:
    if self.empty:
        return ArrayDataFrame([], self.schema).alter_columns(columns)

    def _transform() -> Iterable[DataFrame]:
        for df in self.native:
            yield df.alter_columns(columns)

    return LocalDataFrameIterableDataFrame(_transform())
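# Note on the pattern shared by rename, _select_cols, _drop_cols, and
# alter_columns above: the empty case delegates to ArrayDataFrame([], self.schema)
# so argument validation and output-schema computation happen eagerly, while
# the non-empty case defers work into a generator that transforms one chunk at
# a time, never materializing the whole stream. A standalone sketch of the
# lazy half (hypothetical helper, not part of the class):
def _map_chunks(dfs: Iterable[DataFrame], keys: List[Any]) -> Iterable[DataFrame]:
    for df in dfs:
        # each chunk is projected only when the consumer iterates
        yield df[keys]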
def test_nan_none():
    df = ArrayDataFrame([[None, None]], "b:str,c:double")
    assert df.as_pandas().iloc[0, 0] is None
    arr = PandasDataFrame(df.as_pandas(), df.schema).as_array()[0]
    assert arr[0] is None
    assert math.isnan(arr[1])

    df = ArrayDataFrame([[None, None]], "b:int,c:bool")
    arr = PandasDataFrame(df.as_pandas(), df.schema).as_array(type_safe=True)[0]
    assert arr[0] is None
    assert arr[1] is None

    df = ArrayDataFrame([["a", 1.1], [None, None]], "b:str,c:double")
    arr = PandasDataFrame(df.as_pandas(), df.schema).as_array(type_safe=True)[1]
    assert arr[0] is None
    assert arr[1] is None
def _get_dfs(self, row: Any) -> Iterable[Any]:
    for k, name, v in self.df_idx:
        if row[k] is None:
            df: DataFrame = ArrayDataFrame([], v)
        else:
            df = deserialize_df(row[k])  # type: ignore
            assert df is not None
        if self.named:
            yield name, df
        else:
            yield df
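# Layout note (inferred from the loop above, not an authoritative spec):
# self.df_idx is expected to be a sequence of (column-index, name, schema)
# triples, and each input row holds either a serialized dataframe at that
# column or None, which stands for an empty dataframe with the recorded schema.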
def test_to_df(self):
    e = self.engine
    o = ArrayDataFrame([[1, 2]], "a:int,b:int", dict(a=1))
    a = e.to_df(o)
    assert a is not o
    df_eq(a, o, throw=True)
    a = e.to_df([[1, None]], "a:int,b:int", dict(a=1))
    df_eq(a, [[1, None]], "a:int,b:int", dict(a=1), throw=True)
def get(self, key: str):
    if self.dummy:
        return True, False, ArrayDataFrame([[100]], "a:int")
    self.get_called += 1
    if key not in self.tb:
        print("not get", key)
        return False, False, None
    x = self.tb[key]
    print("get", key)
    self.hit += 1
    return True, x[0], x[1]
def test_use_df(tmpdir):
    # df generated inside dag
    with FugueSQLWorkflow() as dag:
        a = dag.df([[0], [1]], "a:int")
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            OUTPUT a, b USING assert_eq
            """
        )
        dag.sql_vars["b"].assert_eq(a)

    # external non-workflowdataframe
    arr = ArrayDataFrame([[0], [1]], "a:int")
    with FugueSQLWorkflow() as dag:
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            OUTPUT a, b USING assert_eq
            """,
            a=arr,
        )
        dag.sql_vars["b"].assert_eq(dag.df([[0], [1]], "a:int"))

    # from yield file
    engine = NativeExecutionEngine(
        conf={"fugue.workflow.checkpoint.path": os.path.join(tmpdir, "ck")}
    )
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD FILE AS b")
        res = dag.yields["b"]
    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            OUTPUT a, b USING assert_eq
            """,
            a=res,
        )

    # from yield dataframe
    engine = NativeExecutionEngine()
    with FugueSQLWorkflow(engine) as dag:
        dag("CREATE[[0], [1]] SCHEMA a: int YIELD DATAFRAME AS b")
        res = dag.yields["b"]
    with FugueSQLWorkflow(engine) as dag:
        dag(
            """
            b=CREATE[[0], [1]] SCHEMA a: int
            OUTPUT a, b USING assert_eq
            """,
            a=res,
        )
def execute(self, ctx: TaskContext) -> None:
    self._outputter._execution_engine = self._get_execution_engine(ctx)
    if self._input_has_key:
        inputs = DataFrames(ctx.inputs)
    else:
        inputs = DataFrames(ctx.inputs.values())

    def exe():
        self._outputter.validate_on_runtime(inputs)
        self._outputter.process(inputs)

    self._execute_with_modified_traceback(exe)
    # TODO: output dummy to force cache to work, should we fix adagio?
    ctx.outputs["_0"] = ArrayDataFrame([], "_0:int")
def transform(self, dfs: DataFrames) -> LocalDataFrame:
    assert 1 == self.on_init_called
    assert "test" in self.workflow_conf
    assert 2 == len(dfs)
    if self.params.get("named", False):
        assert dfs.has_key
    else:
        assert not dfs.has_key
    row = self.cursor.key_value_array + [
        dfs[0].count(),
        dfs[1].count(),
        self.params.get("p", 1),
    ]
    return ArrayDataFrame([row], self.output_schema)