def _convert_pyarrow_to_avro_schema(pdf: pd.DataFrame, columns: Any = None) -> Dict:
    """Convert a pyarrow-style schema expression to an Avro record schema.

    pyarrow schema: 'station:str,time:long,temp:int'
    avro schema: {
        'type': 'record',
        'name': 'Root',
        'fields': [
            {'name': 'station', 'type': 'string'},
            {'name': 'time', 'type': 'long'},
            {'name': 'temp', 'type': 'int'},
        ],
    }
    """
    inferred_schema = Schema(columns)
    # pandas_dtype maps column name -> np.dtype
    inferred_fields = [
        {"name": k, "type": v} for k, v in inferred_schema.pandas_dtype.items()
    ]
    for field in inferred_fields:
        if "complex" in str(field["type"]):
            field["type"] = [
                "null",
                pdx.__complex_field_infer(pdf, field["name"], {}),
            ]
    return {"type": "record", "name": "Root", "fields": inferred_fields}
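# A hedged usage sketch of the converter above (hypothetical data; assumes
# triad's "name:type,..." schema expression syntax):
def _example_convert_to_avro_schema() -> None:
    pdf = pd.DataFrame({"station": ["s1"], "time": [1], "temp": [20]})
    schema = _convert_pyarrow_to_avro_schema(pdf, "station:str,time:long,temp:int")
    assert schema["type"] == "record" and schema["name"] == "Root"
    assert [f["name"] for f in schema["fields"]] == ["station", "time", "temp"]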
def _get_altered_schema(self, subschema: Any) -> Schema:
    sub = Schema(subschema)
    assert_or_throw(
        sub.names in self.schema,
        lambda: FugueDataFrameOperationError(
            f"{sub.names} are not all in {self.schema}"
        ),
    )
    for k, v in sub.items():
        old_type = self.schema[k].type
        new_type = v.type
        if not old_type.equals(new_type):
            assert_or_throw(
                not pa.types.is_struct(old_type)
                and not pa.types.is_list(old_type)
                and not pa.types.is_binary(old_type),
                lambda: NotImplementedError(f"can't convert from {old_type}"),
            )
            assert_or_throw(
                not pa.types.is_struct(new_type)
                and not pa.types.is_list(new_type)
                and not pa.types.is_binary(new_type),
                lambda: NotImplementedError(f"can't convert to {new_type}"),
            )
    return Schema([(k, sub.get(k, v)) for k, v in self.schema.items()])
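# A hedged illustration of the contract above, assuming
# self.schema == Schema("a:int,b:str,c:[int]"):
#   self._get_altered_schema("a:str")  ->  Schema("a:str,b:str,c:[int]")
#   self._get_altered_schema("c:str")  ->  NotImplementedError (list source)
#   self._get_altered_schema("x:str")  ->  FugueDataFrameOperationError (x missing)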
def _load_csv(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    kw = dict(kwargs)
    header = kw.pop("header", False)
    if str(header) in ["True", "0"]:  # first row is the header
        pdf = pd.read_csv(p.uri, **{"index_col": False, "header": 0, **kw})
        if columns is None:
            return pdf, None
        if isinstance(columns, list):  # column names
            return pdf[columns], None
        schema = Schema(columns)
        return pdf[schema.names], schema
    if header is None or str(header) == "False":  # no header row
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # column names
            pdf = pd.read_csv(
                p.uri, **{"index_col": False, "header": None, "names": columns, **kw}
            )
            return pdf, None
        schema = Schema(columns)
        pdf = pd.read_csv(
            p.uri,
            **{"index_col": False, "header": None, "names": schema.names, **kw},
        )
        return pdf, schema
    raise NotImplementedError(f"{header} is not supported")
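# A hedged usage sketch of the header/columns combinations (paths are
# hypothetical; FileParser is assumed to accept a path string):
def _example_load_csv() -> None:
    # header row present: column selection is by name
    pdf, _ = _load_csv(FileParser("/tmp/data.csv"), columns=["a", "b"], header=True)
    # no header row: columns must supply names (and optionally types)
    pdf, schema = _load_csv(
        FileParser("/tmp/data.csv"), columns="a:str,b:int", header=False
    )
    assert schema == Schema("a:str,b:int")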
def __init__(self, schema: Schema, spec: PartitionSpec, physical_partition_no: int):
    self._orig_schema = schema
    self._key_index = [schema.index_of_key(key) for key in spec.partition_by]
    self._schema = schema.extract(spec.partition_by)
    self._physical_partition_no = physical_partition_no
    # the following will be set by the framework
    self._row: List[Any] = []
    self._partition_no = 0
    self._slice_no = 0
def test_init_basic(self):
    raises(FugueDataFrameInitError, lambda: self.df())
    raises(FugueDataFrameInitError, lambda: self.df([]))
    raises(FugueDataFrameInitError, lambda: self.df([[]], Schema()))
    raises(FugueDataFrameInitError, lambda: self.df([[1]], Schema()))
    # raises(SchemaError, lambda: self.df([[1]]))  # schema can be inferred
    df = self.df([], "a:str,b:int")
    assert df.empty
def test_schema_update_delete():
    s = Schema("a:int,b:str,c:int")
    with raises(SchemaError):
        del s["a"]
    with raises(SchemaError):
        del s["x"]
    with raises(SchemaError):
        s["a"] = str
    raises(SchemaError, lambda: s.pop("a"))
    raises(SchemaError, lambda: s.popitem("a"))
    raises(SchemaError, lambda: s.update(dict(a=int)))
def _parse_schema(self, obj: Any, dfs: DataFrames) -> Schema:
    if callable(obj):
        return obj(dfs, **self.params)
    if isinstance(obj, str):
        return Schema(obj)
    if isinstance(obj, List):
        s = Schema()
        for x in obj:
            s += self._parse_schema(x, dfs)
        return s
    return Schema(obj)
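# The accepted forms, illustrated (dfs is whatever DataFrames the workflow
# passes in; the callable form receives it plus self.params and should itself
# return a Schema):
#   self._parse_schema("a:int", dfs)                          -> Schema("a:int")
#   self._parse_schema(["a:int", "b:str"], dfs)               -> Schema("a:int,b:str")
#   self._parse_schema(lambda dfs, **p: Schema("a:int"), dfs) -> Schema("a:int")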
def test_schema_eq():
    s = Schema("a:int,b:str")
    assert s != None
    assert not (s == None)
    assert s == s
    assert s == Schema("a:int,b:str")
    assert not (s == Schema("b:str,a:int"))  # field order matters
    assert s == ["a:int", "b:str"]
    assert s != ["a:long", "b:str"]
    assert not (s == ["a:long", "b:str"])
    assert s == [("a", "int"), ("b", str)]
    assert s == OrderedDict([("a", "int"), ("b", str)])
def _test_as_array_perf():
    s = Schema()
    arr = []
    for i in range(100):
        s.append(f"a{i}:int")
        arr.append(i)
    for i in range(100):
        s.append(f"b{i}:int")
        arr.append(float(i))  # floats against int columns exercise coercion
    for i in range(100):
        s.append(f"c{i}:str")
        arr.append(str(i))
    data = []
    for i in range(5000):
        data.append(list(arr))
    df = SparkDataFrame(data, s)
    res = df.as_array()  # warm up both paths before timing
    res = df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for i in range(10):
        t = datetime.now()
        res = df.as_array()
        nts += (datetime.now() - t).total_seconds()
        t = datetime.now()
        res = df.as_array(type_safe=True)
        ts += (datetime.now() - t).total_seconds()
    print(nts, ts)
def test_alter_columns():
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    a.alter_columns(Schema("a:str,b:str"))
    a.alter_columns(Schema("a:float,b:double"))
    assert_eq(
        """
        a=create using mock_create1
        alter columns a:str, b:str
        alter columns a:float, b:double from a
        """,
        dag,
    )
def _load_parquet(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[dd.DataFrame, Any]:
    if columns is None:
        pdf = dd.read_parquet(p.uri, **kwargs)
        schema = Schema(pdf.head(1))
        return pdf, schema
    if isinstance(columns, list):  # column names
        pdf = dd.read_parquet(p.uri, columns=columns, **kwargs)
        schema = Schema(pdf.head(1))
        return pdf, schema
    schema = Schema(columns)
    pdf = dd.read_parquet(p.uri, columns=schema.names, **kwargs)
    return pdf, schema
def get_key_schema(self, schema: Schema) -> Schema:
    """Get the partition keys schema

    :param schema: the dataframe schema this partition spec operates on
    :return: the sub-schema containing only the partition keys
    """
    return schema.extract(self.partition_by)
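# A hedged usage sketch (assumes PartitionSpec accepts the `by` keyword):
def _example_get_key_schema() -> None:
    spec = PartitionSpec(by=["b", "a"])
    # keys come back in partition order, typed from the source schema
    assert spec.get_key_schema(Schema("a:int,b:str,c:bool")) == "b:str,a:int"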
def _load_avro(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    kw = ParamDict(kwargs)
    process_record = None
    if "process_record" in kw:
        process_record = kw["process_record"]
        del kw["process_record"]
    with open(p.uri, "rb") as fp:
        # configure the Avro reader
        avro_reader = reader(fp)
        # load records in memory, applying the optional per-record hook
        if process_record:
            records = [process_record(r) for r in avro_reader]
        else:
            records = list(avro_reader)
    # populate a pandas DataFrame with the records
    pdf = pd.DataFrame.from_records(records)
    if columns is None:
        return pdf, None
    if isinstance(columns, list):  # column names
        return pdf[columns], None
    schema = Schema(columns)
    return pdf[schema.names], schema
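# A hedged usage sketch of the per-record hook (path and field names are
# hypothetical):
def _example_load_avro() -> None:
    pdf, schema = _load_avro(
        FileParser("/tmp/data.avro"),
        columns="station:str,temp:int",
        process_record=lambda r: {**r, "station": r["station"].upper()},
    )
    assert schema == Schema("station:str,temp:int")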
def assert_eq(expr, expected=None):
    sql = FugueSQL(expr, "fugueSchema", ignore_case=True)
    v = _VisitorBase(sql)
    obj = v.visit(sql.tree)
    if expected is None:
        expected = expr
    assert Schema(expected) == obj
def test__to_transformer_determinism():
    a = _to_transformer(t1, None)
    b = _to_transformer(t1, None)
    c = _to_transformer("t1", None)
    assert a is not b
    assert to_uuid(a) == to_uuid(b)
    assert a is not c
    assert to_uuid(a) == to_uuid(c)

    a = _to_transformer(t4, "*,b:int")
    b = _to_transformer("t4", "*,b:int")
    assert a is not b
    assert to_uuid(a) == to_uuid(b)

    a = _to_transformer(t4, "a:int,b:int")
    b = _to_transformer("t4", Schema("a:int,b:int"))
    assert a is not b
    assert to_uuid(a) == to_uuid(b)

    a = _to_transformer(MockTransformer)
    b = _to_transformer("MockTransformer")
    assert a is not b
    assert to_uuid(a) == to_uuid(b)

    a = _to_transformer(t10)
    b = _to_transformer("t10")
    assert a is not b
    assert to_uuid(a) == to_uuid(b)
def _test_convert_nested(orig, expected_type, expected_value):
    a = [[orig]]
    s = Schema("a:" + expected_type).pa_schema
    x = list(apply_schema(s, a, deep=True))[0]
    y = list(apply_schema(s, a, copy=False, deep=True))[0]
    for b in [x, y]:
        assert expected_value == b[0]
    assert x is not a[0]
    assert y is a[0]
def _load_json(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    pdf = _safe_load_json(p.uri, **kwargs).reset_index(drop=True)
    if columns is None:
        return pdf, None
    if isinstance(columns, list):  # column names
        return pdf[columns], None
    schema = Schema(columns)
    return pdf[schema.names], schema
def test_schema_properties():
    s = Schema("a:int,b:str")
    assert ["a", "b"] == s.names
    assert [pa.int32(), pa.string()] == s.types
    assert [pa.field("a", pa.int32()), pa.field("b", pa.string())] == s.fields
    assert (
        pa.schema([pa.field("a", pa.int32()), pa.field("b", pa.string())])
        == s.pyarrow_schema
    )
    assert s.pa_schema == s.pyarrow_schema  # pa_schema is an alias
    assert dict(a=np.int32, b=np.dtype(str)) == s.pd_dtype
    assert s.pandas_dtype == s.pd_dtype  # pandas_dtype is an alias
def transform(self, df: DataFrame, tf: Transformer) -> DataFrame:
    tf._key_schema = self.partition_spec.get_key_schema(df.schema)  # type: ignore
    tf._output_schema = Schema(tf.get_output_schema(df))  # type: ignore
    tr = _TransformerRunner(df, tf, self._ignore_errors)  # type: ignore
    return self.execution_engine.map(
        df=df,
        map_func=tr.run,
        output_schema=tf.output_schema,  # type: ignore
        partition_spec=tf.partition_spec,
        on_init=tr.on_init,
    )
def _apply_schema(
    self, pdf: pd.DataFrame, schema: Optional[Schema], type_safe: bool = True
) -> Tuple[pd.DataFrame, Schema]:
    if not type_safe:
        assert_arg_not_none(pdf, "pdf")
        assert_arg_not_none(schema, "schema")
        return pdf, schema
    DASK_UTILS.ensure_compatible(pdf)
    if pdf.columns.dtype == "object":  # pdf has named schema
        pschema = Schema(DASK_UTILS.to_schema(pdf))
        if schema is None or pschema == schema:
            return pdf, pschema.assert_not_empty()
        pdf = pdf[schema.assert_not_empty().names]
    else:  # pdf has no named schema
        schema = _input_schema(schema).assert_not_empty()
        assert_or_throw(
            pdf.shape[1] == len(schema),
            ValueError(f"Pandas dataframe column count doesn't match {schema}"),
        )
        pdf.columns = schema.names
    return DASK_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema
def test_transform():
    w = (
        FugueWorkflow()
        .df([[0], [1]], "a:int", data_determiner=to_uuid)
        .transform(mock_transformer, schema=Schema("a:int"), params=dict(n=2))
    )
    assert_eq(
        """
        create [[0],[1]] schema a:int
        transform using mock_transformer(n=2) schema a:int
        """,
        w.workflow,
    )

    w = (
        FugueWorkflow()
        .df([[0], [1]], "a:int", data_determiner=to_uuid)
        .partition(by=["a"], presort="b DESC", num="ROWCOUNT/2")
        .transform(mock_transformer, schema="*", params=dict(n=2))
    )
    assert_eq(
        """
        create [[0],[1]] schema a:int
        transform prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2) schema *
        """,
        w.workflow,
    )

    def _func(a: int, b: int) -> int:
        return a + b

    w = (
        FugueWorkflow()
        .df([[0], [1]], "a:int", data_determiner=to_uuid)
        .partition(by=["a"], presort="b DESC", num="ROWCOUNT/2")
        .transform(mock_transformer, schema="*", params=dict(n=2), callback=_func)
    )
    assert_eq(
        """
        create [[0],[1]] schema a:int
        transform prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2) schema * callback _func
        """,
        w.workflow,
    )
def test_schema_contains():
    s = Schema("a:int,b:str")
    assert None not in s
    assert s in s
    assert "a" in s
    assert "c" not in s
    assert "a:int" in s
    assert "a:long" not in s
    assert pa.field("a", pa.int32()) in s
    assert pa.field("aa", pa.int32()) not in s
    assert pa.field("a", pa.int64()) not in s
    assert ["a", ("b", str)] in s
    assert ["a", ("b", int)] not in s
def _load_parquet(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    if columns is None:
        pdf = pd.read_parquet(p.uri, **{"engine": "pyarrow", **kwargs})
        return pdf, None
    if isinstance(columns, list):  # column names
        pdf = pd.read_parquet(
            p.uri, columns=columns, **{"engine": "pyarrow", **kwargs}
        )
        return pdf, None
    schema = Schema(columns)
    pdf = pd.read_parquet(
        p.uri, columns=schema.names, **{"engine": "pyarrow", **kwargs}
    )
    return pdf, schema
def test_schema_operators():
    s = Schema("a:int,b:str,c:int")
    s += "d:int"
    t = s + "e:int"
    t += ""
    assert s == "a:int,b:str,c:int,d:int"
    assert t == "a:int,b:str,c:int,d:int,e:int"
    t = s - ""
    assert t == s
    t = s - ["a", "c"]
    assert t == "b:str,d:int"
    with raises(SchemaError):
        t -= "a"
    assert t == "b:str,d:int"
def _test_convert(orig, expected_type, expected_value):
    a = [[orig]]
    s = Schema("a:" + expected_type).pa_schema
    x = list(apply_schema(s, a))[0]
    y = list(apply_schema(s, a, copy=False))[0]
    for b in [x, y]:
        if isinstance(expected_value, float) and math.isnan(expected_value):
            assert math.isnan(b[0])
        elif expected_value is pd.NaT:
            assert b[0] is pd.NaT
        else:
            assert expected_value == b[0]
    assert x is not a[0]
    assert y is a[0]
def test_schema_rename():
    s = Schema("a:int,b:str,c:bool").rename(columns=dict(a="c", c="a"))
    assert s == "c:int,b:str,a:bool"
    s = Schema("a:int,b:str,c:bool").rename(
        columns=dict(a="c", c="a"), ignore_missing=True
    )
    assert s == "c:int,b:str,a:bool"
    raises(SchemaError, lambda: s.rename(dict(x="b")))
    raises(SchemaError, lambda: s.rename(dict(a="b")))
    raises(SchemaError, lambda: s.rename(dict(a=123)))
def get_partitioner(self, schema: Schema) -> SchemaedDataPartitioner:
    """Get :class:`~triad.utils.pyarrow.SchemaedDataPartitioner` by input
    dataframe schema

    :param schema: the dataframe schema this partition spec operates on
    :return: SchemaedDataPartitioner object
    """
    pos = [schema.index_of_key(key) for key in self.partition_by]
    return SchemaedDataPartitioner(
        schema.pa_schema,
        pos,
        sizer=None,
        row_limit=self._row_limit,
        size_limit=self._size_limit,
    )
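# A hedged usage sketch; the tuple shape below assumes triad's
# SchemaedDataPartitioner.partition yields (partition_no, slice_no, rows):
def _example_get_partitioner() -> None:
    spec = PartitionSpec(by=["b"])
    partitioner = spec.get_partitioner(Schema("a:int,b:int"))
    rows = [[0, 1], [1, 1], [2, 2]]  # grouped by the key column `b`
    for partition_no, slice_no, chunk in partitioner.partition(rows):
        print(partition_no, slice_no, list(chunk))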
def test_schema_setter():
    a = Schema("a:int,b:str")
    with raises(NoneArgumentError):
        a["c"] = None  # None is invalid
    with raises(SchemaError):
        a["b"] = "str"  # update is not allowed
    with raises(SchemaError):
        a["123"] = "int"  # column name is invalid
    with raises(SchemaError):
        a["x"] = pa.field("y", pa.int32())  # key != field.name
    with raises(SchemaError):
        a["y"] = pa.large_binary()  # unsupported type
    a["c"] = str
    a["d"] = pa.field("d", pa.int32())
    assert a == "a:int,b:str,c:str,d:int"
def _enforce_type(df: pd.DataFrame, schema: Schema) -> pd.DataFrame:
    # TODO: does this have higher latency?
    for k, v in schema.items():
        s = df[k]
        if pa.types.is_string(v.type):
            ns = s.isnull()
            s = s.astype(str)
            s[ns] = None
        elif pa.types.is_integer(v.type) or pa.types.is_boolean(v.type):
            ns = s.isnull()
            s = s.fillna(0).astype(v.type.to_pandas_dtype())
            s[ns] = None
        elif not pa.types.is_struct(v.type) and not pa.types.is_list(v.type):
            s = s.astype(v.type.to_pandas_dtype())
        df[k] = s
    return df
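# A hedged behavior sketch (hypothetical data): nulls survive the int cast
# because the column is filled, cast, then re-nulled:
def _example_enforce_type() -> None:
    df = pd.DataFrame({"a": [1.0, None], "b": ["x", None]})
    out = _enforce_type(df, Schema("a:int,b:str"))
    assert out["a"].iloc[0] == 1 and pd.isna(out["a"].iloc[1])
    assert out["b"].iloc[0] == "x" and out["b"].iloc[1] is None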
def test_partition_cursor():
    p = PartitionSpec(dict(partition_by=["b", "a"]))
    s = Schema("a:int,b:int,c:int,d:int")
    c = p.get_cursor(s, 2)
    pt = p.get_partitioner(s)  # this part is well covered in the spark section
    assert c.row_schema == s
    assert c.key_schema == "b:int,a:int"
    c.set([1, 2, 2, 2], 5, 6)
    assert [2, 1] == c.key_value_array
    assert dict(a=1, b=2) == c.key_value_dict
    assert 2 == c["c"]
    assert [1, 2, 2, 2] == c.row
    assert 5 == c.partition_no
    assert 2 == c.physical_partition_no
    assert 6 == c.slice_no