Example #1
def _convert_pyarrow_to_avro_schema(pdf: pd.DataFrame,
                                    columns: Any = None) -> Dict:
    """
    pyarrow schema:
    'station: str , time: long, temp: int'

    avro schema:
    {
    'type': 'record',
    'name': 'Root',
    'fields': [
        {'name': 'a', 'type': 'string'},
        {'name': 'b', 'type': 'int'},
        {'name': 'c', 'type': 'long'},
    ],
    }

    """
    infer_fields = Schema(columns)
    # infer_fields.pandas_dtype() -> {column_name: np.dtype, ...}
    inferred_fields = [
        {"name": k, "type": v} for k, v in infer_fields.pandas_dtype().items()
    ]

    for field in inferred_fields:
        if "complex" in field["type"]:
            field["type"] = [
                "null",
                pdx.__complex_field_infer(pdf, field["name"], {})
            ]

    schema = {"type": "record", "name": "Root", "fields": inferred_fields}

    return schema
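
A minimal call-shape sketch (hypothetical data; the function is module-private, so this only illustrates how the columns argument, a schema expression like the one in the docstring, is passed):

    import pandas as pd

    pdf = pd.DataFrame({"station": ["s1"], "time": [1], "temp": [20]})
    avro_schema = _convert_pyarrow_to_avro_schema(
        pdf, "station:str,time:long,temp:int"
    )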
Example #2
 def _get_altered_schema(self, subschema: Any) -> Schema:
     sub = Schema(subschema)
     assert_or_throw(
         sub.names in self.schema,
         lambda: FugueDataFrameOperationError(
             f"{sub.names} are not all in {self.schema}"
         ),
     )
     for k, v in sub.items():
         old_type = self.schema[k].type
         new_type = v.type
         if not old_type.equals(new_type):
             assert_or_throw(
                 not pa.types.is_struct(old_type)
                 and not pa.types.is_list(old_type)
                 and not pa.types.is_binary(old_type),
                 lambda: NotImplementedError(f"can't convert from {old_type}"),
             )
             assert_or_throw(
                 not pa.types.is_struct(new_type)
                 and not pa.types.is_list(new_type)
                 and not pa.types.is_binary(new_type),
                 lambda: NotImplementedError(f"can't convert to {new_type}"),
             )
     return Schema([(k, sub.get(k, v)) for k, v in self.schema.items()])
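
A small illustration of the merge on the last line (hypothetical schemas):

    # assuming self.schema == Schema("a:long,b:str"):
    self._get_altered_schema("a:str")  # -> Schema("a:str,b:str"), order preserved
    # a struct/list/binary source or target type would raise NotImplementedError
    # per the assertions above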
Example #3
def _load_csv(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    kw = dict(kwargs)
    header = kw.pop("header", False)
    if str(header) in ["True", "0"]:
        pdf = pd.read_csv(p.uri, **{"index_col": False, "header": 0, **kw})
        if columns is None:
            return pdf, None
        if isinstance(columns, list):  # column names
            return pdf[columns], None
        schema = Schema(columns)
        return pdf[schema.names], schema
    if header is None or str(header) == "False":
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # column names
            pdf = pd.read_csv(
                p.uri, **{"index_col": False, "header": None, "names": columns, **kw}
            )
            return pdf, None
        schema = Schema(columns)
        pdf = pd.read_csv(
            p.uri, **{"index_col": False, "header": None, "names": schema.names, **kw}
        )
        return pdf, schema
    else:
        raise NotImplementedError(f"{header} is not supported")
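
A call-shape sketch for the two supported header modes (the FileParser construction here is hypothetical; in Fugue it normally comes from the load machinery):

    # file with a header row: columns are projected and their schema is returned
    pdf, schema = _load_csv(FileParser("/tmp/data.csv"), "a:int,b:str", header=True)
    # headerless file: the schema expression also supplies the column names
    pdf, schema = _load_csv(FileParser("/tmp/data.csv"), "a:int,b:str", header=False)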
Example #4
 def __init__(self, schema: Schema, spec: PartitionSpec, physical_partition_no: int):
     self._orig_schema = schema
     self._key_index = [schema.index_of_key(key) for key in spec.partition_by]
     self._schema = schema.extract(spec.partition_by)
     self._physical_partition_no = physical_partition_no
     # the following will be set by the framework
     self._row: List[Any] = []
     self._partition_no = 0
     self._slice_no = 0
Example #5
        def test_init_basic(self):
            raises(FugueDataFrameInitError, lambda: self.df())
            raises(FugueDataFrameInitError, lambda: self.df([]))
            raises(FugueDataFrameInitError, lambda: self.df([[]], Schema()))
            raises(FugueDataFrameInitError, lambda: self.df([[1]], Schema()))
            # raises(SchemaError, lambda: self.df([[1]]))  # schema can be inferred

            df = self.df([], "a:str,b:int")
            assert df.empty
Example #6
def test_schema_update_delete():
    s = Schema("a:int,b:str,c:int")
    with raises(SchemaError):
        del s["a"]
    with raises(SchemaError):
        del s["x"]
    with raises(SchemaError):
        s["a"] = str
    raises(SchemaError, lambda: s.pop("a"))
    raises(SchemaError, lambda: s.popitem("a"))
    raises(SchemaError, lambda: s.update(dict(a=int)))
Example #7
 def _parse_schema(self, obj: Any, dfs: DataFrames) -> Schema:
     if callable(obj):
         return obj(dfs, **self.params)
     if isinstance(obj, str):
         return Schema(obj)
     if isinstance(obj, List):
         s = Schema()
         for x in obj:
             s += self._parse_schema(x, dfs)
         return s
     return Schema(obj)
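
A quick sketch of the accepted forms (self and dfs are hypothetical stand-ins for the extension instance and its input DataFrames):

    self._parse_schema("a:int,b:str", dfs)                       # schema expression
    self._parse_schema(["a:int", "b:str"], dfs)                  # list: parsed and concatenated
    self._parse_schema(lambda dfs, **kw: Schema("a:int"), dfs)   # callable receives dfs and params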
Example #8
def test_schema_eq():
    s = Schema("a:int,b:str")
    assert s != None
    assert not (s == None)
    assert s == s
    assert s == Schema("a:int,b:str")
    assert not (s == Schema("b:str,a:int"))
    assert s == ["a:int", "b:str"]
    assert s != ["a:long", "b:str"]
    assert not (s == ["a:long", "b:str"])
    assert s == [("a", "int"), ("b", str)]
    assert s == OrderedDict([("a", "int"), ("b", str)])
Example #9
def _test_as_array_perf():
    s = Schema()
    arr = []
    for i in range(100):
        s.append(f"a{i}:int")
        arr.append(i)
    for i in range(100):
        s.append(f"b{i}:int")
        arr.append(float(i))
    for i in range(100):
        s.append(f"c{i}:str")
        arr.append(str(i))
    data = []
    for i in range(5000):
        data.append(list(arr))
    df = SparkDataFrame(data, s)
    res = df.as_array()
    res = df.as_array(type_safe=True)
    nts, ts = 0.0, 0.0
    for i in range(10):
        t = datetime.now()
        res = df.as_array()
        nts += (datetime.now() - t).total_seconds()
        t = datetime.now()
        res = df.as_array(type_safe=True)
        ts += (datetime.now() - t).total_seconds()
    print(nts, ts)
Example #10
def test_alter_columns():
    dag = FugueWorkflow()
    a = dag.create(mock_create1)
    a.alter_columns(Schema("a:str,b:str"))
    a.alter_columns(Schema("a:float,b:double"))

    assert_eq(
        """
    a=create using mock_create1
    alter columns a:str, b:str
    alter columns a:float, b:double from a
    """,
        dag,
    )
Example #11
def _load_parquet(p: FileParser,
                  columns: Any = None,
                  **kwargs: Any) -> Tuple[dd.DataFrame, Any]:
    if columns is None:
        pdf = dd.read_parquet(p.uri, **kwargs)
        schema = Schema(pdf.head(1))
        return pdf, schema
    if isinstance(columns, list):  # column names
        pdf = dd.read_parquet(p.uri, columns=columns, **kwargs)
        schema = Schema(pdf.head(1))
        return pdf, schema
    schema = Schema(columns)
    pdf = dd.read_parquet(p.uri, columns=schema.names, **kwargs)
    return pdf, schema
Example #12
    def get_key_schema(self, schema: Schema) -> Schema:
        """Get partition keys schema

        :param schema: the dataframe schema this partition spec to operate on
        :return: the sub-schema only containing partition keys
        """
        return schema.extract(self.partition_by)
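
A small example using the PartitionSpec construction shown in Example #30 (the key order follows partition_by, as that test's key_schema assertion shows):

    spec = PartitionSpec(dict(partition_by=["b", "a"]))
    spec.get_key_schema(Schema("a:int,b:int,c:int"))  # -> Schema("b:int,a:int")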
Example #13
def _load_avro(p: FileParser,
               columns: Any = None,
               **kwargs: Any) -> Tuple[pd.DataFrame, Any]:

    kw = ParamDict(kwargs)
    process_record = None
    if "process_record" in kw:
        process_record = kw["process_record"]
        del kw["process_record"]

    with open(p.uri, "rb") as fp:
        # Configure Avro reader
        avro_reader = reader(fp)
        # Load records into memory, applying the optional per-record hook
        if process_record is not None:
            records = [process_record(r) for r in avro_reader]
        else:
            records = list(avro_reader)

        # Populate pandas.DataFrame with records
        pdf = pd.DataFrame.from_records(records)

    if columns is None:
        return pdf, None
    if isinstance(columns, list):  # column names
        return pdf[columns], None
    schema = Schema(columns)

    # Return created DataFrame
    return pdf[schema.names], schema
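
The reader used above matches fastavro's record iterator (an assumption based on the call shape); a hypothetical call could look like:

    from fastavro import reader

    pdf, schema = _load_avro(FileParser("/tmp/data.avro"), "station:str,temp:int")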
Example #14
 def assert_eq(expr, expected=None):
     sql = FugueSQL(expr, "fugueSchema", ignore_case=True)
     v = _VisitorBase(sql)
     obj = v.visit(sql.tree)
     if expected is None:
         expected = expr
     assert Schema(expected) == obj
Example #15
def test__to_transformer_determinism():
    a = _to_transformer(t1, None)
    b = _to_transformer(t1, None)
    c = _to_transformer("t1", None)
    assert a is not b
    assert to_uuid(a) == to_uuid(b)
    assert a is not c
    assert to_uuid(a) == to_uuid(c)

    a = _to_transformer(t4, "*,b:int")
    b = _to_transformer("t4", "*,b:int")
    assert a is not b
    assert to_uuid(a) == to_uuid(b)

    a = _to_transformer(t4, "a:int,b:int")
    b = _to_transformer("t4", Schema("a:int,b:int"))
    assert a is not b
    assert to_uuid(a) == to_uuid(b)

    a = _to_transformer(MockTransformer)
    b = _to_transformer("MockTransformer")
    assert a is not b
    assert to_uuid(a) == to_uuid(b)

    a = _to_transformer(t10)
    b = _to_transformer("t10")
    assert a is not b
    assert to_uuid(a) == to_uuid(b)
Example #16
def _test_convert_nested(orig, expected_type, expected_value):
    a = [[orig]]
    s = Schema("a:" + expected_type).pa_schema
    x = list(apply_schema(s, a, deep=True))[0]
    y = list(apply_schema(s, a, copy=False, deep=True))[0]
    for b in [x, y]:
        assert expected_value == b[0]
    assert x is not a[0]
    assert y is a[0]
Example #17
def _load_json(p: FileParser,
               columns: Any = None,
               **kwargs: Any) -> Tuple[pd.DataFrame, Any]:
    pdf = _safe_load_json(p.uri, **kwargs).reset_index(drop=True)
    if columns is None:
        return pdf, None
    if isinstance(columns, list):  # column names
        return pdf[columns], None
    schema = Schema(columns)
    return pdf[schema.names], schema
Example #18
def test_schema_properties():
    s = Schema("a:int,b:str")
    assert ["a", "b"] == s.names
    assert [pa.int32(), pa.string()] == s.types
    assert [pa.field("a", pa.int32()), pa.field("b", pa.string())] == s.fields
    assert (pa.schema([pa.field("a", pa.int32()),
                       pa.field("b", pa.string())]) == s.pyarrow_schema)
    assert s.pyarrow_schema == s.pyarrow_schema
    assert dict(a=np.int32, b=np.dtype(str)) == s.pd_dtype
    assert s.pandas_dtype == s.pd_dtype
Example #19
 def transform(self, df: DataFrame, tf: Transformer) -> DataFrame:
     tf._key_schema = self.partition_spec.get_key_schema(df.schema)  # type: ignore
     tf._output_schema = Schema(tf.get_output_schema(df))  # type: ignore
     tr = _TransformerRunner(df, tf, self._ignore_errors)  # type: ignore
     return self.execution_engine.map(
         df=df,
         map_func=tr.run,
         output_schema=tf.output_schema,  # type: ignore
         partition_spec=tf.partition_spec,
         on_init=tr.on_init,
     )
Example #20
 def _apply_schema(
     self, pdf: pd.DataFrame, schema: Optional[Schema], type_safe: bool = True
 ) -> Tuple[pd.DataFrame, Schema]:
     if not type_safe:
         assert_arg_not_none(pdf, "pdf")
         assert_arg_not_none(schema, "schema")
         return pdf, schema
     DASK_UTILS.ensure_compatible(pdf)
     if pdf.columns.dtype == "object":  # pdf has named schema
         pschema = Schema(DASK_UTILS.to_schema(pdf))
         if schema is None or pschema == schema:
             return pdf, pschema.assert_not_empty()
         pdf = pdf[schema.assert_not_empty().names]
     else:  # pdf has no named schema
         schema = _input_schema(schema).assert_not_empty()
         assert_or_throw(
             pdf.shape[1] == len(schema),
             ValueError(f"Pandas datafame column count doesn't match {schema}"),
         )
         pdf.columns = schema.names
     return DASK_UTILS.enforce_type(pdf, schema.pa_schema, null_safe=True), schema
Example #21
def test_transform():
    w = (FugueWorkflow().df([[0], [1]], "a:int",
                            data_determiner=to_uuid).transform(
                                mock_transformer,
                                schema=Schema("a:int"),
                                params=dict(n=2)))
    assert_eq(
        """
    create [[0],[1]] schema a:int
    transform using mock_transformer(n=2) schema a:int
    """,
        w.workflow,
    )

    w = (FugueWorkflow().df([[0], [1]], "a:int",
                            data_determiner=to_uuid).partition(
                                by=["a"], presort="b DESC",
                                num="ROWCOUNT/2").transform(mock_transformer,
                                                            schema="*",
                                                            params=dict(n=2)))
    assert_eq(
        """
    create [[0],[1]] schema a:int

    transform
        prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2) schema *
    """,
        w.workflow,
    )

    def _func(a: int, b: int) -> int:
        return a + b

    w = (FugueWorkflow().df([[0], [1]], "a:int",
                            data_determiner=to_uuid).partition(
                                by=["a"], presort="b DESC",
                                num="ROWCOUNT/2").transform(mock_transformer,
                                                            schema="*",
                                                            params=dict(n=2),
                                                            callback=_func))
    assert_eq(
        """
    create [[0],[1]] schema a:int

    transform
        prepartition ROWCOUNT / 2 by a presort b desc
        using mock_transformer(n=2) schema *
        callback _func
    """,
        w.workflow,
    )
Example #22
def test_schema_contains():
    s = Schema("a:int,b:str")
    assert None not in s
    assert s in s
    assert "a" in s
    assert "c" not in s
    assert "a:int" in s
    assert "a:long" not in s
    assert pa.field("a", pa.int32()) in s
    assert pa.field("aa", pa.int32()) not in s
    assert pa.field("a", pa.int64()) not in s
    assert ["a", ("b", str)] in s
    assert ["a", ("b", int)] not in s
Example #23
def _load_parquet(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    if columns is None:
        pdf = pd.read_parquet(p.uri, **{"engine": "pyarrow", **kwargs})
        return pdf, None
    if isinstance(columns, list):  # column names
        pdf = pd.read_parquet(
            p.uri, columns=columns, **{"engine": "pyarrow", **kwargs}
        )
        return pdf, None
    schema = Schema(columns)
    pdf = pd.read_parquet(
        p.uri, columns=schema.names, **{"engine": "pyarrow", **kwargs}
    )
    return pdf, schema
Example #24
def test_schema_operators():
    s = Schema("a:int,b:str,c:int")
    s += "d:int"
    t = s + "e:int"
    t += ""
    assert s == "a:int,b:str,c:int,d:int"
    assert t == "a:int,b:str,c:int,d:int,e:int"
    t = s - ""
    assert t == s
    t = s - ["a", "c"]
    assert t == "b:str,d:int"
    with raises(SchemaError):
        t -= "a"
    assert t == "b:str,d:int"
Example #25
def _test_convert(orig, expected_type, expected_value):
    a = [[orig]]
    s = Schema("a:" + expected_type).pa_schema
    x = list(apply_schema(s, a))[0]
    y = list(apply_schema(s, a, copy=False))[0]
    for b in [x, y]:
        if isinstance(expected_value, float) and math.isnan(expected_value):
            assert math.isnan(b[0])
        elif expected_value is pd.NaT:
            assert b[0] is pd.NaT
        else:
            assert expected_value == b[0]
    assert x is not a[0]
    assert y is a[0]
Example #26
def test_schema_rename():
    s = Schema("a:int,b:str,c:bool").rename(columns=dict(a="c", c="a"))
    assert s == "c:int,b:str,a:bool"
    s = Schema("a:int,b:str,c:bool").rename(columns=dict(a="c", c="a"),
                                            ignore_missing=True)
    assert s == "c:int,b:str,a:bool"
    raises(SchemaError, lambda: s.rename(dict(x="b")))
    raises(SchemaError, lambda: s.rename(dict(a="b")))
    raises(SchemaError, lambda: s.rename(dict(a=123)))
Example #27
    def get_partitioner(self, schema: Schema) -> SchemaedDataPartitioner:
        """Get :class:`~triad.utils.pyarrow.SchemaedDataPartitioner` by input
        dataframe schema

        :param schema: the dataframe schema this partition spec to operate on
        :return: SchemaedDataPartitioner object
        """
        pos = [schema.index_of_key(key) for key in self.partition_by]
        return SchemaedDataPartitioner(
            schema.pa_schema,
            pos,
            sizer=None,
            row_limit=self._row_limit,
            size_limit=self._size_limit,
        )
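
A minimal usage sketch, mirroring Example #30's spec and schema:

    spec = PartitionSpec(dict(partition_by=["b", "a"]))
    partitioner = spec.get_partitioner(Schema("a:int,b:int,c:int,d:int"))
    # the key positions resolved against the schema are [1, 0]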
Example #28
def test_schema_setter():
    a = Schema("a:int,b:str")
    with raises(NoneArgumentError):
        a["c"] = None  # None is invalid
    with raises(SchemaError):
        a["b"] = "str"  # Update is not allowed
    with raises(SchemaError):
        a["123"] = "int"  # Col name is invalid
    with raises(SchemaError):
        a["x"] = pa.field("y", pa.int32())  # key!=field.name
    with raises(SchemaError):
        a["y"] = pa.large_binary()  # unsupported types
    a["c"] = str
    a["d"] = pa.field("d", pa.int32())
    assert a == "a:int,b:str,c:str,d:int"
Example #29
def _enforce_type(df: pd.DataFrame, schema: Schema) -> pd.DataFrame:
    # TODO: does this have higher latency?
    for k, v in schema.items():
        s = df[k]
        if pa.types.is_string(v.type):
            ns = s.isnull()
            s = s.astype(str)
            s[ns] = None
        elif pa.types.is_integer(v.type) or pa.types.is_boolean(v.type):
            ns = s.isnull()
            s = s.fillna(0).astype(v.type.to_pandas_dtype())
            s[ns] = None
        elif not pa.types.is_struct(v.type) and not pa.types.is_list(v.type):
            s = s.astype(v.type.to_pandas_dtype())
        df[k] = s
    return df
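
A small sketch of the null handling above (hypothetical data):

    df = pd.DataFrame({"a": [1.0, None], "b": ["x", None]})
    _enforce_type(df, Schema("a:int,b:str"))
    # "a" is cast through fillna(0).astype(...), "b" through astype(str);
    # in both cases the original null positions are written back afterwards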
Example #30
def test_partition_cursor():
    p = PartitionSpec(dict(partition_by=["b", "a"]))
    s = Schema("a:int,b:int,c:int,d:int")
    c = p.get_cursor(s, 2)
    pt = p.get_partitioner(s)  # this part is well covered in spark section
    assert c.row_schema == s
    assert c.key_schema == "b:int,a:int"

    c.set([1, 2, 2, 2], 5, 6)
    assert [2, 1] == c.key_value_array
    assert dict(a=1, b=2) == c.key_value_dict
    assert 2 == c["c"]
    assert [1, 2, 2, 2] == c.row
    assert 5 == c.partition_no
    assert 2 == c.physical_partition_no
    assert 6 == c.slice_no