示例#1
0
    def append(self, obj: Any) -> "Schema":  # noqa: C901
        """Append a schema like object to the current schema, in place.
        Only new columns are allowed.

        The handling depends on the type of ``obj``:

        * ``None``: nothing is appended
        * ``pa.Field``: assigned under its name with its type
        * ``str``: parsed by ``expression_to_schema`` and then appended
        * ``Dict``: every key/value pair is assigned in iteration order
        * ``pa.Schema``: appended directly
        * ``pd.DataFrame``: converted by ``PD_UTILS.to_schema`` and appended
        * ``Tuple``: item 0 is used as the key and item 1 as the value
        * ``List``: every item is appended recursively

        :param obj: the schema like object to append
        :raises SchemaError: if a column exists or is invalid or obj is not
            convertible. Any other exception raised on the way is converted
            to ``SchemaError`` with the original exception as its cause
        :return: the Schema object itself
        """
        try:
            if obj is None:
                return self
            elif isinstance(obj, pa.Field):
                self[obj.name] = obj.type
            elif isinstance(obj, str):
                self._append_pa_schema(expression_to_schema(obj))
            elif isinstance(obj, Dict):
                for k, v in obj.items():
                    self[k] = v
            elif isinstance(obj, pa.Schema):
                self._append_pa_schema(obj)
            elif isinstance(obj, pd.DataFrame):
                self._append_pa_schema(PD_UTILS.to_schema(obj))
            elif isinstance(obj, Tuple):  # type: ignore
                self[obj[0]] = obj[1]
            elif isinstance(obj, List):
                for x in obj:
                    self.append(x)
            else:
                raise SchemaError(f"Invalid schema to add {obj}")
            return self
        except SchemaError:
            # Already the documented error type, let it through untouched.
            raise
        except Exception as e:
            # Chain explicitly so the root cause stays attached to the
            # SchemaError that callers see.
            raise SchemaError(str(e)) from e
示例#2
0
def test_binary():
    """Binary cells must come back unchanged from a type safe ``as_array``."""
    expr = "a:bytes,b:bytes"
    pickled = pickle.dumps("xyz")
    rows = [[pickled, b"xy"]]
    expression_to_schema(expr)  # the expression itself must be parseable
    frame = DF(rows, expr)
    result = frame.as_array(type_safe=True)
    assert [[pickled, b"xy"]] == result
示例#3
0
def test_nested():
    """Check a type safe ``as_array`` on nested columns fed with JSON text."""
    # Disabled case, kept for reference (struct column fed with a raw dict):
    # data = [[dict(b=[30, "40"])]]
    # s = expression_to_schema("a:{a:str,b:[int]}")
    # df = DF(data, "a:{a:str,b:[int]}")
    # a = df.as_array(type_safe=True)
    # assert [[dict(a=None, b=[30, 40])]] == a

    # Each case: (schema expression, input rows, expected output rows)
    cases = [
        (
            "a:[{a:str,b:[int]}]",
            [[[json.dumps(dict(b=[30, "40"]))]]],
            [[[dict(a=None, b=[30, 40])]]],
        ),
        (
            "a:[int]",
            [[json.dumps(["1", 2])]],
            [[[1, 2]]],
        ),
    ]
    for expr, rows, expected in cases:
        expression_to_schema(expr)  # the expression itself must be parseable
        frame = DF(rows, expr)
        result = frame.as_array(type_safe=True)
        assert expected == result
示例#4
0
def test_schemas_equal():
    """Exercise ``schemas_equal`` with its order and metadata switches."""
    plain = expression_to_schema("a:int,b:int,c:int")
    twin = expression_to_schema("a:int,b:int,c:int")
    shuffled = expression_to_schema("a:int,c:int,b:int")

    # No metadata anywhere: only the column order can differ.
    assert schemas_equal(plain, plain)
    assert schemas_equal(plain, twin)
    assert not schemas_equal(plain, shuffled)
    assert schemas_equal(plain, shuffled, check_order=False)

    # Metadata on the left side only.
    tagged = plain.with_metadata({"a": "1"})
    assert schemas_equal(tagged, tagged)
    assert not schemas_equal(tagged, twin)
    assert schemas_equal(tagged, twin, check_metadata=False)
    assert not schemas_equal(tagged, shuffled)
    assert not schemas_equal(tagged, shuffled, check_order=False)
    assert not schemas_equal(tagged, shuffled, check_metadata=False)
    assert schemas_equal(
        tagged, shuffled, check_order=False, check_metadata=False
    )

    # Same metadata on both sides, different column order.
    shuffled_tagged = shuffled.with_metadata({"a": "1"})
    assert not schemas_equal(tagged, shuffled_tagged)
    assert schemas_equal(tagged, shuffled_tagged, check_order=False)
示例#5
0
def test_schemaed_data_partitioner():
    """Partition the same rows with ``row_limit`` 0, 1 and 2.

    All partitioners share the schema ``a:int,b:int,c:int`` and
    ``key_positions=[2, 0]``.
    """
    limit0, limit1, limit2 = (
        SchemaedDataPartitioner(
            schema=expression_to_schema("a:int,b:int,c:int"),
            key_positions=[2, 0],
            row_limit=limit,
        )
        for limit in (0, 1, 2)
    )
    rows = [[0, 0, 0], [0, 1, 0], [0, 2, 0], [1, 0, 0]]
    expectations = [
        (limit0, "0,0,[0,1,2];1,0,[3]"),
        (limit1, "0,0,[0];0,1,[1];0,2,[2];1,0,[3]"),
        (limit2, "0,0,[0,1];0,1,[2];1,0,[3]"),
        (limit2, "0,0,[0,1];0,1,[2];1,0,[3]"),  # can reuse the partitioner
    ]
    for partitioner, expected in expectations:
        _test_partition(partitioner, rows, expected)
示例#6
0
 def __init__(self, *args: Any, **kwargs: Any):
     """Build a schema from positional schema like objects or from keywords.

     A single positional ``Schema``, expression string, ``pa.Schema`` or
     ``pa.Field`` takes a dedicated path; every other input is routed
     through ``append``.

     :param args: schema like objects, can't be combined with ``kwargs``
     :param kwargs: handed to ``append`` as one dict
     :raises SchemaError: if both ``args`` and ``kwargs`` are provided
     """
     if args and kwargs:
         raise SchemaError("Can't set both *args and **kwargs")
     if len(args) == 1:  # duplicate code for better performance
         only = args[0]
         if isinstance(only, Schema):
             super().__init__(only)  # type: ignore
             return
         raw_fields: Optional[List[pa.Field]] = None
         if isinstance(only, str):
             raw_fields = list(expression_to_schema(only))
         elif isinstance(only, pa.Schema):
             raw_fields = list(only)
         elif isinstance(only, pa.Field):
             raw_fields = [only]
         if raw_fields is not None:
             # Validate every field first, then key each one by its name.
             validated = [self._validate_field(f) for f in raw_fields]
             super().__init__([(f.name, f) for f in validated])
             return
     # Generic path: start empty and let append() interpret the input.
     super().__init__()
     if args:
         self.append(list(args))
     elif kwargs:
         self.append(kwargs)
示例#7
0
def test_expression_conversion():
    """Round trip valid expressions and reject the invalid ones."""
    # (expression,) must round trip to itself,
    # (expression, expected) must round trip to ``expected``.
    round_trips = [
        ("a:int,b:ubyte",),
        (" a : int32 , b : uint8 ", "a:int,b:ubyte"),
        ("a:[int32],b:uint8", "a:[int],b:ubyte"),
        (
            "a : { x : int32 , y : [string] } , b : [ uint8 ] ",
            "a:{x:int,y:[str]},b:[ubyte]",
        ),
        (
            "a : [{ x : int32 , y : [string] }] , b : [ uint8 ] ",
            "a:[{x:int,y:[str]}],b:[ubyte]",
        ),
        ("a:decimal(5,2)",),
        ("a:bytes,b:bytes",),
        ("a:bytes,b: binary", "a:bytes,b:bytes"),
    ]
    for case in round_trips:
        _assert_from_expr(*case)

    # Every one of these must be rejected with a SyntaxError.
    rejected = [
        "123:int",
        "int",
        "a:dummytype",
        "a:int,a:str",
        "a:int,b:{x:int,x:str}",
        "_:int",
        "__:int",
    ]
    for bad in rejected:
        raises(SyntaxError, lambda expr=bad: expression_to_schema(expr))
示例#8
0
def _assert_from_expr(expr, expected=None):
    """Assert that ``expr`` converts to a schema and back to ``expected``.

    When ``expected`` is falsy (for example omitted), the round trip must
    reproduce ``expr`` itself.
    """
    round_tripped = schema_to_expression(expression_to_schema(expr))
    wanted = expected if expected else expr
    assert wanted == round_tripped
示例#9
0
 def extract(  # noqa: C901
     self,
     obj: Any,
     ignore_key_mismatch: bool = False,
     require_type_match: bool = True,
     ignore_type_mismatch: bool = False,
 ) -> "Schema":
     """Pick columns of this schema into a new Schema.

     :param obj: what to extract. ``None`` gives an empty schema; a string
         containing ``:`` is parsed as a schema expression and any other
         string is a single column name; ``pa.Schema`` and ``Schema``
         contribute their names and types; a list is handled item by
         item; anything else is first converted with ``Schema(obj)``
     :param ignore_key_mismatch: if True, names that are not in this
         schema are skipped instead of raising
     :param require_type_match: if True, a column requested with a type is
         extracted only when that type equals the one in this schema
     :param ignore_type_mismatch: if True, columns failing the type check
         are skipped instead of raising
     :raises SchemaError: on a missing name or a type mismatch that is not
         ignored
     :return: a new Schema holding the extracted fields
     """
     if obj is None:
         return Schema()
     # The same switches are forwarded unchanged to every recursive call.
     opts = dict(
         ignore_key_mismatch=ignore_key_mismatch,
         require_type_match=require_type_match,
         ignore_type_mismatch=ignore_type_mismatch,
     )
     pairs: List[Tuple[str, pa.DataType]]
     if isinstance(obj, str):
         if ":" in obj:  # expression
             parsed = expression_to_schema(obj)
             pairs = list(zip(parsed.names, parsed.types))
         else:
             pairs = [(obj, None)]  # single key
     elif isinstance(obj, (pa.Schema, Schema)):
         pairs = list(zip(obj.names, obj.types))
     elif isinstance(obj, List):
         collected: List[pa.Field] = []
         for item in obj:
             if not isinstance(item, str) or ":" in item:
                 # Expressions and non-string items are resolved recursively.
                 collected.extend(self.extract(item, **opts).fields)
                 continue
             if item in self:
                 collected.append(self[item])
             elif not ignore_key_mismatch:
                 raise SchemaError(f"Can't extract {item} from {self}")
         return Schema(pa.schema(collected))
     else:
         return self.extract(Schema(obj), **opts)

     picked = []
     for raw_name, wanted in pairs:
         name = raw_name.strip()
         if not name:
             continue
         if name not in self:
             if not ignore_key_mismatch:
                 raise SchemaError(f"Can't extract {name} from {self}")
             continue
         field = self[name]
         # A name without a type always matches; a typed request matches
         # when type checking is off or the types are equal.
         if wanted is None or not require_type_match or field.type == wanted:
             picked.append(field)
         elif not ignore_type_mismatch:
             raise SchemaError(
                 f"Unable to extract {name}:{wanted} from {self}, type mismatch"
             )
     return Schema(pa.schema(picked))
示例#10
0
 def remove(  # noqa: C901
     self,
     obj: Any,
     ignore_key_mismatch: bool = False,
     require_type_match: bool = True,
     ignore_type_mismatch: bool = False,
 ) -> "Schema":
     """Build a new Schema without the given columns.

     :param obj: what to remove. ``None`` gives a copy of this schema; a
         string containing ``:`` is parsed as a schema expression and any
         other string is a single column name; ``pa.Schema`` and ``Schema``
         contribute their names and types; a list or set is handled item
         by item; anything else is first converted with ``Schema(obj)``
     :param ignore_key_mismatch: if True, names that are not present are
         skipped instead of raising
     :param require_type_match: if True, a column given with a type is
         removed only when that type equals the existing one
     :param ignore_type_mismatch: if True, columns failing the type check
         are kept instead of raising
     :raises SchemaError: on a missing name or a type mismatch that is not
         ignored
     :return: a new Schema with the remaining fields
     """
     if obj is None:
         return self.copy()
     # The same switches are forwarded unchanged to every recursive call.
     opts = dict(
         ignore_key_mismatch=ignore_key_mismatch,
         require_type_match=require_type_match,
         ignore_type_mismatch=ignore_type_mismatch,
     )
     target = self
     pairs: List[Tuple[str, pa.DataType]]
     if isinstance(obj, str):
         if ":" in obj:  # expression
             parsed = expression_to_schema(obj)
             pairs = list(zip(parsed.names, parsed.types))
         else:
             pairs = [(obj, None)]  # single key
     elif isinstance(obj, (pa.Schema, Schema)):
         pairs = list(zip(obj.names, obj.types))
     elif isinstance(obj, (List, Set)):
         pairs = []
         for item in obj:
             if isinstance(item, str) and ":" not in item:
                 # Plain names are dropped below, after the other items.
                 pairs.append((item, None))
             else:
                 target = target.remove(item, **opts)
     else:
         return self.remove(Schema(obj), **opts)

     remaining = OrderedDict(target)
     for raw_name, wanted in pairs:
         name = raw_name.strip()
         if not name:
             continue
         if name not in remaining:
             if not ignore_key_mismatch:
                 raise SchemaError(f"Can't remove {name} from {target}")
             continue
         # A name without a type always matches; a typed request matches
         # when type checking is off or the types are equal.
         if (
             wanted is None
             or not require_type_match
             or remaining[name].type == wanted
         ):
             del remaining[name]
         elif not ignore_type_mismatch:
             raise SchemaError(
                 f"Unable to remove {name}:{wanted} from {self}, type mismatch")
     return Schema(remaining)
示例#11
0
 def __init__(self, data, schema, enforce=False):
     """Wrap ``data`` in a pandas DataFrame described by a schema expression.

     :param data: handed to ``pd.DataFrame`` as its data
     :param schema: expression parsed by ``expression_to_schema``; the
         parsed names become the column names
     :param enforce: forwarded to ``PD_UTILS.enforce_type``
     """
     parsed = expression_to_schema(schema)
     frame = pd.DataFrame(data, columns=parsed.names)
     self.native = PD_UTILS.enforce_type(frame, parsed, enforce)
     self.schema = parsed