def _serialize_by_partition(
    self,
    df: DataFrame,
    partition_spec: PartitionSpec,
    df_name: str,
    temp_path: Optional[str] = None,
    to_file_threshold: Any = -1,
    has_name: bool = False,
) -> DataFrame:
    """Map ``df`` through a ``_PartitionSerializer``, one call per partition.

    Partition keys and presort columns that are not present in
    ``df.schema`` are dropped before the partition spec is rebuilt.

    :param df: dataframe to process
    :param partition_spec: requested partitioning; only the keys and
        presort columns found in ``df.schema`` are kept
    :param df_name: name of the dataframe, used to derive the output
        column name and as the key in the metadata dictionaries
    :param temp_path: forwarded unchanged to ``_PartitionSerializer``
    :param to_file_threshold: normalized by ``_get_file_threshold`` and
        then forwarded to ``_PartitionSerializer``
    :param has_name: stored in the metadata as ``serialized_has_name``
    :return: the result of ``self.map``; its schema is the partition key
        schema (if any keys remain) followed by one ``str`` column
    """
    to_file_threshold = _get_file_threshold(to_file_threshold)
    keys = [k for k in partition_spec.partition_by if k in df.schema]
    presort = [
        item for item in partition_spec.presort.items() if item[0] in df.schema
    ]
    col_name = _df_name_to_serialize_col(df_name)
    if keys:
        partition_spec = PartitionSpec(partition_spec, by=keys, presort=presort)
        output_schema = partition_spec.get_key_schema(df.schema) + f"{col_name}:str"
    else:
        # No usable partition key: force a single partition, so the output
        # carries only the serialized column.
        partition_spec = PartitionSpec(
            partition_spec, num=1, by=[], presort=presort
        )
        output_schema = Schema(f"{col_name}:str")
    serializer = _PartitionSerializer(output_schema, temp_path, to_file_threshold)
    metadata = {
        "serialized": True,
        "serialized_cols": {df_name: col_name},
        "schemas": {df_name: str(df.schema)},
        "serialized_has_name": has_name,
    }
    return self.map(df, serializer.run, output_schema, partition_spec, metadata)
def test_partition_spec():
    """Exercise ``PartitionSpec`` construction, validation and derivation.

    Covers default values, construction from ``None`` / another spec /
    a JSON string / a dict / keyword arguments, rejected inputs, the
    ``get_sorts`` and ``get_key_schema`` helpers, and overriding fields
    when building a spec from an existing one.
    """
    # defaults
    p = PartitionSpec()
    assert [] == p.partition_by
    # These three comparisons were previously bare expressions (missing
    # ``assert``), so they never actually checked anything.
    assert "0" == p.num_partitions
    assert {} == p.presort
    assert "hash" == p.algo
    assert p.empty
    p = PartitionSpec(None)
    assert p.empty
    p2 = PartitionSpec(p)
    assert p2.empty

    # from a JSON string
    p = PartitionSpec(
        json.dumps(dict(partition_by=["a", "b", "c"], num_partitions=1))
    )
    assert ["a", "b", "c"] == p.partition_by
    assert "1" == p.num_partitions
    assert {} == p.presort
    assert "hash" == p.algo
    assert not p.empty

    # from a dict using the short key ``by``
    p = PartitionSpec(dict(by=["a", "b", "c"], presort="d asc,e desc"))
    assert ["a", "b", "c"] == p.partition_by
    assert "0" == p.num_partitions
    assert dict(d=True, e=False) == p.presort
    assert "hash" == p.algo
    assert not p.empty

    # from keyword arguments; algo is expected to be lower-cased
    p = PartitionSpec(by=["a", "b", "c"], num=5, presort="d,e desc", algo="EvEN")
    assert ["a", "b", "c"] == p.partition_by
    assert "5" == p.num_partitions
    assert dict(d=True, e=False) == p.presort
    assert "even" == p.algo
    assert not p.empty

    # copying a fully specified spec keeps its json representation
    p = PartitionSpec(
        partition_by=["a", "b", "c"],
        presort="d,e desc",
        algo="EvEN",
        num_partitions="ROWCOUNT*3",
        row_limit=4,
        size_limit="5k",
    )
    p2 = PartitionSpec(p)
    assert p2.jsondict == p.jsondict
    assert "d ASC,e DESC" == p2.presort_expr
    assert not p.empty
    assert not p2.empty

    # partition by overlaps with presort
    raises(
        SyntaxError,
        lambda: PartitionSpec(
            partition_by=["a", "b", "c"], presort="a asc,e desc", algo="EvEN"
        ),
    )
    # partition by has dups
    raises(SyntaxError, lambda: PartitionSpec(partition_by=["a", "b", "b"]))
    # bad input
    raises(TypeError, lambda: PartitionSpec(1))
    # bad presort
    raises(SyntaxError, lambda: PartitionSpec(presort="a xsc,e desc"))
    raises(SyntaxError, lambda: PartitionSpec(presort="a asc,a desc"))
    raises(SyntaxError, lambda: PartitionSpec(presort="a b asc,a desc"))

    # derived sorts and key schema
    p = PartitionSpec(dict(partition_by=["a"], presort="d asc,e desc"))
    assert dict(a=True, d=True, e=False) == p.get_sorts(
        Schema("a:int,b:int,d:int,e:int")
    )
    p = PartitionSpec(dict(partition_by=["e", "a"], presort="d asc"))
    assert p.get_key_schema(Schema("a:int,b:int,d:int,e:int")) == "e:int,a:int"

    # modification: overriding fields must not mutate the source spec
    a = PartitionSpec(by=["a", "b"])
    b = PartitionSpec(a, by=["a"], num=2)
    assert ["a", "b"] == a.partition_by
    assert '0' == a.num_partitions
    assert ["a"] == b.partition_by
    assert '2' == b.num_partitions

    # presort compares as order-sensitive; it can be overridden from
    # another spec's presort or from a list of (column, ascending) pairs
    a = PartitionSpec(by=["a"], presort="b DESC, c")
    b = PartitionSpec(by=["a"], presort="c,b DESC")
    assert a.presort != b.presort
    c = PartitionSpec(b, presort=a.presort)
    assert a.presort == c.presort
    c = PartitionSpec(b, presort=[("b", False), ("c", True)])
    assert a.presort == c.presort