def repartition(self, df: DataFrame, partition_spec: PartitionSpec) -> DataFrame:
    """Repartition ``df`` according to ``partition_spec``.

    The input is first converted with ``self.to_df``. The partition count is
    obtained from ``partition_spec.get_num_partitions``, which is handed a
    callable (under ``KEYWORD_ROWCOUNT``) that persists the dataframe and
    counts its rows. The repartition routine matching ``partition_spec.algo``
    is then applied to the native dataframe; for ``even`` the dataframe is
    persisted beforehand. If the spec produces any sort keys, each partition
    is sorted by them.

    :param df: dataframe to repartition
    :param partition_spec: partition keys, partition count, algorithm and
        sort specification
    :raises NotImplementedError: if ``partition_spec.algo`` is not ``hash``,
        ``rand`` or ``even``
    :return: the repartitioned dataframe, wrapped with the schema and
        metadata of the converted input
    """

    def _count_rows() -> int:
        # ``df`` is looked up when this is called, i.e. after the
        # ``self.to_df`` conversion below has been applied.
        return self.persist(df).count()

    df = self.to_df(df)
    num = partition_spec.get_num_partitions(**{KEYWORD_ROWCOUNT: _count_rows})

    repartitioners = {
        "hash": hash_repartition,
        "rand": rand_repartition,
        "even": even_repartition,
    }
    algo = partition_spec.algo
    if algo not in repartitioners:  # pragma: no cover
        raise NotImplementedError(partition_spec.algo + " is not supported")
    if algo == "even":
        # Only the even algorithm persists the dataframe before repartitioning.
        df = self.persist(df)
    sdf = repartitioners[algo](
        self.spark_session, df.native, num, partition_spec.partition_by
    )

    sort_spec = partition_spec.get_sorts(df.schema)
    if len(sort_spec) > 0:
        sort_columns = sort_spec.keys()
        ascending_flags = list(sort_spec.values())
        sdf = sdf.sortWithinPartitions(*sort_columns, ascending=ascending_flags)
    return self.to_df(sdf, df.schema, df.metadata)
def test_partition_spec():
    """Exercise ``PartitionSpec`` construction, validation and derivation.

    Covers: defaults of an empty spec, construction from ``None``, another
    spec, a JSON string, a dict and keyword arguments (including the ``by``
    and ``num`` aliases), rejection of invalid input, ``get_sorts`` /
    ``get_key_schema``, and building a modified spec from an existing one.
    """
    # Defaults of an empty spec. Every comparison must be asserted; a bare
    # ``x == y`` expression statement is evaluated and silently discarded.
    p = PartitionSpec()
    assert [] == p.partition_by
    assert "0" == p.num_partitions
    assert {} == p.presort
    assert "hash" == p.algo
    assert p.empty

    p = PartitionSpec(None)
    assert p.empty
    p2 = PartitionSpec(p)
    assert p2.empty

    # From a JSON string
    p = PartitionSpec(json.dumps(dict(partition_by=["a", "b", "c"], num_partitions=1)))
    assert ["a", "b", "c"] == p.partition_by
    assert "1" == p.num_partitions
    assert {} == p.presort
    assert "hash" == p.algo
    assert not p.empty

    # From a dict, using the ``by`` alias
    p = PartitionSpec(dict(by=["a", "b", "c"], presort="d asc,e desc"))
    assert ["a", "b", "c"] == p.partition_by
    assert "0" == p.num_partitions
    assert dict(d=True, e=False) == p.presort
    assert "hash" == p.algo
    assert not p.empty

    # From keyword arguments; the algo is expected to come back lower-cased
    p = PartitionSpec(by=["a", "b", "c"], num=5, presort="d,e desc", algo="EvEN")
    assert ["a", "b", "c"] == p.partition_by
    assert "5" == p.num_partitions
    assert dict(d=True, e=False) == p.presort
    assert "even" == p.algo
    assert not p.empty

    # Copying a fully populated spec
    p = PartitionSpec(
        partition_by=["a", "b", "c"],
        presort="d,e desc",
        algo="EvEN",
        num_partitions="ROWCOUNT*3",
        row_limit=4,
        size_limit="5k",
    )
    p2 = PartitionSpec(p)
    assert p2.jsondict == p.jsondict
    assert "d ASC,e DESC" == p2.presort_expr
    assert not p.empty
    assert not p2.empty

    # partition by overlaps with presort
    raises(
        SyntaxError,
        lambda: PartitionSpec(
            partition_by=["a", "b", "c"], presort="a asc,e desc", algo="EvEN"
        ),
    )
    # partition by has dups
    raises(SyntaxError, lambda: PartitionSpec(partition_by=["a", "b", "b"]))
    # bad input
    raises(TypeError, lambda: PartitionSpec(1))
    # bad presort
    raises(SyntaxError, lambda: PartitionSpec(presort="a xsc,e desc"))
    raises(SyntaxError, lambda: PartitionSpec(presort="a asc,a desc"))
    raises(SyntaxError, lambda: PartitionSpec(presort="a b asc,a desc"))

    # Derived sorts and key schema
    p = PartitionSpec(dict(partition_by=["a"], presort="d asc,e desc"))
    assert dict(a=True, d=True, e=False) == p.get_sorts(
        Schema("a:int,b:int,d:int,e:int")
    )
    p = PartitionSpec(dict(partition_by=["e", "a"], presort="d asc"))
    assert p.get_key_schema(Schema("a:int,b:int,d:int,e:int")) == "e:int,a:int"

    # modification: overriding fields must not alter the source spec
    a = PartitionSpec(by=["a", "b"])
    b = PartitionSpec(a, by=["a"], num=2)
    assert ["a", "b"] == a.partition_by
    assert "0" == a.num_partitions
    assert ["a"] == b.partition_by
    assert "2" == b.num_partitions

    # presort order matters, and can be overridden by a dict or a pair list
    a = PartitionSpec(by=["a"], presort="b DESC, c")
    b = PartitionSpec(by=["a"], presort="c,b DESC")
    assert a.presort != b.presort
    c = PartitionSpec(b, presort=a.presort)
    assert a.presort == c.presort
    c = PartitionSpec(b, presort=[("b", False), ("c", True)])
    assert a.presort == c.presort