def test_get_num_partitions():
    """Check ``PartitionSpec.get_num_partitions`` for missing, literal and expression ``num``."""

    def _make_spec(**extra):
        # Every case partitions by the same two keys; only ``num`` varies.
        return PartitionSpec(dict(partition_by=["b", "a"], **extra))

    # No ``num`` given: the partition count is reported as 0.
    unspecified = _make_spec()
    assert 0 == unspecified.get_num_partitions()

    # An integer ``num`` is returned as-is.
    literal = _make_spec(num=123)
    assert 123 == literal.get_num_partitions()

    # A string ``num`` is an expression whose names are supplied as callables.
    expression = _make_spec(num="(x + Y) * 2")
    assert 6 == expression.get_num_partitions(x=lambda: 1, Y=lambda: 2)
    # Leaving out a name used by the expression (``Y``) must raise.
    raises(Exception, lambda: expression.get_num_partitions(x=lambda: 1))

    # The row/core count keywords can be combined with ``min`` in the expression.
    bounded = _make_spec(num="min(ROWCOUNT,CORECOUNT)")
    counters = {KEYWORD_ROWCOUNT: lambda: 100, KEYWORD_CORECOUNT: lambda: 90}
    assert 90 == bounded.get_num_partitions(**counters)
def repartition(self, df: DataFrame, partition_spec: PartitionSpec) -> DataFrame:
    """Repartition ``df`` as described by ``partition_spec``.

    The partition count comes from ``partition_spec.get_num_partitions``,
    which is handed a row-count callable under ``KEYWORD_ROWCOUNT``. The
    spec's ``algo`` then selects ``hash_repartition``, ``rand_repartition``
    or ``even_repartition``; for ``"even"`` the dataframe is persisted first.
    If the spec yields any sorts, they are applied within each partition.

    :param df: input dataframe, converted with ``self.to_df`` before use
    :param partition_spec: supplies ``algo``, ``partition_by``, the partition
        count and the per-partition sorts
    :return: ``self.to_df`` of the repartitioned native dataframe, carrying
        the schema and metadata of the converted input
    :raises NotImplementedError: if ``partition_spec.algo`` is not one of
        ``"hash"``, ``"rand"`` or ``"even"``
    """

    def _row_count() -> int:
        # Persist, then count.
        # NOTE(review): presumably persisted so the counted data is reused by
        # the repartition that follows -- confirm against ``self.persist``.
        return self.persist(df).count()

    df = self.to_df(df)
    num = partition_spec.get_num_partitions(**{KEYWORD_ROWCOUNT: _row_count})

    algo = partition_spec.algo
    if algo == "hash":
        repartitioner = hash_repartition
    elif algo == "rand":
        repartitioner = rand_repartition
    elif algo == "even":
        # Only the "even" algorithm persists the dataframe before repartitioning.
        df = self.persist(df)
        repartitioner = even_repartition
    else:  # pragma: no cover
        raise NotImplementedError(partition_spec.algo + " is not supported")
    sdf = repartitioner(
        self.spark_session, df.native, num, partition_spec.partition_by
    )

    sorts = partition_spec.get_sorts(df.schema)
    if len(sorts) > 0:
        # Keys are the sort columns; values are passed as the ``ascending`` flags.
        columns = sorts.keys()
        directions = list(sorts.values())
        sdf = sdf.sortWithinPartitions(*columns, ascending=directions)
    return self.to_df(sdf, df.schema, df.metadata)
def repartition(self, df: DataFrame, partition_spec: PartitionSpec) -> DaskDataFrame:
    """Repartition ``df`` to the partition count requested by ``partition_spec``.

    The converted dataframe is returned unchanged when the spec is empty,
    when it has any ``partition_by`` keys, or when the resolved partition
    count is not positive. Otherwise the native dataframe is repartitioned
    to that count and wrapped in a new ``DaskDataFrame``.

    :param df: input dataframe, converted with ``self.to_df`` before use
    :param partition_spec: supplies ``empty``, ``partition_by`` and the
        partition count
    :return: the converted dataframe, or a repartitioned ``DaskDataFrame``
        with the same schema and metadata
    """
    df = self.to_df(df)
    # Nothing to do for an empty spec, and keyed specs are not repartitioned here.
    if partition_spec.empty or len(partition_spec.partition_by) > 0:
        return df

    counters = {
        KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
        KEYWORD_CORECOUNT: lambda: 2,  # TODO: remove this hard code
    }
    num = partition_spec.get_num_partitions(**counters)
    if num > 0:
        repartitioned = df.native.repartition(npartitions=num)
        return DaskDataFrame(
            repartitioned,
            schema=df.schema,
            metadata=df.metadata,
            type_safe=False,
        )
    return df