Example #1
0
def test_get_num_partitions():
    """Verify how ``PartitionSpec`` resolves its ``num`` setting."""
    # No `num` specified -> defaults to 0.
    spec = PartitionSpec(dict(partition_by=["b", "a"]))
    assert spec.get_num_partitions() == 0

    # A literal number is returned as-is.
    spec = PartitionSpec(dict(partition_by=["b", "a"], num=123))
    assert spec.get_num_partitions() == 123

    # An expression is evaluated against caller-supplied value providers;
    # leaving a referenced name unresolved raises.
    spec = PartitionSpec(dict(partition_by=["b", "a"], num="(x + Y) * 2"))
    assert spec.get_num_partitions(x=lambda: 1, Y=lambda: 2) == 6
    raises(Exception, lambda: spec.get_num_partitions(x=lambda: 1))

    # The reserved keywords are supplied the same way as any other provider.
    spec = PartitionSpec(dict(partition_by=["b", "a"], num="min(ROWCOUNT,CORECOUNT)"))
    providers = {KEYWORD_ROWCOUNT: lambda: 100, KEYWORD_CORECOUNT: lambda: 90}
    assert spec.get_num_partitions(**providers) == 90
Example #2
0
    def repartition(self, df: DataFrame,
                    partition_spec: PartitionSpec) -> DataFrame:
        """Repartition ``df`` according to ``partition_spec``.

        The partition count may reference ``ROWCOUNT``, which is computed
        lazily (persisting the dataframe first so the count is materialized
        once).  Dispatches on ``partition_spec.algo`` and applies
        within-partition sorting when the spec requests it.

        :raises NotImplementedError: if the algo is not one of
            ``hash``/``rand``/``even``.
        """
        def _count_persisted(frame: DataFrame) -> int:
            # Persist before counting so the row count is not recomputed.
            frame = self.persist(frame)
            return frame.count()

        df = self.to_df(df)
        num = partition_spec.get_num_partitions(
            **{KEYWORD_ROWCOUNT: lambda: _count_persisted(df)})

        algo = partition_spec.algo
        if algo == "even":
            # NOTE(review): even repartition appears to require a persisted
            # dataframe — confirm why persist is needed only on this path.
            df = self.persist(df)
            sdf = even_repartition(self.spark_session, df.native, num,
                                   partition_spec.partition_by)
        elif algo == "hash":
            sdf = hash_repartition(self.spark_session, df.native, num,
                                   partition_spec.partition_by)
        elif algo == "rand":
            sdf = rand_repartition(self.spark_session, df.native, num,
                                   partition_spec.partition_by)
        else:  # pragma: no cover
            raise NotImplementedError(algo + " is not supported")

        sorts = partition_spec.get_sorts(df.schema)
        if sorts:
            sdf = sdf.sortWithinPartitions(
                *sorts.keys(), ascending=list(sorts.values()))
        return self.to_df(sdf, df.schema, df.metadata)
Example #3
0
 def repartition(self, df: DataFrame,
                 partition_spec: PartitionSpec) -> DaskDataFrame:
     """Repartition a Dask dataframe to the count requested by the spec.

     Returns the input unchanged when the spec is empty, when it asks to
     partition by columns (not handled here — returned as-is), or when the
     resolved partition count is not positive.
     """
     df = self.to_df(df)
     # Empty spec, or partition-by columns requested: nothing to do here.
     if partition_spec.empty or len(partition_spec.partition_by) > 0:
         return df
     providers = {
         KEYWORD_ROWCOUNT: lambda: df.persist().count(),  # type: ignore
         KEYWORD_CORECOUNT: lambda: 2,  # TODO: remove this hard code
     }
     num = partition_spec.get_num_partitions(**providers)
     if num <= 0:
         return df
     return DaskDataFrame(
         df.native.repartition(npartitions=num),
         schema=df.schema,
         metadata=df.metadata,
         type_safe=False,
     )