示例#1
0
    def repartition(self, df: DataFrame,
                    partition_spec: PartitionSpec) -> DataFrame:
        """Repartition ``df`` as described by ``partition_spec``.

        The input is normalized with ``self.to_df``, repartitioned with the
        helper matching ``partition_spec.algo`` and, when the spec yields any
        sort keys, sorted within each partition.

        :param df: the dataframe to repartition
        :param partition_spec: supplies the algorithm (``"hash"``, ``"rand"``
            or ``"even"``), the partition keys, the partition count and the
            within-partition sort order
        :raises NotImplementedError: if ``partition_spec.algo`` is none of
            the three algorithms above
        :return: ``self.to_df`` applied to the repartitioned native frame,
            carrying the schema and metadata of the normalized input
        """
        df = self.to_df(df)

        def _count_rows() -> int:
            # Persist before counting; presumably this keeps the count from
            # forcing a second computation of the frame -- confirm against
            # ``self.persist``.
            return self.persist(df).count()

        # The row count is handed over as a callable rather than a value;
        # presumably it is only invoked when the partition-count expression
        # actually refers to the row count -- confirm in PartitionSpec.
        num = partition_spec.get_num_partitions(
            **{KEYWORD_ROWCOUNT: _count_rows})

        repartitioners = {
            "hash": hash_repartition,
            "rand": rand_repartition,
            "even": even_repartition,
        }
        algo = partition_spec.algo
        repartitioner = repartitioners.get(algo)
        if repartitioner is None:  # pragma: no cover
            raise NotImplementedError(algo + " is not supported")
        if algo == "even":
            # Only the even algorithm persists the frame up front.
            df = self.persist(df)
        sdf = repartitioner(self.spark_session, df.native, num,
                            partition_spec.partition_by)

        sorts = partition_spec.get_sorts(df.schema)
        if len(sorts) > 0:
            sort_columns = sorts.keys()
            sort_ascending = list(sorts.values())
            sdf = sdf.sortWithinPartitions(*sort_columns,
                                           ascending=sort_ascending)
        return self.to_df(sdf, df.schema, df.metadata)
示例#2
0
def test_partition_spec():
    """Check PartitionSpec construction, validation and derived values.

    Covers the empty spec, construction from None / another spec / a JSON
    string / a dict / keyword arguments, rejection of invalid input, the
    ``get_sorts`` and ``get_key_schema`` helpers, and overriding fields of
    an existing spec.
    """
    # Empty spec: every field holds its default value.
    p = PartitionSpec()
    assert [] == p.partition_by
    assert "0" == p.num_partitions
    assert {} == p.presort
    assert "hash" == p.algo
    assert p.empty

    # None, and a copy of an empty spec, are both empty.
    p = PartitionSpec(None)
    assert p.empty
    p2 = PartitionSpec(p)
    assert p2.empty

    # Construction from a JSON string.
    p = PartitionSpec(
        json.dumps(dict(partition_by=["a", "b", "c"], num_partitions=1)))
    assert ["a", "b", "c"] == p.partition_by
    assert "1" == p.num_partitions
    assert {} == p.presort
    assert "hash" == p.algo
    assert not p.empty

    # Construction from a dict using the short key name "by".
    p = PartitionSpec(dict(by=["a", "b", "c"], presort="d asc,e desc"))
    assert ["a", "b", "c"] == p.partition_by
    assert "0" == p.num_partitions
    assert dict(d=True, e=False) == p.presort
    assert "hash" == p.algo
    assert not p.empty

    # Construction from keyword arguments; algo is lower-cased and a presort
    # column without a direction is ascending.
    p = PartitionSpec(by=["a", "b", "c"], num=5, presort="d,e desc",
                      algo="EvEN")
    assert ["a", "b", "c"] == p.partition_by
    assert "5" == p.num_partitions
    assert dict(d=True, e=False) == p.presort
    assert "even" == p.algo
    assert not p.empty

    # Copying a fully populated spec preserves its JSON representation.
    p = PartitionSpec(partition_by=["a", "b", "c"], presort="d,e desc",
                      algo="EvEN", num_partitions="ROWCOUNT*3", row_limit=4,
                      size_limit="5k")
    p2 = PartitionSpec(p)
    assert p2.jsondict == p.jsondict
    assert "d ASC,e DESC" == p2.presort_expr
    assert not p.empty
    assert not p2.empty

    # partition by overlaps with presort
    raises(SyntaxError, lambda: PartitionSpec(
        partition_by=["a", "b", "c"], presort="a asc,e desc", algo="EvEN"))

    # partition by has dups
    raises(SyntaxError, lambda: PartitionSpec(partition_by=["a", "b", "b"]))

    # bad input
    raises(TypeError, lambda: PartitionSpec(1))

    # bad presort
    raises(SyntaxError, lambda: PartitionSpec(presort="a xsc,e desc"))
    raises(SyntaxError, lambda: PartitionSpec(presort="a asc,a desc"))
    raises(SyntaxError, lambda: PartitionSpec(presort="a b asc,a desc"))

    # get_sorts puts the partition keys (ascending) before the presort keys.
    p = PartitionSpec(dict(partition_by=["a"], presort="d asc,e desc"))
    assert dict(a=True, d=True, e=False) == p.get_sorts(
        Schema("a:int,b:int,d:int,e:int"))

    # get_key_schema follows the partition_by order, not the schema order.
    p = PartitionSpec(dict(partition_by=["e", "a"], presort="d asc"))
    assert p.get_key_schema(Schema("a:int,b:int,d:int,e:int")) == "e:int,a:int"

    # modification: overrides apply to the new spec and leave the source
    # spec untouched.
    a = PartitionSpec(by=["a", "b"])
    b = PartitionSpec(a, by=["a"], num=2)
    assert ["a", "b"] == a.partition_by
    assert "0" == a.num_partitions
    assert ["a"] == b.partition_by
    assert "2" == b.num_partitions

    # presort comparison is order-sensitive; a presort override may be given
    # as another spec's presort or as a list of (column, ascending) pairs.
    a = PartitionSpec(by=["a"], presort="b DESC, c")
    b = PartitionSpec(by=["a"], presort="c,b DESC")
    assert a.presort != b.presort
    c = PartitionSpec(b, presort=a.presort)
    assert a.presort == c.presort
    c = PartitionSpec(b, presort=[("b", False), ("c", True)])
    assert a.presort == c.presort