Example #1
    def test_rdd_sampler_func(self):
        # SPARK-38879: improve test coverage for RDDSampler.func.
        rdd = self.sc.parallelize(range(20), 2)
        sample_count = rdd.mapPartitionsWithIndex(RDDSampler(False, 0.4, 10).func).count()
        self.assertGreater(sample_count, 3)
        self.assertLess(sample_count, 10)
        sample_data = rdd.mapPartitionsWithIndex(RDDSampler(True, 1, 10).func).collect()
        sample_data.sort()
        # With replacement at fraction 1.0, at least one element should be repeated.
        self.assertTrue(
            any(sample_data[i] == sample_data[i - 1] for i in range(1, len(sample_data)))
        )
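The same sampler can be exercised outside a test harness; a minimal sketch, assuming a local SparkContext (the app name below is illustrative):

    from pyspark import SparkContext
    from pyspark.rddsampler import RDDSampler

    sc = SparkContext("local[2]", "rddsampler-demo")  # hypothetical app name
    rdd = sc.parallelize(range(20), 2)
    # Bernoulli sampling without replacement: keep each element with probability 0.4.
    sampled = rdd.mapPartitionsWithIndex(RDDSampler(False, 0.4, 10).func).collect()
    print(sampled)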
Example #2
    def takeSample(self, withReplacement, num, seed):
        """
        Return a fixed-size sampled subset of this RDD (currently requires numpy).

        >>> sc.parallelize(range(0, 10)).takeSample(True, 10, 1) #doctest: +SKIP
        [4, 2, 1, 8, 2, 7, 0, 4, 1, 4]
        """

        fraction = 0.0
        total = 0
        multiplier = 3.0
        initialCount = self.count()
        maxSelected = 0

        if num < 0:
            raise ValueError("Sample size cannot be negative: %d" % num)

        if initialCount == 0:
            return []

        if initialCount > sys.maxsize - 1:  # sys.maxsize replaces Python 2's sys.maxint
            maxSelected = sys.maxsize - 1
        else:
            maxSelected = initialCount

        if num > initialCount and not withReplacement:
            total = maxSelected
            fraction = multiplier * (maxSelected + 1) / initialCount
        else:
            fraction = multiplier * (num + 1) / initialCount
            total = num

        samples = self.sample(withReplacement, fraction, seed).collect()

        # If the first sample didn't turn out large enough, keep trying to take samples;
        # this shouldn't happen often because we use a big multiplier for their initial size.
        # See: scala/spark/RDD.scala
        while len(samples) < total:
            if seed > sys.maxsize - 2:
                seed = -1
            seed += 1
            samples = self.sample(withReplacement, fraction, seed).collect()

        sampler = RDDSampler(withReplacement, fraction, seed + 1)
        sampler.shuffle(samples)
        return samples[:total]
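A short usage sketch for takeSample, assuming a SparkContext `sc` (outputs vary with the seed):

    rdd = sc.parallelize(range(100))
    # Exactly 10 distinct elements when sampling without replacement.
    print(rdd.takeSample(False, 10, 42))
    # With replacement, the 10 results may contain duplicates.
    print(rdd.takeSample(True, 10, 1))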
Example #3
File: rdd.py Project: xoltar/spark
    def sample(self, withReplacement, fraction, seed):
        """
        Return a sampled subset of this RDD (relies on numpy and falls back
        on default random generator if numpy is unavailable).

        >>> sc.parallelize(range(0, 100)).sample(False, 0.1, 2).collect() #doctest: +SKIP
        [2, 3, 20, 21, 24, 41, 42, 66, 67, 89, 90, 98]
        """
        return self.mapPartitionsWithIndex(
            RDDSampler(withReplacement, fraction, seed).func, preservesPartitioning=True
        )
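A brief usage sketch, again assuming a SparkContext `sc`; note that fraction is an expected proportion of elements, not an exact count:

    rdd = sc.parallelize(range(1000))
    # Roughly 10% of the elements; the exact size varies with the seed.
    subset = rdd.sample(False, 0.1, 2).collect()
    print(len(subset))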
Example #4
def _sample(rdd: RDD, fraction: float, seed: int) -> RDD:
    from pyspark.rddsampler import RDDSampler

    assert fraction >= 0.0, "Negative fraction value: %s" % fraction

    # Sample without replacement: each element survives with probability `fraction`.
    _sample_func = RDDSampler(False, fraction, seed).func

    def _func(split, iterator):
        # `split` is the partition index that RDDSampler.func expects.
        return _sample_func(split, iterator)

    return rdd.mapPartitionsWithIndex(_func, preservesPartitioning=True)
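A minimal call sketch, assuming a SparkContext `sc` and that this _sample helper is in scope:

    rdd = sc.parallelize(range(50), 4)
    # Keep roughly 20% of the elements; a fixed seed makes the draw reproducible.
    print(_sample(rdd, 0.2, 123).collect())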