def create_context(n_processes=0):
    if not n_processes:
        return fast_pyspark_tester.Context()

    pool = futures.ProcessPoolExecutor(n_processes)
    return fast_pyspark_tester.Context(
        pool=pool,
        serializer=cloudpickle.dumps,
        # serializer=pickle.dumps,
        deserializer=pickle.loads,
    )

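# Hypothetical usage sketch for create_context (assumes the same
# concurrent.futures/cloudpickle/pickle imports used by the snippets here):
# n_processes=0 keeps all work in-process; larger values fan work out to a
# process pool, so tasks must survive a cloudpickle round trip.
sc_serial = create_context()
print(sc_serial.parallelize(range(10)).map(lambda x: x * x).collect())

sc_parallel = create_context(n_processes=4)
print(sc_parallel.parallelize(range(10), 4).map(lambda x: x * x).collect())
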
def runtime(self, n=10, processes=1):
    start = time.time()
    with futures.ProcessPoolExecutor(processes) as pool:
        sc = fast_pyspark_tester.Context(
            pool=pool,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        )
        rdd = sc.parallelize(range(n), 10)
        rdd.map(lambda _: time.sleep(0.1)).collect()
    return time.time() - start

def test_union(self):
    sc = fast_pyspark_tester.Context()
    rdd1 = sc.parallelize(['Hello'])
    rdd2 = sc.parallelize(['World'])
    union = sc.union([rdd1, rdd2]).collect()
    print(union)
    self.assertEqual(union, ['Hello', 'World'])

def test_lock1(self):
    """Should not be able to create a new RDD inside a map operation."""
    sc = fast_pyspark_tester.Context()
    self.assertRaises(
        fast_pyspark_tester.exceptions.ContextIsLockedException,
        lambda: (sc.parallelize(range(5))
                 .map(lambda _: sc.parallelize([1]))
                 .collect()),
    )

def main():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()

    sc = fast_pyspark_tester.Context()
    sc.parallelize(range(1000000)).saveAsTextFile(tempFile.name + '.gz')
    rdd = sc.textFile(tempFile.name + '.gz')
    rdd.collect()

def test_connect(self):
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

    result = []
    (ssc.textFileStream(LICENSE_FILE, process_all=True)
     .count()
     .foreachRDD(lambda rdd: result.append(rdd.collect()[0])))

    ssc.start()
    ssc.awaitTermination(timeout=0.3)
    self.assertEqual(sum(result), 44)

def test_lock2(self):
    """Should not be able to create RDDs containing RDDs."""
    sc = fast_pyspark_tester.Context()

    def parallelize_in_parallelize():
        o = sc.parallelize(sc.parallelize(range(x)) for x in range(5))
        print(o.map(lambda x: x.collect()).collect())

    self.assertRaises(
        fast_pyspark_tester.exceptions.ContextIsLockedException,
        parallelize_in_parallelize,
    )

def test_read_chunks(self):
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

    result = []
    (ssc.fileBinaryStream(LICENSE_FILE, recordLength=40, process_all=True)
     .count()
     .foreachRDD(lambda rdd: result.append(rdd.collect()[0])))

    ssc.start()
    ssc.awaitTermination(timeout=0.3)
    self.assertEqual(sum(result), 55)

def test_cache_empty_partition():
    m = Manip()

    c = fast_pyspark_tester.Context()
    rdd = c.parallelize(range(10), 2)
    rdd = rdd.map(m.trivial_manip_with_debug)
    rdd = rdd.filter(lambda e: e > 6).cache()
    print(rdd.collect())
    print(rdd.collect())

    print('count of map executions: {}'.format(m.count))
    assert m.count == 10

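# The cache tests in this section reference a Manip helper that is defined
# elsewhere; a minimal sketch of what it plausibly looks like, assuming its
# only job is to count how often the mapped function actually executes:
class Manip(object):
    def __init__(self):
        self.count = 0

    def trivial_manip_with_debug(self, e):
        self.count += 1  # one increment per processed element
        return e
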
def test_main(self):
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

    counter = Counter()
    (ssc.socketBinaryStream('127.0.0.1', 8125, length='<I')
     .foreachRDD(lambda rdd: counter.update(rdd.collect())))

    self.client()
    ssc.start()
    ssc.awaitTermination(timeout=0.3)
    self.assertEqual(counter[b'hellohello'], 1)

def test_connect(self):
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

    counter = Counter()
    (ssc.socketTextStream('127.0.0.1', 8123)
     .foreachRDD(lambda rdd: counter.update(''.join(rdd.collect()))
                 if rdd.collect() else None))

    self.client()
    ssc.start()
    ssc.awaitTermination(timeout=0.3)
    self.assertEqual(counter['a'], 20)

def test_mapValues(self):
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

    result = []
    (
        ssc.queueStream([[('a', [5, 8, 2]), ('b', [6, 3, 8])]])
        .mapValues(sorted)
        .foreachRDD(lambda rdd: result.append(rdd.collect()))
    )

    ssc.start()
    ssc.awaitTermination(timeout=0.15)
    self.assertEqual(result, [[('a', [2, 5, 8]), ('b', [3, 6, 8])]])

def test_retry(self):
    class EverySecondCallFails(object):
        def __init__(self):
            self.attempt = 0

        def __call__(self, value):
            self.attempt += 1
            if self.attempt % 2 == 1:
                raise Exception
            return value

    data = list(range(6))
    rdd = fast_pyspark_tester.Context().parallelize(data, 3)
    result = rdd.mapPartitions(EverySecondCallFails()).collect()
    self.assertEqual(result, data)

def test_groupByKey(self):
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

    result = []
    (
        ssc.queueStream([[('a', 5), ('b', 8), ('a', 2)],
                         [('a', 2), ('b', 3)]])
        .groupByKey()
        .mapPartitions(sorted)
        .mapValues(sorted)
        .foreachRDD(lambda rdd: result.append(rdd.collect()))
    )

    ssc.start()
    ssc.awaitTermination(timeout=0.25)
    self.assertEqual(
        result,
        [[('a', [2, 5]), ('b', [8])],
         [('a', [2]), ('b', [3])]],
    )

def run(self, n=2000, to_kv=None, format_='hello'):
    c = fast_pyspark_tester.Context()
    stream_c = fast_pyspark_tester.streaming.StreamingContext(c, 1.0)

    counts = []
    sensor_sums = defaultdict(float)
    sensor_squares = defaultdict(float)
    sensor_counts = defaultdict(int)

    if format_ not in ('bello', 'struct'):
        t = stream_c.socketTextStream('localhost', self.port)
    else:
        length = {'bello': 5, 'struct': 8}[format_]
        t = stream_c.socketBinaryStream('localhost', self.port, length)

    t.count().foreachRDD(lambda _, rdd: counts.append(rdd.collect()[0]))
    if to_kv is not None:
        def update(rdd):
            for k, v in rdd.collect():
                sensor_sums[k] += sum(v)
                sensor_squares[k] += sum(vv**2 for vv in v)
                sensor_counts[k] += len(v)

        t.map(to_kv).groupByKey().foreachRDD(lambda _, rdd: update(rdd))

    self.client(n, format_=format_)
    stream_c.start()
    stream_c.awaitTermination(timeout=5.0)

    result = max(counts) if counts else 0
    sensor_expectations = {  # expectation of X and X^2 per sensor
        k: (sensor_sums[k] / v, sensor_squares[k] / v)
        for k, v in sensor_counts.items()
    }
    sensors = {
        k: (ex_ex2[0], math.sqrt(ex_ex2[1] - ex_ex2[0]**2))
        for k, ex_ex2 in sensor_expectations.items()
    }
    print('run: n = {}, counts = {}, result = {}'.format(n, counts, result))
    print('sensors = {}'.format(sensors))

    time.sleep(self.pause)
    self.port += 1
    return result

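# Note on the statistics above: the per-sensor standard deviation is derived
# from the identity Var(X) = E[X^2] - (E[X])^2. A hypothetical standalone
# check of that computation, independent of the streaming machinery:
import math

samples = [5.0, 8.0, 2.0, 6.0, 3.0]
n = len(samples)
ex = sum(samples) / n                    # E[X]
ex2 = sum(x ** 2 for x in samples) / n   # E[X^2]
std = math.sqrt(ex2 - ex ** 2)           # population standard deviation

# Same result as computing the variance directly from the mean:
var_direct = sum((x - ex) ** 2 for x in samples) / n
assert abs(std - math.sqrt(var_direct)) < 1e-12
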
def test_count(self):
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

    result = []
    (
        ssc.queueStream([range(20), ['a', 'b'], ['c']])
        .count()
        .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
    )

    ssc.start()
    if platform.system() == 'Windows':
        # Windows is freakishly slow, so a higher timeout is needed there.
        ssc.awaitTermination(timeout=1.0)
    else:
        ssc.awaitTermination(timeout=0.3)
    self.assertEqual(sum(result), 23)

def test_timed_cache():
    m = Manip()

    # create a timed cache manager
    cm = fast_pyspark_tester.TimedCacheManager(timeout=1.0)

    # create a cache entry
    c = fast_pyspark_tester.Context(cache_manager=cm)
    rdd = c.parallelize(range(10), 2)
    rdd = rdd.map(m.trivial_manip_with_debug).cache()
    print(rdd.collect())

    # make sure the cache is working
    count_before = m.count
    print(rdd.collect())
    count_after = m.count
    assert count_before == count_after

    # wait for the cache to expire
    time.sleep(1.5)
    cm.gc()
    print(rdd.collect())
    assert m.count > count_after

def main():
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 1)
    ssc.textFileStream('/var/log/system.log*').pprint()
    ssc.start()
    ssc.awaitTermination(timeout=3.0)

def read_csv(filename):
    c = fast_pyspark_tester.Context()
    r = c.textFile(filename)
    r = r.map(lambda l: l + 'something else')
    print(r.count())

def test_save_gz(self):
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)
    (ssc.textFileStream(LICENSE_FILE)
     .count()
     .saveAsTextFiles('tests/textout/', suffix='.gz'))

def test_parallelize_single_element(self):
    my_rdd = fast_pyspark_tester.Context().parallelize([7], 100)
    self.assertEqual(my_rdd.collect(), [7])

def setUp(self):
    self.pool = futures.ThreadPoolExecutor(4)
    self.sc = fast_pyspark_tester.Context(pool=self.pool)

def setUp(self):
    self.sc = fast_pyspark_tester.Context()

def setUp(self):
    self.pool = multiprocessing.Pool(4)
    self.sc = fast_pyspark_tester.Context(pool=self.pool)

def setUp(self):
    self.pool = multiprocessing.Pool(4)
    self.sc = fast_pyspark_tester.Context(
        pool=self.pool,
        serializer=cloudpickle.dumps,
        deserializer=pickle.loads,
    )

def test_parallelize_matched_elements(self):
    my_rdd = fast_pyspark_tester.Context().parallelize([1, 2, 3, 4, 5], 5)
    self.assertEqual(my_rdd.collect(), [1, 2, 3, 4, 5])

def test_parallelize_empty_partitions_at_end(self):
    my_rdd = fast_pyspark_tester.Context().parallelize(range(3529), 500)
    print(my_rdd.getNumPartitions())
    my_rdd.foreachPartition(lambda p: print(sum(1 for _ in p)))
    self.assertEqual(my_rdd.getNumPartitions(), 500)
    self.assertEqual(my_rdd.count(), 3529)

def setUp(self):
    self.pool = futures.ProcessPoolExecutor(4)
    self.sc = fast_pyspark_tester.Context(
        pool=self.pool,
        serializer=cloudpickle.dumps,
        deserializer=pickle.loads,
    )

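# The process-pool setups above pair serializer=cloudpickle.dumps with
# deserializer=pickle.loads: lambdas and closures cannot be shipped across
# process boundaries by the standard pickle module, while cloudpickle emits
# pickle-compatible bytes that plain pickle.loads can read back. A minimal
# round trip illustrating that pairing (a sketch, not part of the tests):
import pickle

import cloudpickle

offset = 10
task = lambda x: x + offset  # standard pickle.dumps would fail on this

payload = cloudpickle.dumps(task)  # serialize on the driver side
restored = pickle.loads(payload)   # deserialize on the worker side
assert restored(5) == 15
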
def test_broadcast(self):
    b = fast_pyspark_tester.Context().broadcast([1, 2, 3])
    self.assertEqual(b.value[0], 1)

def test_version(self):
    self.assertTrue(isinstance(fast_pyspark_tester.Context().version, str))