def test_first_mp():
    p = multiprocessing.Pool(4)
    c = Context(pool=p, serializer=cloudpickle.dumps,
                deserializer=pickle.loads)
    my_rdd = c.parallelize([1, 2, 2, 4, 1, 3, 5, 9], 3)
    print(my_rdd.first())
    assert my_rdd.first() == 1
def test_saveAsTextFile_zip():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name + '.zip')
    read_rdd = Context().textFile(tempFile.name + '.zip')
    print(read_rdd.collect())
    assert '5' in read_rdd.collect()
def test_read_7z():
    # file was created with:
    # 7z a tests/data.7z tests/readme_example.py
    # (brew install p7zip)
    rdd = Context().textFile('tests/data.7z')
    print(rdd.collect())
    assert 'from pysparkling import Context' in rdd.collect()
def test_multiprocessing():
    p = multiprocessing.Pool(4)
    c = Context(pool=p, serializer=dill.dumps, deserializer=dill.loads)
    my_rdd = c.parallelize([1, 3, 4])
    r = my_rdd.map(lambda x: x * x).collect()
    print(r)
    assert 16 in r
def test_algorith_excution(self):
    """Tests the algorithm execution with basic parameters."""
    folder_path = os.path.dirname(os.path.realpath(__file__))
    json_path = os.path.join(folder_path, 'data', 'objects.json')
    dataset = Context(
        serializer=cloudpickle.dumps,
        deserializer=pickle.loads,
    ).textFile(json_path).map(json.loads)
    dataset.persist()

    number_of_cluster = 2
    algorithm_settings = settings.ClusteringSetting(
        number_of_cluster, 2, 0.01, AlgorithmProvider.random_distance)
    labeled, centroids = kmeans.compute_cluster(
        dataset, algorithm_settings, AlgorithmProvider.random_sampling,
        AlgorithmProvider.random_cluster, AlgorithmProvider.dummy_update)
    labeled.collect()

    self.assertEqual(10, labeled.count())
    first_item = labeled.first()
    self.assertTrue(0 <= first_item[0] < 2)
    self.assertEqual(number_of_cluster, len(centroids))
def test_local_regex_read():
    # was not working before 0.3.19
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(30), 30).saveAsTextFile(tempFile.name)
    d = Context().textFile(tempFile.name + '/part-0000*').collect()
    print(d)
    assert len(d) == 10
def test_lazy_execution():
    r = Context().textFile('tests/test_multiprocessing.py')
    r = r.map(indent_line)
    exec_before_collect = INDENT_WAS_EXECUTED
    # at this point, no map() or foreach() should have been executed
    r.collect()
    exec_after_collect = INDENT_WAS_EXECUTED
    assert not exec_before_collect and exec_after_collect
def test_s3_textFile():
    myrdd = Context().textFile(
        's3n://aws-publicdatasets/common-crawl/crawl-data/'
        'CC-MAIN-2015-11/warc.paths.*')
    assert (
        'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424937481488.49/'
        'warc/CC-MAIN-20150226075801-00329-ip-10-28-5-156.ec2.'
        'internal.warc.gz' in myrdd.collect())
def test_filter():
    my_rdd = Context().parallelize(
        [1, 2, 2, 4, 1, 3, 5, 9],
        3,
    ).filter(lambda x: x % 2 == 0)
    print(my_rdd.collect())
    print(my_rdd.count())
    assert my_rdd.count() == 3
def test_multiprocessing():
    p = multiprocessing.Pool(4)
    c = Context(pool=p, serializer=cloudpickle.dumps,
                deserializer=pickle.loads)
    my_rdd = c.parallelize([1, 3, 4])
    r = my_rdd.map(lambda x: x * x).collect()
    print(r)
    assert 16 in r
def test_mapPartitions():
    rdd = Context().parallelize([1, 2, 3, 4], 2)

    def f(iterator):
        yield sum(iterator)

    r = rdd.mapPartitions(f).collect()
    assert 3 in r and 7 in r
def test_lazy_execution_threadpool():
    with futures.ThreadPoolExecutor(4) as p:
        r = Context(pool=p).textFile('tests/test_multiprocessing.py')
        r = r.map(indent_line)
        r = r.map(indent_line)
        r = r.collect()
        # ThreadPool is not lazy although it returns generators.
    print(r)
    assert '--- --- from pysparkling import Context' in r
def test_hdfs_file_exists():
    random.seed()
    # cast to int: a ':d' format spec is not valid for a float
    fn1 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0)}.txt'
    fn2 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0)}.txt'
    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn1)
    assert File(fn1).exists() and not File(fn2).exists()
def create_context(n_processes=0):
    if not n_processes:
        return Context()
    p = futures.ProcessPoolExecutor(n_processes)
    return Context(
        pool=p,
        serializer=cloudpickle.dumps,
        # serializer=pickle.dumps,
        deserializer=pickle.loads,
    )
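# A minimal usage sketch for the create_context() factory above. The call
# sites and expected outputs are illustrative assumptions, not part of the
# original snippet; it relies on the same imports used elsewhere here
# (concurrent.futures, pickle, cloudpickle, pysparkling.Context).
if __name__ == '__main__':
    # n_processes=0 (the default) falls back to a plain single-process Context
    local_rdd = create_context().parallelize([1, 2, 3]).map(lambda x: x * x)
    print(local_rdd.collect())  # expected: [1, 4, 9]

    # n_processes=2 evaluates partitions on a ProcessPoolExecutor; functions
    # shipped to the workers are serialized with cloudpickle.dumps and results
    # come back through pickle.loads
    pooled_rdd = create_context(2).parallelize(range(4), 2).map(lambda x: x + 1)
    print(pooled_rdd.collect())  # expected: [1, 2, 3, 4]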
def test_hdfs_file_exists():
    random.seed()
    # cast to int so the '{:d}' format spec is valid
    fn1 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)
    assert File(fn1).exists() and not File(fn2).exists()
def test_s3_textFile_loop():
    random.seed()
    # cast to int: a ':d' format spec is not valid for a float
    fn = f'{S3_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0)}.txt'
    rdd = Context().parallelize(f'Line {n}' for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
def test_gs_textFile_loop():
    random.seed()
    # cast to int so the '{:d}' format spec is valid
    fn = '{}/pysparkling_test_{:d}.txt'.format(
        GS_TEST_PATH, int(random.random() * 999999.0))
    rdd = Context().parallelize('Line {0}'.format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
def run_feature_extraction():
    start_time = time.time()
    desc = 'Feature Extraction for Images'
    parser = argparse.ArgumentParser(
        description=desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=desc)
    default_path = '/media/chris/cschulze_external_4tb/receipt_classifier_images/nonreceipts/train2014'
    # default_path = '/media/chris/cschulze_external_4tb/elliot_data/train_nonpill'
    # default_path = '/train_nonpill'
    parser.add_argument("--input_dir", help="input directory",
                        default=default_path)
    parser.add_argument("--output", help="output file",
                        default='image_features')
    args = parser.parse_args()

    # serialize and put all images in rdd, using the json schema:
    #   "image_name": "",
    #   "bytes": "",
    #   "features": "array[]"
    image_dir_path = args.input_dir
    df, data_arr = serialize_and_make_df(image_dir_path)
    print(df.head())
    print(df.info())

    # df to csv:
    csv_df_file = 'dataframe_csv_file.csv'
    json_df_file = 'dataframe_csv_file.json'
    df.to_csv(csv_df_file, header=False, index=False)
    # df.to_json(json_df_file)

    # rdd from df_csv
    # pysparkling:
    sc = Context()
    # pyspark:
    # conf = SparkConf().setAppName("HOG and GIST ETL")
    # sc = SparkContext(conf=conf)
    # rdd = sc.textFile(json_df_file)
    num_parts = 4
    rdd = sc.parallelize(data_arr, num_parts)

    # submit image rdd to processing
    rdd_features = rdd.map(get_features).coalesce(1)

    # save as txt file:
    rdd_features.map(dump).saveAsTextFile(args.output)

    print("------------------ %f minutes elapsed ------------------------" % (
        (time.time() - start_time) / 60.0))
def test_processpool_distributed_cache():
    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).parallelize(range(3), 3)
        r = r.map(lambda _: time.sleep(0.1)).cache()
        r.collect()

        time_start = time.time()
        print(r.collect())
        time_end = time.time()
        assert time_end - time_start < 0.3
def test_hdfs_file_exists():
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()
    fn1 = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))
    fn2 = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))
    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)
    assert File(fn1).exists() and not File(fn2).exists()
def test_session_storage_level(self):
    spark = SparkSession(Context())
    df = spark.range(4, numPartitions=2)
    self.assertEqual(repr(df.storageLevel),
                     repr(StorageLevel(False, False, False, False, 1)))
    persisted_df = df.persist()
    self.assertEqual(persisted_df.is_cached, True)
    self.assertEqual(repr(persisted_df.storageLevel),
                     repr(StorageLevel.MEMORY_ONLY))
def test_s3_textFile_loop():
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest

    random.seed()
    # use automatic field numbering throughout; mixing '{}' and '{0}' raises
    fn = '{}/pysparkling_test_{}.txt'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))
    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
def test_gs_textFile_loop():
    if not OAUTH2_CLIENT_ID or not GS_TEST_PATH:
        raise SkipTest

    random.seed()
    fn = GS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))
    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
def test_hdfs_textFile_loop():
    random.seed()
    # cast to int so the '{:d}' format spec is valid
    fn = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    print('HDFS test file: {0}'.format(fn))
    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())
    assert (
        rdd.count() == read_rdd.count()
        and all(r1 == r2
                for r1, r2 in zip(rdd.collect(), read_rdd.collect()))
    )
def test_hdfs_textFile_loop():
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()
    fn = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))
    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    assert (
        rdd.count() == read_rdd.count()
        and all(r1 == r2
                for r1, r2 in zip(rdd.collect(), read_rdd.collect()))
    )
def test_saveAsTextFile():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name)
    with open(tempFile.name, 'r') as f:
        r = f.readlines()
    print(r)
    assert '5\n' in r
def test_cache():
    # this crashes in version 0.2.28
    lines = Context().textFile('tests/*textFil*.py')
    lines = lines.map(lambda l: '-' + l).cache()
    print(len(lines.collect()))
    lines = lines.map(lambda l: '+' + l)
    lines = lines.map(lambda l: '-' + l).cache()
    lines = lines.collect()
    print(lines)
    assert '-+-from pysparkling import Context' in lines
def test_gs_textFile_loop():
    random.seed()
    # cast to int so the '{:d}' format spec is valid
    fn = '{}/pysparkling_test_{:d}.txt'.format(
        GS_TEST_PATH, int(random.random() * 999999.0))
    rdd = Context().parallelize('Line {0}'.format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (
        rdd.count() == rdd_check.count()
        and all(e1 == e2
                for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
    )
def test_cache():
    my_rdd = Context().parallelize([1, 2, 3, 4], 2)
    my_rdd = my_rdd.map(lambda x: x * x).cache()
    print('no exec until here')
    print(my_rdd.first())
    print('executed map on first partition only')
    print(my_rdd.collect())
    print('now map() was executed on all partitions and should '
          'not be executed again')
    print(my_rdd.collect())
    assert len(my_rdd.collect()) == 4 and 16 in my_rdd.collect()
def test_lazy_execution_threadpool():
    def indent_line(l):
        return '--- ' + l

    with futures.ThreadPoolExecutor(4) as p:
        r = Context(pool=p).textFile('tests/test_multiprocessing.py')
        r = r.map(indent_line).cache()
        r.collect()
        r = r.map(indent_line)
        r = r.collect()
        # ThreadPool is not lazy although it returns generators.
    print(r)
    assert '--- --- from pysparkling import Context' in r
def test_lazy_execution_processpool():
    def indent_line(l):
        return '--- ' + l

    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).textFile('tests/test_multiprocessing.py')  # .take(10)
        print(r.collect())
        r = r.map(indent_line)
        print(r.collect())
        r = r.cache()
        print(r.collect())
        r = r.map(indent_line)
        r = r.collect()
        # ProcessPool is not lazy although it returns generators.
    print(r)
    assert '--- --- from pysparkling import Context' in r
def test_gs_textFile_loop():
    if not OAUTH2_CLIENT_ID or not GS_TEST_PATH:
        raise SkipTest

    random.seed()
    # use automatic field numbering throughout; mixing '{}' and '{0}' raises
    fn = '{}/pysparkling_test_{}.txt'.format(
        GS_TEST_PATH, int(random.random() * 999999.0))
    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (
        rdd.count() == rdd_check.count()
        and all(e1 == e2
                for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
    )
def test_s3_textFile_loop():
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest

    random.seed()
    fn = S3_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))
    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (
        rdd.count() == rdd_check.count()
        and all(e1 == e2
                for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
    )
def test_lazy_execution_processpool():
    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=dill.dumps,
            deserializer=dill.loads,
        ).textFile('tests/test_multiprocessing.py')
        r = r.map(indent_line).cache()
        r.collect()
        r = r.map(indent_line)
        r = r.collect()
        # ProcessPool is not lazy although it returns generators.
    print(r)
    assert '--- --- from pysparkling import Context' in r
def test_lazy_execution():
    class I(object):
        def __init__(self):
            self.executed = False

        def indent_line(self, l):
            # global indent_was_executed
            self.executed = True
            return '--- ' + l

    r = Context().textFile('tests/test_multiprocessing.py')
    i = I()
    r = r.map(i.indent_line)
    exec_before_collect = i.executed
    # at this point, no map() or foreach() should have been executed
    r = r.map(i.indent_line).cache()
    print(r.collect())
    r = r.map(i.indent_line)
    r.collect()
    exec_after_collect = i.executed
    print((exec_before_collect, exec_after_collect))
    assert not exec_before_collect and exec_after_collect
def test_local_textFile_name():
    name = Context().textFile('tests/*.py').name()
    print(name)
    assert name == 'tests/*.py'
def test_local_textFile_1():
    lines = Context().textFile('tests/*textFil*.py').collect()
    print(lines)
    assert 'from pysparkling import Context' in lines
def test_local_textFile_2():
    line_count = Context().textFile('tests/*.py').count()
    print(line_count)
    assert line_count > 90
def test_pyspark_compatibility_gz():
    kv = Context().textFile('tests/pyspark/key_value.txt.gz').collect()
    print(kv)
    assert u"a\t1" in kv and u"b\t2" in kv and len(kv) == 2
from pysparkling import Context

my_rdd = Context().textFile("tests/*.py")
print(
    "In tests/*.py: all lines={0}, with import={1}".format(
        my_rdd.count(),
        my_rdd.filter(lambda l: l.startswith("import ")).count()
    )
)
from pysparkling import Context

counts = Context().textFile(
    'README.rst'
).flatMap(
    lambda line: line.split(' ')
).map(
    lambda word: (word, 1)
).reduceByKey(
    lambda a, b: a + b
)
print(counts.collect())
def test_pyspark_compatibility_txt():
    kv = Context().textFile('tests/pyspark/key_value.txt').collect()
    print(kv)
    assert u"('a', 1)" in kv and u"('b', 2)" in kv and len(kv) == 2
def test_http_textFile():
    myrdd = Context().textFile(
        'https://s3-us-west-2.amazonaws.com/human-microbiome-project/DEMO/'
        'HM16STR/46333/by_subject/1139.fsa'
    )
    assert u'TGCTGCGGTGAATGCGTTCCCGGGTCT' in myrdd.collect()
class RDDTest(unittest.TestCase):
    """Tests for resilient distributed datasets (RDDs)."""

    def setUp(self):
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Test the basic left outer join with simple key-value pairs"""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())
        zx = sorted(z.leftOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc'))])
        self.assertEqual(zx, [('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    def testLeftOuterJoinDuplicate(self):
        """Test the left outer join with duplicate keys"""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])
        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2'))])

    def testRightOuterJoinSimple(self):
        """Test the basic right outer join with simple key-value pairs"""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())
        zx = sorted(z.rightOuterJoin(x).collect())

        self.assertEqual(xy, [('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])
        self.assertEqual(xz, [('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])
        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc'))])

    def testRightOuterJoinDuplicate(self):
        """Test the right outer join with duplicate keys"""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())

        self.assertEqual(xy, [('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])
        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])

    def testFullOuterJoinSimple(self):
        """Test the basic full outer join with simple key-value pairs"""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())
        zx = sorted(z.fullOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])
        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    def testFullOuterJoinDuplicate(self):
        """Test the full outer join with duplicate keys"""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])
        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])

    def test_cartesian(self):
        x = self.context.parallelize(range(0, 2))
        y = self.context.parallelize(range(3, 6))
        c = x.cartesian(y)
        result = sorted(c.collect())
        expected = sorted([(0, 3), (0, 4), (0, 5), (1, 3), (1, 4), (1, 5)])
        self.assertListEqual(result, expected)

    def test_sample(self):
        rdd = self.context.parallelize(range(100), 4)
        self.assertTrue(6 <= rdd.sample(False, 0.1, 81).count() <= 14)

    def test_sampleByKey(self):
        fractions = {"a": 0.2, "b": 0.1}
        range_rdd = self.context.parallelize(range(0, 1000))
        rdd = self.context.parallelize(fractions.keys()).cartesian(range_rdd)
        sample = dict(
            rdd.sampleByKey(False, fractions, 2).groupByKey().collect()
        )
        self.assertTrue(100 < len(sample["a"]) < 300
                        and 50 < len(sample["b"]) < 150)
        self.assertTrue(max(sample["a"]) <= 999 and min(sample["a"]) >= 0)
        self.assertTrue(max(sample["b"]) <= 999 and min(sample["b"]) >= 0)

    def test_groupByKey(self):
        # This will fail if the values of the RDD need to be compared
        class IncomparableValue(object):
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValue(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.groupByKey().collect()
        expected_group = ((0, r[0::3]), (1, r[1::3]), (2, r[2::3]))

        grouped_dict = {k: v for k, v in actual_group}
        for k, v in expected_group:
            self.assertIn(k, grouped_dict)
            for vv in v:
                self.assertIn(vv, grouped_dict[k])

    def test_reduceByKey(self):
        # This will fail if the values of the RDD need to be compared
        class IncomparableValueAddable(object):
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __add__(self, other):
                return self.__class__(self.value + other.value)

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValueAddable(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.reduceByKey(add).collect()
        expected_group = ((0, IncomparableValueAddable(3)),
                          (1, IncomparableValueAddable(5)),
                          (2, IncomparableValueAddable(7)))

        grouped_dict = {k: v for k, v in actual_group}
        # Keep this order-agnostic
        for k, v in expected_group:
            self.assertEqual(grouped_dict[k], v)
def test_wholeTextFiles():
    t = Context().wholeTextFiles('tests/*.py').lookup('tests/test_textFile.py')
    print(t)
    assert 'test_wholeTextFiles' in t[0]
from pysparkling import Context

# read all the paths of warc and wat files of the latest Common Crawl
paths_rdd = Context().textFile(
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*,'
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/wat.paths.gz'
)

print(paths_rdd.collect())
def test_wholeTextFiles():
    all_files = Context().wholeTextFiles('{}/*.py'.format(LOCAL_TEST_PATH))
    this_file = all_files.lookup(__file__)
    print(this_file)
    assert 'test_wholeTextFiles' in this_file[0]
def test_local_textFile_name():
    name = Context().textFile('{}/*.py'.format(LOCAL_TEST_PATH)).name()
    print(name)
    assert name.startswith('{}/*.py'.format(LOCAL_TEST_PATH))
def setUp(self):
    self.context = Context()