def test_read_7z():
    # file was created with:
    # 7z a tests/data.7z tests/readme_example.py
    # (brew install p7zip)
    rdd = Context().textFile('tests/data.7z')
    print(rdd.collect())
    assert 'from pysparkling import Context' in rdd.collect()

def test_saveAsTextFile_zip():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name + '.zip')
    read_rdd = Context().textFile(tempFile.name + '.zip')
    print(read_rdd.collect())
    assert '5' in read_rdd.collect()

def test_lazy_execution():
    # indent_line() sets the module-level INDENT_WAS_EXECUTED flag when it runs
    r = Context().textFile('tests/test_multiprocessing.py')
    r = r.map(indent_line)
    exec_before_collect = INDENT_WAS_EXECUTED
    # at this point, no map() or foreach() should have been executed
    r.collect()
    exec_after_collect = INDENT_WAS_EXECUTED
    assert not exec_before_collect and exec_after_collect

def test_cache():
    # this crashes in version 0.2.28
    lines = Context().textFile('tests/*textFil*.py')
    lines = lines.map(lambda l: '-' + l).cache()
    print(len(lines.collect()))
    lines = lines.map(lambda l: '+' + l)
    lines = lines.map(lambda l: '-' + l).cache()
    lines = lines.collect()
    print(lines)
    assert '-+-from pysparkling import Context' in lines

def test_cache():
    my_rdd = Context().parallelize([1, 2, 3, 4], 2)
    my_rdd = my_rdd.map(lambda x: x * x).cache()
    print('no exec until here')
    print(my_rdd.first())
    print('executed map on first partition only')
    print(my_rdd.collect())
    print('now map() was executed on all partitions and should '
          'not be executed again')
    print(my_rdd.collect())
    assert len(my_rdd.collect()) == 4 and 16 in my_rdd.collect()

def test_s3_textFile_loop():
    random.seed()
    fn = f'{S3_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0)}.txt'
    rdd = Context().parallelize(f'Line {n}' for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (rdd.count() == rdd_check.count() and
            all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect())))

def test_lazy_execution_threadpool():
    def indent_line(l):
        return '--- ' + l

    with futures.ThreadPoolExecutor(4) as p:
        r = Context(pool=p).textFile('tests/test_multiprocessing.py')
        r = r.map(indent_line).cache()
        r.collect()
        r = r.map(indent_line)
        r = r.collect()
        # ThreadPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r

def test_gs_textFile_loop():
    random.seed()
    fn = '{}/pysparkling_test_{:d}.txt'.format(
        GS_TEST_PATH, int(random.random() * 999999.0))
    rdd = Context().parallelize('Line {0}'.format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (
        rdd.count() == rdd_check.count()
        and all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
    )

def test_hdfs_textFile_loop():
    random.seed()
    fn = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0)}.txt'
    print(f'HDFS test file: {fn}')
    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())
    assert (rdd.count() == read_rdd.count() and
            all(r1 == r2 for r1, r2 in zip(rdd.collect(), read_rdd.collect())))

def test_lazy_execution_processpool():
    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=dill.dumps,
            deserializer=dill.loads,
        ).textFile('tests/test_multiprocessing.py')
        r = r.map(indent_line).cache()
        r.collect()
        r = r.map(indent_line)
        r = r.collect()
        # ProcessPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r

def test_processpool_distributed_cache():
    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).parallelize(range(3), 3)
        r = r.map(lambda _: time.sleep(0.1)).cache()
        r.collect()
        time_start = time.time()
        print(r.collect())
        time_end = time.time()
        assert time_end - time_start < 0.3

def test_gs_textFile_loop():
    if not OAUTH2_CLIENT_ID or not GS_TEST_PATH:
        raise SkipTest
    random.seed()
    fn = GS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))
    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (rdd.count() == rdd_check.count() and
            all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect())))

def test_s3_textFile_loop():
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest
    random.seed()
    fn = '{}/pysparkling_test_{}.txt'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))
    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (rdd.count() == rdd_check.count() and
            all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect())))

def test_hdfs_textFile_loop():
    random.seed()
    fn = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    print('HDFS test file: {0}'.format(fn))
    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())
    assert (
        rdd.count() == read_rdd.count()
        and all(r1 == r2 for r1, r2 in zip(rdd.collect(), read_rdd.collect()))
    )

def test_filter():
    my_rdd = Context().parallelize(
        [1, 2, 2, 4, 1, 3, 5, 9],
        3,
    ).filter(lambda x: x % 2 == 0)
    print(my_rdd.collect())
    print(my_rdd.count())
    assert my_rdd.count() == 3

def test_hdfs_textFile_loop():
    if not HDFS_TEST_PATH:
        raise SkipTest
    random.seed()
    fn = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0)
    )
    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    assert (
        rdd.count() == read_rdd.count()
        and all(r1 == r2 for r1, r2 in zip(rdd.collect(), read_rdd.collect()))
    )

def test_gs_textFile_loop():
    if not OAUTH2_CLIENT_ID or not GS_TEST_PATH:
        raise SkipTest
    random.seed()
    fn = '{}/pysparkling_test_{}.txt'.format(
        GS_TEST_PATH, int(random.random() * 999999.0)
    )
    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (
        rdd.count() == rdd_check.count()
        and all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
    )

def test_s3_textFile_loop():
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest
    random.seed()
    fn = S3_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0)
    )
    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)
    assert (
        rdd.count() == rdd_check.count()
        and all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
    )

def test_s3_textFile():
    myrdd = Context().textFile(
        's3n://aws-publicdatasets/common-crawl/crawl-data/'
        'CC-MAIN-2015-11/warc.paths.*'
    )
    assert (
        'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424937481488.49/'
        'warc/CC-MAIN-20150226075801-00329-ip-10-28-5-156.ec2.'
        'internal.warc.gz' in myrdd.collect()
    )

def test_lazy_execution_processpool():
    def indent_line(l):
        return '--- ' + l

    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).textFile('tests/test_multiprocessing.py')  # .take(10)
        print(r.collect())
        r = r.map(indent_line)
        print(r.collect())
        r = r.cache()
        print(r.collect())
        r = r.map(indent_line)
        r = r.collect()
        # ProcessPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r

def test_lazy_execution():
    class I(object):
        def __init__(self):
            self.executed = False

        def indent_line(self, l):
            # global indent_was_executed
            self.executed = True
            return '--- ' + l

    r = Context().textFile('tests/test_multiprocessing.py')
    i = I()
    r = r.map(i.indent_line)
    exec_before_collect = i.executed
    # at this point, no map() or foreach() should have been executed
    r = r.map(i.indent_line).cache()
    print(r.collect())
    r = r.map(i.indent_line)
    r.collect()
    exec_after_collect = i.executed
    print((exec_before_collect, exec_after_collect))
    assert not exec_before_collect and exec_after_collect

def test_http_textFile():
    myrdd = Context().textFile(
        'https://s3-us-west-2.amazonaws.com/human-microbiome-project/DEMO/'
        'HM16STR/46333/by_subject/1139.fsa'
    )
    assert u'TGCTGCGGTGAATGCGTTCCCGGGTCT' in myrdd.collect()

def test_local_textFile_1():
    lines = Context().textFile('{}/*textFil*.py'.format(LOCAL_TEST_PATH))
    lines = lines.collect()
    print(lines)
    assert 'from pysparkling import Context' in lines

def test_parallelize_matched_elements():
    my_rdd = Context().parallelize([1, 2, 3, 4, 5], 5)
    assert my_rdd.collect()[2] == 3 and len(my_rdd.collect()) == 5

from pysparkling import Context

counts = Context().textFile(
    'README.rst'
).flatMap(
    lambda line: line.split(' ')
).map(
    lambda word: (word, 1)
).reduceByKey(
    lambda a, b: a + b
)
print(counts.collect())

def test_read_tar_gz():
    # file was created with:
    # tar -cvzf data.tar.gz hello.txt
    rdd = Context().textFile('{}/data.tar.gz'.format(LOCAL_TEST_PATH))
    print(rdd.collect())
    assert 'Hello pysparkling!' in rdd.collect()

def test_map():
    my_rdd = Context().parallelize([1, 2, 3]).map(lambda x: x + 1)
    assert my_rdd.collect()[0] == 2

def test_read_tar_gz():
    # file was created with:
    # tar -cvzf data.tar.gz hello.txt
    rdd = Context().textFile('tests/data.tar.gz')
    print(rdd.collect())
    assert 'Hello pysparkling!' in rdd.collect()

def test_parallelize_single_element():
    my_rdd = Context().parallelize([7], 100)
    assert my_rdd.collect()[0] == 7

def test_collect():
    my_rdd = Context().parallelize([1, 2, 3])
    assert my_rdd.collect()[0] == 1

from pysparkling import Context

# read all the paths of warc and wat files of the latest Common Crawl
paths_rdd = Context().textFile(
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*,'
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/wat.paths.gz'
)

print(paths_rdd.collect())

def test_keys():
    rdd = Context().parallelize([(0, 1), (1, 1)]).keys()
    assert rdd.collect()[0] == 0

def test_keyBy():
    rdd = Context().parallelize([0, 4, 7, 4, 10])
    rdd = rdd.keyBy(lambda x: x % 2)
    assert rdd.collect()[2][0] == 1  # the third element (7) is odd

def test_count_partitions():
    my_rdd = Context().parallelize([1, 2, 3], 2)
    print(my_rdd.collect())
    my_rdd.foreach(print)
    assert my_rdd.count() == 3