def test_range_type(self):
    """Round-trip a range object through repr() and safeeval()."""
    # ranges have different reprs on Python 2 vs. Python 3, and
    # can't be checked for equality until Python 3.3+
    range_type = xrange if PY2 else range

    self.assertEqual(repr(safeeval(repr(range_type(3)))),
                     repr(range_type(3)))

    if sys.version_info >= (3, 3):
        self.assertEqual(safeeval(repr(range_type(3))), range_type(3))
def test_end_to_end(self):
    """Run the Spark wordcount script through spark_submit_main()."""
    script_path = spark_wordcount_script.__file__
    # make sure we point at the .py source, not a compiled .pyc
    if script_path.endswith('.pyc'):
        script_path = script_path[:-1]

    input_path = self.makefile(
        'input', b'one fish\ntwo fish\nred fish\nblue fish\n')

    # don't create this path, let Spark do it
    output_path = join(self.tmp_dir, 'output')
    self.assertFalse(exists(output_path))

    spark_submit_main(
        ['-r', 'local', script_path, input_path, output_path])

    self.assertTrue(exists(output_path))

    # each line of each part file is the repr of a (word, count) pair
    word_counts = {}
    for part_path in glob(join(output_path, 'part-*')):
        with open(part_path) as part_file:
            word_counts.update(safeeval(line) for line in part_file)

    self.assertEqual(word_counts,
                     dict(blue=1, fish=4, one=1, red=1, two=1))
def test_archive_emulation(self):
    """--archives and --dirs should both expose directory contents."""
    src_dir = self.makedirs('f')
    self.makefile(join(src_dir, 'fish'), b'salmon')
    self.makefile(join(src_dir, 'fowl'), b'goose')

    tarball = make_archive(join(self.tmp_dir, 'f'), 'gztar', src_dir)

    job = MRSparkOSWalk([
        '-r', 'local',
        '--archives', '%s#f-unpacked' % tarball,
        '--dirs', src_dir
    ])
    job.sandbox()

    file_sizes = {}

    with job.make_runner() as runner:
        runner.run()

        for line in to_lines(runner.cat_output()):
            path, size = safeeval(line)
            file_sizes[path] = size

    # contents should be visible both under the dir's own name
    # and under the archive's #name
    for name, expected_size in [('f/fish', 6),
                                ('f/fowl', 5),
                                ('f-unpacked/fish', 6),
                                ('f-unpacked/fowl', 5)]:
        self.assertIn(name, file_sizes)
        self.assertEqual(file_sizes[name], expected_size)

    # archives should have been uploaded as files
    self.assertIn('f.tar.gz.file', file_sizes)
    self.assertIn('f-1.tar.gz.file', file_sizes)
def test_spark_mrjob(self):
    """MRSparkWordcount should produce correct counts on inline runner."""
    job = MRSparkWordcount(['-r', 'inline'])
    job.sandbox(stdin=BytesIO(
        b'one fish\ntwo fish\nred fish\nblue fish\n'))

    counts = {}
    with job.make_runner() as runner:
        runner.run()
        # each output line is the repr of a (word, count) pair
        for line in to_lines(runner.cat_output()):
            word, num = safeeval(line)
            counts[word] = num

    self.assertEqual(counts,
                     dict(blue=1, fish=4, one=1, red=1, two=1))
def test_spark_mrjob(self):
    """MRSparkWordcount should produce correct counts on inline runner."""
    text = b'one fish\ntwo fish\nred fish\nblue fish\n'
    job = MRSparkWordcount(['-r', 'inline'])
    job.sandbox(stdin=BytesIO(text))

    with job.make_runner() as runner:
        runner.run()
        # each output line is the repr of a (word, count) pair
        counts = dict(safeeval(line)
                      for line in to_lines(runner.cat_output()))

    self.assertEqual(
        counts, dict(blue=1, fish=4, one=1, red=1, two=1))
def test_count_words(self):
    """Words should be lowercased and counted across all input lines."""
    job = MRSparkWordcount([])
    job.sandbox(stdin=BytesIO(
        b'Mary had a little lamb\nlittle lamb\nlittle lamb'))

    with job.make_runner() as runner:
        runner.run()
        output = sorted(safeeval(line)
                        for line in to_lines(runner.cat_output()))

    self.assertEqual(output, [
        ('a', 1),
        ('had', 1),
        ('lamb', 3),
        ('little', 3),
        ('mary', 1),
    ])
def test_copy_files_with_rename_to_local_wd_mirror(self):
    # see test_upload_files_with_rename() in test_local for comparison
    salmon_file = self.makefile('fish', b'salmon')
    goose_file = self.makefile('fowl', b'goose')

    # use _LOCAL_CLUSTER_MASTER because the default master (local[*])
    # doesn't have a working directory
    job = MRSparkOSWalk(['-r', 'spark',
                         '--spark-master', _LOCAL_CLUSTER_MASTER,
                         '--file', salmon_file + '#ghoti',
                         '--file', goose_file])
    job.sandbox()

    sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # the working dir mirror should be a real local directory...
        mirror = runner._wd_mirror()
        self.assertIsNotNone(mirror)
        self.assertFalse(is_uri(mirror))
        self.assertTrue(exists(mirror))

        # ...holding only the files that needed to be renamed
        self.assertTrue(exists(join(mirror, 'ghoti')))
        self.assertFalse(exists(join(mirror, 'fish')))
        self.assertFalse(exists(join(mirror, 'fowl')))

        sizes.update(safeeval(line)
                     for line in to_lines(runner.cat_output()))

    # check that files were uploaded to working dir
    self.assertIn('fowl', sizes)
    self.assertEqual(sizes['fowl'], 5)
    self.assertIn('ghoti', sizes)
    self.assertEqual(sizes['ghoti'], 6)
    # fish was uploaded under the name "ghoti"
    self.assertNotIn('fish', sizes)
def test_copy_files_with_rename_to_local_wd_mirror(self):
    # see test_upload_files_with_rename() in test_local for comparison
    fish_path = self.makefile('fish', b'salmon')
    fowl_path = self.makefile('fowl', b'goose')

    # use _LOCAL_CLUSTER_MASTER because the default master (local[*])
    # doesn't have a working directory
    job = MRSparkOSWalk([
        '-r', 'spark',
        '--spark-master', _LOCAL_CLUSTER_MASTER,
        '--files', '%s#ghoti,%s' % (fish_path, fowl_path)
    ])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        # the working dir mirror should be a real local directory
        wd_mirror = runner._wd_mirror()
        self.assertIsNotNone(wd_mirror)
        self.assertFalse(is_uri(wd_mirror))
        self.assertTrue(exists(wd_mirror))

        # only files which needed to be renamed should be in wd_mirror
        self.assertTrue(exists(join(wd_mirror, 'ghoti')))
        self.assertFalse(exists(join(wd_mirror, 'fish')))
        self.assertFalse(exists(join(wd_mirror, 'fowl')))

        file_sizes = dict(safeeval(line)
                          for line in to_lines(runner.cat_output()))

    # check that files were uploaded to working dir
    self.assertIn('fowl', file_sizes)
    self.assertEqual(file_sizes['fowl'], 5)
    self.assertIn('ghoti', file_sizes)
    self.assertEqual(file_sizes['ghoti'], 6)
    # fish was uploaded as "ghoti"
    self.assertNotIn('fish', file_sizes)
def test_upload_files_with_rename(self):
    """--files with a #name suffix should upload under the new name."""
    fish_path = self.makefile('fish', b'salmon')
    fowl_path = self.makefile('fowl', b'goose')

    job = MRSparkOSWalk(
        ['-r', 'local',
         '--files', '%s#ghoti,%s' % (fish_path, fowl_path)])
    job.sandbox()

    sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # the working dir mirror should be a real local directory...
        mirror = runner._wd_mirror()
        self.assertIsNotNone(mirror)
        self.assertFalse(is_uri(mirror))
        self.assertTrue(os.path.exists(mirror))

        # ...holding only the files that needed to be renamed
        self.assertTrue(os.path.exists(os.path.join(mirror, 'ghoti')))
        self.assertFalse(os.path.exists(os.path.join(mirror, 'fish')))
        self.assertFalse(os.path.exists(os.path.join(mirror, 'fowl')))

        sizes.update(safeeval(line)
                     for line in to_lines(runner.cat_output()))

    # check that files were uploaded to working dir
    self.assertIn('fowl', sizes)
    self.assertEqual(sizes['fowl'], 5)
    self.assertIn('ghoti', sizes)
    self.assertEqual(sizes['ghoti'], 6)
    # fish was uploaded under the name "ghoti"
    self.assertNotIn('fish', sizes)
def test_upload_files_with_rename(self):
    """--file with a #name suffix should upload under the new name."""
    salmon_file = self.makefile('fish', b'salmon')
    goose_file = self.makefile('fowl', b'goose')

    job = MRSparkOSWalk(['-r', 'local',
                         '--file', salmon_file + '#ghoti',
                         '--file', goose_file])
    job.sandbox()

    file_sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # the working dir mirror should be a real local directory
        wd_mirror = runner._wd_mirror()
        self.assertIsNotNone(wd_mirror)
        self.assertFalse(is_uri(wd_mirror))
        self.assertTrue(os.path.exists(wd_mirror))

        # only files which needed to be renamed should be in wd_mirror
        self.assertTrue(os.path.exists(os.path.join(wd_mirror, 'ghoti')))
        self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fish')))
        self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fowl')))

        for line in to_lines(runner.cat_output()):
            path, size = safeeval(line)
            file_sizes[path] = size

    # check that files were uploaded to working dir
    self.assertIn('fowl', file_sizes)
    self.assertEqual(file_sizes['fowl'], 5)
    self.assertIn('ghoti', file_sizes)
    self.assertEqual(file_sizes['ghoti'], 6)
    # fish was uploaded as "ghoti"
    self.assertNotIn('fish', file_sizes)
def test_upload_files_with_rename(self):
    # see test_upload_files_with_rename() in test_local for comparison
    fish_path = self.makefile('fish', b'salmon')
    fowl_path = self.makefile('fowl', b'goose')

    # --use-driver-cwd gets around issues with the shared JVM not changing
    # executors' working directory to match the driver on local master
    job = MRSparkOSWalk(['-r', 'inline', '--use-driver-cwd',
                         '--file', fish_path + '#ghoti',
                         '--file', fowl_path])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        # inline mode simulates the working dir itself, so there
        # is no working dir mirror
        self.assertIsNone(runner._wd_mirror())

        file_sizes = dict(safeeval(line)
                          for line in to_lines(runner.cat_output()))

    # check that files were uploaded to working dir
    self.assertIn('fowl', file_sizes)
    self.assertEqual(file_sizes['fowl'], 5)
    self.assertIn('ghoti', file_sizes)
    self.assertEqual(file_sizes['ghoti'], 6)
    # fish was uploaded as "ghoti"
    self.assertNotIn('fish', file_sizes)
def test_upload_files_with_rename(self):
    # see test_upload_files_with_rename() in test_local for comparison
    salmon_file = self.makefile('fish', b'salmon')
    goose_file = self.makefile('fowl', b'goose')

    # --use-driver-cwd gets around issues with the shared JVM not changing
    # executors' working directory to match the driver on local master
    job = MRSparkOSWalk([
        '-r', 'inline', '--use-driver-cwd',
        '--file', salmon_file + '#ghoti',
        '--file', goose_file
    ])
    job.sandbox()

    sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # there is no working dir mirror in inline mode; inline
        # mode simulates the working dir itself
        mirror = runner._wd_mirror()
        self.assertIsNone(mirror)

        sizes.update(safeeval(line)
                     for line in to_lines(runner.cat_output()))

    # check that files were uploaded to working dir
    self.assertIn('fowl', sizes)
    self.assertEqual(sizes['fowl'], 5)
    self.assertIn('ghoti', sizes)
    self.assertEqual(sizes['ghoti'], 6)
    # fish was uploaded under the name "ghoti"
    self.assertNotIn('fish', sizes)
def read(cls, line):
    """Decode one tab-separated line of two repr()'d values.

    :param line: a ``key<TAB>value`` line, where key and value are
        each the repr of a simple data structure
    :return: ``(key, value)``, each decoded with ``safeeval()``
    :raises ValueError: if *line* contains no tab
    """
    # split on the first tab only; splitting on every tab would raise
    # ValueError if the value portion ever contained a literal tab
    key, value = line.split('\t', 1)
    return safeeval(key), safeeval(value)
def test_globals_and_locals(self):
    """safeeval() should honor explicit globals and locals dicts."""
    a = -0.2
    result = safeeval(
        'abs(a)', globals={'abs': abs}, locals={'a': a})
    self.assertEqual(abs(a), result)
def test_globals_and_locals(self):
    """safeeval() should honor explicit globals and locals dicts."""
    a = -0.2
    self.assertEqual(
        abs(a),
        safeeval("abs(a)", globals={"abs": abs}, locals={"a": a}))
def read(cls, line):
    """Decode a whole line as one repr()'d value; there is no key."""
    return None, safeeval(line)
def test_simple_data_structures(self):
    """repr() followed by safeeval() should round-trip basic values."""
    for value in (True, None, 1,
                  [0, 1, 2, 3, 4],
                  {'foo': False, 'bar': 2}):
        self.assertEqual(value, safeeval(repr(value)))
def load_from_string(cls, value):
    """Deserialize *value* (the repr of a simple data structure)."""
    result = safeeval(value)
    return result
def _loads(self, value):
    """Decode *value* (the repr of a simple data structure)."""
    decoded = safeeval(value)
    return decoded
def read(self, line):
    """Decode a whole line as one repr()'d value; there is no key."""
    return None, safeeval(line)
def test_simple_data_structure(self):
    """repr() followed by safeeval() should round-trip basic values."""
    for value in (True, None, 1,
                  range(5),
                  {"foo": False, "bar": 2}):
        self.assertEqual(value, safeeval(repr(value)))
def test_simple_data_structure(self):
    # try unrepr-ing a bunch of simple data structures
    cases = (True, None, 1, range(5), {'foo': False, 'bar': 2})
    for case in cases:
        assert_equal(case, safeeval(repr(case)))