def _assert_output_matches(
        self, job_class, input_bytes=b'', input_paths=(), job_args=[]):
    # run classes defined in this module in inline mode, classes
    # with their own script files in local mode. used by
    # test_skip_combiner_that_runs_cmd()
    if job_class.__module__ == __name__:
        runner_alias = 'inline'
    else:
        runner_alias = 'local'

    reference_job = self._reference_job(
        job_class, input_bytes=input_bytes,
        input_paths=input_paths,
        job_args=job_args,
        runner_alias=runner_alias)

    with reference_job.make_runner() as runner:
        runner.run()
        reference_output = sorted(to_lines(runner.cat_output()))

    harness_job = self._harness_job(
        job_class, input_bytes=input_bytes,
        input_paths=input_paths,
        job_args=job_args)

    with harness_job.make_runner() as runner:
        runner.run()
        harness_output = sorted(to_lines(runner.cat_output()))

    self.assertEqual(harness_output, reference_output)
def test_no_trailing_newline(self):
    self.assertEqual(
        list(to_lines(iter([
            b'Alouette,\ngentille',
            b' Alouette.',
        ]))),
        [b'Alouette,\n', b'gentille Alouette.'])
def test_python_dash_v_as_python_bin(self):
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(
        ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
    mr_job.sandbox(stdin=[b'bar\n'])

    with mr_job.make_runner() as runner:
        runner.run()

        # expect python -v crud in stderr
        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(any(
                'import mrjob' in line or  # Python 2
                "import 'mrjob'" in line
                for line in lines))

        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(any(
                '#' in line for line in lines))

        # should still get expected results
        self.assertEqual(
            sorted(to_lines(runner.cat_output())),
            sorted([b'1\tnull\n', b'1\t"bar"\n']))
def test_loading_bootstrapped_mrjob_library(self):
    # track the dir we're loading mrjob from rather than the full path
    # to deal with edge cases where we load from the .py file,
    # and the script loads from the .pyc compiled from that .py file.
    our_mrjob_dir = os.path.dirname(os.path.realpath(mrjob.__file__))

    with mrjob_conf_patcher():
        mr_job = MRJobWhereAreYou(['-r', 'local', '--bootstrap-mrjob'])
        mr_job.sandbox()

        with mr_job.make_runner() as runner:
            # sanity check
            self.assertEqual(runner._bootstrap_mrjob(), True)
            local_tmp_dir = os.path.realpath(runner._get_local_tmp_dir())

            runner.run()

            output = list(to_lines(runner.cat_output()))
            self.assertEqual(len(output), 1)

            # script should load mrjob from its working dir
            _, script_mrjob_dir = mr_job.parse_output_line(output[0])

            self.assertNotEqual(our_mrjob_dir, script_mrjob_dir)
            self.assertTrue(script_mrjob_dir.startswith(local_tmp_dir))
def test_mixed_job(self):
    # test a combination of streaming and spark steps
    job = MRStreamingAndSpark(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with job.make_runner() as runner:
        runner.run()

        # converts to 'null\t"foo"', 'null\t"bar"' and then counts chars
        self.assertEqual(
            sorted(to_lines(runner.cat_output())),
            [
                b'\t 2\n',
                b'" 4\n',
                b'a 1\n',
                b'b 1\n',
                b'f 1\n',
                b'l 4\n',
                b'n 2\n',
                b'o 2\n',
                b'r 1\n',
                b'u 2\n',
            ]
        )
def test_cat_output(self):
    a_dir_path = os.path.join(self.tmp_dir, 'a')
    b_dir_path = os.path.join(self.tmp_dir, 'b')
    l_dir_path = os.path.join(self.tmp_dir, '_logs')
    os.mkdir(a_dir_path)
    os.mkdir(b_dir_path)
    os.mkdir(l_dir_path)

    a_file_path = os.path.join(a_dir_path, 'part-00000')
    b_file_path = os.path.join(b_dir_path, 'part-00001')
    c_file_path = os.path.join(self.tmp_dir, 'part-00002')
    x_file_path = os.path.join(l_dir_path, 'log.xml')
    y_file_path = os.path.join(self.tmp_dir, '_SUCCESS')

    with open(a_file_path, 'w') as f:
        f.write('A')

    with open(b_file_path, 'w') as f:
        f.write('B')

    with open(c_file_path, 'w') as f:
        f.write('C')

    with open(x_file_path, 'w') as f:
        f.write('<XML XML XML/>')

    with open(y_file_path, 'w') as f:
        f.write('I win')

    runner = InlineMRJobRunner(conf_paths=[], output_dir=self.tmp_dir)

    self.assertEqual(sorted(to_lines(runner.cat_output())),
                     [b'A', b'B', b'C'])
def test_multiple_2(self):
    data = b'x\ny\nz\n'
    job = MRCmdJob(['--mapper-cmd=cat', '--reducer-cmd-2', 'wc -l',
                    '--runner=local', '--no-conf'])
    job.sandbox(stdin=BytesIO(data))

    with job.make_runner() as r:
        r.run()
        self.assertEqual(sum(int(l) for l in to_lines(r.cat_output())), 3)
def parse_output(self, chunks):
    """Parse the final output of this MRJob (as a stream of byte chunks)
    into a stream of ``(key, value)``.
    """
    read = self.output_protocol().read

    for line in to_lines(chunks):
        yield read(line)
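# A hedged usage sketch (not part of the original module): parse_output() is
# meant to be paired with runner.cat_output(), the same way to_lines() is
# paired with raw byte chunks elsewhere in this file. MRWordFreqCount is a
# placeholder for any MRJob subclass.
def _example_parse_output_usage():
    from io import BytesIO

    job = MRWordFreqCount([])  # hypothetical job class, assumed importable
    job.sandbox(stdin=BytesIO(b'one fish two fish\n'))

    with job.make_runner() as runner:
        runner.run()
        # yields decoded (key, value) pairs, e.g. ('fish', 2)
        for key, value in job.parse_output(runner.cat_output()):
            print(key, value)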
def test_buffered_lines(self):
    self.assertEqual(
        list(to_lines(chunk for chunk in
                      [b'The quick\nbrown fox\nju',
                       b'mped over\nthe lazy\ndog',
                       b's.\n'])),
        [b'The quick\n', b'brown fox\n', b'jumped over\n', b'the lazy\n',
         b'dogs.\n'])
def test_long_lines(self):
    super_long_line = b'a' * 10000 + b'\n' + b'b' * 1000 + b'\nlast\n'

    self.assertEqual(
        list(to_lines(
            chunk for chunk in
            (super_long_line[0 + i:1024 + i]
             for i in range(0, len(super_long_line), 1024)))),
        [b'a' * 10000 + b'\n', b'b' * 1000 + b'\n', b'last\n'])
def test_read_all_non_hidden_files(self):
    self.makefile(os.path.join(self.output_dir, 'baz'), b'qux\n')
    self.makefile(os.path.join(self.output_dir, 'foo', 'bar'), b'baz\n')

    self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                     [b'baz\n', b'qux\n'])
def test_output_dir_not_considered_hidden(self):
    output_dir = os.path.join(self.tmp_dir, '_hidden', '_output_dir')
    self.makefile(os.path.join(output_dir, 'part-00000'), b'cats\n')

    runner = InlineMRJobRunner(conf_paths=[], output_dir=output_dir)

    self.assertEqual(sorted(to_lines(runner.stream_output())),
                     [b'cats\n'])
def stream_output(self):
    """Like :py:meth:`cat_output` except that it groups bytes into lines.

    Equivalent to ``mrjob.util.to_lines(runner.cat_output())``.

    .. deprecated:: 0.6.0
    """
    log.warning('stream_output() is deprecated and will be removed in'
                ' v0.7.0. use mrjob.util.to_lines(runner.cat_output())'
                ' instead.')

    return to_lines(self.cat_output())
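# A minimal sketch of the replacement that the deprecation warning above
# recommends: to_lines() regroups the raw byte chunks from cat_output() into
# newline-terminated lines, which is all stream_output() ever did.
def _example_stream_output_replacement(runner):
    from mrjob.util import to_lines

    for line in to_lines(runner.cat_output()):
        yield line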
def test_eof_without_trailing_newline(self):
    self.assertEqual(
        list(to_lines(iter([
            b'Alouette,\ngentille',
            b' Alouette.',
            b'',  # treated as EOF
            b'Allouette,\nje te p',
            b'lumerais.',
        ]))),
        [b'Alouette,\n', b'gentille Alouette.',
         b'Allouette,\n', b'je te plumerais.'])
def _cat_log_lines(fs, path):
    """Yield lines from the given log.

    Log errors rather than raising them.
    """
    try:
        if not fs.exists(path):
            return
        for line in to_lines(fs.cat(path)):
            yield to_unicode(line)
    except (IOError, OSError) as e:
        log.warning("couldn't cat() %s: %r" % (path, e))
def test_no_file_args_required(self):
    words1 = self.makefile('words1', b'kit and caboodle\n')
    words2 = self.makefile('words2', b'baubles\nbangles and beads\n')

    job = MRJobLauncher(
        args=['-r', 'local', tests.sr_wc.__file__, words1, words2])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        lines = list(to_lines(runner.cat_output()))
        self.assertEqual(len(lines), 1)
        self.assertEqual(int(lines[0]), 7)
def test_mixed_job(self):
    # can we run just the streaming part of a job?
    input_bytes = b'foo\nbar\n'

    job = self._harness_job(
        MRStreamingAndSpark, input_bytes=input_bytes,
        first_step_num=0, last_step_num=0)

    with job.make_runner() as runner:
        runner.run()

        # the streaming part is just an identity mapper, but it converts
        # lines to pairs of JSON
        self.assertEqual(set(to_lines(runner.cat_output())),
                         {b'null\t"foo"\n', b'null\t"bar"\n'})
def test_output_in_subdirs(self):
    # test for output being placed in subdirs, for example with nicknack
    self.makefile(os.path.join(self.output_dir, 'a', 'part-00000'),
                  b'line-a0\n')
    self.makefile(os.path.join(self.output_dir, 'a', 'part-00001'),
                  b'line-a1\n')
    self.makefile(os.path.join(self.output_dir, 'b', 'part-00000'),
                  b'line-b0\n')
    self.makefile(os.path.join(self.output_dir, 'b', '.crc.part-00000'),
                  b'42\n')

    self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                     [b'line-a0\n', b'line-a1\n', b'line-b0\n'])
def test_mixed_job(self):
    # can we run just the streaming part of a job?
    input_bytes = b'foo\nbar\n'

    job = self._harness_job(MRStreamingAndSpark, input_bytes=input_bytes,
                            start_step=0, end_step=1)

    with job.make_runner() as runner:
        runner.run()

        # the streaming part is just an identity mapper, but it converts
        # lines to pairs of JSON
        self.assertEqual(set(to_lines(runner.cat_output())),
                         {b'null\t"foo"\n', b'null\t"bar"\n'})
def test_cat_mapper(self):
    data = b'x\ny\nz\n'
    job = MRCmdJob(['--mapper-cmd=cat', '--runner=local'])
    job.sandbox(stdin=BytesIO(data))

    with job.make_runner() as r:
        self.assertEqual(r._get_steps(), [{
            'type': 'streaming',
            'mapper': {
                'type': 'command',
                'command': 'cat'
            }
        }])

        r.run()

        lines = [line.strip() for line in to_lines(r.cat_output())]
        self.assertEqual(sorted(lines), sorted(data.split()))
def test_spark_script_mrjob(self):
    text = b'one fish\ntwo fish\nred fish\nblue fish\n'

    job = MRSparkScriptWordcount(['-r', 'local'])
    job.sandbox(stdin=BytesIO(text))

    counts = {}

    with job.make_runner() as runner:
        runner.run()

        for line in to_lines(runner.cat_output()):
            k, v = safeeval(line)
            counts[k] = v

    self.assertEqual(counts, dict(
        blue=1, fish=4, one=1, red=1, two=1))
def test_cat_mapper(self):
    data = b'x\ny\nz\n'
    job = MRCmdJob(['--mapper-cmd=cat', '--runner=local'])
    job.sandbox(stdin=BytesIO(data))

    with job.make_runner() as r:
        self.assertEqual(
            r._get_steps(),
            [{
                'type': 'streaming',
                'mapper': {
                    'type': 'command',
                    'command': 'cat'}}])

        r.run()

        lines = [line.strip() for line in to_lines(r.cat_output())]
        self.assertEqual(sorted(lines), sorted(data.split()))
def test_mapper_pre_filter(self):
    data = b'x\ny\nz\n'
    job = MRFilterJob(['--mapper-filter', 'cat -e', '--runner=local'])
    job.sandbox(stdin=BytesIO(data))

    with job.make_runner() as r:
        self.assertEqual(
            r._get_steps(),
            [{
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'cat -e'}}])

        r.run()

        lines = [line.strip() for line in to_lines(r.cat_output())]
        self.assertEqual(sorted(lines), [b'x$', b'y$', b'z$'])
def test_spark_mrjob(self):
    text = b'one fish\ntwo fish\nred fish\nblue fish\n'

    job = MRSparkWordcount(['-r', 'inline'])
    job.sandbox(stdin=BytesIO(text))

    counts = {}

    with job.make_runner() as runner:
        runner.run()

        for line in to_lines(runner.cat_output()):
            k, v = safeeval(line)
            counts[k] = v

    self.assertEqual(counts, dict(
        blue=1, fish=4, one=1, red=1, two=1))
def test_count_words(self):
    job = MRSparkWordcount([])
    job.sandbox(
        stdin=BytesIO(b'Mary had a little lamb\nlittle lamb\nlittle lamb'))

    with job.make_runner() as runner:
        runner.run()

        output = sorted(
            safeeval(line) for line in to_lines(runner.cat_output()))

        self.assertEqual(output, [
            ('a', 1), ('had', 1), ('lamb', 3), ('little', 3), ('mary', 1),
        ])
def test_does_not_override_hadoop_input_format(self):
    input1_path = self.makefile('input1', b'potato')
    input2_path = self.makefile('input2', b'potato tomato')
    manifest_path = self.makefile('manifest')

    with open(manifest_path, 'w') as manifest:
        manifest.write('%s\n%s\n' % (input1_path, input2_path))

    job = MRNickNackWithHadoopInputFormat(['-r', 'spark', manifest_path])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        output_counts = dict(
            line.strip().split(b'\t')
            for line in to_lines(runner.cat_output()))

    self.assertEqual(output_counts, {b'"tomato"': b'1', b'"potato"': b'2'})
def gz_test(self, dir_path_name):
    contents_gz = [b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n',
                   b'foo\n']
    contents_normal = [b'foo\n', b'bar\n', b'bar\n']
    all_contents_sorted = sorted(contents_gz + contents_normal)

    input_gz_path = join(dir_path_name, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b''.join(contents_gz))
    input_gz.close()

    input_path2 = join(dir_path_name, 'input2')
    with open(input_path2, 'wb') as input_file:
        input_file.write(b''.join(contents_normal))

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

    # Make sure that input.gz occurs in a single split that starts at
    # its beginning and ends at its end
    for split_info in file_splits.values():
        if split_info['orig_name'] == input_gz_path:
            self.assertEqual(split_info['start'], 0)
            self.assertEqual(split_info['length'],
                             os.stat(input_gz_path)[stat.ST_SIZE])

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        with open(file_name, 'rb') as f:
            lines = list(to_lines(decompress(f, file_name)))
            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

        content.extend(lines)

    self.assertEqual(sorted(content), all_contents_sorted)
def test_copy_files_with_rename_to_local_wd_mirror(self):
    # see test_upload_files_with_rename() in test_local for comparison
    fish_path = self.makefile('fish', b'salmon')
    fowl_path = self.makefile('fowl', b'goose')

    # use _LOCAL_CLUSTER_MASTER because the default master (local[*])
    # doesn't have a working directory
    job = MRSparkOSWalk(['-r', 'spark',
                         '--spark-master', _LOCAL_CLUSTER_MASTER,
                         '--file', fish_path + '#ghoti',
                         '--file', fowl_path])
    job.sandbox()

    file_sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # check working dir mirror
        wd_mirror = runner._wd_mirror()
        self.assertIsNotNone(wd_mirror)
        self.assertFalse(is_uri(wd_mirror))

        self.assertTrue(exists(wd_mirror))
        # only files which needed to be renamed should be in wd_mirror
        self.assertTrue(exists(join(wd_mirror, 'ghoti')))
        self.assertFalse(exists(join(wd_mirror, 'fish')))
        self.assertFalse(exists(join(wd_mirror, 'fowl')))

        for line in to_lines(runner.cat_output()):
            path, size = safeeval(line)
            file_sizes[path] = size

    # check that files were uploaded to working dir
    self.assertIn('fowl', file_sizes)
    self.assertEqual(file_sizes['fowl'], 5)

    self.assertIn('ghoti', file_sizes)
    self.assertEqual(file_sizes['ghoti'], 6)

    # fish was uploaded as "ghoti"
    self.assertNotIn('fish', file_sizes)
def test_copy_files_with_rename_to_local_wd_mirror(self):
    # see test_upload_files_with_rename() in test_local for comparison
    fish_path = self.makefile('fish', b'salmon')
    fowl_path = self.makefile('fowl', b'goose')

    # use _LOCAL_CLUSTER_MASTER because the default master (local[*])
    # doesn't have a working directory
    job = MRSparkOSWalk([
        '-r', 'spark',
        '--spark-master', _LOCAL_CLUSTER_MASTER,
        '--files', '%s#ghoti,%s' % (fish_path, fowl_path)
    ])
    job.sandbox()

    file_sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # check working dir mirror
        wd_mirror = runner._wd_mirror()
        self.assertIsNotNone(wd_mirror)
        self.assertFalse(is_uri(wd_mirror))

        self.assertTrue(exists(wd_mirror))
        # only files which needed to be renamed should be in wd_mirror
        self.assertTrue(exists(join(wd_mirror, 'ghoti')))
        self.assertFalse(exists(join(wd_mirror, 'fish')))
        self.assertFalse(exists(join(wd_mirror, 'fowl')))

        for line in to_lines(runner.cat_output()):
            path, size = safeeval(line)
            file_sizes[path] = size

    # check that files were uploaded to working dir
    self.assertIn('fowl', file_sizes)
    self.assertEqual(file_sizes['fowl'], 5)

    self.assertIn('ghoti', file_sizes)
    self.assertEqual(file_sizes['ghoti'], 6)

    # fish was uploaded as "ghoti"
    self.assertNotIn('fish', file_sizes)
def test_hadoop_output_format(self):
    input_bytes = b'ee eye ee eye oh'

    job = MRNickNack(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(input_bytes))

    with job.make_runner() as runner:
        runner.run()

        # nicknack.MultipleValueOutputFormat should put output in subdirs
        self.assertTrue(
            runner.fs.exists(runner.fs.join(runner.get_output_dir(), 'e')))
        self.assertTrue(
            runner.fs.exists(runner.fs.join(runner.get_output_dir(), 'o')))

        # check for expected output
        self.assertEqual(sorted(to_lines(runner.cat_output())),
                         [b'"ee"\t2\n', b'"eye"\t2\n', b'"oh"\t1\n'])
def _read_input(self):
    """Read from stdin, or from one or more files or directories.
    Yield one line at a time.

    - Resolve globs (``foo_*.gz``).
    - Decompress ``.gz`` and ``.bz2`` files.
    - If path is ``-``, read from STDIN.
    - Recursively read all files in a directory
    """
    paths = self.options.args or ['-']

    for path in paths:
        if path == '-':
            for line in self.stdin:
                yield line
        else:
            with open(path, 'rb') as f:
                for line in to_lines(decompress(f, path)):
                    yield line
def test_typical_output(self):
    # actual output
    self.makefile(os.path.join(self.output_dir, 'part-00000'),
                  b'line0\n')
    self.makefile(os.path.join(self.output_dir, 'part-00001'),
                  b'line1\n')

    # hidden .crc file
    self.makefile(os.path.join(self.output_dir, '.crc.part-00000'),
                  b'42\n')

    # hidden _SUCCESS file (ignore)
    self.makefile(os.path.join(self.output_dir, '_SUCCESS'),
                  b'such a relief!\n')

    # hidden _logs dir
    self.makefile(os.path.join(self.output_dir, '_logs', 'log.xml'),
                  b'pretty much the usual\n')

    self.assertEqual(sorted(to_lines(self.runner.cat_output())),
                     [b'line0\n', b'line1\n'])
def test_hadoop_output_format(self):
    input_bytes = b'ee eye ee eye oh'

    job = MRNickNack(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(input_bytes))

    with job.make_runner() as runner:
        runner.run()

        # nicknack.MultipleValueOutputFormat should put output in subdirs
        self.assertTrue(runner.fs.exists(
            runner.fs.join(runner.get_output_dir(), 'e')))
        self.assertTrue(runner.fs.exists(
            runner.fs.join(runner.get_output_dir(), 'o')))

        # check for expected output
        self.assertEqual(
            sorted(to_lines(runner.cat_output())),
            [b'"ee"\t2\n', b'"eye"\t2\n', b'"oh"\t1\n'])
def test_cat_reducer(self):
    data = b'x\ny\nz\n'
    job = MRCmdJob(['--reducer-cmd', 'cat -e', '--runner=local'])
    job.sandbox(stdin=BytesIO(data))

    with job.make_runner() as r:
        self.assertEqual(
            r._get_steps(),
            [{
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                },
                'reducer': {
                    'type': 'command',
                    'command': 'cat -e'}}])

        r.run()

        lines = list(to_lines(r.cat_output()))
        self.assertEqual(sorted(lines), [b'x$\n', b'y$\n', b'z$\n'])
def test_pre_filter_failure(self):
    # regression test for #1524
    data = b'x\ny\nz\n'

    # grep will return exit code 1 because there are no matches
    job = MRFilterJob(['--mapper-filter', 'grep w', '--runner=local'])
    job.sandbox(stdin=BytesIO(data))

    with job.make_runner() as r:
        self.assertEqual(r._get_steps(), [{
            'type': 'streaming',
            'mapper': {
                'type': 'script',
                'pre_filter': 'grep w'
            }
        }])

        r.run()

        lines = [line.strip() for line in to_lines(r.cat_output())]
        self.assertEqual(sorted(lines), [])
def test_cat_reducer(self):
    data = b'x\ny\nz\n'
    job = MRCmdJob(['--reducer-cmd', 'cat -e', '--runner=local'])
    job.sandbox(stdin=BytesIO(data))

    with job.make_runner() as r:
        self.assertEqual(r._get_steps(), [{
            'type': 'streaming',
            'mapper': {
                'type': 'script',
            },
            'reducer': {
                'type': 'command',
                'command': 'cat -e'
            }
        }])

        r.run()

        lines = list(to_lines(r.cat_output()))
        self.assertEqual(sorted(lines), [b'x$\n', b'y$\n', b'z$\n'])
def test_mixed_job(self):
    # test a combination of streaming and spark steps
    job = MRStreamingAndSpark(['-r', 'spark'])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with job.make_runner() as runner:
        runner.run()

        # converts to 'null\t"foo"', 'null\t"bar"' and then counts chars
        self.assertEqual(sorted(to_lines(runner.cat_output())), [
            b'\t 2\n',
            b'" 4\n',
            b'a 1\n',
            b'b 1\n',
            b'f 1\n',
            b'l 4\n',
            b'n 2\n',
            b'o 2\n',
            b'r 1\n',
            b'u 2\n',
        ])
def test_python_dash_v_as_python_bin(self):
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    mr_job = MRTwoStepJob(
        ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
    mr_job.sandbox(stdin=[b'bar\n'])

    with mr_job.make_runner() as runner:
        runner.run()

        # expect python -v crud in stderr
        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(
                any('import mrjob' in line or  # Python 2
                    "import 'mrjob'" in line
                    for line in lines))

        with open(runner._task_stderr_path('mapper', 0, 0)) as lines:
            self.assertTrue(any('#' in line for line in lines))

        # should still get expected results
        self.assertEqual(sorted(to_lines(runner.cat_output())),
                         sorted([b'1\tnull\n', b'1\t"bar"\n']))
def test_pre_filter_on_compressed_data(self):
    # regression test for #1061
    input_gz_path = self.makefile('data.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'x\ny\nz\n')
    input_gz.close()

    job = MRFilterJob(
        ['--mapper-filter', 'cat -e', '--runner=local', input_gz_path])

    with job.make_runner() as r:
        self.assertEqual(r._get_steps(), [{
            'type': 'streaming',
            'mapper': {
                'type': 'script',
                'pre_filter': 'cat -e'
            }
        }])

        r.run()

        lines = [line.strip() for line in to_lines(r.cat_output())]
        self.assertEqual(sorted(lines), [b'x$', b'y$', b'z$'])
def test_upload_files_with_rename(self):
    fish_path = self.makefile('fish', b'salmon')
    fowl_path = self.makefile('fowl', b'goose')

    job = MRSparkOSWalk(['-r', 'local',
                         '--file', fish_path + '#ghoti',
                         '--file', fowl_path])
    job.sandbox()

    file_sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # check working dir mirror
        wd_mirror = runner._wd_mirror()
        self.assertIsNotNone(wd_mirror)
        self.assertFalse(is_uri(wd_mirror))

        self.assertTrue(os.path.exists(wd_mirror))
        # only files which needed to be renamed should be in wd_mirror
        self.assertTrue(os.path.exists(os.path.join(wd_mirror, 'ghoti')))
        self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fish')))
        self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fowl')))

        for line in to_lines(runner.cat_output()):
            path, size = safeeval(line)
            file_sizes[path] = size

    # check that files were uploaded to working dir
    self.assertIn('fowl', file_sizes)
    self.assertEqual(file_sizes['fowl'], 5)

    self.assertIn('ghoti', file_sizes)
    self.assertEqual(file_sizes['ghoti'], 6)

    # fish was uploaded as "ghoti"
    self.assertNotIn('fish', file_sizes)
def test_upload_files_with_rename(self):
    fish_path = self.makefile('fish', b'salmon')
    fowl_path = self.makefile('fowl', b'goose')

    job = MRSparkOSWalk(
        ['-r', 'local',
         '--files', '%s#ghoti,%s' % (fish_path, fowl_path)])
    job.sandbox()

    file_sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # check working dir mirror
        wd_mirror = runner._wd_mirror()
        self.assertIsNotNone(wd_mirror)
        self.assertFalse(is_uri(wd_mirror))

        self.assertTrue(os.path.exists(wd_mirror))
        # only files which needed to be renamed should be in wd_mirror
        self.assertTrue(os.path.exists(os.path.join(wd_mirror, 'ghoti')))
        self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fish')))
        self.assertFalse(os.path.exists(os.path.join(wd_mirror, 'fowl')))

        for line in to_lines(runner.cat_output()):
            path, size = safeeval(line)
            file_sizes[path] = size

    # check that files were uploaded to working dir
    self.assertIn('fowl', file_sizes)
    self.assertEqual(file_sizes['fowl'], 5)

    self.assertIn('ghoti', file_sizes)
    self.assertEqual(file_sizes['ghoti'], 6)

    # fish was uploaded as "ghoti"
    self.assertNotIn('fish', file_sizes)
def run_runner(mr_job, i, folder):
    with mr_job.make_runner() as runner:
        runner.run()

        new_assignation = {}
        new_prototype = {}
        for line in to_lines(runner.cat_output()):
            key, value = mr_job.parse_output_line(line)
            new_assignation[key] = value[0]
            new_prototype[key] = value[1]

        cwd = os.getcwd()
        with open(f'{cwd}/{folder}/assignments{i + 1}.txt',
                  mode='w') as new_assignation_file:
            for key, values in new_assignation.items():
                aux = reduce(lambda acc, x: f'{acc} {x}', values, f'{key}:')
                new_assignation_file.write(f'{aux}\n')

        global assignation
        if assignation == new_assignation:
            return i, new_prototype
        else:
            assignation = new_assignation
            write_prototype(new_prototype,
                            f'{cwd}/{folder}/prototypes{i + 1}.txt')
def test_hadoop_output_format(self):
    input_bytes = b'ee eye ee eye oh'

    job = self._harness_job(MRNickNack, input_bytes=input_bytes,
                            runner_alias='local',
                            spark_conf=self.SPARK_CONF)

    with job.make_runner() as runner:
        runner.run()

        self.assertTrue(
            runner.fs.exists(join(runner.get_output_dir(), 'e')))
        self.assertTrue(
            runner.fs.exists(join(runner.get_output_dir(), 'o')))

        output_counts = dict(
            line.strip().split(b'\t')
            for line in to_lines(runner.cat_output()))

    expected_output_counts = {
        b'"ee"': b'2', b'"eye"': b'2', b'"oh"': b'1'}

    self.assertEqual(expected_output_counts, output_counts)
def test_upload_files_with_rename(self):
    # see test_upload_files_with_rename() in test_local for comparison
    fish_path = self.makefile('fish', b'salmon')
    fowl_path = self.makefile('fowl', b'goose')

    # --use-driver-cwd gets around issues with the shared JVM not changing
    # executors' working directory to match the driver on local master
    job = MRSparkOSWalk(['-r', 'inline', '--use-driver-cwd',
                         '--file', fish_path + '#ghoti',
                         '--file', fowl_path])
    job.sandbox()

    file_sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # there is no working dir mirror in inline mode; inline
        # mode simulates the working dir itself
        wd_mirror = runner._wd_mirror()
        self.assertIsNone(wd_mirror)

        for line in to_lines(runner.cat_output()):
            path, size = safeeval(line)
            file_sizes[path] = size

    # check that files were uploaded to working dir
    self.assertIn('fowl', file_sizes)
    self.assertEqual(file_sizes['fowl'], 5)

    self.assertIn('ghoti', file_sizes)
    self.assertEqual(file_sizes['ghoti'], 6)

    # fish was uploaded as "ghoti"
    self.assertNotIn('fish', file_sizes)
def test_hadoop_output_format_with_compression(self):
    input_bytes = b'one two one two'
    compression_codec = 'org.apache.hadoop.io.compress.GzipCodec'

    job = self._harness_job(
        MRNickNack, input_bytes=input_bytes, runner_alias='local',
        spark_conf=self.SPARK_CONF,
        compression_codec=compression_codec)

    with job.make_runner() as runner:
        runner.run()

        self.assertTrue(runner.fs.exists(
            join(runner.get_output_dir(), 'o', 'part*.gz')))
        self.assertTrue(runner.fs.exists(
            join(runner.get_output_dir(), 't', 'part*.gz')))

        output_counts = dict(
            line.strip().split(b'\t')
            for line in to_lines(runner.cat_output()))

    expected_output_counts = {b'"one"': b'2', b'"two"': b'2'}
    self.assertEqual(expected_output_counts, output_counts)
def test_empty(self):
    self.assertEqual(list(to_lines(_ for _ in ())), [])
def test_no_trailing_newline(self):
    self.assertEqual(
        list(to_lines(chunk for chunk in
                      [b'Alouette,\ngentille', b' Alouette.'])),
        [b'Alouette,\n', b'gentille Alouette.'])
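# A simplified re-implementation sketch of the behavior the tests above
# assert for to_lines() -- an illustration, NOT mrjob's actual code. It
# buffers arbitrary byte chunks and yields complete b'\n'-terminated lines;
# an empty chunk is treated as EOF and flushes the buffer, and any trailing
# data without a newline is yielded as-is.
def _to_lines_sketch(chunks):
    buf = b''
    for chunk in chunks:
        if chunk == b'':
            # empty chunk acts as EOF: flush whatever is buffered
            if buf:
                yield buf
                buf = b''
            continue
        buf += chunk
        while b'\n' in buf:
            line, buf = buf.split(b'\n', 1)
            yield line + b'\n'
    if buf:
        yield buf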