def test_hadoop_output_format(self):
    """-outputformat should be passed only on a job's final step."""
    fmt = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'

    # single-step job: its only step is also its last step.
    # no command-line switch for this because it's part of job semantics
    single_step_job = MRWordCount()
    single_step_job.HADOOP_OUTPUT_FORMAT = fmt

    with single_step_job.make_runner() as runner:
        self.assertEqual(runner._hadoop_args_for_step(0),
                         ['-outputformat', fmt])

    # multi-step job: only use -outputformat on the last step
    multi_step_job = MRTwoStepJob()
    multi_step_job.HADOOP_OUTPUT_FORMAT = fmt

    with multi_step_job.make_runner() as runner:
        self.assertEqual(runner._hadoop_args_for_step(0), [])
        self.assertEqual(runner._hadoop_args_for_step(1),
                         ['-outputformat', fmt])
def test_unexpected_opt_from_mrjob_conf(self):
    """An unknown option in mrjob.conf should trigger a warning that
    names the option and the config file it came from."""
    conf_path = self.makefile('mrjob.custom.conf')
    with open(conf_path, 'w') as f:
        dump_mrjob_conf(
            {'runners': {'local': {'land': 'useless_swamp'}}}, f)

    job = MRTwoStepJob(['-r', 'local', '-c', conf_path])
    job.sandbox()

    with job.make_runner():
        self.assertTrue(self.log.warning.called)

        # collect the first positional arg of every warning call
        warnings = '\n'.join(
            call[0][0] for call in self.log.warning.call_args_list)

        self.assertIn('Unexpected option', warnings)
        self.assertIn('land', warnings)
        self.assertIn(conf_path, warnings)
def test_echo_as_python_bin(self):
    """Running tasks with echo should just print the task command line."""
    # "echo" is a pretty poor substitute for Python, but it
    # should be available on most systems
    job = MRTwoStepJob(
        ['--python-bin', 'echo', '--steps-python-bin', sys.executable,
         '--no-conf', '-r', 'local'])
    job.sandbox()

    with job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()
        output = ''.join(runner.stream_output())

        # the output should basically be the command we used to
        # run the last step, which in this case is a mapper
        self.assertIn('mr_two_step_job.py', output)
        self.assertIn('--step-num=1', output)
        self.assertIn('--mapper', output)
def _test_cloud_tmp_cleanup(self, mode, tmp_len):
    """Run a job with the given --cleanup *mode* and check that
    *tmp_len* blobs remain in the cloud tmp bucket afterward."""
    job = MRTwoStepJob(['-r', 'dataproc', '-v', '-', '--cleanup', mode])
    job.sandbox(stdin=BytesIO(b'foo\nbar\n'))

    with job.make_runner() as runner:
        tmp_bucket, _ = parse_gcs_uri(runner._cloud_tmp_dir)

        runner.run()

        # this is set and unset before we can get at it unless we do this
        list(runner.cat_output())

        fs = runner.fs

    # leaving the with block above ran cleanup
    remaining = list(fs.client.bucket(tmp_bucket).list_blobs())
    self.assertEqual(len(remaining), tmp_len)
def test_python_dash_v_as_python_bin(self):
    """'python -v' should spew import chatter to stderr but still
    produce correct job output."""
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    job = MRTwoStepJob(
        ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
    job.sandbox(stdin=[b'bar\n'])

    with no_handlers_for_logger():
        job.run_job()

    # expect debugging messages in stderr.
    stderr = job.stderr.getvalue()

    # stderr is huge, so don't use assertIn()
    self.assertTrue(b'import mrjob' in stderr or      # Python 2
                    b"import 'mrjob'" in stderr)      # Python 3
    self.assertTrue(b'#' in stderr)

    # should still get expected results
    self.assertEqual(sorted(job.stdout.getvalue().splitlines()),
                     sorted([b'1\tnull', b'1\t"bar"']))
def test_show_steps(self):
    """--steps output should summarize each job's step types."""
    def steps_desc(job_class):
        # run --steps in a sandbox and capture what it prints
        job = job_class(['--steps'])
        job.sandbox()
        job.show_steps()
        return job.stdout.getvalue()

    self.assertEqual(steps_desc(MRBoringJob), 'MR\n')

    # final mappers don't show up in the step description
    self.assertEqual(steps_desc(MRFinalBoringJob), 'MR\n')

    self.assertEqual(steps_desc(MRTwoStepJob), 'MCR M\n')

    self.assertEqual(steps_desc(MRNoMapper), 'MR R\n')
def test_echo_as_steps_python_bin(self):
    """Fetching steps via a bogus --steps-python-bin ('echo') should
    raise a ValueError whose message shows the command that was run.

    Fixes: the original used Python-2-only ``except ValueError, ex:``
    syntax (a SyntaxError on Python 3) and a bare ``assert False``
    (stripped under ``-O``); ``assertRaises`` handles both concerns.
    """
    job = MRTwoStepJob([
        '--steps', '--steps-python-bin', 'echo', '--no-conf',
        '-r', 'local'])
    job.sandbox()

    with job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)

        # make_runner() populates _steps in the runner, so un-populate
        # it here so that the runner actually tries to get the steps
        # via subprocess
        runner._steps = None

        with self.assertRaises(ValueError) as cm:
            runner._get_steps()

        output = str(cm.exception)

        # the output should basically be the command used to
        # run the steps command
        self.assertIn('mr_two_step_job.py', output)
        self.assertIn('--steps', output)
def test_end_to_end(self):
    """End-to-end local run reading from stdin, a flat file, and a .gz.

    Fixes: the original wrote ``str`` to a gzip stream opened in text-less
    binary mode and fed a ``StringIO`` as stdin — both raise TypeError on
    Python 3. Now uses bytes/BytesIO throughout, consistent with the other
    end-to-end tests in this file.
    """
    # read from STDIN, a regular file, and a .gz
    stdin = BytesIO(b'foo\nbar\n')

    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nqux\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz_glob = os.path.join(self.tmp_dir, '*.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\n')
    input_gz.close()

    mr_job = MRTwoStepJob(['-c', self.mrjob_conf_path,
                           '-r', 'local',
                           '-', input_path, input_gz_glob])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

        self.assertEqual(runner.counters()[0]['count']['combiners'], 8)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])
def test_python_dash_v_as_python_bin(self):
    """'python -v' should spew import chatter into the mapper's stderr
    log but still produce correct job output."""
    python_cmd = cmd_line([sys.executable or 'python', '-v'])
    job = MRTwoStepJob(
        ['--python-bin', python_cmd, '--no-conf', '-r', 'local'])
    job.sandbox(stdin=[b'bar\n'])

    with job.make_runner() as runner:
        runner.run()

        # expect python -v crud in stderr
        stderr_path = runner._task_stderr_path('mapper', 0, 0)

        with open(stderr_path) as lines:
            self.assertTrue(any(
                'import mrjob' in line or      # Python 2
                "import 'mrjob'" in line       # Python 3
                for line in lines))

        with open(stderr_path) as lines:
            self.assertTrue(any('#' in line for line in lines))

        # should still get expected results
        self.assertEqual(
            sorted(to_lines(runner.cat_output())),
            sorted([b'1\tnull\n', b'1\t"bar"\n']))
def test_failed_job(self):
    """A failed step should raise StepFailedException, log an ERROR
    line, and leave the cluster being deleted."""
    job = MRTwoStepJob(['-r', 'dataproc', '-v'])
    job.sandbox()

    with no_handlers_for_logger('mrjob.dataproc'):
        stderr = StringIO()
        log_to_stream('mrjob.dataproc', stderr)

        # make the mock Dataproc jobs fail
        self.mock_jobs_succeed = False

        with job.make_runner() as runner:
            self.assertIsInstance(runner, DataprocJobRunner)

            self.assertRaises(StepFailedException, runner.run)

            self.assertIn(' => ERROR\n', stderr.getvalue())

            cluster_id = runner.get_cluster_id()

    # job should get terminated
    cluster = runner._get_cluster(cluster_id)
    self.assertEqual(
        _cluster_state_name(cluster.status.state), 'DELETING')
def test_end_to_end(self):
    """End-to-end local run over stdin, a flat file, and a .gz glob."""
    # read from STDIN, a regular file, and a .gz
    stdin = BytesIO(b'foo\nbar\n')

    input_path = join(self.tmp_dir, 'input')
    with open(input_path, 'w') as f:
        f.write('bar\nqux\n')

    input_gz_path = join(self.tmp_dir, 'input.gz')
    gz = gzip.GzipFile(input_gz_path, 'wb')
    gz.write(b'foo\n')
    gz.close()

    input_gz_glob = join(self.tmp_dir, '*.gz')

    job = MRTwoStepJob(
        ['-r', 'local', '--num-cores', '4',
         '-', input_path, input_gz_glob])
    job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for result in job.parse_output(runner.cat_output()):
            results.append(result)

        local_tmp_dir = runner._get_local_tmp_dir()
        assert exists(local_tmp_dir)

        self.assertGreater(runner.counters()[0]['count']['combiners'], 0)

    # make sure cleanup happens
    assert not exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])
def test_end_to_end_multiple_tasks(self):
    """End-to-end local run with two map tasks and two reduce tasks."""
    # read from STDIN, a regular file, and a .gz
    stdin = BytesIO(b'foo\nbar\n')

    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as f:
        f.write(b'bar\nqux\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'foo\n')

    job = MRTwoStepJob(
        ['-r', 'local',
         '--jobconf=mapred.map.tasks=2',
         '--jobconf=mapred.reduce.tasks=2',
         '-', input_path, input_gz_path])
    job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        results = [job.parse_output_line(line)
                   for line in runner.stream_output()]

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])
def test_two_step_job(self):
    """Step 0 should read the input files; step 1 should read step 0's
    output and write to the runner's output dir."""
    input1_path = self.makefile('input1')
    input2_path = self.makefile('input2')

    job = MRTwoStepJob([
        '-r', 'hadoop',
        '--hadoop-bin', 'false',  # shouldn't run; just in case
        input1_path, input2_path])
    job.sandbox()

    with job.make_runner() as runner:
        runner._add_job_files_for_upload()

        # first step consumes the two input files
        step0_inputs = runner._step_input_uris(0)
        self.assertEqual(
            list(map(os.path.basename, step0_inputs)),
            ['input1', 'input2'])

        # second step consumes the first step's output...
        step0_output = runner._step_output_uri(0)
        self.assertEqual(runner._step_input_uris(1), [step0_output])

        # ...and, being the last step, writes to the job's output dir
        self.assertEqual(runner._step_output_uri(1), runner._output_dir)
def _test_end_to_end(self, args=()):
    """End-to-end Hadoop-runner test against the mock hadoop binary.

    Runs MRTwoStepJob over stdin, a local file, and a (mock) HDFS file,
    with extra command-line *args* spliced in, then verifies output,
    HDFS layout, uploaded files, and the exact hadoop command lines.
    """
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    # stage a file into mock HDFS via the mock hadoop binary
    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin, 'fs', '-put', input_to_upload,
                remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    # mock output for the final step; parsed below as JSON-ish k/v pairs
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                           '--no-conf', '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar']
                          + list(args)
                          + ['-', local_input_path, remote_input_path]
                          + ['--hadoop-input-format', 'FooFormat']
                          + ['--hadoop-output-format', 'BarFormat']
                          + ['--jobconf', 'x=y'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    # don't care that --hadoop-*-format is deprecated
    with logger_disabled('mrjob.job'):
        runner = mr_job.make_runner()

    with runner as runner:  # i.e. call cleanup when we're done
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']

        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        assert_in(mrjob_tar_gz_file_dict['name'],
                  pythonpath.split(':'))

        assert_equal(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

        # make sure we called hadoop the way we expected
        with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
            hadoop_cmd_args = [shlex.split(line) for line in mock_log]

        # only the 'jar' invocations are actual streaming steps
        jar_cmd_args = [args for args in hadoop_cmd_args
                        if args[:1] == ['jar']]
        assert_equal(len(jar_cmd_args), 2)
        step_0_args, step_1_args = jar_cmd_args

        # check input/output format
        assert_in('-inputformat', step_0_args)
        assert_not_in('-outputformat', step_0_args)
        assert_not_in('-inputformat', step_1_args)
        assert_in('-outputformat', step_1_args)

        # make sure -libjar extra arg comes before -mapper
        for args in (step_0_args, step_1_args):
            assert_in('-libjar', args)
            assert_in('-mapper', args)
            assert_lt(args.index('-libjar'), args.index('-mapper'))

        # make sure -jobconf made it through
        assert_in('-D', step_0_args)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))
def test_bootstrap_python_comes_before_bootstrap(self):
    """Python bootstrap commands should precede user --bootstrap ones."""
    job = MRTwoStepJob(['-r', 'dataproc', '--bootstrap', 'true'])

    with job.make_runner() as runner:
        expected = self.EXPECTED_BOOTSTRAP + [['true']]
        self.assertEqual(runner._bootstrap, expected)
def test_no_bootstrap_python_switch(self):
    """--no-bootstrap-python should disable all Python bootstrapping."""
    job = MRTwoStepJob(['-r', 'dataproc', '--no-bootstrap-python'])

    with job.make_runner() as runner:
        self.assertEqual(runner._opts['bootstrap_python'], False)
        self.assertEqual(runner._bootstrap_python(), [])
        self.assertEqual(runner._bootstrap, [])
def test_end_to_end(self):
    """End-to-end Hadoop-runner test against the mock hadoop binary.

    Runs MRTwoStepJob over stdin, a local file, and a (mock) HDFS file,
    then verifies output, HDFS layout, uploaded files, and cleanup.
    """
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    # stage a file into mock HDFS via the mock hadoop binary
    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin, 'fs', '-put', input_to_upload,
                remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    # mock output for the final step; parsed below as JSON-ish k/v pairs
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                           '--no-conf', '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar',
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']

        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        assert_in(mrjob_tar_gz_file_dict['name'],
                  pythonpath.split(':'))

        assert_equal(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))
def test_no_warning_by_default(self):
    """With --no-conf there are no unknown opts, so no warnings."""
    job = MRTwoStepJob(['-r', 'local', '--no-conf'])
    job.sandbox()

    with job.make_runner():
        self.assertFalse(self.log.warning.called)
def test_base_classes_cant_have_steps(self):
    """The abstract MRJobRunner should refuse to accept steps."""
    steps = MRTwoStepJob([])._steps_desc()

    with self.assertRaises(NotImplementedError):
        MRJobRunner(steps=steps)
def test_streaming_step_not_okay(self):
    """Asking for Spark script args on a streaming step should fail."""
    job = MRTwoStepJob()
    job.sandbox()

    with job.make_runner() as runner:
        with self.assertRaises(TypeError):
            runner._spark_script_args(0)
def test_auto_label(self):
    """The auto-generated job name should embed the module and user.

    Fixes: the original never closed the runner it created with
    make_runner(), leaking the runner's local tmp resources; a with
    block guarantees cleanup.
    """
    with MRTwoStepJob(['--no-conf']).make_runner() as runner:
        match = JOB_NAME_RE.match(runner.get_job_name())

        self.assertEqual(match.group(1), 'mr_two_step_job')
        self.assertEqual(match.group(2), getpass.getuser())
def test_nonexistent_steps(self):
    """Step numbers outside the job's range should raise ValueError."""
    job = MRTwoStepJob()
    job.sandbox()

    # too high for a reducer, too high for a mapper, and negative
    for run_task, step_num in ((job.run_reducer, 1),
                               (job.run_mapper, 2),
                               (job.run_reducer, -1)):
        self.assertRaises(ValueError, run_task, step_num)
def test_end_to_end(self):
    """End-to-end EMR-runner test against the mock EMR/S3 layer.

    Runs MRTwoStepJob over stdin, a local file, and a mock S3 file,
    then verifies output, job flow state, step args, uploaded files,
    cleanup, and eventual job flow termination.
    """
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    remote_input_path = 's3://walrus/data/foo'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

    # setup fake output
    self.mock_emr_output = {('j-MOCKJOBFLOW0', 1): [
        '1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '-', local_input_path, remote_input_path,
                           '--hadoop-input-format', 'FooFormat',
                           '--hadoop-output-format', 'BarFormat'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    # snapshot mock S3 so we can detect unintended writes
    mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        # make sure that initializing the runner doesn't affect S3
        # (Issue #50)
        assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())
        assert_equal(job_flow.state, 'COMPLETED')
        # auto-generated name: <module>.<user>.<timestamp>
        name_match = JOB_NAME_RE.match(job_flow.name)
        assert_equal(name_match.group(1), 'mr_two_step_job')
        assert_equal(name_match.group(2), getpass.getuser())

        # make sure our input and output formats are attached to
        # the correct steps
        assert_in('-inputformat', job_flow.steps[0].args)
        assert_not_in('-outputformat', job_flow.steps[0].args)
        assert_not_in('-inputformat', job_flow.steps[1].args)
        assert_in('-outputformat', job_flow.steps[1].args)

        # make sure mrjob.tar.gz is created and uploaded as
        # a bootstrap file
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']
        assert_equal(mrjob_tar_gz_file_dict.get('bootstrap'), 'file')

        # shouldn't be in PYTHONPATH (we dump it directly in site-packages)
        pythonpath = runner._get_cmdenv().get('PYTHONPATH') or ''
        assert_not_in(mrjob_tar_gz_file_dict['name'],
                      pythonpath.split(':'))

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

    # job should get terminated
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')