def test_attach_to_existing_cluster(self):
    runner = DataprocJobRunner(conf_paths=[])

    cluster_body = runner.api_client.cluster_create()
    cluster_id = cluster_body['clusterName']

    stdin = BytesIO(b'foo\nbar\n')

    mr_job = MRTwoStepJob(
        ['-r', 'dataproc', '-v', '--cluster-id', cluster_id])
    mr_job.sandbox(stdin=stdin)

    results = []

    with mr_job.make_runner() as runner:
        runner.run()

        # Generate fake output
        self.put_job_output_parts(runner, [
            b'1\t"bar"\n1\t"foo"\n2\tnull\n'])

        # Issue 182: don't create the bootstrap script when
        # attaching to another cluster
        self.assertIsNone(runner._master_bootstrap_script_path)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    self.assertEqual(sorted(results),
                     [(1, 'bar'), (1, 'foo'), (2, None)])

def test_attach_to_existing_job_flow(self):
    emr_conn = EMRJobRunner(conf_path=False).make_emr_conn()
    # set log_uri to None, so that when we describe the job flow, it
    # won't have the loguri attribute, to test Issue #112
    emr_job_flow_id = emr_conn.run_jobflow(
        name='Development Job Flow', log_uri=None)

    stdin = StringIO('foo\nbar\n')
    self.mock_emr_output = {(emr_job_flow_id, 1): [
        '1\t"bar"\n1\t"foo"\n2\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '--emr-job-flow-id', emr_job_flow_id])
    mr_job.sandbox(stdin=stdin)

    results = []

    with mr_job.make_runner() as runner:
        runner.run()

        # Issue 182: don't create the bootstrap script when
        # attaching to another job flow
        assert_equal(runner._master_bootstrap_script, None)

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

    assert_equal(sorted(results),
                 [(1, 'bar'), (1, 'foo'), (2, None)])

def test_end_to_end(self):
    # read from STDIN, a regular file, and a .gz
    stdin = StringIO("foo\nbar\n")

    input_path = os.path.join(self.tmp_dir, "input")
    with open(input_path, "w") as input_file:
        input_file.write("bar\nqux\n")

    input_gz_path = os.path.join(self.tmp_dir, "input.gz")
    input_gz = gzip.GzipFile(input_gz_path, "w")
    input_gz.write("foo\n")
    input_gz.close()

    mr_job = MRTwoStepJob(["-c", self.mrjob_conf_path,
                           "-", input_path, input_gz_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    assert_equal(sorted(results),
                 [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

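# All of these tests exercise MRTwoStepJob, which is defined elsewhere in the
# test suite. The class below is NOT that definition -- it is a hedged sketch
# reconstructed from the expected outputs above, included only to make the
# assertions readable: step 1 counts how often each input line appears (plus a
# grand total under the None key, and an assumed 'count'/'combiners' counter
# that one of the local-runner tests below checks), and step 2 swaps keys and
# values so the final output is (count, line) pairs such as (2, 'foo') and
# (5, None).

from mrjob.job import MRJob


class MRTwoStepJobSketch(MRJob):  # hypothetical stand-in, not the real class

    def count_mapper(self, _, line):
        yield line, 1   # count each input line
        yield None, 1   # and keep a grand total under the None key

    def count_combiner(self, key, counts):
        # assumed from the counter assertion in the local-runner test below
        self.increment_counter('count', 'combiners')
        yield key, sum(counts)

    def count_reducer(self, key, counts):
        yield key, sum(counts)

    def invert_mapper(self, key, count):
        yield count, key  # second step: emit (count, line)

    def steps(self):
        return [self.mr(mapper=self.count_mapper,
                        combiner=self.count_combiner,
                        reducer=self.count_reducer),
                self.mr(mapper=self.invert_mapper)]
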
def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin, 'fs', '-put',
                input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v', '--no-conf',
                           '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar',
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])

        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        assert_equal(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO("foo\nbar\n")

    local_input_path = os.path.join(self.tmp_dir, "input")
    with open(local_input_path, "w") as local_input_file:
        local_input_file.write("bar\nqux\n")

    input_to_upload = os.path.join(self.tmp_dir, "remote_input")
    with open(input_to_upload, "w") as input_to_upload_file:
        input_to_upload_file.write("foo\n")
    remote_input_path = "hdfs:///data/foo"
    check_call([self.hadoop_bin, "fs", "-put",
                input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([""])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(["-r", "hadoop", "-v", "--no-conf",
                           "-", local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ["MOCK_HDFS_ROOT"]
        assert_equal(sorted(os.listdir(hdfs_root)), ["data", "user"])
        home_dir = os.path.join(hdfs_root, "user", getpass.getuser())
        assert_equal(os.listdir(home_dir), ["tmp"])
        assert_equal(os.listdir(os.path.join(home_dir, "tmp")), ["mrjob"])

        assert_equal(sorted(results),
                     [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

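# The Hadoop-runner tests above run against a fake `hadoop` binary driven by
# environment variables (MOCK_HDFS_ROOT, MOCK_HADOOP_OUTPUT, MOCK_HADOOP_LOG).
# add_mock_hadoop_output() is a helper from that mock layer; the function
# below is only an assumed sketch of its behavior, not the real
# implementation: each call queues the output of one streaming step as
# part-* files, which is why the two-step tests call it twice -- once for the
# (ignored) intermediate output and once for the final output that
# stream_output() reads back.

import os


def add_mock_hadoop_output_sketch(parts):
    """Queue fake output for the next mock Hadoop job (assumed behavior).

    parts -- list of strings; each becomes one part-NNNNN file.
    """
    base_dir = os.environ['MOCK_HADOOP_OUTPUT']
    # number the output dirs so the mock binary hands them back in FIFO order
    output_dir = os.path.join(base_dir, '%05d' % len(os.listdir(base_dir)))
    os.mkdir(output_dir)
    for i, part in enumerate(parts):
        with open(os.path.join(output_dir, 'part-%05d' % i), 'w') as part_file:
            part_file.write(part)
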
def test_end_to_end(self):
    # read from STDIN, a regular file, and a .gz
    stdin = StringIO('foo\nbar\n')

    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nqux\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz_glob = os.path.join(self.tmp_dir, '*.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\n')
    input_gz.close()

    mr_job = MRTwoStepJob(['-c', self.mrjob_conf_path,
                           '-r', 'local',
                           '-', input_path, input_gz_glob])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

        self.assertEqual(runner.counters()[0]['count']['combiners'], 8)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

def test_end_to_end_multiple_tasks(self):
    # read from STDIN, a regular file, and a .gz
    stdin = BytesIO(b'foo\nbar\n')

    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nqux\n')

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\n')
    input_gz.close()

    mr_job = MRTwoStepJob(['-r', 'local',
                           '--jobconf=mapred.map.tasks=2',
                           '--jobconf=mapred.reduce.tasks=2',
                           '-', input_path, input_gz_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, LocalMRJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        assert os.path.exists(local_tmp_dir)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)

    self.assertEqual(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO("foo\nbar\n")

    local_input_path = os.path.join(self.tmp_dir, "input")
    with open(local_input_path, "w") as local_input_file:
        local_input_file.write("bar\nqux\n")

    input_to_upload = os.path.join(self.tmp_dir, "remote_input")
    with open(input_to_upload, "w") as input_to_upload_file:
        input_to_upload_file.write("foo\n")
    remote_input_path = "hdfs:///data/foo"
    check_call([self.hadoop_bin, "fs", "-put",
                input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([""])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(["-r", "hadoop", "-v", "--no-conf",
                           "--hadoop-arg", "-libjar",
                           "--hadoop-arg", "containsJars.jar",
                           "-", local_input_path, remote_input_path,
                           "--hadoop-input-format", "FooFormat",
                           "--hadoop-output-format", "BarFormat"])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ["MOCK_HDFS_ROOT"]
        assert_equal(sorted(os.listdir(hdfs_root)), ["data", "user"])
        home_dir = os.path.join(hdfs_root, "user", getpass.getuser())
        assert_equal(os.listdir(home_dir), ["tmp"])
        assert_equal(os.listdir(os.path.join(home_dir, "tmp")), ["mrjob"])

        assert_equal(runner._opts["hadoop_extra_args"],
                     ["-libjar", "containsJars.jar"])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict["path"] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict["name"]

        pythonpath = runner._get_cmdenv()["PYTHONPATH"]
        assert_in(mrjob_tar_gz_file_dict["name"], pythonpath.split(":"))

        assert_equal(sorted(results),
                     [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

        # make sure we called hadoop the way we expected
        with open(os.environ["MOCK_HADOOP_LOG"]) as mock_log:
            hadoop_cmd_args = [shlex.split(line) for line in mock_log]

        jar_cmd_args = [args for args in hadoop_cmd_args
                        if args[:1] == ["jar"]]
        assert_equal(len(jar_cmd_args), 2)
        step_0_args, step_1_args = jar_cmd_args

        # check input/output format
        assert_in("-inputformat", step_0_args)
        assert_not_in("-outputformat", step_0_args)
        assert_not_in("-inputformat", step_1_args)
        assert_in("-outputformat", step_1_args)

        # make sure -libjar extra arg comes before -mapper
        for args in (step_0_args, step_1_args):
            assert_in("-libjar", args)
            assert_in("-mapper", args)
            assert_lt(args.index("-libjar"), args.index("-mapper"))

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin, 'fs', '-put',
                input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v', '--no-conf',
                           '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar',
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])

        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']

        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        assert_in(mrjob_tar_gz_file_dict['name'], pythonpath.split(':'))

        assert_equal(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    remote_input_path = 's3://walrus/data/foo'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

    # setup fake output
    self.mock_emr_output = {('j-MOCKJOBFLOW0', 1): [
        '1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '-', local_input_path, remote_input_path,
                           '--hadoop-input-format', 'FooFormat',
                           '--hadoop-output-format', 'BarFormat'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        # make sure that initializing the runner doesn't affect S3
        # (Issue #50)
        assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())
        assert_equal(job_flow.state, 'COMPLETED')
        name_match = JOB_NAME_RE.match(job_flow.name)
        assert_equal(name_match.group(1), 'mr_two_step_job')
        assert_equal(name_match.group(2), getpass.getuser())

        # make sure our input and output formats are attached to
        # the correct steps
        assert_in('-inputformat', job_flow.steps[0].args)
        assert_not_in('-outputformat', job_flow.steps[0].args)
        assert_not_in('-inputformat', job_flow.steps[1].args)
        assert_in('-outputformat', job_flow.steps[1].args)

        # make sure mrjob.tar.gz is created and uploaded as
        # a bootstrap file
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']
        assert_equal(mrjob_tar_gz_file_dict.get('bootstrap'), 'file')

        # shouldn't be in PYTHONPATH (we dump it directly in site-packages)
        pythonpath = runner._get_cmdenv().get('PYTHONPATH') or ''
        assert_not_in(mrjob_tar_gz_file_dict['name'],
                      pythonpath.split(':'))

        assert_equal(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

    # job should get terminated
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')

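# The EMR end-to-end tests never touch real S3: self.mock_s3_fs is an
# in-memory stand-in (roughly a dict of bucket name -> key -> contents), and
# self.add_mock_s3_data() seeds it. The helper below is a hedged, simplified
# sketch of that idea, not the test suite's actual mock-boto code; the real
# mock also tracks details like bucket location and timestamps.


def add_mock_s3_data_sketch(mock_s3_fs, data):
    """Merge {'bucket': {'key': 'contents'}} dicts into a fake S3 filesystem
    (assumed/simplified behavior)."""
    for bucket_name, key_to_contents in data.items():
        bucket = mock_s3_fs.setdefault(bucket_name, {})
        bucket.update(key_to_contents)


# hypothetical usage, mirroring the call in the test above:
#     mock_s3_fs = {}
#     add_mock_s3_data_sketch(mock_s3_fs, {'walrus': {'data/foo': 'foo\n'}})
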
def _test_end_to_end(self, args=()):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin, 'fs', '-put',
                input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v', '--no-conf',
                           '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar'] +
                          list(args) +
                          ['-', local_input_path, remote_input_path] +
                          ['--hadoop-input-format', 'FooFormat'] +
                          ['--hadoop-output-format', 'BarFormat'] +
                          ['--jobconf', 'x=y'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    # don't care that --hadoop-*-format is deprecated
    with logger_disabled('mrjob.job'):
        runner = mr_job.make_runner()

    with runner as runner:  # i.e. call cleanup when we're done
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])

        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']

        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        assert_in(mrjob_tar_gz_file_dict['name'], pythonpath.split(':'))

        assert_equal(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

        # make sure we called hadoop the way we expected
        with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
            hadoop_cmd_args = [shlex.split(line) for line in mock_log]

        jar_cmd_args = [args for args in hadoop_cmd_args
                        if args[:1] == ['jar']]
        assert_equal(len(jar_cmd_args), 2)
        step_0_args, step_1_args = jar_cmd_args

        # check input/output format
        assert_in('-inputformat', step_0_args)
        assert_not_in('-outputformat', step_0_args)
        assert_not_in('-inputformat', step_1_args)
        assert_in('-outputformat', step_1_args)

        # make sure -libjar extra arg comes before -mapper
        for args in (step_0_args, step_1_args):
            assert_in('-libjar', args)
            assert_in('-mapper', args)
            assert_lt(args.index('-libjar'), args.index('-mapper'))

        # make sure -jobconf made it through
        assert_in('-D', step_0_args)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    remote_input_path = 's3://walrus/data/foo'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

    # setup fake output
    self.mock_emr_output = {('j-MOCKJOBFLOW0', 1): [
        '1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v',
                           '-c', self.mrjob_conf_path,
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        # make sure that initializing the runner doesn't affect S3
        # (Issue #50)
        assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())
        assert_equal(job_flow.state, 'COMPLETED')
        name_match = JOB_NAME_RE.match(job_flow.name)
        assert_equal(name_match.group(1), 'mr_two_step_job')
        assert_equal(name_match.group(2), getpass.getuser())

        assert_equal(sorted(results),
                     [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

    # job should get terminated
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')

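# JOB_NAME_RE is imported from mrjob and matches the auto-generated job names
# checked above, which look like <module>.<user>.<date>.<time>.<microseconds>
# (for example 'mr_two_step_job.someuser.20240101.000000.000000'). The pattern
# below is an assumed sketch consistent with how these tests use it (group 1
# is the job's module name, group 2 is the user); it is not necessarily the
# canonical definition.

import re

JOB_NAME_RE_SKETCH = re.compile(r'^(.*)\.(.*)\.(\d+)\.(\d+)\.(\d+)$')

_m = JOB_NAME_RE_SKETCH.match('mr_two_step_job.someuser.20240101.000000.000000')
assert _m.group(1) == 'mr_two_step_job' and _m.group(2) == 'someuser'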