def test_owner_and_label_kwargs(self):
    runner = InlineMRJobRunner(conf_paths=[], owner='ads', label='ads_chain')
    match = JOB_NAME_RE.match(runner.get_job_name())
    self.assertEqual(match.group(1), 'ads_chain')
    self.assertEqual(match.group(2), 'ads')
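Every test in this section matches against a JOB_NAME_RE that is defined elsewhere in mrjob. Below is a minimal sketch of a pattern consistent with the assertions in these tests (dot-separated label, owner, date, time, and microsecond fields, as in mr_word_freq_count.dave.20101103.121249.638552); the pattern mrjob actually ships may differ:

import re

# hypothetical reconstruction -- groups: 1=label, 2=owner,
# 3=date (YYYYMMDD), 4=time (HHMMSS), 5=microseconds
JOB_NAME_RE = re.compile(r'^(.*)\.(.*)\.(\d+)\.(\d+)\.(\d+)$')

match = JOB_NAME_RE.match('mr_word_freq_count.dave.20101103.121249.638552')
assert match.group(1) == 'mr_word_freq_count'
assert match.group(2) == 'dave'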
def test_owner_and_label_switches(self):
    runner_opts = ['--no-conf', '--owner=ads', '--label=ads_chain']
    runner = MRTwoStepJob(runner_opts).make_runner()
    match = JOB_NAME_RE.match(runner.get_job_name())
    self.assertEqual(match.group(1), 'ads_chain')
    self.assertEqual(match.group(2), 'ads')
def test_auto_owner(self):
    os.environ['USER'] = 'mcp'
    runner = LocalMRJobRunner(conf_path=False)
    match = JOB_NAME_RE.match(runner.get_job_name())
    assert_equal(match.group(1), 'no_script')
    assert_equal(match.group(2), 'mcp')
def test_auto_owner(self): os.environ["USER"] = "******" runner = InlineMRJobRunner(conf_paths=[]) match = JOB_NAME_RE.match(runner.get_job_name()) self.assertEqual(match.group(1), "no_script") self.assertEqual(match.group(2), "mcp")
def test_empty_no_user(self):
    self.getuser_should_fail = True
    runner = InlineMRJobRunner(conf_paths=[])
    match = JOB_NAME_RE.match(runner.get_job_name())
    self.assertEqual(match.group(1), 'no_script')
    self.assertEqual(match.group(2), 'no_user')
def test_owner_and_label_kwargs(self):
    runner = LocalMRJobRunner(conf_path=False, owner='ads', label='ads_chain')
    match = JOB_NAME_RE.match(runner.get_job_name())
    assert_equal(match.group(1), 'ads_chain')
    assert_equal(match.group(2), 'ads')
def test_empty_no_user(self):
    self.getuser_should_fail = True
    runner = LocalMRJobRunner(conf_path=False)
    match = JOB_NAME_RE.match(runner.get_job_name())
    assert_equal(match.group(1), 'no_script')
    assert_equal(match.group(2), 'no_user')
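Both test_empty_no_user variants above flip a getuser_should_fail flag, which suggests the enclosing test case monkeypatches getpass.getuser in setUp and restores it in tearDown. A rough sketch of such a harness, assuming the flag simply makes the patched getuser raise (the class and helper names here are illustrative, not mrjob's):

import getpass
import unittest

class GetuserPatchTestCase(unittest.TestCase):
    # hypothetical harness for the 'no_user' fallback tests

    def setUp(self):
        self.getuser_should_fail = False
        self._real_getuser = getpass.getuser

        def fake_getuser():
            # simulate an environment where no username can be determined
            if self.getuser_should_fail:
                raise Exception('fake getuser() failure')
            return self._real_getuser()

        getpass.getuser = fake_getuser

    def tearDown(self):
        getpass.getuser = self._real_getuser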
def test_auto_everything(self):
    test_start = datetime.datetime.utcnow()

    os.environ["USER"] = "mcp"
    runner = MRTwoStepJob(["--no-conf"]).make_runner()

    match = JOB_NAME_RE.match(runner.get_job_name())
    self.assertEqual(match.group(1), "mr_two_step_job")
    self.assertEqual(match.group(2), "mcp")
    job_start = datetime.datetime.strptime(
        match.group(3) + match.group(4), "%Y%m%d%H%M%S")
    job_start = job_start.replace(microsecond=int(match.group(5)))
    self.assertGreaterEqual(job_start, test_start)
    self.assertLessEqual(job_start - test_start,
                         datetime.timedelta(seconds=5))
def test_auto_everything(self):
    test_start = datetime.datetime.utcnow()

    os.environ['USER'] = 'mcp'
    runner = MRTwoStepJob(['--no-conf']).make_runner()

    match = JOB_NAME_RE.match(runner.get_job_name())
    assert_equal(match.group(1), 'mr_two_step_job')
    assert_equal(match.group(2), 'mcp')
    job_start = datetime.datetime.strptime(
        match.group(3) + match.group(4), '%Y%m%d%H%M%S')
    job_start = job_start.replace(microsecond=int(match.group(5)))
    assert_gte(job_start, test_start)
    assert_lte(job_start - test_start, datetime.timedelta(seconds=5))
def print_report(options):
    emr_conn = EMRJobRunner(conf_path=options.conf_path).make_emr_conn()

    log.info('getting job flow history...')

    # microseconds just make our report messy
    now = datetime.datetime.utcnow().replace(microsecond=0)

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if options.max_days_ago is not None:
        created_after = now - datetime.timedelta(days=options.max_days_ago)

    job_flows = describe_all_job_flows(emr_conn, created_after=created_after)

    job_flow_infos = []
    for jf in job_flows:
        job_flow_info = {}

        job_flow_info['id'] = jf.jobflowid
        job_flow_info['name'] = jf.name

        job_flow_info['created'] = to_datetime(jf.creationdatetime)

        start_time = to_datetime(getattr(jf, 'startdatetime', None))
        if start_time:
            end_time = to_datetime(getattr(jf, 'enddatetime', None)) or now
            job_flow_info['ran'] = end_time - start_time
        else:
            job_flow_info['ran'] = datetime.timedelta(0)

        job_flow_info['state'] = jf.state

        job_flow_info['num_steps'] = len(jf.steps or [])

        # this looks to be an integer, but let's protect against
        # future changes
        job_flow_info['hours'] = float(jf.normalizedinstancehours)

        # estimate hours billed but not used
        job_flow_info['hours_bbnu'] = (
            job_flow_info['hours'] *
            estimate_proportion_billed_but_not_used(jf))

        # split out mr job name and user
        # job flows created by mrjob have names like:
        # mr_word_freq_count.dave.20101103.121249.638552
        match = JOB_NAME_RE.match(jf.name)
        if match:
            job_flow_info['mr_job_name'] = match.group(1)
            job_flow_info['user'] = match.group(2)
        else:
            # not run by mrjob
            job_flow_info['mr_job_name'] = None
            job_flow_info['user'] = None

        job_flow_infos.append(job_flow_info)

    if not job_flow_infos:
        print 'No job flows created in the past two months!'
        return

    earliest = min(info['created'] for info in job_flow_infos)
    latest = max(info['created'] for info in job_flow_infos)

    print 'Total # of Job Flows: %d' % len(job_flow_infos)
    print

    print '* All times are in UTC.'
    print

    print 'Min create time: %s' % earliest
    print 'Max create time: %s' % latest
    print '   Current time: %s' % now
    print

    print '* All usage is measured in Normalized Instance Hours, which are'
    print '  roughly equivalent to running an m1.small instance for an hour.'
    print

    # total compute-unit hours used
    total_hours = sum(info['hours'] for info in job_flow_infos)
    print 'Total Usage: %d' % total_hours
    print

    print '* Time billed but not used is estimated, and may not match'
    print "  Amazon's billing system exactly."
    print

    total_hours_bbnu = sum(info['hours_bbnu'] for info in job_flow_infos)
    print 'Total time billed but not used (waste): %.2f' % total_hours_bbnu
    print

    date_to_hours = defaultdict(float)
    date_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        date_created = info['created'].date()
        date_to_hours[date_created] += info['hours']
        date_to_hours_bbnu[date_created] += info['hours_bbnu']

    print 'Daily statistics:'
    print
    print ' date        usage     waste'
    d = latest.date()
    while d >= earliest.date():
        print ' %10s %6d %9.2f' % (
            d, date_to_hours[d], date_to_hours_bbnu[d])
        d -= datetime.timedelta(days=1)
    print

    def fmt(mr_job_name_or_user):
        if mr_job_name_or_user:
            return mr_job_name_or_user
        else:
            return '(not started by mrjob)'

    print '* Job flows are considered to belong to the user and job that'
    print '  started them (even if other jobs use the job flow).'
    print

    # Top jobs

    print 'Top jobs, by total usage:'
    mr_job_name_to_hours = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours[info['mr_job_name']] += info['hours']
    for mr_job_name, hours in sorted(mr_job_name_to_hours.iteritems(),
                                     key=lambda (n, h): (-h, n)):
        print ' %6d %s' % (hours, fmt(mr_job_name))
    print

    print 'Top jobs, by time billed but not used:'
    mr_job_name_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours_bbnu[info['mr_job_name']] += info['hours_bbnu']
    for mr_job_name, hours_bbnu in sorted(
            mr_job_name_to_hours_bbnu.iteritems(),
            key=lambda (n, h): (-h, n)):
        print ' %9.2f %s' % (hours_bbnu, fmt(mr_job_name))
    print

    # Top users

    print 'Top users, by total usage:'
    user_to_hours = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours[info['user']] += info['hours']
    for user, hours in sorted(user_to_hours.iteritems(),
                              key=lambda (n, h): (-h, n)):
        print ' %6d %s' % (hours, fmt(user))
    print

    print 'Top users, by time billed but not used:'
    user_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours_bbnu[info['user']] += info['hours_bbnu']
    for user, hours_bbnu in sorted(user_to_hours_bbnu.iteritems(),
                                   key=lambda (n, h): (-h, n)):
        print ' %9.2f %s' % (hours_bbnu, fmt(user))
    print

    # Top job flows

    print 'All job flows, by total usage:'
    top_job_flows = sorted(job_flow_infos,
                           key=lambda i: (-i['hours'], i['name']))
    for info in top_job_flows:
        print ' %6d %-15s %s' % (info['hours'], info['id'], info['name'])
    print

    print 'All job flows, by time billed but not used:'
    top_job_flows_bbnu = sorted(job_flow_infos,
                                key=lambda i: (-i['hours_bbnu'], i['name']))
    for info in top_job_flows_bbnu:
        print ' %9.2f %-15s %s' % (
            info['hours_bbnu'], info['id'], info['name'])
    print

    print 'Details for all job flows:'
    print
    print (' id              state         created             steps'
           '          time ran  usage     waste     user name')
    all_job_flows = sorted(job_flow_infos, key=lambda i: i['created'],
                           reverse=True)
    for info in all_job_flows:
        print ' %-15s %-13s %19s %3d %17s %6d %9.2f %8s %s' % (
            info['id'], info['state'], info['created'],
            info['num_steps'], info['ran'], info['hours'],
            info['hours_bbnu'], (info['user'] or ''),
            fmt(info['mr_job_name']))
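print_report also leans on a to_datetime helper that never appears in this section. A minimal sketch, assuming boto returns job flow timestamps as ISO8601 strings like '2010-11-03T12:12:49Z' (mrjob's real helper may be more forgiving):

import datetime

def to_datetime(iso8601_time):
    # convert a boto-style ISO8601 timestamp to a UTC
    # datetime.datetime; pass None through unchanged
    if iso8601_time is None:
        return None
    return datetime.datetime.strptime(iso8601_time, '%Y-%m-%dT%H:%M:%SZ')

estimate_proportion_billed_but_not_used is likewise defined elsewhere; the 'waste' columns in the report are only as accurate as that estimate.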
def test_auto_label(self):
    runner = MRTwoStepJob(['--no-conf']).make_runner()
    match = JOB_NAME_RE.match(runner.get_job_name())
    self.assertEqual(match.group(1), 'mr_two_step_job')
    self.assertEqual(match.group(2), getpass.getuser())
def test_empty(self):
    runner = LocalMRJobRunner(conf_path=False)
    match = JOB_NAME_RE.match(runner.get_job_name())
    assert_equal(match.group(1), 'no_script')
    assert_equal(match.group(2), getpass.getuser())
def job_flow_to_basic_summary(job_flow, now=None):
    """Extract fields such as creation time, owner, etc. from the job flow,
    so we can safely reference them without using :py:func:`getattr`.

    :param job_flow: a :py:class:`boto.emr.EmrObject`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with the following keys. These will be ``None`` if
    the corresponding field in the job flow is unavailable.

    * *created*: UTC `datetime.datetime` that the job flow was created,
      or ``None``
    * *end*: UTC `datetime.datetime` that the job flow finished, or ``None``
    * *id*: job flow ID, or ``None`` (this should never happen)
    * *label*: The label for the job flow (usually the module name of the
      :py:class:`~mrjob.job.MRJob` script that started it), or ``None`` for
      non-:py:mod:`mrjob` job flows.
    * *name*: job flow name, or ``None`` (this should never happen)
    * *nih*: number of normalized instance hours used by the job flow.
    * *num_steps*: Number of steps in the job flow.
    * *owner*: The owner for the job flow (usually the user that started
      it), or ``None`` for non-:py:mod:`mrjob` job flows.
    * *pool*: pool name (e.g. ``'default'``) if the job flow is pooled,
      otherwise ``None``.
    * *ran*: How long the job flow ran, or has been running, as a
      :py:class:`datetime.timedelta`. This will be ``timedelta(0)`` if the
      job flow hasn't started.
    * *ready*: UTC `datetime.datetime` that the job flow finished
      bootstrapping, or ``None``
    * *start*: UTC `datetime.datetime` that the job flow became available,
      or ``None``
    * *state*: The job flow's state as a string (e.g. ``'RUNNING'``)
    """
    if now is None:
        now = datetime.utcnow()

    jf = {}  # summary to fill in

    jf['id'] = getattr(job_flow, 'jobflowid', None)
    jf['name'] = getattr(job_flow, 'name', None)

    jf['created'] = to_datetime(getattr(job_flow, 'creationdatetime', None))
    jf['start'] = to_datetime(getattr(job_flow, 'startdatetime', None))
    jf['ready'] = to_datetime(getattr(job_flow, 'readydatetime', None))
    jf['end'] = to_datetime(getattr(job_flow, 'enddatetime', None))

    if jf['start']:
        jf['ran'] = (jf['end'] or now) - jf['start']
    else:
        jf['ran'] = timedelta(0)

    jf['state'] = getattr(job_flow, 'state', None)

    jf['num_steps'] = len(getattr(job_flow, 'steps', None) or ())

    jf['pool'] = None
    bootstrap_actions = getattr(job_flow, 'bootstrapactions', None)
    if bootstrap_actions:
        args = [arg.value for arg in bootstrap_actions[-1].args]
        if len(args) == 2 and args[0].startswith('pool-'):
            jf['pool'] = args[1]

    m = JOB_NAME_RE.match(getattr(job_flow, 'name', ''))
    if m:
        jf['label'], jf['owner'] = m.group(1), m.group(2)
    else:
        jf['label'], jf['owner'] = None, None

    jf['nih'] = float(getattr(job_flow, 'normalizedinstancehours', '0'))

    return jf
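A quick usage sketch for job_flow_to_basic_summary, with a stub standing in for boto.emr.EmrObject and to_datetime assumed to parse ISO8601 strings as sketched above; all attribute values here are invented for illustration:

from datetime import timedelta

class StubJobFlow(object):
    # bare-bones stand-in for a boto.emr.EmrObject; attributes the
    # stub omits (readydatetime, bootstrapactions) fall back to None
    jobflowid = 'j-FAKE1234567890'
    name = 'mr_word_freq_count.dave.20101103.121249.638552'
    creationdatetime = '2010-11-03T12:12:49Z'
    startdatetime = '2010-11-03T12:15:00Z'
    enddatetime = '2010-11-03T13:05:00Z'
    state = 'COMPLETED'
    steps = []
    normalizedinstancehours = '4'

summary = job_flow_to_basic_summary(StubJobFlow())
assert summary['label'] == 'mr_word_freq_count'
assert summary['owner'] == 'dave'
assert summary['ran'] == timedelta(minutes=50)
assert summary['nih'] == 4.0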
def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    remote_input_path = 's3://walrus/data/foo'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

    # setup fake output
    self.mock_emr_output = {('j-MOCKJOBFLOW0', 1): [
        '1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v', '-c', self.mrjob_conf_path,
                           '-', local_input_path, remote_input_path,
                           '--hadoop-input-format', 'FooFormat',
                           '--hadoop-output-format', 'BarFormat'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        # make sure that initializing the runner doesn't affect S3
        # (Issue #50)
        assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())
        assert_equal(job_flow.state, 'COMPLETED')
        name_match = JOB_NAME_RE.match(job_flow.name)
        assert_equal(name_match.group(1), 'mr_two_step_job')
        assert_equal(name_match.group(2), getpass.getuser())

        # make sure our input and output formats are attached to
        # the correct steps
        assert_in('-inputformat', job_flow.steps[0].args)
        assert_not_in('-outputformat', job_flow.steps[0].args)
        assert_not_in('-inputformat', job_flow.steps[1].args)
        assert_in('-outputformat', job_flow.steps[1].args)

        # make sure mrjob.tar.gz is created and uploaded as
        # a bootstrap file
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']
        assert_equal(mrjob_tar_gz_file_dict.get('bootstrap'), 'file')

        # shouldn't be in PYTHONPATH (we dump it directly in site-packages)
        pythonpath = runner._get_cmdenv().get('PYTHONPATH') or ''
        assert_not_in(mrjob_tar_gz_file_dict['name'],
                      pythonpath.split(':'))

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

    # job should get terminated
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')
def test_empty(self):
    runner = InlineMRJobRunner(conf_paths=[])
    match = JOB_NAME_RE.match(runner.get_job_name())
    self.assertEqual(match.group(1), 'no_script')
    self.assertEqual(match.group(2), getpass.getuser())
def test_job_name_not_specified(self):
    job = MRWordCount()
    with job.make_runner() as runner:
        self.assertFalse(runner._opts['job_name'])
        self.assertIsNotNone(JOB_NAME_RE.match(runner.get_job_name()))
def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    remote_input_path = 's3://walrus/data/foo'
    self.add_mock_s3_data({'walrus': {'data/foo': 'foo\n'}})

    # setup fake output
    self.mock_emr_output = {('j-MOCKJOBFLOW0', 1): [
        '1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n']}

    mr_job = MRTwoStepJob(['-r', 'emr', '-v', '-c', self.mrjob_conf_path,
                           '-', local_input_path, remote_input_path])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    mock_s3_fs_snapshot = copy.deepcopy(self.mock_s3_fs)

    with mr_job.make_runner() as runner:
        assert isinstance(runner, EMRJobRunner)

        # make sure that initializing the runner doesn't affect S3
        # (Issue #50)
        assert_equal(mock_s3_fs_snapshot, self.mock_s3_fs)

        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(runner.get_emr_job_flow_id())
        assert_equal(job_flow.state, 'COMPLETED')
        name_match = JOB_NAME_RE.match(job_flow.name)
        assert_equal(name_match.group(1), 'mr_two_step_job')
        assert_equal(name_match.group(2), getpass.getuser())

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))

    # job should get terminated
    emr_conn = runner.make_emr_conn()
    job_flow_id = runner.get_emr_job_flow_id()
    for i in range(10):
        emr_conn.simulate_progress(job_flow_id)

    job_flow = emr_conn.describe_jobflow(job_flow_id)
    assert_equal(job_flow.state, 'TERMINATED')