def test_create_scratch_uri(self):
    """Scratch bucket is auto-named, created lazily, and then re-used.

    Checks, in order, that:

    * the runner's ``s3_scratch_uri`` option looks like
      ``s3://mrjob-<16 chars>/tmp/``,
    * the bucket is absent from the mock S3 filesystem until a job flow
      is launched,
    * a second runner ends up with the same scratch URI.
    """
    # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
    self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

    runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

    # bucket name should be mrjob- plus 16 random hex digits
    s3_scratch_uri = runner._opts['s3_scratch_uri']
    assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
    assert_equal(s3_scratch_uri[27:], '/tmp/')

    # bucket shouldn't actually exist yet
    scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
    assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

    # need to do something to ensure that the bucket actually gets
    # created. let's launch a (mock) job flow
    jfid = runner.make_persistent_job_flow()
    assert_in(scratch_bucket, self.mock_s3_fs.keys())
    runner.make_emr_conn().terminate_jobflow(jfid)

    # once our scratch bucket is created, we should re-use it
    runner2 = EMRJobRunner(conf_path=False)
    assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
def test_no_region(self):
    """A runner built without ``aws_region`` uses the un-prefixed endpoints.

    Also checks that the runner's ``_aws_region`` is the empty string.
    """
    region_less_runner = EMRJobRunner(conf_path=False)

    emr_conn = region_less_runner.make_emr_conn()
    assert_equal(emr_conn.endpoint, 'elasticmapreduce.amazonaws.com')

    s3_conn = region_less_runner.make_s3_conn()
    assert_equal(s3_conn.endpoint, 's3.amazonaws.com')

    assert_equal(region_less_runner._aws_region, '')
def main():
    """Terminate the EMR job flow named by the single command-line argument.

    Exactly one positional argument (the job flow ID) is required; any
    other count is reported through the option parser's ``error()``.
    """
    # parse command-line args
    parser = make_option_parser()
    options, positional = parser.parse_args()

    if len(positional) != 1:
        parser.error('takes exactly one argument')
    job_flow_to_kill = positional[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # the runner is only needed as a way to obtain an EMR connection
    runner = EMRJobRunner(conf_path=options.conf_path)

    log.debug('Terminating job flow %s' % job_flow_to_kill)
    emr_conn = runner.make_emr_conn()
    emr_conn.terminate_jobflow(job_flow_to_kill)
    log.info('Terminated job flow %s' % job_flow_to_kill)
def test_local_bootstrap_action(self):
    """A bootstrap action given as a local script path ends up on S3.

    The job flow is expected to carry two bootstrap actions: the uploaded
    local script (with its arguments) followed by the master bootstrap
    script.
    """
    # write a throwaway local script to use as the bootstrap action
    script_path = os.path.join(self.tmp_dir, 'apt-install.sh')
    with open(script_path, 'w') as script:
        script.write('for $pkg in $@; do sudo apt-get install $pkg; done\n')

    runner = EMRJobRunner(
        conf_path=False,
        bootstrap_actions=[script_path + ' python-scipy mysql-server'],
        s3_sync_wait_time=0.01)

    launched_id = runner.make_persistent_job_flow()
    conn = runner.make_emr_conn()
    actions = conn.describe_jobflow(launched_id).bootstrapactions

    assert_equal(len(actions), 2)

    # first action: the local script, now living in an mrjob- bucket
    uploaded = actions[0]
    assert uploaded.path.startswith('s3://mrjob-')
    assert uploaded.path.endswith('/apt-install.sh')
    assert_equal(uploaded.name, 'apt-install.sh')
    assert_equal(uploaded.args, ['python-scipy', 'mysql-server'])

    # second action: the master bootstrap script
    master = actions[1]
    assert master.path.startswith('s3://mrjob-')
    assert master.path.endswith('b.py')
    assert_equal(master.args, [])
    assert_equal(master.name, 'master')

    # make sure master bootstrap script is on S3
    assert runner.path_exists(master.path)
def main(cl_args=None):
    """Terminate the EMR job flow whose ID is the one positional argument.

    :param cl_args: argument list handed to the option parser's
                    ``parse_args()``; ``None`` is passed through as-is.
    """
    # parse command-line args
    parser = make_option_parser()
    options, positional = parser.parse_args(cl_args)

    if len(positional) != 1:
        parser.error('This tool takes exactly one argument.')
    cluster_id = positional[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # the runner is only needed as a way to obtain an EMR connection
    emr_runner = EMRJobRunner(**runner_kwargs(options))

    log.debug('Terminating job flow %s' % cluster_id)
    emr_conn = emr_runner.make_emr_conn()
    emr_conn.terminate_jobflow(cluster_id)
    log.info('Terminated job flow %s' % cluster_id)
def find_waiting_flow(aws_access_key_id, aws_secret_access_key,
                      ssh_key_pair_file=''):
    """List EMR job flows that are WAITING, STARTING or RUNNING.

    Each matching flow is printed as ``<index> <flow id> <node> <state>``.
    When *ssh_key_pair_file* is non-empty, an ``ssh -i ...`` command line
    for the flow's master node is printed as well.

    :param aws_access_key_id: passed to ``EMRJobRunner``.
    :param aws_secret_access_key: passed to ``EMRJobRunner``.
    :param ssh_key_pair_file: key file name used in the printed ssh
                              command; ``''`` disables that output.
    :return: list of dicts with keys ``'flow_id'``, ``'node'`` and
             ``'flow_state'``, ordered WAITING, then STARTING, then
             RUNNING (original order is kept within a state).
    """
    runner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access_key)
    emr_conn = runner.make_emr_conn()

    # lower number sorts first in the returned list
    state_priority = {'WAITING': 0, 'STARTING': 1, 'RUNNING': 2}

    ranked_flows = []
    for flow in emr_conn.describe_jobflows():
        try:
            state = flow.state
            if state not in state_priority:
                continue
            flow_id = flow.jobflowid
            node = flow.masterpublicdnsname
        except AttributeError:
            # best effort: a flow missing one of these attributes is
            # skipped rather than aborting the whole listing
            continue

        ranked_flows.append((state_priority[state], flow_id, node, state))
        if ssh_key_pair_file != '':
            print('ssh -i %s hadoop@%s' % (ssh_key_pair_file, node))

    # list.sort() is stable, so flows in the same state keep their order
    ranked_flows.sort(key=itemgetter(0))

    waiting_flows_dict = [
        {'flow_id': flow_id, 'node': node, 'flow_state': state}
        for _, flow_id, node, state in ranked_flows
    ]

    # single-argument print() behaves the same on Python 2 and 3
    for index, flow_dict in enumerate(waiting_flows_dict):
        print('%s %s %s %s' % (index, flow_dict['flow_id'],
                               flow_dict['node'], flow_dict['flow_state']))

    return waiting_flows_dict
def test_explicit_endpoints(self):
    """Explicit ``s3_endpoint``/``emr_endpoint`` win even with a region set."""
    proxied_runner = EMRJobRunner(
        conf_path=False,
        aws_region='EU',
        s3_endpoint='s3-proxy',
        emr_endpoint='emr-proxy',
    )

    emr_conn = proxied_runner.make_emr_conn()
    assert_equal(emr_conn.endpoint, 'emr-proxy')

    s3_conn = proxied_runner.make_s3_conn()
    assert_equal(s3_conn.endpoint, 's3-proxy')
def find_all_flows(aws_access_key_id, aws_secret_access_key):
    """Return whatever ``describe_jobflows()`` reports for these credentials.

    Progress messages are printed after the runner is built and after the
    EMR connection is made.
    """
    credentials = dict(aws_access_key_id=aws_access_key_id,
                       aws_secret_access_key=aws_secret_access_key)
    runner = EMRJobRunner(**credentials)
    # single-argument print() behaves the same on Python 2 and 3
    print('got job runner')

    conn = runner.make_emr_conn()
    print('made EMR connection')

    return conn.describe_jobflows()
def main(cl_args=None):
    """Terminate the EMR job flow whose ID is the one positional argument.

    :param cl_args: argument list handed to the option parser's
                    ``parse_args()``; ``None`` is passed through as-is.
    """
    # parse command-line args
    parser = make_option_parser()
    options, positional = parser.parse_args(cl_args)

    if len(positional) != 1:
        parser.error('This tool takes exactly one argument.')
    target_job_flow = positional[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # the runner is only needed as a way to obtain an EMR connection
    emr_runner = EMRJobRunner(conf_paths=options.conf_paths)

    log.debug('Terminating job flow %s' % target_job_flow)
    emr_conn = emr_runner.make_emr_conn()
    emr_conn.terminate_jobflow(target_job_flow)
    log.info('Terminated job flow %s' % target_job_flow)
def main(cl_args=None):
    """Terminate the cluster whose ID is the one positional argument.

    :param cl_args: argument list handed to the option parser's
                    ``parse_args()``; ``None`` is passed through as-is.
    """
    # parse command-line args
    parser = _make_option_parser()
    options, positional = parser.parse_args(cl_args)

    if len(positional) != 1:
        parser.error('This tool takes exactly one argument.')
    doomed_cluster = positional[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # the runner is only needed as a way to obtain an EMR connection
    emr_runner = EMRJobRunner(**_runner_kwargs(options))

    log.debug('Terminating cluster %s' % doomed_cluster)
    emr_conn = emr_runner.make_emr_conn()
    emr_conn.terminate_jobflow(doomed_cluster)
    log.info('Terminated cluster %s' % doomed_cluster)
def test_blank_region(self):
    """``aws_region=''`` must behave exactly like giving no region at all."""
    blank_region_runner = EMRJobRunner(conf_path=False, aws_region='')

    emr_endpoint = blank_region_runner.make_emr_conn().endpoint
    assert_equal(emr_endpoint, 'elasticmapreduce.amazonaws.com')

    s3_endpoint = blank_region_runner.make_s3_conn().endpoint
    assert_equal(s3_endpoint, 's3.amazonaws.com')

    assert_equal(blank_region_runner._aws_region, '')
def main():
    """Terminate the EMR job flow named by the single command-line argument.

    Logging for the ``mrjob`` logger is sent to a stream unless ``--quiet``
    was given; ``--verbose`` turns on debug output.
    """
    # parse command-line args
    parser = make_option_parser()
    options, positional = parser.parse_args()

    if len(positional) != 1:
        parser.error('takes exactly one argument')
    job_flow_to_kill = positional[0]

    # set up logging
    wants_output = not options.quiet
    if wants_output:
        log_to_stream(name='mrjob', debug=options.verbose)

    # the runner is only needed as a way to obtain an EMR connection
    runner = EMRJobRunner(conf_path=options.conf_path)

    log.debug('Terminating job flow %s' % job_flow_to_kill)
    emr_conn = runner.make_emr_conn()
    emr_conn.terminate_jobflow(job_flow_to_kill)
    log.info('Terminated job flow %s' % job_flow_to_kill)
def find_waiting_flow(aws_access_key_id, aws_secret_access_key):
    """Return the ID of the last job flow found in the WAITING state.

    Every WAITING flow is printed as it is encountered. If no flow is
    WAITING, the string ``'NONE'`` is returned.
    """
    runner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access_key)
    all_flows = runner.make_emr_conn().describe_jobflows()

    waiting_id = 'NONE'
    for flow in all_flows:
        if flow.state != 'WAITING':
            continue
        # single-argument print() behaves the same on Python 2 and 3
        print('%s %s %s %s' % (flow, flow.name, flow.jobflowid, flow.state))
        waiting_id = flow.jobflowid

    return waiting_id
def find_waiting_flow(aws_access_key_id, aws_secret_access_key,
                      ssh_key_pair_file=''):
    """Return the ID of the last job flow found in the WAITING state.

    Each WAITING flow's ID and state are printed. When *ssh_key_pair_file*
    is non-empty, an ``ssh -i ...`` command line for the flow's master node
    is printed too.

    :param aws_access_key_id: passed to ``EMRJobRunner``.
    :param aws_secret_access_key: passed to ``EMRJobRunner``.
    :param ssh_key_pair_file: key file name used in the printed ssh
                              command; ``''`` disables that output.
    :return: the job flow ID, or the string ``'NONE'`` if no flow is
             WAITING.
    """
    # NOTE: the credentials are deliberately not echoed; printing the
    # secret key would leak it into terminals and captured logs.
    runner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access_key)
    emr_conn = runner.make_emr_conn()

    job_id = 'NONE'
    for flow in emr_conn.describe_jobflows():
        if flow.state != 'WAITING':
            continue

        # single-argument print() behaves the same on Python 2 and 3
        print('%s %s' % (flow.jobflowid, flow.state))
        ip_address = flow.masterpublicdnsname
        if ssh_key_pair_file != '':
            print('ssh -i %s hadoop@%s' % (ssh_key_pair_file, ip_address))
        job_id = flow.jobflowid

    return job_id
def test_bootstrap_actions_get_added(self):
    """S3 bootstrap actions are passed through, followed by the master one.

    Covers an action with arguments and an action using the ``#name``
    suffix to pick an alternate name.
    """
    configure_hadoop = (
        's3://elasticmapreduce/bootstrap-actions/configure-hadoop'
        ' -m,mapred.tasktracker.map.tasks.maximum=1')
    renamed_script = 's3://foo/bar#xyzzy'  # use alternate name for script

    runner = EMRJobRunner(
        conf_path=False,
        bootstrap_actions=[configure_hadoop, renamed_script],
        s3_sync_wait_time=0.01)

    launched_id = runner.make_persistent_job_flow()
    conn = runner.make_emr_conn()
    actions = conn.describe_jobflow(launched_id).bootstrapactions

    assert_equal(len(actions), 3)

    first = actions[0]
    assert_equal(
        first.path,
        's3://elasticmapreduce/bootstrap-actions/configure-hadoop')
    assert_equal(first.args, ['-m,mapred.tasktracker.map.tasks.maximum=1'])
    assert_equal(first.name, 'configure-hadoop')

    second = actions[1]
    assert_equal(second.path, 's3://foo/bar')
    assert_equal(second.args, [])
    assert_equal(second.name, 'xyzzy')

    # check for master bootstrap script
    master = actions[2]
    assert master.path.startswith('s3://mrjob-')
    assert master.path.endswith('b.py')
    assert_equal(master.args, [])
    assert_equal(master.name, 'master')

    # make sure master bootstrap script is on S3
    assert runner.path_exists(master.path)
def find_all_flows(aws_access_key_id, aws_secret_access_key):
    """Describe all job flows visible to the given AWS credentials.

    Prints a progress message after building the runner and another after
    opening the EMR connection, then returns the result of
    ``describe_jobflows()``.
    """
    job_runner = EMRJobRunner(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    # single-argument print() behaves the same on Python 2 and 3
    print('got job runner')

    connection = job_runner.make_emr_conn()
    print('made EMR connection')

    flows = connection.describe_jobflows()
    return flows
def inspect_and_maybe_terminate_job_flows(
        conf_paths=None,
        dry_run=False,
        max_hours_idle=None,
        mins_to_end_of_hour=None,
        now=None,
        pool_name=None,
        pooled_only=False,
        unpooled_only=False,
        max_mins_locked=None,
        quiet=False,
        **kwargs):
    """Classify every job flow and terminate the idle ones that qualify.

    :param conf_paths: passed to ``EMRJobRunner``.
    :param dry_run: forwarded to ``terminate_and_notify()``.
    :param max_hours_idle: keep job flows idle for this many hours or
                           less. Falls back to ``DEFAULT_MAX_HOURS_IDLE``
                           when both this and *mins_to_end_of_hour* are
                           ``None``.
    :param mins_to_end_of_hour: keep job flows that have pending steps, or
                                that are at least this many minutes from
                                the estimated end of the hour.
    :param now: reference time; defaults to ``datetime.utcnow()``.
    :param pool_name: if set, keep job flows whose pool differs from it.
    :param pooled_only: if true, keep job flows that are not in a pool.
    :param unpooled_only: if true, keep job flows that are in a pool.
    :param max_mins_locked: forwarded to ``terminate_and_notify()``.
    :param quiet: forwarded to ``terminate_and_notify()``.
    :param kwargs: extra keyword arguments for ``EMRJobRunner``.
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_paths=conf_paths, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, has pending steps, idle time,
    # estimated time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                    (pending or
                     time_to_end_of_hour >= timedelta(
                         minutes=mins_to_end_of_hour))):
                continue

            if pooled_only and pool is None:
                continue

            if unpooled_only and pool is not None:
                continue

            if pool_name is not None and pool != pool_name:
                continue

            to_terminate.append(
                (jf, pending, time_idle, time_to_end_of_hour))

    # argument order must follow the labels in the message
    # (bootstrapping first, then running)
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' %
        (num_bootstrapping, num_running, num_pending, num_idle,
         num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Walk all clusters, tally their states, and terminate idle ones.

    A cluster that is done, starting, bootstrapping, or has a running step
    is only counted. Any other cluster is treated as idle (or pending) and
    is handed to ``_terminate_and_notify()`` unless one of the filters
    (*max_hours_idle*, *mins_to_end_of_hour*, *pooled_only*,
    *unpooled_only*, *pool_name*) says to keep it. Extra keyword arguments
    go to ``EMRJobRunner``. A summary of the tallies is logged at the end.
    """
    now = datetime.utcnow() if now is None else now

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    num_starting = num_bootstrapping = num_done = 0
    num_idle = num_pending = num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for summary in _yield_all_clusters(emr_conn):
        cluster_id = summary.id

        # states that can be recognized from the summary alone
        if _is_cluster_done(summary):
            num_done += 1
            continue
        if _is_cluster_starting(summary):
            num_starting += 1
            continue
        if _is_cluster_bootstrapping(summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = _list_all_steps(emr_conn, cluster_id)

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        time_to_end_of_hour = _est_time_to_hour(summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(
            _yield_all_bootstrap_actions(emr_conn, cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug(
            'cluster %s %s for %s, %s to end of hour, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             strip_microseconds(time_to_end_of_hour),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             summary.name))

        # filter out clusters that don't meet our criteria; the checks
        # short-circuit in the same order as separate `continue` guards.
        # (mins_to_end_of_hour doesn't apply to jobs with pending steps)
        if ((max_hours_idle is not None and
             time_idle <= timedelta(hours=max_hours_idle)) or
                (mins_to_end_of_hour is not None and
                 (is_pending or
                  time_to_end_of_hour >= timedelta(
                      minutes=mins_to_end_of_hour))) or
                (pooled_only and pool is None) or
                (unpooled_only and pool is not None) or
                (pool_name is not None and pool != pool_name)):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary.name,
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            num_starting, num_bootstrapping, num_running,
            num_pending, num_idle, num_done))
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
    max_mins_locked=None,
    quiet=False,
    **kwargs
):
    """Classify every job flow and terminate the idle ones that qualify.

    :param conf_path: passed to ``EMRJobRunner``.
    :param dry_run: forwarded to ``terminate_and_notify()``.
    :param max_hours_idle: keep job flows idle for this many hours or
                           less. Falls back to ``DEFAULT_MAX_HOURS_IDLE``
                           when both this and *mins_to_end_of_hour* are
                           ``None``.
    :param mins_to_end_of_hour: keep job flows that have pending steps, or
                                that are at least this many minutes from
                                the estimated end of the hour.
    :param now: reference time; defaults to ``datetime.utcnow()``.
    :param pool_name: if set, keep job flows whose pool differs from it.
    :param pooled_only: if true, keep job flows that are not in a pool.
    :param unpooled_only: if true, keep job flows that are in a pool.
    :param max_mins_locked: forwarded to ``terminate_and_notify()``.
    :param quiet: forwarded to ``terminate_and_notify()``.
    :param kwargs: extra keyword arguments for ``EMRJobRunner``.
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_path=conf_path, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, has pending steps, idle time,
    # estimated time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                    (pending or
                     time_to_end_of_hour >= timedelta(
                         minutes=mins_to_end_of_hour))):
                continue

            if pooled_only and pool is None:
                continue

            if unpooled_only and pool is not None:
                continue

            if pool_name is not None and pool != pool_name:
                continue

            to_terminate.append(
                (jf, pending, time_idle, time_to_end_of_hour))

    # argument order must follow the labels in the message
    # (bootstrapping first, then running)
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
            num_bootstrapping, num_running, num_pending, num_idle,
            num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
def test_us_west_1(self):
    """``aws_region='us-west-1'`` selects the us-west-1 EMR and S3 hosts."""
    west_runner = EMRJobRunner(conf_path=False, aws_region='us-west-1')

    emr_conn = west_runner.make_emr_conn()
    assert_equal(emr_conn.endpoint,
                 'us-west-1.elasticmapreduce.amazonaws.com')

    s3_conn = west_runner.make_s3_conn()
    assert_equal(s3_conn.endpoint, 's3-us-west-1.amazonaws.com')
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Walk all clusters, tally their states, and terminate idle ones.

    Clusters that are done, starting, bootstrapping, non-streaming, or
    that have a running step are only counted. Every remaining cluster is
    counted as pending or idle and passed to ``_terminate_and_notify()``
    unless a filter (*max_hours_idle*, *mins_to_end_of_hour*,
    *pooled_only*, *unpooled_only*, *pool_name*) excludes it. Extra
    keyword arguments go to ``EMRJobRunner``. The tallies are logged once
    the loop has finished.
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    use_default_idle_limit = (max_hours_idle is None and
                              mins_to_end_of_hour is None)
    if use_default_idle_limit:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster in _yield_all_clusters(emr_conn):
        cluster_id = cluster.id

        # check if cluster is done
        if _is_cluster_done(cluster):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = _list_all_steps(emr_conn, cluster_id)

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if _is_cluster_non_streaming(steps):
            num_non_streaming += 1
            continue

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(
            _yield_all_bootstrap_actions(emr_conn, cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        if is_pending:
            num_pending += 1
            status_word = 'pending'
        else:
            num_idle += 1
            status_word = 'idle'

        if pool is None:
            pool_desc = 'unpooled'
        else:
            pool_desc = 'in %s pool' % pool

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' %
                  (cluster_id,
                   status_word,
                   strip_microseconds(time_idle),
                   strip_microseconds(time_to_end_of_hour),
                   pool_desc,
                   cluster.name))

        # filter out clusters that don't meet our criteria
        if max_hours_idle is not None:
            if time_idle <= timedelta(hours=max_hours_idle):
                continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if mins_to_end_of_hour is not None:
            if is_pending:
                continue
            if time_to_end_of_hour >= timedelta(
                    minutes=mins_to_end_of_hour):
                continue

        if pooled_only and pool is None:
            continue

        if unpooled_only and pool is not None:
            continue

        if pool_name is not None and pool != pool_name:
            continue

        # terminate idle cluster
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=cluster.name,
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              time_to_end_of_hour=time_to_end_of_hour,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d active non-streaming, %d done' %
             (num_starting, num_bootstrapping, num_running,
              num_pending, num_idle, num_non_streaming, num_done))