def test_too_few_args(self):
    # a 'master' bootstrap action carrying only the pool-hash argument
    # (no pool name) is expected to yield (None, None)
    hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
    master_action = MockEmrObject(args=[hash_arg], name='master')
    jf = MockEmrObject(bootstrapactions=[master_action])

    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_first_arg_doesnt_start_with_pool(self):
    # two arguments, but the first one lacks the 'pool-' prefix
    arg_values = ['cowsay', 'mrjob']
    action = MockEmrObject(args=[MockEmrObject(value=v) for v in arg_values])
    jf = MockEmrObject(bootstrapactions=[action])

    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_currently_running(self):
    # expectations for the 'j-CURRENTLY_RUNNING' mock job flow
    jf = self.mock_emr_job_flows['j-CURRENTLY_RUNNING']

    # state flags
    self.assertEqual(is_job_flow_done(jf), False)
    self.assertEqual(is_job_flow_running(jf), True)
    self.assertEqual(is_job_flow_non_streaming(jf), False)

    # timing
    self.assertEqual(
        time_job_flow_idle(jf, self.now),
        timedelta(0))
    self.assertEqual(
        est_time_to_hour(jf, self.now),
        timedelta(minutes=45))

    # pooling
    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_pooled_job_flow(self):
    # a single (unnamed) bootstrap action whose two arguments are the
    # 'pool-<hash>' string and the pool name
    pool_hash = '0123456789abcdef0123456789abcdef'
    pool_name = 'reflecting'

    action = MockEmrObject(args=[
        MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
        MockEmrObject(value='reflecting'),
    ])
    jf = MockEmrObject(bootstrapactions=[action])

    self.assertEqual(pool_hash_and_name(jf), (pool_hash, pool_name))
def test_idle_and_failed(self):
    # expectations for the 'j-IDLE_AND_FAILED' mock job flow
    jf = self.mock_emr_job_flows['j-IDLE_AND_FAILED']

    # state flags
    self.assertEqual(is_job_flow_done(jf), False)
    self.assertEqual(is_job_flow_running(jf), False)
    self.assertEqual(is_job_flow_non_streaming(jf), False)

    # timing
    self.assertEqual(
        time_job_flow_idle(jf, self.now),
        timedelta(hours=3))
    self.assertEqual(
        est_time_to_hour(jf, self.now),
        timedelta(hours=1))

    # pooling
    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_hadoop_debugging_job_flow(self):
    # expectations for the 'j-HADOOP_DEBUGGING' mock job flow
    jf = self.mock_emr_job_flows['j-HADOOP_DEBUGGING']

    # state flags
    self.assertEqual(is_job_flow_done(jf), False)
    self.assertEqual(is_job_flow_running(jf), False)
    self.assertEqual(is_job_flow_non_streaming(jf), False)

    # timing
    self.assertEqual(
        time_job_flow_idle(jf, self.now),
        timedelta(hours=2))
    self.assertEqual(
        est_time_to_hour(jf, self.now),
        timedelta(hours=1))

    # pooling
    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_too_many_args(self):
    # a bootstrap action with three arguments is expected to yield
    # (None, None)
    arg_values = ['cowsay', '-b', 'mrjob']
    action = MockEmrObject(args=[MockEmrObject(value=v) for v in arg_values])
    jf = MockEmrObject(bootstrapactions=[action])

    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_first_arg_doesnt_start_with_pool(self):
    # the action is named 'master' and has two arguments, but the first
    # argument lacks the 'pool-' prefix
    first_arg = MockEmrObject(value='cowsay')
    second_arg = MockEmrObject(value='mrjob')
    master_action = MockEmrObject(args=[first_arg, second_arg], name='master')
    jf = MockEmrObject(bootstrapactions=[master_action])

    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_too_few_args(self):
    # the 'master' action has the pool-hash argument but no pool name
    only_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
    jf = MockEmrObject(
        bootstrapactions=[MockEmrObject(args=[only_arg], name='master')])

    expected = (None, None)
    self.assertEqual(pool_hash_and_name(jf), expected)
def test_bootstrap_action_isnt_named_master(self):
    # the arguments look like pool arguments, but the bootstrap action
    # is named 'apprentice' rather than 'master'
    pool_args = [
        MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
        MockEmrObject(value='reflecting'),
    ]
    action = MockEmrObject(args=pool_args, name='apprentice')
    jf = MockEmrObject(bootstrapactions=[action])

    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_hive_job_flow(self):
    # expectations for the 'j-HIVE' mock job flow; unlike its siblings
    # it is expected to be flagged as non-streaming
    jf = self.mock_emr_job_flows['j-HIVE']

    # state flags
    self.assertEqual(is_job_flow_done(jf), False)
    self.assertEqual(is_job_flow_running(jf), False)
    self.assertEqual(is_job_flow_non_streaming(jf), True)

    # timing
    self.assertEqual(
        time_job_flow_idle(jf, self.now),
        timedelta(hours=4))
    self.assertEqual(
        est_time_to_hour(jf, self.now),
        timedelta(hours=1))

    # pooling
    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_too_many_args(self):
    # a 'master' bootstrap action with three arguments is expected to
    # yield (None, None)
    three_args = [
        MockEmrObject(value='cowsay'),
        MockEmrObject(value='-b'),
        MockEmrObject(value='mrjob'),
    ]
    master_action = MockEmrObject(args=three_args, name='master')
    jf = MockEmrObject(bootstrapactions=[master_action])

    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_bootstrap_action_isnt_named_master(self):
    # pool-style arguments on an action named 'apprentice' (not 'master')
    # are expected to yield (None, None)
    hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
    name_arg = MockEmrObject(value='reflecting')
    apprentice = MockEmrObject(args=[hash_arg, name_arg], name='apprentice')
    jf = MockEmrObject(bootstrapactions=[apprentice])

    expected = (None, None)
    self.assertEqual(pool_hash_and_name(jf), expected)
def test_pooled(self):
    # expectations for the 'j-POOLED' mock job flow
    jf = self.mock_emr_job_flows['j-POOLED']

    # state flags
    self.assertEqual(is_job_flow_done(jf), False)
    self.assertEqual(is_job_flow_running(jf), False)
    self.assertEqual(is_job_flow_non_streaming(jf), False)

    # timing
    self.assertEqual(
        time_job_flow_idle(jf, self.now),
        timedelta(minutes=55))
    self.assertEqual(
        est_time_to_hour(jf, self.now),
        timedelta(minutes=5))

    # pooling: both the hash and the pool name are expected back
    self.assertEqual(
        pool_hash_and_name(jf),
        ('0123456789abcdef0123456789abcdef', 'reflecting'))
def test_pooled_job_flow(self):
    # a 'master' bootstrap action with 'pool-<hash>' and pool-name
    # arguments is expected to yield (<hash>, <name>)
    hash_arg = MockEmrObject(value='pool-0123456789abcdef0123456789abcdef')
    name_arg = MockEmrObject(value='reflecting')
    master_action = MockEmrObject(args=[hash_arg, name_arg], name='master')
    jf = MockEmrObject(bootstrapactions=[master_action])

    expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
    self.assertEqual(pool_hash_and_name(jf), expected)
def test_pooled_job_flow_with_other_bootstrap_actions(self):
    # two argument-less actions come before the 'master' action; the
    # pool hash and name are still expected back
    unrelated_actions = [
        MockEmrObject(args=[], name='action 0'),
        MockEmrObject(args=[], name='action 1'),
    ]
    master_action = MockEmrObject(
        args=[
            MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
            MockEmrObject(value='reflecting'),
        ],
        name='master')
    jf = MockEmrObject(bootstrapactions=unrelated_actions + [master_action])

    expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
    self.assertEqual(pool_hash_and_name(jf), expected)
def test_pooled_job_flow_with_max_hours_idle(self):
    # max hours idle is added AFTER the master bootstrap script,
    # which was a problem when we just look at the last action
    master_action = MockEmrObject(
        args=[
            MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
            MockEmrObject(value='reflecting'),
        ],
        name='master')
    idle_timeout_action = MockEmrObject(
        args=[
            MockEmrObject(value='900'),
            MockEmrObject(value='300'),
        ],
        name='idle timeout')
    jf = MockEmrObject(bootstrapactions=[master_action, idle_timeout_action])

    expected = ('0123456789abcdef0123456789abcdef', 'reflecting')
    self.assertEqual(pool_hash_and_name(jf), expected)
def test_pooled_job_flow_with_max_hours_idle(self):
    # max hours idle is added AFTER the master bootstrap script,
    # which was a problem when we just look at the last action
    pool_args = [
        MockEmrObject(value='pool-0123456789abcdef0123456789abcdef'),
        MockEmrObject(value='reflecting'),
    ]
    timeout_args = [MockEmrObject(value=v) for v in ('900', '300')]

    jf = MockEmrObject(bootstrapactions=[
        MockEmrObject(args=pool_args, name='master'),
        MockEmrObject(args=timeout_args, name='idle timeout'),
    ])

    self.assertEqual(
        pool_hash_and_name(jf),
        ('0123456789abcdef0123456789abcdef', 'reflecting'))
def assertJobFlowIs(
        self, jf,
        bootstrapping=False,
        done=False,
        from_end_of_hour=timedelta(hours=1),
        has_pending_steps=False,
        idle_for=timedelta(0),
        pool_hash=None,
        pool_name=None,
        running=False,
        streaming=True,
):
    """Assert that every job-flow inspection helper agrees with the
    expected values for *jf*.

    Each keyword argument is the expected result of one helper; the
    checks run one after another in the order the keywords are declared,
    and stop at the first mismatch.
    """
    actual_bootstrapping = is_job_flow_bootstrapping(jf)
    self.assertEqual(bootstrapping, actual_bootstrapping)

    actual_done = is_job_flow_done(jf)
    self.assertEqual(done, actual_done)

    actual_from_end_of_hour = est_time_to_hour(jf, self.now)
    self.assertEqual(from_end_of_hour, actual_from_end_of_hour)

    actual_pending = job_flow_has_pending_steps(jf)
    self.assertEqual(has_pending_steps, actual_pending)

    actual_idle_for = self.time_job_flow_idle(jf)
    self.assertEqual(idle_for, actual_idle_for)

    # hash and name are compared together, as a single pair
    expected_pool = (pool_hash, pool_name)
    self.assertEqual(expected_pool, pool_hash_and_name(jf))

    actual_running = is_job_flow_running(jf)
    self.assertEqual(running, actual_running)

    actual_streaming = is_job_flow_streaming(jf)
    self.assertEqual(streaming, actual_streaming)
def test_too_few_args(self):
    # a lone bootstrap action with an empty argument list
    argless_action = MockEmrObject(args=[])
    jf = MockEmrObject(bootstrapactions=[argless_action])

    self.assertEqual(pool_hash_and_name(jf), (None, None))
def inspect_and_maybe_terminate_job_flows(
    conf_paths=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
    max_mins_locked=None,
    quiet=False,
    **kwargs
):
    """Classify every job flow, pick the idle/pending ones that match the
    given criteria, and hand them to terminate_and_notify().

    Job flows that are done, bootstrapping, non-streaming or running are
    only counted; they are never candidates for termination.

    conf_paths -- passed through to EMRJobRunner
    dry_run, max_mins_locked, quiet -- passed through to
        terminate_and_notify()
    max_hours_idle -- skip job flows idle for this many hours or less.
        Falls back to DEFAULT_MAX_HOURS_IDLE when both this and
        mins_to_end_of_hour are None.
    mins_to_end_of_hour -- when set, skip job flows with pending steps,
        and job flows with at least this many minutes left before the
        estimated end of the hour
    now -- reference time for idle calculations (default:
        datetime.utcnow())
    pool_name -- when set, skip job flows whose pool name differs
    pooled_only -- skip job flows that are not in a pool
    unpooled_only -- skip job flows that are in a pool
    kwargs -- additional keyword arguments for EMRJobRunner
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_paths=conf_paths, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, has pending steps, idle time,
    # estimated time to end of hour)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                (pending or
                 time_to_end_of_hour >= timedelta(
                     minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # the counts must be listed in the same order as the labels in the
    # format string (bootstrapping comes before running)
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
            num_bootstrapping, num_running, num_pending, num_idle,
            num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
def test_empty_bootstrap_actions(self):
    # a job flow whose bootstrap action list is present but empty
    no_actions = []
    jf = MockEmrObject(bootstrapactions=no_actions)

    self.assertEqual(pool_hash_and_name(jf), (None, None))
def test_empty(self):
    # a job flow object constructed with no attributes at all
    jf = MockEmrObject()

    expected = (None, None)
    self.assertEqual(pool_hash_and_name(jf), expected)
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
    max_mins_locked=None,
    quiet=False,
    **kwargs
):
    """Classify every job flow, pick the idle/pending ones that match the
    given criteria, and hand them to terminate_and_notify().

    Job flows that are done, bootstrapping, non-streaming or running are
    only counted; they are never candidates for termination.

    conf_path -- passed through to EMRJobRunner
    dry_run, max_mins_locked, quiet -- passed through to
        terminate_and_notify()
    max_hours_idle -- skip job flows idle for this many hours or less.
        Falls back to DEFAULT_MAX_HOURS_IDLE when both this and
        mins_to_end_of_hour are None.
    mins_to_end_of_hour -- when set, skip job flows with pending steps,
        and job flows with at least this many minutes left before the
        estimated end of the hour
    now -- reference time for idle calculations (default:
        datetime.utcnow())
    pool_name -- when set, skip job flows whose pool name differs
    pooled_only -- skip job flows that are not in a pool
    unpooled_only -- skip job flows that are in a pool
    kwargs -- additional keyword arguments for EMRJobRunner
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_path=conf_path, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, has pending steps, idle time,
    # estimated time to end of hour)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                (pending or
                 time_to_end_of_hour >= timedelta(
                     minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # the counts must be listed in the same order as the labels in the
    # format string (bootstrapping comes before running)
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
            num_bootstrapping, num_running, num_pending, num_idle,
            num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
def test_too_few_args(self):
    # one bootstrap action, and it has no arguments at all
    empty_action = MockEmrObject(args=[])
    jf = MockEmrObject(bootstrapactions=[empty_action])

    expected = (None, None)
    self.assertEqual(pool_hash_and_name(jf), expected)