def test_owner_and_label_switches(self):
    # Explicit --owner / --label command-line switches should be carried
    # into the job key (group 1 is the label, group 2 is the owner).
    job = MRTwoStepJob(['--no-conf', '--owner=ads', '--label=ads_chain'])
    job_key = job.make_runner().get_job_key()

    label, owner = JOB_KEY_RE.match(job_key).group(1, 2)

    self.assertEqual(label, 'ads_chain')
    self.assertEqual(owner, 'ads')
def test_owner_and_label_kwargs(self):
    # Same check as the command-line variant, but owner and label are
    # handed to the runner constructor as keyword arguments.
    job_key = InlineMRJobRunner(
        conf_paths=[], owner='ads', label='ads_chain').get_job_key()

    label, owner = JOB_KEY_RE.match(job_key).group(1, 2)

    self.assertEqual(label, 'ads_chain')
    self.assertEqual(owner, 'ads')
def test_auto_owner(self):
    # With no explicit owner, the runner should derive it from the current
    # user. The fixture user name must be the same value the assertion
    # below expects; a masked placeholder here makes the test unpassable.
    # NOTE(review): this relies on USER being the variable the user lookup
    # ends up reading (i.e. the test fixture has cleared anything with
    # higher priority, such as LOGNAME) -- confirm against setUp.
    os.environ['USER'] = 'mcp'

    runner = InlineMRJobRunner(conf_paths=[])
    match = JOB_KEY_RE.match(runner.get_job_key())

    # no script was supplied, so the label falls back to 'no_script'
    self.assertEqual(match.group(1), 'no_script')
    self.assertEqual(match.group(2), 'mcp')
def test_empty_no_user(self):
    # Flip the fixture flag before building the runner.
    # NOTE(review): presumably a patched getuser() consults this flag and
    # raises when it is set -- confirm against the test case's setUp.
    self.getuser_should_fail = True

    job_key = InlineMRJobRunner(conf_paths=[]).get_job_key()
    label, owner = JOB_KEY_RE.match(job_key).group(1, 2)

    # neither a script nor a user is available, so both fall back
    self.assertEqual(label, 'no_script')
    self.assertEqual(owner, 'no_user')
def test_auto_everything(self):
    # Taken before the runner exists, so the timestamp embedded in the job
    # key must be at or after this moment.
    test_start = datetime.datetime.utcnow()

    # The fixture user name must be the same value asserted below; a
    # masked placeholder here makes the owner assertion unpassable.
    # NOTE(review): relies on USER being what the user lookup reads --
    # confirm against setUp.
    os.environ['USER'] = 'mcp'
    runner = MRTwoStepJob(['--no-conf']).make_runner()
    match = JOB_KEY_RE.match(runner.get_job_key())

    self.assertEqual(match.group(1), 'mr_two_step_job')
    self.assertEqual(match.group(2), 'mcp')

    # Groups 3 and 4 concatenate to YYYYmmddHHMMSS; group 5 carries the
    # microseconds, which strptime is not asked to parse here.
    job_start = datetime.datetime.strptime(
        match.group(3) + match.group(4), '%Y%m%d%H%M%S')
    job_start = job_start.replace(microsecond=int(match.group(5)))

    # The job key's timestamp should fall within 5 seconds after test_start.
    self.assertGreaterEqual(job_start, test_start)
    self.assertLessEqual(job_start - test_start,
                         datetime.timedelta(seconds=5))
def cluster_to_basic_summary(cluster, now=None):
    """Flatten a job flow description into a plain dictionary, so that
    callers can index fields directly rather than using
    :py:func:`getattr` on a possibly incomplete object.

    :param cluster: a :py:class:`boto.emr.EmrObject`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with the keys below. Unless noted otherwise, a
    value is ``None`` when the corresponding field is missing from the
    job flow.

    * *id*: job flow ID, or ``None`` (this should never happen)
    * *name*: job flow name, or ``None`` (this should never happen)
    * *created*: UTC `datetime.datetime` that the job flow was created,
      or ``None``
    * *ready*: UTC `datetime.datetime` that the job flow finished
      bootstrapping, or ``None``
    * *end*: UTC `datetime.datetime` that the job flow finished, or
      ``None``
    * *ran*: How long the job flow ran, or has been running, as a
      :py:class:`datetime.timedelta`. This is ``timedelta(0)`` if the
      job flow hasn't started.
    * *state*: The job flow's state as a string (e.g. ``'RUNNING'``)
    * *num_steps*: Number of steps in the job flow.
    * *pool*: pool name (e.g. ``'default'``) if the job flow is pooled,
      otherwise ``None``.
    * *label*: The label for the job flow (usually the module name of the
      :py:class:`~mrjob.job.MRJob` script that started it), or ``None``
      for non-:py:mod:`mrjob` job flows.
    * *owner*: The owner for the job flow (usually the user that started
      it), or ``None`` for non-:py:mod:`mrjob` job flows.
    * *nih*: number of normalized instance hours used by the job flow.
    """
    if now is None:
        now = datetime.utcnow()

    flow_id = getattr(cluster, 'id', None)
    flow_name = getattr(cluster, 'name', None)

    # status (and its timeline) may be absent; getattr() on None simply
    # yields the default for everything read below
    status = getattr(cluster, 'status', None)
    timeline = getattr(status, 'timeline', None)

    created = to_datetime(getattr(timeline, 'creationdatetime', None))
    ready = to_datetime(getattr(timeline, 'readydatetime', None))
    end = to_datetime(getattr(timeline, 'enddatetime', None))

    # a job flow with no end time is measured up to *now*
    ran = (end or now) - created if created else timedelta(0)

    state = getattr(status, 'state', None)
    num_steps = len(getattr(cluster, 'steps', ()))

    # pooled job flows are recognized by their last bootstrap action, which
    # has exactly two args: one starting with 'pool-', then the pool name
    pool = None
    bootstrap_actions = getattr(cluster, 'bootstrapactions', None)
    if bootstrap_actions:
        last_args = [arg.value for arg in bootstrap_actions[-1].args]
        if len(last_args) == 2 and last_args[0].startswith('pool-'):
            pool = last_args[1]

    # label and owner are only recoverable from names shaped like a job key
    key_match = JOB_KEY_RE.match(flow_name or '')
    if key_match:
        label, owner = key_match.group(1), key_match.group(2)
    else:
        label, owner = None, None

    nih = float(getattr(cluster, 'normalizedinstancehours', '0'))

    return {
        'id': flow_id,
        'name': flow_name,
        'created': created,
        'ready': ready,
        'end': end,
        'ran': ran,
        'state': state,
        'num_steps': num_steps,
        'pool': pool,
        'label': label,
        'owner': owner,
        'nih': nih,
    }
def cluster_to_basic_summary(cluster, now=None):
    """Pull creation time, owner, and similar fields out of a cluster
    description, so they can be referenced safely without resorting to
    :py:func:`getattr`.

    :param cluster: a :py:class:`boto.emr.EmrObject`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with the following keys. Values are ``None``
    when the corresponding field in the cluster is unavailable.

    * *created*: UTC `datetime.datetime` that the cluster was created,
      or ``None``
    * *end*: UTC `datetime.datetime` that the cluster finished, or
      ``None``
    * *id*: cluster ID, or ``None`` (this should never happen)
    * *label*: The label for the cluster (usually the module name of the
      :py:class:`~mrjob.job.MRJob` script that started it), or
      ``None`` for non-:py:mod:`mrjob` clusters.
    * *name*: cluster name, or ``None`` (this should never happen)
    * *nih*: number of normalized instance hours used by the cluster.
    * *num_steps*: Number of steps in the cluster.
    * *owner*: The owner for the cluster (usually the user that started
      it), or ``None`` for non-:py:mod:`mrjob` clusters.
    * *pool*: pool name (e.g. ``'default'``) if the cluster is pooled,
      otherwise ``None``.
    * *ran*: How long the cluster ran, or has been running, as a
      :py:class:`datetime.timedelta`. This will be ``timedelta(0)`` if
      the cluster hasn't started.
    * *ready*: UTC `datetime.datetime` that the cluster finished
      bootstrapping, or ``None``
    * *state*: The cluster's state as a string (e.g. ``'RUNNING'``)
    """
    if now is None:
        now = datetime.utcnow()

    summary = {
        'id': getattr(cluster, 'id', None),
        'name': getattr(cluster, 'name', None),
    }

    # a missing status yields None, and getattr() on None falls through
    # to the defaults below
    status = getattr(cluster, 'status', None)
    timeline = getattr(status, 'timeline', None)

    for key, attr in (('created', 'creationdatetime'),
                      ('ready', 'readydatetime'),
                      ('end', 'enddatetime')):
        summary[key] = to_datetime(getattr(timeline, attr, None))

    if not summary['created']:
        summary['ran'] = timedelta(0)
    else:
        # still-running clusters have no end time; measure up to *now*
        summary['ran'] = (summary['end'] or now) - summary['created']

    summary['state'] = getattr(status, 'state', None)
    summary['num_steps'] = len(getattr(cluster, 'steps', ()))

    # pooled clusters are identified by their final bootstrap action:
    # exactly two args, the first starting with 'pool-', the second
    # being the pool name
    summary['pool'] = None
    actions = getattr(cluster, 'bootstrapactions', None)
    if actions:
        values = [arg.value for arg in actions[-1].args]
        if len(values) == 2 and values[0].startswith('pool-'):
            summary['pool'] = values[1]

    # only names that look like job keys carry a label and an owner
    key_match = JOB_KEY_RE.match(summary['name'] or '')
    summary['label'] = key_match.group(1) if key_match else None
    summary['owner'] = key_match.group(2) if key_match else None

    summary['nih'] = float(getattr(cluster, 'normalizedinstancehours', '0'))

    return summary
def test_auto_label(self):
    # With no explicit label or owner, the job key should use the job's
    # module name as label and the current user as owner.
    job_key = MRTwoStepJob(['--no-conf']).make_runner().get_job_key()

    label, owner = JOB_KEY_RE.match(job_key).group(1, 2)

    self.assertEqual(label, 'mr_two_step_job')
    self.assertEqual(owner, getpass.getuser())
def test_empty(self):
    # A bare runner (no script, no config) should label the job
    # 'no_script' and still pick up the current user as owner.
    job_key = InlineMRJobRunner(conf_paths=[]).get_job_key()

    label, owner = JOB_KEY_RE.match(job_key).group(1, 2)

    self.assertEqual(label, 'no_script')
    self.assertEqual(owner, getpass.getuser())