def job_flow_to_usage_data(job_flow, basic_summary=None, now=None):
    r"""Break billing/usage information for a job flow down by job.

    :param job_flow: a :py:class:`boto.emr.EmrObject`
    :param basic_summary: a basic summary of the job flow, returned by
                          :py:func:`job_flow_to_basic_summary`. If this
                          is ``None``, we'll call
                          :py:func:`job_flow_to_basic_summary` ourselves.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running. If
    the job flow hasn't started yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
    * *end_billing*: the effective end of the job for billing purposes, either
      when the next job starts, the current time if the job
      is still running, or the end of the next full hour
      in the job flow.
    * *nih_billed*: normalized instance hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the job flow
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the job flow
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    * *step_num*: number parsed from the step's name, or ``None`` for the
      bootstrapping step or a step whose name couldn't be parsed
    """
    jf = basic_summary or job_flow_to_basic_summary(job_flow)

    if now is None:
        now = datetime.utcnow()

    if not jf['start']:
        return []

    # Figure out billing rate per second for the job, given that
    # normalizedinstancehours is how much we're charged up until
    # the next full hour.
    full_hours = math.ceil(to_secs(jf['ran']) / 60.0 / 60.0)
    # A job flow that has run for zero seconds rounds up to zero hours;
    # use a rate of 0.0 rather than dividing by zero.
    if full_hours:
        nih_per_sec = jf['nih'] / (full_hours * 3600.0)
    else:
        nih_per_sec = 0.0

    # Don't actually count a step as billed for the full hour until
    # the job flow finishes. This means that our total "nih_billed"
    # will be less than normalizedinstancehours in the job flow, but it
    # also keeps stats stable for steps that have already finished.
    if jf['end']:
        jf_end_billing = jf['start'] + timedelta(hours=full_hours)
    else:
        jf_end_billing = now

    intervals = []

    # add a fake step for the job that started the job flow, and credit
    # it for time spent bootstrapping. If the job flow ended without ever
    # becoming ready, bootstrapping stopped when the job flow did (not "now").
    intervals.append({
        'label': jf['label'],
        'owner': jf['owner'],
        'start': jf['start'],
        'end': jf['ready'] or jf['end'] or now,
        'step_num': None,
    })

    for step in (getattr(job_flow, 'steps', None) or ()):
        # we've reached the last step that's actually run
        if not hasattr(step, 'startdatetime'):
            break

        step_start = to_datetime(step.startdatetime)

        step_end = to_datetime(getattr(step, 'enddatetime', None))

        if step_end is None:
            if jf['end']:
                # step started running and was cancelled.
                # credit it for 0 usage
                step_end = step_start
            else:
                # step is still running
                step_end = now

        m = STEP_NAME_RE.match(getattr(step, 'name', ''))
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            'label': step_label,
            'owner': step_owner,
            'start': step_start,
            'end': step_end,
            'step_num': step_num,
        })

    # fill in end_billing: each interval is billed until the next one starts,
    # and the last one is billed until the end of the job flow's billing
    for interval, next_interval in zip(intervals, intervals[1:]):
        interval['end_billing'] = next_interval['start']

    intervals[-1]['end_billing'] = jf_end_billing

    # fill normalized usage information
    for interval in intervals:

        interval['nih_used'] = (
            nih_per_sec * to_secs(interval['end'] - interval['start']))

        interval['date_to_nih_used'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in subdivide_interval_by_date(
                interval['start'], interval['end']).items())

        interval['nih_billed'] = (
            nih_per_sec *
            to_secs(interval['end_billing'] - interval['start']))

        interval['date_to_nih_billed'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in subdivide_interval_by_date(
                interval['start'], interval['end_billing']).items())

        # time billed but not used
        interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used']

        # only dates with a nonzero difference get an entry
        interval['date_to_nih_bbnu'] = {}
        for d, nih_billed in interval['date_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['date_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['date_to_nih_bbnu'][d] = nih_bbnu

    return intervals
def cluster_to_usage_data(cluster, basic_summary=None, now=None):
    r"""Break billing/usage information for a job flow down by job.

    :param cluster: a :py:class:`boto.emr.EmrObject`
    :param basic_summary: a basic summary of the job flow, returned by
                          :py:func:`cluster_to_basic_summary`. If this
                          is ``None``, we'll call
                          :py:func:`cluster_to_basic_summary` ourselves.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running. If
    the job flow hasn't started yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
    * *end_billing*: the effective end of the job for billing purposes, either
      when the next job starts, the current time if the job
      is still running, or the end of the next full hour
      in the job flow.
    * *nih_billed*: normalized instance hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used on that
      date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours billed/used/billed but not used during
      the hour starting at that time
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the job flow
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the job flow
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    * *step_num*: number parsed from the step's name, or ``None`` for the
      bootstrapping step or a step whose name couldn't be parsed
    """
    bcs = basic_summary or cluster_to_basic_summary(cluster)

    if now is None:
        now = datetime.utcnow()

    if not bcs['created']:
        return []

    # Figure out billing rate per second for the job, given that
    # normalizedinstancehours is how much we're charged up until
    # the next full hour.
    full_hours = math.ceil(to_secs(bcs['ran']) / 60.0 / 60.0)
    # A cluster that has run for zero seconds rounds up to zero hours;
    # use a rate of 0.0 rather than dividing by zero.
    if full_hours:
        nih_per_sec = bcs['nih'] / (full_hours * 3600.0)
    else:
        nih_per_sec = 0.0

    # Don't actually count a step as billed for the full hour until
    # the job flow finishes. This means that our total "nih_billed"
    # will be less than normalizedinstancehours in the job flow, but it
    # also keeps stats stable for steps that have already finished.
    if bcs['end']:
        cluster_end_billing = bcs['created'] + timedelta(hours=full_hours)
    else:
        cluster_end_billing = now

    intervals = []

    # make a fake step for cluster startup and bootstrapping, so we don't
    # consider that wasted.
    intervals.append({
        'label': bcs['label'],
        'owner': bcs['owner'],
        'start': bcs['created'],
        'end': bcs['ready'] or bcs['end'] or now,
        'step_num': None,
    })

    # "or ()" also covers a steps attribute that is present but None
    for step in (getattr(cluster, 'steps', None) or ()):
        step_status = getattr(step, 'status', None)
        step_timeline = getattr(step_status, 'timeline', None)

        # we've reached the last step that's actually run
        if not hasattr(step_timeline, 'startdatetime'):
            break

        step_start = to_datetime(step_timeline.startdatetime)

        step_end = to_datetime(getattr(step_timeline, 'enddatetime', None))

        if step_end is None:
            if bcs['end']:
                # step started running and was cancelled.
                # credit it for 0 usage
                step_end = step_start
            else:
                # step is still running
                step_end = now

        m = STEP_NAME_RE.match(getattr(step, 'name', ''))
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            'label': step_label,
            'owner': step_owner,
            'start': step_start,
            'end': step_end,
            'step_num': step_num,
        })

    # fill in end_billing: each interval is billed until the next one starts,
    # and the last one is billed until the end of the cluster's billing
    for interval, next_interval in zip(intervals, intervals[1:]):
        interval['end_billing'] = next_interval['start']

    intervals[-1]['end_billing'] = cluster_end_billing

    # fill normalized usage information
    for interval in intervals:

        interval['nih_used'] = (
            nih_per_sec * to_secs(interval['end'] - interval['start']))

        interval['date_to_nih_used'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in subdivide_interval_by_date(
                interval['start'], interval['end']).items())

        interval['hour_to_nih_used'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in subdivide_interval_by_hour(
                interval['start'], interval['end']).items())

        interval['nih_billed'] = (
            nih_per_sec *
            to_secs(interval['end_billing'] - interval['start']))

        interval['date_to_nih_billed'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in subdivide_interval_by_date(
                interval['start'], interval['end_billing']).items())

        interval['hour_to_nih_billed'] = dict(
            (d, nih_per_sec * secs)
            for d, secs
            in subdivide_interval_by_hour(
                interval['start'], interval['end_billing']).items())

        # time billed but not used
        interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used']

        # only dates/hours with a nonzero difference get an entry
        interval['date_to_nih_bbnu'] = {}
        for d, nih_billed in interval['date_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['date_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['date_to_nih_bbnu'][d] = nih_bbnu

        interval['hour_to_nih_bbnu'] = {}
        for d, nih_billed in interval['hour_to_nih_billed'].items():
            nih_bbnu = nih_billed - interval['hour_to_nih_used'].get(d, 0.0)
            if nih_bbnu:
                interval['hour_to_nih_bbnu'][d] = nih_bbnu

    return intervals
def job_flow_to_usage_data(job_flow, basic_summary=None, now=None):
    r"""Break billing/usage information for a job flow down by job.

    :param job_flow: a :py:class:`boto.emr.EmrObject`
    :param basic_summary: a basic summary of the job flow, returned by
                          :py:func:`job_flow_to_basic_summary`. If this
                          is ``None``, we'll call
                          :py:func:`job_flow_to_basic_summary` ourselves.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running. If
    the job flow hasn't started yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
    * *end_billing*: the effective end of the job for billing purposes, either
      when the next job starts, the current time if the job
      is still running, or the end of the next full hour
      in the job flow.
    * *nih_billed*: normalized instance hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used on that
      date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours billed/used/billed but not used during
      the hour starting at that time
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the job flow
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the job flow
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    * *step_num*: number parsed from the step's name, or ``None`` for the
      bootstrapping step or a step whose name couldn't be parsed
    """
    jf = basic_summary or job_flow_to_basic_summary(job_flow)

    if now is None:
        now = datetime.utcnow()

    if not jf["start"]:
        return []

    # Figure out billing rate per second for the job, given that
    # normalizedinstancehours is how much we're charged up until
    # the next full hour.
    full_hours = math.ceil(to_secs(jf["ran"]) / 60.0 / 60.0)
    # A job flow that has run for zero seconds rounds up to zero hours;
    # use a rate of 0.0 rather than dividing by zero.
    if full_hours:
        nih_per_sec = jf["nih"] / (full_hours * 3600.0)
    else:
        nih_per_sec = 0.0

    # Don't actually count a step as billed for the full hour until
    # the job flow finishes. This means that our total "nih_billed"
    # will be less than normalizedinstancehours in the job flow, but it
    # also keeps stats stable for steps that have already finished.
    if jf["end"]:
        jf_end_billing = jf["start"] + timedelta(hours=full_hours)
    else:
        jf_end_billing = now

    intervals = []

    # add a fake step for the job that started the job flow, and credit
    # it for time spent bootstrapping. If the job flow ended without ever
    # becoming ready, bootstrapping stopped when the job flow did (not "now").
    intervals.append(
        {
            "label": jf["label"],
            "owner": jf["owner"],
            "start": jf["start"],
            "end": jf["ready"] or jf["end"] or now,
            "step_num": None,
        }
    )

    for step in getattr(job_flow, "steps", None) or ():
        # we've reached the last step that's actually run
        if not hasattr(step, "startdatetime"):
            break

        step_start = to_datetime(step.startdatetime)

        step_end = to_datetime(getattr(step, "enddatetime", None))

        if step_end is None:
            if jf["end"]:
                # step started running and was cancelled.
                # credit it for 0 usage
                step_end = step_start
            else:
                # step is still running
                step_end = now

        m = STEP_NAME_RE.match(getattr(step, "name", ""))
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append(
            {
                "label": step_label,
                "owner": step_owner,
                "start": step_start,
                "end": step_end,
                "step_num": step_num,
            }
        )

    # fill in end_billing: each interval is billed until the next one starts,
    # and the last one is billed until the end of the job flow's billing
    for interval, next_interval in zip(intervals, intervals[1:]):
        interval["end_billing"] = next_interval["start"]

    intervals[-1]["end_billing"] = jf_end_billing

    # fill normalized usage information
    for interval in intervals:

        interval["nih_used"] = nih_per_sec * to_secs(
            interval["end"] - interval["start"])

        interval["date_to_nih_used"] = dict(
            (d, nih_per_sec * secs)
            for d, secs in subdivide_interval_by_date(
                interval["start"], interval["end"]).items()
        )

        interval["hour_to_nih_used"] = dict(
            (d, nih_per_sec * secs)
            for d, secs in subdivide_interval_by_hour(
                interval["start"], interval["end"]).items()
        )

        interval["nih_billed"] = nih_per_sec * to_secs(
            interval["end_billing"] - interval["start"])

        interval["date_to_nih_billed"] = dict(
            (d, nih_per_sec * secs)
            for d, secs in subdivide_interval_by_date(
                interval["start"], interval["end_billing"]).items()
        )

        interval["hour_to_nih_billed"] = dict(
            (d, nih_per_sec * secs)
            for d, secs in subdivide_interval_by_hour(
                interval["start"], interval["end_billing"]).items()
        )

        # time billed but not used
        interval["nih_bbnu"] = interval["nih_billed"] - interval["nih_used"]

        # only dates/hours with a nonzero difference get an entry
        interval["date_to_nih_bbnu"] = {}
        for d, nih_billed in interval["date_to_nih_billed"].items():
            nih_bbnu = nih_billed - interval["date_to_nih_used"].get(d, 0.0)
            if nih_bbnu:
                interval["date_to_nih_bbnu"][d] = nih_bbnu

        interval["hour_to_nih_bbnu"] = {}
        for d, nih_billed in interval["hour_to_nih_billed"].items():
            nih_bbnu = nih_billed - interval["hour_to_nih_used"].get(d, 0.0)
            if nih_bbnu:
                interval["hour_to_nih_bbnu"][d] = nih_bbnu

    return intervals