Exemplo n.º 1
0
def dump_summary(info, label):
    """Print a count/duration table for ``info`` and return the grand total.

    ``info`` maps ``(operation, platform, cell)`` keys to counts. For every
    key the matching ``(min, max, count, total)`` tuple is looked up in the
    module-level ``durations`` mapping and rendered next to the count. After
    the table, three breakdowns (by operation, by cell and by platform text)
    are printed through ``dump_breakdown``.

    Returns the sum of all counts found in ``info``.
    """
    print("-- %s by operation by cell by platform --" % (label,))

    table = prettytable.PrettyTable(["Operation", "Cell", "Platform", "Count",
                                     "Min", "Max", "Avg"])
    # Right-align every numeric column.
    for column in ("Count", "Min", "Max", "Avg"):
        table.align[column] = 'r'

    by_operation = {}
    by_cell = {}
    by_platform = {}
    grand_total = 0

    for key, count in info.items():
        operation, platform, cell = key

        # Fall back to "n/a" when the platform has no readable names.
        names = image_type.readable(platform)
        platform_text = ", ".join(names) if names else "n/a"

        lowest, highest, samples, elapsed = durations[key]
        average = float(elapsed) / float(samples)
        shown_min = dt.sec_to_str(lowest)
        shown_max = dt.sec_to_str(highest)
        # NOTE(review): the average is scaled by 100 before formatting while
        # min and max are not -- confirm this is intentional.
        shown_avg = dt.sec_to_str(average * 100.0)

        by_operation[operation] = by_operation.get(operation, 0) + count
        by_cell[cell] = by_cell.get(cell, 0) + count
        by_platform[platform_text] = by_platform.get(platform_text, 0) + count

        table.add_row([operation, cell, platform_text, count,
                       shown_min, shown_max, shown_avg])
        grand_total += count

    table.sortby = 'Count'
    print(table)

    dump_breakdown(by_operation, "Total %s by Operation" % label)
    dump_breakdown(by_cell, "Total %s by Cell" % label)
    dump_breakdown(by_platform, "Total %s by Platform" % label)

    # Trailing blank line after the breakdowns.
    print("")

    return grand_total
Exemplo n.º 2
0
def make_report(yesterday=None,
                start_hour=0,
                hours=24,
                percentile=97,
                store=False,
                region=None,
                too_long=1800):
    """Build a per-(operation, image) duration and failure report.

    The reporting window opens at ``start_hour`` on ``yesterday`` and spans
    ``hours`` hours. Every request id recorded for an instance that had a
    ``compute.instance.update`` event inside the window (in a matching
    deployment) is replayed to work out its duration, operation, image kind
    and failure type.

    Args:
        yesterday: Date the window starts on; any falsy value selects the
            day before the current UTC date.
        start_hour: Hour of ``yesterday`` at which the window opens.
        hours: Length of the window in hours.
        percentile: Percentile of the sorted durations reported per row.
        store: Accepted for interface compatibility; not used here.
        region: Optional case-insensitive substring a deployment name must
            contain; a falsy value selects every deployment.
        too_long: Duration threshold (presumably seconds -- the column title
            divides it by 60) above which a request with no other failure is
            counted as a failure.

    Returns:
        ``(rstart, rend, report)``. ``report`` is a list: item 0 is a dict of
        summary details, item 1 is the list of column titles, and each
        remaining item is one row per (operation, image) pair.

    Exits the process with status 1 when no deployment matches ``region``.
    """
    if not yesterday:
        yesterday = datetime.datetime.utcnow().date() - \
                    datetime.timedelta(days=1)

    rstart = datetime.datetime(year=yesterday.year,
                               month=yesterday.month,
                               day=yesterday.day,
                               hour=start_hour)
    rend = rstart + datetime.timedelta(hours=hours - 1, minutes=59, seconds=59)

    dstart = dt.dt_to_decimal(rstart)
    dend = dt.dt_to_decimal(rend)

    too_long_col = '> %d' % (too_long / 60)

    cells = []
    regions = []
    if region:
        region = region.upper()
    for deployment in models.Deployment.objects.all():
        name = deployment.name.upper()
        if not region or region in name:
            regions.append(deployment.id)
            cells.append(deployment.name)

    if not regions:
        print("No regions found for '%s'" % region)
        sys.exit(1)

    # Get all the instances that have changed in the last N hours ...
    updates = models.RawData.objects.filter(event='compute.instance.update',
                                            when__gt=dstart, when__lte=dend,
                                            deployment__in=regions)\
                                    .values('instance').distinct()

    cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']

    failures = {}  # { key : {failure_type: count} }
    durations = {}  # { key : [duration, ...] }
    attempts = {}  # { key : request count }

    for uuid_dict in updates:
        uuid = uuid_dict['instance']

        # All the unique Request ID's for this instance during that timespan.
        reqs = models.RawData.objects.filter(instance=uuid,
                                             when__gt=dstart, when__lte=dend) \
                                     .values('request_id').distinct()

        for req_dict in reqs:
            req = req_dict['request_id']
            raws = models.RawData.objects.filter(request_id=req)\
                                      .exclude(event='compute.instance.exists')\
                                      .order_by('when')

            start = None
            end = None
            err = None
            failure_type = None

            operation = "aux"
            image_type_num = 0

            for raw in raws:
                if not start:
                    start = raw.when
                # Rows are ordered by 'when', so the last one seen is the end.
                end = raw.when

                if 'error' in raw.routing_key:
                    err = raw
                    failure_type = 'http'

                if raw.old_state != 'error' and raw.state == 'error':
                    failure_type = 'state'

                # Leaving the error state for a live state clears the failure.
                if raw.old_state == 'error' and \
                        raw.state not in ['deleted', 'error']:
                    failure_type = None

                for cmd in cmds:
                    if cmd in raw.event:
                        operation = cmd
                        break

                if raw.image_type:
                    image_type_num |= raw.image_type

            image = "?"
            if image_type.isset(image_type_num, image_type.BASE_IMAGE):
                image = "base"
            if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
                image = "snap"

            # No usable rows for this request id.
            if not start:
                continue

            diff = end - start

            if diff > too_long and failure_type is None:
                failure_type = too_long_col

            key = (operation, image)

            # Track durations for all attempts, good and bad ...
            durations.setdefault(key, []).append(diff)
            attempts[key] = attempts.get(key, 0) + 1

            if failure_type:
                if err:
                    # The stored JSON is a two-item sequence; only the second
                    # item (the message body) is needed.
                    _queue, body = json.loads(err.json)
                    payload = body['payload']
                    exc = payload.get('exception')
                    if exc:
                        code = int(exc.get('kwargs', {}).get('code', 0))
                        if 400 <= code < 500:
                            failure_type = "4xx"
                        if 500 <= code < 600:
                            failure_type = "5xx"
                # NOTE(review): a failure left as 'http' is counted here but
                # has no column in the report below -- confirm intended.
                breakdown = failures.setdefault(key, {})
                breakdown[failure_type] = breakdown.get(failure_type, 0) + 1

    # Summarize the results ...
    report = []
    pct = (float(100 - percentile) / 2.0) / 100.0
    details = {
        'percentile': percentile,
        'pct': pct,
        'hours': hours,
        'start': float(dstart),
        'end': float(dend),
        'region': region,
        'cells': cells
    }
    report.append(details)

    failure_types = ["4xx", "5xx", too_long_col, "state"]
    cols = [
        "Operation", "Image", "Min", "Max", "Med",
        "%d%%" % percentile, "Requests"
    ]
    for failure_type in failure_types:
        cols.append("%s" % failure_type)
        cols.append("%% %s" % failure_type)
    report.append(cols)

    total = 0
    failure_totals = {}
    for key, count in attempts.items():
        total += count
        operation, image = key

        breakdown = failures.get(key, {})
        this_failure_pair = []
        for failure_type in failure_types:
            # Failure counts for this attempt.
            # Sum for grand totals.
            failure_count = breakdown.get(failure_type, 0)
            failure_totals[failure_type] = \
                failure_totals.get(failure_type, 0) + failure_count

            # Failure share for this attempt, kept as a 0..1 fraction.
            percentage = float(failure_count) / float(count)
            this_failure_pair.append((failure_count, percentage))

        # N-th % of durations ...
        # Every key in attempts has at least one duration, so the sorted
        # list is never empty and its ends are the min and max.
        _values = durations[key]
        _values.sort()
        _num = len(_values)
        _min = _values[0]
        _max = _values[-1]
        _median = _values[_num // 2]
        # Clamp so that percentile=100 selects the last element instead of
        # indexing one past the end of the list.
        _percentile_index = min(
            int((float(percentile) / 100.0) * float(_num)), _num - 1)
        _percentile = _values[_percentile_index]

        _fmin = dt.sec_to_str(_min)
        _fmax = dt.sec_to_str(_max)
        _fmedian = dt.sec_to_str(_median)
        _fpercentile = dt.sec_to_str(_percentile)

        row = [operation, image, _fmin, _fmax, _fmedian, _fpercentile, count]
        for failure_count, failure_percentage in this_failure_pair:
            row.append(failure_count)
            row.append(failure_percentage)
        report.append(row)

    def _percent_of_total(amount):
        # With no requests in the window every rate is reported as 0.0
        # instead of dividing by zero.
        if not total:
            return 0.0
        return (float(amount) / float(total)) * 100.0

    details['total'] = total
    failure_grand_total = 0
    for failure_type in failure_types:
        failure_total = failure_totals.get(failure_type, 0)
        failure_grand_total += failure_total
        details["%s failure count" % failure_type] = failure_total
        details["%s failure percentage" % failure_type] = \
            _percent_of_total(failure_total)

    details['failure_grand_total'] = failure_grand_total
    details['failure_grand_rate'] = _percent_of_total(failure_grand_total)
    return (rstart, rend, report)
Exemplo n.º 3
0
def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
                store=False, region=None, too_long=1800):
    """Build a per-(operation, image) duration and failure report.

    The reporting window opens at ``start_hour`` on ``yesterday`` and spans
    ``hours`` hours. Every request id recorded for an instance that had a
    ``compute.instance.update`` event inside the window (in a matching
    deployment) is replayed to work out its duration, operation, image kind
    and failure type.

    Args:
        yesterday: Date the window starts on; any falsy value selects the
            day before the current UTC date.
        start_hour: Hour of ``yesterday`` at which the window opens.
        hours: Length of the window in hours.
        percentile: Percentile of the sorted durations reported per row.
        store: Accepted for interface compatibility; not used here.
        region: Optional case-insensitive substring a deployment name must
            contain; a falsy value selects every deployment.
        too_long: Duration threshold (presumably seconds -- the column title
            divides it by 60) above which a request with no other failure is
            counted as a failure.

    Returns:
        ``(rstart, rend, report)``. ``report`` is a list: item 0 is a dict of
        summary details, item 1 is the list of column titles, and each
        remaining item is one row per (operation, image) pair.

    Exits the process with status 1 when no deployment matches ``region``.
    """
    if not yesterday:
        yesterday = datetime.datetime.utcnow().date() - \
                    datetime.timedelta(days=1)

    rstart = datetime.datetime(year=yesterday.year, month=yesterday.month,
                               day=yesterday.day, hour=start_hour)
    rend = rstart + datetime.timedelta(hours=hours - 1, minutes=59, seconds=59)

    dstart = dt.dt_to_decimal(rstart)
    dend = dt.dt_to_decimal(rend)

    too_long_col = '> %d' % (too_long / 60)

    cells = []
    regions = []
    if region:
        region = region.upper()
    for deployment in models.Deployment.objects.all():
        name = deployment.name.upper()
        if not region or region in name:
            regions.append(deployment.id)
            cells.append(deployment.name)

    if not regions:
        print("No regions found for '%s'" % region)
        sys.exit(1)

    # Get all the instances that have changed in the last N hours ...
    updates = models.RawData.objects.filter(event='compute.instance.update',
                                            when__gt=dstart, when__lte=dend,
                                            deployment__in=regions)\
                                    .values('instance').distinct()

    cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']

    failures = {}  # { key : {failure_type: count} }
    durations = {}  # { key : [duration, ...] }
    attempts = {}  # { key : request count }

    for uuid_dict in updates:
        uuid = uuid_dict['instance']

        # All the unique Request ID's for this instance during that timespan.
        reqs = models.RawData.objects.filter(instance=uuid,
                                             when__gt=dstart, when__lte=dend) \
                                     .values('request_id').distinct()

        for req_dict in reqs:
            req = req_dict['request_id']
            raws = models.RawData.objects.filter(request_id=req)\
                                      .exclude(event='compute.instance.exists')\
                                      .order_by('when')

            start = None
            end = None
            err = None
            failure_type = None

            operation = "aux"
            image_type_num = 0

            for raw in raws:
                if not start:
                    start = raw.when
                # Rows are ordered by 'when', so the last one seen is the end.
                end = raw.when

                if 'error' in raw.routing_key:
                    err = raw
                    failure_type = 'http'

                if raw.old_state != 'error' and raw.state == 'error':
                    failure_type = 'state'

                # Leaving the error state for a live state clears the failure.
                if raw.old_state == 'error' and \
                        raw.state not in ['deleted', 'error']:
                    failure_type = None

                for cmd in cmds:
                    if cmd in raw.event:
                        operation = cmd
                        break

                if raw.image_type:
                    image_type_num |= raw.image_type

            image = "?"
            if image_type.isset(image_type_num, image_type.BASE_IMAGE):
                image = "base"
            if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
                image = "snap"

            # No usable rows for this request id.
            if not start:
                continue

            diff = end - start

            if diff > too_long and failure_type is None:
                failure_type = too_long_col

            key = (operation, image)

            # Track durations for all attempts, good and bad ...
            durations.setdefault(key, []).append(diff)
            attempts[key] = attempts.get(key, 0) + 1

            if failure_type:
                if err:
                    # The stored JSON is a two-item sequence; only the second
                    # item (the message body) is needed.
                    _queue, body = json.loads(err.json)
                    payload = body['payload']
                    exc = payload.get('exception')
                    if exc:
                        code = int(exc.get('kwargs', {}).get('code', 0))
                        if 400 <= code < 500:
                            failure_type = "4xx"
                        if 500 <= code < 600:
                            failure_type = "5xx"
                # NOTE(review): a failure left as 'http' is counted here but
                # has no column in the report below -- confirm intended.
                breakdown = failures.setdefault(key, {})
                breakdown[failure_type] = breakdown.get(failure_type, 0) + 1

    # Summarize the results ...
    report = []
    pct = (float(100 - percentile) / 2.0) / 100.0
    details = {'percentile': percentile, 'pct': pct, 'hours': hours,
               'start': float(dstart), 'end': float(dend), 'region': region,
               'cells': cells}
    report.append(details)

    failure_types = ["4xx", "5xx", too_long_col, "state"]
    cols = ["Operation", "Image", "Min", "Max", "Med", "%d%%" % percentile,
            "Requests"]
    for failure_type in failure_types:
        cols.append("%s" % failure_type)
        cols.append("%% %s" % failure_type)
    report.append(cols)

    total = 0
    failure_totals = {}
    for key, count in attempts.items():
        total += count
        operation, image = key

        breakdown = failures.get(key, {})
        this_failure_pair = []
        for failure_type in failure_types:
            # Failure counts for this attempt.
            # Sum for grand totals.
            failure_count = breakdown.get(failure_type, 0)
            failure_totals[failure_type] = \
                failure_totals.get(failure_type, 0) + failure_count

            # Failure share for this attempt, kept as a 0..1 fraction.
            percentage = float(failure_count) / float(count)
            this_failure_pair.append((failure_count, percentage))

        # N-th % of durations ...
        # Every key in attempts has at least one duration, so the sorted
        # list is never empty and its ends are the min and max.
        _values = durations[key]
        _values.sort()
        _num = len(_values)
        _min = _values[0]
        _max = _values[-1]
        _median = _values[_num // 2]
        # Clamp so that percentile=100 selects the last element instead of
        # indexing one past the end of the list.
        _percentile_index = min(
            int((float(percentile) / 100.0) * float(_num)), _num - 1)
        _percentile = _values[_percentile_index]

        _fmin = dt.sec_to_str(_min)
        _fmax = dt.sec_to_str(_max)
        _fmedian = dt.sec_to_str(_median)
        _fpercentile = dt.sec_to_str(_percentile)

        row = [operation, image, _fmin, _fmax, _fmedian, _fpercentile, count]
        for failure_count, failure_percentage in this_failure_pair:
            row.append(failure_count)
            row.append(failure_percentage)
        report.append(row)

    def _percent_of_total(amount):
        # With no requests in the window every rate is reported as 0.0
        # instead of dividing by zero.
        if not total:
            return 0.0
        return (float(amount) / float(total)) * 100.0

    details['total'] = total
    failure_grand_total = 0
    for failure_type in failure_types:
        failure_total = failure_totals.get(failure_type, 0)
        failure_grand_total += failure_total
        details["%s failure count" % failure_type] = failure_total
        details["%s failure percentage" % failure_type] = \
            _percent_of_total(failure_total)

    details['failure_grand_total'] = failure_grand_total
    details['failure_grand_rate'] = _percent_of_total(failure_grand_total)
    return (rstart, rend, report)