Example No. 1
def email_instance_hours(date, dry_run=False):
    """Email instance hours report for the given datetime.date object."""
    yyyymmdd = date.strftime("%Y%m%d")
    cost_fn = '\n'.join("WHEN module_id == '%s' THEN latency * %s" % kv
                        for kv in _MODULE_CPU_COUNT.iteritems())
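    # cost_fn ends up as one CASE arm per module, weighting each request's
    # latency by that module's CPU count.  For example (module names and
    # counts here are made up; the real ones come from _MODULE_CPU_COUNT):
    #   WHEN module_id == 'default' THEN latency * 4
    #   WHEN module_id == 'batch' THEN latency * 1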
    query = """\
SELECT COUNT(*) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM [logs.requestlogs_%s]
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "instance_hours", date, 14, lambda row: row['url_route'])

    # Munge the table by adding a few columns.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row['instance_hours']

    for row in data:
        row['%% of total'] = row['instance_hours'] / total_instance_hours * 100
        row['per 1k requests'] = row['instance_hours'] / row['count_'] * 1000
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row:
                sparkline_data.append(
                    old_row['instance_hours'] / old_row['count_'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (per request)'] = sparkline_data

    _ORDER = ('%% of total', 'instance_hours', 'count_', 'per 1k requests',
              'last 2 weeks (per request)', 'url_route')
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'Instance Hours by Route'
    heading = 'Instance hours by route for %s' % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:50]}, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(data[:20],
                               'webapp.routes.instance_hours.week_over_week',
                               'url_route', metric_label_col='url_route',
                               data_col='last 2 weeks (per request)',
                               dry_run=dry_run)
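
These examples hand their result tables to a helper, _convert_table_rows_to_lists(data, _ORDER), that is not shown on this page. Judging from how its output is used later (a header row that callers skip with all_data[1:], plus positional access such as row[2]), a minimal sketch might look like the following; the real helper may well differ:

def _convert_table_rows_to_lists(table, order):
    """Hypothetical sketch: turn a list of dicts into a header row followed
    by one list per row, with the columns in the given order."""
    return [list(order)] + [[row.get(col) for col in order] for row in table]
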
Example No. 2
def email_client_api_usage(date, dry_run=False):
    """Emails a report of API usage, segmented by client and build version."""
    yyyymmdd = date.strftime("%Y%m%d")

    ios_user_agent_regex = '^Khan%20Academy\.(.*)/(.*) CFNetwork/([.0-9]*)' \
                           ' Darwin/([.0-9]*)$'

    # We group all non-ios user agents into a single bucket to keep this
    # report down to a reasonable size.
    query = """\
SELECT IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'iOS \1'),
      'Web Browsers/other') as client,
      IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'\2'),
      '') as build,
    elog_url_route as route,
    count(elog_url_route) as request_count
FROM logs.requestlogs_%(date_format)s
WHERE REGEXP_MATCH(elog_url_route, '^api.main:/api/internal')
    AND user_agent IS NOT NULL
    AND elog_url_route IS NOT NULL
    AND LEFT(version_id, 3) != 'znd' # ignore znds
GROUP BY client, build, route
ORDER BY client DESC, build DESC, request_count DESC;
""" % {
        'ios_user_agent_regex': ios_user_agent_regex,
        'date_format': yyyymmdd
    }
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "client_api_usage", yyyymmdd)

    _ORDER = ('client', 'build', 'route', 'request_count')
    all_data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'API usage by client - '
    heading = 'API usage by client for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: all_data},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data, key='route'):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table},
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
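
Several of these reports also fan out per team via _by_initiative(...), which yields (initiative_id, rows) pairs; that helper is not shown here either. A rough sketch of its shape, assuming the initiatives module can map a route or URL to an owning initiative (the owner_of() lookup below is invented purely for illustration):

import collections

def _by_initiative(rows, key='url_route', by_package=False):
    # Hypothetical sketch only: group report rows by owning initiative.
    # initiatives.owner_of() is an assumed lookup, not a documented API.
    grouped = collections.defaultdict(list)
    for row in rows:
        owner = initiatives.owner_of(row[key], by_package=by_package)
        grouped[owner].append(row)
    return sorted(grouped.iteritems())
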
Example No. 3
def email_instance_hours(date):
    """Email instance hours report for the given datetime.date object."""
    yyyymmdd = date.strftime("%Y%m%d")
    cost_fn = '\n'.join("WHEN module_id == '%s' THEN latency * %s" % kv
                        for kv in _MODULE_CPU_COUNT.iteritems())
    query = """\
SELECT COUNT(*) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM [logs.requestlogs_%s]
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data("instance_hours", date, 14,
                                                lambda row: row['url_route'])

    # Munge the table by adding a few columns.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row['instance_hours']

    for row in data:
        row['%% of total'] = row['instance_hours'] / total_instance_hours * 100
        row['per 1k requests'] = row['instance_hours'] / row['count_'] * 1000
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row:
                sparkline_data.append(old_row['instance_hours'] /
                                      old_row['count_'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (per request)'] = sparkline_data

    _ORDER = ('%% of total', 'instance_hours', 'count_', 'per 1k requests',
              'last 2 weeks (per request)', 'url_route')
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'Instance Hours by Route'
    heading = 'Instance hours by route for %s' % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:50]},
                None,
                to=['*****@*****.**'],
                subject=subject)
Example No. 4
def email_instance_hours(date):
    """Email instance hours report for the given datetime.date object."""
    yyyymmdd = date.strftime("%Y%m%d")
    cost_fn = "\n".join("WHEN module_id == '%s' THEN latency * %s" % kv for kv in _MODULE_CPU_COUNT.iteritems())
    query = """\
SELECT COUNT(*) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM [logs.requestlogs_%s]
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (
        cost_fn,
        yyyymmdd,
    )
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data("instance_hours", date, 14, lambda row: row["url_route"])

    # Munge the table by adding a few columns.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row["instance_hours"]

    for row in data:
        row["%% of total"] = row["instance_hours"] / total_instance_hours * 100
        row["per 1k requests"] = row["instance_hours"] / row["count_"] * 1000
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row["url_route"])
            if old_row:
                sparkline_data.append(old_row["instance_hours"] / old_row["count_"])
            else:
                sparkline_data.append(None)
        row["last 2 weeks (per request)"] = sparkline_data

    _ORDER = ("%% of total", "instance_hours", "count_", "per 1k requests", "last 2 weeks (per request)", "url_route")
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = "Instance Hours by Route"
    heading = "Instance hours by route for %s" % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:50]}, None, to=["*****@*****.**"], subject=subject)
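
The "last 2 weeks" sparkline columns above are built from bq_util.process_past_data(report_name, date, 14, key_fn), another helper that is not shown. From the way callers consume its result (a list with one entry per past day, each entry a dict keyed by key_fn(row), and a falsy entry when that day's report is missing), a sketch under those assumptions could be:

import datetime

def process_past_data(report_name, date, days, key_fn):
    # Hypothetical sketch: load each of the previous `days` saved daily
    # reports (bq_util.get_daily_data is used elsewhere on this page) and
    # index each day's rows by key_fn; an empty dict marks a missing day.
    past = []
    for i in range(days, 0, -1):
        day = date - datetime.timedelta(days=i)
        rows = get_daily_data(report_name, day.strftime("%Y%m%d")) or []
        past.append(dict((key_fn(row), row) for row in rows))
    return past
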
Example No. 5
def email_client_api_usage(date, dry_run=False):
    """Emails a report of API usage, segmented by client and build version."""
    yyyymmdd = date.strftime("%Y%m%d")

    ios_user_agent_regex = '^Khan%20Academy\.(.*)/(.*) CFNetwork/([.0-9]*)' \
                           ' Darwin/([.0-9]*)$'

    # We group all non-ios user agents into a single bucket to keep this
    # report down to a reasonable size.
    query = """\
SELECT IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'iOS \1'),
      'Web Browsers/other') as client,
      IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'\2'),
      '') as build,
    elog_url_route as route,
    count(elog_url_route) as request_count
FROM logs.requestlogs_%(date_format)s
WHERE REGEXP_MATCH(elog_url_route, '^api.main:/api/internal')
    AND user_agent IS NOT NULL
    AND elog_url_route IS NOT NULL
GROUP BY client, build, route
ORDER BY client DESC, build DESC, request_count DESC;
""" % {'ios_user_agent_regex': ios_user_agent_regex, 'date_format': yyyymmdd}
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "client_api_usage", yyyymmdd)

    _ORDER = ('client', 'build', 'route', 'request_count')

    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'API usage by client'
    heading = 'API usage by client for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: data}, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)
Example No. 6
def email_rpcs(date):
    """Email RPCs-per-route report for the given datetime.date object."""
    yyyymmdd = date.strftime("%Y%m%d")
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete')

    inits = [
        "IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (name, name, name)
        for name in rpc_fields
    ]
    inits.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    joins = [
        "LEFT OUTER JOIN ( "
        "SELECT elog_url_route AS url_route, COUNT(*) AS rpc_%s "
        "FROM FLATTEN([logs.requestlogs_%s], elog_stats_rpc) "
        "WHERE elog_stats_rpc.key = 'stats.rpc.datastore_v3.%s' "
        "GROUP BY url_route) AS t%s ON t1.url_route = t%s.url_route" %
        (name, yyyymmdd, name, name, name) for name in rpc_fields
    ]
    joins.append("""\
LEFT OUTER JOIN (
SELECT elog_url_route AS url_route,
       SUM(elog_stats_rpc_ops.value) AS rpc_cost
FROM [logs.requestlogs_%s]
WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost'
GROUP BY url_route) AS tcost ON t1.url_route = tcost.url_route
""" % yyyymmdd)
    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
SELECT elog_url_route AS url_route, COUNT(*) AS url_requests
FROM [logs.requestlogs_%s]
GROUP BY url_route) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(inits), yyyymmdd, '\n'.join(joins))
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data("rpcs", date, 14,
                                                lambda row: row['url_route'])

    # Munge the table by getting per-request counts for every RPC stat.
    micropennies = 'μ¢'
    for row in data:
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / row['requests']
        row[micropennies + '/req'] = row['rpc_cost'] * 1.0 / row['requests']
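        # rpc_cost is reported in micropennies (micro-cents): $1 is
        # 100 * 10**6 = 10**8 of them, hence the 1.0e-8 factor for dollars.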
        row['$'] = row['rpc_cost'] * 1.0e-8
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row and 'rpc_cost' in old_row:
                sparkline_data.append(old_row['rpc_cost'] * 1.0 /
                                      old_row['requests'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (%s/req)' % micropennies] = sparkline_data

        del row['rpc_cost']

    # Convert each row from a dict to a list, in a specific order.
    _ORDER = ([
        'url_route', 'requests', '$', micropennies + '/req',
        'last 2 weeks (%s/req)' % micropennies
    ] + ['rpc_%s' % f
         for f in rpc_fields] + ['%s/req' % f for f in rpc_fields])
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'RPC calls by route'
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:75]},
                None,
                to=['*****@*****.**'],
                subject=subject)
Example No. 7
def email_memory_increases(date, window_length=20, min_increase_in_mb=1):
    """Emails the increases in memory caused by particular routes.

    It attempts to compute the amount of memory added by each request,
    ignoring memory which is reclaimed in the next few requests.  (The
    number of requests which are checked is specified by the window_length
    parameter.  Routes with a total increase less than the
    min_increase_in_mb parameter are ignored.)
    """
    yyyymmdd = date.strftime("%Y%m%d")
    lead_lengths = range(1, window_length + 1)
    lead_selects = '\n'.join(
        "LEAD(total, %s) OVER (PARTITION BY instance_key ORDER BY start_time) "
        "AS lead_total_%s," % (i, i) for i in lead_lengths)

    fields = ['total'] + ['lead_total_%s' % i for i in lead_lengths]
    # We want to compute the minimal value of added + field - total, where
    # field is one of "total" or one of the "lead_total_i".  BigQuery
    # unfortunately doesn't give us a nice way to do this (at least that I know
    # of, without doing a CROSS JOIN of the table to itself).  One way to do
    # this would be a gigantic nested IF, but this could be exponentially
    # large, and queries have a fixed maximum size.  Instead we use a CASE
    # expression which could be O(n^2); since we don't pay for BigQuery
    # execution time, and n shouldn't be too huge, this seems like a better
    # approach.  In theory it might be better to do O(n) nested queries (each
    # of which does a single pairwise min), but this seems like it could in
    # practice be even slower, depending on the implementation.
    # TODO(benkraft): If I get a useful answer to
    # http://stackoverflow.com/questions/24923101/computing-a-moving-maximum-in-bigquery
    # we should use that instead.  Or, once we have user-defined functions in
    # BigQuery, we can probably do something actually reasonable.
    case_expr = '\n'.join(
        "WHEN %s THEN added + %s - total" % (
            ' AND '.join("%s <= %s" % (field1, field2)
                         for field2 in fields if field2 != field1),
            field1)
        for field1 in fields)
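    # For example, with window_length=2 the fields are
    # ['total', 'lead_total_1', 'lead_total_2'] and case_expr expands to one
    # WHEN per field, each guarded by "this field is the smallest":
    #   WHEN total <= lead_total_1 AND total <= lead_total_2
    #       THEN added + total - total
    #   WHEN lead_total_1 <= total AND lead_total_1 <= lead_total_2
    #       THEN added + lead_total_1 - total
    #   WHEN lead_total_2 <= total AND lead_total_2 <= lead_total_1
    #       THEN added + lead_total_2 - total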

    # This is a kind of large query, so here's what it's doing, from inside to
    # out:
    #   First, extract the memory data from the logs.
    #   Second, compute the appropriate LEAD() columns, which tell us what the
    #       total will be a few requests later on the same instance
    #   Third, compute "real_added", which is the amount we think the request
    #       actually added to the heap, not counting memory which was soon
    #       reclaimed.  This might come out negative, so
    #   Fourth, make sure the memory added is at least zero, since if memory
    #       usage went down, this request probably shouldn't get the credit.
    #   Fifth, group by route and do whatever aggregation we want.  Ignore the
    #       first 25 requests to each module, since those are probably all
    #       loading things that each module has to load, and the request that
    #       does the loading shouldn't be blamed.
    query = """\
SELECT
    COUNT(*) AS count_,
    elog_url_route AS url_route,
    module_id AS module,
    AVG(real_added) AS added_avg,
    NTH(99, QUANTILES(real_added, 101)) AS added_98th,
    SUM(real_added) AS added_total,
FROM (
    SELECT
        IF(real_added > 0, real_added, 0) AS real_added,
        elog_url_route, module_id, num,
    FROM (
        SELECT
            (CASE %s ELSE added END) AS real_added,
            elog_url_route, module_id, num,
        FROM (
            SELECT
                %s
                RANK() OVER (PARTITION BY instance_key
                             ORDER BY start_time) AS num,
                added, total, elog_url_route, module_id,
            FROM (
                SELECT
                    FLOAT(REGEXP_EXTRACT(
                        app_logs.message,
                        "This request added (.*) MB to the heap.")) AS added,
                    FLOAT(REGEXP_EXTRACT(
                        app_logs.message,
                        "Total memory now used: (.*) MB")) AS total,
                    instance_key, start_time, elog_url_route, module_id,
                FROM [logs.requestlogs_%s]
                WHERE app_logs.message CONTAINS 'This request added'
            )
        )
    )
)
WHERE num > 25
GROUP BY url_route, module
ORDER BY added_total DESC
""" % (case_expr, lead_selects, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "memory_increases", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "memory_increases", date, 14,
        lambda row: (row['module'], row['url_route']))

    by_module = collections.defaultdict(list)
    for row in data:
        if row['added_total'] > min_increase_in_mb:
            heading = "Memory increases by route for %s module on %s" % (
                row['module'], _pretty_date(yyyymmdd))
            by_module[heading].append(row)

    _ORDER = ['count_', 'added_avg', 'last 2 weeks (avg)', 'added_98th',
              'added_total', 'added %%', 'url_route']
    for heading in by_module:
        total = sum(row['added_total'] for row in by_module[heading])
        for row in by_module[heading]:
            row['added %%'] = row['added_total'] / total * 100
            sparkline_data = []
            for old_data in historical_data:
                old_row = old_data.get((row['module'], row['url_route']))
                if old_row:
                    sparkline_data.append(old_row['added_avg'])
                else:
                    sparkline_data.append(None)
            row['last 2 weeks (avg)'] = sparkline_data
            del row['module']
        by_module[heading] = _convert_table_rows_to_lists(
            by_module[heading][:50], _ORDER)
    subject = "Memory Increases by Route"
    _send_email(by_module, None,
                to=['*****@*****.**'],
                subject=subject)
Example No. 8
def email_out_of_memory_errors(date):
    # This sends two emails, for two different ways of seeing the data.
    # But we'll have them share the same subject so they thread together.
    yyyymmdd = date.strftime("%Y%m%d")
    subject = 'OOM errors'

    # Out-of-memory errors look like:
    #   Exceeded soft private memory limit with 260.109 MB after servicing 2406 requests total  #@Nolint
    # with SDK 1.9.7, they changed. Note the double-space after "after":
    #   Exceeded soft private memory limit of 512 MB with 515 MB after  servicing 9964 requests total  #@Nolint
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"
    query = """\
SELECT COUNT(module_id) AS count_,
       module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
      AND module_id IS NOT NULL
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_module", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda row: row['module_id'])

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['module_id'])
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this module, then it just didn't
                # OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = ['count_', 'last 2 weeks', 'module_id',
              'numserved_10th', 'numserved_50th', 'numserved_90th']
    data = _convert_table_rows_to_lists(data, _ORDER)

    heading = 'OOM errors by module for %s' % _pretty_date(yyyymmdd)
    email_content = {heading: data}

    query = """\
SELECT COUNT(*) as count_,
       module_id,
       elog_url_route as url_route
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_route", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14,
        lambda row: (row['module_id'], row['url_route']))

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get((row['module_id'], row['url_route']))
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this route/module, then it just
                # didn't OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    data = _convert_table_rows_to_lists(data, _ORDER)

    heading = 'OOM errors by route for %s' % _pretty_date(yyyymmdd)
    email_content[heading] = data

    _send_email(email_content, None,
                to=['*****@*****.**'],
                subject=subject)
Example No. 9
def email_rpcs(date):
    """Email RPCs-per-route report for the given datetime.date object."""
    yyyymmdd = date.strftime("%Y%m%d")
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    inits = ["IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (name, name, name)
             for name in rpc_fields]
    inits.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    joins = ["LEFT OUTER JOIN ( "
             "SELECT elog_url_route AS url_route, "
             "       SUM(elog_stats_rpc_ops.value) as rpc_%s "
             "FROM [logs.requestlogs_%s] "
             "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
             "GROUP BY url_route) AS t%s "
             "ON t1.url_route = t%s.url_route"
             % (name, yyyymmdd, name, name, name)
             for name in rpc_fields]
    joins.append("LEFT OUTER JOIN ( "
                 "SELECT elog_url_route AS url_route, "
                 "       SUM(elog_stats_rpc_ops.value) AS rpc_cost "
                 "FROM [logs.requestlogs_%s] "
                 "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
                 "GROUP BY url_route) AS tcost "
                 "ON t1.url_route = tcost.url_route"
                 % yyyymmdd)
    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
SELECT elog_url_route AS url_route, COUNT(*) AS url_requests
FROM [logs.requestlogs_%s]
GROUP BY url_route) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(inits), yyyymmdd, '\n'.join(joins))
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "rpcs", date, 14, lambda row: row['url_route'])

    # Munge the table by getting per-request counts for every RPC stat.
    micropennies = '&mu;&cent;'
    for row in data:
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / row['requests']
        row[micropennies + '/req'] = row['rpc_cost'] * 1.0 / row['requests']
        row['$'] = row['rpc_cost'] * 1.0e-8
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row and 'rpc_cost' in old_row:
                sparkline_data.append(
                    old_row['rpc_cost'] * 1.0 / old_row['requests'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (%s/req)' % micropennies] = sparkline_data

        del row['rpc_cost']

    # Convert each row from a dict to a list, in a specific order.
    _ORDER = (['url_route', 'requests', '$', micropennies + '/req',
               'last 2 weeks (%s/req)' % micropennies] +
              ['rpc_%s' % f for f in rpc_fields] +
              ['%s/req' % f for f in rpc_fields])
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'RPC calls by route'
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:75]}, None,
                to=['*****@*****.**'],
                subject=subject)
Example No. 10
def email_rpcs(date, dry_run=False):
    """Email RPCs-per-route report for the given datetime.date object.

    Also email a more urgent message if one of the RPCs is too expensive.
    This indicates a bug that is costing us money.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    inits = ["IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (name, name, name)
             for name in rpc_fields]
    inits.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    joins = ["LEFT OUTER JOIN ( "
             "SELECT elog_url_route AS url_route, "
             "       SUM(elog_stats_rpc_ops.value) as rpc_%s "
             "FROM [logs.requestlogs_%s] "
             "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
             "GROUP BY url_route) AS t%s "
             "ON t1.url_route = t%s.url_route"
             % (name, yyyymmdd, name, name, name)
             for name in rpc_fields]
    joins.append("LEFT OUTER JOIN ( "
                 "SELECT elog_url_route AS url_route, "
                 "       SUM(elog_stats_rpc_ops.value) AS rpc_cost "
                 "FROM [logs.requestlogs_%s] "
                 "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
                 "GROUP BY url_route) AS tcost "
                 "ON t1.url_route = tcost.url_route"
                 % yyyymmdd)
    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
SELECT elog_url_route AS url_route, COUNT(*) AS url_requests
FROM [logs.requestlogs_%s]
GROUP BY url_route) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(inits), yyyymmdd, '\n'.join(joins))
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "rpcs", date, 14, lambda row: row['url_route'])

    # Munge the table by getting per-request counts for every RPC stat.
    micropennies = '&mu;&cent;'
    for row in data:
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / row['requests']
        row[micropennies + '/req'] = row['rpc_cost'] * 1.0 / row['requests']
        row['$'] = row['rpc_cost'] * 1.0e-8
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row and 'rpc_cost' in old_row:
                sparkline_data.append(
                    old_row['rpc_cost'] * 1.0 / old_row['requests'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (%s/req)' % micropennies] = sparkline_data

        del row['rpc_cost']

    # Convert each row from a dict to a list, in a specific order.
    _ORDER = (['url_route', 'requests', '$', micropennies + '/req',
               'last 2 weeks (%s/req)' % micropennies] +
              ['rpc_%s' % f for f in rpc_fields] +
              ['%s/req' % f for f in rpc_fields])
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'RPC calls by route'
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:75]}, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(data[:20],
                               'webapp.routes.rpc_cost.week_over_week',
                               'url_route', metric_label_col='url_route',
                               data_col='last 2 weeks (%s/req)' % micropennies,
                               dry_run=dry_run)

    # As of 1 Feb 2016, the most expensive RPC route is about $300 a
    # day.  More than $750 a day and we should be very suspicious.
    # TODO(csilvers): do this check more frequently.
    # TODO(csilvers): send to slack and/or 911 as well as emailing
    if any(row[2] > 750 for row in data):
        _send_email({heading: data[:75]}, None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive RPC calls on %s!'
                             % _pretty_date(yyyymmdd)),
                    dry_run=dry_run)
Example No. 11
def email_rrs_stats(date, dry_run=False):
    """Emails stats about rrs requests that are too slow or have errors.

    rrs == React Render Server.

    Requests that take longer than one second are timed out and thus simply add
    a second to the response, rather than speeding it up.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    latency_q = """
SELECT
  REPLACE(REGEXP_EXTRACT(
    httpRequest.requestUrl, r'/render\?path=\.(.*)'), '%2F', '/') AS url,
  AVG(FLOAT(jsonPayload.latencyseconds)) AS average_latency,
  COUNT(1) AS count,
  SUM(CASE
      WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0 THEN 1
      ELSE 0 END) AS timeouts,
  SUM(CASE
      WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0 THEN 100
      ELSE 0 END) / count(1) as timeout_percent
FROM
  [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY
  url
ORDER BY
  timeout_percent DESC
""".format(yyyymmdd)

    error_q = """
SELECT
  REPLACE(REGEXP_EXTRACT(
    httpRequest.requestUrl, r'/render\?path=\.(.*)'), '%2F', '/') AS url,
  SUM(httpRequest.status == 500) AS error_count,
  (SUM(httpRequest.status == 500) / COUNT(1)) * 100 AS error_percent
FROM
  [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY
  url
ORDER BY
  error_percent DESC
""".format(yyyymmdd)

    latency_data = bq_util.get_daily_data('rrs_latency', yyyymmdd)
    if latency_data is None:
        table_name = (
            "khan-academy:react_render_logs" +
            ".appengine_googleapis_com_nginx_request_{}").format(yyyymmdd)
        table_exists = bq_util.does_table_exist(table_name)
        if not table_exists:
            print "The RRS logs were not generated. No email will be sent."
            print "Returning..."
            return
        latency_data = bq_util.query_bigquery(latency_q)
        bq_util.save_daily_data(latency_data, 'rrs_latency', yyyymmdd)

    error_data = bq_util.get_daily_data('rrs_errors', yyyymmdd)
    if error_data is None:
        error_data = bq_util.query_bigquery(error_q)
        bq_util.save_daily_data(error_data, 'rrs_errors', yyyymmdd)

    subject = 'React render server errors and timeouts - '
    error_heading = 'React component errors'
    latency_heading = 'React component render latency (> 1 sec = timeout)'
    error_order = ('url', 'error_count', 'error_percent')
    latency_order = ('url', 'average_latency', 'count', 'timeouts',
                     'timeout_percent')
    tables = collections.defaultdict(dict)

    # Put data in tables by initiative
    for initiative_id, initiative_data in _by_initiative(error_data,
                                                         key='url',
                                                         by_package=True):
        tables[initiative_id][error_heading] = _convert_table_rows_to_lists(
            initiative_data, error_order)
    for initiative_id, initiative_data in _by_initiative(latency_data,
                                                         key='url',
                                                         by_package=True):
        tables[initiative_id][latency_heading] = _convert_table_rows_to_lists(
            initiative_data, latency_order)

    # Send email to initiatives
    for initiative_id, initiative_tables in tables.items():
        _send_email(initiative_tables,
                    graph=None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # Send all data to infra
    tables = {}
    tables[error_heading] = _convert_table_rows_to_lists(
        error_data, error_order)
    tables[latency_heading] = _convert_table_rows_to_lists(
        latency_data, latency_order)
    _send_email(tables,
                graph=None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)
Example No. 12
def email_out_of_memory_errors(date, dry_run=False):
    # This sends two emails, for two different ways of seeing the data.
    # But we'll have them share the same subject so they thread together.
    yyyymmdd = date.strftime("%Y%m%d")
    subject = 'OOM errors - '

    # Out-of-memory errors for python look like:
    #   Exceeded soft memory limit of 2048 MB with 2078 MB after servicing 1497 requests total. Consider setting a larger instance class in app.yaml.  #@Nolint
    # Out-of-memory errors for kotlin look like:
    #   java.lang.OutOfMemoryError: <reason>
    #   (where <reason> is some text that may take on a number of values
    #   depending on whether the problem is lack of heap space, the garbage
    #   collector taking too long to stay ahead of garbage accumulation, etc.)
    # Note that older messages (before the gVisor sandbox) started with
    # "Exceeded soft private memory limit" instead.
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"
    query = """\
SELECT COUNT(1) AS count_,
       IFNULL(module_id, 'default') AS module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE (app_logs.message CONTAINS 'Exceeded soft memory limit'
       OR app_logs.message CONTAINS 'OutOfMemoryError')
  AND LEFT(version_id, 3) != 'znd' # ignore znds
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_module", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda row: row['module_id'])

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['module_id'])
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this module, then it just didn't
                # OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = [
        'count_', 'last 2 weeks', 'module_id', 'numserved_10th',
        'numserved_50th', 'numserved_90th'
    ]
    data = _convert_table_rows_to_lists(data, _ORDER)

    heading = 'OOM errors by module for %s' % _pretty_date(yyyymmdd)
    email_content = {heading: data}

    query = """\
SELECT COUNT(1) AS count_,
       module_id,
       elog_url_route AS url_route
FROM (
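    # Group by request_id so a request whose log got split into several
    # entries is only counted once; keep only requests that logged an OOM.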
    SELECT IFNULL(FIRST(module_id), 'default') AS module_id,
           FIRST(elog_url_route) AS elog_url_route,
           SUM(IF(
               app_logs.message CONTAINS 'Exceeded soft memory limit'
               OR app_logs.message CONTAINS 'OutOfMemoryError',
               1, 0)) AS oom_message_count
    FROM [logs.requestlogs_%s]
    WHERE LEFT(version_id, 3) != 'znd' # ignore znds
    GROUP BY request_id
    HAVING oom_message_count > 0
)
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_route", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14, lambda row:
        (row['module_id'], row['url_route']))

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get((row['module_id'], row['url_route']))
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this route/module, then it just
                # didn't OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    heading = 'OOM errors by route for %s' % _pretty_date(yyyymmdd)

    email_content[heading] = _convert_table_rows_to_lists(data, _ORDER)
    _send_email(email_content,
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        email_content = {heading: table}
        _send_email(email_content,
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
Example No. 13
def email_rpcs(date, dry_run=False):
    """Email RPCs-per-route report for the given datetime.date object.

    Also email a more urgent message if one of the RPCs is too expensive.
    This indicates a bug that is costing us money.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    inits = [
        "IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (name, name, name)
        for name in rpc_fields
    ]
    inits.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    joins = [
        "LEFT OUTER JOIN ( "
        "SELECT elog_url_route AS url_route, "
        "       SUM(elog_stats_rpc_ops.value) as rpc_%s "
        "FROM [logs.requestlogs_%s] "
        "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
        "GROUP BY url_route) AS t%s "
        "ON t1.url_route = t%s.url_route" % (name, yyyymmdd, name, name, name)
        for name in rpc_fields
    ]
    joins.append("LEFT OUTER JOIN ( "
                 "SELECT elog_url_route AS url_route, "
                 "       SUM(elog_stats_rpc_ops.value) AS rpc_cost "
                 "FROM [logs.requestlogs_%s] "
                 "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
                 "GROUP BY url_route) AS tcost "
                 "ON t1.url_route = tcost.url_route" % yyyymmdd)
    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
    SELECT elog_url_route AS url_route, COUNT(1) AS url_requests
    FROM (
        SELECT FIRST(elog_url_route) AS elog_url_route
        FROM [logs.requestlogs_%s]
        WHERE LEFT(version_id, 3) != 'znd' # ignore znds
        GROUP BY request_id
    )
    GROUP BY url_route
) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(inits), yyyymmdd, '\n'.join(joins))
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data("rpcs", date, 14,
                                                lambda row: row['url_route'])

    # Munge the table by getting per-request counts for every RPC stat.
    micropennies = '&mu;&cent;'
    for row in data:
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / row['requests']
        row[micropennies + '/req'] = row['rpc_cost'] * 1.0 / row['requests']
        row['$'] = row['rpc_cost'] * 1.0e-8
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row and 'rpc_cost' in old_row:
                sparkline_data.append(old_row['rpc_cost'] * 1.0 /
                                      old_row['requests'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (%s/req)' % micropennies] = sparkline_data

        del row['rpc_cost']

    # Convert each row from a dict to a list, in a specific order.
    _ORDER = ([
        'url_route', 'requests', '$', micropennies + '/req',
        'last 2 weeks (%s/req)' % micropennies
    ] + ['rpc_%s' % f
         for f in rpc_fields] + ['%s/req' % f for f in rpc_fields])
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    subject = 'RPC calls by route - '
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: all_data[:75]},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        # Let's just send the top most expensive routes, not all of them.
        _send_email({heading: table[:75]},
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.rpc_cost.week_over_week',
                               'url_route',
                               metric_label_col='url_route',
                               data_col='last 2 weeks (%s/req)' % micropennies,
                               dry_run=dry_run)

    # As of 1 Feb 2016, the most expensive RPC route is about $300 a
    # day.  More than $750 a day and we should be very suspicious.
    # TODO(csilvers): do this check more frequently.
    # TODO(csilvers): send to slack and/or 911 as well as emailing
    if any(row[2] > 750 for row in all_data[1:]):  # ignore the header line
        _send_email({heading: all_data[:75]},
                    None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive RPC calls on %s!' %
                             _pretty_date(yyyymmdd)),
                    dry_run=dry_run)
Example No. 14
def email_instance_hours(date, dry_run=False):
    """Email instance hours report for the given datetime.date object."""
    yyyymmdd = date.strftime("%Y%m%d")
    cost_fn = '\n'.join("WHEN module_id == '%s' THEN latency * %s" % kv
                        for kv in _MODULE_CPU_COUNT.iteritems())
    query = """\
SELECT COUNT(1) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM (
  -- When logs get split into multiple entries, each has latency calculated
  -- from the start of the request to the point where the log line was emitted.
  -- This means the total latency is the maximum value that appears, not the
  -- sum.
  SELECT FIRST(elog_url_route) AS elog_url_route,
         IFNULL(FIRST(module_id), 'default') AS module_id,
         MAX(latency) AS latency,
         FIRST(url_map_entry) AS url_map_entry
  FROM [logs.requestlogs_%s]
  WHERE LEFT(version_id, 3) != 'znd' # ignore znds
  GROUP BY request_id
)
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data("instance_hours", date, 14,
                                                lambda row: row['url_route'])

    # Munge the table by adding a few columns.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row['instance_hours']

    for row in data:
        row['%% of total'] = row['instance_hours'] / total_instance_hours * 100
        row['per 1k requests'] = row['instance_hours'] / row['count_'] * 1000
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row:
                sparkline_data.append(old_row['instance_hours'] /
                                      old_row['count_'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (per request)'] = sparkline_data

    _ORDER = ('%% of total', 'instance_hours', 'count_', 'per 1k requests',
              'last 2 weeks (per request)', 'url_route')

    subject = 'Instance Hours by Route - '
    heading = 'Cost-normalized instance hours by route for %s' % (
        _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: all_data[:50]},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table[:50]},
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.instance_hours.week_over_week',
                               'url_route',
                               metric_label_col='url_route',
                               data_col='last 2 weeks (per request)',
                               dry_run=dry_run)
Example No. 15
def email_out_of_memory_errors(date):
    # This sends two emails, for two different ways of seeing the data.
    # But we'll have them share the same subject so they thread together.
    yyyymmdd = date.strftime("%Y%m%d")
    subject = 'OOM errors'

    # Out-of-memory errors look like:
    #   Exceeded soft private memory limit with 260.109 MB after servicing 2406 requests total  #@Nolint
    # with SDK 1.9.7, they changed. Note the double-space after "after":
    #   Exceeded soft private memory limit of 512 MB with 515 MB after  servicing 9964 requests total  #@Nolint
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"
    query = """\
SELECT COUNT(module_id) AS count_,
       module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
      AND module_id IS NOT NULL
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_module", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda row: row['module_id'])

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['module_id'])
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this module, then it just didn't
                # OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = [
        'count_', 'last 2 weeks', 'module_id', 'numserved_10th',
        'numserved_50th', 'numserved_90th'
    ]
    data = _convert_table_rows_to_lists(data, _ORDER)

    heading = 'OOM errors by module for %s' % _pretty_date(yyyymmdd)
    email_content = {heading: data}

    query = """\
SELECT COUNT(*) as count_,
       module_id,
       elog_url_route as url_route
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_route", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14, lambda row:
        (row['module_id'], row['url_route']))

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get((row['module_id'], row['url_route']))
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this route/module, then it just
                # didn't OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    data = _convert_table_rows_to_lists(data, _ORDER)

    heading = 'OOM errors by route for %s' % _pretty_date(yyyymmdd)
    email_content[heading] = data

    _send_email(email_content,
                None,
                to=['*****@*****.**'],
                subject=subject)
Example No. 16
def email_memory_increases(date, window_length=20, min_increase_in_mb=1):
    """Emails the increases in memory caused by particular routes.

    It attempts to compute the amount of memory added by each request,
    ignoring memory which is reclaimed in the next few requests.  (The
    number of requests which are checked is specified by the window_length
    parameter.  Routes with a total increase less than the
    min_increase_in_mb parameter are ignored.)
    """
    yyyymmdd = date.strftime("%Y%m%d")
    lead_lengths = range(1, window_length + 1)
    lead_selects = '\n'.join(
        "LEAD(total, %s) OVER (PARTITION BY instance_key ORDER BY start_time) "
        "AS lead_total_%s," % (i, i) for i in lead_lengths)

    fields = ['total'] + ['lead_total_%s' % i for i in lead_lengths]
    # We want to compute the minimal value of added + field - total, where
    # field is one of "total" or one of the "lead_total_i".  BigQuery
    # unfortunately doesn't give us a nice way to do this (at least that I know
    # of, without doing a CROSS JOIN of the table to itself).  One way to do
    # this would be a gigantic nested IF, but this could be exponentially
    # large, and queries have a fixed maximum size.  Instead we use a CASE
    # expression which could be O(n^2); since we don't pay for BigQuery
    # execution time, and n shouldn't be too huge, this seems like a better
    # approach.  In theory it might be better to do O(n) nested queries (each
    # of which does a single pairwise min), but this seems like it could in
    # practice be even slower, depending on the implementation.
    # TODO(benkraft): If I get a useful answer to
    # http://stackoverflow.com/questions/24923101/computing-a-moving-maximum-in-bigquery
    # we should use that instead.  Or, once we have user-defined functions in
    # BigQuery, we can probably do something actually reasonable.
    case_expr = '\n'.join(
        "WHEN %s THEN added + %s - total" %
        (' AND '.join("%s <= %s" % (field1, field2)
                      for field2 in fields if field2 != field1), field1)
        for field1 in fields)

    # This is a kind of large query, so here's what it's doing, from inside to
    # out:
    #   First, extract the memory data from the logs.
    #   Second, compute the appropriate LEAD() columns, which tell us what the
    #       total will be a few requests later on the same instance
    #   Third, compute "real_added", which is the amount we think the request
    #       actually added to the heap, not counting memory which was soon
    #       reclaimed.  This might come out negative, so
    #   Fourth, make sure the memory added is at least zero, since if memory
    #       usage went down, this request probably shouldn't get the credit.
    #   Fifth, group by route and do whatever aggregation we want.  Ignore the
    #       first 25 requests to each module, since those are probably all
    #       loading things that each module has to load, and the request that
    #       does the loading shouldn't be blamed.
    query = """\
SELECT
    COUNT(*) AS count_,
    elog_url_route AS url_route,
    module_id AS module,
    AVG(real_added) AS added_avg,
    NTH(99, QUANTILES(real_added, 101)) AS added_98th,
    SUM(real_added) AS added_total,
FROM (
    SELECT
        IF(real_added > 0, real_added, 0) AS real_added,
        elog_url_route, module_id, num,
    FROM (
        SELECT
            (CASE %s ELSE added END) AS real_added,
            elog_url_route, module_id, num,
        FROM (
            SELECT
                %s
                RANK() OVER (PARTITION BY instance_key
                             ORDER BY start_time) AS num,
                added, total, elog_url_route, module_id,
            FROM (
                SELECT
                    FLOAT(REGEXP_EXTRACT(
                        app_logs.message,
                        "This request added (.*) MB to the heap.")) AS added,
                    FLOAT(REGEXP_EXTRACT(
                        app_logs.message,
                        "Total memory now used: (.*) MB")) AS total,
                    instance_key, start_time, elog_url_route, module_id,
                FROM [logs.requestlogs_%s]
                WHERE app_logs.message CONTAINS 'This request added'
            )
        )
    )
)
WHERE num > 25
GROUP BY url_route, module
ORDER BY added_total DESC
""" % (case_expr, lead_selects, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "memory_increases", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "memory_increases", date, 14, lambda row:
        (row['module'], row['url_route']))

    by_module = collections.defaultdict(list)
    for row in data:
        if row['added_total'] > min_increase_in_mb:
            heading = "Memory increases by route for %s module on %s" % (
                row['module'], _pretty_date(yyyymmdd))
            by_module[heading].append(row)

    _ORDER = [
        'count_', 'added_avg', 'last 2 weeks (avg)', 'added_98th',
        'added_total', 'added %%', 'url_route'
    ]
    for heading in by_module:
        total = sum(row['added_total'] for row in by_module[heading])
        for row in by_module[heading]:
            row['added %%'] = row['added_total'] / total * 100
            sparkline_data = []
            for old_data in historical_data:
                old_row = old_data.get((row['module'], row['url_route']))
                if old_row:
                    sparkline_data.append(old_row['added_avg'])
                else:
                    sparkline_data.append(None)
            row['last 2 weeks (avg)'] = sparkline_data
            del row['module']
        by_module[heading] = _convert_table_rows_to_lists(
            by_module[heading][:50], _ORDER)
    subject = "Memory Increases by Route"
    _send_email(by_module,
                None,
                to=['*****@*****.**'],
                subject=subject)
Example No. 17
def email_applog_sizes(date, dry_run=False):
    """Email app-log report for the given datetime.date object.

    This report says how much we are logging (via logging.info()
    and friends), grouped by the first word of the log message.
    (Which usually, but not always, is a good proxy for a single
    log-message in our app.)  Since we pay per byte logged, we
    want to make sure we're not accidentally logging a single
    log message a ton, which is really easy to do.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    query = """\
SELECT
  REGEXP_EXTRACT(app_logs.message, r'^([a-zA-Z0-9_-]*)') AS firstword,
  FIRST(app_logs.message) as sample_logline,
  SUM(LENGTH(app_logs.message)) / 1024 / 1024 AS size_mb,
  -- This cost comes from https://cloud.google.com/stackdriver/pricing_v2:
  -- "Stackdriver Logging: $0.50/GB".  But it seems like app-log messages
  -- are actually encoded *twice* in the logging data, based on our
  -- best model of app-log sizes vs num-requests vs costs in the billing
  -- reports, so we assume our cost is $1/GB.
  SUM(LENGTH(app_logs.message)) / 1024 / 1024 / 1024 AS cost_usd
FROM
  logs.requestlogs_%s
GROUP BY
  firstword
ORDER BY
  cost_usd DESC
""" % (yyyymmdd)
    data = bq_util.query_bigquery(query)
    data = [row for row in data if row['firstword'] not in (None, '(None)')]
    bq_util.save_daily_data(data, "log_bytes", yyyymmdd)
    historical_data = bq_util.process_past_data("log_bytes", date, 14,
                                                lambda row: row['firstword'])

    # Munge the table by adding a few columns.
    total_bytes = 0.0
    for row in data:
        total_bytes += row['size_mb']

    for row in data:
        row['%% of total'] = row['size_mb'] / total_bytes * 100
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['firstword'])
            if old_row:
                sparkline_data.append(old_row['size_mb'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data
        # While we're here, truncate the sample-logline, since it can get
        # really long.
        row['sample_logline'] = row['sample_logline'][:80]

    _ORDER = ('%% of total', 'size_mb', 'cost_usd', 'last 2 weeks',
              'firstword', 'sample_logline')

    subject = 'Log-bytes by first word of log-message - '
    heading = 'Cost-normalized log-bytes by firstword for %s' % (
        _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: all_data[:50]},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # As of 1 Jun 2018, the most expensive firstword costs about
    # $2/day.  More than $20 a day and we should be very suspicious.
    if any(row[2] > 20 for row in all_data[1:]):  # ignore the header line
        _send_email({heading: all_data[:75]},
                    None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive loglines on %s!' %
                             _pretty_date(yyyymmdd)),
                    dry_run=dry_run)