Exemplo n.º 1
0
    def test_reset_cumulative_metrics(self):
        """Checks that the reset clears the counter but keeps the gauge."""
        gauge_metric = gae_ts_mon.GaugeMetric('gauge', 'foo', None)
        counter_metric = gae_ts_mon.CounterMetric('counter', 'foo', None)

        # Give both metrics a value and confirm it can be read back.
        gauge_metric.set(5)
        counter_metric.increment()
        self.assertEqual(5, gauge_metric.get())
        self.assertEqual(1, counter_metric.get())

        config._reset_cumulative_metrics()

        # The gauge still reports 5; the counter no longer reports a value.
        self.assertEqual(5, gauge_metric.get())
        self.assertIsNone(counter_metric.get())
Exemplo n.º 2
0
# Geometric bucketer: each bucket boundary is 10**0.05 (~12%) larger than the
# previous one, so 100 finite buckets span five decades.
_bucketer = gae_ts_mon.GeometricBucketer(growth_factor=10**0.05,
                                         num_finite_buckets=100)

# Regular (instance-local) metrics: jobs/completed and jobs/durations.
# Both have the following metric fields:
# - project_id: e.g. 'chromium'
# - subproject_id: e.g. 'blink'. Set to empty string if not used.
# - pool: e.g. 'Chrome'
# - spec_name: name of a job specification, e.g. '<master>:<builder>'
#     for buildbot jobs.
# - result: one of 'success', 'failure', or 'infra-failure'.

# Counter of completed jobs, keyed by the five string fields listed above.
_jobs_completed = gae_ts_mon.CounterMetric(
    'jobs/completed',
    'Number of completed jobs.', [
        gae_ts_mon.StringField('spec_name'),
        gae_ts_mon.StringField('project_id'),
        gae_ts_mon.StringField('subproject_id'),
        gae_ts_mon.StringField('pool'),
        gae_ts_mon.StringField('result'),
    ])


_jobs_durations = gae_ts_mon.CumulativeDistributionMetric(
    'jobs/durations',
    'Cycle times of completed jobs, in seconds.', [
        gae_ts_mon.StringField('spec_name'),
        gae_ts_mon.StringField('project_id'),
        gae_ts_mon.StringField('subproject_id'),
        gae_ts_mon.StringField('pool'),
        gae_ts_mon.StringField('result'),
    ],
Exemplo n.º 3
0
# A custom bucketer with 12% resolution in the range of 1..10**5
# (growth factor 10**0.05 over 100 finite buckets).
# Used for job cycle times.
_bucketer = gae_ts_mon.GeometricBucketer(growth_factor=10**0.05,
                                     num_finite_buckets=100)

# Regular (instance-local) metrics: jobs/completed and jobs/durations.
# Both have the following metric fields:
# - project_id: e.g. 'chromium'
# - subproject_id: e.g. 'blink'. Set to empty string if not used.
# - executor_id: name of the bot that executed a job, e.g. 'swarm42-m4'
# - spec_name: name of a job specification, e.g. '<master>:<builder>:<test>'
#     for buildbot jobs.
# - result: one of 'success', 'failure', or 'infra-failure'.
# NOTE(review): the fields above are not declared in the constructors below;
# presumably they are supplied when values are reported -- confirm at the
# call sites.

# Counter of completed jobs.
jobs_completed = gae_ts_mon.CounterMetric(
    'jobs/completed',
    description='Number of completed jobs.')


# Cumulative distribution of job cycle times (seconds), using _bucketer.
jobs_durations = gae_ts_mon.CumulativeDistributionMetric(
    'jobs/durations', bucketer=_bucketer,
    description='Cycle times of completed jobs, in seconds.')


# Swarming-specific metric. Metric fields:
# - project_id: e.g. 'chromium'
# - subproject_id: e.g. 'blink'. Set to empty string if not used.
# - spec_name: name of a job specification, e.g. '<master>:<builder>:<test>'
#     for buildbot jobs.
tasks_expired = gae_ts_mon.CounterMetric(
    'swarming/tasks/expired',
Exemplo n.º 4
0
# Copyright 2019 The LUCI Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0
# that can be found in the LICENSE file.
"""Timeseries metrics."""

import gae_ts_mon

from components import auth

import config

# Counter of bytes requested for download, keyed by the client_name,
# client_email and download_source string fields.
_bytes_requested = gae_ts_mon.CounterMetric(
    'downloads/bytes', 'Bytes requested for download by clients.', [
        gae_ts_mon.StringField('client_name'),
        gae_ts_mon.StringField('client_email'),
        gae_ts_mon.StringField('download_source'),
    ])


def file_size(size):
    """Reports the size of a file fetched from GCS by whitelisted clients.

  If the client's requests are not whitelisted for monitoring, does nothing.

  Args:
    size: Size of the file in bytes.
  """
    ip = auth.get_peer_ip()
    for cfg in config.settings().client_monitoring_config:
        if auth.is_in_ip_whitelist(cfg.ip_whitelist, ip):
            _bytes_requested.increment_by(
Exemplo n.º 5
0
def _adder(metric, value_fn):
    """Builds a callable that records a build's value in a distribution metric.

  Fields must be string and one of _ALL_FIELD_NAMES.

  Args:
    metric: the distribution metric to add values to; its field_spec is used
      to derive the fields reported for each build.
    value_fn: a function that accepts a build and returns the value to add.

  Returns:
    A function that accepts a build, adds value_fn(build) to the metric with
    that build's fields, and returns the result of metric.add.
  """
    fields_for = _fields_for_fn(metric.field_spec)

    def add_build_value(build):  # pragma: no cover
        return metric.add(value_fn(build), fields_for(build))

    return add_build_value


# Build lifecycle counters. Each CounterMetric is passed through _incrementer
# (defined elsewhere in this module) and the result is bound to an inc_* name.

# 'buildbucket/builds/created': fields bucket, builder, user_agent.
inc_created_builds = _incrementer(
    gae_ts_mon.CounterMetric('buildbucket/builds/created', 'Build creation',
                             _string_fields('bucket', 'builder',
                                            'user_agent')))
# 'buildbucket/builds/started': fields bucket, builder.
inc_started_builds = _incrementer(
    gae_ts_mon.CounterMetric('buildbucket/builds/started', 'Build start',
                             _string_fields('bucket', 'builder')))
# 'buildbucket/builds/completed': fields bucket, builder, result,
# failure_reason, cancelation_reason.
inc_completed_builds = _incrementer(
    gae_ts_mon.CounterMetric(
        'buildbucket/builds/completed',
        'Build completion, including success, failure and cancellation',
        _string_fields('bucket', 'builder', 'result', 'failure_reason',
                       'cancelation_reason')))
# 'buildbucket/builds/heartbeats': fields bucket, builder, status.
inc_heartbeat_failures = _incrementer(
    gae_ts_mon.CounterMetric('buildbucket/builds/heartbeats',
                             'Failures to extend a build lease',
                             _string_fields('bucket', 'builder', 'status')))
inc_lease_expirations = _incrementer(
Exemplo n.º 6
0
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import gae_ts_mon

# Counter of Swarming tasks triggered, keyed by category and operation.
swarming_tasks = gae_ts_mon.CounterMetric(
    'findit/swarmingtasks', 'Swarming tasks triggered',
    [gae_ts_mon.StringField('category'),
     gae_ts_mon.StringField('operation')])

# Counter of failed outgoing HTTP requests, keyed by host and exception.
outgoing_http_errors = gae_ts_mon.CounterMetric(
    'findit/outgoinghttperrors', 'Failed http requests to various servers',
    [gae_ts_mon.StringField('host'),
     gae_ts_mon.StringField('exception')])

# Counter of HTTP requests to external services, keyed by host and
# status_code.
outgoing_http_statuses = gae_ts_mon.CounterMetric(
    'findit/outgoinghttpstatuses', 'Http requests to external services',
    [gae_ts_mon.StringField('host'),
     gae_ts_mon.StringField('status_code')])

# Counter of bugs updated with findings, keyed by category and operation.
issues = gae_ts_mon.CounterMetric(
    'findit/issues', 'Bugs updated with findings',
    [gae_ts_mon.StringField('category'),
     gae_ts_mon.StringField('operation')])

flakes = gae_ts_mon.CounterMetric(
    'findit/flakes', 'Flakes requested or analyzed', [
        gae_ts_mon.StringField('source'),
        gae_ts_mon.StringField('operation'),
        gae_ts_mon.StringField('trigger'),
Exemplo n.º 7
0
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import gae_ts_mon

# Counter of Swarming tasks triggered, keyed by category and operation.
swarming_tasks = gae_ts_mon.CounterMetric(
    'findit/swarmingtasks', 'Swarming tasks triggered',
    [gae_ts_mon.StringField('category'),
     gae_ts_mon.StringField('operation')])

# Counter of failed outgoing HTTP requests, keyed by host and exception.
outgoing_http_errors = gae_ts_mon.CounterMetric(
    'findit/outgoinghttperrors', 'Failed http requests to various servers',
    [gae_ts_mon.StringField('host'),
     gae_ts_mon.StringField('exception')])

# Counter of bugs updated with findings, keyed by category and operation.
issues = gae_ts_mon.CounterMetric(
    'findit/issues', 'Bugs updated with findings',
    [gae_ts_mon.StringField('category'),
     gae_ts_mon.StringField('operation')])

# Counter of flakes requested or analyzed, keyed by source, operation and
# trigger.
flakes = gae_ts_mon.CounterMetric('findit/flakes',
                                  'Flakes requested or analyzed', [
                                      gae_ts_mon.StringField('source'),
                                      gae_ts_mon.StringField('operation'),
                                      gae_ts_mon.StringField('trigger')
                                  ])

try_jobs = gae_ts_mon.CounterMetric('findit/try-jobs', 'Try jobs triggered', [
    gae_ts_mon.StringField('operation'),
    gae_ts_mon.StringField('type'),
Exemplo n.º 8
0
 def mk_metric(metric_prefix):
     """Returns a CounterMetric named metric_prefix + metric_suffix.

     metric_suffix, description and fields are not parameters; they are read
     from the surrounding scope.
     """
     full_name = metric_prefix + metric_suffix
     return gae_ts_mon.CounterMetric(full_name, description, fields)
Exemplo n.º 9
0
# cycle times.
_bucketer = gae_ts_mon.GeometricBucketer(growth_factor=10**0.05,
                                         num_finite_buckets=100)

# Regular (instance-local) metrics: jobs/completed and jobs/durations.
# Both have the following metric fields:
# - project_id: e.g. 'chromium'
# - subproject_id: e.g. 'blink'. Set to empty string if not used.
# - pool: e.g. 'Chrome'
# - spec_name: name of a job specification, e.g. '<master>:<builder>'
#     for buildbot jobs.
# - result: one of 'success', 'failure', or 'infra-failure'.

# Counter of completed jobs.
_jobs_completed = gae_ts_mon.CounterMetric(
    'jobs/completed', 'Number of completed jobs.', [
        gae_ts_mon.StringField('spec_name'),
        gae_ts_mon.StringField('project_id'),
        gae_ts_mon.StringField('subproject_id'),
        gae_ts_mon.StringField('pool'),
        gae_ts_mon.StringField('result'),
    ])

# Cumulative distribution of job cycle times (seconds), bucketed with
# _bucketer and declared with the same five fields as the counter.
_jobs_durations = gae_ts_mon.CumulativeDistributionMetric(
    'jobs/durations',
    'Cycle times of completed jobs, in seconds.', [
        gae_ts_mon.StringField('spec_name'),
        gae_ts_mon.StringField('project_id'),
        gae_ts_mon.StringField('subproject_id'),
        gae_ts_mon.StringField('pool'),
        gae_ts_mon.StringField('result'),
    ],
    bucketer=_bucketer)
Exemplo n.º 10
0
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import gae_ts_mon

# Counter of crash reports processed, with boolean fields describing what was
# found for each report plus the client_id string field.
reports_processed = gae_ts_mon.CounterMetric(
    'predator/reports_count',
    'Metric counting the number of crash reports that Predator has processed. '
    'Contains fields describing whether Predator was successful at finding a '
    'regression range, a components, or suspect changes for each report.', [
        gae_ts_mon.BooleanField('found_suspects'),
        gae_ts_mon.BooleanField('found_components'),
        gae_ts_mon.BooleanField('has_regression_range'),
        gae_ts_mon.StringField('client_id'),
        gae_ts_mon.BooleanField('success')
    ])

# Gauge of wrong suspected CLs per day, keyed by client_id.
wrong_cls = gae_ts_mon.GaugeMetric(
    'predator/wrong_cls',
    'Number of wrong suspected cls found by Predator per '
    'day. Contains fields describing which client this wrong cl comes from, '
    'can be clusterfuzz or cracas.', [gae_ts_mon.StringField('client_id')])

# Gauge of wrong suspected components per day, keyed by client_id.
wrong_components = gae_ts_mon.GaugeMetric(
    'predator/wrong_components',
    'Number of wrong suspected components found by '
    'Predator per day. Contains fields describing which client this wrong cl '
    'comes from, can be clusterfuzz or cracas.',
    [gae_ts_mon.StringField('client_id')])
Exemplo n.º 11
0

import httplib
import logging
import pprint
import time
import threading
import urllib2
import webapp2

import gae_ts_mon

from google.appengine.api.modules import modules

# Field-less counter used only to exercise ts_mon in this load test.
metric = gae_ts_mon.CounterMetric('test/dsansome/loadtest',
    'Dummy metric for testing ts_mon on Appengine',
    None)


class IncrementHandler(webapp2.RequestHandler):
  """Increments the load-test metric 'count' times and reports the duration."""

  def get(self):
    """Writes the seconds spent incrementing the metric to the response.

    The number of increments is read from the 'count' request parameter.
    """
    requested = self.request.params['count']
    began = time.time()
    for _ in xrange(int(requested)):
      metric.increment()
    elapsed = time.time() - began

    self.response.write(elapsed)


main_handlers = [
Exemplo n.º 12
0
from google.appengine.api import urlfetch
from google.appengine.ext import deferred
from google.appengine.ext import ndb
from google.appengine.runtime import DeadlineExceededError
from handlers.flake_issues import MIN_REQUIRED_FLAKY_RUNS
from model.build_run import BuildRun
from model.build_run import PatchsetBuilderRuns
from model.fetch_status import FetchStatus
from model.flake import Flake
from model.flake import FlakeOccurrence
from model.flake import FlakyRun
from status import build_result, util
import time_functions.timestamp

# Counter of requests made to the chromium-cq-status API, keyed by status.
requests_metric = gae_ts_mon.CounterMetric(
    'flakiness_pipeline/cq_status/requests',
    'Requests made to the chromium-cq-status API',
    [gae_ts_mon.StringField('status')])
# Field-less counter of detected flake occurrences.
flakes_metric = gae_ts_mon.CounterMetric(
    'flakiness_pipeline/flake_occurrences_detected',
    'Detected flake occurrences', None)
# Field-less counter of errors hit while parsing chromium-cq-status records.
parsing_errors = gae_ts_mon.CounterMetric(
    'flakiness_pipeline/cq_status_parsing_errors',
    'Number of errors when parsing records returned by chromium-cq-status',
    None)
# Non-cumulative distribution of per-flake occurrence counts over the last
# day.
occurrences_per_flake_day = gae_ts_mon.NonCumulativeDistributionMetric(
    'flakiness_pipeline/occurrences_per_flake/day',
    'Distribution of flake occurrence counts, calculated over all flakes in '
    'the last day', None)
occurrences_per_flake_week = gae_ts_mon.NonCumulativeDistributionMetric(
    'flakiness_pipeline/occurrences_per_flake/week',
    'Distribution of flake occurrence counts, calculated over all flakes in '
Exemplo n.º 13
0
# Copyright 2015 The LUCI Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0
# that can be found in the LICENSE file.

"""Metrics to track with ts_mon and event_mon."""

import gae_ts_mon


# Counter of lease requests that were deduplicated.
lease_requests_deduped = gae_ts_mon.CounterMetric(
    'machine_provider/lease_requests/deduped',
    description='Number of lease requests deduplicated.',
)


# Counter of lease requests that expired.
lease_requests_expired = gae_ts_mon.CounterMetric(
    'machine_provider/lease_requests/expired',
    description='Number of lease requests expired.',
)


# Counter of lease requests that were fulfilled.
lease_requests_fulfilled = gae_ts_mon.CounterMetric(
    'machine_provider/lease_requests/fulfilled',
    description='Number of lease requests fulfilled.',
)


# Counter of lease requests received.
lease_requests_received = gae_ts_mon.CounterMetric(
    'machine_provider/lease_requests/received',
    description='Number of lease requests received.',
)
Exemplo n.º 14
0
# Copyright 2015 The LUCI Authors. All rights reserved.
# Use of this source code is governed under the Apache License, Version 2.0
# that can be found in the LICENSE file.
"""Metrics to track with ts_mon and event_mon."""

import gae_ts_mon

# Field-less counter of lease requests that were deduplicated.
lease_requests_deduped = gae_ts_mon.CounterMetric(
    'machine_provider/lease_requests/deduped',
    'Number of lease requests deduplicated.',
    None,
)

# Field-less counter of lease requests that expired.
lease_requests_expired = gae_ts_mon.CounterMetric(
    'machine_provider/lease_requests/expired',
    'Number of lease requests expired.',
    None,
)

# Field-less counter of lease requests that were fulfilled.
lease_requests_fulfilled = gae_ts_mon.CounterMetric(
    'machine_provider/lease_requests/fulfilled',
    'Number of lease requests fulfilled.',
    None,
)

# Cumulative distribution of the time taken to fulfill a lease request,
# bucketed geometrically with a growth factor of 10**0.04.
lease_requests_fulfilled_time = gae_ts_mon.CumulativeDistributionMetric(
    'machine_provider/lease_requests/fulfilled/time',
    'Time taken to fulfill a lease request.',
    None,
    bucketer=gae_ts_mon.GeometricBucketer(growth_factor=10**0.04),
)
Exemplo n.º 15
0
class ProcessIssue(webapp2.RequestHandler):
  """Creates or updates an issue-tracker issue for a single Flake entity.

  post() is the entry point. It decides whether the flake has enough new flaky
  runs to report and then either updates the existing issue or files a new
  one, recording ts_mon metrics along the way.
  """

  # Seconds between the first occurrence in the current flakiness period and
  # the creation of the issue. Set in _create_issue.
  time_since_first_flake = gae_ts_mon.FloatMetric(
      'flakiness_pipeline/time_since_first_flake',
      'The delay in seconds from the moment first flake occurrence in this '
      'flakiness period happens and until the time an issue is created to '
      'track it.',
      None)
  # Seconds between the occurrence that pushed the flake over the reporting
  # threshold and the creation of the issue. Set in _create_issue.
  time_since_threshold_exceeded = gae_ts_mon.FloatMetric(
      'flakiness_pipeline/time_since_threshold_exceeded',
      'The delay in seconds from the moment when the last flake occurrence '
      'happens that makes a flake exceed the threshold and until the time an '
      'issue is created to track it.',
      None)
  # Counter of issue tracker writes; 'operation' is 'update' or 'create'.
  issue_updates = gae_ts_mon.CounterMetric(
      'flakiness_pipeline/issue_updates',
      'Issues updated/created',
      [gae_ts_mon.StringField('operation')])

  @ndb.transactional
  def _get_flake_update_singleton_key(self):
    """Returns the FlakeUpdateSingleton key, creating the entity if missing."""
    singleton_key = ndb.Key('FlakeUpdateSingleton', 'singleton')
    if not singleton_key.get():
      FlakeUpdateSingleton(key=singleton_key).put()
    return singleton_key

  @ndb.transactional
  def _increment_update_counter(self):
    """Records one issue update as a FlakeUpdate child of the singleton."""
    FlakeUpdate(parent=self._get_flake_update_singleton_key()).put()

  @ndb.non_transactional
  def _time_difference(self, flaky_run):
    """Returns seconds from the failed run finishing to the success run."""
    return (flaky_run.success_run.get().time_finished -
            flaky_run.failure_run_time_finished).total_seconds()

  @ndb.non_transactional
  def _is_same_day(self, flaky_run):
    """Returns True if the failed run finished at most one day before now."""
    time_since_finishing = (
        datetime.datetime.utcnow() - flaky_run.failure_run_time_finished)
    return time_since_finishing <= datetime.timedelta(days=1)

  @ndb.non_transactional
  def _get_new_flakes(self, flake):
    """Returns the not-yet-reported flaky runs that are worth reporting.

    Only the trailing occurrences beyond flake.num_reported_flaky_runs are
    fetched. Runs that no longer exist, that finished more than a day ago, or
    whose failure-to-success gap exceeds MAX_TIME_DIFFERENCE_SECONDS are
    dropped.
    """
    num_runs = len(flake.occurrences) - flake.num_reported_flaky_runs
    # NOTE(review): if num_runs is 0 the slice below is [-0:], which selects
    # every occurrence. post() returns early in that case before calling this.
    flaky_runs = ndb.get_multi(flake.occurrences[-num_runs:])
    flaky_runs = [run for run in flaky_runs if run is not None]
    return [
      flaky_run for flaky_run in flaky_runs
      if self._is_same_day(flaky_run) and
         self._time_difference(flaky_run) <= MAX_TIME_DIFFERENCE_SECONDS]

  @staticmethod
  @ndb.non_transactional
  def _find_flakiness_period_occurrences(flake):
    """Finds all occurrences in the current flakiness period.

    Occurrences are sorted by failure_run_time_finished and walked backwards
    from the latest one. The period ends at the first pair of neighbours that
    are more than MAX_GAP_FOR_FLAKINESS_PERIOD apart.

    Returns:
      The list of flaky runs in the current period, oldest first.
    """
    assert flake.occurrences, 'Flake entity has no occurrences'
    flaky_runs = sorted([run for run in ndb.get_multi(flake.occurrences)
                             if run is not None],
                        key=lambda run: run.failure_run_time_finished)

    # NOTE(review): raises IndexError if every fetched run was None.
    cur = flaky_runs[-1]
    for i, prev in enumerate(reversed(flaky_runs[:-1])):
      if (cur.failure_run_time_finished - prev.failure_run_time_finished >
          MAX_GAP_FOR_FLAKINESS_PERIOD):
        return flaky_runs[-i-1:]  # not including prev, but including cur
      cur = prev
    return flaky_runs

  @staticmethod
  def _get_time_threshold_exceeded(flakiness_period_occurrences):
    """Returns the finish time of the run that first exceeded the threshold.

    Slides a one-day window over the occurrences in the order given and
    returns the failure_run_time_finished of the first run for which the
    window holds at least MIN_REQUIRED_FLAKY_RUNS runs.

    NOTE(review): implicitly returns None if no window ever reaches the
    threshold; _create_issue subtracts the result from a datetime.
    """
    assert flakiness_period_occurrences, 'No occurrences in flakiness period'
    window = []
    for flaky_run in flakiness_period_occurrences:  # pragma: no cover
      window.append(flaky_run)

      # Remove flaky runs that happened more than a day before the latest run.
      flaky_run_finished = flaky_run.failure_run_time_finished
      window = [
          prev_run for prev_run in window
          if flaky_run_finished - prev_run.failure_run_time_finished <=
             datetime.timedelta(days=1)
      ]

      if len(window) >= MIN_REQUIRED_FLAKY_RUNS:
        return flaky_run.failure_run_time_finished

  @ndb.transactional
  def _recreate_issue_for_flake(self, flake):
    """Updates a flake to re-create an issue and creates a respective task."""
    # Remember the old issue so the new description can refer back to it.
    flake.old_issue_id = flake.issue_id
    flake.issue_id = 0
    taskqueue.add(url='/issues/process/%s' % flake.key.urlsafe(),
                  queue_name='issue-updates', transactional=True)

  @staticmethod
  @ndb.non_transactional
  def _update_new_occurrences_with_issue_id(name, new_flaky_runs, issue_id):
    """Stamps issue_id on occurrences of `name` in the runs and saves them."""
    # TODO(sergiyb): Find a way to do this asynchronously to avoid block
    # transaction-bound method calling this. Possible solutions are to use
    # put_multi_sync (need to find a way to test this) or to use deferred
    # execution.
    for fr in new_flaky_runs:
      for occ in fr.flakes:
        if occ.failure == name:
          occ.issue_id = issue_id
    ndb.put_multi(new_flaky_runs)

  @staticmethod
  @ndb.non_transactional
  def _report_flakes_to_findit(flake, flaky_runs):
    """Sends the flake to FindIt; failures are logged and not re-raised."""
    try:
      findit.FindItAPI().flake(flake, flaky_runs)
    except (httplib.HTTPException, apiclient.errors.Error):
      logging.warning('Failed to send flakes to FindIt', exc_info=True)

  @staticmethod
  def follow_duplication_chain(api, starting_issue_id):
    """Finds last merged-into issue in the deduplication chain.

    Args:
      api: Issue Tracker API object.
      starting_issue_id: ID of the issue to start with.

    Returns:
      Issue object for the last issue in the chain (can be the same issue as
      passed in if it is not marked as Duplicate) or None if duplication loop is
      detected.
    """
    seen_issues = set()
    flake_issue = api.getIssue(starting_issue_id)
    # We need to check both status and merged_into, since it's possible to
    # create an issue with Duplicate status but without merged_into field set
    # and vice versa (see http://crbug.com/669054 and http://crbug.com/669056).
    while flake_issue.status == 'Duplicate' and flake_issue.merged_into:
      seen_issues.add(flake_issue.id)
      if flake_issue.merged_into in seen_issues:
        logging.info('Detected issue duplication loop: %s.', seen_issues)
        return None
      flake_issue = api.getIssue(flake_issue.merged_into)

    return flake_issue

  @ndb.transactional
  def _update_issue(self, api, flake, new_flakes, now):
    """Updates an issue on the issue tracker.

    Args:
      api: Issue Tracker API object.
      flake: Flake entity whose issue_id names the issue to update.
      new_flakes: flaky runs to report in the new comment.
      now: datetime used for the reopen cutoff and for issue_last_updated.
    """
    flake_issue = self.follow_duplication_chain(api, flake.issue_id)

    if flake_issue is None:
      # If the issue duplication loop was detected, we re-create the issue.
      self._recreate_issue_for_flake(flake)
      return

    if flake_issue.id != flake.issue_id:
      # Update the issue ID stored in datastore to avoid following deduplication
      # chain next time.
      flake.issue_id = flake_issue.id

    if not flake_issue.open:
      # If the issue was closed, we do not update it. This allows changes made
      # to reduce flakiness to propagate and take effect. If after
      # DAYS_TO_REOPEN_ISSUE days we still detect flakiness, we will create a
      # new issue.
      recent_cutoff = now - datetime.timedelta(days=DAYS_TO_REOPEN_ISSUE)
      if flake_issue.updated < recent_cutoff:
        self._recreate_issue_for_flake(flake)
      return

    # Make sure issue is in the appropriate bug queue as flakiness is ongoing as
    # the sheriffs are supposed to disable flaky tests. For steps, only return
    # if there is no owner on the bug.
    suffix = None
    queue_name, expected_label = get_queue_details(flake.name)
    if expected_label not in flake_issue.labels:
      if not flake.is_step or not flake_issue.owner:
        flake_issue.labels.append(expected_label)
        suffix = RETURN_TO_QUEUE_SUFFIX % queue_name

    new_flaky_runs_msg = FLAKY_RUNS_TEMPLATE % {
        'name': flake.name,
        'new_flakes_count': len(new_flakes),
        'flakes_url': FLAKES_URL_TEMPLATE % flake.key.urlsafe(),
        'suffix': ' %s' % suffix if suffix else ''}
    api.update(flake_issue, comment=new_flaky_runs_msg)
    self.issue_updates.increment_by(1, {'operation': 'update'})
    logging.info('Updated issue %d for flake %s with %d flake runs',
                 flake.issue_id, flake.name, len(new_flakes))
    self._update_new_occurrences_with_issue_id(
        flake.name, new_flakes, flake_issue.id)
    # Mark every current occurrence as reported.
    flake.num_reported_flaky_runs = len(flake.occurrences)
    flake.issue_last_updated = now

    self._report_flakes_to_findit(flake, new_flakes)

  @ndb.transactional
  def _create_issue(self, api, flake, new_flakes, now):
    """Files a new issue for the flake and reports the delay metrics.

    Args:
      api: Issue Tracker API object.
      flake: Flake entity to file the issue for; its issue_id is updated.
      new_flakes: flaky runs that triggered the issue.
      now: datetime used for issue_last_updated and the delay metrics.
    """
    _, qlabel = get_queue_details(flake.name)
    labels = ['Type-Bug', 'Pri-1', 'Via-TryFlakes', qlabel]
    if is_trooper_flake(flake.name):
      other_queue_msg = TROOPER_QUEUE_MSG
    else:
      other_queue_msg = SHERIFF_QUEUE_MSG

    summary = SUMMARY_TEMPLATE % {'name': flake.name}
    description = DESCRIPTION_TEMPLATE % {
        'summary': summary,
        'flakes_url': FLAKES_URL_TEMPLATE % flake.key.urlsafe(),
        'flakes_count': len(new_flakes),
        'other_queue_msg': other_queue_msg,
        'footer': CTF_CAN_FILE_BUGS_FOR_TESTS if flake.is_step
                                              else DESCRIPTION_TEST_FOOTER}
    # An old issue ID is set when the issue is being re-created.
    if flake.old_issue_id:
      description = REOPENED_DESCRIPTION_TEMPLATE % {
          'description': description, 'old_issue': flake.old_issue_id}

    new_issue = issue.Issue({'summary': summary,
                             'description': description,
                             'status': 'Untriaged',
                             'labels': labels,
                             'components': ['Tests>Flaky']})
    flake_issue = api.create(new_issue)
    flake.issue_id = flake_issue.id
    self._update_new_occurrences_with_issue_id(
        flake.name, new_flakes, flake_issue.id)
    flake.num_reported_flaky_runs = len(flake.occurrences)
    flake.issue_last_updated = now
    self.issue_updates.increment_by(1, {'operation': 'create'})
    logging.info('Created a new issue %d for flake %s', flake.issue_id,
                 flake.name)

    self._report_flakes_to_findit(flake, new_flakes)

    # Find all flakes in the current flakiness period to compute metrics. The
    # flakiness period is a series of flakes with a gap no larger than
    # MAX_GAP_FOR_FLAKINESS_PERIOD seconds.
    period_flakes = self._find_flakiness_period_occurrences(flake)

    # Compute the delay since the first flake in the current flakiness period.
    time_since_first_flake = (
        now - period_flakes[0].failure_run_time_finished).total_seconds()
    self.time_since_first_flake.set(time_since_first_flake)
    logging.info('Reported time_since_first_flake %d for flake %s',
                 time_since_first_flake, flake.name)

    # Find the first flake that exceeded the threshold needed to create an
    # issue and report delay from the moment this flake happened and until
    # we've actually created the issue.
    time_since_threshold_exceeded = (
        now - self._get_time_threshold_exceeded(period_flakes)).total_seconds()
    self.time_since_threshold_exceeded.set(time_since_threshold_exceeded)
    logging.info('Reported time_since_threshold_exceeded %d for flake %s',
                 time_since_threshold_exceeded, flake.name)


  @ndb.transactional(xg=True)  # pylint: disable=E1120
  def post(self, urlsafe_key):
    """Creates or updates the issue for the flake with the given key.

    Returns without touching the issue tracker when the daily update budget
    is used up, when there are no new occurrences, when too few of them
    qualify, or when the existing issue was updated less than a day ago.

    Args:
      urlsafe_key: urlsafe-encoded ndb key of the Flake entity to process.
    """
    api = issue_tracker_api.IssueTrackerAPI('chromium')

    # Check if we should stop processing this issue because we've posted too
    # many updates to issue tracker today already.
    day_ago = datetime.datetime.utcnow() - datetime.timedelta(days=1)
    num_updates_last_day = FlakeUpdate.query(
        FlakeUpdate.time_updated > day_ago,
        ancestor=self._get_flake_update_singleton_key()).count()
    if num_updates_last_day >= MAX_UPDATED_ISSUES_PER_DAY:
      logging.info('Too many issues updated in the last 24 hours')
      return

    now = datetime.datetime.utcnow()
    flake = ndb.Key(urlsafe=urlsafe_key).get()
    logging.info('Processing %s', flake.key)

    # Only update/file issues if there are new flaky runs.
    if flake.num_reported_flaky_runs == len(flake.occurrences):
      logging.info(
          'No new flakes (reported %d, total %d)',
          flake.num_reported_flaky_runs, len(flake.occurrences))
      return

    # Retrieve flaky runs outside of the transaction, because we are not
    # planning to modify them and because there could be more of them than the
    # number of groups supported by cross-group transactions on AppEngine.
    new_flakes = self._get_new_flakes(flake)

    if len(new_flakes) < MIN_REQUIRED_FLAKY_RUNS:
      logging.info('Too few new flakes: %d', len(new_flakes))
      return

    # A positive issue_id means an issue already exists for this flake.
    if flake.issue_id > 0:
      # Update issues at most once a day.
      if flake.issue_last_updated > now - datetime.timedelta(days=1):
        logging.info('Issue was updated less than 24 hours ago')
        return

      self._update_issue(api, flake, new_flakes, now)
      self._increment_update_counter()
    else:
      self._create_issue(api, flake, new_flakes, now)
      # Don't update the issue just yet, this may fail, and we need the
      # transaction to succeed in order to avoid filing duplicate bugs.
      self._increment_update_counter()

    # Note that if transaction fails for some reason at this point, we may post
    # updates or create issues multiple times. On the other hand, this should be
    # extremely rare because we set the number of concurrently running tasks to
    # 1, therefore there should be no contention for updating this issue's
    # entity.
    flake.put()
Exemplo n.º 16
0
# Build lifecycle incrementers. _incrementer and _build_fields are defined
# elsewhere in this module; each call receives a metric name suffix, a
# description and the fields to report.
inc_started_builds = _incrementer('started', 'Build start',
                                  _build_fields('bucket', 'builder', 'canary'))
inc_completed_builds = _incrementer(
    'completed',
    'Build completion, including success, failure and cancellation',
    _build_fields('bucket', 'builder', 'result', 'failure_reason',
                  'cancelation_reason', 'canary'))
inc_lease_expirations = _incrementer(
    'lease_expired', 'Build lease expirations',
    _build_fields('bucket', 'builder', 'status'))
inc_leases = _incrementer('leases',
                          'Successful build leases or lease extensions',
                          _build_fields('bucket', 'builder'))

# Unlike the incrementers above, this is the bound increment method of a
# counter declared directly with an empty field list.
inc_heartbeat_failures = gae_ts_mon.CounterMetric(
    'buildbucket/builds/heartbeats', 'Failures to extend a build lease',
    []).increment


def _ts_delta_sec(start, end):  # pragma: no cover
    assert start.seconds
    assert end.seconds
    return end.seconds - start.seconds


# Adds the number of seconds between a build's create_time and end_time to
# the 'cycle_durations' metric. Requires the argument to have create_time and
# end_time.
add_build_cycle_duration = _duration_adder(  # pragma: no branch
    'cycle_durations', 'Duration between build creation and completion',
    lambda b: _ts_delta_sec(b.create_time, b.end_time))

# requires the argument to have start_time and end_time.
Exemplo n.º 17
0
class CreateFlakyRun(webapp2.RequestHandler):
  """Request handler that records a flaky run and the flakes observed in it.

  A flaky run pairs a failed build run with a successful one. The POST handler
  inspects the steps of the failed build, works out which steps or individual
  tests should be counted as flakes, stores a FlakyRun entity and appends it to
  the occurrences of every affected Flake entity.
  """

  # Incremented once for every FlakyRun entity stored by post().
  flaky_runs = gae_ts_mon.CounterMetric(
      'flakiness_pipeline/flake_occurrences_recorded',
      'Recorded flake occurrences.',
      None)

  # We execute below method in an independent transaction since otherwise we
  # would exceed the maximum number of entities allowed within a single
  # transaction.
  @staticmethod
  # pylint: disable=E1120
  @ndb.transactional(xg=True, propagation=ndb.TransactionOptions.INDEPENDENT)
  def add_failure_to_flake(name, flaky_run_key, failure_time, is_step):
    """Records a flaky run on the named Flake, creating the Flake if needed.

    Args:
      name: Flake name; also used as the id of the Flake entity.
      flaky_run_key: Key of the FlakyRun to append to the flake's occurrences.
      failure_time: Time of the failure, forwarded to
          util.add_occurrence_time_to_flake.
      is_step: True when the flake is a whole step rather than a single test.
    """
    flake = Flake.get_by_id(name)
    if not flake:
      flake = Flake(name=name, id=name, last_time_seen=datetime.datetime.min,
                    is_step=is_step)
      flake.put()

    flake.occurrences.append(flaky_run_key)
    # TODO(sergiyb): This is necessary to update existing flakes. Remove in July
    # 2016 or later.
    flake.is_step = is_step
    util.add_occurrence_time_to_flake(flake, failure_time)
    flake.put()

  @classmethod
  def _flatten_tests(cls, tests, delimiter):
    """Finds all passed, failed and skipped tests in tests trie.

    Test names are produced by concatenating parent node names with delimiter.

    We only return 3 types of tests:
     - passed, i.e. expected is "PASS" and last actual run is "PASS"
     - failed, i.e. expected is "PASS" and last actual run is "FAIL", "TIMEOUT"
       or "CRASH"
     - skipped, i.e. expected and actual are both "SKIP"

    We do not classify or return any other tests, in particular:
     - known flaky, i.e. expected to have varying results, e.g. "PASS FAIL".
     - known failing, i.e. expected is "FAIL", "TIMEOUT" or "CRASH".
     - unexpected flakiness, i.e. failures that happened before last PASS.

    Args:
      tests: Any non-leaf node of the hierarchical GTest JSON test structure.
      delimiter: Delimiter to use for concatenating parts of test name.

    Returns:
      A tuple (passed, failed, skipped), where each is a list of test names.
    """
    passed = []
    failed = []
    skipped = []
    for name, test in flatten_tests_trie(tests, delimiter).iteritems():
      if test['expected'] == ['PASS']:
        # Only the outcome of the final attempt decides the classification.
        last_result = test['actual'][-1]
        if last_result == 'PASS':
          passed.append(name)
        elif last_result in ('FAIL', 'TIMEOUT', 'CRASH'):
          failed.append(name)
      elif test['expected'] == ['SKIP'] and test['actual'] == ['SKIP']:
        skipped.append(name)

    return passed, failed, skipped

  @classmethod
  def get_flakes(cls, mastername, buildername, buildnumber, step):
    """Returns a list of flakes in a given step.

    It can either be entire step or a list of specific tests.

    Args:
      mastername: Master name on which step has been run.
      buildername: Builder name on which step has been run.
      buildnumber: Number of the build in which step has been run.
      step: Step dictionary; its 'name' and 'text' entries are read.

    Returns:
      (flakes, is_step), where flakes is a list of flake names and is_step is
      True when the whole step is a flake, in which case flakes is a list
      containing a single entry - the name of the step.
    """
    # If test results were invalid, report whole step as flaky.
    steptext = ' '.join(step['text'])
    stepname = normalize_test_type(step['name'])
    if 'TEST RESULTS WERE INVALID' in steptext:
      return [stepname], True

    url = TEST_RESULTS_URL_TEMPLATE % {
      'mastername': urllib2.quote(mastername),
      'buildername': urllib2.quote(buildername),
      'buildnumber': urllib2.quote(str(buildnumber)),
      'stepname': urllib2.quote(stepname),
    }

    try:
      result = urlfetch.fetch(url)

      if 200 <= result.status_code < 400:
        json_result = json.loads(result.content)

        _, failed, _ = cls._flatten_tests(
            json_result.get('tests', {}),
            json_result.get('path_delimiter', '/'))
        # Too many individual failures: treat the whole step as one flake.
        if len(failed) > MAX_INDIVIDUAL_FLAKES_PER_STEP:
          return [stepname], True
        return failed, False

      if result.status_code == 404:
        # This is quite a common case (only some failing steps are actually
        # running tests and reporting results to flakiness dashboard).
        logging.info('Failed to retrieve JSON from %s', url)
      else:
        # No exception is being handled at this point, so logging.exception
        # would attach a meaningless empty traceback; log a plain error.
        logging.error('Failed to retrieve JSON from %s', url)
    except Exception:
      # Best effort: any fetch/parse problem falls back to reporting the whole
      # step as flaky below.
      logging.exception('Failed to retrieve or parse JSON from %s', url)

    return [stepname], True

  @ndb.transactional(xg=True)  # pylint: disable=E1120
  def post(self):
    """Stores a FlakyRun for a failed/successful run pair and updates Flakes.

    Expects 'failure_run_key' and 'success_run_key' request parameters holding
    urlsafe ndb keys. Responds with 400 if either parameter is missing, with
    404 if either key does not resolve to an entity and with 500 if the build
    JSON cannot be fetched. Nothing is stored when every failed step ends up
    being ignored.
    """
    failure_run_key = self.request.get('failure_run_key')
    success_run_key = self.request.get('success_run_key')
    if not failure_run_key or not success_run_key:
      self.response.set_status(400, 'Invalid request parameters')
      return

    failure_run = ndb.Key(urlsafe=failure_run_key).get()
    success_run = ndb.Key(urlsafe=success_run_key).get()
    if failure_run is None or success_run is None:
      # Key.get() returns None for a missing entity; fail explicitly instead of
      # crashing with an AttributeError further down.
      logging.error('Run entities not found: failure_run_key=%s, '
                    'success_run_key=%s', failure_run_key, success_run_key)
      self.response.set_status(404, 'Run not found')
      return

    flaky_run = FlakyRun(
        failure_run=failure_run.key,
        failure_run_time_started=failure_run.time_started,
        failure_run_time_finished=failure_run.time_finished,
        success_run=success_run.key)

    failure_time = failure_run.time_finished
    patchset_builder_runs = failure_run.key.parent().get()

    master = BuildRun.removeMasterPrefix(patchset_builder_runs.master)
    url = ('https://chrome-build-extract.appspot.com/p/' + master +
           '/builders/' + patchset_builder_runs.builder +'/builds/' +
           str(failure_run.buildnumber) + '?json=1')
    urlfetch.set_default_fetch_deadline(60)
    logging.info('get_flaky_run_reason %s', url)
    response = urlfetch.fetch(url)
    if 400 <= response.status_code <= 599:
      logging.error('The request to %s has returned %d: %s', url,
                    response.status_code, response.content)
      self.response.set_status(500, 'Failed to fetch build.')
      return
    json_result = json.loads(response.content)
    steps = json_result['steps']

    failed_steps = []
    for step in steps:
      result = step['results'][0]
      if build_result.isResultSuccess(result):
        continue
      if not build_result.isResultFailure(result):
        continue
      step_name = step['name']
      step_text = ' '.join(step['text'])
      if step_name in IGNORED_STEPS:
        continue

      # Custom (non-trivial) rules for ignoring flakes in certain steps:
      #  - [swarming] ...: summary step would also be red (do not double count)
      #  - Patch failure: ignore non-infra failures as they are typically due to
      #    changes in the code on HEAD
      #  - bot_update PATCH FAILED: Duplicates failure in 'Patch failure' step.
      #  - ... (retry summary): this is an artificial step to fail the build due
      #    to another step that has failed earlier (do not double count).
      if (step_name.startswith('[swarming]') or
          (step_name == 'Patch failure' and result != build_result.EXCEPTION) or
          (step_name == 'bot_update' and 'PATCH FAILED' in step_text)):
        continue

      failed_steps.append(step)

    # Names of failed steps that must not be counted as flakes.
    steps_to_ignore = set()
    for step in failed_steps:
      step_name = step['name']
      if '(with patch)' not in step_name:
        continue
      # Ignore any steps from the same test suite, which is determined by the
      # normalized step name. Additionally, if the step fails without patch,
      # ignore the original step as well because tree is busted.
      normalized_step_name = normalize_test_type(step_name, True)
      for other_step in failed_steps:
        if other_step == step:
          continue
        normalized_other_step_name = normalize_test_type(
            other_step['name'], True)
        if normalized_other_step_name == normalized_step_name:
          steps_to_ignore.add(other_step['name'])
          if '(without patch)' in other_step['name']:
            steps_to_ignore.add(step['name'])

    flakes_to_update = []
    for step in failed_steps:
      step_name = step['name']
      if step_name in steps_to_ignore:
        continue
      flakes, is_step = self.get_flakes(
          master, patchset_builder_runs.builder, failure_run.buildnumber, step)
      for flake in flakes:
        flake_occurrence = FlakeOccurrence(name=step_name, failure=flake)
        flaky_run.flakes.append(flake_occurrence)
        flakes_to_update.append((flake, is_step))

    # Do not create FlakyRuns if all failed steps have been ignored.
    if not flaky_run.flakes:
      return

    flaky_run_key = flaky_run.put()
    for flake, is_step in flakes_to_update:
      self.add_failure_to_flake(flake, flaky_run_key, failure_time, is_step)
    self.flaky_runs.increment_by(1)
Exemplo n.º 18
0
import config
import errors
import events
import model
import search
import swarming
import user

# NOTE(review): presumably the cap on builds returned by one call -- the use
# site is not in this part of the file; confirm.
MAX_RETURN_BUILDS = 100
# Lease duration of one minute; by its name, the default when none is given.
DEFAULT_LEASE_DURATION = datetime.timedelta(minutes=1)

# A cumulative counter of access denied errors in peek() method.
# This metric exists because defining it on the buildbucket server is easier
# than modifying Buildbot. It is very specific intentionally.
PEEK_ACCESS_DENIED_ERROR_COUNTER = gae_ts_mon.CounterMetric(
    'buildbucket/peek_access_denied_errors',
    'Number of errors in peek API',
    [
        gae_ts_mon.StringField('bucket'),
    ],
)


def validate_lease_key(lease_key):
    """Raises errors.InvalidInputError when lease_key is None.

    Any other value, including falsy ones such as 0 or '', is accepted.
    """
    if lease_key is not None:
        return
    raise errors.InvalidInputError('Lease key is not provided')


def validate_url(url):
    if url is None:
        return
    if not isinstance(url, basestring):
        raise errors.InvalidInputError('url must be string')
    parsed = urlparse.urlparse(url)
    if not parsed.netloc:
Exemplo n.º 19
0
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import gae_ts_mon

# Counter with a boolean 'found_suspects' field and a string 'client_id'
# field.
found_suspects = gae_ts_mon.CounterMetric(
    'predator/found_suspects',
    'Metric monitoring whether Predator found CLs for the crash report. '
    'This metric has fields like: '
    '{found_suspects: True/False, client_id: Cracas/Fracas/Clusterfuzz}',
    [
        gae_ts_mon.BooleanField('found_suspects'),
        gae_ts_mon.StringField('client_id'),
    ],
)

# Counter with a boolean 'has_regression_range' field and a string
# 'client_id' field.
has_regression_range = gae_ts_mon.CounterMetric(
    'predator/has_regression_range',
    'Metric monitoring whether Predator has regression range for the crash '
    'report. This metric has fields like: '
    '{has_regression_range: True/False, client_id: Cracas/Fracas/Clusterfuzz}',
    [
        gae_ts_mon.BooleanField('has_regression_range'),
        gae_ts_mon.StringField('client_id'),
    ],
)

found_components = gae_ts_mon.CounterMetric(
    'predator/found_components',
    'Metric monitoring whether Predator found components for the crash report. '
    'This metric has fields like: {found_components: True/False, client_id: '
    'Cracas/Fracas/Clusterfuzz}', [
        gae_ts_mon.BooleanField('found_components'),
        gae_ts_mon.StringField('client_id')