def __init__(self, label, description=''):
  """Initialize a context manager labeled `label` that measures the wall
  time between enter and exit.

  If description is truthy, the measurement is reported to ts_mon.
  """
  self._label = label
  self._start = None
  self.seconds = 0
  self._metric = None
  if description:
    self._metric = gae_ts_mon.FloatMetric(
        self._Suffix(), description, [gae_ts_mon.StringField('label')], 's')
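
# Illustrative sketch only: the __enter__/__exit__ half of such a timer
# context manager and a typical call site. The class name _ExampleTimer, the
# hard-coded metric name 'example/durations' (standing in for _Suffix()), and
# the method bodies below are assumptions for illustration, not the actual
# implementation.
import time

import gae_ts_mon


class _ExampleTimer(object):
  """Hypothetical stand-in for the class the __init__ above belongs to."""

  def __init__(self, label, description=''):
    self._label = label
    self._start = None
    self.seconds = 0
    self._metric = None
    if description:
      self._metric = gae_ts_mon.FloatMetric(
          'example/durations', description,
          [gae_ts_mon.StringField('label')], 's')

  def __enter__(self):
    self._start = time.time()
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self.seconds = time.time() - self._start
    if self._metric:
      # Report the measured wall time, keyed by the label field.
      self._metric.set(self.seconds, {'label': self._label})
    return False  # do not suppress exceptions


# Typical usage:
#   with _ExampleTimer('fetch_builds', 'Time spent fetching builds'):
#     do_expensive_work()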
# requires the argument to have create_time and start_time.
add_build_scheduling_duration = _duration_adder(  # pragma: no branch
    'scheduling_durations', 'Duration between build creation and start',
    lambda b: _ts_delta_sec(b.create_time, b.start_time))

BUILD_COUNT_PROD = gae_ts_mon.GaugeMetric(
    _METRIC_PREFIX_PROD + 'count',
    'Number of pending/running prod builds',
    _build_fields('bucket', 'builder', 'status'))
BUILD_COUNT_EXPERIMENTAL = gae_ts_mon.GaugeMetric(
    _METRIC_PREFIX_EXPERIMENTAL + 'count',
    'Number of pending/running experimental builds',
    _build_fields('bucket', 'builder', 'status'))
MAX_AGE_SCHEDULED = gae_ts_mon.FloatMetric(
    'buildbucket/builds/max_age_scheduled',
    'Age of the oldest SCHEDULED build.',
    (_build_fields('bucket', 'builder') +
     [gae_ts_mon.BooleanField('must_be_never_leased')]),
    units=gae_ts_mon.MetricsDataUnits.SECONDS)
SEQUENCE_NUMBER_GEN_DURATION_MS = gae_ts_mon.CumulativeDistributionMetric(
    'buildbucket/sequence_number/gen_duration',
    'Duration of a sequence number generation in ms',
    [gae_ts_mon.StringField('sequence')],
    # Bucketer for 1ms..5s range
    bucketer=BUCKETER_5_SEC,
    units=gae_ts_mon.MetricsDataUnits.MILLISECONDS)
TAG_INDEX_INCONSISTENT_ENTRIES = gae_ts_mon.NonCumulativeDistributionMetric(
    'buildbucket/tag_index/inconsistent_entries',
    'Number of inconsistent entries encountered during build search',
    [gae_ts_mon.StringField('tag')],
    # We can't have more than 1000 entries in a tag index.
    bucketer=BUCKETER_1K)
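
# Illustrative sketch, not buildbucket's actual reporting code: gauge metrics
# such as BUILD_COUNT_PROD are typically set periodically (e.g. from a cron
# task), one value per field combination, while distribution metrics such as
# SEQUENCE_NUMBER_GEN_DURATION_MS are fed one sample per measured operation.
# The helper names and the counts dict below are hypothetical.
def _example_report_build_counts(counts):
  # counts: {(bucket, builder, status): number of pending/running builds}.
  for (bucket, builder, status), count in counts.iteritems():
    BUILD_COUNT_PROD.set(count, {
        'bucket': bucket,
        'builder': builder,
        'status': status,
    })


def _example_report_seq_gen_duration(elapsed_ms, sequence_name):
  # One sample per sequence number generation, in milliseconds.
  SEQUENCE_NUMBER_GEN_DURATION_MS.add(elapsed_ms, {'sequence': sequence_name})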
    ], bucketer=_bucketer)

# Global metric. Target fields:
# - hostname = 'autogen:<executor_id>' (bot id).
# Status value must be 'ready', 'running', or anything else, possibly
# swarming-specific, when it cannot run a job. E.g. 'quarantined' or
# 'dead'.
# Note that 'running' will report data as long as the job is running,
# so it is best to restrict data to status == 'pending'.
_jobs_max_pending_duration = gae_ts_mon.FloatMetric(
    'jobs/max_pending_duration',
    'Maximum pending seconds of pending jobs.', [
        gae_ts_mon.StringField('spec_name'),
        gae_ts_mon.StringField('project_id'),
        gae_ts_mon.StringField('subproject_id'),
        gae_ts_mon.StringField('pool'),
        gae_ts_mon.StringField('status'),
    ])

# Global metric. Metric fields:
# - busy = Whether or not the count is for machines that are busy.
# - machine_type = server.lease_management.MachineType.key.id().
_machine_types_actual_size = gae_ts_mon.GaugeMetric(
    'swarming/machine_types/actual_size',
    'Actual number of Machine Provider bots per MachineType.', [
        gae_ts_mon.BooleanField('busy'),
        gae_ts_mon.StringField('machine_type'),
    ])
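
# Illustrative sketch (hypothetical helper, assumed values): a global metric
# like _jobs_max_pending_duration is typically set per field combination, with
# the bot id attached via target_fields as described in the comment above.
def _example_set_max_pending_duration(pending_secs, spec_name, project_id,
                                      subproject_id, pool, status,
                                      executor_id):
  _jobs_max_pending_duration.set(
      pending_secs,
      fields={
          'spec_name': spec_name,
          'project_id': project_id,
          'subproject_id': subproject_id,
          'pool': pool,
          'status': status,
      },
      target_fields={'hostname': 'autogen:%s' % executor_id})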
class ProcessIssue(webapp2.RequestHandler):
  time_since_first_flake = gae_ts_mon.FloatMetric(
      'flakiness_pipeline/time_since_first_flake',
      'The delay in seconds from the moment the first flake occurrence in '
      'this flakiness period happens until an issue is created to track it.',
      None)
  time_since_threshold_exceeded = gae_ts_mon.FloatMetric(
      'flakiness_pipeline/time_since_threshold_exceeded',
      'The delay in seconds from the moment the last flake occurrence that '
      'makes a flake exceed the threshold happens until an issue is created '
      'to track it.',
      None)
  issue_updates = gae_ts_mon.CounterMetric(
      'flakiness_pipeline/issue_updates', 'Issues updated/created',
      [gae_ts_mon.StringField('operation')])

  @ndb.transactional
  def _get_flake_update_singleton_key(self):
    singleton_key = ndb.Key('FlakeUpdateSingleton', 'singleton')
    if not singleton_key.get():
      FlakeUpdateSingleton(key=singleton_key).put()
    return singleton_key

  @ndb.transactional
  def _increment_update_counter(self):
    FlakeUpdate(parent=self._get_flake_update_singleton_key()).put()

  @ndb.non_transactional
  def _time_difference(self, flaky_run):
    return (flaky_run.success_run.get().time_finished -
            flaky_run.failure_run_time_finished).total_seconds()

  @ndb.non_transactional
  def _is_same_day(self, flaky_run):
    time_since_finishing = (
        datetime.datetime.utcnow() - flaky_run.failure_run_time_finished)
    return time_since_finishing <= datetime.timedelta(days=1)

  @ndb.non_transactional
  def _get_new_flakes(self, flake):
    num_runs = len(flake.occurrences) - flake.num_reported_flaky_runs
    flaky_runs = ndb.get_multi(flake.occurrences[-num_runs:])
    flaky_runs = [run for run in flaky_runs if run is not None]
    return [
        flaky_run for flaky_run in flaky_runs
        if self._is_same_day(flaky_run) and
        self._time_difference(flaky_run) <= MAX_TIME_DIFFERENCE_SECONDS]

  @staticmethod
  @ndb.non_transactional
  def _find_flakiness_period_occurrences(flake):
    """Finds all occurrences in the current flakiness period."""
    assert flake.occurrences, 'Flake entity has no occurrences'
    flaky_runs = sorted(
        [run for run in ndb.get_multi(flake.occurrences) if run is not None],
        key=lambda run: run.failure_run_time_finished)

    cur = flaky_runs[-1]
    for i, prev in enumerate(reversed(flaky_runs[:-1])):
      if (cur.failure_run_time_finished - prev.failure_run_time_finished >
          MAX_GAP_FOR_FLAKINESS_PERIOD):
        return flaky_runs[-i-1:]  # not including prev, but including cur
      cur = prev
    return flaky_runs

  @staticmethod
  def _get_time_threshold_exceeded(flakiness_period_occurrences):
    assert flakiness_period_occurrences, 'No occurrences in flakiness period'
    window = []
    for flaky_run in flakiness_period_occurrences:  # pragma: no cover
      window.append(flaky_run)
      # Remove flaky runs that happened more than a day before the latest run.
      flaky_run_finished = flaky_run.failure_run_time_finished
      window = [
          prev_run for prev_run in window
          if flaky_run_finished - prev_run.failure_run_time_finished <=
          datetime.timedelta(days=1)
      ]
      if len(window) >= MIN_REQUIRED_FLAKY_RUNS:
        return flaky_run.failure_run_time_finished

  @ndb.transactional
  def _recreate_issue_for_flake(self, flake):
    """Updates a flake to re-create an issue and creates a respective task."""
    flake.old_issue_id = flake.issue_id
    flake.issue_id = 0
    taskqueue.add(url='/issues/process/%s' % flake.key.urlsafe(),
                  queue_name='issue-updates', transactional=True)

  @staticmethod
  @ndb.non_transactional
  def _update_new_occurrences_with_issue_id(name, new_flaky_runs, issue_id):
    # TODO(sergiyb): Find a way to do this asynchronously to avoid blocking
    # the transaction-bound method calling this. Possible solutions are to use
    # put_multi_async (need to find a way to test this) or to use deferred
    # execution.
    for fr in new_flaky_runs:
      for occ in fr.flakes:
        if occ.failure == name:
          occ.issue_id = issue_id
    ndb.put_multi(new_flaky_runs)

  @staticmethod
  @ndb.non_transactional
  def _report_flakes_to_findit(flake, flaky_runs):
    try:
      findit.FindItAPI().flake(flake, flaky_runs)
    except (httplib.HTTPException, apiclient.errors.Error):
      logging.warning('Failed to send flakes to FindIt', exc_info=True)

  @staticmethod
  def follow_duplication_chain(api, starting_issue_id):
    """Finds the last merged-into issue in the deduplication chain.

    Args:
      api: Issue Tracker API object.
      starting_issue_id: ID of the issue to start with.

    Returns:
      Issue object for the last issue in the chain (can be the same issue as
      passed in if it is not marked as Duplicate) or None if a duplication
      loop is detected.
    """
    seen_issues = set()
    flake_issue = api.getIssue(starting_issue_id)

    # We need to check both status and merged_into, since it's possible to
    # create an issue with Duplicate status but without merged_into field set
    # and vice versa (see http://crbug.com/669054 and http://crbug.com/669056).
    while flake_issue.status == 'Duplicate' and flake_issue.merged_into:
      seen_issues.add(flake_issue.id)
      if flake_issue.merged_into in seen_issues:
        logging.info('Detected issue duplication loop: %s.', seen_issues)
        return None
      flake_issue = api.getIssue(flake_issue.merged_into)

    return flake_issue

  @ndb.transactional
  def _update_issue(self, api, flake, new_flakes, now):
    """Updates an issue on the issue tracker."""
    flake_issue = self.follow_duplication_chain(api, flake.issue_id)

    if flake_issue is None:
      # If the issue duplication loop was detected, we re-create the issue.
      self._recreate_issue_for_flake(flake)
      return

    if flake_issue.id != flake.issue_id:
      # Update the issue ID stored in datastore to avoid following the
      # deduplication chain next time.
      flake.issue_id = flake_issue.id

    if not flake_issue.open:
      # If the issue was closed, we do not update it. This allows changes made
      # to reduce flakiness to propagate and take effect. If after
      # DAYS_TO_REOPEN_ISSUE days we still detect flakiness, we will create a
      # new issue.
      recent_cutoff = now - datetime.timedelta(days=DAYS_TO_REOPEN_ISSUE)
      if flake_issue.updated < recent_cutoff:
        self._recreate_issue_for_flake(flake)
      return

    # Make sure the issue is in the appropriate bug queue while flakiness is
    # ongoing, as the sheriffs are supposed to disable flaky tests. For steps,
    # only return it to the queue if there is no owner on the bug.
    suffix = None
    queue_name, expected_label = get_queue_details(flake.name)
    if expected_label not in flake_issue.labels:
      if not flake.is_step or not flake_issue.owner:
        flake_issue.labels.append(expected_label)
        suffix = RETURN_TO_QUEUE_SUFFIX % queue_name

    new_flaky_runs_msg = FLAKY_RUNS_TEMPLATE % {
        'name': flake.name,
        'new_flakes_count': len(new_flakes),
        'flakes_url': FLAKES_URL_TEMPLATE % flake.key.urlsafe(),
        'suffix': ' %s' % suffix if suffix else ''}
    api.update(flake_issue, comment=new_flaky_runs_msg)
    self.issue_updates.increment_by(1, {'operation': 'update'})
    logging.info('Updated issue %d for flake %s with %d flake runs',
                 flake.issue_id, flake.name, len(new_flakes))
    self._update_new_occurrences_with_issue_id(
        flake.name, new_flakes, flake_issue.id)
    flake.num_reported_flaky_runs = len(flake.occurrences)
    flake.issue_last_updated = now

    self._report_flakes_to_findit(flake, new_flakes)

  @ndb.transactional
  def _create_issue(self, api, flake, new_flakes, now):
    _, qlabel = get_queue_details(flake.name)
    labels = ['Type-Bug', 'Pri-1', 'Via-TryFlakes', qlabel]

    if is_trooper_flake(flake.name):
      other_queue_msg = TROOPER_QUEUE_MSG
    else:
      other_queue_msg = SHERIFF_QUEUE_MSG

    summary = SUMMARY_TEMPLATE % {'name': flake.name}
    description = DESCRIPTION_TEMPLATE % {
        'summary': summary,
        'flakes_url': FLAKES_URL_TEMPLATE % flake.key.urlsafe(),
        'flakes_count': len(new_flakes),
        'other_queue_msg': other_queue_msg,
        'footer': CTF_CAN_FILE_BUGS_FOR_TESTS if flake.is_step
                  else DESCRIPTION_TEST_FOOTER}
    if flake.old_issue_id:
      description = REOPENED_DESCRIPTION_TEMPLATE % {
          'description': description, 'old_issue': flake.old_issue_id}

    new_issue = issue.Issue({'summary': summary,
                             'description': description,
                             'status': 'Untriaged',
                             'labels': labels,
                             'components': ['Tests>Flaky']})
    flake_issue = api.create(new_issue)
    flake.issue_id = flake_issue.id
    self._update_new_occurrences_with_issue_id(
        flake.name, new_flakes, flake_issue.id)
    flake.num_reported_flaky_runs = len(flake.occurrences)
    flake.issue_last_updated = now
    self.issue_updates.increment_by(1, {'operation': 'create'})
    logging.info('Created a new issue %d for flake %s', flake.issue_id,
                 flake.name)
    self._report_flakes_to_findit(flake, new_flakes)

    # Find all flakes in the current flakiness period to compute metrics. The
    # flakiness period is a series of flakes with a gap no larger than
    # MAX_GAP_FOR_FLAKINESS_PERIOD seconds.
    period_flakes = self._find_flakiness_period_occurrences(flake)

    # Compute the delay since the first flake in the current flakiness period.
    time_since_first_flake = (
        now - period_flakes[0].failure_run_time_finished).total_seconds()
    self.time_since_first_flake.set(time_since_first_flake)
    logging.info('Reported time_since_first_flake %d for flake %s',
                 time_since_first_flake, flake.name)

    # Find the first flake that exceeded the threshold needed to create an
    # issue and report the delay from the moment this flake happened until
    # we've actually created the issue.
    time_since_threshold_exceeded = (
        now - self._get_time_threshold_exceeded(period_flakes)).total_seconds()
    self.time_since_threshold_exceeded.set(time_since_threshold_exceeded)
    logging.info('Reported time_since_threshold_exceeded %d for flake %s',
                 time_since_threshold_exceeded, flake.name)

  @ndb.transactional(xg=True)  # pylint: disable=E1120
  def post(self, urlsafe_key):
    api = issue_tracker_api.IssueTrackerAPI('chromium')

    # Check if we should stop processing this issue because we've posted too
    # many updates to the issue tracker today already.
    day_ago = datetime.datetime.utcnow() - datetime.timedelta(days=1)
    num_updates_last_day = FlakeUpdate.query(
        FlakeUpdate.time_updated > day_ago,
        ancestor=self._get_flake_update_singleton_key()).count()
    if num_updates_last_day >= MAX_UPDATED_ISSUES_PER_DAY:
      logging.info('Too many issues updated in the last 24 hours')
      return

    now = datetime.datetime.utcnow()
    flake = ndb.Key(urlsafe=urlsafe_key).get()
    logging.info('Processing %s', flake.key)

    # Only update/file issues if there are new flaky runs.
    if flake.num_reported_flaky_runs == len(flake.occurrences):
      logging.info('No new flakes (reported %d, total %d)',
                   flake.num_reported_flaky_runs, len(flake.occurrences))
      return

    # Retrieve flaky runs outside of the transaction, because we are not
    # planning to modify them and because there could be more of them than the
    # number of groups supported by cross-group transactions on AppEngine.
    new_flakes = self._get_new_flakes(flake)

    if len(new_flakes) < MIN_REQUIRED_FLAKY_RUNS:
      logging.info('Too few new flakes: %d', len(new_flakes))
      return

    if flake.issue_id > 0:
      # Update issues at most once a day.
      if flake.issue_last_updated > now - datetime.timedelta(days=1):
        logging.info('Issue was updated less than 24 hours ago')
        return

      self._update_issue(api, flake, new_flakes, now)
      self._increment_update_counter()
    else:
      self._create_issue(api, flake, new_flakes, now)
      # Don't update the issue just yet, this may fail, and we need the
      # transaction to succeed in order to avoid filing duplicate bugs.
      self._increment_update_counter()

    # Note that if the transaction fails for some reason at this point, we may
    # post updates or create issues multiple times. On the other hand, this
    # should be extremely rare because we set the number of concurrently
    # running tasks to 1, therefore there should be no contention for updating
    # this issue's entity.
    flake.put()
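
# Illustrative, self-contained walk-through of the sliding-window logic in
# ProcessIssue._get_time_threshold_exceeded, using synthetic timestamps and an
# assumed threshold of 3 runs: the returned value is the finish time of the
# run that first brings the one-day window up to the required number of
# flaky runs.
import datetime


def _example_time_threshold_exceeded(finish_times, min_required_runs=3):
  window = []
  for finished in sorted(finish_times):
    window.append(finished)
    # Drop runs that finished more than a day before the current run.
    window = [t for t in window if finished - t <= datetime.timedelta(days=1)]
    if len(window) >= min_required_runs:
      return finished
  return None


# With runs finishing at 00:00, 06:00 and 08:00 on the same day, the 08:00 run
# is the one that exceeds a threshold of 3 runs per day.
_base = datetime.datetime(2017, 1, 1)
assert _example_time_threshold_exceeded([
    _base,
    _base + datetime.timedelta(hours=6),
    _base + datetime.timedelta(hours=8),
]) == _base + datetime.timedelta(hours=8)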
jobs_completed = gae_ts_mon.CounterMetric(
    'jobs/completed',
    description='Number of completed jobs.')

jobs_durations = gae_ts_mon.CumulativeDistributionMetric(
    'jobs/durations', bucketer=_bucketer,
    description='Cycle times of completed jobs, in seconds.')

jobs_pending_durations = gae_ts_mon.NonCumulativeDistributionMetric(
    'jobs/pending_durations', bucketer=_bucketer,
    description='Pending times of active jobs, in seconds.')

jobs_max_pending_duration = gae_ts_mon.FloatMetric(
    'jobs/max_pending_duration',
    description='Maximum pending seconds of pending jobs.')

# Similar to jobs/completed and jobs/duration, but with a dedup field.
# - project_id: e.g. 'chromium'
# - subproject_id: e.g. 'blink'. Set to empty string if not used.
# - spec_name: name of a job specification, e.g. '<master>:<builder>:<test>'
#   for buildbot jobs.
# - deduped: boolean describing whether the job was deduped or not.
jobs_requested = gae_ts_mon.CounterMetric(
    'jobs/requested',
    description='Number of requested jobs over time.')

# Swarming-specific metric. Metric fields:
# - project_id: e.g. 'chromium'