class BuildRun(ndb.Model):  # pragma: no cover
  """A single completed try-job build.

  Stored as a child of an entity (fetched via self.key.parent()) that
  provides the `master` and `builder` attributes used to construct URLs.
  """

  @staticmethod
  def removeMasterPrefix(master):
    """Strips a leading 'master.' prefix from a master name, if present."""
    if master.startswith('master.'):
      return master[len('master.'):]
    else:
      return master

  def getURL(self):
    """Returns the build.chromium.org URL for this build."""
    parent = self.key.parent().get()
    return ('https://build.chromium.org/p/' +
            self.removeMasterPrefix(parent.master) +
            '/builders/' + parent.builder +
            '/builds/' + str(self.buildnumber))

  def getMiloURL(self):
    """Returns the Milo URL for this build, or None for old builds.

    In July 2016, protobuf changed and URLs for earlier builds do not open,
    hence no URL is returned for builds finished before August 2016.
    """
    if self.time_finished < datetime.datetime(2016, 8, 1):
      return
    parent = self.key.parent().get()
    return ('https://luci-milo.appspot.com/buildbot/' +
            self.removeMasterPrefix(parent.master) + '/' +
            parent.builder + '/' + str(self.buildnumber))

  # Build number of this build on the given master/builder.
  buildnumber = ndb.IntegerProperty(required=True)
  # Numeric result code; interpreted via the build_result module.
  result = ndb.IntegerProperty(required=True)
  time_finished = ndb.DateTimeProperty(required=True)
  # Defaults to datetime.max, e.g. for records predating this field.
  time_started = ndb.DateTimeProperty(default=datetime.datetime.max)
  # Flags derived from `result`; recomputed by ndb on each write.
  is_success = ndb.ComputedProperty(
      lambda self: build_result.isResultSuccess(self.result))
  is_failure = ndb.ComputedProperty(
      lambda self: build_result.isResultFailure(self.result))
class BuildRun(ndb.Model):
  """A single completed try-job build.

  Stored as a child of an entity (fetched via self.key.parent()) that
  provides the `master` and `builder` attributes used to construct URLs.
  """

  def getURL(self):
    """Returns the build.chromium.org URL for this build."""
    # Fetch the parent entity once instead of issuing two identical
    # datastore RPCs for master and builder.
    parent = self.key.parent().get()
    return ('http://build.chromium.org/p/' + parent.master +
            '/builders/' + parent.builder +
            '/builds/' + str(self.buildnumber))

  # Build number of this build on the given master/builder.
  buildnumber = ndb.IntegerProperty(required=True)
  # Numeric result code; interpreted via the build_result module.
  result = ndb.IntegerProperty(required=True)
  time_finished = ndb.DateTimeProperty(required=True)
  # Flags derived from `result`; recomputed by ndb on each write.
  is_success = ndb.ComputedProperty(
      lambda self: build_result.isResultSuccess(self.result))
  is_failure = ndb.ComputedProperty(
      lambda self: build_result.isResultFailure(self.result))
def post(self):
  """Creates a FlakyRun from a failure/success run pair (task handler).

  Expects urlsafe ndb keys in the 'failure_run_key' and 'success_run_key'
  request parameters, fetches the failed build's JSON from
  chrome-build-extract, and records a FlakeOccurrence for each detected
  flake. Responds 400 on missing parameters and 500 if the fetch fails.
  """
  if (not self.request.get('failure_run_key') or
      not self.request.get('success_run_key')):
    self.response.set_status(400, 'Invalid request parameters')
    return

  failure_run = ndb.Key(urlsafe=self.request.get('failure_run_key')).get()
  success_run = ndb.Key(urlsafe=self.request.get('success_run_key')).get()

  flaky_run = FlakyRun(
      failure_run=failure_run.key,
      failure_run_time_started=failure_run.time_started,
      failure_run_time_finished=failure_run.time_finished,
      success_run=success_run.key)

  failure_time = failure_run.time_finished
  patchset_builder_runs = failure_run.key.parent().get()

  master = BuildRun.removeMasterPrefix(patchset_builder_runs.master)
  url = ('https://chrome-build-extract.appspot.com/p/' + master +
         '/builders/' + patchset_builder_runs.builder + '/builds/' +
         str(failure_run.buildnumber) + '?json=1')
  urlfetch.set_default_fetch_deadline(60)
  logging.info('get_flaky_run_reason ' + url)
  response = urlfetch.fetch(url)
  if response.status_code >= 400 and response.status_code <= 599:
    logging.error('The request to %s has returned %d: %s', url,
                  response.status_code, response.content)
    self.response.set_status(500, 'Failed to fetch build.')
    return
  json_result = json.loads(response.content)
  steps = json_result['steps']

  failed_steps = []
  passed_steps = []
  for step in steps:
    result = step['results'][0]
    if build_result.isResultSuccess(result):
      passed_steps.append(step)
      continue
    if not build_result.isResultFailure(result):
      continue
    step_name = step['name']
    step_text = ' '.join(step['text'])

    if step_name in IGNORED_STEPS:
      continue

    # Custom (non-trivial) rules for ignoring flakes in certain steps:
    #  - [swarming] ...: summary step would also be red (do not double count)
    #  - Patch failure: ignore non-infra failures as they are typically due to
    #    changes in the code on HEAD
    #  - bot_update PATCH FAILED: Duplicates failure in 'Patch failure' step.
    #  - ... (retry summary): this is an artificial step to fail the build due
    #    to another step that has failed earlier (do not double count).
    # NOTE(review): no condition below matches ' (retry summary)' —
    # presumably those steps are covered by IGNORED_STEPS; confirm.
    if (step_name.startswith('[swarming]') or
        (step_name == 'Patch failure' and result != build_result.EXCEPTION) or
        (step_name == 'bot_update' and 'PATCH FAILED' in step_text)):
      continue
    failed_steps.append(step)

  steps_to_ignore = []
  for step in failed_steps:
    step_name = step['name']
    if '(with patch)' in step_name:
      # Ignore any steps from the same test suite, which is determined by the
      # normalized step name. Additionally, if the step fails without patch,
      # ignore the original step as well because tree is busted.
      normalized_step_name = normalize_test_type(step_name, True)
      for other_step in failed_steps:
        if other_step == step:
          continue
        normalized_other_step_name = normalize_test_type(
            other_step['name'], True)
        if normalized_other_step_name == normalized_step_name:
          steps_to_ignore.append(other_step['name'])
          if '(without patch)' in other_step['name']:
            steps_to_ignore.append(step['name'])

  flakes_to_update = []
  for step in failed_steps:
    step_name = step['name']
    if step_name in steps_to_ignore:
      continue
    flakes, is_step = self.get_flakes(
        master, patchset_builder_runs.builder, failure_run.buildnumber, step)
    for flake in flakes:
      flake_occurrence = FlakeOccurrence(name=step_name, failure=flake)
      flaky_run.flakes.append(flake_occurrence)
      flakes_to_update.append((flake, is_step))

  # Do not create FlakyRuns if all failed steps have been ignored.
  if not flaky_run.flakes:
    return

  flaky_run_key = flaky_run.put()
  for flake, is_step in flakes_to_update:
    self.add_failure_to_flake(flake, flaky_run_key, failure_time, is_step)
  self.flaky_runs.increment_by(1)
def post(self):
  """Creates a FlakyRun from a failure/success run pair (task handler).

  Expects urlsafe ndb keys in the 'failure_run_key' and 'success_run_key'
  request parameters, fetches the failed build's JSON from buildbot, and
  records a FlakeOccurrence for each detected flake. Responds 400 on
  missing parameters; returns silently on malformed JSON.
  """
  if (not self.request.get('failure_run_key') or
      not self.request.get('success_run_key')):
    self.response.set_status(400, 'Invalid request parameters')
    return

  failure_run = ndb.Key(urlsafe=self.request.get('failure_run_key')).get()
  success_run = ndb.Key(urlsafe=self.request.get('success_run_key')).get()
  flaky_run = FlakyRun(
      failure_run=failure_run.key,
      failure_run_time_started=failure_run.time_started,
      failure_run_time_finished=failure_run.time_finished,
      success_run=success_run.key)
  success_time = success_run.time_finished
  failure_time = failure_run.time_finished
  patchset_builder_runs = failure_run.key.parent().get()

  # TODO(sergiyb): The parsing logic below is very fragile and will break with
  # any changes to step names and step text. We should move away from parsing
  # buildbot to tools like flakiness dashboard (test-results.appspot.com),
  # which uses a standardized JSON format.
  url = ('http://build.chromium.org/p/' + patchset_builder_runs.master +
         '/json/builders/' + patchset_builder_runs.builder + '/builds/' +
         str(failure_run.buildnumber))
  urlfetch.set_default_fetch_deadline(60)
  logging.info('get_flaky_run_reason ' + url)
  result = urlfetch.fetch(url).content
  try:
    json_result = json.loads(result)
  except ValueError:
    logging.exception('couldnt decode json for %s', url)
    return
  steps = json_result['steps']

  failed_steps = []
  passed_steps = []
  for step in steps:
    result = step['results'][0]
    if build_result.isResultSuccess(result):
      passed_steps.append(step)
      continue
    if not build_result.isResultFailure(result):
      continue
    step_name = step['name']
    step_text = ' '.join(step['text'])

    # The following step failures are ignored:
    #  - steps: always red when any other step is red (not a failure)
    #  - [swarming] ...: summary step would also be red (do not double count)
    #  - presubmit: typically red due to missing OWNERs LGTM, not a flake
    #  - recipe failure reason: always red when build fails (not a failure)
    #  - Patch failure: if success run was before failure run, it is
    #    likely a legitimate failure. For example it often happens that
    #    developers use CQ dry run and then wait for a review. Once getting
    #    LGTM they check CQ checkbox, but the patch does not cleanly apply
    #    anymore.
    #  - bot_update PATCH FAILED: Corresponds to 'Patch failure' step.
    #  - test results: always red when another step is red (not a failure)
    #  - Uncaught Exception: summary step referring to an exception in another
    #    step (e.g. bot_update)
    #  - ... (retry summary): this is an artificial step to fail the build due
    #    to another step that has failed earlier (do not double count).
    if (step_name == 'steps' or step_name.startswith('[swarming]') or
        step_name == 'presubmit' or step_name == 'recipe failure reason' or
        (step_name == 'Patch failure' and success_time < failure_time) or
        (step_name == 'bot_update' and 'PATCH FAILED' in step_text) or
        step_name == 'test results' or step_name == 'Uncaught Exception' or
        step_name.endswith(' (retry summary)')):
      continue
    failed_steps.append(step)

  steps_to_ignore = []
  for step in failed_steps:
    step_name = step['name']
    if ' (with patch)' in step_name:
      # Android instrumentation tests add a prefix before the step name, which
      # doesn't appear on the summary step (without suffixes). To make sure we
      # correctly ignore duplicate failures, we remove the prefix.
      step_name = step_name.replace('Instrumentation test ', '')

      # If a step fails without the patch, then the tree is busted. Don't
      # count as flake.
      step_name_with_no_modifier = step_name.replace(' (with patch)', '')
      step_name_without_patch = (
          '%s (without patch)' % step_name_with_no_modifier)
      for other_step in failed_steps:
        if other_step['name'] == step_name_without_patch:
          steps_to_ignore.append(step['name'])
          steps_to_ignore.append(other_step['name'])

  flakes_to_update = []
  for step in failed_steps:
    step_name = step['name']
    if step_name in steps_to_ignore:
      continue
    flakes = self.get_flakes(
        patchset_builder_runs.master, patchset_builder_runs.builder,
        failure_run.buildnumber, step)
    for flake in flakes:
      flake_occurrence = FlakeOccurrence(name=step_name, failure=flake)
      flaky_run.flakes.append(flake_occurrence)
      flakes_to_update.append(flake)

  flaky_run_key = flaky_run.put()
  for flake in flakes_to_update:
    self.add_failure_to_flake(flake, flaky_run_key, failure_time)
  self.flaky_runs.increment_by(1)
def parse_cq_data(json_data):
  """Parses CQ status records and records try-job build runs.

  For each 'verifier_jobs_update' record of a Chromium try job, creates a
  BuildRun entity per non-duplicate completed build. Whenever a
  failure/success pair for the same patchset+builder is found, queues a
  '/issues/create_flaky_run' task to analyze the failure.

  Returns:
    A list of '<builder><buildnumber>' strings, one per detected flaky run
    (used for logging).
  """
  logging_output = []
  for result in json_data.get('results', {}):
    fields = result.get('fields', [])
    if not 'action' in fields:
      continue

    action = fields.get('action')
    if action != 'verifier_jobs_update':
      continue

    if fields.get('verifier') != 'try job':
      continue

    # At the moment, much of the parsing logic assumes this is a Chromium
    # tryjob.
    if fields.get('project') != 'chromium':
      continue

    job_states = fields.get('jobs', [])
    for state in job_states:
      # Just go by |result|.
      #if state not in ['JOB_SUCCEEDED', 'JOB_FAILED', 'JOB_TIMED_OUT']:
      #  continue

      for job in job_states[state]:
        build_properties = job.get('build_properties')
        if not build_properties:
          continue

        try:
          master = job['master']
          builder = job['builder']
          result = job['result']
          timestamp_tz = dateutil.parser.parse(job['timestamp'])
          # We assume timestamps from chromium-cq-status are already in UTC.
          timestamp = timestamp_tz.replace(tzinfo=None)
        except KeyError:
          continue

        try:
          buildnumber = get_int_value(build_properties, 'buildnumber')
          issue = get_int_value(build_properties, 'issue')
          patchset = get_int_value(build_properties, 'patchset')
          attempt_start_ts = get_int_value(build_properties,
                                           'attempt_start_ts')
          # attempt_start_ts is in microseconds (divided by 1e6 below);
          # TODO confirm against chromium-cq-status.
          time_started = datetime.datetime.utcfromtimestamp(
              attempt_start_ts / 1000000)
        except ValueError:
          continue

        if build_result.isResultPending(result):
          continue

        # At this point, only success or failure.
        success = build_result.isResultSuccess(result)

        patchset_builder_runs = get_patchset_builder_runs(issue=issue,
                                                          patchset=patchset,
                                                          master=master,
                                                          builder=builder)

        build_run = BuildRun(parent=patchset_builder_runs.key,
                             buildnumber=buildnumber,
                             result=result,
                             time_started=time_started,
                             time_finished=timestamp)

        previous_runs = BuildRun.query(
            ancestor=patchset_builder_runs.key).fetch()

        duplicate = False
        for previous_run in previous_runs:
          # We saw this build run already or there are multiple green runs,
          # in which case we ignore subsequent ones to avoid showing failures
          # multiple times.
          if (previous_run.buildnumber == buildnumber) or \
             (build_run.is_success and previous_run.is_success) :
            duplicate = True
            break

        if duplicate:
          continue

        build_run.put()

        for previous_run in previous_runs:
          # Only pair runs with the opposite outcome.
          if previous_run.is_success == build_run.is_success:
            continue
          if success:
            # We saw the flake and then the pass.
            failure_run = previous_run
            success_run = build_run
          else:
            # We saw the pass and then the failure. Could happen when fetching
            # historical data, or for the bot_update step (patch can't be
            # applied cleanly anymore).
            failure_run = build_run
            success_run = previous_run

          logging_output.append(failure_run.key.parent().get().builder +
                                str(failure_run.buildnumber))

          # Queue a task to fetch the error of this failure and create
          # FlakyRun.
          flakes_metric.increment_by(1)
          taskqueue.add(
              queue_name='issue-updates',
              url='/issues/create_flaky_run',
              params={'failure_run_key': failure_run.key.urlsafe(),
                      'success_run_key': success_run.key.urlsafe()})

  return logging_output
def post(self):
  """Creates a FlakyRun from a failure/success run pair (task handler).

  Expects urlsafe ndb keys in the 'failure_run_key' and 'success_run_key'
  request parameters, fetches the failed build's buildbot-style JSON via the
  Milo pRPC endpoint (base64-wrapped), and records a FlakeOccurrence for
  each detected flake. Responds 400 on missing parameters and 500 if the
  fetch fails.
  """
  if (not self.request.get('failure_run_key') or
      not self.request.get('success_run_key')):
    self.response.set_status(400, 'Invalid request parameters')
    return

  failure_run = ndb.Key(urlsafe=self.request.get('failure_run_key')).get()
  success_run = ndb.Key(urlsafe=self.request.get('success_run_key')).get()

  flaky_run = FlakyRun(
      failure_run=failure_run.key,
      failure_run_time_started=failure_run.time_started,
      failure_run_time_finished=failure_run.time_finished,
      success_run=success_run.key)

  failure_time = failure_run.time_finished
  patchset_builder_runs = failure_run.key.parent().get()

  master = BuildRun.removeMasterPrefix(patchset_builder_runs.master)
  url = ('https://luci-milo.appspot.com/'
         'prpc/milo.Buildbot/GetBuildbotBuildJSON')
  request = json.dumps({
      'master': master,
      'builder': patchset_builder_runs.builder,
      'buildNum': failure_run.buildnumber,
  })
  headers = {
      'Content-Type': 'application/json',
      'Accept': 'application/json',
  }
  urlfetch.set_default_fetch_deadline(60)
  logging.info('get_flaky_run_reason: %s, %s', url, request)
  response = urlfetch.fetch(
      url, payload=request, method=urlfetch.POST, headers=headers,
      validate_certificate=True)
  if response.status_code != 200:
    logging.error('The request to %s has returned %d: %s', url,
                  response.status_code, response.content)
    self.response.set_status(500, 'Failed to fetch build.')
    return
  # Milo prefixes its JSON responses with an anti-XSSI prefix; strip it
  # before decoding. The payload itself is base64-encoded JSON.
  content = response.content
  if content.startswith(_MILO_RESPONSE_PREFIX):
    content = content[len(_MILO_RESPONSE_PREFIX):]
  data = json.loads(content)['data']
  json_result = json.loads(base64.b64decode(data))
  steps = json_result['steps']

  failed_steps = []
  passed_steps = []
  for step in steps:
    result = step['results'][0]
    if build_result.isResultSuccess(result):
      passed_steps.append(step)
      continue
    if not build_result.isResultFailure(result):
      continue
    # For Luci builds, some steps don't have step text anymore. Such steps
    # include 'Failure reason', 'analyze', etc.
    step_text = ' '.join(step['text'] or [])
    step_name = step['name']

    if step_name in IGNORED_STEPS:
      continue

    # Custom (non-trivial) rules for ignoring flakes in certain steps:
    #  - [swarming] ...: summary step would also be red (do not double count)
    #  - Patch failure: ignore non-infra failures as they are typically due to
    #    changes in the code on HEAD
    #  - bot_update PATCH FAILED: Duplicates failure in 'Patch failure' step.
    #  - ... (retry summary): this is an artificial step to fail the build due
    #    to another step that has failed earlier (do not double count).
    # NOTE(review): no condition below matches ' (retry summary)' —
    # presumably those steps are covered by IGNORED_STEPS; confirm.
    if (step_name.startswith('[swarming]') or
        (step_name == 'Patch failure' and result != build_result.EXCEPTION) or
        (step_name == 'bot_update' and 'PATCH FAILED' in step_text)):
      continue
    failed_steps.append(step)

  steps_to_ignore = []
  for step in failed_steps:
    step_name = step['name']
    if '(with patch)' in step_name:
      # Ignore any steps from the same test suite, which is determined by the
      # normalized step name. Additionally, if the step fails without patch,
      # ignore the original step as well because tree is busted.
      normalized_step_name = normalize_test_type(step_name, True)
      for other_step in failed_steps:
        if other_step == step:
          continue
        normalized_other_step_name = normalize_test_type(
            other_step['name'], True)
        if normalized_other_step_name == normalized_step_name:
          steps_to_ignore.append(other_step['name'])
          if '(without patch)' in other_step['name']:
            steps_to_ignore.append(step['name'])

  flakes_to_update = []
  for step in failed_steps:
    step_name = step['name']
    if step_name in steps_to_ignore:
      continue
    flakes, is_step = self.get_flakes(
        master, patchset_builder_runs.builder, failure_run.buildnumber, step)
    if is_step and not is_infra_step_flake(step_name):
      continue  # Ignore step-level flakes of non-infra steps.
    for flake in flakes:
      flake_occurrence = FlakeOccurrence(name=step_name, failure=flake)
      flaky_run.flakes.append(flake_occurrence)
      flakes_to_update.append((flake, is_step))

  # Do not create FlakyRuns if all failed steps have been ignored.
  if not flaky_run.flakes:
    return

  flaky_run_key = flaky_run.put()
  for flake, is_step in flakes_to_update:
    if self.is_duplicate_occurrence(flake, flaky_run):
      logging.info('Not adding duplicate occurrence for the same CL')
      continue
    self.add_failure_to_flake(flake, flaky_run_key, failure_time, is_step)
  self.flaky_runs.increment_by(1)
def parse_cq_data(json_data):
  """Parses CQ status records and records try-job build runs.

  For each 'verifier_jobs_update' record of a chromium/chromium/src try job,
  creates a BuildRun entity per non-duplicate completed build. Whenever a
  failure/success pair for the same patchset+builder is found, queues a
  '/issues/create_flaky_run' task to analyze the failure. Malformed records
  are counted via parsing_errors and skipped.

  Returns:
    A list of '<builder><buildnumber>' strings, one per detected flaky run
    (used for logging).
  """
  logging_output = []
  for result in json_data.get('results', {}):
    fields = result.get('fields', [])
    if not 'action' in fields:
      logging.warning('Missing field action in status record')
      parsing_errors.increment_by(1)
      continue

    action = fields.get('action')
    if action != 'verifier_jobs_update':
      continue

    if fields.get('verifier') != 'try job':
      continue

    # At the moment, much of the parsing logic assumes this is a Chromium
    # tryjob.
    project = fields.get('project')
    if project != 'chromium/chromium/src':
      logging.info('project not chromium: %s', project)
      continue

    job_states = fields.get('jobs', {})
    # Job state buckets are flattened; each job's own |result| is used.
    for job in itertools.chain.from_iterable(job_states.values()):
      try:
        builder = job['builder']
        result = job['result']
        timestamp_tz = dateutil.parser.parse(
            job.get('created_ts') or job['timestamp'])
        # We assume timestamps from chromium-cq-status are already in UTC.
        timestamp = timestamp_tz.replace(tzinfo=None)
      except KeyError:
        logging.warning('Failed to parse job details', exc_info=True)
        parsing_errors.increment_by(1)
        continue

      if build_result.isResultPending(result):
        continue

      build_properties = job.get('build_properties')
      if not build_properties:
        logging.warning(
            'Missing field build_properties in job details')
        parsing_errors.increment_by(1)
        continue

      # Fallbacks used when the corresponding property is missing.
      issue = -1
      patchset = -1
      time_started = 0

      try:
        buildnumber = get_int_value(build_properties, 'buildnumber')
        if 'patch_issue' in build_properties:
          issue = get_int_value(build_properties, 'patch_issue')
        else:  # pragma: no cover
          logging.warning('no issue')

        if 'patch_set' in build_properties:
          patchset = get_int_value(build_properties, 'patch_set')
        else:  # pragma: no cover
          logging.warning('no patchset')

        if 'attempt_start_ts' in build_properties:
          attempt_start_ts = get_int_value(build_properties,
                                           'attempt_start_ts')
          # attempt_start_ts is in microseconds (divided by 1e6 below);
          # TODO confirm against chromium-cq-status.
          time_started = datetime.datetime.utcfromtimestamp(
              attempt_start_ts / 1000000)
        else:  # pragma: no cover
          logging.warning('no attempt_start_ts')
          continue

        # For builds through Buildbucket, job['master'] is actually the bucket
        # name. For buildbot-based builds, it just happens to be the same as
        # the master name. For Luci-based builds, it is different from the
        # master name, and the master name is set as a build property instead.
        # https://chromium.googlesource.com/chromium/src/+/infra/config/cr-buildbucket.cfg#115
        # So in either case, the "real" master name is in the build properties.
        master = build_properties['mastername']
      except (ValueError, KeyError):
        logging.warning('Failed to parse build properties', exc_info=True)
        parsing_errors.increment_by(1)
        continue

      # At this point, only success or failure.
      success = build_result.isResultSuccess(result)

      patchset_builder_runs = get_patchset_builder_runs(
          issue=issue, patchset=patchset, master=master, builder=builder)

      build_run = BuildRun(parent=patchset_builder_runs.key,
                           buildnumber=buildnumber,
                           result=result,
                           time_started=time_started,
                           time_finished=timestamp)

      previous_runs = BuildRun.query(
          ancestor=patchset_builder_runs.key).fetch()

      duplicate = False
      for previous_run in previous_runs:
        # We saw this build run already or there are multiple green runs,
        # in which case we ignore subsequent ones to avoid showing failures
        # multiple times.
        if (previous_run.buildnumber == buildnumber) or \
           (build_run.is_success and previous_run.is_success) :
          duplicate = True
          break

      if duplicate:
        continue

      build_run.put()

      for previous_run in previous_runs:
        # Only pair runs with the opposite outcome.
        if previous_run.is_success == build_run.is_success:
          continue
        if success:
          # We saw the flake and then the pass.
          failure_run = previous_run
          success_run = build_run
        else:
          # We saw the pass and then the failure. Could happen when fetching
          # historical data, or for the bot_update step (patch can't be
          # applied cleanly anymore).
          failure_run = build_run
          success_run = previous_run

        logging_output.append(failure_run.key.parent().get().builder +
                              str(failure_run.buildnumber))

        # Queue a task to fetch the error of this failure and create FlakyRun.
        flakes_metric.increment_by(1)
        taskqueue.add(queue_name='issue-updates',
                      url='/issues/create_flaky_run',
                      params={
                          'failure_run_key': failure_run.key.urlsafe(),
                          'success_run_key': success_run.key.urlsafe()
                      })

  return logging_output
def parse_cq_data(json_data):
  """Parses CQ status records and records try-job build runs.

  For each 'verifier_jobs_update' record, creates a BuildRun entity per
  non-duplicate completed build. Whenever a failure/success pair for the
  same patchset+builder is found, creates a FlakyRun and defers
  get_flaky_run_reason to fetch the failure details.

  Args:
    json_data: Decoded chromium-cq-status JSON with a 'results' list.

  Returns:
    A list of '<builder><buildnumber>' strings, one per flaky run recorded
    (used for logging).
  """
  logging_output = []
  for result in json_data['results']:
    fields = result['fields']
    if 'action' not in fields:
      continue

    if fields['action'] != 'verifier_jobs_update':
      continue
    if fields['verifier'] != 'try job':
      continue

    job_states = fields['jobs']
    for state in job_states:
      # Job state buckets (e.g. JOB_SUCCEEDED/JOB_FAILED) are deliberately
      # not filtered; we go by each job's |result| instead.
      for job in job_states[state]:
        # Use .get() so a record without build properties is skipped just
        # like one with empty properties, instead of raising KeyError.
        build_properties = job.get('build_properties')
        if not build_properties:
          continue

        master = job['master']
        builder = job['builder']
        result = job['result']
        timestamp = datetime.datetime.strptime(job['timestamp'],
                                               '%Y-%m-%d %H:%M:%S.%f')

        try:
          buildnumber = get_int_value(build_properties, 'buildnumber')
          issue = get_int_value(build_properties, 'issue')
          patchset = get_int_value(build_properties, 'patchset')
        except ValueError:
          # Malformed numeric build property; skip this job.
          continue

        if build_result.isResultPending(result):
          continue

        # At this point, only success or failure.
        success = build_result.isResultSuccess(result)

        patchset_builder_runs = get_patchset_builder_runs(issue=issue,
                                                          patchset=patchset,
                                                          master=master,
                                                          builder=builder)

        build_run = BuildRun(parent=patchset_builder_runs.key,
                             buildnumber=buildnumber,
                             result=result,
                             time_finished=timestamp)

        previous_runs = BuildRun.query(
            ancestor=patchset_builder_runs.key).fetch()

        # We saw this build run already or there are multiple green runs,
        # in which case we ignore subsequent ones to avoid showing failures
        # multiple times.
        duplicate = any(
            previous_run.buildnumber == buildnumber or
            (build_run.is_success and previous_run.is_success)
            for previous_run in previous_runs)
        if duplicate:
          continue

        build_run.put()

        for previous_run in previous_runs:
          # Only pair runs with the opposite outcome.
          if previous_run.is_success == build_run.is_success:
            continue
          if success:
            # We saw the flake and then the pass.
            flaky_run = FlakyRun(
                failure_run=previous_run.key,
                failure_run_time_finished=previous_run.time_finished,
                success_run=build_run.key)
            flaky_run.put()
            logging_output.append(previous_run.key.parent().get().builder +
                                  str(previous_run.buildnumber))
          else:
            # We saw the pass and then the failure. Could happen when
            # fetching historical data.
            flaky_run = FlakyRun(
                failure_run=build_run.key,
                failure_run_time_finished=build_run.time_finished,
                success_run=previous_run.key)
            flaky_run.put()
            logging_output.append(build_run.key.parent().get().builder +
                                  str(build_run.buildnumber))

          # Queue a task to fetch the error of this failure.
          deferred.defer(get_flaky_run_reason, flaky_run.key)

  return logging_output
def get_flaky_run_reason(flaky_run_key):
  """Fetches the failed build's JSON and records flakes on the FlakyRun.

  Downloads the buildbot JSON for the FlakyRun's failure run, filters out
  step failures that are not flakes (summary/duplicate/tree-busted steps),
  and appends a FlakeOccurance per detected flake before re-saving the
  entity. Returns silently if the build JSON cannot be decoded.

  Args:
    flaky_run_key: ndb key of an existing FlakyRun entity.
  """
  flaky_run = flaky_run_key.get()
  failure_run = flaky_run.failure_run.get()
  patchset_builder_runs = failure_run.key.parent().get()
  url = ('http://build.chromium.org/p/' + patchset_builder_runs.master +
         '/json/builders/' + patchset_builder_runs.builder + '/builds/' +
         str(failure_run.buildnumber))
  urlfetch.set_default_fetch_deadline(60)
  logging.info('get_flaky_run_reason ' + url)
  result = urlfetch.fetch(url).content
  try:
    json_result = json.loads(result)
  except ValueError:
    logging.error('couldnt decode json for ' + url)
    return
  steps = json_result['steps']

  failed_steps = []
  passed_steps = []
  for step in steps:
    result = step['results'][0]
    if build_result.isResultSuccess(result):
      passed_steps.append(step)
      continue
    if not build_result.isResultFailure(result):
      continue
    step_name = step['name']
    if step_name == 'steps' or step_name.startswith('[swarming]') or \
       step_name == 'presubmit':
      # recipe code shows errors twice with first being 'steps'. also when a
      # swarming test fails, it shows up twice. also ignore 'presubmit' since
      # it changes from fail to pass for same patchset depending on new lgtm.
      continue
    failed_steps.append(step)

  steps_to_ignore = []
  for step in failed_steps:
    step_name = step['name']
    if ' (with patch)' in step_name:
      step_name_with_no_modifier = step_name.replace(' (with patch)', '')
      for other_step in failed_steps:
        # A step which fails, and then is retried and also fails, will have
        # its name without the ' (with patch)' again. Don't double count.
        if other_step['name'] == step_name_with_no_modifier:
          steps_to_ignore.append(other_step['name'])

      # If a step fails without the patch, then the tree is busted. Don't
      # count as flake.
      step_name_without_patch = step_name_with_no_modifier + ' (without patch)'
      for other_step in failed_steps:
        if other_step['name'] == step_name_without_patch:
          steps_to_ignore.append(step['name'])
          steps_to_ignore.append(other_step['name'])

  for step in failed_steps:
    step_name = step['name']
    if step_name in steps_to_ignore:
      continue
    flakes = get_flakes(step)
    if not flakes:
      continue
    for flake in flakes:
      flake_occurance = FlakeOccurance(name=step_name, failure=flake)
      flaky_run.flakes.append(flake_occurance)

      add_failure_to_flake(flake, flaky_run)
  flaky_run.put()