def fetch_cq_status():
  """Fetches data from chromium-cq-status app and saves new data.

  Remembers old cursor and fetches new data.
  """
  fetch_status = FetchStatus.query().get()
  cursor = ''
  begin = ''
  end = ''
  retry_count = 0

  while True:
    if fetch_status:
      if fetch_status.done:
        logging.info('historical fetching done so fetch latest...')
        end = str(time_functions.timestamp.utcnow_ts())
        last_build_run_seen = BuildRun.query().order(
            -BuildRun.time_finished).fetch(1)
        begin = str(time_functions.timestamp.utctimestamp(
            last_build_run_seen[0].time_finished))
        cursor = ''
      else:
        begin = fetch_status.begin
        end = fetch_status.end
        cursor = fetch_status.cursor
    else:
      logging.info(
          'didn\'t find any historical information; fetching last week')
      begin = str(time_functions.timestamp.utctimestamp(
          datetime.datetime.utcnow() - datetime.timedelta(weeks=1)))
      end = str(time_functions.timestamp.utcnow_ts())

    if begin and end:
      logging.info('fetching from %s to %s cursor: %s',
                   datetime.datetime.utcfromtimestamp(float(begin)),
                   datetime.datetime.utcfromtimestamp(float(end)),
                   cursor)
    else:
      logging.info('fetching with no begin/end and cursor: %s', cursor)

    url = 'https://chromium-cq-status.appspot.com/query'
    params = []
    params.append('tags=action=verifier_jobs_update')
    if cursor:
      params.append('cursor=' + cursor)
    if begin:
      params.append('begin=' + begin)
    if end:
      params.append('end=' + end)
    # Tried count of 200 or more but would get OOM or deadline errors. Even 50
    # sometimes gives:
    # "Values may not be more than 1000000 bytes in length; received 2118015
    # bytes"
    params.append('count=10')
    url += '?' + '&'.join(params)

    logging.info('fetching url: %s', url)

    try:
      urlfetch.set_default_fetch_deadline(60)
      result = urlfetch.fetch(url).content

      timestamp_str = '"timestamp":"'
      logging_idx = result.find(timestamp_str)
      if logging_idx != -1:
        logging_idx += len(timestamp_str)
        logging_idx2 = result.find('"', logging_idx)
        logging.info('  current fetch has time of %s',
                     result[logging_idx:logging_idx2])

      try:
        json_result = json.loads(result)
        more = json_result['more']
        cursor = json_result['cursor']

        try:
          logging_output = parse_cq_data(json_result)
          if logging_output:
            logging.info('found flakes: %s', ' '.join(logging_output))
        except DeadlineExceededError:
          logging.info('got DeadlineExceededError during parse_cq_data, '
                       'catching to not show up as error')
          return
      except ValueError:
        requests_metric.increment_by(1, fields={'status': 'parse_error'})
        logging.exception('failed to parse CQ data from %s', url)
        if 'DeadlineExceededError' in result:
          logging.error('got deadline exceeded, trying again after 1s')
          time.sleep(1)
          continue
        elif retry_count < 3:
          retry_count += 1
          logging.error('will retry after sleeping %d seconds', retry_count)
          time.sleep(retry_count)
          continue
        else:
          logging.error('giving up and will count current fetch as done')
          # Don't want to continue this as it could be a bad cursor.
          more = False
      else:
        requests_metric.increment_by(1, fields={'status': 'success'})

      if not fetch_status:
        fetch_status = FetchStatus()
      fetch_status.done = not more
      if fetch_status.done:
        fetch_status.cursor = ''
        fetch_status.begin = ''
        fetch_status.end = ''
        retry_count = 0
        logging.info('finished fetching for current cursor')
      else:
        fetch_status.begin = begin
        fetch_status.end = end
        fetch_status.cursor = cursor
      fetch_status.put()

      if not more:
        return  # Finish the cron job and wait for the next iteration.
    except urllib2.URLError as e:
      requests_metric.increment_by(1, fields={'status': 'fetch_error'})
      logging.warning('Failed to fetch CQ status: %s', e.reason)
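# fetch_cq_status() checkpoints its progress in a FetchStatus entity so the
# next cron run can resume from the stored cursor. The model is defined
# elsewhere; below is a minimal sketch inferred from the fields read and
# written above (the defaults and exact property types are assumptions, not
# the verified definition).

from google.appengine.ext import ndb


class FetchStatus(ndb.Model):
  # Paging cursor returned by chromium-cq-status for the next request.
  cursor = ndb.StringProperty(default='')
  # Unix timestamps, stored as strings, bounding the window being fetched.
  begin = ndb.StringProperty(default='')
  end = ndb.StringProperty(default='')
  # True once the historical backfill has completed.
  done = ndb.BooleanProperty(default=False)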
def parse_cq_data(json_data):
  logging_output = []
  for result in json_data.get('results', []):
    fields = result.get('fields', {})
    if 'action' not in fields:
      continue

    action = fields.get('action')
    if action != 'verifier_jobs_update':
      continue

    if fields.get('verifier') != 'try job':
      continue

    # At the moment, much of the parsing logic assumes this is a Chromium
    # tryjob.
    if fields.get('project') != 'chromium':
      continue

    job_states = fields.get('jobs', {})
    for state in job_states:
      # Just go by |result|.
      #if state not in ['JOB_SUCCEEDED', 'JOB_FAILED', 'JOB_TIMED_OUT']:
      #  continue
      for job in job_states[state]:
        build_properties = job.get('build_properties')
        if not build_properties:
          continue

        try:
          master = job['master']
          builder = job['builder']
          result = job['result']
          timestamp_tz = dateutil.parser.parse(job['timestamp'])
          # We assume timestamps from chromium-cq-status are already in UTC.
          timestamp = timestamp_tz.replace(tzinfo=None)
        except KeyError:
          continue

        try:
          buildnumber = get_int_value(build_properties, 'buildnumber')
          issue = get_int_value(build_properties, 'issue')
          patchset = get_int_value(build_properties, 'patchset')
          attempt_start_ts = get_int_value(build_properties,
                                           'attempt_start_ts')
          time_started = datetime.datetime.utcfromtimestamp(
              attempt_start_ts / 1000000)
        except ValueError:
          continue

        if build_result.isResultPending(result):
          continue

        # At this point, only success or failure.
        success = build_result.isResultSuccess(result)

        patchset_builder_runs = get_patchset_builder_runs(
            issue=issue, patchset=patchset, master=master, builder=builder)

        build_run = BuildRun(parent=patchset_builder_runs.key,
                             buildnumber=buildnumber,
                             result=result,
                             time_started=time_started,
                             time_finished=timestamp)

        previous_runs = BuildRun.query(
            ancestor=patchset_builder_runs.key).fetch()

        duplicate = False
        for previous_run in previous_runs:
          # We saw this build run already or there are multiple green runs,
          # in which case we ignore subsequent ones to avoid showing failures
          # multiple times.
          if (previous_run.buildnumber == buildnumber) or \
             (build_run.is_success and previous_run.is_success):
            duplicate = True
            break

        if duplicate:
          continue

        build_run.put()

        for previous_run in previous_runs:
          if previous_run.is_success == build_run.is_success:
            continue
          if success:
            # We saw the flake and then the pass.
            failure_run = previous_run
            success_run = build_run
          else:
            # We saw the pass and then the failure. Could happen when fetching
            # historical data, or for the bot_update step (patch can't be
            # applied cleanly anymore).
            failure_run = build_run
            success_run = previous_run

          logging_output.append(failure_run.key.parent().get().builder +
                                str(failure_run.buildnumber))

          # Queue a task to fetch the error of this failure and create a
          # FlakyRun.
          flakes_metric.increment_by(1)
          taskqueue.add(
              queue_name='issue-updates',
              url='/issues/create_flaky_run',
              params={'failure_run_key': failure_run.key.urlsafe(),
                      'success_run_key': success_run.key.urlsafe()})

  return logging_output
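# parse_cq_data() relies on a get_int_value() helper that raises ValueError
# when a build property is missing or malformed, letting the caller skip the
# record. The helper is defined elsewhere; here is a plausible sketch
# consistent with that contract (the coercion rules are assumptions, not the
# verified implementation):

def get_int_value(properties, key):
  """Returns properties[key] coerced to int; raises ValueError otherwise."""
  if key not in properties:
    raise ValueError('key %s not found in build properties' % key)
  value = properties[key]
  if isinstance(value, basestring):
    # Build properties arrive as JSON, so numeric values (e.g.
    # attempt_start_ts) may be serialized as strings.
    value = float(value)
  return int(value)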
def parse_cq_data(json_data):
  logging_output = []
  for result in json_data.get('results', []):
    fields = result.get('fields', {})
    if 'action' not in fields:
      logging.warning('Missing field action in status record')
      parsing_errors.increment_by(1)
      continue

    action = fields.get('action')
    if action != 'verifier_jobs_update':
      continue

    if fields.get('verifier') != 'try job':
      continue

    # At the moment, much of the parsing logic assumes this is a Chromium
    # tryjob.
    project = fields.get('project')
    if project != 'chromium/chromium/src':
      logging.info('project not chromium: %s', project)
      continue

    job_states = fields.get('jobs', {})
    for job in itertools.chain.from_iterable(job_states.values()):
      try:
        builder = job['builder']
        result = job['result']
        timestamp_tz = dateutil.parser.parse(
            job.get('created_ts') or job['timestamp'])
        # We assume timestamps from chromium-cq-status are already in UTC.
        timestamp = timestamp_tz.replace(tzinfo=None)
      except KeyError:
        logging.warning('Failed to parse job details', exc_info=True)
        parsing_errors.increment_by(1)
        continue

      if build_result.isResultPending(result):
        continue

      build_properties = job.get('build_properties')
      if not build_properties:
        logging.warning('Missing field build_properties in job details')
        parsing_errors.increment_by(1)
        continue

      issue = -1
      patchset = -1
      time_started = 0

      try:
        buildnumber = get_int_value(build_properties, 'buildnumber')
        if 'patch_issue' in build_properties:
          issue = get_int_value(build_properties, 'patch_issue')
        else:  # pragma: no cover
          logging.warning('no issue')

        if 'patch_set' in build_properties:
          patchset = get_int_value(build_properties, 'patch_set')
        else:  # pragma: no cover
          logging.warning('no patchset')

        if 'attempt_start_ts' in build_properties:
          attempt_start_ts = get_int_value(build_properties,
                                           'attempt_start_ts')
          time_started = datetime.datetime.utcfromtimestamp(
              attempt_start_ts / 1000000)
        else:  # pragma: no cover
          logging.warning('no attempt_start_ts')
          continue

        # For builds through Buildbucket, job['master'] is actually the
        # bucket name. For buildbot-based builds, it just happens to be the
        # same as the master name. For LUCI-based builds, it is different
        # from the master name, and the master name is set as a build
        # property instead.
        # https://chromium.googlesource.com/chromium/src/+/infra/config/cr-buildbucket.cfg#115
        # So in either case, the "real" master name is in the build
        # properties.
        master = build_properties['mastername']
      except (ValueError, KeyError):
        logging.warning('Failed to parse build properties', exc_info=True)
        parsing_errors.increment_by(1)
        continue

      # At this point, only success or failure.
      success = build_result.isResultSuccess(result)

      patchset_builder_runs = get_patchset_builder_runs(
          issue=issue, patchset=patchset, master=master, builder=builder)

      build_run = BuildRun(parent=patchset_builder_runs.key,
                           buildnumber=buildnumber,
                           result=result,
                           time_started=time_started,
                           time_finished=timestamp)

      previous_runs = BuildRun.query(
          ancestor=patchset_builder_runs.key).fetch()

      duplicate = False
      for previous_run in previous_runs:
        # We saw this build run already or there are multiple green runs,
        # in which case we ignore subsequent ones to avoid showing failures
        # multiple times.
        if (previous_run.buildnumber == buildnumber) or \
           (build_run.is_success and previous_run.is_success):
          duplicate = True
          break

      if duplicate:
        continue

      build_run.put()

      for previous_run in previous_runs:
        if previous_run.is_success == build_run.is_success:
          continue
        if success:
          # We saw the flake and then the pass.
          failure_run = previous_run
          success_run = build_run
        else:
          # We saw the pass and then the failure. Could happen when fetching
          # historical data, or for the bot_update step (patch can't be
          # applied cleanly anymore).
          failure_run = build_run
          success_run = previous_run

        logging_output.append(failure_run.key.parent().get().builder +
                              str(failure_run.buildnumber))

        # Queue a task to fetch the error of this failure and create a
        # FlakyRun.
        flakes_metric.increment_by(1)
        taskqueue.add(queue_name='issue-updates',
                      url='/issues/create_flaky_run',
                      params={
                          'failure_run_key': failure_run.key.urlsafe(),
                          'success_run_key': success_run.key.urlsafe()
                      })

  return logging_output
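# The duplicate check and flake pairing above hinge on BuildRun.is_success,
# which is derived from |result| rather than stored separately. A minimal
# sketch of the model as implied by this file (property types and the exact
# computed-property body are guesses; the real model may differ):

from google.appengine.ext import ndb


class BuildRun(ndb.Model):
  # Parented under a PatchsetBuilderRuns entity, so all runs for one
  # (issue, patchset, master, builder) tuple share an ancestor key.
  buildnumber = ndb.IntegerProperty()
  result = ndb.IntegerProperty()  # Buildbot-style result code (assumption).
  time_started = ndb.DateTimeProperty()
  time_finished = ndb.DateTimeProperty()
  # Derived from |result| using the module's build_result helpers.
  is_success = ndb.ComputedProperty(
      lambda self: build_result.isResultSuccess(self.result))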
def parse_cq_data(json_data):
  logging_output = []
  for result in json_data['results']:
    fields = result['fields']
    if 'action' not in fields:
      continue

    action = fields['action']
    if action != 'verifier_jobs_update':
      continue

    if fields['verifier'] != 'try job':
      continue

    job_states = fields['jobs']
    for state in job_states:
      # Just go by |result|.
      #if state not in ['JOB_SUCCEEDED', 'JOB_FAILED', 'JOB_TIMED_OUT']:
      #  continue
      for job in job_states[state]:
        build_properties = job['build_properties']
        if not build_properties:
          continue

        master = job['master']
        builder = job['builder']
        result = job['result']
        timestamp = datetime.datetime.strptime(job['timestamp'],
                                               '%Y-%m-%d %H:%M:%S.%f')

        try:
          buildnumber = get_int_value(build_properties, 'buildnumber')
          issue = get_int_value(build_properties, 'issue')
          patchset = get_int_value(build_properties, 'patchset')
        except ValueError:
          continue

        if build_result.isResultPending(result):
          continue

        # At this point, only success or failure.
        success = build_result.isResultSuccess(result)

        patchset_builder_runs = get_patchset_builder_runs(
            issue=issue, patchset=patchset, master=master, builder=builder)

        build_run = BuildRun(parent=patchset_builder_runs.key,
                             buildnumber=buildnumber,
                             result=result,
                             time_finished=timestamp)

        previous_runs = BuildRun.query(
            ancestor=patchset_builder_runs.key).fetch()

        duplicate = False
        for previous_run in previous_runs:
          # We saw this build run already or there are multiple green runs,
          # in which case we ignore subsequent ones to avoid showing failures
          # multiple times.
          if (previous_run.buildnumber == buildnumber) or \
             (build_run.is_success and previous_run.is_success):
            duplicate = True
            break

        if duplicate:
          continue

        build_run.put()

        for previous_run in previous_runs:
          if previous_run.is_success == build_run.is_success:
            continue
          if success:
            # We saw the flake and then the pass.
            flaky_run = FlakyRun(
                failure_run=previous_run.key,
                failure_run_time_finished=previous_run.time_finished,
                success_run=build_run.key)
            flaky_run.put()
            logging_output.append(previous_run.key.parent().get().builder +
                                  str(previous_run.buildnumber))
          else:
            # We saw the pass and then the failure. Could happen when
            # fetching historical data.
            flaky_run = FlakyRun(
                failure_run=build_run.key,
                failure_run_time_finished=build_run.time_finished,
                success_run=previous_run.key)
            flaky_run.put()
            logging_output.append(build_run.key.parent().get().builder +
                                  str(build_run.buildnumber))

          # Queue a task to fetch the error of this failure.
          deferred.defer(get_flaky_run_reason, flaky_run.key)

  return logging_output
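# This variant creates FlakyRun entities inline and defers only the reason
# lookup, unlike the versions above that delegate creation to the
# issue-updates task queue. A sketch of the FlakyRun model implied by the
# constructor calls (inferred from usage, not the verified definition):

from google.appengine.ext import ndb


class FlakyRun(ndb.Model):
  failure_run = ndb.KeyProperty(BuildRun)
  # Denormalized copy of the failure's finish time, presumably so queries
  # over recent flakes need not fetch the failing BuildRun itself.
  failure_run_time_finished = ndb.DateTimeProperty()
  success_run = ndb.KeyProperty(BuildRun)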