def QueryAndEmit(baselines, cursor):
    """Queries MySQL for important stats and emits Monarch metrics.

    @param baselines: A dict containing the initial values for the cumulative
                      metrics.
    @param cursor: An open MySQL cursor.
    """
    for status in EMITTED_STATUSES_COUNTERS:
        delta = GetStatus(cursor, status) - baselines[status]
        metric_name = 'chromeos/autotest/afe_db/%s' % status.lower()
        metrics.Counter(metric_name).set(delta)

    for status in EMITTED_STATUS_GAUGES:
        metric_name = 'chromeos/autotest/afe_db/%s' % status.lower()
        metrics.Gauge(metric_name).set(GetStatus(cursor, status))

    pages_free = GetStatus(cursor, 'Innodb_buffer_pool_pages_free')
    pages_total = GetStatus(cursor, 'Innodb_buffer_pool_pages_total')

    metrics.Gauge('chromeos/autotest/afe_db/buffer_pool_pages').set(
        pages_free, fields={'used': False})
    metrics.Gauge('chromeos/autotest/afe_db/buffer_pool_pages').set(
        pages_total - pages_free, fields={'used': True})
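# A minimal sketch of the GetStatus helper called above, assuming it reads a
# single row of SHOW GLOBAL STATUS through the cursor; the query, the error
# handling, and the return type are assumptions, not the verified
# implementation.
def GetStatus(cursor, status):
    """Return the named MySQL global status variable as an int."""
    cursor.execute('SHOW GLOBAL STATUS LIKE "%s"' % status)
    output = cursor.fetchone()
    if not output:
        # ValueError stands in for whatever error type the real code uses.
        raise ValueError('Cannot find any global status like %s' % status)
    return int(output[1])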
def _modify_table(cursor, mysql_cmds, table):
    """Helper method to execute a list of sql commands.

    @param cursor: mysql cursor instance.
    @param mysql_cmds: the list of sql commands to be executed.
    @param table: the name of the table modified.
    """
    try:
        succeed = False
        for cmd in mysql_cmds:
            logging.info('running command: %s', cmd)
            cursor.execute(cmd)
        succeed = True
    except Exception as e:
        msg = ('Failed to run the following sql command:\n%s\nError:\n%s\n'
               'All changes made to the server db will be rolled back.' %
               (cmd, e))
        logging.error(msg)
        raise UpdateDatabaseException(msg)
    finally:
        num_deletes = len(
            [cmd for cmd in mysql_cmds if cmd.startswith('DELETE')])
        num_inserts = len(
            [cmd for cmd in mysql_cmds if cmd.startswith('INSERT')])
        metrics.Gauge(_METRICS_PREFIX + '/inconsistency_fixed').set(
            num_deletes,
            fields={'table': table, 'action': 'delete', 'succeed': succeed})
        metrics.Gauge(_METRICS_PREFIX + '/inconsistency_fixed').set(
            num_inserts,
            fields={'table': table, 'action': 'insert', 'succeed': succeed})
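# Hypothetical usage of _modify_table, assuming an open MySQLdb connection;
# the _example_sync name, the connection handling, and the commands below are
# illustrative only.
def _example_sync(conn):
    cursor = conn.cursor()
    cmds = [
        'DELETE FROM servers WHERE hostname="decommissioned-host"',
        ('INSERT INTO servers (hostname, cname, status, note) '
         'VALUES("new-host", NULL, "primary", "")'),
    ]
    _modify_table(cursor, cmds, 'servers')
    # Commit only after the whole batch executed without raising.
    conn.commit()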
def do_heartbeat(self):
    """Perform a heartbeat: Retrieve new jobs.

    This function executes a `shard_heartbeat` RPC. It retrieves the
    response of this call and processes the response by storing the returned
    objects in the local database.
    """
    heartbeat_metrics_prefix = 'chromeos/autotest/shard_client/heartbeat/'

    logging.info("Performing heartbeat.")
    packet = self._heartbeat_packet()
    metrics.Gauge(heartbeat_metrics_prefix + 'request_size').set(
        len(str(packet)))

    try:
        response = self.afe.run(HEARTBEAT_AFE_ENDPOINT, **packet)
    except urllib2.HTTPError as e:
        self._heartbeat_failure("HTTPError %d: %s" % (e.code, e.reason))
        return
    except urllib2.URLError as e:
        self._heartbeat_failure("URLError: %s" % e.reason)
        return
    except httplib.HTTPException as e:
        self._heartbeat_failure("HTTPException: %s" % e)
        return
    except timeout_util.TimeoutError as e:
        self._heartbeat_failure("TimeoutError: %s" % e)
        return

    metrics.Gauge(heartbeat_metrics_prefix + 'response_size').set(
        len(str(response)))

    self._mark_jobs_as_uploaded([job['id'] for job in packet['jobs']])
    self.process_heartbeat_response(response)
    logging.info("Heartbeat completed.")
def _report_packet_metrics(self, packet):
    """Report stats about the outgoing packet to monarch."""
    metrics.Gauge(_METRICS_PREFIX + 'known_job_ids_count').set(
        len(packet['known_job_ids']))
    metrics.Gauge(_METRICS_PREFIX + 'jobs_upload_count').set(
        len(packet['jobs']))
    metrics.Gauge(_METRICS_PREFIX + 'known_host_ids_count').set(
        len(packet['known_host_ids']))
def process_heartbeat_response(self, heartbeat_response):
    """Save objects returned by a heartbeat to the local database.

    This deserializes hosts and jobs including their dependencies and saves
    them to the local database.

    @param heartbeat_response: A dictionary with keys 'hosts' and 'jobs',
                               as returned by the `shard_heartbeat` rpc call.
    """
    hosts_serialized = heartbeat_response['hosts']
    jobs_serialized = heartbeat_response['jobs']
    suite_keyvals_serialized = heartbeat_response['suite_keyvals']
    incorrect_host_ids = heartbeat_response.get('incorrect_host_ids', [])

    metrics.Gauge('chromeos/autotest/shard_client/hosts_received').set(
        len(hosts_serialized))
    metrics.Gauge('chromeos/autotest/shard_client/jobs_received').set(
        len(jobs_serialized))
    metrics.Gauge(
        'chromeos/autotest/shard_client/suite_keyvals_received').set(
            len(suite_keyvals_serialized))

    self._deserialize_many(hosts_serialized, models.Host, 'host')
    self._deserialize_many(jobs_serialized, models.Job, 'job')
    self._deserialize_many(suite_keyvals_serialized, models.JobKeyval,
                           'jobkeyval')

    host_ids = [h['id'] for h in hosts_serialized]
    logging.info('Heartbeat response contains hosts %s', host_ids)
    job_ids = [j['id'] for j in jobs_serialized]
    logging.info('Heartbeat response contains jobs %s', job_ids)
    parent_jobs_with_keyval = set(
        [kv['job_id'] for kv in suite_keyvals_serialized])
    logging.info('Heartbeat response contains suite_keyvals for jobs %s',
                 list(parent_jobs_with_keyval))
    if incorrect_host_ids:
        logging.info('Heartbeat response contains incorrect_host_ids %s '
                     'which will be deleted.', incorrect_host_ids)
        self._remove_incorrect_hosts(incorrect_host_ids)

    # If the master has just sent any jobs that we think have completed,
    # re-sync them with the master. This is especially useful when a
    # heartbeat or job is silently dropped, as the next heartbeat will
    # have a disagreement. Updating the shard_id to NULL will mark these
    # jobs for upload on the next heartbeat.
    job_models = models.Job.objects.filter(
        id__in=job_ids, hostqueueentry__complete=True)
    if job_models:
        job_models.update(shard=None)
        job_ids_repr = ', '.join([str(job.id) for job in job_models])
        logging.warn('The following completed jobs have shard_id reset to '
                     'NULL so they will be uploaded to the master again: %s',
                     job_ids_repr)
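# Illustrative shape of a heartbeat_response consumed by the method above;
# the id values are made up, and real serialized hosts/jobs carry many more
# fields than shown here.
example_response = {
    'hosts': [{'id': 3}],               # serialized models.Host rows
    'jobs': [{'id': 42}],               # serialized models.Job rows
    'suite_keyvals': [{'job_id': 41}],  # keyvals of parent (suite) jobs
    'incorrect_host_ids': [],           # hosts to remove from this shard
}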
def clean_labels(options):
    """Cleans unused labels from the AFE database."""
    msg = 'Label cleaner starts. Will delete '
    if options.prefix:
        msg += 'all labels whose prefix is "%s".'
    else:
        msg += 'a label "%s".'
    logging.info(msg, options.label)
    logging.info('Target database: %s.', options.db_server)
    if options.check_status and not is_primary_server():
        raise Exception('Cannot run in a non-primary server')

    conn = MySQLdb.connect(
        host=options.db_server,
        user=options.db_user,
        passwd=options.db_password,
        db=DATABASE,
    )

    all_labels = fetch_labels(conn)
    logging.info('Found %d labels in total', len(all_labels))
    metrics.Gauge(_METRICS_PREFIX + '/total_labels_count').set(
        len(all_labels),
        fields={
            'target_db': options.db_server,
            'label_prefix': '',
        },
    )

    labels = fetch_labels(conn, options.label, options.prefix)
    logging.info('Found %d labels matching %s', len(labels), options.label)
    if options.prefix and options.label in _LABEL_PREFIX_METRICS_WHITELIST:
        metrics.Gauge(_METRICS_PREFIX + '/total_labels_count').set(
            len(labels),
            fields={
                'target_db': options.db_server,
                'label_prefix': options.label,
            },
        )

    used_labels = get_used_labels(conn)
    logging.info('Found %d labels in use', len(used_labels))
    metrics.Gauge(_METRICS_PREFIX + '/used_labels_count').set(
        len(used_labels), fields={'target_db': options.db_server})

    to_delete = list(labels - used_labels)
    logging.info('Deleting %d unused labels', len(to_delete))
    delete_labels(conn, to_delete, options.max_delete, options.dry_run)
    metrics.Counter(_METRICS_PREFIX + '/labels_deleted').increment_by(
        len(to_delete), fields={'target_db': options.db_server})
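# A minimal sketch of the fetch_labels helper used above, assuming labels
# live in the AFE afe_labels table and that the helper returns a set of
# label ids (the set arithmetic `labels - used_labels` above requires sets);
# the exact query and signature are assumptions.
def fetch_labels(conn, label=None, prefix=False):
    """Fetch label ids, optionally restricted to a name or name prefix."""
    cursor = conn.cursor()
    if label is None:
        cursor.execute('SELECT id FROM afe_labels')
    elif prefix:
        cursor.execute('SELECT id FROM afe_labels WHERE name LIKE %s',
                       (label + '%',))
    else:
        cursor.execute('SELECT id FROM afe_labels WHERE name = %s', (label,))
    return set(row[0] for row in cursor.fetchall())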
def _report_failed_jobs_count(self, failed_jobs):
    """Report the number of outstanding failed offload jobs to monarch.

    @param failed_jobs: List of failed jobs.
    """
    metrics.Gauge('chromeos/autotest/gs_offloader/failed_jobs_count').set(
        len(failed_jobs))
def _report_detected_errors(metric_name, gauge, fields={}):
    """Reports a gauge metric for errors detected.

    @param metric_name: Name of the metric to report about.
    @param gauge: Outstanding number of unrecoverable errors of this type.
    @param fields: Optional fields to include with the metric.
    """
    m = '%s/errors_detected/%s' % (_METRICS_PREFIX, metric_name)
    metrics.Gauge(m).set(gauge, fields=fields)
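# Example invocation of the helper above; the metric suffix, the count, and
# the field values are purely illustrative.
_report_detected_errors('orphaned_results_dirs', 3,
                        fields={'drone': 'drone1.example.com'})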
def do_heartbeat(self):
    """Perform a heartbeat: Retrieve new jobs.

    This function executes a `shard_heartbeat` RPC. It retrieves the
    response of this call and processes the response by storing the returned
    objects in the local database.

    Returns: True if the heartbeat ran successfully, False otherwise.
    """
    logging.info("Performing heartbeat.")
    packet = self._heartbeat_packet()
    self._report_packet_metrics(packet)
    metrics.Gauge(_METRICS_PREFIX + 'request_size').set(len(str(packet)))

    try:
        response = self.afe.run(HEARTBEAT_AFE_ENDPOINT, **packet)
        logging.info('Finished heartbeat upload.')
    except urllib2.HTTPError as e:
        self._heartbeat_failure('HTTPError %d: %s' % (e.code, e.reason),
                                'HTTPError')
        return False
    except urllib2.URLError as e:
        self._heartbeat_failure('URLError: %s' % e.reason, 'URLError')
        return False
    except httplib.HTTPException as e:
        self._heartbeat_failure('HTTPException: %s' % e, 'HTTPException')
        return False
    except timeout_util.TimeoutError as e:
        self._heartbeat_failure('TimeoutError: %s' % e, 'TimeoutError')
        return False
    except proxy.JSONRPCException as e:
        self._heartbeat_failure('JSONRPCException: %s' % e,
                                'JSONRPCException')
        return False

    metrics.Gauge(_METRICS_PREFIX + 'response_size').set(len(str(response)))
    logging.info('Marking jobs as uploaded.')
    self._mark_jobs_as_uploaded([job['id'] for job in packet['jobs']])
    logging.info('Processing heartbeat response.')
    self.process_heartbeat_response(response)
    logging.info("Heartbeat completed.")
    return True
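# A plausible sketch of the _heartbeat_failure helper called above, assuming
# it logs the error and bumps a failure counter keyed by failure type; the
# metric name and the exact signature are assumptions.
def _heartbeat_failure(self, log_message, failure_type_str=''):
    logging.error('Heartbeat failed. %s', log_message)
    metrics.Counter('chromeos/autotest/shard_client/heartbeat_failure'
                    ).increment(fields={'failure_type': failure_type_str})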
def _run():
    """Report metadata in the queue until being aborted."""
    # Time of the first failed upload; None if the last upload succeeded.
    first_failed_upload = None
    upload_size = _MIN_RETRY_ENTRIES

    try:
        while True:
            start_time = time.time()
            data_list = []
            if (first_failed_upload and
                    time.time() - first_failed_upload >
                    _MAX_UPLOAD_FAIL_DURATION):
                upload_size = _MIN_RETRY_ENTRIES
            else:
                upload_size = min(upload_size * 2, _MAX_UPLOAD_SIZE)
            while (not metadata_queue.empty() and
                   len(data_list) < upload_size):
                data_list.append(metadata_queue.get_nowait())
            if data_list:
                success = False
                fields = _get_metrics_fields().copy()
                fields['success'] = success
                metrics.Gauge(
                    _METADATA_METRICS_PREFIX + 'upload/batch_sizes').set(
                        len(data_list), fields=fields)
                metrics.Counter(
                    _METADATA_METRICS_PREFIX + 'upload/attempts').increment(
                        fields=fields)

            metrics.Gauge(_METADATA_METRICS_PREFIX + 'queue_size').set(
                metadata_queue.qsize(), fields=_get_metrics_fields())
            sleep_time = _REPORT_INTERVAL_SECONDS - time.time() + start_time
            if sleep_time < 0:
                sleep_time = 0.5
            _abort.wait(timeout=sleep_time)
    except Exception as e:
        logging.exception('Metadata reporter thread failed with error: %s',
                          e)
        raise
    finally:
        logging.info('Metadata reporting thread is exiting.')
        _abort.clear()
        _report_lock.release()
def _compute_active_processes(self, drone):
    drone.active_processes = 0
    for pidfile_id, contents in self._pidfiles.iteritems():
        is_running = contents.exit_status is None
        on_this_drone = (contents.process and
                         contents.process.hostname == drone.hostname)
        if is_running and on_this_drone:
            info = self._registered_pidfile_info[pidfile_id]
            if info.num_processes is not None:
                drone.active_processes += info.num_processes

    metrics.Gauge('chromeos/autotest/drone/active_processes').set(
        drone.active_processes, fields={'drone_hostname': drone.hostname})
def main():
    """Counts the number of AFE jobs in the last day and pushes the count
    to Monarch via ts_mon."""
    parser = argparse.ArgumentParser(description=(
        'A script which records the number of afe jobs run in a time '
        'interval.'))
    parser.parse_args(sys.argv[1:])
    count = number_of_jobs_since(timedelta(days=1))

    with site_utils.SetupTsMonGlobalState('count_jobs', short_lived=True):
        # TODO: Reporting a stat for each job created from the afe directly
        # could be better. More discussion is needed to decide whether to
        # remove this file.
        metrics.Gauge(
            'chromeos/autotest/experimental/jobs_rate/afe_daily_count').set(
                count)
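# A possible sketch of the number_of_jobs_since helper used above, assuming
# the AFE Django models are importable and that Job rows carry a created_on
# timestamp; the query details are assumptions.
from datetime import datetime, timedelta

from autotest_lib.frontend.afe import models


def number_of_jobs_since(delta):
    """Count AFE jobs created within the last `delta` interval."""
    cutoff = datetime.now() - delta
    return models.Job.objects.filter(created_on__gt=cutoff).count()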
def main(argv):
    """Entry point for dut_mon."""
    logging.getLogger().setLevel(logging.INFO)

    with ts_mon_config.SetupTsMonGlobalState('dut_mon', indirect=True):
        afe = frontend.AFE()
        counters = collections.defaultdict(lambda: 0)

        field_spec = [ts_mon.StringField('board'),
                      ts_mon.StringField('model'),
                      ts_mon.StringField('pool'),
                      ts_mon.BooleanField('is_locked'),
                      ts_mon.StringField('status'),
                      ]
        dut_count = metrics.Gauge('chromeos/autotest/dut_mon/dut_count',
                                  description='The number of duts in a '
                                              'given state and bucket.',
                                  field_spec=field_spec)
        tick_count = metrics.Counter('chromeos/autotest/dut_mon/tick',
                                     description='Tick counter of dut_mon.')

        while True:
            # Note: We reset all counters to zero in each loop rather than
            # creating a new defaultdict, because we want to ensure that any
            # gauges that were previously set to a nonzero value by this
            # process get set back to zero if necessary.
            for k in counters:
                counters[k] = 0

            logging.info('Fetching all hosts.')
            hosts = afe.get_hosts()
            logging.info('Fetched %s hosts.', len(hosts))
            for host in hosts:
                fields = _get_bucket_for_host(host)
                counters[fields] += 1

            for field, value in counters.iteritems():
                logging.info('%s %s', field, value)
                dut_count.set(value, fields=field.__dict__)

            tick_count.increment()

            logging.info('Sleeping for 2 minutes.')
            time.sleep(120)
def check_proc(prog, max_elapsed_sec):
    """Check the number of long-running processes for a given program.

    Finds out the number of processes for a given program that have run
    more than a given elapsed time, and sends the number to the stats
    dashboard.

    @param prog: Program name.
    @param max_elapsed_sec: Max elapsed time in seconds. Processes that
                            have run longer than this value will be caught.
    """
    cmd = ('ps -eo etimes,args | grep "%s" | awk \'{if($1 > %d) print $0}\' '
           '| wc -l' % (prog, max_elapsed_sec))
    count = int(subprocess.check_output(cmd, shell=True))
    if prog not in PROGRAM_TO_CHECK_SET:
        prog = 'unknown'

    metrics.Gauge('chromeos/autotest/hung_processes').set(
        count, fields={'program': prog})
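# Example invocation, assuming PROGRAM_TO_CHECK_SET enumerates the programs
# the dashboard knows about; both values below are illustrative.
PROGRAM_TO_CHECK_SET = set(['apache2', 'gs_offloader'])
check_proc('gs_offloader', 3 * 60 * 60)  # flag processes older than 3 hours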
def _schedule_jobs(self):
    """Schedule new jobs against hosts."""
    new_jobs_with_hosts = 0
    queue_entries = self.job_query_manager.get_pending_queue_entries(
        only_hostless=False)
    unverified_host_jobs = [job for job in queue_entries
                            if not job.is_hostless()]
    if unverified_host_jobs:
        for acquisition in self.find_hosts_for_jobs(unverified_host_jobs):
            self.schedule_host_job(acquisition.host, acquisition.job)
            self._record_host_assignment(acquisition.host, acquisition.job)
            new_jobs_with_hosts += 1
        metrics.Counter('%s/new_jobs_with_hosts' % _METRICS_PREFIX
                        ).increment_by(new_jobs_with_hosts)
        num_jobs_without_hosts = (len(unverified_host_jobs) -
                                  new_jobs_with_hosts)
        metrics.Gauge('%s/current_jobs_without_hosts' % _METRICS_PREFIX
                      ).set(num_jobs_without_hosts)
    metrics.Counter('%s/tick' % _METRICS_PREFIX).increment()
def batch_acquire_hosts(self, host_requests):
    """Acquire hosts for a list of requests.

    The act of acquisition involves finding and leasing a set of hosts that
    match the parameters of a request. Each acquired host is added to the
    response_map dictionary as an RDBServerHostWrapper.

    @param host_requests: A list of requests to acquire hosts.
    """
    distinct_requests = 0

    logging.debug('Processing %s host acquisition requests',
                  len(host_requests))
    metrics.Gauge('chromeos/autotest/scheduler/pending_host_acq_requests'
                  ).set(len(host_requests))

    self.request_accountant = rdb_utils.RequestAccountant(host_requests)
    # First pass tries to satisfy min_duts for each suite.
    for request in self.request_accountant.requests:
        to_acquire = self.request_accountant.get_min_duts(request)
        if to_acquire > 0:
            self._acquire_hosts(request, to_acquire,
                                is_acquire_min_duts=True)
        distinct_requests += 1

    # Second pass tries to allocate duts to the remaining unsatisfied
    # requests.
    for request in self.request_accountant.requests:
        to_acquire = self.request_accountant.get_duts(request)
        if to_acquire > 0:
            self._acquire_hosts(request, to_acquire,
                                is_acquire_min_duts=False)

    self.cache.record_stats()
    logging.debug('Host acquisition stats: distinct requests: %s, leased '
                  'hosts: %s, unsatisfied requests: %s', distinct_requests,
                  self.leased_hosts_count, self.unsatisfied_requests)
def _report_current_jobs_count(self):
    """Report the number of outstanding jobs to monarch."""
    metrics.Gauge('chromeos/autotest/gs_offloader/current_jobs_count').set(
        len(self._open_jobs))
class BaseDroneManager(object):
    """
    This class acts as an interface from the scheduler to drones, whether
    it be only a single "drone" for localhost or multiple remote drones.

    All paths going into and out of this class are relative to the full
    results directory, except for those returned by absolute_path().
    """

    # Minimum time to wait before the next email about a drone hitting its
    # process limit is sent.
    NOTIFY_INTERVAL = 60 * 60 * 24  # one day

    _STATS_KEY = 'drone_manager'

    _ACTIVE_PROCESS_GAUGE = metrics.Gauge(
            'chromeos/autotest/drone/active_processes')


    def __init__(self):
        # absolute path of base results dir
        self._results_dir = None
        # holds Process objects
        self._process_set = set()
        # holds the list of all processes running on all drones
        self._all_processes = {}
        # maps PidfileId to PidfileContents
        self._pidfiles = {}
        # same as _pidfiles
        self._pidfiles_second_read = {}
        # maps PidfileId to _PidfileInfo
        self._registered_pidfile_info = {}
        # used to generate unique temporary paths
        self._temporary_path_counter = 0
        # maps hostname to Drone object
        self._drones = {}
        self._results_drone = None
        # maps results dir to dict mapping file path to contents
        self._attached_files = {}
        # heapq of _DroneHeapWrappers
        self._drone_queue = []
        # A threaded task queue used to refresh drones asynchronously.
        if _THREADED_DRONE_MANAGER:
            self._refresh_task_queue = thread_lib.ThreadedTaskQueue(
                    name='%s.refresh_queue' % self._STATS_KEY)
        else:
            self._refresh_task_queue = drone_task_queue.DroneTaskQueue()


    def initialize(self, base_results_dir, drone_hostnames,
                   results_repository_hostname):
        self._results_dir = base_results_dir

        for hostname in drone_hostnames:
            self._add_drone(hostname)

        if not self._drones:
            # all drones failed to initialize
            raise DroneManagerError('No valid drones found')

        self.refresh_drone_configs()

        logging.info('Using results repository on %s',
                     results_repository_hostname)
        self._results_drone = drones.get_drone(results_repository_hostname)
        results_installation_dir = global_config.global_config.get_config_value(
                scheduler_config.CONFIG_SECTION,
                'results_host_installation_directory', default=None)
        if results_installation_dir:
            self._results_drone.set_autotest_install_dir(
                    results_installation_dir)
        # don't initialize() the results drone - we don't want to clear out
        # any directories and we don't need to kill any processes


    def reinitialize_drones(self):
        for drone in self.get_drones():
            with metrics.SecondsTimer('chromeos/autotest/drone_manager/'
                                      'reinitialize_drones_duration',
                                      fields={'drone': drone.hostname}):
                drone.call('initialize', self._results_dir)


    def shutdown(self):
        for drone in self.get_drones():
            drone.shutdown()


    def _get_max_pidfile_refreshes(self):
        """
        Normally refresh() is called on every monitor_db.Dispatcher.tick().

        @returns: The number of refresh() calls before we forget a pidfile.
        """
        pidfile_timeout = global_config.global_config.get_config_value(
                scheduler_config.CONFIG_SECTION, 'max_pidfile_refreshes',
                type=int, default=2000)
        return pidfile_timeout


    def _add_drone(self, hostname):
        logging.info('Adding drone %s', hostname)
        drone = drones.get_drone(hostname)
        if drone:
            self._drones[drone.hostname] = drone
            drone.call('initialize', self.absolute_path(''))


    def _remove_drone(self, hostname):
        self._drones.pop(hostname, None)


    def refresh_drone_configs(self):
        """
        Reread global config options for all drones.
        """
        # The import of server_manager_utils is delayed rather than done at
        # the beginning of this module. The reason is that test_that imports
        # drone_manager when importing autoserv_utils, and that import
        # happens before test_that sets up django (test_that only sets up
        # django in setup_local_afe, since it's not needed when test_that
        # runs tests on lab duts through the :lab: option). Therefore, if
        # server_manager_utils were imported at the beginning of this
        # module, test_that would fail since django is not set up yet.
        from autotest_lib.site_utils import server_manager_utils
        config = global_config.global_config
        section = scheduler_config.CONFIG_SECTION
        config.parse_config_file()
        for hostname, drone in self._drones.iteritems():
            if server_manager_utils.use_server_db():
                server = server_manager_utils.get_servers(
                        hostname=hostname)[0]
                attributes = dict([(a.attribute, a.value)
                                   for a in server.attributes.all()])
                drone.enabled = (int(attributes.get('disabled', 0)) == 0)
                drone.max_processes = int(attributes.get(
                        'max_processes',
                        scheduler_config.config.max_processes_per_drone))
                allowed_users = attributes.get('users', None)
            else:
                disabled = config.get_config_value(
                        section, '%s_disabled' % hostname, default='')
                drone.enabled = not bool(disabled)
                drone.max_processes = config.get_config_value(
                        section, '%s_max_processes' % hostname, type=int,
                        default=scheduler_config.config.
                                max_processes_per_drone)
                allowed_users = config.get_config_value(
                        section, '%s_users' % hostname, default=None)
            if allowed_users:
                drone.allowed_users = set(allowed_users.split())
            else:
                drone.allowed_users = None
            logging.info('Drone %s.max_processes: %s', hostname,
                         drone.max_processes)
            logging.info('Drone %s.enabled: %s', hostname, drone.enabled)
            logging.info('Drone %s.allowed_users: %s', hostname,
                         drone.allowed_users)
            logging.info('Drone %s.support_ssp: %s', hostname,
                         drone.support_ssp)

        self._reorder_drone_queue()  # max_processes may have changed
        # Clear notification record about reaching the max_processes limit.
        self._notify_record = {}


    def get_drones(self):
        return self._drones.itervalues()


    def cleanup_orphaned_containers(self):
        """Queue a cleanup_orphaned_containers call at each drone."""
        for drone in self._drones.values():
            logging.info('Queueing cleanup_orphaned_containers at %s',
                         drone.hostname)
            drone.queue_call('cleanup_orphaned_containers')


    def _get_drone_for_process(self, process):
        return self._drones[process.hostname]


    def _get_drone_for_pidfile_id(self, pidfile_id):
        pidfile_contents = self.get_pidfile_contents(pidfile_id)
        assert pidfile_contents.process is not None
        return self._get_drone_for_process(pidfile_contents.process)


    def _drop_old_pidfiles(self):
        # use items() since the dict is modified in unregister_pidfile()
        for pidfile_id, info in self._registered_pidfile_info.items():
            if info.age > self._get_max_pidfile_refreshes():
                logging.warning('dropping leaked pidfile %s', pidfile_id)
                self.unregister_pidfile(pidfile_id)
            else:
                info.age += 1


    def _reset(self):
        self._process_set = set()
        self._all_processes = {}
        self._pidfiles = {}
        self._pidfiles_second_read = {}
        self._drone_queue = []


    def _parse_pidfile(self, drone, raw_contents):
        """Parse raw pidfile contents.

        @param drone: The drone on which this pidfile was found.
        @param raw_contents: The raw contents of a pidfile, e.g.
            "pid\nexit_status\nnum_tests_failed\n".
        """
        contents = PidfileContents()
        if not raw_contents:
            return contents

        lines = raw_contents.splitlines()
        if len(lines) > 3:
            return InvalidPidfile('Corrupt pid file (%d lines):\n%s' %
                                  (len(lines), lines))

        try:
            pid = int(lines[0])
            contents.process = Process(drone.hostname, pid)
            # if len(lines) == 2, assume we caught Autoserv between writing
            # exit_status and num_failed_tests, so just ignore it and wait
            # for the next cycle
            if len(lines) == 3:
                contents.exit_status = int(lines[1])
                contents.num_tests_failed = int(lines[2])
        except ValueError as exc:
            return InvalidPidfile('Corrupt pid file: ' + str(exc.args))

        return contents
def PerformStage(self):
    """Perform the actual work for this stage.

    This includes final metadata archival, updating CIDB with our final
    status, and producing a logged build result summary.
    """
    build_identifier, _ = self._run.GetCIDBHandle()
    build_id = build_identifier.cidb_id
    buildbucket_id = build_identifier.buildbucket_id
    if results_lib.Results.BuildSucceededSoFar(self.buildstore,
                                               buildbucket_id, self.name):
        final_status = constants.BUILDER_STATUS_PASSED
    else:
        final_status = constants.BUILDER_STATUS_FAILED

    if not hasattr(self._run.attrs, 'release_tag'):
        # If, for some reason, the sync stage was not completed and
        # release_tag was not set, set it to None here because
        # ArchiveResults() depends on the existence of this attr.
        self._run.attrs.release_tag = None

    # Set up our report metadata.
    self._run.attrs.metadata.UpdateWithDict(
        self.GetReportMetadata(
            final_status=final_status,
            completion_instance=self._completion_instance))

    src_root = self._build_root
    # Workspace builders use a different buildroot for overlays.
    if self._run.config.workspace_branch and self._run.options.workspace:
        src_root = self._run.options.workspace

    # Add tags for the arches and statuses of the build.
    # arches requires crossdev which isn't available in the early part of
    # the build.
    arches = []
    for board in self._run.config['boards']:
        toolchains = toolchain.GetToolchainsForBoard(
            board, buildroot=src_root)
        default = list(
            toolchain.FilterToolchains(toolchains, 'default', True))
        if default:
            try:
                arches.append(toolchain.GetArchForTarget(default[0]))
            except cros_build_lib.RunCommandError as e:
                logging.warning(
                    'Unable to retrieve arch for board %s default toolchain '
                    '%s: %s', board, default, e)
    tags = {
        'arches': arches,
        'status': final_status,
    }
    results = self._run.attrs.metadata.GetValue('results')
    for stage in results:
        tags['stage_status:%s' % stage['name']] = stage['status']
        tags['stage_summary:%s' % stage['name']] = stage['summary']
    self._run.attrs.metadata.UpdateKeyDictWithDict(
        constants.METADATA_TAGS, tags)

    # Some operations can only be performed if a valid version is available.
    try:
        self._run.GetVersionInfo()
        self.ArchiveResults(final_status)
        metadata_url = os.path.join(self.upload_url,
                                    constants.METADATA_JSON)
    except cbuildbot_run.VersionNotSetError:
        logging.error('A valid version was never set for this run. '
                      'Cannot archive results.')
        metadata_url = ''

    results_lib.Results.Report(
        sys.stdout, current_version=(self._run.attrs.release_tag or ''))

    # Upload the goma log if goma was used for BuildPackages or
    # TestSimpleChromeWorkflow.
    _UploadAndLinkGomaLogIfNecessary(
        'BuildPackages', self._run.config.name, self._run.options.goma_dir,
        self._run.options.goma_client_json,
        self._run.attrs.metadata.GetValueWithDefault('goma_tmp_dir'))
    _UploadAndLinkGomaLogIfNecessary(
        'TestSimpleChromeWorkflow', self._run.config.name,
        self._run.options.goma_dir, self._run.options.goma_client_json,
        self._run.attrs.metadata.GetValueWithDefault(
            'goma_tmp_dir_for_simple_chrome'))

    if self.buildstore.AreClientsReady():
        status_for_db = final_status

        # TODO(pprabhu): After BuildData and CBuildbotMetadata are merged,
        # remove this extra temporary object creation.
        # XXX:HACK We're creating a BuildData with an empty URL. Don't try
        # to MarkGathered this object.
        build_data = metadata_lib.BuildData(
            '', self._run.attrs.metadata.GetDict())

        # TODO(akeshet): Find a clearer way to get the "primary upload url"
        # for the metadata.json file. One alternative is
        # _GetUploadUrls(...)[0]. Today it seems that element 0 of its
        # return list is the primary upload url, but there is no guarantee
        # or unit test coverage of that.
        self.buildstore.FinishBuild(build_id,
                                    status=status_for_db,
                                    summary=build_data.failure_message,
                                    metadata_url=metadata_url)

        duration = self._GetBuildDuration()

        mon_fields = {
            'status': status_for_db,
            'build_config': self._run.config.name,
            'important': self._run.config.important
        }
        metrics.Counter(
            constants.MON_BUILD_COMP_COUNT).increment(fields=mon_fields)
        metrics.CumulativeSecondsDistribution(
            constants.MON_BUILD_DURATION).add(duration, fields=mon_fields)

        if self._run.options.sanity_check_build:
            metrics.Counter(
                constants.MON_BUILD_SANITY_COMP_COUNT).increment(
                    fields=mon_fields)
            metrics.Gauge(
                constants.MON_BUILD_SANITY_ID,
                description=
                'The build number of the latest sanity build. Used '
                'for recovering the link to the latest failing build '
                'in the alert when a sanity build fails.',
                field_spec=[
                    ts_mon.StringField('status'),
                    ts_mon.StringField('build_config'),
                    ts_mon.StringField('builder_name'),
                    ts_mon.BooleanField('important')
                ]).set(self._run.buildnumber,
                       fields=dict(
                           mon_fields,
                           builder_name=self._run.GetBuilderName()))

        if config_lib.IsMasterCQ(self._run.config):
            self_destructed = self._run.attrs.metadata.GetValueWithDefault(
                constants.SELF_DESTRUCTED_BUILD, False)
            mon_fields = {
                'status': status_for_db,
                'self_destructed': self_destructed
            }
            metrics.CumulativeSecondsDistribution(
                constants.MON_CQ_BUILD_DURATION).add(
                    duration, fields=mon_fields)
            annotator_link = uri_lib.ConstructAnnotatorUri(build_id)
            logging.PrintBuildbotLink('Build annotator', annotator_link)

    # From this point forward, treat all exceptions as warnings.
    self._post_completion = True

    # Dump a report about the things we retried.
    retry_stats.ReportStats(sys.stdout)
def create_mysql_updates(api_output, db_output, table, server_id_map,
                         warn_only):
    """Sync up the servers table in server db with the inventory service.

    As a first step, entries in server_db but not in the inventory service
    will be deleted from the db. Then, entries in the inventory service but
    not in server_db will be inserted into server_db.

    @param api_output: a dict mapping table name to list of corresponding
                       namedtuples parsed from inventory. This is the only
                       source of truth.
    @param db_output: a dict mapping table name to list of corresponding
                      namedtuples parsed from server db.
    @param table: name of the targeted server_db table.
    @param server_id_map: server hostname to id mapping dict.
    @param warn_only: whether it is warn_only. If yes, there will be no
                      server id for server_attributes and server_roles.

    @returns a list of mysql update commands, e.g.
             ['DELETE FROM a WHERE xx', 'INSERT ...']
    """
    logging.info('Checking table %s with inventory service...', table)
    mysql_cmds = []
    delete_entries = set(db_output[table]) - set(api_output[table])
    insert_entries = set(api_output[table]) - set(db_output[table])

    if delete_entries:
        logging.info('\nTable %s is not synced up! Below is a list of '
                     'entries that exist only in server db. These invalid '
                     'entries will be deleted from server db:\n%s',
                     table, delete_entries)
        for entry in delete_entries:
            if table == 'servers':
                cmd = ('DELETE FROM servers WHERE hostname=%r' %
                       entry.hostname)
            elif table == 'server_attributes':
                cmd = ('DELETE FROM server_attributes WHERE server_id=%d '
                       'and attribute=%r' %
                       (server_id_map[entry.hostname], entry.attribute))
            else:
                cmd = ('DELETE FROM server_roles WHERE server_id=%d '
                       'and role=%r' %
                       (server_id_map[entry.hostname], entry.role))
            mysql_cmds.append(cmd)

    if insert_entries:
        logging.info('\nTable %s is not synced up! Below is a list of '
                     'entries that exist only in the inventory service. '
                     'These new entries will be inserted into server '
                     'db:\n%s', table, insert_entries)
        for entry in insert_entries:
            # If this is warn_only, it is very likely that the server id for
            # a new entry does not exist yet, since the server has not been
            # inserted into the servers table. For this case, fake it as 0.
            if warn_only and not server_id_map.get(entry.hostname):
                server_id = 0
            else:
                server_id = server_id_map[entry.hostname]

            if table == 'servers':
                cname = repr(entry.cname) if entry.cname else 'NULL'
                cmd = ('INSERT INTO servers (hostname, cname, status, note) '
                       'VALUES(%r, %s, %r, %r)' %
                       (entry.hostname, cname, entry.status, entry.note))
            elif table == 'server_attributes':
                cmd = ('INSERT INTO server_attributes '
                       '(server_id, attribute, value) VALUES(%d, %r, %r)' %
                       (server_id, entry.attribute, entry.value))
            else:
                cmd = ('INSERT INTO server_roles (server_id, role) '
                       'VALUES(%d, %r)' % (server_id, entry.role))
            mysql_cmds.append(cmd)

    metrics.Gauge(_METRICS_PREFIX + '/inconsistency_found').set(
        len(delete_entries), fields={'table': table, 'action': 'to_delete'})
    metrics.Gauge(_METRICS_PREFIX + '/inconsistency_found').set(
        len(insert_entries), fields={'table': table, 'action': 'to_add'})

    return mysql_cmds
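# Hypothetical end-to-end flow tying create_mysql_updates to the
# _modify_table helper shown earlier; api_output, db_output, server_id_map,
# and cursor would come from the inventory service and the server db.
for table in ('servers', 'server_attributes', 'server_roles'):
    cmds = create_mysql_updates(api_output, db_output, table,
                                server_id_map, warn_only=False)
    if cmds:
        _modify_table(cursor, cmds, table)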
def _gauge_metrics(cls):
    """Report to monarch the number of running processes."""
    m = metrics.Gauge('chromeos/autotest/scheduler/postjob_tasks')
    m.set(cls._num_running_processes, fields={'task_name': cls.__name__})