def _get(self, id, timeout):
    """Fetches a container from the pool and tags it with the given ID.

    @param id: A ContainerId to assign to the new container.
    @param timeout: A timeout (in seconds) to wait for the pool.  If a
                    container is not available from the pool within the
                    given period, None will be returned.

    @return: A container from the pool, or None if the pool yielded nothing.
    """
    logging.debug('Received get request (id=%s)', id)
    container = self._pool.get(timeout)
    got_container = container is not None

    if not got_container:
        logging.debug('No container (id=%s)', id)
    else:
        # The ID is assigned the moment the container leaves the pool, which
        # ties the container to the process it is about to be handed to.
        logging.debug('Assigning container (name=%s, id=%s)',
                      container.name, id)
        container.id = id

    request_counter = metrics.Counter(
        METRICS_PREFIX + '/container_requests',
        field_spec=[ts_mon.BooleanField('success')])
    request_counter.increment(fields={'success': got_container})
    return container
def __init__(self, container_path=constants.DEFAULT_CONTAINER_PATH,
             container_factory=None):
    """Initialize a ContainerBucket.

    @param container_path: Path to the directory used to store containers.
                           Default is set to AUTOSERV/container_path in
                           global config.
    @param container_factory: A factory for creating Containers.  When None,
                              a ContainerFactory is built on top of the base
                              image found under container_path.

    @raise error.ContainerError: If the base image cannot be retrieved while
                                 building the default factory.
    """
    self.container_path = os.path.realpath(container_path)
    if container_factory is not None:
        self._factory = container_factory
    else:
        # Pass in the container path so that the bucket is hermetic (i.e. so
        # that if the container path is customized, the base image doesn't
        # fall back to using the default container path).
        base_image_ok = True
        try:
            container = BaseImage(self.container_path).get()
        except error.ContainerError:
            base_image_ok = False
            # Bare raise re-raises the active exception with its original
            # traceback intact (``raise e`` would reset it on Python 2).
            raise
        finally:
            # Runs on both success and failure so that every attempt to load
            # the base image is counted.
            metrics.Counter(
                METRICS_PREFIX + '/base_image',
                field_spec=[ts_mon.BooleanField('corrupted')]
            ).increment(fields={'corrupted': not base_image_ok})
        self._factory = ContainerFactory(
            base_container=container,
            lxc_path=self.container_path)
    self.container_cache = {}
def EmitStop(_m, graceful):
    """Counts one Apache service stop.

    @param _m: A regex match object (not used by this function).
    @param graceful: Whether apache was stopped gracefully.
    """
    stop_counter = metrics.Counter(
        STOP_METRIC,
        description="A metric counting Apache service stops.",
        field_spec=[ts_mon.BooleanField('graceful')])
    stop_counter.increment(fields={'graceful': graceful})
def testGetMetricFieldSpec(self):
    """An int, a bool and a str value each map to the matching field class."""
    sample_fields = {'int': 12, 'bool': True, 'str': 'string'}
    wanted = [
        ts_mon.IntegerField('int'),
        ts_mon.BooleanField('bool'),
        ts_mon.StringField('str'),
    ]
    actual = ts_mon_config.GetMetricFieldSpec(sample_fields)
    self.assertEqual(actual, wanted)
def _create_workers(self):
    """Starts at most one new worker thread to help refill the pool.

    This method modifies the _workers list and should only be called from
    within run().
    """
    # Nothing to do when the pool is already full, or when the worker limit
    # has been reached.
    if self._pool.full() or len(self._workers) >= self._worker_max:
        return

    throttled = len(self._error_timestamps) >= _MAX_ERRORS_PER_HOUR
    throttle_counter = metrics.Counter(
        METRICS_PREFIX + '/error_throttled',
        field_spec=[ts_mon.BooleanField('throttled')])
    throttle_counter.increment(fields={'throttled': throttled})

    # Back off while too many errors have been recorded.
    if throttled:
        logging.warning('Error throttled (until %d)',
                        self._error_timestamps[0] + 3600)
        return

    current_size = self._pool.qsize()
    empty_slots = self._pool.maxsize - current_size
    worker_count = len(self._workers)

    # Only log when a worker is about to be created, and do it before the
    # thread starts: live threads are being counted, so logging first avoids
    # racing with the new thread's start-up.
    if worker_count < empty_slots and worker_count < self._worker_max:
        # The count may include workers that are no longer tracked in
        # self._workers, e.g. ones dropped from the list after timing out.
        live_workers = sum(
            1 for thread in threading.enumerate() if type(thread) is _Worker)
        # qsize    : current size of the container pool.
        # shortfall: number of empty slots currently in the pool.
        # workers  : number of live worker threads.
        logging.debug('qsize:%d shortfall:%d workers:%d',
                      current_size, empty_slots, live_workers)

    if len(self._workers) >= empty_slots:
        return

    new_worker = _Worker(self._factory,
                         self._on_worker_result,
                         self._on_worker_error)
    new_worker.start()
    self._workers.append(new_worker)
def main(argv):
    """Entry point for dut_mon.

    Loops forever: every two minutes it fetches all hosts from the AFE,
    buckets them, publishes the per-bucket counts on the dut_count gauge and
    bumps the tick counter.

    @param argv: Command line arguments (not used by this function).
    """
    logging.getLogger().setLevel(logging.INFO)

    with ts_mon_config.SetupTsMonGlobalState('dut_mon', indirect=True):
        afe = frontend.AFE()
        counters = collections.defaultdict(lambda: 0)

        field_spec = [ts_mon.StringField('board'),
                      ts_mon.StringField('model'),
                      ts_mon.StringField('pool'),
                      ts_mon.BooleanField('is_locked'),
                      ts_mon.StringField('status'),
                      ]
        dut_count = metrics.Gauge('chromeos/autotest/dut_mon/dut_count',
                                  description='The number of duts in a given '
                                              'state and bucket.',
                                  field_spec=field_spec)
        tick_count = metrics.Counter('chromeos/autotest/dut_mon/tick',
                                     description='Tick counter of dut_mon.')

        while True:
            # Note: We reset all counters to zero in each loop rather than
            # creating a new defaultdict, because we want to ensure that any
            # gauges that were previously set to a nonzero value by this
            # process get set back to zero if necessary.
            for k in counters:
                counters[k] = 0

            logging.info('Fetching all hosts.')
            hosts = afe.get_hosts()
            logging.info('Fetched %s hosts.', len(hosts))
            for host in hosts:
                fields = _get_bucket_for_host(host)
                counters[fields] += 1

            # items() (rather than the Python 2-only iteritems()) keeps this
            # loop working on both Python 2 and Python 3.
            for field, value in counters.items():
                logging.info('%s %s', field, value)
                dut_count.set(value, fields=field.__dict__)

            tick_count.increment()
            logging.info('Sleeping for 2 minutes.')
            time.sleep(120)
def create_container(self, new_id):
    """Creates a new container.

    Attempts to retrieve a container from the container pool.  If that
    operation fails, this falls back to the parent class behaviour.

    @param new_id: ContainerId to assign to the new container.  Containers
                   must be assigned an ID before they can be released from
                   the container pool.

    @return: The new container.
    """
    container = None
    if self._client:
        try:
            container = self._client.get_container(
                new_id, _CONTAINER_POOL_TIMEOUT)
        except Exception:
            # Best effort: a pool failure must not prevent local creation.
            logging.exception('Error communicating with container pool.')
        else:
            if container is not None:
                logging.debug('Retrieved container from pool: %s',
                              container.name)
                # Do not return here: returning before the counter below is
                # incremented would mean from_pool=True is never recorded.

    metrics.Counter(
        METRICS_PREFIX + '/containers_served',
        field_spec=[ts_mon.BooleanField('from_pool')]
    ).increment(fields={'from_pool': (container is not None)})

    if container is not None:
        return container

    # If the container pool did not yield a container, make one locally.
    logging.warning('Unable to obtain container from pre-populated pool.  '
                    'Creating container locally.  This slows server tests '
                    'down and should be debugged even if local creation '
                    'works out.')
    return super(_PoolBasedFactory, self).create_container(new_id)
from __future__ import absolute_import

from framework import authdata
from framework import sql
from framework import xsrf
from gae_ts_mon.handlers import TSMonJSHandler
from google.appengine.api import users
from infra_libs import ts_mon

# Field schema shared by both latency metrics defined below.
STANDARD_FIELDS = [
    ts_mon.StringField('client_id'),
    ts_mon.StringField('host_name'),
    ts_mon.BooleanField('document_visible'),
]

# User action metrics.

# Distribution (in milliseconds) of issue creation latency.
ISSUE_CREATE_LATENCY_METRIC = ts_mon.CumulativeDistributionMetric(
    'monorail/frontend/issue_create_latency',
    ('Latency between Issue Entry form submission and page load of '
     'the subsequent issue page.'),
    field_spec=STANDARD_FIELDS,
    units=ts_mon.MetricsDataUnits.MILLISECONDS)

# Distribution (in milliseconds) of issue update latency.
ISSUE_UPDATE_LATENCY_METRIC = ts_mon.CumulativeDistributionMetric(
    'monorail/frontend/issue_update_latency',
    ('Latency between Issue Update form submission and page load of '
     'the subsequent issue page.'),
    field_spec=STANDARD_FIELDS,
    units=ts_mon.MetricsDataUnits.MILLISECONDS)
def SecondsInstanceTimer(name, fields=None, description=None,
                         field_spec=_MISSING, record_on_exception=True,
                         add_exception_field=False):
    """Record the time of an operation to a FloatMetric.

    Records the time taken inside of the context block, to the Float metric
    named |name|, with the given fields. This is a non-cumulative metric; this
    represents the absolute time taken for a specific block. The duration is
    stored in a float to provide flexibility in the future for higher accuracy.

    Examples:
      # Time the doSomething() call, with field values that are independent of
      # the results of the operation.
      with SecondsInstanceTimer('timer/name', fields={'foo': 'bar'},
                                description='My timer',
                                field_spec=[ts_mon.StringField('foo'),
                                            ts_mon.BooleanField('success')]):
        doSomething()

      # Time the doSomethingElse call, with field values that depend on the
      # results of that operation. Note that it is important that a default
      # value is specified for these fields, in case an exception is thrown by
      # doSomethingElse()
      f = {'success': False, 'foo': 'bar'}
      with SecondsInstanceTimer('timer/name', fields=f,
                                description='My timer',
                                field_spec=[ts_mon.StringField('foo')]) as c:
        doSomethingElse()
        c['success'] = True

      # Incorrect Usage!
      with SecondsInstanceTimer('timer/name', description='My timer') as c:
        doSomething()
        c['foo'] = bar # 'foo' is not a valid field, because no default
                       # value for it was specified in the context
                       # constructor. It will be silently ignored.

    Args:
      name: The name of the metric to create
      fields: The fields of the metric to create. The mapping is copied; the
          caller's object is never modified.
      description: A string description of the metric.
      field_spec: A sequence of ts_mon.Field objects to specify the field
          schema. The sequence is copied, never modified. Whenever one is
          supplied, a BooleanField('encountered_exception') is added to the
          copy handed to the metric.
      record_on_exception: Whether to record metrics if an exception is raised.
      add_exception_field: Whether to record an 'encountered_exception' field
          whose value is True iff an exception was raised in the context. Only
          takes effect when record_on_exception is also True.

    Yields:
      A dict of field values (a copy of |fields|). Values stored under keys that
      were present at construction time are recorded with the measurement; keys
      added later are ignored.
    """
    # NOTE(review): the docstring examples use this as a context manager, so it
    # is presumably wrapped by a context-manager decorator outside this view.
    if field_spec is not None and field_spec is not _MISSING:
        # Build a new list rather than appending in place, so a field_spec
        # list shared between calls does not grow a duplicate field each time.
        # NOTE(review): the field is added even when add_exception_field is
        # False -- confirm whether that is intended.
        field_spec = list(field_spec) + [
            ts_mon.BooleanField('encountered_exception')]

    m = FloatMetric(name, description=description, field_spec=field_spec)
    f = dict(fields or {})
    # Snapshot of the keys that are allowed to be reported.
    keys = list(f)
    t0 = _GetSystemClock()

    error = True
    try:
        yield f
        # Only reached when the with-block body finished without raising.
        error = False
    finally:
        if record_on_exception and add_exception_field:
            keys.append('encountered_exception')
            f.setdefault('encountered_exception', error)
        # Filter out keys that were not part of the initial key set. This is to
        # avoid inconsistent fields.
        # TODO(akeshet): Doing this filtering isn't super efficient. Would be
        # better to implement some key-restricted subclass or wrapper around
        # dict, and just yield that above rather than yielding a regular dict.
        if record_on_exception or not error:
            dt = _GetSystemClock() - t0
            m.set(dt, fields={k: f[k] for k in keys})
def SecondsTimer(name, fields=None, description=None, field_spec=_MISSING,
                 scale=1, record_on_exception=True,
                 add_exception_field=False):
    """Record the time of an operation to a CumulativeSecondsDistributionMetric.

    Records the time taken inside of the context block, to the
    CumulativeSecondsDistribution named |name|, with the given fields.

    Usage:
      # Time the doSomething() call, with field values that are independent of
      # the results of the operation.
      with SecondsTimer('timer/name', fields={'foo': 'bar'},
                        description="My timer",
                        field_spec=[ts_mon.StringField('foo'),
                                    ts_mon.BooleanField('success')]):
        doSomething()

      # Time the doSomethingElse call, with field values that depend on the
      # results of that operation. Note that it is important that a default
      # value is specified for these fields, in case an exception is thrown by
      # doSomethingElse()
      f = {'success': False, 'foo': 'bar'}
      with SecondsTimer('timer/name', fields=f, description="My timer",
                        field_spec=[ts_mon.StringField('foo')]) as c:
        doSomethingElse()
        c['success'] = True

      # Incorrect Usage!
      with SecondsTimer('timer/name', description="My timer") as c:
        doSomething()
        c['foo'] = bar # 'foo' is not a valid field, because no default
                       # value for it was specified in the context
                       # constructor. It will be silently ignored.

    Args:
      name: The name of the metric to create
      fields: The fields of the metric to create. The mapping is copied; the
          caller's object is never modified.
      description: A string description of the metric.
      field_spec: A sequence of ts_mon.Field objects to specify the field
          schema. The sequence is copied, never modified. Whenever one is
          supplied, a BooleanField('encountered_exception') is added to the
          copy handed to the metric.
      scale: A float to scale the CumulativeSecondsDistribution buckets by.
      record_on_exception: Whether to record metrics if an exception is raised.
      add_exception_field: Whether to record an 'encountered_exception' field
          whose value is True iff an exception was raised in the context. Only
          takes effect when record_on_exception is also True.

    Yields:
      A dict of field values (a copy of |fields|). Values stored under keys that
      were present at construction time are recorded with the measurement; keys
      added later are ignored.
    """
    # NOTE(review): the docstring examples use this as a context manager, so it
    # is presumably wrapped by a context-manager decorator outside this view.
    if field_spec is not None and field_spec is not _MISSING:
        # Build a new list rather than appending in place, so a field_spec
        # list shared between calls does not grow a duplicate field each time.
        # NOTE(review): the field is added even when add_exception_field is
        # False -- confirm whether that is intended.
        field_spec = list(field_spec) + [
            ts_mon.BooleanField('encountered_exception')]

    m = CumulativeSecondsDistribution(
        name, scale=scale, description=description, field_spec=field_spec)
    f = dict(fields or {})
    # list(f) gives a real, appendable list of keys on both Python 2 and 3
    # (dict.keys() is a view without append() on Python 3).
    keys = list(f)
    t0 = datetime.datetime.now()

    error = True
    try:
        yield f
        # Only reached when the with-block body finished without raising.
        error = False
    finally:
        if record_on_exception and add_exception_field:
            keys.append('encountered_exception')
            f.setdefault('encountered_exception', error)
        # Filter out keys that were not part of the initial key set. This is to
        # avoid inconsistent fields.
        # TODO(akeshet): Doing this filtering isn't super efficient. Would be
        # better to implement some key-restricted subclass or wrapper around
        # dict, and just yield that above rather than yielding a regular dict.
        if record_on_exception or not error:
            dt = (datetime.datetime.now() - t0).total_seconds()
            m.add(dt, fields={k: f[k] for k in keys})
# ts_mon metrics published under the 'buildbot/slave/' prefix.

# Boolean metric with no fields: connection state to the master.
connected_metric = ts_mon.BooleanMetric(
    'buildbot/slave/connected',
    'Whether the slave is currently connected to its master.',
    None)
# Counter keyed by failure reason.
connection_failures_metric = ts_mon.CounterMetric(
    'buildbot/slave/connection_failures',
    'Count of failures connecting to the buildbot master.',
    [ts_mon.StringField('reason')])
# Boolean metric keyed by builder name.
running_metric = ts_mon.BooleanMetric(
    'buildbot/slave/is_building',
    'Whether a build step is currently in progress.',
    [ts_mon.StringField('builder')])
# Counter keyed by builder name and step outcome.
steps_metric = ts_mon.CounterMetric(
    'buildbot/slave/steps',
    'Count of build steps run by each builder on this slave.',
    [ts_mon.StringField('builder'),
     ts_mon.BooleanField('success')])


class UnknownCommand(pb.Error):
    # Marker exception type; adds nothing beyond pb.Error.
    pass


class SlaveBuilder(pb.Referenceable, service.Service):

    """This is the local representation of a single Builder: it handles a
    single kind of build (like an all-warnings build). It has a name and a
    home directory. The rest of its behavior is determined by the master.
    """

    stopCommandOnShutdown = True

    # remote is a ref to the Builder object on the master side, and is set
# Copyright 2019 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. from gae_ts_mon.handlers import TSMonJSHandler from infra_libs import ts_mon FIELDS = [ ts_mon.IntegerField('fe_version'), ts_mon.BooleanField('signed_in'), ] METRICS = [ ts_mon.CumulativeDistributionMetric( 'chromeperf/load/page', 'page loadEventEnd - fetchStart', units=ts_mon.MetricsDataUnits.MILLISECONDS, field_spec=FIELDS), ts_mon.CumulativeDistributionMetric( 'chromeperf/load/chart', 'chart load latency', units=ts_mon.MetricsDataUnits.MILLISECONDS, field_spec=FIELDS), ts_mon.CumulativeDistributionMetric( 'chromeperf/load/alerts', 'alerts load latency', units=ts_mon.MetricsDataUnits.MILLISECONDS, field_spec=FIELDS), ts_mon.CumulativeDistributionMetric( 'chromeperf/action/triage', 'alert triage latency',
class LoadApiClientConfigs(webapp2.RequestHandler):
    """Request handler that fetches a config from luci-config and stores it.

    Every attempt is counted on the config_loads metric, with a 'success'
    flag and a 'type' string naming the outcome.
    """

    config_loads = ts_mon.CounterMetric(
        'monorail/client_config_svc/loads',
        'Results of fetches from luci-config.',
        [ts_mon.BooleanField('success'), ts_mon.StringField('type')])

    def get(self):
        """Fetch the config, validate it and persist it as a ClientConfig.

        Aborts the request with a 500 if luci-config does not answer with
        status 200 or if the response cannot be processed.
        """
        authorization_token, _ = app_identity.get_access_token(
            framework_constants.OAUTH_SCOPE)
        response = urlfetch.fetch(
            LUCI_CONFIG_URL,
            method=urlfetch.GET,
            follow_redirects=False,
            headers={
                'Content-Type': 'application/json; charset=UTF-8',
                'Authorization': 'Bearer ' + authorization_token
            })

        if response.status_code != 200:
            logging.error('Invalid response from luci-config: %r', response)
            self._count_failure('luci-cfg-error')
            self.abort(500, 'Invalid response from luci-config')

        try:
            content_text = self._process_response(response)
        except Exception as e:
            # _process_response has already logged and counted the failure.
            self.abort(500, str(e))

        logging.info('luci-config content decoded: %r.', content_text)
        configs = ClientConfig(configs=content_text,
                               key_name='api_client_configs')
        configs.put()
        self.config_loads.increment({'success': True, 'type': 'success'})

    def _count_failure(self, failure_type):
        """Increment config_loads for a failed load of the given type."""
        self.config_loads.increment({
            'success': False,
            'type': failure_type,
        })

    def _process_response(self, response):
        """Decode and validate the body of a luci-config response.

        Each failure is logged and counted, then the original exception is
        re-raised unchanged.

        Args:
          response: The fetch result; its `content` attribute is expected to
              hold JSON with a base64 encoded 'content' entry.

        Returns:
          The base64-decoded config text, after it has been checked to parse
          as a ClientCfg text proto.
        """
        try:
            content = json.loads(response.content)
        except ValueError:
            logging.error('Response was not JSON: %r', response.content)
            self._count_failure('json-load-error')
            raise

        try:
            config_content = content['content']
        except (KeyError, TypeError):
            # TypeError covers valid JSON that is not an object (e.g. a list
            # or a number), which cannot be indexed by a string key.
            logging.error('JSON contained no content: %r', content)
            self._count_failure('json-key-error')
            raise

        try:
            content_text = base64.b64decode(config_content)
        except (TypeError, ValueError):
            # Python 2 raises TypeError for bad input; Python 3 raises
            # binascii.Error, which is a ValueError subclass.
            logging.error('Content was not b64: %r', config_content)
            self._count_failure('b64-decode-error')
            raise

        try:
            # Parsed only to validate; the text itself is what gets returned.
            cfg = api_clients_config_pb2.ClientCfg()
            protobuf.text_format.Merge(content_text, cfg)
        except Exception:
            logging.error('Content was not a valid ClientCfg proto: %r',
                          content_text)
            self._count_failure('proto-load-error')
            raise

        return content_text
def PerformStage(self):
    """Perform the actual work for this stage.

    This includes final metadata archival, and update CIDB with our final
    status as well as producing a logged build result summary.
    """
    build_identifier, _ = self._run.GetCIDBHandle()
    build_id = build_identifier.cidb_id
    buildbucket_id = build_identifier.buildbucket_id
    # The final status is PASSED only if the build succeeded so far.
    if results_lib.Results.BuildSucceededSoFar(self.buildstore,
                                               buildbucket_id, self.name):
        final_status = constants.BUILDER_STATUS_PASSED
    else:
        final_status = constants.BUILDER_STATUS_FAILED

    if not hasattr(self._run.attrs, 'release_tag'):
        # If, for some reason, sync stage was not completed and
        # release_tag was not set. Set it to None here because
        # ArchiveResults() depends on the existence of this attr.
        self._run.attrs.release_tag = None

    # Set up our report metadata.
    self._run.attrs.metadata.UpdateWithDict(
        self.GetReportMetadata(
            final_status=final_status,
            completion_instance=self._completion_instance))

    src_root = self._build_root
    # Workspace builders use a different buildroot for overlays.
    if self._run.config.workspace_branch and self._run.options.workspace:
        src_root = self._run.options.workspace

    # Add tags for the arches and statuses of the build.
    # arches requires crossdev which isn't available at the early part of the
    # build.
    arches = []
    for board in self._run.config['boards']:
        toolchains = toolchain.GetToolchainsForBoard(board,
                                                     buildroot=src_root)
        default = list(
            toolchain.FilterToolchains(toolchains, 'default', True))
        if default:
            try:
                arches.append(toolchain.GetArchForTarget(default[0]))
            except cros_build_lib.RunCommandError as e:
                # Best effort: a board whose arch lookup fails is simply left
                # out of the 'arches' tag.
                logging.warning(
                    'Unable to retrieve arch for board %s default toolchain %s: %s',
                    board, default, e)
    tags = {
        'arches': arches,
        'status': final_status,
    }
    # Add one status tag and one summary tag per recorded stage result.
    results = self._run.attrs.metadata.GetValue('results')
    for stage in results:
        tags['stage_status:%s' % stage['name']] = stage['status']
        tags['stage_summary:%s' % stage['name']] = stage['summary']
    self._run.attrs.metadata.UpdateKeyDictWithDict(constants.METADATA_TAGS,
                                                   tags)

    # Some operations can only be performed if a valid version is available.
    try:
        self._run.GetVersionInfo()
        self.ArchiveResults(final_status)
        metadata_url = os.path.join(self.upload_url, constants.METADATA_JSON)
    except cbuildbot_run.VersionNotSetError:
        logging.error('A valid version was never set for this run. '
                      'Can not archive results.')
        # An empty URL is what gets passed to FinishBuild() below.
        metadata_url = ''

    results_lib.Results.Report(sys.stdout,
                               current_version=(self._run.attrs.release_tag or
                                                ''))

    # Upload goma log if used for BuildPackage and TestSimpleChrome.
    _UploadAndLinkGomaLogIfNecessary(
        'BuildPackages',
        self._run.config.name,
        self._run.options.goma_dir,
        self._run.options.goma_client_json,
        self._run.attrs.metadata.GetValueWithDefault('goma_tmp_dir'))
    _UploadAndLinkGomaLogIfNecessary(
        'TestSimpleChromeWorkflow',
        self._run.config.name,
        self._run.options.goma_dir,
        self._run.options.goma_client_json,
        self._run.attrs.metadata.GetValueWithDefault(
            'goma_tmp_dir_for_simple_chrome'))

    if self.buildstore.AreClientsReady():
        status_for_db = final_status

        # TODO(pprabhu): After BuildData and CBuildbotMetdata are merged, remove
        # this extra temporary object creation.
        # XXX:HACK We're creating a BuildData with an empty URL. Don't try to
        # MarkGathered this object.
        build_data = metadata_lib.BuildData(
            '', self._run.attrs.metadata.GetDict())
        # TODO(akeshet): Find a clearer way to get the "primary upload url" for
        # the metadata.json file. One alternative is _GetUploadUrls(...)[0].
        # Today it seems that element 0 of its return list is the primary upload
        # url, but there is no guarantee or unit test coverage of that.
        self.buildstore.FinishBuild(build_id, status=status_for_db,
                                    summary=build_data.failure_message,
                                    metadata_url=metadata_url)

        duration = self._GetBuildDuration()

        # Fields shared by the completion counter and the duration
        # distribution below.
        mon_fields = {
            'status': status_for_db,
            'build_config': self._run.config.name,
            'important': self._run.config.important
        }
        metrics.Counter(
            constants.MON_BUILD_COMP_COUNT).increment(fields=mon_fields)
        metrics.CumulativeSecondsDistribution(
            constants.MON_BUILD_DURATION).add(duration, fields=mon_fields)

        if self._run.options.sanity_check_build:
            metrics.Counter(
                constants.MON_BUILD_SANITY_COMP_COUNT).increment(
                    fields=mon_fields)
            # The gauge uses mon_fields plus an extra builder_name field.
            metrics.Gauge(
                constants.MON_BUILD_SANITY_ID,
                description=
                'The build number of the latest sanity build. Used '
                'for recovering the link to the latest failing build '
                'in the alert when a sanity build fails.',
                field_spec=[
                    ts_mon.StringField('status'),
                    ts_mon.StringField('build_config'),
                    ts_mon.StringField('builder_name'),
                    ts_mon.BooleanField('important')
                ]).set(self._run.buildnumber,
                       fields=dict(
                           mon_fields,
                           builder_name=self._run.GetBuilderName()))

        if config_lib.IsMasterCQ(self._run.config):
            self_destructed = self._run.attrs.metadata.GetValueWithDefault(
                constants.SELF_DESTRUCTED_BUILD, False)
            # The CQ duration metric uses its own, smaller field set, so
            # mon_fields is rebound here.
            mon_fields = {
                'status': status_for_db,
                'self_destructed': self_destructed
            }
            metrics.CumulativeSecondsDistribution(
                constants.MON_CQ_BUILD_DURATION).add(duration,
                                                     fields=mon_fields)
        annotator_link = uri_lib.ConstructAnnotatorUri(build_id)
        logging.PrintBuildbotLink('Build annotator', annotator_link)

        # From this point forward, treat all exceptions as warnings.
        self._post_completion = True

        # Dump report about things we retry.
        retry_stats.ReportStats(sys.stdout)