def test_aggregate_cpu(self):
    monitor = SystemResourceMonitor(poll_interval=0.25)
    monitor.start()
    time.sleep(1)
    monitor.stop()

    values = monitor.aggregate_cpu_percent()
    self.assertIsInstance(values, list)
    self.assertEqual(len(values), multiprocessing.cpu_count())
    for v in values:
        self.assertIsInstance(v, float)

    value = monitor.aggregate_cpu_percent(per_cpu=False)
    self.assertIsInstance(value, float)

    values = monitor.aggregate_cpu_times()
    self.assertIsInstance(values, list)
    self.assertGreater(len(values), 0)
    self.assertTrue(hasattr(values[0], 'user'))

    t = type(values[0])
    value = monitor.aggregate_cpu_times(per_cpu=False)
    self.assertIsInstance(value, t)
def test_as_dict(self):
    monitor = SystemResourceMonitor(poll_interval=0.25)
    monitor.start()
    time.sleep(0.1)
    monitor.begin_phase('phase1')
    monitor.record_event('foo')
    time.sleep(0.1)
    monitor.begin_phase('phase2')
    monitor.record_event('bar')
    time.sleep(0.2)
    monitor.finish_phase('phase1')
    time.sleep(0.2)
    monitor.finish_phase('phase2')
    time.sleep(0.4)
    monitor.stop()

    d = monitor.as_dict()

    self.assertEqual(d['version'], 2)
    self.assertEqual(len(d['events']), 2)
    self.assertEqual(len(d['phases']), 2)
    self.assertIn('system', d)
    self.assertIsInstance(d['system'], dict)
    self.assertIsInstance(d['overall'], dict)
    self.assertIn('duration', d['overall'])
    self.assertIn('cpu_times', d['overall'])
def test_empty(self):
    monitor = SystemResourceMonitor(poll_interval=2.0)
    monitor.start()
    monitor.stop()

    data = list(monitor.range_usage())
    self.assertEqual(len(data), 0)
def test_phases(self):
    monitor = SystemResourceMonitor(poll_interval=0.25)
    monitor.start()
    time.sleep(1)

    with monitor.phase('phase1'):
        time.sleep(1)
        with monitor.phase('phase2'):
            time.sleep(1)

    monitor.stop()

    self.assertEqual(len(monitor.phases), 2)
    self.assertEqual(['phase2', 'phase1'], monitor.phases.keys())

    all = list(monitor.range_usage())
    data1 = list(monitor.phase_usage('phase1'))
    data2 = list(monitor.phase_usage('phase2'))

    self.assertGreater(len(all), len(data1))
    self.assertGreater(len(data1), len(data2))

    # This could fail if time.time() takes more than 0.1s. It really
    # shouldn't.
    self.assertAlmostEqual(data1[-1].end, data2[-1].end, delta=0.25)
def test_events(self):
    monitor = SystemResourceMonitor(poll_interval=0.25)
    monitor.start()
    time.sleep(0.5)

    t0 = time.time()
    monitor.record_event('t0')
    time.sleep(0.5)

    t1 = time.time()
    monitor.record_event('t1')
    time.sleep(0.5)
    monitor.stop()

    events = monitor.events
    self.assertEqual(len(events), 2)

    event = events[0]

    self.assertEqual(event[1], 't0')
    self.assertAlmostEqual(event[0], t0, delta=0.25)

    data = list(monitor.between_events_usage('t0', 't1'))
    self.assertGreater(len(data), 0)
def test_aggregate_io(self):
    monitor = SystemResourceMonitor(poll_interval=0.25)

    # There's really no easy way to ensure I/O occurs. For all we know
    # reads and writes will all be serviced by the page cache.
    monitor.start()
    time.sleep(1.0)
    monitor.stop()

    values = monitor.aggregate_io()
    self.assertTrue(hasattr(values, 'read_count'))
def test_memory(self):
    monitor = SystemResourceMonitor(poll_interval=0.25)
    monitor.start()
    time.sleep(1.0)
    monitor.stop()

    v = monitor.min_memory_available()
    self.assertIsInstance(v, long)

    v = monitor.max_memory_percent()
    self.assertIsInstance(v, float)
def test_basic(self):
    monitor = SystemResourceMonitor(poll_interval=0.5)
    monitor.start()
    time.sleep(3)
    monitor.stop()

    data = list(monitor.range_usage())
    self.assertGreater(len(data), 3)
    self.assertIsInstance(data[0], SystemResourceUsage)
def test_as_dict(self):
    monitor = SystemResourceMonitor(poll_interval=0.25)
    monitor.start()
    time.sleep(0.1)
    monitor.begin_phase('phase1')
    monitor.record_event('foo')
    time.sleep(0.1)
    monitor.begin_phase('phase2')
    monitor.record_event('bar')
    time.sleep(0.2)
    monitor.finish_phase('phase1')
    time.sleep(0.2)
    monitor.finish_phase('phase2')
    time.sleep(0.4)
    monitor.stop()

    d = monitor.as_dict()

    self.assertEqual(d['version'], 1)
    self.assertEqual(len(d['events']), 2)
    self.assertEqual(len(d['phases']), 2)
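# The tests above exercise the public surface of SystemResourceMonitor:
# start/stop, nested phases, point-in-time events, the aggregate accessors,
# and the serializable as_dict() summary. A minimal standalone sketch of that
# lifecycle, assuming mozsystemmonitor is installed; the phase and event names
# are made up for illustration.
import time

from mozsystemmonitor.resourcemonitor import SystemResourceMonitor

monitor = SystemResourceMonitor(poll_interval=0.25)
monitor.start()

with monitor.phase('compile'):            # phases bracket interesting work
    monitor.record_event('first-object')  # events are point-in-time markers
    time.sleep(1)

monitor.stop()

# Aggregates over the whole run; per_cpu=False collapses to a single value.
print(monitor.aggregate_cpu_percent(per_cpu=False))
print(monitor.aggregate_io())

# JSON-serializable summary, the same structure checked by test_as_dict().
data = monitor.as_dict()
print(data['version'], len(data['phases']), len(data['events']))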
class BuildMonitor(MozbuildObject):
    """Monitors the output of the build."""

    def init(self, warnings_path):
        """Create a new monitor.

        warnings_path is a path of a warnings database to use.
        """
        self._warnings_path = warnings_path
        self.resources = SystemResourceMonitor(poll_interval=1.0)
        self._resources_started = False

        self.tiers = TierStatus(self.resources)

        self.warnings_database = WarningsDatabase()
        if os.path.exists(warnings_path):
            try:
                self.warnings_database.load_from_file(warnings_path)
            except ValueError:
                os.remove(warnings_path)

        self._warnings_collector = WarningsCollector(
            database=self.warnings_database, objdir=self.topobjdir)

    def start(self):
        """Record the start of the build."""
        self.start_time = time.time()
        self._finder_start_cpu = self._get_finder_cpu_usage()

    def start_resource_recording(self):
        # This should be merged into start() once bug 892342 lands.
        self.resources.start()
        self._resources_started = True

    def on_line(self, line):
        """Consume a line of output from the build system.

        This will parse the line for state and determine whether more action
        is needed.

        Returns a BuildOutputResult instance.

        In this named tuple, warning will be an object describing a new parsed
        warning. Otherwise it will be None.

        state_changed indicates whether the build system changed state with
        this line. If the build system changed state, the caller may want to
        query this instance for the current state in order to update UI, etc.

        for_display is a boolean indicating whether the line is relevant to
        the user. This is typically used to filter whether the line should be
        presented to the user.
        """
        if line.startswith('BUILDSTATUS'):
            args = line.split()[1:]

            action = args.pop(0)
            update_needed = True

            if action == 'TIERS':
                self.tiers.set_tiers(args)
                update_needed = False
            elif action == 'TIER_START':
                tier = args[0]
                self.tiers.begin_tier(tier)
            elif action == 'TIER_FINISH':
                tier, = args
                self.tiers.finish_tier(tier)
            else:
                raise Exception('Unknown build status: %s' % action)

            return BuildOutputResult(None, update_needed, False)

        warning = None

        try:
            warning = self._warnings_collector.process_line(line)
        except:
            pass

        return BuildOutputResult(warning, False, True)

    def finish(self, record_usage=True):
        """Record the end of the build."""
        self.end_time = time.time()

        if self._resources_started:
            self.resources.stop()

        self._finder_end_cpu = self._get_finder_cpu_usage()
        self.elapsed = self.end_time - self.start_time

        self.warnings_database.prune()
        self.warnings_database.save_to_file(self._warnings_path)

        if not record_usage:
            return

        try:
            usage = self.record_resource_usage()
            if not usage:
                return

            with open(self._get_state_filename('build_resources.json'), 'w') as fh:
                json.dump(usage, fh, indent=2)
        except Exception as e:
            self.log(logging.WARNING, 'build_resources_error',
                     {'msg': str(e)},
                     'Exception when writing resource usage file: {msg}')

    def _get_finder_cpu_usage(self):
        """Obtain the CPU usage of the Finder app on OS X.

        This is used to detect high CPU usage.
        """
        if not sys.platform.startswith('darwin'):
            return None

        if not psutil:
            return None

        for proc in psutil.process_iter():
            if proc.name != 'Finder':
                continue

            if proc.username != getpass.getuser():
                continue

            # Try to isolate system finder as opposed to other "Finder"
            # processes.
            if not proc.exe.endswith('CoreServices/Finder.app/Contents/MacOS/Finder'):
                continue

            return proc.get_cpu_times()

        return None

    def have_high_finder_usage(self):
        """Determine whether there was high Finder CPU usage during the build.

        Returns True if there was high Finder CPU usage, False if there
        wasn't, or None if there is nothing to report.
        """
        if not self._finder_start_cpu:
            return None, None

        # We only measure if the measured range is sufficiently long.
        if self.elapsed < 15:
            return None, None

        if not self._finder_end_cpu:
            return None, None

        start = self._finder_start_cpu
        end = self._finder_end_cpu

        start_total = start.user + start.system
        end_total = end.user + end.system

        cpu_seconds = end_total - start_total

        # If Finder used more than 25% of 1 core during the build, report an
        # error.
        finder_percent = cpu_seconds / self.elapsed * 100

        return finder_percent > 25, finder_percent

    def have_excessive_swapping(self):
        """Determine whether there was excessive swapping during the build.

        Returns a tuple of (excessive, swap_in, swap_out). All values are None
        if no swap information is available.
        """
        if not self.have_resource_usage:
            return None, None, None

        swap_in = sum(m.swap.sin for m in self.resources.measurements)
        swap_out = sum(m.swap.sout for m in self.resources.measurements)

        # The threshold of 1024 MB has been arbitrarily chosen.
        #
        # Choosing a proper value that is ideal for everyone is hard. We will
        # likely iterate on the logic until people are generally satisfied.
        # If a value is too low, the eventual warning produced does not carry
        # much meaning. If the threshold is too high, people may not see the
        # warning and the warning will thus be ineffective.
        excessive = swap_in > 512 * 1048576 or swap_out > 512 * 1048576

        return excessive, swap_in, swap_out

    @property
    def have_resource_usage(self):
        """Whether resource usage is available."""
        return self.resources.start_time is not None

    def record_resource_usage(self):
        """Record the resource usage of this build.

        We write a log message containing a high-level summary. We also
        produce a data structure containing the low-level resource usage
        information. This data structure can e.g. be serialized into JSON and
        saved for subsequent analysis.

        If no resource usage is available, None is returned.
        """
        if not self.have_resource_usage:
            return None

        cpu_percent = self.resources.aggregate_cpu_percent(phase=None,
            per_cpu=False)
        cpu_times = self.resources.aggregate_cpu_times(phase=None,
            per_cpu=False)
        io = self.resources.aggregate_io(phase=None)

        self._log_resource_usage('Overall system resources', 'resource_usage',
            self.end_time - self.start_time, cpu_percent, cpu_times, io)

        excessive, sin, sout = self.have_excessive_swapping()
        if excessive is not None and (sin or sout):
            sin /= 1048576
            sout /= 1048576
            self.log(logging.WARNING, 'swap_activity',
                {'sin': sin, 'sout': sout},
                'Swap in/out (MB): {sin}/{sout}')

        o = dict(
            version=1,
            start=self.start_time,
            end=self.end_time,
            duration=self.end_time - self.start_time,
            resources=[],
            cpu_percent=cpu_percent,
            cpu_times=cpu_times,
            io=io,
        )

        o['tiers'] = self.tiers.tiered_resource_usage()

        self.tiers.add_resource_fields_to_dict(o)

        for usage in self.resources.range_usage():
            cpu_percent = self.resources.aggregate_cpu_percent(usage.start,
                usage.end, per_cpu=False)
            cpu_times = self.resources.aggregate_cpu_times(usage.start,
                usage.end, per_cpu=False)

            entry = dict(
                start=usage.start,
                end=usage.end,
                virt=list(usage.virt),
                swap=list(usage.swap),
            )

            self.tiers.add_resources_to_dict(entry, start=usage.start,
                end=usage.end)

            o['resources'].append(entry)

        return o

    def _log_resource_usage(self, prefix, m_type, duration, cpu_percent,
            cpu_times, io, extra_params={}):

        params = dict(
            duration=duration,
            cpu_percent=cpu_percent,
            io_reads=io.read_count,
            io_writes=io.write_count,
            io_read_bytes=io.read_bytes,
            io_write_bytes=io.write_bytes,
            io_read_time=io.read_time,
            io_write_time=io.write_time,
        )

        params.update(extra_params)

        message = prefix + ' - Wall time: {duration:.0f}s; ' \
            'CPU: {cpu_percent:.0f}%; ' \
            'Read bytes: {io_read_bytes}; Write bytes: {io_write_bytes}; ' \
            'Read time: {io_read_time}; Write time: {io_write_time}'

        self.log(logging.WARNING, m_type, params, message)
class ResourceMonitoringMixin(PerfherderResourceOptionsMixin):
    """Provides resource monitoring capabilities to scripts.

    When this class is in the inheritance chain, resource usage stats of the
    executing script will be recorded.

    This class requires the VirtualenvMixin in order to install a package used
    for recording resource usage.

    While we would like to record resource usage for the entirety of a script,
    since we require an external package, we can only record resource usage
    after that package is installed (as part of creating the virtualenv).
    That's just the way things have to be.
    """
    def __init__(self, *args, **kwargs):
        super(ResourceMonitoringMixin, self).__init__(*args, **kwargs)

        self.register_virtualenv_module('psutil>=3.1.1', method='pip',
                                        optional=True)
        self.register_virtualenv_module('mozsystemmonitor==0.3',
                                        method='pip', optional=True)
        self.register_virtualenv_module('jsonschema==2.5.1', method='pip')
        # explicitly install functools32, because some slaves aren't using
        # a version of pip recent enough to install it automatically with
        # jsonschema (which depends on it)
        # https://github.com/Julian/jsonschema/issues/233
        self.register_virtualenv_module('functools32==3.2.3-2', method='pip')
        self._resource_monitor = None

        # 2-tuple of (name, options) to assign Perfherder resource monitor
        # metrics to. This needs to be assigned by a script in order for
        # Perfherder metrics to be reported.
        self.resource_monitor_perfherder_id = None

    @PostScriptAction('create-virtualenv')
    def _start_resource_monitoring(self, action, success=None):
        self.activate_virtualenv()

        # Resource Monitor requires Python 2.7, however it's currently optional.
        # Remove when all machines have had their Python version updated (bug 711299).
        if sys.version_info[:2] < (2, 7):
            self.warning('Resource monitoring will not be enabled! Python 2.7+ required.')
            return

        try:
            from mozsystemmonitor.resourcemonitor import SystemResourceMonitor

            self.info("Starting resource monitoring.")
            self._resource_monitor = SystemResourceMonitor(poll_interval=1.0)
            self._resource_monitor.start()
        except Exception:
            self.warning("Unable to start resource monitor: %s" %
                         traceback.format_exc())

    @PreScriptAction
    def _resource_record_pre_action(self, action):
        # Resource monitor isn't available until after create-virtualenv.
        if not self._resource_monitor:
            return

        self._resource_monitor.begin_phase(action)

    @PostScriptAction
    def _resource_record_post_action(self, action, success=None):
        # Resource monitor isn't available until after create-virtualenv.
        if not self._resource_monitor:
            return

        self._resource_monitor.finish_phase(action)

    @PostScriptRun
    def _resource_record_post_run(self):
        if not self._resource_monitor:
            return

        # This should never raise an exception. This is a workaround until
        # mozsystemmonitor is fixed. See bug 895388.
        try:
            self._resource_monitor.stop()
            self._log_resource_usage()

            # Upload a JSON file containing the raw resource data.
            try:
                upload_dir = self.query_abs_dirs()['abs_blob_upload_dir']
                if not os.path.exists(upload_dir):
                    os.makedirs(upload_dir)
                with open(os.path.join(upload_dir, 'resource-usage.json'), 'wb') as fh:
                    json.dump(self._resource_monitor.as_dict(), fh,
                              sort_keys=True, indent=4)
            except (AttributeError, KeyError):
                self.exception('could not upload resource usage JSON',
                               level=WARNING)
        except Exception:
            self.warning("Exception when reporting resource usage: %s" %
                         traceback.format_exc())

    def _log_resource_usage(self):
        # Delay import because not available until virtualenv is populated.
        import jsonschema

        rm = self._resource_monitor

        if rm.start_time is None:
            return

        def resources(phase):
            cpu_percent = rm.aggregate_cpu_percent(phase=phase, per_cpu=False)
            cpu_times = rm.aggregate_cpu_times(phase=phase, per_cpu=False)
            io = rm.aggregate_io(phase=phase)

            swap_in = sum(m.swap.sin for m in rm.measurements)
            swap_out = sum(m.swap.sout for m in rm.measurements)

            return cpu_percent, cpu_times, io, (swap_in, swap_out)

        def log_usage(prefix, duration, cpu_percent, cpu_times, io):
            message = '{prefix} - Wall time: {duration:.0f}s; ' \
                'CPU: {cpu_percent}; ' \
                'Read bytes: {io_read_bytes}; Write bytes: {io_write_bytes}; ' \
                'Read time: {io_read_time}; Write time: {io_write_time}'

            # XXX Some test harnesses are complaining about a string being
            # fed into a 'f' formatter. This will help diagnose the issue.
            cpu_percent_str = str(round(cpu_percent)) + '%' if cpu_percent else "Can't collect data"

            try:
                self.info(
                    message.format(
                        prefix=prefix, duration=duration,
                        cpu_percent=cpu_percent_str,
                        io_read_bytes=io.read_bytes,
                        io_write_bytes=io.write_bytes,
                        io_read_time=io.read_time,
                        io_write_time=io.write_time
                    )
                )
            except ValueError:
                self.warning("Exception when formatting: %s" %
                             traceback.format_exc())

        cpu_percent, cpu_times, io, (swap_in, swap_out) = resources(None)
        duration = rm.end_time - rm.start_time

        # Write out Perfherder data if configured.
        if self.resource_monitor_perfherder_id:
            perfherder_name, perfherder_options = self.resource_monitor_perfherder_id

            suites = []
            overall = []

            if cpu_percent:
                overall.append({
                    'name': 'cpu_percent',
                    'value': cpu_percent,
                })

            overall.extend([
                {'name': 'io_write_bytes', 'value': io.write_bytes},
                {'name': 'io.read_bytes', 'value': io.read_bytes},
                {'name': 'io_write_time', 'value': io.write_time},
                {'name': 'io_read_time', 'value': io.read_time},
            ])

            suites.append({
                'name': '%s.overall' % perfherder_name,
                'extraOptions': perfherder_options + self.perfherder_resource_options(),
                'subtests': overall,
            })

            for phase in rm.phases.keys():
                phase_duration = rm.phases[phase][1] - rm.phases[phase][0]
                subtests = [
                    {
                        'name': 'time',
                        'value': phase_duration,
                    }
                ]
                cpu_percent = rm.aggregate_cpu_percent(phase=phase,
                                                       per_cpu=False)
                if cpu_percent is not None:
                    subtests.append({
                        'name': 'cpu_percent',
                        'value': rm.aggregate_cpu_percent(phase=phase,
                                                          per_cpu=False),
                    })

                # We don't report I/O during each step because measured I/O
                # is system I/O and that I/O can be delayed (e.g. writes will
                # buffer before being flushed and recorded in our metrics).
                suites.append({
                    'name': '%s.%s' % (perfherder_name, phase),
                    'subtests': subtests,
                })

            data = {
                'framework': {'name': 'job_resource_usage'},
                'suites': suites,
            }

            schema_path = os.path.join(external_tools_path,
                                       'performance-artifact-schema.json')
            with open(schema_path, 'rb') as fh:
                schema = json.load(fh)

            # this will throw an exception that causes the job to fail if the
            # perfherder data is not valid -- please don't change this
            # behaviour, otherwise people will inadvertently break this
            # functionality
            self.info('Validating Perfherder data against %s' % schema_path)
            jsonschema.validate(data, schema)

            self.info('PERFHERDER_DATA: %s' % json.dumps(data))

        log_usage('Total resource usage', duration, cpu_percent, cpu_times, io)

        # Print special messages so usage shows up in Treeherder.
        if cpu_percent:
            self._tinderbox_print('CPU usage<br/>{:,.1f}%'.format(
                cpu_percent))

        self._tinderbox_print('I/O read bytes / time<br/>{:,} / {:,}'.format(
            io.read_bytes, io.read_time))
        self._tinderbox_print('I/O write bytes / time<br/>{:,} / {:,}'.format(
            io.write_bytes, io.write_time))

        # Print CPU components having >1%. "cpu_times" is a data structure
        # whose attributes are measurements. Ideally we'd have an API that
        # returned just the measurements as a dict or something.
        cpu_attrs = []
        for attr in sorted(dir(cpu_times)):
            if attr.startswith('_'):
                continue
            if attr in ('count', 'index'):
                continue
            cpu_attrs.append(attr)

        cpu_total = sum(getattr(cpu_times, attr) for attr in cpu_attrs)

        for attr in cpu_attrs:
            value = getattr(cpu_times, attr)
            # cpu_total can be 0.0. Guard against division by 0.
            percent = value / cpu_total * 100.0 if cpu_total else 0.0

            if percent > 1.00:
                self._tinderbox_print('CPU {}<br/>{:,.1f} ({:,.1f}%)'.format(
                    attr, value, percent))

        # Swap on Windows isn't reported by psutil.
        if not self._is_windows():
            self._tinderbox_print('Swap in / out<br/>{:,} / {:,}'.format(
                swap_in, swap_out))

        for phase in rm.phases.keys():
            start_time, end_time = rm.phases[phase]
            cpu_percent, cpu_times, io, swap = resources(phase)
            log_usage(phase, end_time - start_time, cpu_percent, cpu_times, io)

    def _tinderbox_print(self, message):
        self.info('TinderboxPrint: %s' % message)
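# The Perfherder payload assembled in _log_resource_usage() follows the
# job_resource_usage layout: one "<name>.overall" suite plus one suite per
# phase, each holding name/value subtests. A trimmed, hand-written example of
# the resulting structure; the suite name, options, and numbers are invented,
# only the shape mirrors the code above.
example_perfherder_data = {
    'framework': {'name': 'job_resource_usage'},
    'suites': [
        {
            'name': 'build.overall',       # '%s.overall' % perfherder_name
            'extraOptions': ['opt'],       # perfherder_options + resource options
            'subtests': [
                {'name': 'cpu_percent', 'value': 83.1},
                {'name': 'io_write_bytes', 'value': 1048576},
                {'name': 'io.read_bytes', 'value': 524288},
                {'name': 'io_write_time', 'value': 120},
                {'name': 'io_read_time', 'value': 35},
            ],
        },
        {
            'name': 'build.clobber',       # '%s.%s' % (perfherder_name, phase)
            'subtests': [
                {'name': 'time', 'value': 12.4},
                {'name': 'cpu_percent', 'value': 21.7},
            ],
        },
    ],
}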
class BuildMonitor(MozbuildObject):
    """Monitors the output of the build."""

    def init(self, warnings_path):
        """Create a new monitor.

        warnings_path is a path of a warnings database to use.
        """
        self._warnings_path = warnings_path
        self.resources = SystemResourceMonitor(poll_interval=1.0)
        self._resources_started = False

        self.tiers = TierStatus(self.resources)

        self.warnings_database = WarningsDatabase()
        if os.path.exists(warnings_path):
            try:
                self.warnings_database.load_from_file(warnings_path)
            except ValueError:
                os.remove(warnings_path)

        self._warnings_collector = WarningsCollector(
            database=self.warnings_database, objdir=self.topobjdir)

        self.build_objects = []

    def start(self):
        """Record the start of the build."""
        self.start_time = time.time()
        self._finder_start_cpu = self._get_finder_cpu_usage()

    def start_resource_recording(self):
        # This should be merged into start() once bug 892342 lands.
        self.resources.start()
        self._resources_started = True

    def on_line(self, line):
        """Consume a line of output from the build system.

        This will parse the line for state and determine whether more action
        is needed.

        Returns a BuildOutputResult instance.

        In this named tuple, warning will be an object describing a new parsed
        warning. Otherwise it will be None.

        state_changed indicates whether the build system changed state with
        this line. If the build system changed state, the caller may want to
        query this instance for the current state in order to update UI, etc.

        for_display is a boolean indicating whether the line is relevant to
        the user. This is typically used to filter whether the line should be
        presented to the user.
        """
        if line.startswith('BUILDSTATUS'):
            args = line.split()[1:]

            action = args.pop(0)
            update_needed = True

            if action == 'TIERS':
                self.tiers.set_tiers(args)
                update_needed = False
            elif action == 'TIER_START':
                tier = args[0]
                self.tiers.begin_tier(tier)
            elif action == 'TIER_FINISH':
                tier, = args
                self.tiers.finish_tier(tier)
            elif action == 'OBJECT_FILE':
                self.build_objects.append(args[0])
                update_needed = False
            else:
                raise Exception('Unknown build status: %s' % action)

            return BuildOutputResult(None, update_needed, False)

        warning = None

        try:
            warning = self._warnings_collector.process_line(line)
        except:
            pass

        return BuildOutputResult(warning, False, True)

    def stop_resource_recording(self):
        if self._resources_started:
            self.resources.stop()

        self._resources_started = False

    def finish(self, record_usage=True):
        """Record the end of the build."""
        self.stop_resource_recording()
        self.end_time = time.time()
        self._finder_end_cpu = self._get_finder_cpu_usage()
        self.elapsed = self.end_time - self.start_time

        self.warnings_database.prune()
        self.warnings_database.save_to_file(self._warnings_path)

        if not record_usage:
            return

        try:
            usage = self.get_resource_usage()
            if not usage:
                return

            self.log_resource_usage(usage)
            with open(self._get_state_filename('build_resources.json'), 'w') as fh:
                json.dump(self.resources.as_dict(), fh, indent=2)
        except Exception as e:
            self.log(logging.WARNING, 'build_resources_error',
                     {'msg': str(e)},
                     'Exception when writing resource usage file: {msg}')

    def _get_finder_cpu_usage(self):
        """Obtain the CPU usage of the Finder app on OS X.

        This is used to detect high CPU usage.
        """
        if not sys.platform.startswith('darwin'):
            return None

        if not psutil:
            return None

        for proc in psutil.process_iter():
            if proc.name != 'Finder':
                continue

            if proc.username != getpass.getuser():
                continue

            # Try to isolate system finder as opposed to other "Finder"
            # processes.
            if not proc.exe.endswith(
                    'CoreServices/Finder.app/Contents/MacOS/Finder'):
                continue

            return proc.get_cpu_times()

        return None

    def have_high_finder_usage(self):
        """Determine whether there was high Finder CPU usage during the build.

        Returns True if there was high Finder CPU usage, False if there
        wasn't, or None if there is nothing to report.
        """
        if not self._finder_start_cpu:
            return None, None

        # We only measure if the measured range is sufficiently long.
        if self.elapsed < 15:
            return None, None

        if not self._finder_end_cpu:
            return None, None

        start = self._finder_start_cpu
        end = self._finder_end_cpu

        start_total = start.user + start.system
        end_total = end.user + end.system

        cpu_seconds = end_total - start_total

        # If Finder used more than 25% of 1 core during the build, report an
        # error.
        finder_percent = cpu_seconds / self.elapsed * 100

        return finder_percent > 25, finder_percent

    def have_excessive_swapping(self):
        """Determine whether there was excessive swapping during the build.

        Returns a tuple of (excessive, swap_in, swap_out). All values are None
        if no swap information is available.
        """
        if not self.have_resource_usage:
            return None, None, None

        swap_in = sum(m.swap.sin for m in self.resources.measurements)
        swap_out = sum(m.swap.sout for m in self.resources.measurements)

        # The threshold of 1024 MB has been arbitrarily chosen.
        #
        # Choosing a proper value that is ideal for everyone is hard. We will
        # likely iterate on the logic until people are generally satisfied.
        # If a value is too low, the eventual warning produced does not carry
        # much meaning. If the threshold is too high, people may not see the
        # warning and the warning will thus be ineffective.
        excessive = swap_in > 512 * 1048576 or swap_out > 512 * 1048576

        return excessive, swap_in, swap_out

    @property
    def have_resource_usage(self):
        """Whether resource usage is available."""
        return self.resources.start_time is not None

    def get_resource_usage(self):
        """Produce a data structure containing the low-level resource usage
        information.

        This data structure can e.g. be serialized into JSON and saved for
        subsequent analysis.

        If no resource usage is available, None is returned.
        """
        if not self.have_resource_usage:
            return None

        cpu_percent = self.resources.aggregate_cpu_percent(phase=None,
                                                           per_cpu=False)
        cpu_times = self.resources.aggregate_cpu_times(phase=None,
                                                       per_cpu=False)
        io = self.resources.aggregate_io(phase=None)

        o = dict(version=3,
                 argv=sys.argv,
                 start=self.start_time,
                 end=self.end_time,
                 duration=self.end_time - self.start_time,
                 resources=[],
                 cpu_percent=cpu_percent,
                 cpu_times=cpu_times,
                 io=io,
                 objects=self.build_objects)

        o['tiers'] = self.tiers.tiered_resource_usage()

        self.tiers.add_resource_fields_to_dict(o)

        for usage in self.resources.range_usage():
            cpu_percent = self.resources.aggregate_cpu_percent(usage.start,
                                                               usage.end,
                                                               per_cpu=False)
            cpu_times = self.resources.aggregate_cpu_times(usage.start,
                                                           usage.end,
                                                           per_cpu=False)

            entry = dict(
                start=usage.start,
                end=usage.end,
                virt=list(usage.virt),
                swap=list(usage.swap),
            )

            self.tiers.add_resources_to_dict(entry, start=usage.start,
                                             end=usage.end)

            o['resources'].append(entry)

        # If the imports for this file ran before the in-tree virtualenv
        # was bootstrapped (for instance, for a clobber build in automation),
        # psutil might not be available.
        #
        # Treat psutil as optional to avoid an outright failure to log resources
        # TODO: it would be nice to collect data on the storage device as well
        # in this case.
        o['system'] = {}
        if psutil:
            o['system'].update(
                dict(
                    logical_cpu_count=psutil.cpu_count(),
                    physical_cpu_count=psutil.cpu_count(logical=False),
                    swap_total=psutil.swap_memory()[0],
                    vmem_total=psutil.virtual_memory()[0],
                ))

        return o

    def log_resource_usage(self, usage):
        """Summarize the resource usage of this build in a log message."""
        if not usage:
            return

        params = dict(
            duration=self.end_time - self.start_time,
            cpu_percent=usage['cpu_percent'],
            io_read_bytes=usage['io'].read_bytes,
            io_write_bytes=usage['io'].write_bytes,
            io_read_time=usage['io'].read_time,
            io_write_time=usage['io'].write_time,
        )

        message = 'Overall system resources - Wall time: {duration:.0f}s; ' \
            'CPU: {cpu_percent:.0f}%; ' \
            'Read bytes: {io_read_bytes}; Write bytes: {io_write_bytes}; ' \
            'Read time: {io_read_time}; Write time: {io_write_time}'

        self.log(logging.WARNING, 'resource_usage', params, message)

        excessive, sin, sout = self.have_excessive_swapping()
        if excessive is not None and (sin or sout):
            sin /= 1048576
            sout /= 1048576
            self.log(logging.WARNING, 'swap_activity',
                     {'sin': sin, 'sout': sout},
                     'Swap in/out (MB): {sin}/{sout}')

    def ccache_stats(self):
        ccache_stats = None
        try:
            ccache = which.which('ccache')
            output = subprocess.check_output([ccache, '-s'])
            ccache_stats = CCacheStats(output)
        except which.WhichError:
            pass
        except ValueError as e:
            self.log(logging.WARNING, 'ccache', {'msg': str(e)}, '{msg}')

        return ccache_stats
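# The o['system'] block above depends on only four psutil calls. A standalone
# sketch of collecting the same facts outside the build monitor (requires
# psutil >= 2.0 for cpu_count; the printed values are machine-dependent):
import psutil

system_info = dict(
    logical_cpu_count=psutil.cpu_count(),
    physical_cpu_count=psutil.cpu_count(logical=False),
    swap_total=psutil.swap_memory().total,      # same value as swap_memory()[0]
    vmem_total=psutil.virtual_memory().total,   # same value as virtual_memory()[0]
)
print(system_info)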
class ResourceMonitoringMixin(object):
    """Provides resource monitoring capabilities to scripts.

    When this class is in the inheritance chain, resource usage stats of the
    executing script will be recorded.

    This class requires the VirtualenvMixin in order to install a package used
    for recording resource usage.

    While we would like to record resource usage for the entirety of a script,
    since we require an external package, we can only record resource usage
    after that package is installed (as part of creating the virtualenv).
    That's just the way things have to be.
    """
    def __init__(self, *args, **kwargs):
        super(ResourceMonitoringMixin, self).__init__(*args, **kwargs)

        self.register_virtualenv_module('psutil>=0.7.1', method='pip',
                                        optional=True)
        self.register_virtualenv_module('mozsystemmonitor==0.0.0',
                                        method='pip', optional=True)
        self._resource_monitor = None

    @PostScriptAction('create-virtualenv')
    def _start_resource_monitoring(self, action, success=None):
        self.activate_virtualenv()

        # Resource Monitor requires Python 2.7, however it's currently optional.
        # Remove when all machines have had their Python version updated (bug 711299).
        if sys.version_info[:2] < (2, 7):
            self.warning('Resource monitoring will not be enabled! Python 2.7+ required.')
            return

        try:
            from mozsystemmonitor.resourcemonitor import SystemResourceMonitor

            self.info("Starting resource monitoring.")
            self._resource_monitor = SystemResourceMonitor(poll_interval=1.0)
            self._resource_monitor.start()
        except Exception:
            self.warning("Unable to start resource monitor: %s" %
                         traceback.format_exc())

    @PreScriptAction
    def _resource_record_pre_action(self, action):
        # Resource monitor isn't available until after create-virtualenv.
        if not self._resource_monitor:
            return

        self._resource_monitor.begin_phase(action)

    @PostScriptAction
    def _resource_record_post_action(self, action, success=None):
        # Resource monitor isn't available until after create-virtualenv.
        if not self._resource_monitor:
            return

        self._resource_monitor.finish_phase(action)

    @PostScriptRun
    def _resource_record_post_run(self):
        if not self._resource_monitor:
            return

        # This should never raise an exception. This is a workaround until
        # mozsystemmonitor is fixed. See bug 895388.
        try:
            self._resource_monitor.stop()
            self._log_resource_usage()
        except Exception:
            self.warning("Exception when reporting resource usage: %s" %
                         traceback.format_exc())

    def _log_resource_usage(self):
        rm = self._resource_monitor

        if rm.start_time is None:
            return

        def resources(phase):
            cpu_percent = rm.aggregate_cpu_percent(phase=phase, per_cpu=False)
            cpu_times = rm.aggregate_cpu_times(phase=phase, per_cpu=False)
            io = rm.aggregate_io(phase=phase)

            return cpu_percent, cpu_times, io

        def log_usage(prefix, duration, cpu_percent, cpu_times, io):
            message = '{prefix} - Wall time: {duration:.0f}s; ' \
                'CPU: {cpu_percent}; ' \
                'Read bytes: {io_read_bytes}; Write bytes: {io_write_bytes}; ' \
                'Read time: {io_read_time}; Write time: {io_write_time}'

            # XXX Some test harnesses are complaining about a string being
            # fed into a 'f' formatter. This will help diagnose the issue.
            cpu_percent_str = str(round(cpu_percent)) + '%' if cpu_percent else "Can't collect data"

            try:
                self.info(
                    message.format(
                        prefix=prefix, duration=duration,
                        cpu_percent=cpu_percent_str,
                        io_read_bytes=io.read_bytes,
                        io_write_bytes=io.write_bytes,
                        io_read_time=io.read_time,
                        io_write_time=io.write_time
                    )
                )
            except ValueError:
                self.warning("Exception when formatting: %s" %
                             traceback.format_exc())

        cpu_percent, cpu_times, io = resources(None)
        duration = rm.end_time - rm.start_time

        log_usage('Total resource usage', duration, cpu_percent, cpu_times, io)

        for phase in rm.phases.keys():
            start_time, end_time = rm.phases[phase]
            cpu_percent, cpu_times, io = resources(phase)
            log_usage(phase, end_time - start_time, cpu_percent, cpu_times, io)
class BuildMonitor(MozbuildObject):
    """Monitors the output of the build."""

    def init(self, warnings_path):
        """Create a new monitor.

        warnings_path is a path of a warnings database to use.
        """
        self._warnings_path = warnings_path
        self.resources = SystemResourceMonitor(poll_interval=1.0)
        self._resources_started = False

        self.tiers = TierStatus(self.resources)

        self.warnings_database = WarningsDatabase()
        if os.path.exists(warnings_path):
            try:
                self.warnings_database.load_from_file(warnings_path)
            except ValueError:
                os.remove(warnings_path)

        self._warnings_collector = WarningsCollector(
            database=self.warnings_database, objdir=self.topobjdir)

    def start(self):
        """Record the start of the build."""
        self.start_time = time.time()
        self._finder_start_cpu = self._get_finder_cpu_usage()

    def start_resource_recording(self):
        # This should be merged into start() once bug 892342 lands.
        self.resources.start()
        self._resources_started = True

    def on_line(self, line):
        """Consume a line of output from the build system.

        This will parse the line for state and determine whether more action
        is needed.

        Returns a BuildOutputResult instance.

        In this named tuple, warning will be an object describing a new parsed
        warning. Otherwise it will be None.

        state_changed indicates whether the build system changed state with
        this line. If the build system changed state, the caller may want to
        query this instance for the current state in order to update UI, etc.

        for_display is a boolean indicating whether the line is relevant to
        the user. This is typically used to filter whether the line should be
        presented to the user.
        """
        if line.startswith('BUILDSTATUS'):
            args = line.split()[1:]

            action = args.pop(0)
            update_needed = True

            if action == 'TIERS':
                self.tiers.set_tiers(args)
                update_needed = False
            elif action == 'TIER_START':
                tier = args[0]
                subtiers = args[1:]
                self.tiers.begin_tier(tier, subtiers)
            elif action == 'TIER_FINISH':
                tier, = args
                self.tiers.finish_tier(tier)
            elif action == 'SUBTIER_START':
                tier, subtier = args[0:2]
                dirs = args[2:]
                self.tiers.begin_subtier(tier, subtier, dirs)
            elif action == 'SUBTIER_FINISH':
                tier, subtier = args
                self.tiers.finish_subtier(tier, subtier)
            elif action == 'TIERDIR_START':
                tier, subtier, d = args
                self.tiers.begin_dir(tier, subtier, d)
            elif action == 'TIERDIR_FINISH':
                tier, subtier, d = args
                self.tiers.finish_dir(tier, subtier, d)
            else:
                raise Exception('Unknown build status: %s' % action)

            return BuildOutputResult(None, update_needed, False)

        warning = None

        try:
            warning = self._warnings_collector.process_line(line)
        except:
            pass

        return BuildOutputResult(warning, False, True)

    def finish(self, record_usage=True):
        """Record the end of the build."""
        self.end_time = time.time()

        if self._resources_started:
            self.resources.stop()

        self._finder_end_cpu = self._get_finder_cpu_usage()
        self.elapsed = self.end_time - self.start_time

        self.warnings_database.prune()
        self.warnings_database.save_to_file(self._warnings_path)

        if not record_usage:
            return

        usage = self.record_resource_usage()
        if not usage:
            return

        with open(self._get_state_filename('build_resources.json'), 'w') as fh:
            json.dump(usage, fh, indent=2)

    def _get_finder_cpu_usage(self):
        """Obtain the CPU usage of the Finder app on OS X.

        This is used to detect high CPU usage.
        """
        if not sys.platform.startswith('darwin'):
            return None

        if not psutil:
            return None

        for proc in psutil.process_iter():
            if proc.name != 'Finder':
                continue

            if proc.username != getpass.getuser():
                continue

            # Try to isolate system finder as opposed to other "Finder"
            # processes.
            if not proc.exe.endswith(
                    'CoreServices/Finder.app/Contents/MacOS/Finder'):
                continue

            return proc.get_cpu_times()

        return None

    def have_high_finder_usage(self):
        """Determine whether there was high Finder CPU usage during the build.

        Returns True if there was high Finder CPU usage, False if there
        wasn't, or None if there is nothing to report.
        """
        if not self._finder_start_cpu:
            return None, None

        # We only measure if the measured range is sufficiently long.
        if self.elapsed < 15:
            return None, None

        if not self._finder_end_cpu:
            return None, None

        start = self._finder_start_cpu
        end = self._finder_end_cpu

        start_total = start.user + start.system
        end_total = end.user + end.system

        cpu_seconds = end_total - start_total

        # If Finder used more than 25% of 1 core during the build, report an
        # error.
        finder_percent = cpu_seconds / self.elapsed * 100

        return finder_percent > 25, finder_percent

    @property
    def have_resource_usage(self):
        """Whether resource usage is available."""
        return self.resources.start_time is not None

    def record_resource_usage(self):
        """Record the resource usage of this build.

        We write a log message containing a high-level summary. We also
        produce a data structure containing the low-level resource usage
        information. This data structure can e.g. be serialized into JSON and
        saved for subsequent analysis.

        If no resource usage is available, None is returned.
        """
        if not self.have_resource_usage:
            return None

        cpu_percent = self.resources.aggregate_cpu_percent(phase=None,
            per_cpu=False)
        cpu_times = self.resources.aggregate_cpu_times(phase=None,
            per_cpu=False)
        io = self.resources.aggregate_io(phase=None)

        self._log_resource_usage('Overall system resources', 'resource_usage',
            self.end_time - self.start_time, cpu_percent, cpu_times, io)

        o = dict(
            version=1,
            start=self.start_time,
            end=self.end_time,
            duration=self.end_time - self.start_time,
            resources=[],
            cpu_percent=cpu_percent,
            cpu_times=cpu_times,
            io=io,
        )

        o['tiers'] = self.tiers.tiered_resource_usage()

        self.tiers.add_resource_fields_to_dict(o)

        for usage in self.resources.range_usage():
            cpu_percent = self.resources.aggregate_cpu_percent(usage.start,
                usage.end, per_cpu=False)
            cpu_times = self.resources.aggregate_cpu_times(usage.start,
                usage.end, per_cpu=False)

            entry = dict(
                start=usage.start,
                end=usage.end,
                virt=list(usage.virt),
                swap=list(usage.swap),
            )

            self.tiers.add_resources_to_dict(entry, start=usage.start,
                end=usage.end)

            o['resources'].append(entry)

        return o

    def _log_resource_usage(self, prefix, m_type, duration, cpu_percent,
            cpu_times, io, extra_params={}):

        params = dict(
            duration=duration,
            cpu_percent=cpu_percent,
            io_reads=io.read_count,
            io_writes=io.write_count,
            io_read_bytes=io.read_bytes,
            io_write_bytes=io.write_bytes,
            io_read_time=io.read_time,
            io_write_time=io.write_time,
        )

        params.update(extra_params)

        message = prefix + ' - Wall time: {duration:.0f}s; ' \
            'CPU: {cpu_percent:.0f}%; ' \
            'Read bytes: {io_read_bytes}; Write bytes: {io_write_bytes}; ' \
            'Read time: {io_read_time}; Write time: {io_write_time}'

        self.log(logging.WARNING, m_type, params, message)
class ResourceMonitoringMixin(object):
    """Provides resource monitoring capabilities to scripts.

    When this class is in the inheritance chain, resource usage stats of the
    executing script will be recorded.

    This class requires the VirtualenvMixin in order to install a package used
    for recording resource usage.

    While we would like to record resource usage for the entirety of a script,
    since we require an external package, we can only record resource usage
    after that package is installed (as part of creating the virtualenv).
    That's just the way things have to be.
    """
    def __init__(self, *args, **kwargs):
        super(ResourceMonitoringMixin, self).__init__(*args, **kwargs)

        self.register_virtualenv_module('psutil>=3.1.1', method='pip',
                                        optional=True)
        self.register_virtualenv_module('mozsystemmonitor==0.3',
                                        method='pip', optional=True)
        self._resource_monitor = None

    @PostScriptAction('create-virtualenv')
    def _start_resource_monitoring(self, action, success=None):
        self.activate_virtualenv()

        # Resource monitoring requires Python 2.7; however, it's currently
        # optional. Remove this check when all machines have had their Python
        # version updated (bug 711299).
        if sys.version_info[:2] < (2, 7):
            self.warning('Resource monitoring will not be enabled! '
                         'Python 2.7+ required.')
            return

        try:
            from mozsystemmonitor.resourcemonitor import SystemResourceMonitor

            self.info("Starting resource monitoring.")
            self._resource_monitor = SystemResourceMonitor(poll_interval=1.0)
            self._resource_monitor.start()
        except Exception:
            self.warning("Unable to start resource monitor: %s" %
                         traceback.format_exc())

    @PreScriptAction
    def _resource_record_pre_action(self, action):
        # Resource monitor isn't available until after create-virtualenv.
        if not self._resource_monitor:
            return

        self._resource_monitor.begin_phase(action)

    @PostScriptAction
    def _resource_record_post_action(self, action, success=None):
        # Resource monitor isn't available until after create-virtualenv.
        if not self._resource_monitor:
            return

        self._resource_monitor.finish_phase(action)

    @PostScriptRun
    def _resource_record_post_run(self):
        if not self._resource_monitor:
            return

        # This should never raise an exception. This is a workaround until
        # mozsystemmonitor is fixed. See bug 895388.
        try:
            self._resource_monitor.stop()
            self._log_resource_usage()

            # Upload a JSON file containing the raw resource data.
            try:
                upload_dir = self.query_abs_dirs()['abs_blob_upload_dir']
                with open(os.path.join(upload_dir, 'resource-usage.json'),
                          'wb') as fh:
                    json.dump(self._resource_monitor.as_dict(), fh,
                              sort_keys=True, indent=4)
            except (AttributeError, KeyError):
                self.exception('could not upload resource usage JSON',
                               level=WARNING)
        except Exception:
            self.warning("Exception when reporting resource usage: %s" %
                         traceback.format_exc())

    def _log_resource_usage(self):
        rm = self._resource_monitor

        if rm.start_time is None:
            return

        def resources(phase):
            cpu_percent = rm.aggregate_cpu_percent(phase=phase, per_cpu=False)
            cpu_times = rm.aggregate_cpu_times(phase=phase, per_cpu=False)
            io = rm.aggregate_io(phase=phase)

            swap_in = sum(m.swap.sin for m in rm.measurements)
            swap_out = sum(m.swap.sout for m in rm.measurements)

            return cpu_percent, cpu_times, io, (swap_in, swap_out)

        def log_usage(prefix, duration, cpu_percent, cpu_times, io):
            message = '{prefix} - Wall time: {duration:.0f}s; ' \
                'CPU: {cpu_percent}; ' \
                'Read bytes: {io_read_bytes}; Write bytes: {io_write_bytes}; ' \
                'Read time: {io_read_time}; Write time: {io_write_time}'

            # XXX Some test harnesses are complaining about a string being fed
            # into an 'f' formatter. This will help diagnose the issue.
            cpu_percent_str = (str(round(cpu_percent)) + '%') if cpu_percent \
                else "Can't collect data"

            try:
                self.info(
                    message.format(
                        prefix=prefix,
                        duration=duration,
                        cpu_percent=cpu_percent_str,
                        io_read_bytes=io.read_bytes,
                        io_write_bytes=io.write_bytes,
                        io_read_time=io.read_time,
                        io_write_time=io.write_time,
                    )
                )
            except ValueError:
                self.warning("Exception when formatting: %s" %
                             traceback.format_exc())

        cpu_percent, cpu_times, io, (swap_in, swap_out) = resources(None)
        duration = rm.end_time - rm.start_time

        log_usage('Total resource usage', duration, cpu_percent, cpu_times, io)

        # Print special messages so usage shows up in Treeherder.
        if cpu_percent:
            self._tinderbox_print('CPU usage<br/>{:,.1f}%'.format(cpu_percent))

        self._tinderbox_print('I/O read bytes / time<br/>{:,} / {:,}'.format(
            io.read_bytes, io.read_time))
        self._tinderbox_print('I/O write bytes / time<br/>{:,} / {:,}'.format(
            io.write_bytes, io.write_time))

        # Print CPU components having >1%. "cpu_times" is a data structure
        # whose attributes are measurements. Ideally we'd have an API that
        # returned just the measurements as a dict or something.
        cpu_attrs = []
        for attr in sorted(dir(cpu_times)):
            if attr.startswith('_'):
                continue
            if attr in ('count', 'index'):
                continue
            cpu_attrs.append(attr)

        cpu_total = sum(getattr(cpu_times, attr) for attr in cpu_attrs)

        for attr in cpu_attrs:
            value = getattr(cpu_times, attr)
            percent = value / cpu_total * 100.0
            if percent > 1.00:
                self._tinderbox_print('CPU {}<br/>{:,.1f} ({:,.1f}%)'.format(
                    attr, value, percent))

        # Swap on Windows isn't reported by psutil.
        if not self._is_windows():
            self._tinderbox_print('Swap in / out<br/>{:,} / {:,}'.format(
                swap_in, swap_out))

        for phase in rm.phases.keys():
            start_time, end_time = rm.phases[phase]
            cpu_percent, cpu_times, io, swap = resources(phase)
            log_usage(phase, end_time - start_time, cpu_percent, cpu_times, io)

    def _tinderbox_print(self, message):
        self.info('TinderboxPrint: %s' % message)
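# Illustrative sketch only (not part of the mixin): the Pre/PostScriptAction
# hooks above amount to bracketing every script action in a monitor phase.
# The function name and the `actions` list of (name, callable) pairs are
# hypothetical; the monitor calls mirror what the hooks do.
def _example_run_actions_with_phases(monitor, actions):
    monitor.start()
    try:
        for name, func in actions:
            monitor.begin_phase(name)   # what _resource_record_pre_action does
            func()                      # the body of the mozharness action
            monitor.finish_phase(name)  # what _resource_record_post_action does
    finally:
        monitor.stop()                  # what _resource_record_post_run does

    # Per-phase numbers can then be pulled out the same way
    # _log_resource_usage() does for each recorded phase.
    for name, (start, end) in monitor.phases.items():
        print(name, end - start,
              monitor.aggregate_cpu_percent(phase=name, per_cpu=False))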