Example #1
def main(args):
    parser = commandline.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--interval',
        default=60,
        type=int,
        help='time (in seconds) between sampling system metrics')
    opts = parser.parse_args(args)
    opts.Freeze()

    # This returns a 0 value the first time it's called.  Call it now and
    # discard the return value.
    psutil.cpu_times_percent()

    # Wait a random amount of time before starting the loop in case sysmon
    # is started at exactly the same time on all machines.
    time.sleep(random.uniform(0, opts.interval))

    # This call returns a context manager that doesn't do anything, so we
    # ignore the return value.
    ts_mon_config.SetupTsMonGlobalState('sysmon')
    # The default prefix is '/chrome/infra/'.
    interface.state.metric_name_prefix = (interface.state.metric_name_prefix +
                                          'chromeos/sysmon/')

    mainloop = _MainLoop(opts.interval)
    mainloop.loop_forever()
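_MainLoop is not defined in this excerpt. A minimal sketch of what such a loop might look like, assuming it only needs to invoke a sampling step every interval seconds (the _collect body is an illustrative placeholder, not the real collector):

import time


class _MainLoop(object):
    """Hypothetical sketch of the sampling loop driven by main() above."""

    def __init__(self, interval):
        self._interval = interval

    def loop_forever(self):
        # Sample once per interval, forever; mirrors how later examples
        # drive loop.SleepLoop(callback=..., interval=...).
        while True:
            self._collect()
            time.sleep(self._interval)

    def _collect(self):
        # Placeholder: the real loop samples system metrics (CPU, disk,
        # network) and sends them via ts_mon.
        pass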
Example #2
def _SetupConnections(options, build_config):
    """Set up CIDB connections using the appropriate Setup call.

  Args:
    options: Command line options structure.
    build_config: Config object for this build.
  """
    # Outline:
    # 1) Based on options and build_config, decide whether we are a production
    # run, debug run, or standalone run.
    # 2) Set up cidb instance accordingly.
    # 3) Update topology info from cidb, so that any other service set up can use
    # topology.
    # 4) Set up any other services.
    run_type = _GetRunEnvironment(options, build_config)

    if run_type == _ENVIRONMENT_PROD:
        cidb.CIDBConnectionFactory.SetupProdCidb()
        context = ts_mon_config.SetupTsMonGlobalState(
            'cbuildbot', indirect=True, task_num=options.ts_mon_task_num)
    elif run_type == _ENVIRONMENT_DEBUG:
        cidb.CIDBConnectionFactory.SetupDebugCidb()
        context = ts_mon_config.TrivialContextManager()
    else:
        cidb.CIDBConnectionFactory.SetupNoCidb()
        context = ts_mon_config.TrivialContextManager()

    db = cidb.CIDBConnectionFactory.GetCIDBConnectionForBuilder()
    topology.FetchTopologyFromCIDB(db)

    return context
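_GetRunEnvironment and the _ENVIRONMENT_* constants are referenced above but not shown. A hypothetical sketch of the dispatcher, assuming the decision keys off a debug flag on options and a production marker on the build config (all attribute names here are assumptions, not the real cbuildbot API):

_ENVIRONMENT_PROD = 'prod'
_ENVIRONMENT_DEBUG = 'debug'
_ENVIRONMENT_STANDALONE = 'standalone'


def _GetRunEnvironment(options, build_config):
    # Hypothetical: classify this run as production, debug, or standalone.
    if options.debug:
        return _ENVIRONMENT_DEBUG
    if options.buildbot and not build_config.debug:
        return _ENVIRONMENT_PROD
    return _ENVIRONMENT_STANDALONE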
Example #3
def main(argv):
    """Standard main() for command line processing.

    @param argv Command line arguments (normally sys.argv).
    """

    parser = GetParser()
    options = parser.parse_args(argv[1:])

    with ts_mon_config.SetupTsMonGlobalState('dump_suite_report'):

        afe = frontend_wrappers.RetryingAFE(timeout_min=5,
                                            delay_sec=10,
                                            server=options.afe)
        tko = frontend_wrappers.RetryingTKO(timeout_min=5, delay_sec=10)

        # Look up and generate entries for all jobs.
        entries = []
        for suite_job_id in options.job_ids:
            logging.debug('Suite job %s:', suite_job_id)
            suite_entries = suite_report.generate_suite_report(suite_job_id,
                                                               afe=afe,
                                                               tko=tko)
            logging.debug('... generated %d entries', len(suite_entries))
            entries.extend(suite_entries)

        # Write all entries as JSON.
        if options.output:
            with open(options.output, 'w') as f:
                suite_report.dump_entries_as_json(entries, f)
        else:
            suite_report.dump_entries_as_json(entries, sys.stdout)
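suite_report.dump_entries_as_json is not shown in this excerpt. Assuming each entry is a JSON-serializable dict and the report is written one object per line, a minimal version might look like:

import json


def dump_entries_as_json(entries, output_file):
    # Hypothetical sketch: serialize one entry per line to the given file.
    for entry in entries:
        json.dump(entry, output_file)
        output_file.write('\n')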
Example #4
def main():
    """Main entry."""
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.DEBUG)

    with ts_mon_config.SetupTsMonGlobalState(service_name='kill_slow_queries',
                                             indirect=True):
        count = 0
        parser, options, args = parse_options()
        if not verify_options_and_args(options, args):
            parser.print_help()
            return 1
        try:
            while True:
                result_log_strs, count = kill_slow_queries(
                    options.user, options.password, options.timeout)
                if result_log_strs:
                    gmail_lib.send_email(
                        options.mail,
                        'Successfully killed slow autotest db queries',
                        'Below are killed queries:\n%s' % result_log_strs)
                    m = 'chromeos/autotest/afe_db/killed_slow_queries'
                    metrics.Counter(m).increment_by(count)
                time.sleep(options.timeout)
        except Exception as e:
            logging.error('Failed to kill slow db queries.\n%s', e)
            gmail_lib.send_email(
                options.mail, 'Failed to kill slow autotest db queries.',
                ('Error occurred during killing slow db queries:\n%s\n'
                 'Detailed logs can be found in /var/log/slow_queries.log on db'
                 ' backup server.\nTo avoid db crash, please check ASAP.') % e)
            raise
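parse_options and verify_options_and_args are not included in the excerpt. Given the (parser, options, args) return shape, the parser is likely optparse-based; a hypothetical sketch covering only the options used above:

import optparse


def parse_options():
    # Hypothetical sketch; the real parser may define more options.
    parser = optparse.OptionParser()
    parser.add_option('--user', help='MySQL user name')
    parser.add_option('--password', help='MySQL password')
    parser.add_option('--timeout', type='int', default=60,
                      help='Seconds a query may run before it is killed; '
                           'also used as the polling interval')
    parser.add_option('--mail', help='Address to email kill reports to')
    options, args = parser.parse_args()
    return parser, options, args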
Example #5
def main():
    parser = commandline.ArgumentParser(description=__doc__,
                                        default_log_level='DEBUG')
    parser.add_argument(
        '--interval',
        default=60,
        type=int,
        help='time (in seconds) between sampling system metrics')
    parser.add_argument(
        '--collect-prod-hosts',
        action='store_true',
        help='[DEPRECATED. Use --collect-host-manifest instead.] '
        'Enable collection of prod host metrics, like roles')
    parser.add_argument(
        '--collect-host-manifest',
        default=None,
        choices=['prod', 'staging'],
        help='Enable collection of server metrics (e.g. roles) for servers in '
        'the given lab environment.')
    opts = parser.parse_args()
    opts.Freeze()

    # This call returns a context manager that doesn't do anything, so we
    # ignore the return value.
    ts_mon_config.SetupTsMonGlobalState('sysmon', auto_flush=False)
    # The default prefix is '/chrome/infra/'.
    interface.state.metric_name_prefix = (interface.state.metric_name_prefix +
                                          'chromeos/sysmon/')

    # Transitional, while we migrate users off of |collect_prod_hosts|
    if opts.collect_host_manifest is not None:
        opts.collect_prod_hosts = True
    collector = _MetricCollector(collect_prod_hosts=opts.collect_prod_hosts)
    loop.SleepLoop(callback=collector, interval=opts.interval).loop_forever()
Example #6
def main():
    """Main entry."""
    # Clear all loggers to make sure the following basicConfig take effect.
    logging.shutdown()
    reload(logging)
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.DEBUG)

    with ts_mon_config.SetupTsMonGlobalState(service_name='kill_slow_queries',
                                             indirect=True):
        count = 0
        parser, options, args = parse_options()
        if not verify_options_and_args(options, args):
            parser.print_help()
            return 1
        try:
            while True:
                result_log_strs, count = kill_slow_queries(
                    options.user, options.password, options.timeout)
                if result_log_strs:
                    gmail_lib.send_email(
                        options.mail,
                        'Successfully killed slow autotest db queries',
                        'Below are killed queries:\n%s' % result_log_strs)
                    m = 'chromeos/autotest/afe_db/killed_slow_queries'
                    metrics.Counter(m).increment_by(count)
                time.sleep(options.timeout)
        except Exception as e:
            m = 'chromeos/autotest/afe_db/failed_to_kill_query'
            metrics.Counter(m).increment()
            logging.error('Failed to kill slow db queries.\n%s', e)
            raise
Example #7
def main(args):
    """Main func.

    @args: A list of system arguments.
    """
    args = _parse_args(args)
    swarming_bots.setup_logging(args.verbose, args.log_file)

    if not args.swarming_proxy:
        logging.error('No swarming proxy instance specified. '
                      'Specify swarming_proxy in [CROS] in shadow_config, '
                      'or use --swarming_proxy')
        return 1

    if not args.swarming_proxy.startswith('https://'):
        swarming_proxy = 'https://' + args.swarming_proxy
    else:
        swarming_proxy = args.swarming_proxy

    global _shut_down
    logging.info("Setting signal handler.")
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    bot_manager = swarming_bots.BotManager(
        swarming_bots.parse_range(args.id_range),
        args.working_dir,
        swarming_proxy,
        specify_bot_id=args.specify_bot_id)
    is_prod = False
    retryable = True
    with ts_mon_config.SetupTsMonGlobalState('swarming_bots', indirect=True):
        while not _shut_down:
            tick(args.afe, bot_manager)
            time.sleep(CHECK_INTERVAL)
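handle_signal and the module-level _shut_down flag are referenced above but not defined in the excerpt. The usual pattern, sketched under the assumption that the handler only flips the flag so the while-loop can exit between ticks:

import signal

_shut_down = False


def handle_signal(signum, frame):
    """Request a clean shutdown of the main loop (hypothetical sketch)."""
    global _shut_down
    _shut_down = True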
Example #8
def main(argv):
    parser = commandline.ArgumentParser(description=__doc__)
    parser.add_argument('swarming_server',
                        action='store',
                        help='Swarming server to send no-op requests to.')
    options = parser.parse_args(argv)

    m_timer = 'chromeos/autotest/swarming_proxy/no_op_durations'
    m_count = 'chromeos/autotest/swarming_proxy/no_op_attempts'
    command = commands.RUN_SUITE_PATH
    fields = {'success': False, 'swarming_server': options.swarming_server}
    with ts_mon_config.SetupTsMonGlobalState('swarm_mon', indirect=True):
        while True:
            with metrics.SecondsTimer(m_timer, fields=fields) as f:
                try:
                    with metrics.SuccessCounter(m_count):
                        swarming_lib.RunSwarmingCommand(
                            [command, '--do_nothing'],
                            options.swarming_server,
                            dimensions=[('pool', 'default')],
                            timeout_secs=120)
                    f['success'] = True
                except (cros_build_lib.RunCommandError,
                        timeout_util.TimeoutError):
                    pass
            time.sleep(60)
Example #9
def main(argv):
    options = _ParseArguments(argv)
    manager = tasks.ProcessPoolTaskManager(options.max_tasks,
                                           _GetTaskHandler(options),
                                           options.interval)
    queue = service.WorkQueueServer(options.spool)
    try:
        # The ordering for logging setup here matters (alas):
        # The `ts_mon` setup starts a subprocess that makes logging
        # calls, and TimedRotatingFileHandler isn't multiprocess safe.
        # So, we need for the `ts_mon` child and this process to write
        # to different logs.  The gory details are in crbug.com/774597.
        #
        # This is a hack, really.  If you're studying this comment
        # because you have to clean up my mess, I'm truly and profoundly
        # sorry.  But still I wouldn't change a thing...
        #     https://www.youtube.com/watch?v=fFtGfyruroU

        with ts_mon_config.SetupTsMonGlobalState('provision_workqueue',
                                                 indirect=True):
            _SetupLogging(options)
            logging.info('Work queue service starts')
            logging.info('  Spool dir is %s', options.spool)
            logging.info('  Maximum of %d concurrent tasks', options.max_tasks)
            logging.info('  Time per tick is %.3f seconds', options.interval)
            queue.ProcessRequests(manager)
    except KeyboardInterrupt:
        pass
    finally:
        manager.Close()
Example #10
def main(argv):
    """Entry point."""
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(name)s - " +
                        "%(levelname)s - %(message)s")
    parser, options, args = parse_options()
    if not verify_options_and_args(options, args):
        parser.print_help()
        sys.exit(1)

    with ts_mon_config.SetupTsMonGlobalState(service_name='sync_server_db',
                                             indirect=True):
        try:
            metrics.Counter(_METRICS_PREFIX + '/start').increment()
            logging.info("Setting signal handler")
            signal.signal(signal.SIGINT, handle_signal)
            signal.signal(signal.SIGTERM, handle_signal)

            while not _shutdown:
                _main(options)
                metrics.Counter(_METRICS_PREFIX +
                                '/tick').increment(fields={'success': True})
                time.sleep(options.sleep)
        except:
            metrics.Counter(_METRICS_PREFIX +
                            '/tick').increment(fields={'success': False})
            raise
Example #11
def main():
    """Runs the program."""
    options = parse_options()
    logging_manager.configure_logging(
        test_importer.TestImporterLoggingConfig(), verbose=options.verbose)
    backup_succeeded = False

    with ts_mon_config.SetupTsMonGlobalState(service_name='mysql_db_backup',
                                             indirect=True):
        with metrics.SecondsTimer('chromeos/autotest/afe_db/backup/durations',
                                  fields={'type': options.type}):
            try:
                logging.debug('Start db backup: %s', options.type)
                archiver = MySqlArchiver(options.type, options.keep,
                                         options.gs_bucket)
                dump_file = archiver.dump()
                logging.debug('Uploading backup: %s', options.type)
                archiver.upload_to_google_storage(dump_file)
                archiver.cleanup()
                logging.debug('Db backup completed: %s', options.type)
                backup_succeeded = True
            finally:
                metrics.Counter('chromeos/autotest/db/db_backup/completed'
                                ).increment(fields={
                                    'success': backup_succeeded,
                                    'type': options.type
                                })
Example #12
def SetupTsMonGlobalState(*args, **kwargs):
    """Import-safe wrap around chromite.lib.ts_mon_config's setup function.

    @param *args: Args to pass through.
    @param **kwargs: Kwargs to pass through.
    """
    try:
        # TODO(crbug.com/739466) This module import is delayed because it adds
        # 1-2 seconds to the module import time and most users of site_utils
        # don't need it. The correct fix is to break apart site_utils into more
        # meaningful chunks.
        from chromite.lib import ts_mon_config
    except ImportError:
        logging.warning('Unable to import chromite. Monarch is disabled.')
        return TrivialContextManager()

    try:
        context = ts_mon_config.SetupTsMonGlobalState(*args, **kwargs)
        if hasattr(context, '__exit__'):
            return context
    except Exception as e:
        logging.warning(
            'Caught an exception trying to setup ts_mon, '
            'monitoring is disabled: %s',
            e,
            exc_info=True)
    return TrivialContextManager()
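TrivialContextManager appears throughout these examples as a do-nothing stand-in for the real ts_mon context. A minimal sketch, assuming it only has to satisfy the with-statement protocol:

import contextlib


@contextlib.contextmanager
def TrivialContextManager(*args, **kwargs):
    # No-op context manager used when ts_mon is unavailable or disabled.
    yield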
Example #13
def testTaskNumWithIndirect(self):
    """The task_num argument should propagate to the flushing subprocess."""
    create_flushing_process = self.PatchObject(
        ts_mon_config, '_CreateTsMonFlushingProcess')
    ts_mon_config.SetupTsMonGlobalState('unittest', indirect=True,
                                        task_num=42)
    options = ts_mon_config._GenerateTsMonArgparseOptions(
        'unittest', False, False, None, 42)
    create_flushing_process.assert_called_once_with(options)
Example #14
def main():
    """Sets up ts_mon and repeatedly queries MySQL stats"""
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    conn = MySQLConnection('localhost', DEFAULT_USER, DEFAULT_PASSWD)
    conn.Connect()

    with ts_mon_config.SetupTsMonGlobalState('mysql_stats', indirect=True):
        QueryLoop(conn)
Example #15
def testShortLived(self):
    """Tests that configuring ts-mon to use short-lived processes works."""
    self.patchTime()
    with tempfile.NamedTemporaryFile(dir='/var/tmp') as out:
        with ts_mon_config.SetupTsMonGlobalState('metrics_unittest',
                                                 short_lived=True,
                                                 debug_file=out.name):
            # pylint: disable=protected-access
            self.assertTrue(ts_mon_config._WasSetup)
Example #16
def main(argv):
    parser = GetParser()
    options = parser.parse_args(argv)

    creds_file = options.service_acct_json
    project_id = options.project_id
    client = _Client(creds_path=creds_file)

    with ts_mon_config.SetupTsMonGlobalState('export_to_cloud_trace'):
        _WatchAndSendSpans(project_id, client)
Example #17
def main():
    """Sets up ts_mon and repeatedly queries MySQL stats"""
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    conn = RetryingConnection('localhost', DEFAULT_USER, DEFAULT_PASSWD)
    conn.Connect()

    # TODO(crbug.com/803566) Use indirect=False to mitigate orphan mysql_stats
    # processes overwhelming shards.
    with ts_mon_config.SetupTsMonGlobalState('mysql_stats', indirect=False):
        QueryLoop(conn)
Example #18
def main():
    """Main method of gs_offloader."""
    options = parse_options()

    if options.process_all:
        offloader_type = 'all'
    elif options.process_hosts_only:
        offloader_type = 'hosts'
    else:
        offloader_type = 'jobs'

    log_timestamp = time.strftime(LOG_TIMESTAMP_FORMAT)
    if options.log_size > 0:
        log_timestamp = ''
    log_basename = LOG_FILENAME_FORMAT % (offloader_type, log_timestamp)
    log_filename = os.path.join(LOG_LOCATION, log_basename)
    log_formatter = logging.Formatter(LOGGING_FORMAT)
    # Replace the default logging handler with a RotatingFileHandler. If
    # options.log_size is 0, the file size will not be limited. Keeps
    # one backup just in case.
    handler = logging.handlers.RotatingFileHandler(
            log_filename, maxBytes=1024 * options.log_size, backupCount=1)
    handler.setFormatter(log_formatter)
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logger.addHandler(handler)

    # Nice our process (carried to subprocesses) so we don't overload
    # the system.
    if not options.normal_priority:
        logging.debug('Set process to nice value: %d', NICENESS)
        os.nice(NICENESS)
    if psutil:
        proc = psutil.Process()
        logging.debug('Set process to ionice IDLE')
        proc.ionice(psutil.IOPRIO_CLASS_IDLE)

    # os.listdir returns relative paths, so change to where we need to
    # be to avoid an os.path.join on each loop.
    logging.debug('Offloading Autotest results in %s', RESULTS_DIR)
    os.chdir(RESULTS_DIR)

    signal.signal(signal.SIGALRM, timeout_handler)

    with ts_mon_config.SetupTsMonGlobalState('gs_offloader', indirect=True,
                                             short_lived=False):
        offloader = Offloader(options)
        if not options.delete_only:
            wait_for_gs_write_access(offloader.gs_uri)
        while True:
            offloader.offload_once()
            if options.offload_once:
                break
            time.sleep(SLEEP_TIME_SECS)
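timeout_handler is registered for SIGALRM above but not defined in the excerpt. A hypothetical sketch, assuming the handler simply raises so that a stuck offload can be abandoned:

class TimeoutException(Exception):
    """Hypothetical: raised when an operation outlives its alarm."""


def timeout_handler(signum, frame):
    # Raising here unwinds whatever call was in progress when the alarm
    # fired; callers catch TimeoutException and move on.
    raise TimeoutException('Operation timed out (signal %d)' % signum)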
Example #19
def main():
    """Entry point."""
    arguments = parse_arguments()
    with ts_mon_config.SetupTsMonGlobalState(service_name='test_push',
                                             indirect=True):
        test_push_success = False
        try:
            _main(arguments)
            test_push_success = True
        finally:
            metrics.Counter('chromeos/autotest/test_push/completed').increment(
                fields={'success': test_push_success})
Example #20
def main():
    ts_mon_config.SetupTsMonGlobalState('shard_client')

    try:
        metrics.Counter('chromeos/autotest/shard_client/start').increment()
        main_without_exception_handling()
    except Exception as e:
        message = 'Uncaught exception. Terminating shard_client.'
        email_manager.manager.log_stacktrace(message)
        logging.exception(message)
        raise
    finally:
        email_manager.manager.send_queued_emails()
Example #21
def main(argv):
    options = PreParseArguments(argv)
    metric_fields = {
        'branch_name': options.branch or 'master',
        'build_config': options.build_config_name,
        'tryjob': options.remote_trybot,
    }

    # Enable Monarch metrics gathering.
    with ts_mon_config.SetupTsMonGlobalState(
            'cbuildbot_launch', common_metric_fields=metric_fields,
            indirect=True):
        return _main(options, argv)
Example #22
def Main():
    """Sets up logging and runs matchers against stdin."""
    args = ParseArgs()
    log_daemon_common.SetupLogging(args)

    # Set up metrics sending and go.
    ts_mon_args = {}
    if args.debug_metrics_file:
        ts_mon_args['debug_file'] = args.debug_metrics_file

    with ts_mon_config.SetupTsMonGlobalState('apache_access_log_metrics',
                                             **ts_mon_args):
        log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
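ParseArgs is not shown here. A hypothetical sketch, assuming it only needs to expose the --debug-metrics-file flag consumed above (the real parser likely has more options):

import argparse


def ParseArgs():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--debug-metrics-file',
                        help='Write metrics to this file instead of sending '
                             'them to production.')
    return parser.parse_args()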
Example #23
def main():
    if _monitor_db_host_acquisition:
        logging.info('Please set inline_host_acquisition=False in the shadow '
                     'config before starting the host scheduler.')
        sys.exit(0)
    try:
        options = parse_arguments(sys.argv[1:])
        scheduler_lib.check_production_settings(options)

        # If server database is enabled, check if the server has role
        # `host_scheduler`. If the server does not have host_scheduler role,
        # exception will be raised and host scheduler will not continue to run.
        if server_manager_utils.use_server_db():
            server_manager_utils.confirm_server_has_role(hostname='localhost',
                                                         role='host_scheduler')

        initialize(options.testing)

        with ts_mon_config.SetupTsMonGlobalState(
                'autotest_host_scheduler',
                indirect=True,
                debug_file=options.metrics_file,
        ):
            metrics.Counter('%s/start' % _METRICS_PREFIX).increment()
            process_start_time = time.time()
            host_scheduler = HostScheduler()
            minimum_tick_sec = global_config.global_config.get_config_value(
                'SCHEDULER', 'host_scheduler_minimum_tick_sec', type=float)
            while not _shutdown:
                if _lifetime_expired(options.lifetime_hours,
                                     process_start_time):
                    break
                start = time.time()
                host_scheduler.tick()
                curr_tick_sec = time.time() - start
                if (minimum_tick_sec > curr_tick_sec):
                    time.sleep(minimum_tick_sec - curr_tick_sec)
                else:
                    time.sleep(0.0001)
            logging.info('Shutdown request received. Bye! Bye!')
    except server_manager_utils.ServerActionError:
        # This error is expected when the server is not in primary status
        # for host-scheduler role. Thus do not send email for it.
        raise
    except Exception:
        metrics.Counter('%s/uncaught_exception' % _METRICS_PREFIX).increment()
        raise
    finally:
        email_manager.manager.send_queued_emails()
        if _db_manager:
            _db_manager.disconnect()
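_lifetime_expired bounds how long the scheduler runs but is not defined in the excerpt. A minimal sketch, assuming lifetime_hours may be None to mean "run until shutdown":

import time


def _lifetime_expired(lifetime_hours, process_start_time):
    # Hypothetical sketch: True once the process has outlived its budget.
    if lifetime_hours is None:
        return False
    return time.time() - process_start_time > lifetime_hours * 60 * 60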
Example #24
    def testResetAfter(self):
        """Tests that the reset_after flag works to send metrics only once."""
        # By mocking out its "time" module, the forked flushing process will think
        # it should call Flush() whenever we send a metric.
        self.patchTime()

        with tempfile.NamedTemporaryFile(dir='/var/tmp') as out:
            # * The indirect=True flag is required for reset_after to work.
            # * Using debug_file, we send metrics to the temporary file instead of
            # sending metrics to production via PubSub.
            with ts_mon_config.SetupTsMonGlobalState('metrics_unittest',
                                                     indirect=True,
                                                     debug_file=out.name):

                def MetricName(i, flushed):
                    return 'test/metric/name/%d/%s' % (i, flushed)

                # Each of these .set() calls will result in a Flush() call.
                for i in range(7):
                    # any extra streams with different fields and reset_after=False
                    # will be cleared only if the below metric is cleared.
                    metrics.Boolean(MetricName(i, True),
                                    reset_after=False).set(
                                        True, fields={'original': False})

                    metrics.Boolean(MetricName(i, True), reset_after=True).set(
                        True, fields={'original': True})

                for i in range(7):
                    metrics.Boolean(MetricName(i, False),
                                    reset_after=False).set(True)

            # By leaving the context, we .join() the flushing process.
            with open(out.name, 'r') as fh:
                content = fh.read()

            # The flushed metrics should be sent only three times, because:
            # * original=False is sent twice
            # * original=True is sent once.
            for i in range(7):
                self.assertEqual(content.count(MetricName(i, True)), 3)

            # The nonflushed metrics are sent once-per-flush.
            # There are 7 of these metrics,
            # * The 0th is sent 7 times,
            # * The 1st is sent 6 times,
            # ...
            # * The 6th is sent 1 time.
            # So the "i"th metric is sent (7-i) times.
            for i in range(7):
                self.assertEqual(content.count(MetricName(i, False)), 7 - i)
Example #25
def main():
    if _monitor_db_host_acquisition:
        logging.info('Please set inline_host_acquisition=False in the shadow '
                     'config before starting the host scheduler.')
        # The upstart job for the host scheduler understands exit(0) to mean
        # 'don't respawn'. This is desirable when the job scheduler is acquiring
        # hosts inline.
        sys.exit(0)
    try:
        options = parse_arguments(sys.argv[1:])
        scheduler_lib.check_production_settings(options)

        # If server database is enabled, check if the server has role
        # `host_scheduler`. If the server does not have host_scheduler role,
        # exception will be raised and host scheduler will not continue to run.
        if server_manager_utils.use_server_db():
            server_manager_utils.confirm_server_has_role(hostname='localhost',
                                                         role='host_scheduler')

        initialize(options.testing)

        # Start the thread to report metadata.
        metadata_reporter.start()

        ts_mon_config.SetupTsMonGlobalState('autotest_host_scheduler')

        host_scheduler = HostScheduler()
        minimum_tick_sec = global_config.global_config.get_config_value(
            'SCHEDULER', 'minimum_tick_sec', type=float)
        while not _shutdown:
            start = time.time()
            host_scheduler.tick()
            curr_tick_sec = time.time() - start
            if (minimum_tick_sec > curr_tick_sec):
                time.sleep(minimum_tick_sec - curr_tick_sec)
            else:
                time.sleep(0.0001)
    except server_manager_utils.ServerActionError as e:
        # This error is expected when the server is not in primary status
        # for host-scheduler role. Thus do not send email for it.
        raise
    except Exception:
        email_manager.manager.log_stacktrace(
            'Uncaught exception; terminating host_scheduler.')
        raise
    finally:
        email_manager.manager.send_queued_emails()
        if _db_manager:
            _db_manager.disconnect()
        metadata_reporter.abort()
Example #26
def main():
    """Main script."""
    options = parse_options()
    log_config = logging_config.LoggingConfig()
    if options.logfile:
        log_config.add_file_handler(file_path=os.path.abspath(options.logfile),
                                    level=logging.DEBUG)

    with ts_mon_config.SetupTsMonGlobalState(service_name='cleanup_tko_db',
                                             indirect=True):
        server = CONFIG.get_config_value('AUTOTEST_WEB',
                                         'global_db_host',
                                         default=CONFIG.get_config_value(
                                             'AUTOTEST_WEB', 'host'))
        user = CONFIG.get_config_value('AUTOTEST_WEB',
                                       'global_db_user',
                                       default=CONFIG.get_config_value(
                                           'AUTOTEST_WEB', 'user'))
        password = CONFIG.get_config_value('AUTOTEST_WEB',
                                           'global_db_password',
                                           default=CONFIG.get_config_value(
                                               'AUTOTEST_WEB', 'password'))
        database = CONFIG.get_config_value('AUTOTEST_WEB',
                                           'global_db_database',
                                           default=CONFIG.get_config_value(
                                               'AUTOTEST_WEB', 'database'))

        logging.info(
            'Starting cleaning up old records in TKO database %s on '
            'server %s.', database, server)

        start_time = time.time()
        try:
            if options.recreate_test_attributes:
                with metrics.SecondsTimer(RECREATE_TEST_ATTRIBUTES_METRIC,
                                          fields={'success': False}) as fields:
                    _recreate_test_attributes(server, user, password, database)
                    fields['success'] = True
            else:
                with metrics.SecondsTimer(CLEANUP_METRIC,
                                          fields={'success': False}) as fields:
                    utils.run_sql_cmd(server, user, password, CLEANUP_TKO_CMD,
                                      database)
                    fields['success'] = True
        except:
            logging.exception('Cleanup failed with exception.')
        finally:
            duration = time.time() - start_time
            logging.info('Cleanup attempt finished in %s seconds.', duration)
Example #27
def main():
    """Main method of gs_offloader."""
    options = parse_options()

    if options.process_all:
        offloader_type = 'all'
    elif options.process_hosts_only:
        offloader_type = 'hosts'
    else:
        offloader_type = 'jobs'

    _setup_logging(options, offloader_type)

    if options.enable_timestamp_cache:
        # Extend the cache expiry time by another 1% so the timestamps
        # are available as the results are purged.
        job_timestamp_cache.setup(options.age_to_delete * 1.01)

    # Nice our process (carried to subprocesses) so we don't overload
    # the system.
    if not options.normal_priority:
        logging.debug('Set process to nice value: %d', NICENESS)
        os.nice(NICENESS)
    if psutil:
        proc = psutil.Process()
        logging.debug('Set process to ionice IDLE')
        proc.ionice(psutil.IOPRIO_CLASS_IDLE)

    # os.listdir returns relative paths, so change to where we need to
    # be to avoid an os.path.join on each loop.
    logging.debug('Offloading Autotest results in %s', RESULTS_DIR)
    os.chdir(RESULTS_DIR)

    service_name = 'gs_offloader(%s)' % offloader_type
    with ts_mon_config.SetupTsMonGlobalState(service_name,
                                             indirect=True,
                                             short_lived=False,
                                             debug_file=options.metrics_file):
        with metrics.SuccessCounter('chromeos/autotest/gs_offloader/exit'):
            offloader = Offloader(options)
            if not options.delete_only:
                wait_for_gs_write_access(offloader.gs_uri)
            while True:
                offloader.offload_once()
                if options.offload_once:
                    break
                time.sleep(SLEEP_TIME_SECS)
Example #28
def main():
    """main script. """
    parser = argparse.ArgumentParser()
    parser.add_argument('--span', type=int, dest='span', default=1,
                        help=('Number of hours that stats should be collected. '
                              'If it is set to 24, the end time of stats being '
                              'collected will set to the mid of the night. '
                              'Default is set to 1 hour.'))
    parser.add_argument('-e', '--email', dest='email', default=None,
                        help='Email any errors to the given email address.')
    options = parser.parse_args()

    boards = host_label_utils.get_all_boards()
    pools = ['bvt', 'suites', 'cq']

    if options.span == 24:
        today = datetime.combine(date.today(), datetime.min.time())
        end_time = time_utils.to_epoch_time(today)
    else:
        now = datetime.now()
        end_time = datetime(year=now.year, month=now.month, day=now.day,
                            hour=now.hour)
        end_time = time_utils.to_epoch_time(end_time)

    start_time = end_time - timedelta(hours=options.span).total_seconds()
    print ('Collecting host stats from %s to %s...' %
           (time_utils.epoch_time_to_date_string(start_time),
            time_utils.epoch_time_to_date_string(end_time)))

    ts_mon_config.SetupTsMonGlobalState('collect_host_stats')

    errors = []
    if not boards:
        errors.append('Error! No board found in metadb.')
    for board in boards:
        for pool in pools:
            error = report_stats(board, pool, start_time, end_time,
                                 options.span)
            if error:
                errors.append(error)
    if options.email and errors:
        gmail_lib.send_email(options.email,
                             'Error occurred when collecting host stats.',
                             '\n'.join(errors))
Example #29
def main(argv):
    """Entry point for dut_mon."""
    logging.getLogger().setLevel(logging.INFO)

    with ts_mon_config.SetupTsMonGlobalState('dut_mon', indirect=True):
        afe = frontend.AFE()
        counters = collections.defaultdict(lambda: 0)

        field_spec = [ts_mon.StringField('board'),
                      ts_mon.StringField('model'),
                      ts_mon.StringField('pool'),
                      ts_mon.BooleanField('is_locked'),
                      ts_mon.StringField('status'),
                      ]
        dut_count = metrics.Gauge('chromeos/autotest/dut_mon/dut_count',
                                  description='The number of duts in a given '
                                              'state and bucket.',
                                  field_spec=field_spec)
        tick_count = metrics.Counter('chromeos/autotest/dut_mon/tick',
                                     description='Tick counter of dut_mon.')

        while True:
            # Note: We reset all counters to zero in each loop rather than
            # creating a new defaultdict, because we want to ensure that any
            # gauges that were previously set to a nonzero value by this process
            # get set back to zero if necessary.
            for k in counters:
                counters[k] = 0

            logging.info('Fetching all hosts.')
            hosts = afe.get_hosts()
            logging.info('Fetched %s hosts.', len(hosts))
            for host in hosts:
                fields = _get_bucket_for_host(host)
                counters[fields] += 1

            for field, value in counters.iteritems():
                logging.info('%s %s', field, value)
                dut_count.set(value, fields=field.__dict__)

            tick_count.increment()
            logging.info('Sleeping for 2 minutes.')
            time.sleep(120)
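_get_bucket_for_host is not shown. Because counters[fields] needs a hashable key and dut_count.set(..., fields=field.__dict__) needs the fields as a dict, a hypothetical sketch might use a small value class whose attributes mirror the field_spec above (the label lookups are illustrative):

class _Bucket(object):
    """Hypothetical hashable bucket key for one DUT."""

    def __init__(self, board, model, pool, is_locked, status):
        self.__dict__.update(board=board, model=model, pool=pool,
                             is_locked=is_locked, status=status)

    def __hash__(self):
        return hash(tuple(sorted(self.__dict__.items())))

    def __eq__(self, other):
        return self.__dict__ == other.__dict__


def _get_bucket_for_host(host):
    # Illustrative only: the real implementation derives board/model/pool
    # from the host's labels.
    return _Bucket(board='unknown', model='unknown', pool='unknown',
                   is_locked=host.locked, status=host.status)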
Example #30
def main():
    parser = commandline.ArgumentParser(description=__doc__,
                                        default_log_level='DEBUG')
    parser.add_argument(
        '--interval',
        default=60,
        type=int,
        help='time (in seconds) between sampling system metrics')
    opts = parser.parse_args()
    opts.Freeze()

    # This call returns a context manager that doesn't do anything, so we
    # ignore the return value.
    ts_mon_config.SetupTsMonGlobalState('sysmon', auto_flush=False)
    # The default prefix is '/chrome/infra/'.
    interface.state.metric_name_prefix = (interface.state.metric_name_prefix +
                                          'chromeos/sysmon/')

    collector = _MetricCollector()
    loop.SleepLoop(callback=collector, interval=opts.interval).loop_forever()