def report_outcome(config, archive, summary, fake_ctx):
    """
    Reports on the final outcome of the command.

    Writes ``summary.yaml`` to the archive directory (if any), logs the
    summary, emails it when ``email-on-error`` is configured and the run
    failed, pushes the result to the results server, and exits non-zero
    on failure.

    :param config:   the job configuration dict
    :param archive:  path to the archive directory, or None
    :param summary:  the job summary dict
    :param fake_ctx: fake context object passed to nuke()
    """
    status = get_status(summary)
    passed = status == 'pass'

    if not passed and bool(config.get('nuke-on-error')):
        # only unlock if we locked them in the first place
        nuke(fake_ctx, fake_ctx.lock)

    if archive is not None:
        # Py3 fix: the py2-only file() builtin was removed; use open().
        with open(os.path.join(archive, 'summary.yaml'), 'w') as f:
            yaml.safe_dump(summary, f, default_flow_style=False)

    # Py3 fix: yaml.safe_dump() returns a str when given no stream,
    # which replaces the removed StringIO.StringIO buffer.
    log.info('Summary data:\n%s' % yaml.safe_dump(summary))

    if 'email-on-error' in config and not passed:
        # Email the dumped summary followed by the dumped config,
        # matching the order the old StringIO buffer was filled in.
        emsg = yaml.safe_dump(summary) + yaml.safe_dump(config)
        subject = "Teuthology error -- %s" % summary['failure_reason']
        email_results(subject, "Teuthology", config['email-on-error'], emsg)

    report.try_push_job_info(config, summary)

    if passed:
        log.info(status)
    else:
        log.info(str(status).upper())
        sys.exit(1)
def unlock_targets(job_config):
    """
    Release (or nuke) the machines a finished job had locked.

    Only machines that are still locked and whose lock description matches
    this job's archive path are considered; when the job failed and
    ``nuke-on-error`` is set the machines are nuked instead of unlocked.
    """
    serializer = report.ResultsSerializer(teuth_config.archive_base)
    job_info = serializer.job_info(job_config['name'], job_config['job_id'])
    statuses = query.get_statuses(job_info['targets'].keys())
    # only unlock/nuke targets if locked and description matches
    owned = []
    for machine_status in statuses:
        if not machine_status['locked']:
            continue
        short = shortname(machine_status['name'])
        desc = machine_status['description']
        if desc == job_info['archive_path']:
            owned.append(short)
        else:
            log.warning(
                "Was going to unlock %s but it was locked by another job: %s",
                short, desc)
    if not owned:
        return
    job_status = get_status(job_info)
    may_unlock = job_status == 'pass' or (
        job_config.get('unlock_on_failure', False)
        and not job_config.get('nuke-on-error', False))
    if may_unlock:
        log.info('Unlocking machines...')
        fake_ctx = create_fake_context(job_config)
        for short in owned:
            teuthology.lock.ops.unlock_one(fake_ctx, short,
                                           job_info['owner'],
                                           job_info['archive_path'])
    if job_status != 'pass' and job_config.get('nuke-on-error', False):
        log.info('Nuking machines...')
        fake_ctx = create_fake_context(job_config)
        nuke(fake_ctx, True)
def reimage(job_config):
    """
    Reimage the targets listed in the job config and store the machines'
    updated keys back into the config.

    The job status is pushed as 'waiting' while reimaging runs and as
    'running' once it succeeds; on failure the job is reported 'dead'
    and the machines are nuked before the exception is re-raised.
    """
    ctx = create_fake_context(job_config)
    # change the status during the reimaging process
    report.try_push_job_info(ctx.config, dict(status='waiting'))
    try:
        updated_targets = reimage_machines(
            ctx, job_config['targets'], job_config['machine_type'])
    except Exception as e:
        log.exception('Reimaging error. Nuking machines...')
        # Reimage failures should map to the 'dead' status instead of 'fail'
        report.try_push_job_info(
            ctx.config,
            dict(status='dead',
                 failure_reason='Error reimaging machines: ' + str(e)))
        nuke(ctx, True)
        raise
    ctx.config['targets'] = updated_targets
    # change the status to running after the reimaging process
    report.try_push_job_info(ctx.config, dict(status='running'))
def nuke(targets, owner, log, teuth_config, should_unlock,
         synch_clocks=True, reboot_all=True):
    """
    Nuke a set of targets and, on success, optionally unlock them.

    :param targets:       dict mapping target names to host keys
    :param owner:         owner of the machine locks
    :param log:           logger used for progress and error reporting
    :param teuth_config:  teuthology configuration passed through to nuke
    :param should_unlock: unlock each target after a successful nuke
    :param synch_clocks:  resynchronize the targets' clocks
    :param reboot_all:    reboot every target
    """
    from teuthology.nuke import nuke
    from teuthology.lock import unlock
    ctx = argparse.Namespace(
        config=dict(targets=targets),
        owner=owner,
        synch_clocks=synch_clocks,
        reboot_all=reboot_all,
        teuthology_config=teuth_config,
    )
    try:
        nuke(ctx, log)
    except Exception:
        # Bug fix: the previous bare 'except:' also swallowed SystemExit
        # and KeyboardInterrupt; only unexpected errors belong here.
        log.exception('Could not nuke all targets in %s', targets)
        # not re-raising so that parallel calls aren't killed
    else:
        if should_unlock:
            for target in targets.keys():
                unlock(ctx, target, owner)
def unlock_targets(job_config):
    """
    Unlock or nuke the machines that a completed job left locked.

    Machines are unlocked when the job passed (or ``unlock_on_failure``
    is set without ``nuke-on-error``); failed jobs with ``nuke-on-error``
    get their machines nuked instead.
    """
    serializer = report.ResultsSerializer(teuth_config.archive_base)
    job_info = serializer.job_info(job_config['name'], job_config['job_id'])
    statuses = query.get_statuses(job_info['targets'].keys())
    # only unlock/nuke targets if locked in the first place
    locked_names = []
    for machine in statuses:
        if machine['locked']:
            locked_names.append(shortname(machine['name']))
    if not locked_names:
        return
    job_status = get_status(job_info)
    may_unlock = job_status == 'pass' or (
        job_config.get('unlock_on_failure', False)
        and not job_config.get('nuke-on-error', False))
    if may_unlock:
        log.info('Unlocking machines...')
        fake_ctx = create_fake_context(job_config)
        for name in locked_names:
            teuthology.lock.ops.unlock_one(
                fake_ctx, name, job_info['owner'], job_info['archive_path'])
    if job_status != 'pass' and job_config.get('nuke-on-error', False):
        log.info('Nuking machines...')
        fake_ctx = create_fake_context(job_config)
        nuke(fake_ctx, True)
def main():
    """
    Entry point: parse arguments, set up logging and the archive
    directory, prepend the internal bootstrap tasks, and run the job's
    task list. Exits non-zero when the run is unsuccessful.
    """
    from gevent import monkey
    monkey.patch_all(dns=False)
    from .orchestra import monkey
    monkey.patch_all()
    import logging

    log = logging.getLogger(__name__)
    ctx = parse_args()

    loglevel = logging.INFO
    if ctx.verbose:
        loglevel = logging.DEBUG

    logging.basicConfig(
        level=loglevel,
    )

    if 'targets' in ctx.config and 'roles' in ctx.config:
        targets = len(ctx.config['targets'])
        roles = len(ctx.config['roles'])
        assert targets >= roles, \
            '%d targets are needed for all roles but found %d listed.' % (roles, targets)

    if ctx.block:
        assert ctx.lock, \
            'the --block option is only supported with the --lock option'

    from teuthology.misc import read_config
    read_config(ctx)

    log.debug('\n '.join(['Config:', ] + yaml.safe_dump(ctx.config, default_flow_style=False).splitlines()))

    ctx.summary = dict(success=True)

    if ctx.owner is None:
        from teuthology.misc import get_user
        ctx.owner = get_user()
    ctx.summary['owner'] = ctx.owner

    if ctx.description is not None:
        ctx.summary['description'] = ctx.description

    if ctx.archive is not None:
        os.mkdir(ctx.archive)

        handler = logging.FileHandler(
            filename=os.path.join(ctx.archive, 'teuthology.log'),
        )
        formatter = logging.Formatter(
            fmt='%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s',
            datefmt='%Y-%m-%dT%H:%M:%S',
        )
        handler.setFormatter(formatter)
        logging.getLogger().addHandler(handler)

        # Py3 fix: the py2-only file() builtin was removed; use open().
        with open(os.path.join(ctx.archive, 'pid'), 'w') as f:
            f.write('%d' % os.getpid())
        with open(os.path.join(ctx.archive, 'owner'), 'w') as f:
            f.write(ctx.owner + '\n')
        with open(os.path.join(ctx.archive, 'orig.config.yaml'), 'w') as f:
            yaml.safe_dump(ctx.config, f, default_flow_style=False)

    for task in ctx.config['tasks']:
        assert 'kernel' not in task, \
            'kernel installation shouldn be a base-level item, not part of the tasks list'

    # Internal bootstrap tasks are prepended to the user's task list.
    init_tasks = []
    if ctx.lock:
        assert 'targets' not in ctx.config, \
            'You cannot specify targets in a config file when using the --lock option'
        init_tasks.append({'internal.lock_machines': len(ctx.config['roles'])})

    init_tasks.extend([
        {'internal.save_config': None},
        {'internal.check_lock': None},
        {'internal.connect': None},
        {'internal.check_conflict': None},
    ])
    if 'kernel' in ctx.config:
        init_tasks.append({'kernel': ctx.config['kernel']})
    init_tasks.extend([
        {'internal.base': None},
        {'internal.archive': None},
        {'internal.coredump': None},
        {'internal.syslog': None},
        {'internal.timer': None},
    ])

    ctx.config['tasks'][:0] = init_tasks

    from teuthology.run_tasks import run_tasks
    try:
        run_tasks(tasks=ctx.config['tasks'], ctx=ctx)
    finally:
        if not ctx.summary.get('success') and ctx.config.get('nuke-on-error'):
            from teuthology.nuke import nuke
            # only unlock if we locked them in the first place
            nuke(ctx, log, ctx.lock)
        if ctx.archive is not None:
            # Py3 fix: file() builtin removed; use open().
            with open(os.path.join(ctx.archive, 'summary.yaml'), 'w') as f:
                yaml.safe_dump(ctx.summary, f, default_flow_style=False)

    if not ctx.summary.get('success', True):
        import sys
        sys.exit(1)
def main(args):
    """
    Dispatcher entry point: pull jobs from a beanstalk tube and spawn a
    supervisor subprocess for each one.

    :param args: docopt-style dict of command-line options.
    """
    # run dispatcher in job supervisor mode if --supervisor passed
    if args["--supervisor"]:
        return supervisor.main(args)

    verbose = args["--verbose"]
    tube = args["--tube"]
    log_dir = args["--log-dir"]
    archive_dir = args["--archive-dir"]

    if archive_dir is None:
        archive_dir = teuth_config.archive_base

    # setup logging for dispatcher in {log_dir}
    loglevel = logging.INFO
    if verbose:
        loglevel = logging.DEBUG
    log.setLevel(loglevel)
    log_file_path = os.path.join(log_dir, f"dispatcher.{tube}.{os.getpid()}")
    setup_log_file(log_file_path)
    install_except_hook()

    load_config(archive_dir=archive_dir)

    connection = beanstalk.connect()
    beanstalk.watch_tube(connection, tube)
    result_proc = None

    # NOTE(review): fetch_qa_suite appears to run unconditionally while
    # fetch_teuthology only runs when no teuthology_path is configured —
    # confirm against upstream if the indentation here looks surprising.
    if teuth_config.teuthology_path is None:
        fetch_teuthology('master')
    fetch_qa_suite('master')

    keep_running = True
    while keep_running:
        # Check to see if we have a teuthology-results process hanging around
        # and if so, read its return code so that it can exit.
        if result_proc is not None and result_proc.poll() is not None:
            log.debug("teuthology-results exited with code: %s",
                      result_proc.returncode)
            result_proc = None

        # Sentinel files let an operator restart or stop the dispatcher
        # between jobs without killing a run in progress.
        if sentinel(restart_file_path):
            restart()
        elif sentinel(stop_file_path):
            stop()

        load_config()

        job = connection.reserve(timeout=60)
        if job is None:
            continue

        # bury the job so it won't be re-run if it fails
        job.bury()
        job_id = job.jid
        log.info('Reserved job %d', job_id)
        log.info('Config is: %s', job.body)
        job_config = yaml.safe_load(job.body)
        job_config['job_id'] = str(job_id)

        # A job may ask this dispatcher to shut down after processing it.
        if job_config.get('stop_worker'):
            keep_running = False

        try:
            job_config, teuth_bin_path = prep_job(
                job_config,
                log_file_path,
                archive_dir,
            )
        except SkipJob:
            continue

        # lock machines but do not reimage them
        if 'roles' in job_config:
            job_config = lock_machines(job_config)

        run_args = [
            os.path.join(teuth_bin_path, 'teuthology-dispatcher'),
            '--supervisor',
            '-v',
            '--bin-path', teuth_bin_path,
            '--archive-dir', archive_dir,
        ]

        # Create run archive directory if not already created and
        # job's archive directory
        create_job_archive(job_config['name'],
                           job_config['archive_path'],
                           archive_dir)
        job_config_path = os.path.join(job_config['archive_path'],
                                       'orig.config.yaml')

        # Write initial job config in job archive dir
        with open(job_config_path, 'w') as f:
            yaml.safe_dump(job_config, f, default_flow_style=False)
        run_args.extend(["--job-config", job_config_path])

        try:
            job_proc = subprocess.Popen(run_args)
            log.info('Job supervisor PID: %s', job_proc.pid)
        except Exception:
            error_message = "Saw error while trying to spawn supervisor."
            log.exception(error_message)
            # Best effort: release any machines the failed job had locked.
            if 'targets' in job_config:
                nuke(supervisor.create_fake_context(job_config), True)
            report.try_push_job_info(
                job_config,
                dict(status='fail', failure_reason=error_message))

        # This try/except block is to keep the worker from dying when
        # beanstalkc throws a SocketError
        try:
            job.delete()
        except Exception:
            log.exception("Saw exception while trying to delete job")
def main():
    """
    Entry point: parse arguments, set up logging and metadata, prepend
    the internal bootstrap tasks (including machine locking and optional
    kernel installation), and run the job's task list.

    Exits non-zero when the run is unsuccessful.
    """
    from gevent import monkey
    monkey.patch_all(dns=False)
    from .orchestra import monkey
    monkey.patch_all()
    import logging

    ctx = parse_args()
    set_up_logging(ctx)
    log = logging.getLogger(__name__)

    if ctx.owner is None:
        from teuthology.misc import get_user
        ctx.owner = get_user()

    write_initial_metadata(ctx)

    if 'targets' in ctx.config and 'roles' in ctx.config:
        targets = len(ctx.config['targets'])
        roles = len(ctx.config['roles'])
        assert targets >= roles, \
            '%d targets are needed for all roles but found %d listed.' % (roles, targets)

    machine_type = ctx.machine_type
    if machine_type is None:
        fallback_default = ctx.config.get('machine_type', 'plana')
        machine_type = ctx.config.get('machine-type', fallback_default)

    if ctx.block:
        assert ctx.lock, \
            'the --block option is only supported with the --lock option'

    from teuthology.misc import read_config
    read_config(ctx)

    log.debug('\n '.join([
        'Config:', ] + yaml.safe_dump(ctx.config, default_flow_style=False).splitlines()))

    ctx.summary = dict(success=True)
    ctx.summary['owner'] = ctx.owner

    if ctx.description is not None:
        ctx.summary['description'] = ctx.description

    for task in ctx.config['tasks']:
        assert 'kernel' not in task, \
            'kernel installation shouldn be a base-level item, not part of the tasks list'

    # Internal bootstrap tasks are prepended to the user's task list.
    init_tasks = []
    if ctx.lock:
        assert 'targets' not in ctx.config, \
            'You cannot specify targets in a config file when using the --lock option'
        init_tasks.append({
            'internal.lock_machines': (len(ctx.config['roles']), machine_type)
        })

    init_tasks.extend([
        {'internal.save_config': None},
        {'internal.check_lock': None},
        {'internal.connect': None},
        {'internal.check_conflict': None},
        {'internal.check_ceph_data': None},
        {'internal.vm_setup': None},
    ])

    if 'kernel' in ctx.config:
        from teuthology.misc import get_distro
        distro = get_distro(ctx)
        if distro == 'ubuntu':
            init_tasks.append({'kernel': ctx.config['kernel']})

    init_tasks.extend([
        {'internal.base': None},
        {'internal.archive': None},
        {'internal.coredump': None},
        {'internal.sudo': None},
        {'internal.syslog': None},
        {'internal.timer': None},
    ])

    ctx.config['tasks'][:0] = init_tasks

    from teuthology.run_tasks import run_tasks
    try:
        run_tasks(tasks=ctx.config['tasks'], ctx=ctx)
    finally:
        if not ctx.summary.get('success') and ctx.config.get('nuke-on-error'):
            from teuthology.nuke import nuke
            # only unlock if we locked them in the first place
            nuke(ctx, log, ctx.lock)
        if ctx.archive is not None:
            # Py3 fix: the py2-only file() builtin was removed; use open().
            with open(os.path.join(ctx.archive, 'summary.yaml'), 'w') as f:
                yaml.safe_dump(ctx.summary, f, default_flow_style=False)
        # Py3 fix: yaml.safe_dump() returns a str when given no stream,
        # replacing the removed StringIO.StringIO buffer.
        log.info('Summary data:\n%s' % yaml.safe_dump(ctx.summary))
        if ('email-on-error' in ctx.config
                and not ctx.summary.get('success', False)):
            # Email the dumped summary followed by the dumped config.
            emsg = yaml.safe_dump(ctx.summary) + yaml.safe_dump(ctx.config)
            subject = "Teuthology error -- %s" % ctx.summary[
                'failure_reason']
            from teuthology.suite import email_results
            email_results(subject, "Teuthology",
                          ctx.config['email-on-error'], emsg)

    if ctx.summary.get('success', True):
        log.info('pass')
    else:
        log.info('FAIL')
        import sys
        sys.exit(1)
def main():
    """
    Entry point: parse arguments, set up logging and metadata, prepend
    the internal bootstrap tasks (including machine locking and optional
    kernel installation), and run the job's task list.

    Exits non-zero when the run is unsuccessful.
    """
    from gevent import monkey
    monkey.patch_all(dns=False)
    from .orchestra import monkey
    monkey.patch_all()
    import logging

    ctx = parse_args()
    set_up_logging(ctx)
    log = logging.getLogger(__name__)

    if ctx.owner is None:
        from teuthology.misc import get_user
        ctx.owner = get_user()

    write_initial_metadata(ctx)

    if 'targets' in ctx.config and 'roles' in ctx.config:
        targets = len(ctx.config['targets'])
        roles = len(ctx.config['roles'])
        assert targets >= roles, \
            '%d targets are needed for all roles but found %d listed.' % (roles, targets)

    machine_type = ctx.machine_type
    if machine_type is None:
        fallback_default = ctx.config.get('machine_type', 'plana')
        machine_type = ctx.config.get('machine-type', fallback_default)

    if ctx.block:
        assert ctx.lock, \
            'the --block option is only supported with the --lock option'

    from teuthology.misc import read_config
    read_config(ctx)

    log.debug('\n '.join(['Config:', ] + yaml.safe_dump(ctx.config, default_flow_style=False).splitlines()))

    ctx.summary = dict(success=True)
    ctx.summary['owner'] = ctx.owner

    if ctx.description is not None:
        ctx.summary['description'] = ctx.description

    for task in ctx.config['tasks']:
        assert 'kernel' not in task, \
            'kernel installation shouldn be a base-level item, not part of the tasks list'

    # Internal bootstrap tasks are prepended to the user's task list.
    init_tasks = []
    if ctx.lock:
        assert 'targets' not in ctx.config, \
            'You cannot specify targets in a config file when using the --lock option'
        init_tasks.append({'internal.lock_machines': (len(ctx.config['roles']), machine_type)})

    init_tasks.extend([
        {'internal.save_config': None},
        {'internal.check_lock': None},
        {'internal.connect': None},
        {'internal.check_conflict': None},
        {'internal.check_ceph_data': None},
        {'internal.vm_setup': None},
    ])

    if 'kernel' in ctx.config:
        from teuthology.misc import get_distro
        distro = get_distro(ctx)
        if distro == 'ubuntu':
            init_tasks.append({'kernel': ctx.config['kernel']})

    init_tasks.extend([
        {'internal.base': None},
        {'internal.archive': None},
        {'internal.coredump': None},
        {'internal.sudo': None},
        {'internal.syslog': None},
        {'internal.timer': None},
    ])

    ctx.config['tasks'][:0] = init_tasks

    from teuthology.run_tasks import run_tasks
    try:
        run_tasks(tasks=ctx.config['tasks'], ctx=ctx)
    finally:
        if not ctx.summary.get('success') and ctx.config.get('nuke-on-error'):
            from teuthology.nuke import nuke
            # only unlock if we locked them in the first place
            nuke(ctx, log, ctx.lock)
        if ctx.archive is not None:
            # Py3 fix: the py2-only file() builtin was removed; use open().
            with open(os.path.join(ctx.archive, 'summary.yaml'), 'w') as f:
                yaml.safe_dump(ctx.summary, f, default_flow_style=False)
        # Py3 fix: yaml.safe_dump() returns a str when given no stream,
        # replacing the removed StringIO.StringIO buffer.
        log.info('Summary data:\n%s' % yaml.safe_dump(ctx.summary))
        if 'email-on-error' in ctx.config and not ctx.summary.get('success', False):
            # Email the dumped summary followed by the dumped config.
            emsg = yaml.safe_dump(ctx.summary) + yaml.safe_dump(ctx.config)
            subject = "Teuthology error -- %s" % ctx.summary['failure_reason']
            from teuthology.suite import email_results
            email_results(subject, "Teuthology", ctx.config['email-on-error'], emsg)

    if ctx.summary.get('success', True):
        log.info('pass')
    else:
        log.info('FAIL')
        import sys
        sys.exit(1)
def test_nuke_internal():
    """Exercise nuke.nuke() unlock/nuke behaviour with mocked helpers."""
    job_config = dict(
        owner='test_owner',
        targets={
            'user@host1': 'key1',
            'user@host2': 'key2'
        },
        archive_path='/path/to/test_run',
        machine_type='test_machine',
        os_type='centos',
        os_version='8.3',
        name='test_name',
    )
    locks = [
        {'name': target, 'description': job_config['name']}
        for target in job_config['targets'].keys()
    ]
    ctx = create_fake_context(job_config)

    def patched_nuke():
        # One fresh patch context per scenario below.
        return patch.multiple(
            nuke,
            nuke_helper=DEFAULT,
            list_locks=lambda: locks,
            unlock_one=DEFAULT,
        )

    # minimal call using defaults
    with patched_nuke() as mocks:
        nuke.nuke(ctx, True)
        mocks['nuke_helper'].assert_called_with(ANY, True, False, True)
        mocks['unlock_one'].assert_called()

    # don't unlock
    with patched_nuke() as mocks:
        nuke.nuke(ctx, False)
        mocks['nuke_helper'].assert_called_with(ANY, False, False, True)
        mocks['unlock_one'].assert_not_called()

    # mimicing what teuthology-dispatcher --supervisor does
    with patched_nuke() as mocks:
        nuke.nuke(ctx, False, True, False, True, False)
        mocks['nuke_helper'].assert_called_with(ANY, False, True, False)
        mocks['unlock_one'].assert_not_called()

    # no targets
    del ctx.config['targets']
    with patch.multiple(
        nuke,
        nuke_helper=DEFAULT,
        unlock_one=DEFAULT,
    ) as mocks:
        nuke.nuke(ctx, True)
        mocks['nuke_helper'].assert_not_called()
        mocks['unlock_one'].assert_not_called()