def report_outcome(config, archive, summary, fake_ctx):
    """ Reports on the final outcome of the command. """
    status = get_status(summary)
    passed = status == 'pass'

    if not passed and bool(config.get('nuke-on-error')):
        # only unlock if we locked them in the first place
        nuke(fake_ctx, fake_ctx.lock)

    if archive is not None:
        with file(os.path.join(archive, 'summary.yaml'), 'w') as f:
            yaml.safe_dump(summary, f, default_flow_style=False)

    with contextlib.closing(StringIO.StringIO()) as f:
        yaml.safe_dump(summary, f)
        log.info('Summary data:\n%s' % f.getvalue())

    with contextlib.closing(StringIO.StringIO()) as f:
        if 'email-on-error' in config and not passed:
            yaml.safe_dump(summary, f)
            yaml.safe_dump(config, f)
            emsg = f.getvalue()
            subject = "Teuthology error -- %s" % summary['failure_reason']
            email_results(subject, "Teuthology", config['email-on-error'],
                          emsg)

    report.try_push_job_info(config, summary)

    if passed:
        log.info(status)
    else:
        log.info(str(status).upper())
        sys.exit(1)

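# A hypothetical illustration of the `summary` dict that report_outcome()
# consumes. The real contents come from the run's summary.yaml; get_status()
# is assumed to derive 'pass'/'fail' from fields like these. All values below
# are made up for the example.
example_summary = {
    'success': False,                           # hypothetical outcome
    'failure_reason': 'saw valgrind issues',    # hypothetical reason
    'duration': 1234.5,                         # hypothetical runtime (seconds)
    'owner': 'scheduled_example@example-host',  # hypothetical owner
}
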
def run_with_watchdog(process, job_config):
    job_start_time = datetime.utcnow()

    # Only push the information that's relevant to the watchdog, to save db
    # load
    job_info = dict(
        name=job_config['name'],
        job_id=job_config['job_id'],
    )

    # Sleep once outside of the loop to avoid double-posting jobs
    time.sleep(teuth_config.watchdog_interval)
    hit_max_timeout = False
    while process.poll() is None:
        # Kill jobs that have been running longer than the global max
        run_time = datetime.utcnow() - job_start_time
        total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
        if total_seconds > teuth_config.max_job_time:
            hit_max_timeout = True
            log.warning("Job ran longer than {max}s. Killing...".format(
                max=teuth_config.max_job_time))
            try:
                # kill processes but do not unlock yet so we can save
                # the logs, coredumps, etc.
                kill_job(job_info['name'], job_info['job_id'],
                         teuth_config.archive_base, job_config['owner'],
                         save_logs=True)
            except Exception:
                log.exception('Failed to kill job')

            try:
                transfer_archives(job_info['name'], job_info['job_id'],
                                  teuth_config.archive_base, job_config)
            except Exception:
                log.exception('Could not save logs')

            try:
                # this time remove everything and unlock the machines
                kill_job(job_info['name'], job_info['job_id'],
                         teuth_config.archive_base, job_config['owner'])
            except Exception:
                log.exception('Failed to kill job and unlock machines')

        # calling this without a status just updates the job's updated time
        report.try_push_job_info(job_info)
        time.sleep(teuth_config.watchdog_interval)

    # we no longer support testing these old branches
    assert(job_config.get('teuthology_branch') not in (
        'argonaut', 'bobtail', 'cuttlefish', 'dumpling'))

    # Let's make sure that paddles knows the job is finished. We don't know
    # the status, but if it was a pass or fail it will have already been
    # reported to paddles. In that case paddles ignores the 'dead' status.
    # If the job was killed, paddles will use the 'dead' status.
    extra_info = dict(status='dead')
    if hit_max_timeout:
        extra_info['failure_reason'] = 'hit max job timeout'
    report.try_push_job_info(job_info, extra_info)

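# A minimal sketch of the timeout arithmetic used in the watchdog loop above:
# the manual days/seconds sum is equivalent (ignoring microseconds) to
# timedelta.total_seconds(). Values here are made up for the example.
from datetime import timedelta

example_run_time = timedelta(days=1, seconds=90)
manual = example_run_time.days * 60 * 60 * 24 + example_run_time.seconds
assert manual == int(example_run_time.total_seconds())  # both are 86490
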
def lock_machines(job_config):
    report.try_push_job_info(job_config, dict(status='running'))
    fake_ctx = supervisor.create_fake_context(job_config, block=True)
    block_and_lock_machines(fake_ctx, len(job_config['roles']),
                            job_config['machine_type'], reimage=False)
    job_config = fake_ctx.config
    return job_config

def run_with_watchdog(process, job_config):
    job_start_time = datetime.utcnow()

    # Only push the information that's relevant to the watchdog, to save db
    # load
    job_info = dict(
        name=job_config['name'],
        job_id=job_config['job_id'],
    )

    # Sleep once outside of the loop to avoid double-posting jobs
    time.sleep(teuth_config.watchdog_interval)
    symlink_worker_log(job_config['worker_log'], job_config['archive_path'])
    while process.poll() is None:
        # Kill jobs that have been running longer than the global max
        run_time = datetime.utcnow() - job_start_time
        total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
        if total_seconds > teuth_config.max_job_time:
            log.warning("Job ran longer than {max}s. Killing...".format(
                max=teuth_config.max_job_time))
            kill_job(job_info['name'], job_info['job_id'],
                     teuth_config.archive_base, job_config['owner'])

        # calling this without a status just updates the jobs updated time
        report.try_push_job_info(job_info)
        time.sleep(teuth_config.watchdog_interval)

    # The job finished. Let's make sure paddles knows.
    branches_sans_reporting = ('argonaut', 'bobtail', 'cuttlefish',
                               'dumpling')
    if job_config.get('teuthology_branch') in branches_sans_reporting:
        # The job ran with a teuthology branch that may not have the
        # reporting feature. Let's call teuthology-report (which will be from
        # the master branch) to report the job manually.
        cmd = "teuthology-report -v -D -r {run_name} -j {job_id}".format(
            run_name=job_info['name'], job_id=job_info['job_id'])
        try:
            log.info("Executing %s" % cmd)
            report_proc = subprocess.Popen(cmd, shell=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.STDOUT)
            while report_proc.poll() is None:
                for line in report_proc.stdout.readlines():
                    log.info(line.strip())
                time.sleep(1)
            log.info("Reported results via the teuthology-report command")
        except Exception:
            log.exception("teuthology-report failed")
    else:
        # Let's make sure that paddles knows the job is finished. We don't
        # know the status, but if it was a pass or fail it will have already
        # been reported to paddles. In that case paddles ignores the 'dead'
        # status. If the job was killed, paddles will use the 'dead' status.
        report.try_push_job_info(job_info, dict(status='dead'))

def check_packages(ctx, config):
    """
    Checks gitbuilder to determine if there are missing packages for this job.

    If there are missing packages, fail the job.
    """
    for task in ctx.config['tasks']:
        if task.keys()[0] == 'buildpackages':
            log.info("Checking packages skipped because "
                     "the task buildpackages was found.")
            return

    log.info("Checking packages...")
    os_type = ctx.config.get("os_type")
    sha1 = ctx.config.get("sha1")
    # We can only do this check if the job config defines both a sha1 and
    # an os_type.
    if os_type and sha1:
        package = get_builder_project()("ceph", ctx.config)
        template = "Checking packages for os_type '{os}', " \
            "flavor '{flav}' and ceph hash '{ver}'"
        log.info(
            template.format(
                os=package.os_type,
                flav=package.flavor,
                ver=package.sha1,
            )
        )
        if package.version:
            log.info("Found packages for ceph version {ver}".format(
                ver=package.version
            ))
        else:
            msg = "Packages for distro '{d}' and ceph hash '{ver}' not found"
            msg = msg.format(
                d=package.distro,
                ver=package.sha1,
            )
            log.error(msg)
            # set the failure message and update paddles with the status
            ctx.summary["failure_reason"] = msg
            set_status(ctx.summary, "dead")
            report.try_push_job_info(ctx.config, dict(status='dead'))
            raise VersionNotFoundError(package.base_url)
    else:
        log.info(
            "Checking packages skipped, missing os_type '{os}' or "
            "ceph hash '{ver}'".format(
                os=os_type,
                ver=sha1,
            )
        )

def prep_job(job_config, log_file_path, archive_dir):
    job_id = job_config['job_id']
    safe_archive = safepath.munge(job_config['name'])
    job_config['worker_log'] = log_file_path
    archive_path_full = os.path.join(archive_dir, safe_archive, str(job_id))
    job_config['archive_path'] = archive_path_full

    # If the teuthology branch was not specified, default to master and
    # store that value.
    teuthology_branch = job_config.get('teuthology_branch', 'master')
    job_config['teuthology_branch'] = teuthology_branch

    try:
        if teuth_config.teuthology_path is not None:
            teuth_path = teuth_config.teuthology_path
        else:
            teuth_path = fetch_teuthology(branch=teuthology_branch)
        # For the teuthology tasks, we look for suite_branch, and if we
        # don't get that, we look for branch, and fall back to 'master'.
        # last-in-suite jobs don't have suite_branch or branch set.
        ceph_branch = job_config.get('branch', 'master')
        suite_branch = job_config.get('suite_branch', ceph_branch)
        suite_repo = job_config.get('suite_repo')
        if suite_repo:
            teuth_config.ceph_qa_suite_git_url = suite_repo
        job_config['suite_path'] = os.path.normpath(os.path.join(
            fetch_qa_suite(suite_branch),
            job_config.get('suite_relpath', ''),
        ))
    except BranchNotFoundError as exc:
        log.exception("Branch not found; marking job as dead")
        report.try_push_job_info(
            job_config,
            dict(status='dead', failure_reason=str(exc))
        )
        raise SkipJob()
    except MaxWhileTries as exc:
        log.exception("Failed to fetch or bootstrap; marking job as dead")
        report.try_push_job_info(
            job_config,
            dict(status='dead', failure_reason=str(exc))
        )
        raise SkipJob()

    teuth_bin_path = os.path.join(teuth_path, 'virtualenv', 'bin')
    if not os.path.isdir(teuth_bin_path):
        raise RuntimeError("teuthology branch %s at %s not bootstrapped!" %
                           (teuthology_branch, teuth_bin_path))
    return job_config, teuth_bin_path

def schedule_job(job_config, num=1):
    """
    Schedule a job.

    :param job_config: The complete job dict
    :param num: The number of times to schedule the job
    """
    num = int(num)
    job = yaml.safe_dump(job_config)
    tube = job_config.pop('tube')
    beanstalk = teuthology.beanstalk.connect()
    beanstalk.use(tube)
    while num > 0:
        jid = beanstalk.put(
            job,
            ttr=60 * 60 * 24,
            priority=job_config['priority'],
        )
        print 'Job scheduled with name {name} and ID {jid}'.format(
            name=job_config['name'], jid=jid)
        job_config['job_id'] = str(jid)
        report.try_push_job_info(job_config, dict(status='queued'))
        num -= 1

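# Hedged usage sketch for schedule_job(): 'name', 'tube' and 'priority' are
# the keys the function itself reads; the tube name and everything else in
# this dict are hypothetical, for illustration only.
example_job_config = {
    'name': 'example-user-2021-01-01_00:00:00-rados-master-basic-smithi',
    'tube': 'smithi',        # hypothetical beanstalkd tube / machine type
    'priority': 1000,
    'description': 'illustrative job only',
}
# schedule_job(example_job_config, num=1)  # would enqueue the job once
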
def run_with_watchdog(process, job_config):
    job_start_time = datetime.utcnow()

    # Only push the information that's relevant to the watchdog, to save db
    # load
    job_info = dict(
        name=job_config['name'],
        job_id=job_config['job_id'],
    )

    # Sleep once outside of the loop to avoid double-posting jobs
    time.sleep(teuth_config.watchdog_interval)
    symlink_worker_log(job_config['worker_log'], job_config['archive_path'])
    while process.poll() is None:
        # Kill jobs that have been running longer than the global max
        run_time = datetime.utcnow() - job_start_time
        total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds
        if total_seconds > teuth_config.max_job_time:
            log.warning("Job ran longer than {max}s. Killing...".format(
                max=teuth_config.max_job_time))
            kill_job(job_info['name'], job_info['job_id'],
                     teuth_config.archive_base, job_config['owner'])

        # calling this without a status just updates the job's updated time
        report.try_push_job_info(job_info)
        time.sleep(teuth_config.watchdog_interval)

    # we no longer support testing these old branches
    assert (job_config.get('teuthology_branch') not in (
        'argonaut', 'bobtail', 'cuttlefish', 'dumpling'))

    # Let's make sure that paddles knows the job is finished. We don't know
    # the status, but if it was a pass or fail it will have already been
    # reported to paddles. In that case paddles ignores the 'dead' status.
    # If the job was killed, paddles will use the 'dead' status.
    report.try_push_job_info(job_info, dict(status='dead'))

def reimage(job_config):
    # Reimage the targets specified in job config
    # and update their keys in config after reimaging
    ctx = create_fake_context(job_config)
    # change the status during the reimaging process
    report.try_push_job_info(ctx.config, dict(status='waiting'))
    targets = job_config['targets']
    try:
        reimaged = reimage_machines(ctx, targets, job_config['machine_type'])
    except Exception as e:
        log.exception('Reimaging error. Nuking machines...')
        # Reimage failures should map to the 'dead' status instead of 'fail'
        report.try_push_job_info(
            ctx.config,
            dict(status='dead',
                 failure_reason='Error reimaging machines: ' + str(e))
        )
        nuke(ctx, True)
        raise
    ctx.config['targets'] = reimaged
    # change the status to running after the reimaging process
    report.try_push_job_info(ctx.config, dict(status='running'))

def main(args):
    verbose = args["--verbose"]
    archive = args["--archive"]
    owner = args["--owner"]
    config = args["<config>"]
    name = args["--name"]
    description = args["--description"]
    machine_type = args["--machine-type"]
    block = args["--block"]
    lock = args["--lock"]
    suite_path = args["--suite-path"]
    os_type = args["--os-type"]
    os_version = args["--os-version"]

    set_up_logging(verbose, archive)

    # print the command being run
    log.debug("Teuthology command: {0}".format(get_teuthology_command(args)))

    if owner is None:
        args["--owner"] = owner = get_user()

    config = setup_config(config)

    if archive is not None and 'archive_path' not in config:
        config['archive_path'] = archive

    write_initial_metadata(archive, config, name, description, owner)
    report.try_push_job_info(config, dict(status='running'))

    machine_type = get_machine_type(machine_type, config)
    args["--machine-type"] = machine_type

    if block:
        assert lock, \
            'the --block option is only supported with the --lock option'

    log.info(
        '\n '.join(['Config:', ] + yaml.safe_dump(
            config, default_flow_style=False).splitlines())
    )

    args["summary"] = get_summary(owner, description)

    ceph_repo = config.get('repo')
    if ceph_repo:
        teuth_config.ceph_git_url = ceph_repo
    suite_repo = config.get('suite_repo')
    if suite_repo:
        teuth_config.ceph_qa_suite_git_url = suite_repo

    # overwrite the config values of os_{type,version} if corresponding
    # command-line arguments are provided
    if os_type:
        config["os_type"] = os_type
    if os_version:
        config["os_version"] = os_version

    config["tasks"] = validate_tasks(config)

    init_tasks = get_initial_tasks(lock, config, machine_type)

    # prepend init_tasks to the front of the task list
    config['tasks'][:0] = init_tasks

    if suite_path is not None:
        config['suite_path'] = suite_path

    # fetches the tasks and returns a new suite_path if needed
    config["suite_path"] = fetch_tasks_if_needed(config)

    # If the job has a 'use_shaman' key, use that value to override the
    # global config's value.
    if config.get('use_shaman') is not None:
        teuth_config.use_shaman = config['use_shaman']

    # create a FakeNamespace instance that mimics the old argparse way of
    # doing things; we do this so we can pass it to run_tasks without porting
    # those tasks to the new way of doing things right now
    args["<config>"] = config
    fake_ctx = FakeNamespace(args)

    # store on global config if interactive-on-error, for contextutil.nested()
    # FIXME this should become more generic, and the keys should use
    # '_' uniformly
    if fake_ctx.config.get('interactive-on-error'):
        teuthology.config.config.ctx = fake_ctx

    try:
        run_tasks(tasks=config['tasks'], ctx=fake_ctx)
    finally:
        # print the results to stdout and possibly send an email on any
        # errors
        report_outcome(config, archive, fake_ctx.summary, fake_ctx)

def main(ctx):
    if ctx.owner is None:
        ctx.owner = 'scheduled_{user}'.format(user=get_user())
    read_config(ctx)

    beanstalk = teuthology.beanstalk.connect()

    tube = ctx.worker
    beanstalk.use(tube)

    if ctx.show:
        for job_id in ctx.show:
            job = beanstalk.peek(job_id)
            if job is None and ctx.verbose:
                print 'job {jid} is not in the queue'.format(jid=job_id)
            else:
                print '--- job {jid} priority {prio} ---\n'.format(
                    jid=job_id, prio=job.stats()['pri']), job.body
        return

    if ctx.delete:
        for job_id in ctx.delete:
            job = beanstalk.peek(job_id)
            if job is None:
                print 'job {jid} is not in the queue'.format(jid=job_id)
            else:
                job.delete()
                name = yaml.safe_load(job.body).get('name')
                if name:
                    report.try_delete_jobs(name, job_id)
        return

    # strip out targets; the worker will allocate new ones when we run
    # the job with --lock.
    if ctx.config.get('targets'):
        del ctx.config['targets']

    job_config = dict(
        name=ctx.name,
        last_in_suite=ctx.last_in_suite,
        email=ctx.email,
        description=ctx.description,
        owner=ctx.owner,
        verbose=ctx.verbose,
        machine_type=ctx.worker,
    )
    # Merge job_config and ctx.config
    job_config.update(ctx.config)
    if ctx.timeout is not None:
        job_config['results_timeout'] = ctx.timeout

    job = yaml.safe_dump(job_config)
    num = ctx.num
    while num > 0:
        jid = beanstalk.put(
            job,
            ttr=60 * 60 * 24,
            priority=ctx.priority,
        )
        print 'Job scheduled with name {name} and ID {jid}'.format(
            name=ctx.name, jid=jid)
        job_config['job_id'] = str(jid)
        report.try_push_job_info(job_config, dict(status='queued'))
        num -= 1

def main(args):
    # run dispatcher in job supervisor mode if --supervisor passed
    if args["--supervisor"]:
        return supervisor.main(args)

    verbose = args["--verbose"]
    tube = args["--tube"]
    log_dir = args["--log-dir"]
    archive_dir = args["--archive-dir"]

    if archive_dir is None:
        archive_dir = teuth_config.archive_base

    # set up logging for the dispatcher in {log_dir}
    loglevel = logging.INFO
    if verbose:
        loglevel = logging.DEBUG
    log.setLevel(loglevel)
    log_file_path = os.path.join(log_dir, f"dispatcher.{tube}.{os.getpid()}")
    setup_log_file(log_file_path)
    install_except_hook()

    load_config(archive_dir=archive_dir)

    connection = beanstalk.connect()
    beanstalk.watch_tube(connection, tube)
    result_proc = None

    if teuth_config.teuthology_path is None:
        fetch_teuthology('master')
    fetch_qa_suite('master')

    keep_running = True
    while keep_running:
        # Check to see if we have a teuthology-results process hanging around
        # and if so, read its return code so that it can exit.
        if result_proc is not None and result_proc.poll() is not None:
            log.debug("teuthology-results exited with code: %s",
                      result_proc.returncode)
            result_proc = None

        if sentinel(restart_file_path):
            restart()
        elif sentinel(stop_file_path):
            stop()

        load_config()

        job = connection.reserve(timeout=60)
        if job is None:
            continue

        # bury the job so it won't be re-run if it fails
        job.bury()
        job_id = job.jid
        log.info('Reserved job %d', job_id)
        log.info('Config is: %s', job.body)
        job_config = yaml.safe_load(job.body)
        job_config['job_id'] = str(job_id)

        if job_config.get('stop_worker'):
            keep_running = False

        try:
            job_config, teuth_bin_path = prep_job(
                job_config,
                log_file_path,
                archive_dir,
            )
        except SkipJob:
            continue

        # lock machines but do not reimage them
        if 'roles' in job_config:
            job_config = lock_machines(job_config)

        run_args = [
            os.path.join(teuth_bin_path, 'teuthology-dispatcher'),
            '--supervisor',
            '-v',
            '--bin-path', teuth_bin_path,
            '--archive-dir', archive_dir,
        ]

        # Create the run archive directory (if not already created) and the
        # job's archive directory
        create_job_archive(job_config['name'],
                           job_config['archive_path'],
                           archive_dir)
        job_config_path = os.path.join(job_config['archive_path'],
                                       'orig.config.yaml')

        # Write initial job config in job archive dir
        with open(job_config_path, 'w') as f:
            yaml.safe_dump(job_config, f, default_flow_style=False)
        run_args.extend(["--job-config", job_config_path])

        try:
            job_proc = subprocess.Popen(run_args)
            log.info('Job supervisor PID: %s', job_proc.pid)
        except Exception:
            error_message = "Saw error while trying to spawn supervisor."
            log.exception(error_message)
            if 'targets' in job_config:
                nuke(supervisor.create_fake_context(job_config), True)
            report.try_push_job_info(
                job_config,
                dict(status='fail', failure_reason=error_message))

        # This try/except block is to keep the worker from dying when
        # beanstalkc throws a SocketError
        try:
            job.delete()
        except Exception:
            log.exception("Saw exception while trying to delete job")

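# For illustration only: an approximation of the supervisor command line that
# the dispatcher assembles above, built here with hypothetical paths. The
# real values come from teuth_config and the job's archive directory.
import shlex

example_bin_path = '/home/teuthworker/src/teuthology/virtualenv/bin'  # hypothetical
example_archive_dir = '/home/teuthworker/archive'                     # hypothetical
example_job_config_path = example_archive_dir + '/example-run/1/orig.config.yaml'

example_run_args = [
    example_bin_path + '/teuthology-dispatcher',
    '--supervisor',
    '-v',
    '--bin-path', example_bin_path,
    '--archive-dir', example_archive_dir,
    '--job-config', example_job_config_path,
]
print(' '.join(shlex.quote(arg) for arg in example_run_args))
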
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the user has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = teuthology.lock.query.list_locks(machine_type=machine_type,
                                                    up=True,
                                                    locked=False,
                                                    count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and ctx.owner.startswith(
                'scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        try:
            newly_locked = teuthology.lock.ops.lock_many(
                ctx, requested, machine_type, ctx.owner, ctx.archive, os_type,
                os_version, arch)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info('{newly_locked} {mtype} machines locked this try, '
                 '{total_locked}/{total_requested} locked so far'.format(
                     newly_locked=len(newly_locked),
                     mtype=machine_type,
                     total_locked=len(all_locked),
                     total_requested=total_requested,
                 ))
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if teuthology.lock.query.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if teuthology.lock.ops.do_update_keys(keys_dict)[0]:
                    log.info("Error in virtual machine keys")
                newscandict = {}
                for dkey in all_locked.keys():
                    stats = teuthology.lock.query.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"

        log.info(
            "{total} machines locked ({new} new); need {more} more".format(
                total=len(all_locked), new=len(newly_locked), more=requested))
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)

    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock
        # now because we're just going to nuke (and unlock) later.
        unlock_on_failure = (ctx.config.get('unlock_on_failure', False)
                             and not ctx.config.get('nuke-on-error', False))
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].keys():
                teuthology.lock.ops.unlock_one(ctx, machine,
                                               ctx.owner, ctx.archive)

def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if the user has teuthology-locked
    machines and placed those keys in the Targets section of a yaml file.
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = teuthology.lock.query.list_locks(machine_type=machine_type,
                                                    up=True,
                                                    locked=False,
                                                    count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            else:
                assert 0, ('not enough machines free; need %s + %s, have %s' %
                           (reserved, requested, len(machines)))

        try:
            newly_locked = teuthology.lock.ops.lock_many(ctx, requested,
                                                         machine_type,
                                                         ctx.owner,
                                                         ctx.archive,
                                                         os_type,
                                                         os_version,
                                                         arch)
        except Exception:
            # Lock failures should map to the 'dead' status instead of 'fail'
            set_status(ctx.summary, 'dead')
            raise
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = []
            for lmach in all_locked:
                if teuthology.lock.query.is_vm(lmach):
                    vmlist.append(lmach)
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict.keys():
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if teuthology.lock.keys.do_update_keys(keys_dict)[0]:
                    log.info("Error in virtual machine keys")
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = teuthology.lock.query.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        else:
            requested = requested - len(newly_locked)
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"

        log.info(
            "{total} machines locked ({new} new); need {more} more".format(
                total=len(all_locked), new=len(newly_locked), more=requested)
        )
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)

    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock
        # now because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                teuthology.lock.ops.unlock_one(ctx, machine,
                                               ctx.owner, ctx.archive)
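
# The lock_machines task above follows teuthology's task shape: a generator
# where everything before `yield` is setup and the `finally` block is
# teardown, driven by the framework's own nested-context runner. A minimal,
# self-contained sketch of that shape, wrapped in contextlib.contextmanager
# only so it can run standalone; the resource handling is hypothetical.
import contextlib

@contextlib.contextmanager
def example_task(ctx, config):
    # hypothetical setup, e.g. locking machines and recording them on ctx
    ctx['resource'] = 'locked'
    try:
        yield
    finally:
        # hypothetical teardown, e.g. unlocking the machines
        ctx.pop('resource', None)

# Usage sketch:
# with example_task({}, {}):
#     pass  # run the job while the resource is held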