def archive(ctx, config): """ Handle the creation and deletion of the archive directory. """ log.info("Creating archive directory...") archive_dir = misc.get_archive_dir(ctx) run.wait(ctx.cluster.run(args=["install", "-d", "-m0755", "--", archive_dir], wait=False)) try: yield except Exception: # we need to know this below set_status(ctx.summary, "fail") raise finally: passed = get_status(ctx.summary) == "pass" if ctx.archive is not None and not (ctx.config.get("archive-on-error") and passed): log.info("Transferring archived files...") logdir = os.path.join(ctx.archive, "remote") if not os.path.exists(logdir): os.mkdir(logdir) for rem in ctx.cluster.remotes.iterkeys(): path = os.path.join(logdir, rem.shortname) misc.pull_directory(rem, archive_dir, path) # Check for coredumps and pull binaries fetch_binaries_for_coredumps(path, rem) log.info("Removing archive directory...") run.wait(ctx.cluster.run(args=["rm", "-rf", "--", archive_dir], wait=False))
def archive(ctx, config): """ Handle the creation and deletion of the archive directory. """ log.info('Creating archive directory...') archive_dir = misc.get_archive_dir(ctx) run.wait( ctx.cluster.run( args=[ 'install', '-d', '-m0755', '--', archive_dir, ], wait=False, ) ) try: yield except Exception: # we need to know this below set_status(ctx.summary, 'fail') raise finally: passed = get_status(ctx.summary) == 'pass' if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and passed): log.info('Transferring archived files...') logdir = os.path.join(ctx.archive, 'remote') if (not os.path.exists(logdir)): os.mkdir(logdir) for rem in ctx.cluster.remotes.iterkeys(): path = os.path.join(logdir, rem.shortname) misc.pull_directory(rem, archive_dir, path) # Check for coredumps and pull binaries fetch_binaries_for_coredumps(path, rem) log.info('Removing archive directory...') run.wait( ctx.cluster.run( args=[ 'rm', '-rf', '--', archive_dir, ], wait=False, ), )
def archive(ctx, config): """ Handle the creation and deletion of the archive directory. """ log.info('Creating archive directory...') archive_dir = misc.get_archive_dir(ctx) run.wait( ctx.cluster.run( args=['install', '-d', '-m0755', '--', archive_dir], wait=False, )) try: yield except Exception: # we need to know this below set_status(ctx.summary, 'fail') raise finally: passed = get_status(ctx.summary) == 'pass' if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and passed): log.info('Transferring archived files...') logdir = os.path.join(ctx.archive, 'remote') if (not os.path.exists(logdir)): os.mkdir(logdir) for rem in ctx.cluster.remotes.iterkeys(): path = os.path.join(logdir, rem.shortname) misc.pull_directory(rem, archive_dir, path) # Check for coredumps and pull binaries fetch_binaries_for_coredumps(path, rem) log.info('Removing archive directory...') run.wait( ctx.cluster.run( args=['rm', '-rf', '--', archive_dir], wait=False, ), )
def lock_machines(ctx, config): """ Lock machines. Called when the teuthology run finds and locks new machines. This is not called if the one has teuthology-locked machines and placed those keys in the Targets section of a yaml file. """ assert isinstance(config[0], int), 'config[0] must be an integer' machine_type = config[1] total_requested = config[0] # We want to make sure there are always this many machines available teuthology.lock.ops.block_and_lock_machines(ctx, total_requested, machine_type) try: yield finally: # If both unlock_on_failure and nuke-on-error are set, don't unlock now # because we're just going to nuke (and unlock) later. unlock_on_failure = (ctx.config.get('unlock_on_failure', False) and not ctx.config.get('nuke-on-error', False)) if get_status(ctx.summary) == 'pass' or unlock_on_failure: log.info('Unlocking machines...') for machine in ctx.config['targets'].keys(): teuthology.lock.ops.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config): """ Lock machines. Called when the teuthology run finds and locks new machines. This is not called if the one has teuthology-locked machines and placed those keys in the Targets section of a yaml file. """ # It's OK for os_type and os_version to be None here. If we're trying # to lock a bare metal machine, we'll take whatever is available. If # we want a vps, defaults will be provided by misc.get_distro and # misc.get_distro_version in provision.create_if_vm os_type = ctx.config.get("os_type") os_version = ctx.config.get("os_version") arch = ctx.config.get('arch') log.info('Locking machines...') assert isinstance(config[0], int), 'config[0] must be an integer' machine_type = config[1] how_many = config[0] # We want to make sure there are always this many machines available to_reserve = teuth_config.reserve_machines assert isinstance(to_reserve, int), 'reserve_machines must be integer' assert (to_reserve >= 0), 'reserve_machines should >= 0' # change the status during the locking process report.try_push_job_info(ctx.config, dict(status='waiting')) while True: # get a candidate list of machines machines = lock.list_locks(machine_type=machine_type, up=True, locked=False, count=how_many + to_reserve) if machines is None: if ctx.block: log.error('Error listing machines, trying again') time.sleep(20) continue else: raise RuntimeError('Error listing machines') # make sure there are machines for non-automated jobs to run if len(machines) < to_reserve + how_many and ctx.owner.startswith('scheduled'): if ctx.block: log.info( 'waiting for more machines to be free (need %s + %s, have %s)...', to_reserve, how_many, len(machines), ) time.sleep(10) continue else: assert 0, ('not enough machines free; need %s + %s, have %s' % (to_reserve, how_many, len(machines))) newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner, ctx.archive, os_type, os_version, arch) if not newly_locked and not isinstance(newly_locked, list): raise RuntimeError('Invalid parameters specified') if len(newly_locked) == how_many: vmlist = [] for lmach in newly_locked: if misc.is_vm(lmach): vmlist.append(lmach) if vmlist: log.info('Waiting for virtual machines to come up') keys_dict = dict() loopcount = 0 while len(keys_dict) != len(vmlist): loopcount += 1 time.sleep(10) keys_dict = lock.ssh_keyscan(vmlist) log.info('virtual machine is still unavailable') if loopcount == 40: loopcount = 0 log.info('virtual machine(s) still not up, ' + 'recreating unresponsive ones.') for guest in vmlist: if guest not in keys_dict.keys(): log.info('recreating: ' + guest) full_name = misc.canonicalize_hostname(guest) provision.destroy_if_vm(ctx, full_name) provision.create_if_vm(ctx, full_name) if lock.do_update_keys(keys_dict): log.info("Error in virtual machine keys") newscandict = {} for dkey in newly_locked.iterkeys(): stats = lockstatus.get_status(dkey) newscandict[dkey] = stats['ssh_pub_key'] ctx.config['targets'] = newscandict else: ctx.config['targets'] = newly_locked locked_targets = yaml.safe_dump( ctx.config['targets'], default_flow_style=False ).splitlines() log.info('\n '.join(['Locked targets:', ] + locked_targets)) # successfully locked machines, change status back to running report.try_push_job_info(ctx.config, dict(status='running')) break elif not ctx.block: assert 0, 'not enough machines are available' else: how_many = how_many - len(newly_locked) assert how_many > 0, "lock_machines: how_many counter went" \ "negative, this shouldn't happen" log.warn('Could not lock enough machines, waiting...') time.sleep(10) try: yield finally: # If both unlock_on_failure and nuke-on-error are set, don't unlock now # because we're just going to nuke (and unlock) later. unlock_on_failure = ( ctx.config.get('unlock_on_failure', False) and not ctx.config.get('nuke-on-error', False) ) if get_status(ctx.summary) == 'pass' or unlock_on_failure: log.info('Unlocking machines...') for machine in ctx.config['targets'].iterkeys(): lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config): """ Lock machines. Called when the teuthology run finds and locks new machines. This is not called if the one has teuthology-locked machines and placed those keys in the Targets section of a yaml file. """ # It's OK for os_type and os_version to be None here. If we're trying # to lock a bare metal machine, we'll take whatever is available. If # we want a vps, defaults will be provided by misc.get_distro and # misc.get_distro_version in provision.create_if_vm os_type = ctx.config.get("os_type") os_version = ctx.config.get("os_version") arch = ctx.config.get('arch') log.info('Locking machines...') assert isinstance(config[0], int), 'config[0] must be an integer' machine_type = config[1] total_requested = config[0] # We want to make sure there are always this many machines available reserved = teuth_config.reserve_machines assert isinstance(reserved, int), 'reserve_machines must be integer' assert (reserved >= 0), 'reserve_machines should >= 0' # change the status during the locking process report.try_push_job_info(ctx.config, dict(status='waiting')) all_locked = dict() requested = total_requested while True: # get a candidate list of machines machines = lock.list_locks(machine_type=machine_type, up=True, locked=False, count=requested + reserved) if machines is None: if ctx.block: log.error('Error listing machines, trying again') time.sleep(20) continue else: raise RuntimeError('Error listing machines') # make sure there are machines for non-automated jobs to run if len(machines) < reserved + requested and ctx.owner.startswith('scheduled'): if ctx.block: log.info( 'waiting for more %s machines to be free (need %s + %s, have %s)...', machine_type, reserved, requested, len(machines), ) time.sleep(10) continue else: assert 0, ('not enough machines free; need %s + %s, have %s' % (reserved, requested, len(machines))) newly_locked = lock.lock_many(ctx, requested, machine_type, ctx.owner, ctx.archive, os_type, os_version, arch) all_locked.update(newly_locked) log.info( '{newly_locked} {mtype} machines locked this try, ' '{total_locked}/{total_requested} locked so far'.format( newly_locked=len(newly_locked), mtype=machine_type, total_locked=len(all_locked), total_requested=total_requested, ) ) if len(all_locked) == total_requested: vmlist = [] for lmach in all_locked: if misc.is_vm(lmach): vmlist.append(lmach) if vmlist: log.info('Waiting for virtual machines to come up') keys_dict = dict() loopcount = 0 while len(keys_dict) != len(vmlist): loopcount += 1 time.sleep(10) keys_dict = misc.ssh_keyscan(vmlist) log.info('virtual machine is still unavailable') if loopcount == 40: loopcount = 0 log.info('virtual machine(s) still not up, ' + 'recreating unresponsive ones.') for guest in vmlist: if guest not in keys_dict.keys(): log.info('recreating: ' + guest) full_name = misc.canonicalize_hostname(guest) provision.destroy_if_vm(ctx, full_name) provision.create_if_vm(ctx, full_name) if lock.do_update_keys(keys_dict): log.info("Error in virtual machine keys") newscandict = {} for dkey in all_locked.iterkeys(): stats = lockstatus.get_status(dkey) newscandict[dkey] = stats['ssh_pub_key'] ctx.config['targets'] = newscandict else: ctx.config['targets'] = all_locked locked_targets = yaml.safe_dump( ctx.config['targets'], default_flow_style=False ).splitlines() log.info('\n '.join(['Locked targets:', ] + locked_targets)) # successfully locked machines, change status back to running report.try_push_job_info(ctx.config, dict(status='running')) break elif not ctx.block: assert 0, 'not enough machines are available' else: requested = requested - len(newly_locked) assert requested > 0, "lock_machines: requested counter went" \ "negative, this shouldn't happen" log.info( "{total} machines locked ({new} new); need {more} more".format( total=len(all_locked), new=len(newly_locked), more=requested) ) log.warn('Could not lock enough machines, waiting...') time.sleep(10) try: yield finally: # If both unlock_on_failure and nuke-on-error are set, don't unlock now # because we're just going to nuke (and unlock) later. unlock_on_failure = ( ctx.config.get('unlock_on_failure', False) and not ctx.config.get('nuke-on-error', False) ) if get_status(ctx.summary) == 'pass' or unlock_on_failure: log.info('Unlocking machines...') for machine in ctx.config['targets'].iterkeys(): lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def test_get_only_success_false(self): summary = dict(success=False) status = job_status.get_status(summary) assert status == 'fail'
def test_legacy_fail(self): summary = dict(success=True) summary['success'] = False status = job_status.get_status(summary) assert status == 'fail'
def report_job(self, run_name, job_id, job_info=None, dead=False): """ Report a single job to the results server. :param run_name: The name of the run. The run must already exist. :param job_id: The job's id :param job_info: The job's info dict. Optional - if not present, we look at the archive. """ if job_info is not None and not isinstance(job_info, dict): raise TypeError("job_info must be a dict") run_uri = "{base}/runs/{name}/jobs/".format( base=self.base_uri, name=run_name, ) if job_info is None: job_info = self.serializer.job_info(run_name, job_id) if dead and get_status(job_info) is None: set_status(job_info, 'dead') job_json = json.dumps(job_info) headers = {'content-type': 'application/json'} inc = random.uniform(0, 1) with safe_while(sleep=1, increment=inc, action=f'report job {job_id}') as proceed: while proceed(): response = self.session.post(run_uri, data=job_json, headers=headers) if response.status_code == 200: return # This call is wrapped in a try/except because of: # http://tracker.ceph.com/issues/8166 try: resp_json = response.json() except ValueError: resp_json = dict() if resp_json: msg = resp_json.get('message', '') else: msg = response.text if msg and msg.endswith('already exists'): job_uri = os.path.join(run_uri, job_id, '') response = self.session.put(job_uri, data=job_json, headers=headers) if response.status_code == 200: return elif msg: self.log.error( "POST to {uri} failed with status {status}: {msg}". format( uri=run_uri, status=response.status_code, msg=msg, )) response.raise_for_status()
def test_get_only_success_true(self): summary = dict(success=True) status = job_status.get_status(summary) assert status == 'pass'
def test_get_status_none(self): summary = dict() status = job_status.get_status(summary) assert status is None
def test_get_status_dead(self): summary = dict(status='dead') status = job_status.get_status(summary) assert status == 'dead'
def test_get_status_fail(self): summary = dict(status='fail') status = job_status.get_status(summary) assert status == 'fail'
def test_get_status_pass(self): summary = dict(status='pass') status = job_status.get_status(summary) assert status == 'pass'
def test_set_then_get_status_dead(self): summary = dict() job_status.set_status(summary, 'dead') status = job_status.get_status(summary) assert status == 'dead'
def lock_machines(ctx, config): """ Lock machines. Called when the teuthology run finds and locks new machines. This is not called if the one has teuthology-locked machines and placed those keys in the Targets section of a yaml file. """ # It's OK for os_type and os_version to be None here. If we're trying # to lock a bare metal machine, we'll take whatever is available. If # we want a vps, defaults will be provided by misc.get_distro and # misc.get_distro_version in provision.create_if_vm os_type = ctx.config.get("os_type") os_version = ctx.config.get("os_version") arch = ctx.config.get("arch") log.info("Locking machines...") assert isinstance(config[0], int), "config[0] must be an integer" machine_type = config[1] total_requested = config[0] # We want to make sure there are always this many machines available reserved = teuth_config.reserve_machines assert isinstance(reserved, int), "reserve_machines must be integer" assert reserved >= 0, "reserve_machines should >= 0" # change the status during the locking process report.try_push_job_info(ctx.config, dict(status="waiting")) all_locked = dict() requested = total_requested while True: # get a candidate list of machines machines = lock.list_locks(machine_type=machine_type, up=True, locked=False, count=requested + reserved) if machines is None: if ctx.block: log.error("Error listing machines, trying again") time.sleep(20) continue else: raise RuntimeError("Error listing machines") # make sure there are machines for non-automated jobs to run if len(machines) < reserved + requested and ctx.owner.startswith("scheduled"): if ctx.block: log.info( "waiting for more %s machines to be free (need %s + %s, have %s)...", machine_type, reserved, requested, len(machines), ) time.sleep(10) continue else: assert 0, "not enough machines free; need %s + %s, have %s" % (reserved, requested, len(machines)) newly_locked = lock.lock_many(ctx, requested, machine_type, ctx.owner, ctx.archive, os_type, os_version, arch) all_locked.update(newly_locked) log.info( "{newly_locked} {mtype} machines locked this try, " "{total_locked}/{total_requested} locked so far".format( newly_locked=len(newly_locked), mtype=machine_type, total_locked=len(all_locked), total_requested=total_requested, ) ) if len(all_locked) == total_requested: vmlist = [] for lmach in all_locked: if misc.is_vm(lmach): vmlist.append(lmach) if vmlist: log.info("Waiting for virtual machines to come up") keys_dict = dict() loopcount = 0 while len(keys_dict) != len(vmlist): loopcount += 1 time.sleep(10) keys_dict = misc.ssh_keyscan(vmlist) log.info("virtual machine is still unavailable") if loopcount == 40: loopcount = 0 log.info("virtual machine(s) still not up, " + "recreating unresponsive ones.") for guest in vmlist: if guest not in keys_dict.keys(): log.info("recreating: " + guest) full_name = misc.canonicalize_hostname(guest) provision.destroy_if_vm(ctx, full_name) provision.create_if_vm(ctx, full_name) if lock.do_update_keys(keys_dict): log.info("Error in virtual machine keys") newscandict = {} for dkey in all_locked.iterkeys(): stats = lockstatus.get_status(dkey) newscandict[dkey] = stats["ssh_pub_key"] ctx.config["targets"] = newscandict else: ctx.config["targets"] = all_locked locked_targets = yaml.safe_dump(ctx.config["targets"], default_flow_style=False).splitlines() log.info("\n ".join(["Locked targets:"] + locked_targets)) # successfully locked machines, change status back to running report.try_push_job_info(ctx.config, dict(status="running")) break elif not ctx.block: assert 0, "not enough machines are available" else: requested = requested - len(newly_locked) assert requested > 0, "lock_machines: requested counter went" "negative, this shouldn't happen" log.info( "{total} machines locked ({new} new); need {more} more".format( total=len(all_locked), new=len(newly_locked), more=requested ) ) log.warn("Could not lock enough machines, waiting...") time.sleep(10) try: yield finally: # If both unlock_on_failure and nuke-on-error are set, don't unlock now # because we're just going to nuke (and unlock) later. unlock_on_failure = ctx.config.get("unlock_on_failure", False) and not ctx.config.get("nuke-on-error", False) if get_status(ctx.summary) == "pass" or unlock_on_failure: log.info("Unlocking machines...") for machine in ctx.config["targets"].iterkeys(): lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config): """ Lock machines. Called when the teuthology run finds and locks new machines. This is not called if the one has teuthology-locked machines and placed those keys in the Targets section of a yaml file. """ # It's OK for os_type and os_version to be None here. If we're trying # to lock a bare metal machine, we'll take whatever is available. If # we want a vps, defaults will be provided by misc.get_distro and # misc.get_distro_version in provision.create_if_vm os_type = ctx.config.get("os_type") os_version = ctx.config.get("os_version") arch = ctx.config.get('arch') log.info('Locking machines...') assert isinstance(config[0], int), 'config[0] must be an integer' machine_type = config[1] how_many = config[0] # We want to make sure there are always this many machines available to_reserve = 5 # change the status during the locking process report.try_push_job_info(ctx.config, dict(status='waiting')) while True: # get a candidate list of machines machines = lock.list_locks(machine_type=machine_type, up=True, locked=False, count=how_many + to_reserve) if machines is None: if ctx.block: log.error('Error listing machines, trying again') time.sleep(20) continue else: raise RuntimeError('Error listing machines') # make sure there are machines for non-automated jobs to run if len(machines) < to_reserve + how_many and ctx.owner.startswith( 'scheduled'): if ctx.block: log.info( 'waiting for more machines to be free (need %s + %s, have %s)...', to_reserve, how_many, len(machines), ) time.sleep(10) continue else: assert 0, ('not enough machines free; need %s + %s, have %s' % (to_reserve, how_many, len(machines))) newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner, ctx.archive, os_type, os_version, arch) if not newly_locked and not isinstance(newly_locked, list): raise RuntimeError('Invalid parameters specified') if len(newly_locked) == how_many: vmlist = [] for lmach in newly_locked: if misc.is_vm(lmach): vmlist.append(lmach) if vmlist: log.info('Waiting for virtual machines to come up') keys_dict = dict() loopcount = 0 while len(keys_dict) != len(vmlist): loopcount += 1 time.sleep(10) keys_dict = lock.ssh_keyscan(vmlist) log.info('virtual machine is still unavailable') if loopcount == 40: loopcount = 0 log.info('virtual machine(s) still not up, ' + 'recreating unresponsive ones.') for guest in vmlist: if guest not in keys_dict.keys(): log.info('recreating: ' + guest) full_name = misc.canonicalize_hostname(guest) provision.destroy_if_vm(ctx, full_name) provision.create_if_vm(ctx, full_name) if lock.do_update_keys(keys_dict): log.info("Error in virtual machine keys") newscandict = {} for dkey in newly_locked.iterkeys(): stats = lockstatus.get_status(dkey) newscandict[dkey] = stats['ssh_pub_key'] ctx.config['targets'] = newscandict else: ctx.config['targets'] = newly_locked locked_targets = yaml.safe_dump( ctx.config['targets'], default_flow_style=False).splitlines() log.info('\n '.join([ 'Locked targets:', ] + locked_targets)) # successfully locked machines, change status back to running report.try_push_job_info(ctx.config, dict(status='running')) break elif not ctx.block: assert 0, 'not enough machines are available' log.warn('Could not lock enough machines, waiting...') time.sleep(10) try: yield finally: if ctx.config.get('unlock_on_failure', False) or \ get_status(ctx.summary) == 'pass': log.info('Unlocking machines...') for machine in ctx.config['targets'].iterkeys(): lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)