def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization

    Every remote in ``ctx.cluster.remotes`` that ``misc.is_vm`` reports as
    a VM is probed for ``/ceph-qa-ready``.  When the file is missing, the
    local ``edit_sudoers.sh`` script (next to this module) is piped into
    ``sudo sh`` on that host over ssh.  If the job has a ``kernel`` task
    but no ``ansible.cephlab`` task, ``ansible.CephLab`` is then run
    against the VM hosts.

    :param ctx: run context; ``ctx.config['tasks']`` and
                ``ctx.cluster.remotes`` are read
    :param config: task configuration (not used by this function)
    """
    all_tasks = [x.keys()[0] for x in ctx.config['tasks']]
    need_ansible = (
        'kernel' in all_tasks and 'ansible.cephlab' not in all_tasks
    )
    ansible_hosts = set()
    with parallel():
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for rem in ctx.cluster.remotes.iterkeys():
            if not misc.is_vm(rem.shortname):
                continue
            ansible_hosts.add(rem.shortname)
            r = rem.run(args=['test', '-e', '/ceph-qa-ready'],
                        stdout=StringIO(),
                        check_status=False)
            if r.returncode == 0:
                continue
            p1 = subprocess.Popen(['cat', editinfo], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                [
                    'ssh',
                    '-o', 'StrictHostKeyChecking=no',
                    '-t', '-t',
                    str(rem),
                    'sudo',
                    'sh'
                ],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() returns None for stderr and
                # a failure could never be reported.
                stderr=subprocess.STDOUT,
            )
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits
            # early, then reap both children.
            p1.stdout.close()
            output, _ = p2.communicate()
            p1.wait()
            if p2.returncode != 0:
                log.error("Edit of /etc/sudoers failed: %s", output)
    if need_ansible and ansible_hosts:
        log.info("Running ansible on %s", list(ansible_hosts))
        ansible_config = dict(
            hosts=list(ansible_hosts),
        )
        with ansible.CephLab(ctx, config=ansible_config):
            pass
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization

    Every remote in ``ctx.cluster.remotes`` whose short name ``misc.is_vm``
    reports as a VM is probed for ``/ceph-qa-ready``.  When the file is
    missing, the local ``edit_sudoers.sh`` script is piped into ``sudo sh``
    on that host over ssh and, if the job contains a ``chef`` or ``kernel``
    task, ``_download_and_run_chef`` is spawned for the host.

    :param ctx: run context; ``ctx.config['tasks']`` and
                ``ctx.cluster.remotes`` are read
    :param config: task configuration (not used by this function)
    """
    all_tasks = [x.keys()[0] for x in ctx.config['tasks']]
    need_chef = 'chef' in all_tasks or 'kernel' in all_tasks
    with parallel() as p:
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for rem in ctx.cluster.remotes.iterkeys():
            mname = rem.shortname
            if not misc.is_vm(mname):
                continue
            r = rem.run(args=['test', '-e', '/ceph-qa-ready'],
                        stdout=StringIO(),
                        check_status=False)
            if r.returncode == 0:
                continue
            p1 = subprocess.Popen(['cat', editinfo], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                [
                    'ssh',
                    '-o', 'StrictHostKeyChecking=no',
                    '-t', '-t',
                    str(rem),
                    'sudo',
                    'sh'
                ],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() returns None for stderr and
                # a failure could never be reported.
                stderr=subprocess.STDOUT,
            )
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits
            # early, then reap both children.
            p1.stdout.close()
            output, _ = p2.communicate()
            p1.wait()
            if p2.returncode != 0:
                log.error("Edit of /etc/sudoers failed: %s", output)
            if need_chef:
                p.spawn(_download_and_run_chef, rem)
def connect(ctx, config):
    """
    Open a connection to every host in ``ctx.config['targets']`` and
    populate ``ctx.cluster`` with the resulting remotes.

    Host-key verification is disabled (``host_key=None``) for all targets
    when ``ctx.config['sshkeys']`` is ``'ignore'`` or when at least one
    target is a virtual machine.  If ``ctx.config`` has a ``roles`` entry,
    remotes are paired with role lists in order; otherwise each remote is
    added under its own name.

    :param ctx: run context; ``ctx.config`` is read, ``ctx.cluster`` is set
    :param config: task configuration (not used by this function)
    """
    log.info("Opening connections...")
    from ..orchestra import connection, remote
    from ..orchestra import cluster

    targets = ctx.config["targets"]

    try:
        ignore_keys = ctx.config["sshkeys"] == "ignore"
    except (AttributeError, KeyError):
        ignore_keys = False

    # The answer is the same for every target, so scan the target list
    # once instead of once per connection.
    any_vm = any(teuthology.is_vm(machine) for machine in targets.iterkeys())

    remotes = []
    for t, key in targets.iteritems():
        log.debug("connecting to %s", t)
        if ignore_keys or any_vm:
            key = None
        remotes.append(
            remote.Remote(
                name=t,
                ssh=connection.connect(user_at_host=t,
                                       host_key=key,
                                       keep_alive=True),
                console=None,
            )
        )

    ctx.cluster = cluster.Cluster()
    if "roles" in ctx.config:
        for rem, roles in zip(remotes, ctx.config["roles"]):
            assert all(isinstance(role, str) for role in roles), \
                "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info("roles: %s - %s", rem, roles)
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
def connect(ctx, config):
    """
    Open a connection to a remote host.

    A ``Remote`` is created for every entry of ``ctx.config['targets']``
    and collected into ``ctx.cluster``.  The host key is dropped
    (``host_key=None``) when ``ctx.config['sshkeys']`` is ``'ignore'``, or
    when the key is an ``ssh-rsa``/``ssh-dss`` key and the target is a
    virtual machine.  If ``ctx.config`` has a ``roles`` entry, remotes are
    paired with role lists in order; otherwise each remote is added under
    its own name.

    :param ctx: run context; ``ctx.config`` is read, ``ctx.cluster`` is set
    :param config: task configuration (not used by this function)
    """
    log.info('Opening connections...')
    from ..orchestra import remote
    from ..orchestra import cluster

    try:
        ignore_keys = ctx.config['sshkeys'] == 'ignore'
    except (AttributeError, KeyError):
        ignore_keys = False

    remotes = []
    for t, key in ctx.config['targets'].iteritems():
        log.debug('connecting to %s', t)
        if ignore_keys:
            key = None
        # key is None when keys are ignored (or the target has no key);
        # calling startswith on it would raise AttributeError.
        if key is not None and key.startswith(('ssh-rsa ', 'ssh-dss ')):
            if teuthology.is_vm(t):
                key = None
        remotes.append(
            remote.Remote(name=t, host_key=key, keep_alive=True, console=None))

    ctx.cluster = cluster.Cluster()
    if 'roles' in ctx.config:
        for rem, roles in zip(remotes, ctx.config['roles']):
            assert all(isinstance(role, str) for role in roles), \
                "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info('roles: %s - %s', rem, roles)
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization

    The host part of each remote's ``user@host`` string is extracted and
    passed to ``teuthology.is_vm``.  VMs are probed for
    ``/ceph-qa-ready``; when the file is missing, the local
    ``edit_sudoers.sh`` script is piped into ``sudo sh`` on the host over
    ssh and ``_handle_vm_init`` is spawned for it.

    :param ctx: run context; ``ctx.cluster.remotes`` is read
    :param config: task configuration (not used by this function)
    """
    with parallel() as p:
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for remote in ctx.cluster.remotes.iterkeys():
            # Raw string: same pattern, without invalid "\." string escapes.
            mname = re.match(r".*@([^\.]*)\.?.*", str(remote)).group(1)
            if not teuthology.is_vm(mname):
                continue
            r = remote.run(
                args=['test', '-e', '/ceph-qa-ready'],
                stdout=StringIO(),
                check_status=False,
            )
            if r.exitstatus == 0:
                continue
            p1 = subprocess.Popen(['cat', editinfo], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                ['ssh', '-t', '-t', str(remote), 'sudo', 'sh'],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() returns None for stderr and
                # a failure could never be reported.
                stderr=subprocess.STDOUT,
            )
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits
            # early, then reap both children.
            p1.stdout.close()
            output, _ = p2.communicate()
            p1.wait()
            if p2.returncode != 0:
                log.info("Edit of /etc/sudoers failed: %s", output)
            p.spawn(_handle_vm_init, remote)
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization

    Every remote in ``ctx.cluster.remotes`` that ``misc.is_vm`` reports as
    a VM is probed for ``/ceph-qa-ready``.  When the file is missing, the
    local ``edit_sudoers.sh`` script (next to this module) is piped into
    ``sudo sh`` on that host over ssh.  If the job has a ``kernel`` task
    but no ``ansible.cephlab`` task, ``ansible.CephLab`` is then run
    against the VM hosts.

    :param ctx: run context; ``ctx.config["tasks"]`` and
                ``ctx.cluster.remotes`` are read
    :param config: task configuration (not used by this function)
    """
    all_tasks = [x.keys()[0] for x in ctx.config["tasks"]]
    need_ansible = (
        "kernel" in all_tasks and "ansible.cephlab" not in all_tasks
    )
    ansible_hosts = set()
    with parallel():
        editinfo = os.path.join(os.path.dirname(__file__), "edit_sudoers.sh")
        for rem in ctx.cluster.remotes.iterkeys():
            if not misc.is_vm(rem.shortname):
                continue
            ansible_hosts.add(rem.shortname)
            r = rem.run(args=["test", "-e", "/ceph-qa-ready"],
                        stdout=StringIO(),
                        check_status=False)
            if r.returncode == 0:
                continue
            p1 = subprocess.Popen(["cat", editinfo], stdout=subprocess.PIPE)
            p2 = subprocess.Popen(
                ["ssh", "-o", "StrictHostKeyChecking=no", "-t", "-t",
                 str(rem), "sudo", "sh"],
                stdin=p1.stdout,
                stdout=subprocess.PIPE,
                # Without this, communicate() returns None for stderr and
                # a failure could never be reported.
                stderr=subprocess.STDOUT,
            )
            # Drop our copy of the pipe so cat gets SIGPIPE if ssh exits
            # early, then reap both children.
            p1.stdout.close()
            output, _ = p2.communicate()
            p1.wait()
            if p2.returncode != 0:
                log.error("Edit of /etc/sudoers failed: %s", output)
    if need_ansible and ansible_hosts:
        log.info("Running ansible on %s", list(ansible_hosts))
        ansible_config = dict(hosts=list(ansible_hosts))
        with ansible.CephLab(ctx, config=ansible_config):
            pass
def getRemoteConsole(name, ipmiuser=None, ipmipass=None, ipmidomain=None,
                     logfile=None, timeout=20):
    """
    Build the console object matching the kind of host ``name`` is.

    Hosts that ``misc.is_vm`` reports as virtual get a
    ``console.VirtualConsole`` built from the name alone; every other host
    gets a ``console.PhysicalConsole`` that also receives the IPMI
    settings, log file and timeout.
    """
    if not misc.is_vm(name):
        return console.PhysicalConsole(
            name, ipmiuser, ipmipass, ipmidomain, logfile, timeout)
    return console.VirtualConsole(name)
def getRemoteConsole(name, ipmiuser, ipmipass, ipmidomain, logfile=None,
                     timeout=20):
    """
    Build the console object matching the kind of host ``name`` is.

    ``VirtualConsole`` is used when ``misc.is_vm`` reports the host as a
    virtual machine, ``PhysicalConsole`` otherwise.  Both are constructed
    with the same six arguments.
    """
    console_cls = VirtualConsole if misc.is_vm(name) else PhysicalConsole
    return console_cls(name, ipmiuser, ipmipass, ipmidomain, logfile,
                       timeout)
def getRemoteConsole(name, ipmiuser, ipmipass, ipmidomain, logfile=None,
                     timeout=20):
    """
    Return a ``VirtualConsole`` when ``misc.is_vm(name)`` is true and a
    ``PhysicalConsole`` otherwise; both receive the same arguments.
    """
    console_args = (name, ipmiuser, ipmipass, ipmidomain, logfile, timeout)
    if not misc.is_vm(name):
        return PhysicalConsole(*console_args)
    return VirtualConsole(*console_args)
def chcon(self, file_path, context):
    """
    Apply an SELinux context to a file on this host.

    Nothing is done for hosts whose package type is not ``rpm`` or that
    are virtual machines, because ours currently have SELinux disabled.

    :param file_path: The path to the file
    :param context:   The SELinux context to be used
    """
    not_applicable = (
        self.os.package_type != 'rpm' or misc.is_vm(self.shortname)
    )
    if not_applicable:
        return
    command = "sudo chcon {con} {path}".format(con=context, path=file_path)
    self.run(args=command)
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization

    The host part of each remote's ``user@host`` string is extracted and
    passed to ``teuthology.is_vm``.  VMs are probed for
    ``/ceph-qa-ready``; when the file is missing, the local
    ``edit_sudoers.sh`` script is piped into ``sudo sh`` on the host over
    ssh and ``_handle_vm_init`` is spawned for it.

    :param ctx: run context; ``ctx.cluster.remotes`` is read
    :param config: task configuration (not used by this function)
    """
    with parallel() as p:
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for remote in ctx.cluster.remotes.iterkeys():
            # Raw string: same pattern, without invalid "\." string escapes.
            mname = re.match(r".*@([^\.]*)\.?.*", str(remote)).group(1)
            if teuthology.is_vm(mname):
                r = remote.run(args=['test', '-e', '/ceph-qa-ready'],
                               stdout=StringIO(),
                               check_status=False)
                if r.exitstatus != 0:
                    p1 = subprocess.Popen(['cat', editinfo],
                                          stdout=subprocess.PIPE)
                    p2 = subprocess.Popen(
                        ['ssh', '-t', '-t', str(remote), 'sudo', 'sh'],
                        stdin=p1.stdout,
                        stdout=subprocess.PIPE,
                        # Without this, communicate() returns None for
                        # stderr and a failure could never be reported.
                        stderr=subprocess.STDOUT,
                    )
                    # Drop our copy of the pipe so cat gets SIGPIPE if ssh
                    # exits early, then reap both children.
                    p1.stdout.close()
                    output, _ = p2.communicate()
                    p1.wait()
                    if p2.returncode != 0:
                        log.info("Edit of /etc/sudoers failed: %s", output)
                    p.spawn(_handle_vm_init, remote)
def vm_setup(ctx, config):
    """
    Look for virtual machines and handle their initialization

    Every remote in ``ctx.cluster.remotes`` whose short name ``misc.is_vm``
    reports as a VM is probed for ``/ceph-qa-ready``.  When the file is
    missing, the local ``edit_sudoers.sh`` script is piped into ``sudo sh``
    on that host over ssh and, if the job contains a ``chef`` or ``kernel``
    task, ``_download_and_run_chef`` is spawned for the host.

    :param ctx: run context; ``ctx.config['tasks']`` and
                ``ctx.cluster.remotes`` are read
    :param config: task configuration (not used by this function)
    """
    all_tasks = [x.keys()[0] for x in ctx.config['tasks']]
    need_chef = 'chef' in all_tasks or 'kernel' in all_tasks
    with parallel() as p:
        editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh')
        for rem in ctx.cluster.remotes.iterkeys():
            mname = rem.shortname
            if misc.is_vm(mname):
                r = rem.run(
                    args=['test', '-e', '/ceph-qa-ready'],
                    stdout=StringIO(),
                    check_status=False,
                )
                if r.returncode != 0:
                    p1 = subprocess.Popen(['cat', editinfo],
                                          stdout=subprocess.PIPE)
                    p2 = subprocess.Popen(
                        [
                            'ssh', '-o', 'StrictHostKeyChecking=no',
                            '-t', '-t', str(rem), 'sudo', 'sh'
                        ],
                        stdin=p1.stdout,
                        stdout=subprocess.PIPE,
                        # Without this, communicate() returns None for
                        # stderr and a failure could never be reported.
                        stderr=subprocess.STDOUT,
                    )
                    # Drop our copy of the pipe so cat gets SIGPIPE if ssh
                    # exits early, then reap both children.
                    p1.stdout.close()
                    output, _ = p2.communicate()
                    p1.wait()
                    if p2.returncode != 0:
                        log.error("Edit of /etc/sudoers failed: %s", output)
                    if need_chef:
                        p.spawn(_download_and_run_chef, rem)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed their keys in the Targets section of a yaml file.

    Loops until ``config[0]`` machines of type ``config[1]`` are locked
    (retrying only when ``ctx.block`` is set), waits for any virtual
    machines among them to answer a keyscan, stores the locked machines in
    ``ctx.config['targets']`` and then yields.  After the yield, the
    machines are unlocked if ``unlock_on_failure`` is set in the config or
    the summary reports success.

    :param ctx: run context (``block``, ``owner``, ``archive``, ``config``
                and ``summary`` are read)
    :param config: sequence of ``(how_many, machine_type)``
    :raises AssertionError: if machines cannot be listed or locked while
                            ``ctx.block`` is false, or too few are up
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    machine_types = teuthology.get_multi_machine_types(machine_type)
    how_many = config[0]

    while True:
        # make sure there are enough machines up
        machines = lock.list_locks()
        if machines is None:
            if ctx.block:
                log.warning('error listing machines, trying again')
                time.sleep(20)
                continue
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError('error listing machines')

        up_machines = [
            machine for machine in machines
            if machine['up'] and machine['type'] in machine_types
        ]
        if len(up_machines) < how_many:
            raise AssertionError('not enough machines are up')

        # make sure there are machines for non-automated jobs to run
        up_and_free = [
            machine for machine in up_machines if machine['locked'] == 0
        ]
        num_free = len(up_and_free)
        if num_free < 6 and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s see %s)...',
                    how_many,
                    num_free,
                )
                time.sleep(10)
                continue
            raise AssertionError('not enough machines free')

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive)
        if len(newly_locked) == how_many:
            vmlist = [lmach for lmach in newly_locked
                      if teuthology.is_vm(lmach)]
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keyscan_out = ''
                loopcount = 0
                while len(keyscan_out.splitlines()) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keyscan_out, current_locks = lock.keyscan_check(ctx,
                                                                    vmlist)
                    if len(keyscan_out.splitlines()) == len(vmlist):
                        # Every VM answered; don't report it as unavailable.
                        break
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keyscan_out:
                                log.info('recreating: ' + guest)
                                lock.destroy_if_vm(ctx, 'ubuntu@' + guest)
                                lock.create_if_vm(ctx, 'ubuntu@' + guest)
                if lock.update_keys(ctx, keyscan_out, current_locks):
                    log.info("Error in virtual machine keys")
                # Re-read the ssh public keys now that the VMs are up.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(ctx, dkey)
                    newscandict[dkey] = stats['sshpubkey']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            # FIXME: Ugh.
            log.info('\n '.join(
                ['Locked targets:', ] +
                yaml.safe_dump(ctx.config['targets'],
                               default_flow_style=False).splitlines()))
            break
        elif not ctx.block:
            raise AssertionError('not enough machines are available')

        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.config.get('unlock_on_failure', False) or \
                ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed their keys in the Targets section of a yaml file.

    Loops until ``config[0]`` machines of type ``config[1]`` are locked
    (retrying only when ``ctx.block`` is set), waits for any virtual
    machines among them to answer an ssh keyscan, stores the locked
    machines in ``ctx.config['targets']`` and then yields.  After the
    yield, the machines are unlocked if ``unlock_on_failure`` is set in
    the config or the summary reports success.

    :param ctx: run context (``block``, ``owner``, ``archive``, ``config``
                and ``summary`` are read)
    :param config: sequence of ``(how_many, machine_type)``
    :raises RuntimeError: if machines cannot be listed while ``ctx.block``
                          is false, or ``lock_many`` rejects the request
    :raises AssertionError: if too few machines are free or lockable while
                            ``ctx.block`` is false
    """
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = 5

    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run.
        # The reserve must still be free after this job takes how_many.
        if len(machines) < to_reserve + how_many and \
                ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s see %s)...',
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError('not enough machines free')

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive)
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            vmlist = [lmach for lmach in newly_locked if misc.is_vm(lmach)]
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    if len(keys_dict) == len(vmlist):
                        # Every VM answered; don't report it as unavailable.
                        break
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict:
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read the ssh public keys now that the VMs are up.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            # FIXME: Ugh.
            log.info('\n '.join(
                ['Locked targets:', ] +
                yaml.safe_dump(ctx.config['targets'],
                               default_flow_style=False).splitlines()))
            break
        elif not ctx.block:
            raise AssertionError('not enough machines are available')

        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.config.get('unlock_on_failure', False) or \
                ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed their keys in the Targets section of a yaml file.

    Loops until ``config[0]`` machines of type ``config[1]`` are locked
    (retrying only when ``ctx.block`` is set).  Machines locked on earlier
    tries are kept, so a request may be satisfied over several tries.
    Once complete, any virtual machines are waited for via ssh keyscan,
    the locked machines are stored in ``ctx.config['targets']`` and the
    function yields.  After the yield, the machines are unlocked if the
    run passed, or if ``unlock_on_failure`` is set and ``nuke-on-error``
    is not.

    :param ctx: run context (``block``, ``owner``, ``archive``, ``config``
                and ``summary`` are read)
    :param config: sequence of ``(how_many, machine_type)``
    :raises RuntimeError: if machines cannot be listed while ``ctx.block``
                          is false, or ``lock_many`` rejects the request
    :raises AssertionError: if too few machines are free or lockable while
                            ``ctx.block`` is false
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = teuth_config.reserve_machines
    assert isinstance(to_reserve, int), 'reserve_machines must be integer'
    assert (to_reserve >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    # Everything locked so far; without this, machines from earlier
    # partial tries would be left out of the targets and never unlocked.
    all_locked = dict()
    # Number of machines still to be locked.
    how_many = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < to_reserve + how_many and \
                ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s + %s, have %s)...',
                    to_reserve,
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError(
                'not enough machines free; need %s + %s, have %s' %
                (to_reserve, how_many, len(machines)))

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        all_locked.update(newly_locked)
        if len(all_locked) == total_requested:
            vmlist = [lmach for lmach in all_locked if misc.is_vm(lmach)]
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    if len(keys_dict) == len(vmlist):
                        # Every VM answered; don't report it as unavailable.
                        break
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict:
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read the ssh public keys now that the VMs are up.
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            raise AssertionError('not enough machines are available')
        else:
            how_many = how_many - len(newly_locked)
            assert how_many > 0, "lock_machines: how_many counter went " \
                "negative, this shouldn't happen"

        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed their keys in the Targets section of a yaml file.

    Loops until ``config[0]`` machines of type ``config[1]`` are locked
    (retrying only when ``ctx.block`` is set), accumulating the machines
    locked on each try.  Once complete, any virtual machines are waited
    for via ssh keyscan, the locked machines are stored in
    ``ctx.config['targets']`` and the function yields.  After the yield,
    the machines are unlocked if the run passed, or if
    ``unlock_on_failure`` is set and ``nuke-on-error`` is not.

    :param ctx: run context (``block``, ``owner``, ``archive``, ``config``
                and ``summary`` are read)
    :param config: sequence of ``(total_requested, machine_type)``
    :raises RuntimeError: if machines cannot be listed while ``ctx.block``
                          is false
    :raises AssertionError: if too few machines are free or lockable while
                            ``ctx.block`` is false
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), 'reserve_machines must be integer'
    assert (reserved >= 0), 'reserve_machines should >= 0'

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and \
                ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more %s machines to be free (need %s + %s, have %s)...',
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError(
                'not enough machines free; need %s + %s, have %s' %
                (reserved, requested, len(machines)))

        newly_locked = lock.lock_many(ctx, requested, machine_type,
                                      ctx.owner, ctx.archive, os_type,
                                      os_version, arch)
        all_locked.update(newly_locked)
        log.info(
            '{newly_locked} {mtype} machines locked this try, '
            '{total_locked}/{total_requested} locked so far'.format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = [lmach for lmach in all_locked if misc.is_vm(lmach)]
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    if len(keys_dict) == len(vmlist):
                        # Every VM answered; don't report it as unavailable.
                        break
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict:
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read the ssh public keys now that the VMs are up.
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False
            ).splitlines()
            log.info('\n '.join(['Locked targets:', ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            raise AssertionError('not enough machines are available')
        else:
            requested = requested - len(newly_locked)
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"

        log.info(
            "{total} machines locked ({new} new); need {more} more".format(
                total=len(all_locked), new=len(newly_locked), more=requested)
        )
        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get('unlock_on_failure', False)
            and not ctx.config.get('nuke-on-error', False)
        )
        if get_status(ctx.summary) == 'pass' or unlock_on_failure:
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed their keys in the Targets section of a yaml file.

    Loops until ``config[0]`` machines of type ``config[1]`` are locked
    (retrying only when ``ctx.block`` is set), accumulating the machines
    locked on each try.  Once complete, any virtual machines are waited
    for via ssh keyscan, the locked machines are stored in
    ``ctx.config["targets"]`` and the function yields.  After the yield,
    the machines are unlocked if the run passed, or if
    ``unlock_on_failure`` is set and ``nuke-on-error`` is not.

    :param ctx: run context (``block``, ``owner``, ``archive``, ``config``
                and ``summary`` are read)
    :param config: sequence of ``(total_requested, machine_type)``
    :raises RuntimeError: if machines cannot be listed while ``ctx.block``
                          is false
    :raises AssertionError: if too few machines are free or lockable while
                            ``ctx.block`` is false
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get("arch")
    log.info("Locking machines...")
    assert isinstance(config[0], int), "config[0] must be an integer"
    machine_type = config[1]
    total_requested = config[0]
    # We want to make sure there are always this many machines available
    reserved = teuth_config.reserve_machines
    assert isinstance(reserved, int), "reserve_machines must be integer"
    assert reserved >= 0, "reserve_machines should >= 0"

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status="waiting"))

    all_locked = dict()
    requested = total_requested
    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=requested + reserved)
        if machines is None:
            if ctx.block:
                log.error("Error listing machines, trying again")
                time.sleep(20)
                continue
            raise RuntimeError("Error listing machines")

        # make sure there are machines for non-automated jobs to run
        if len(machines) < reserved + requested and \
                ctx.owner.startswith("scheduled"):
            if ctx.block:
                log.info(
                    "waiting for more %s machines to be free (need %s + %s, have %s)...",
                    machine_type,
                    reserved,
                    requested,
                    len(machines),
                )
                time.sleep(10)
                continue
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError(
                "not enough machines free; need %s + %s, have %s" %
                (reserved, requested, len(machines)))

        newly_locked = lock.lock_many(ctx, requested, machine_type,
                                      ctx.owner, ctx.archive, os_type,
                                      os_version, arch)
        all_locked.update(newly_locked)
        log.info(
            "{newly_locked} {mtype} machines locked this try, "
            "{total_locked}/{total_requested} locked so far".format(
                newly_locked=len(newly_locked),
                mtype=machine_type,
                total_locked=len(all_locked),
                total_requested=total_requested,
            )
        )
        if len(all_locked) == total_requested:
            vmlist = [lmach for lmach in all_locked if misc.is_vm(lmach)]
            if vmlist:
                log.info("Waiting for virtual machines to come up")
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = misc.ssh_keyscan(vmlist)
                    if len(keys_dict) == len(vmlist):
                        # Every VM answered; don't report it as unavailable.
                        break
                    log.info("virtual machine is still unavailable")
                    if loopcount == 40:
                        loopcount = 0
                        log.info("virtual machine(s) still not up, " +
                                 "recreating unresponsive ones.")
                        for guest in vmlist:
                            if guest not in keys_dict:
                                log.info("recreating: " + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read the ssh public keys now that the VMs are up.
                newscandict = {}
                for dkey in all_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats["ssh_pub_key"]
                ctx.config["targets"] = newscandict
            else:
                ctx.config["targets"] = all_locked
            locked_targets = yaml.safe_dump(
                ctx.config["targets"], default_flow_style=False
            ).splitlines()
            log.info("\n ".join(["Locked targets:"] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status="running"))
            break
        elif not ctx.block:
            raise AssertionError("not enough machines are available")
        else:
            requested = requested - len(newly_locked)
            assert requested > 0, "lock_machines: requested counter went " \
                "negative, this shouldn't happen"

        log.info(
            "{total} machines locked ({new} new); need {more} more".format(
                total=len(all_locked), new=len(newly_locked), more=requested
            )
        )
        log.warning("Could not lock enough machines, waiting...")
        time.sleep(10)
    try:
        yield
    finally:
        # If both unlock_on_failure and nuke-on-error are set, don't unlock now
        # because we're just going to nuke (and unlock) later.
        unlock_on_failure = (
            ctx.config.get("unlock_on_failure", False)
            and not ctx.config.get("nuke-on-error", False)
        )
        if get_status(ctx.summary) == "pass" or unlock_on_failure:
            log.info("Unlocking machines...")
            for machine in ctx.config["targets"].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)
def syslog(ctx, config):
    """
    Start syslog monitoring on every remote and tear it down on exit.

    When ``ctx.archive`` is None the feature is skipped entirely.
    Otherwise an rsyslog drop-in is written that sends kernel and other
    messages to files under the archive directory, and rsyslog is
    restarted.  After the yield, the drop-in is removed, the collected
    logs are scanned for known-bad markers (a hit marks the run as
    failed) and the logs are gzipped.

    :param ctx: run context (``archive``, ``cluster`` and ``summary`` are
                used)
    :param config: task configuration (not used by this function)
    """
    if ctx.archive is None:
        # Nothing is going to be archived, so there is no point
        # collecting anything.
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = '{adir}/syslog'.format(adir=archive_dir)
    mkdir_procs = ctx.cluster.run(
        args=['mkdir', '-p', '-m0755', '--', log_dir],
        wait=False,
    )
    run.wait(mkdir_procs)

    conf_path = '/etc/rsyslog.d/80-cephtest.conf'
    kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir)
    misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir)
    kern_rule = 'kern.* -{kern_log};RSYSLOG_FileFormat'.format(
        kern_log=kern_log)
    misc_rule = '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format(
        misc_log=misc_log)
    conf_fp = StringIO('\n'.join([kern_rule, misc_rule]))

    # Lines matching any of these are filtered out of the error scan.
    ignored_patterns = (
        'task .* blocked for more than .* seconds',
        'lockdep is turned off',
        'trying to register non-static key',
        'DEBUG: fsize',  # xfs_fsr
        'CRON',  # ignore cron noise
        'BUG: bad unlock balance detected',  # #6097
        'inconsistent lock state',  # FIXME see #2523
        '*** DEADLOCK ***',  # part of lockdep output
        'INFO: possible irq lock inversion dependency detected',  # FIXME see #2590 and #147
        'INFO: NMI handler (perf_event_nmi_handler) took too long to run',
        'INFO: recovery required on readonly',
    )

    try:
        for rem in ctx.cluster.remotes.iterkeys():
            # Exclude downburst VMs for now; they have SELinux disabled
            needs_label = (rem.os.package_type == 'rpm'
                           and not misc.is_vm(rem.shortname))
            if needs_label:
                log_context = 'system_u:object_r:var_log_t:s0'
                for log_path in (kern_log, misc_log):
                    touch_cmd = "touch {log} && sudo chcon {con} {log}".format(
                        log=log_path, con=log_context)
                    rem.run(args=touch_cmd)
            misc.sudo_write_file(remote=rem, path=conf_path, data=conf_fp)
            # Rewind so the next remote receives the full file again.
            conf_fp.seek(0)
        # a mere reload (SIGHUP) doesn't seem to make rsyslog open the
        # files, hence the restart
        restart_procs = ctx.cluster.run(
            args=['sudo', 'service', 'rsyslog', 'restart'],
            wait=False,
        )
        run.wait(restart_procs)

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        stop_procs = ctx.cluster.run(
            args=[
                'sudo', 'rm', '-f', '--', conf_path,
                run.Raw('&&'),
                'sudo', 'service', 'rsyslog', 'restart',
            ],
            wait=False,
        )
        run.wait(stop_procs)
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        log.info('Checking logs for errors...')
        for rem in ctx.cluster.remotes.iterkeys():
            log.debug('Checking %s', rem.name)
            scan_args = [
                'egrep', '--binary-files=text',
                '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
            ]
            for pattern in ignored_patterns:
                scan_args.extend([run.Raw('|'), 'grep', '-v', pattern])
            scan_args.extend([run.Raw('|'), 'head', '-n', '1'])
            r = rem.run(args=scan_args, stdout=StringIO())
            found = r.stdout.getvalue()
            if found == '':
                continue
            log.error('Error in syslog on %s: %s', rem.name, found)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    "'{error}' in syslog".format(error=found)

        log.info('Compressing syslogs...')
        gzip_procs = ctx.cluster.run(
            args=[
                'find', '{adir}/syslog'.format(adir=archive_dir),
                '-name', '*.log', '-print0',
                run.Raw('|'),
                'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                'gzip', '--',
            ],
            wait=False,
        )
        run.wait(gzip_procs)
def lock_machines(ctx, config):
    """
    Lock machines.  Called when the teuthology run finds and locks
    new machines.  This is not called if one has already teuthology-locked
    machines and placed their keys in the Targets section of a yaml file.

    Loops until ``config[0]`` machines of type ``config[1]`` are locked
    (retrying only when ``ctx.block`` is set), waits for any virtual
    machines among them to answer an ssh keyscan, stores the locked
    machines in ``ctx.config['targets']`` and then yields.  After the
    yield, the machines are unlocked if ``unlock_on_failure`` is set in
    the config or the run passed.

    :param ctx: run context (``block``, ``owner``, ``archive``, ``config``
                and ``summary`` are read)
    :param config: sequence of ``(how_many, machine_type)``
    :raises RuntimeError: if machines cannot be listed while ``ctx.block``
                          is false, or ``lock_many`` rejects the request
    :raises AssertionError: if too few machines are free or lockable while
                            ``ctx.block`` is false
    """
    # It's OK for os_type and os_version to be None here.  If we're trying
    # to lock a bare metal machine, we'll take whatever is available.  If
    # we want a vps, defaults will be provided by misc.get_distro and
    # misc.get_distro_version in provision.create_if_vm
    os_type = ctx.config.get("os_type")
    os_version = ctx.config.get("os_version")
    arch = ctx.config.get('arch')
    log.info('Locking machines...')
    assert isinstance(config[0], int), 'config[0] must be an integer'
    machine_type = config[1]
    how_many = config[0]
    # We want to make sure there are always this many machines available
    to_reserve = 5

    # change the status during the locking process
    report.try_push_job_info(ctx.config, dict(status='waiting'))

    while True:
        # get a candidate list of machines
        machines = lock.list_locks(machine_type=machine_type, up=True,
                                   locked=False, count=how_many + to_reserve)
        if machines is None:
            if ctx.block:
                log.error('Error listing machines, trying again')
                time.sleep(20)
                continue
            raise RuntimeError('Error listing machines')

        # make sure there are machines for non-automated jobs to run
        if len(machines) < to_reserve + how_many and \
                ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info(
                    'waiting for more machines to be free (need %s + %s, have %s)...',
                    to_reserve,
                    how_many,
                    len(machines),
                )
                time.sleep(10)
                continue
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError(
                'not enough machines free; need %s + %s, have %s' %
                (to_reserve, how_many, len(machines)))

        newly_locked = lock.lock_many(ctx, how_many, machine_type, ctx.owner,
                                      ctx.archive, os_type, os_version, arch)
        if not newly_locked and not isinstance(newly_locked, list):
            raise RuntimeError('Invalid parameters specified')
        if len(newly_locked) == how_many:
            vmlist = [lmach for lmach in newly_locked if misc.is_vm(lmach)]
            if vmlist:
                log.info('Waiting for virtual machines to come up')
                keys_dict = dict()
                loopcount = 0
                while len(keys_dict) != len(vmlist):
                    loopcount += 1
                    time.sleep(10)
                    keys_dict = lock.ssh_keyscan(vmlist)
                    if len(keys_dict) == len(vmlist):
                        # Every VM answered; don't report it as unavailable.
                        break
                    log.info('virtual machine is still unavailable')
                    if loopcount == 40:
                        loopcount = 0
                        log.info('virtual machine(s) still not up, ' +
                                 'recreating unresponsive ones.')
                        for guest in vmlist:
                            if guest not in keys_dict:
                                log.info('recreating: ' + guest)
                                full_name = misc.canonicalize_hostname(guest)
                                provision.destroy_if_vm(ctx, full_name)
                                provision.create_if_vm(ctx, full_name)
                if lock.do_update_keys(keys_dict):
                    log.info("Error in virtual machine keys")
                # Re-read the ssh public keys now that the VMs are up.
                newscandict = {}
                for dkey in newly_locked.iterkeys():
                    stats = lockstatus.get_status(dkey)
                    newscandict[dkey] = stats['ssh_pub_key']
                ctx.config['targets'] = newscandict
            else:
                ctx.config['targets'] = newly_locked
            locked_targets = yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False).splitlines()
            log.info('\n '.join([
                'Locked targets:',
            ] + locked_targets))
            # successfully locked machines, change status back to running
            report.try_push_job_info(ctx.config, dict(status='running'))
            break
        elif not ctx.block:
            raise AssertionError('not enough machines are available')

        log.warning('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.config.get('unlock_on_failure', False) or \
                get_status(ctx.summary) == 'pass':
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock_one(ctx, machine, ctx.owner, ctx.archive)