def get_oar_job_vm5k_resources(jobs):
    """Build the per-site resources dict expected by vm5k_deployment.

    For every ``(oar_job_id, site)`` pair in ``jobs`` the function waits for
    the job to start, then collects the addresses of the job's nodes and its
    (ip, mac) list. When the job has no subnet, the (ip, mac) list is taken
    from the job's kavlan instead, provided it has one.

    Returns a dict keyed by site; each value holds the keys ``'hosts'``,
    ``'ip_mac'`` (with the first 300 entries skipped) and ``'kavlan'``.
    """
    per_site = {}
    for raw_job_id, site in jobs:
        logger.detail('Retrieving resources from %s:%s',
                      style.emph(site), raw_job_id)
        job_id = int(raw_job_id)
        wait_oar_job_start(job_id, site)

        logger.debug('Retrieving hosts')
        addresses = []
        for node in get_oar_job_nodes(job_id, site):
            addresses.append(node.address)

        logger.debug('Retrieving subnet')
        ip_mac, _ignored = get_oar_job_subnets(job_id, site)

        vlan = None
        if len(ip_mac) == 0:
            # No subnet reserved: fall back on the kavlan of the job
            logger.debug('Retrieving kavlan')
            vlan = get_oar_job_kavlan(job_id, site)
            if vlan:
                assert len(vlan) == 1
                vlan = vlan[0]
                ip_mac = get_kavlan_ip_mac(vlan, site)

        per_site[site] = dict(hosts=addresses,
                              ip_mac=ip_mac[300:],
                              kavlan=vlan)
    return per_site
def wait_hosts_down(hosts, timeout=300):
    """Poll port 22 with nmap until every host is reported down.

    :param hosts: iterable of host names or ``Host`` objects (the
      ``address`` attribute of ``Host`` objects is used)
    :param timeout: maximum time to keep polling, compared against
      ``Timer.elapsed()``

    Returns True when all hosts have been seen down before the timeout,
    False otherwise.
    """
    timer = Timer()
    # A real list is needed here: it is measured with len() and shrunk with
    # remove() below, which a lazy map object does not support.
    up_hosts = [x.address if isinstance(x, Host) else x for x in hosts]
    fd, hosts_file = mkstemp(dir='/tmp/', prefix='hosts_')
    with fdopen(fd, 'w') as f:
        f.write('\n' + '\n'.join(up_hosts))
    try:
        while len(up_hosts) > 0 and timer.elapsed() < timeout:
            nmap = Process("nmap -v -oG - -i %s -p 22 |grep Host|grep Status"
                           % (hosts_file, ), shell=True).run()
            logger.debug('timer: %s \nnmap output: \n%s', timer.elapsed(),
                         nmap.stdout.strip())
            for line in nmap.stdout.strip().split('\n'):
                if 'Down' in line:
                    ip = line.split()[1]
                    # Resolve the IP back to a name so it can be matched
                    # against the entries of up_hosts
                    get_host = Process('host ' + ip + '| cut -f 5 -d " "',
                                       shell=True).run()
                    # Drop the last character of the lookup output
                    host = get_host.stdout.strip()[0:-1]
                    if host in up_hosts:
                        logger.detail(host + ' is down')
                        up_hosts.remove(host)
    finally:
        # Always remove the temporary hosts file, even if polling failed
        Process('rm ' + hosts_file).run()
    return len(up_hosts) == 0
def _enable_bridge(self, name='br0'):
    """Make sure every host has a bridge called ``name``.

    We need a bridge to have automatic DHCP configuration for the VM.
    Hosts that have no bridge, or a bridge with another name (which is
    destroyed first), receive a shell script that rewrites
    /etc/network/interfaces and brings the bridge up. The hosts are then
    scanned with nmap every 20 seconds, at most 20 times, until the number
    of hosts up equals the number of scanned addresses.

    :param name: name of the bridge to create (default ``'br0'``)
    """
    logger.detail('Configuring the bridge')
    hosts_br = self._get_bridge(self.hosts)
    nobr_hosts = []
    for host, br in hosts_br.items():
        if br is None:
            logger.debug('No bridge on host %s', style.host(host))
            nobr_hosts.append(host)
        elif br != name:
            logger.debug('Wrong bridge on host %s, destroying it',
                         style.host(host))
            SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                       host).run()
            nobr_hosts.append(host)
        else:
            # Log the actual bridge name, not the literal string 'name'
            logger.debug('Bridge %s is present on host %s',
                         style.emph(name), style.host(host))

    # A real list is required: it is measured with len() and joined below
    nobr_hosts = [x.address if isinstance(x, Host) else x
                  for x in nobr_hosts]
    if len(nobr_hosts) > 0:
        logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
        script = (
            'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' +
            'ifdown $br_if ; \n' +
            'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' +
            'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' +
            'echo " " >> /etc/network/interfaces ; \n' +
            'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' +
            'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' +
            'echo " bridge_ports $br_if" >> /etc/network/interfaces ; \n' +
            'echo " bridge_stp off" >> /etc/network/interfaces ; \n' +
            'echo " bridge_maxwait 0" >> /etc/network/interfaces ; \n' +
            'echo " bridge_fd 0" >> /etc/network/interfaces ; \n' +
            'ifup ' + name)
        fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
        with fdopen(fd, 'w') as f:
            f.write(script)

        self.fact.get_fileput(nobr_hosts, [br_script]).run()
        self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                             nobr_hosts).run()

        logger.debug('Waiting for network restart')
        if_up = False
        nmap_tries = 0
        while (not if_up) and nmap_tries < 20:
            sleep(20)
            nmap_tries += 1
            nmap = Process('nmap ' + ' '.join(nobr_hosts) + ' -p 22').run()
            for line in nmap.stdout.split('\n'):
                if 'Nmap done' in line:
                    # Summary line: compare the number of scanned addresses
                    # (3rd word) with the number of hosts up (6th word,
                    # without its leading parenthesis)
                    if_up = line.split()[2] == line.split()[5].replace('(', '')
        if if_up:
            logger.debug('Network has been restarted')
        else:
            # Do not claim success when the hosts never came back
            logger.warning('Network restart not confirmed after %s nmap tries',
                           nmap_tries)
    logger.detail('All hosts have the bridge %s', style.emph(name))
def get_oar_job_vm5k_resources(jobs):
    """Build the per-site resources dict expected by vm5k_deployment.

    For every ``(oar_job_id, site)`` pair in ``jobs`` the function waits for
    the job to start, then collects the addresses of the job's nodes and its
    (ip, mac) list. When the job has no subnet, the (ip, mac) list is taken
    from the job's kavlan instead, provided it has one.

    Returns a dict keyed by site; each value holds the keys ``'hosts'``,
    ``'ip_mac'`` (with the first 300 entries skipped) and ``'kavlan'``.
    """
    per_site = {}
    for raw_job_id, site in jobs:
        logger.detail('Retrieving resources from %s:%s',
                      style.emph(site), raw_job_id)
        job_id = int(raw_job_id)
        wait_oar_job_start(job_id, site)

        logger.debug('Retrieving hosts')
        addresses = []
        for node in get_oar_job_nodes(job_id, site):
            addresses.append(node.address)

        logger.debug('Retrieving subnet')
        ip_mac, _ignored = get_oar_job_subnets(job_id, site)

        vlan = None
        if len(ip_mac) == 0:
            # No subnet reserved: fall back on the kavlan of the job
            logger.debug('Retrieving kavlan')
            vlan = get_oar_job_kavlan(job_id, site)
            if vlan:
                assert len(vlan) == 1
                vlan = vlan[0]
                ip_mac = get_kavlan_ip_mac(vlan, site)

        per_site[site] = dict(hosts=addresses,
                              ip_mac=ip_mac[300:],
                              kavlan=vlan)
    return per_site
def _configure_apt(self):
    """Install sources.list, preferences and apt.conf in /etc/apt on the hosts.

    The three files are written to local temporary files, pushed to
    /etc/apt/ on ``self.hosts``, renamed there to their final names, and
    the local temporary files are finally removed.
    """
    logger.detail('Configuring APT')

    sources = ('deb http://ftp.debian.org/debian wheezy main contrib non-free\n'
               'deb http://ftp.debian.org/debian wheezy-backports main contrib non-free\n'
               'deb http://security.debian.org/ wheezy/updates main contrib non-free\n')
    pins = ('Package: * \nPin: release a=wheezy \nPin-Priority: 900\n\n'
            'Package: * \nPin: release a=wheezy-backports \nPin-Priority: 875\n\n')
    apt_options = 'APT::Acquire::Retries=20;\n'

    # Write each file locally, in the order: sources.list, preferences,
    # apt.conf
    staged = []
    for prefix, content in (('sources.list_', sources),
                            ('preferences_', pins),
                            ('apt.conf_', apt_options)):
        fd, tmp_path = mkstemp(dir='/tmp/', prefix=prefix)
        with fdopen(fd, 'w') as out:
            out.write(content)
        staged.append(tmp_path)
    tmpsource, tmppref, tmpaptconf = staged

    TaktukPut(self.hosts, [tmpsource, tmppref, tmpaptconf],
              remote_location='/etc/apt/').run()

    # On the hosts, give the uploaded files their final names
    rename_cmd = ('cd /etc/apt && '
                  'mv ' + tmpsource.split('/')[-1] + ' sources.list &&'
                  'mv ' + tmppref.split('/')[-1] + ' preferences &&'
                  'mv ' + tmpaptconf.split('/')[-1] + ' apt.conf')
    renamed = self.fact.get_remote(rename_cmd, self.hosts).run()
    self._actions_hosts(renamed)

    Local('rm ' + ' '.join([tmpsource, tmppref, tmpaptconf])).run()
def _create_backing_file(self, disks=None, backing_file_dir='/tmp'):
    """Prepare the VM backing files on the hosts and copy the ssh keys in them.

    Starts the disk copy if it has not been started, waits for it to end,
    renames the copied files with an ``orig_`` prefix, then for every disk
    makes a working copy and, through qemu-nbd, mounts it to append root's
    authorized_keys and copy root's id_rsa* files.

    :param disks: disks to process; ``self.backing_files`` when empty
    :param backing_file_dir: directory holding the disks on the hosts
    """
    if not self.copy_actions:
        self._start_disk_copy(disks)
    if not self.copy_actions.ended:
        logger.info("Waiting for the end of the disks copy")
        self.copy_actions.wait()

    dir_prefix = '%s/' % backing_file_dir
    if isinstance(self.copy_actions, ParallelActions):
        # Keep the copied file untouched under an "orig_" name
        renames = []
        for copy in self.copy_actions.actions:
            base = copy.local_files[0].split('/')[-1]
            rename_cmd = ("mv " + dir_prefix + base +
                          " " + dir_prefix + "orig_" + base)
            renames.append(self.fact.get_remote(rename_cmd, self.hosts))
        ParallelActions(renames).run()

    if not disks:
        disks = self.backing_files
    for disk in disks:
        base = disk.split('/')[-1]
        raw_disk = dir_prefix + 'orig_' + base
        to_disk = dir_prefix + base
        self.fact.get_remote('cp ' + raw_disk + ' ' + to_disk,
                             self.hosts).run()
        logger.info('Copying ssh key on ' + to_disk + ' ...')
        cmd = ''.join([
            'modprobe nbd max_part=16; ',
            'qemu-nbd --connect=/dev/nbd0 ', to_disk,
            ' ; sleep 3 ; partprobe /dev/nbd0 ; ',
            'part=`fdisk -l /dev/nbd0 |grep dev|grep Linux| grep -v swap|cut -f 1 -d " "` ; ',
            'mount $part /mnt ; mkdir -p /mnt/root/.ssh ; ',
            'cat /root/.ssh/authorized_keys >> /mnt/root/.ssh/authorized_keys ; ',
            'cp -r /root/.ssh/id_rsa* /mnt/root/.ssh/ ;',
            'umount /mnt; qemu-nbd -d /dev/nbd0',
        ])
        logger.detail(cmd)
        key_copy = self.fact.get_remote(cmd, self.hosts).run()
        self._actions_hosts(key_copy)
def _libvirt_bridged_network(self, bridge):
    """Redefine the libvirt ``default`` network as a bridged network.

    An XML description using ``bridge`` is written to a local temporary
    file; the existing ``default`` network is destroyed and undefined on
    the hosts, the XML file is uploaded to /root/, and the network is
    defined, started and set to autostart.

    :param bridge: name of the bridge the network must use
    """
    logger.detail('Configuring libvirt network')

    # XML description of a network named "default" bound to the bridge
    network = Element('network')
    SubElement(network, 'name').text = 'default'
    SubElement(network, 'forward', attrib={'mode': 'bridge'})
    SubElement(network, 'bridge', attrib={'name': bridge})
    fd, xml_path = mkstemp(dir='/tmp/', prefix='create_br_')
    with fdopen(fd, 'w') as xml_file:
        xml_file.write(prettify(network))

    logger.debug('Destroying existing network')
    xml_name = xml_path.split('/')[-1]
    steps = [
        self.fact.get_remote(
            'virsh net-destroy default; virsh net-undefine default',
            self.hosts),
        TaktukPut(self.hosts, [xml_path], remote_location='/root/'),
        self.fact.get_remote(
            'virsh net-define /root/' + xml_name + ' ; '
            'virsh net-start default; virsh net-autostart default;',
            self.hosts),
    ]
    self._actions_hosts(SequentialActions(steps).run())
def _libvirt_bridged_network(self, bridge):
    """Redefine the libvirt ``default`` network as a bridged network.

    An XML description using ``bridge`` is written to a local temporary
    file; the existing ``default`` network is destroyed and undefined on
    the hosts, the XML file is uploaded to /root/, and the network is
    defined, started and set to autostart.

    :param bridge: name of the bridge the network must use
    """
    logger.detail('Configuring libvirt network')

    # XML description of a network named "default" bound to the bridge
    network = Element('network')
    SubElement(network, 'name').text = 'default'
    SubElement(network, 'forward', attrib={'mode': 'bridge'})
    SubElement(network, 'bridge', attrib={'name': bridge})
    fd, xml_path = mkstemp(dir='/tmp/', prefix='create_br_')
    with fdopen(fd, 'w') as xml_file:
        xml_file.write(prettify(network))

    logger.debug('Destroying existing network')
    xml_name = xml_path.split('/')[-1]
    steps = [
        self.fact.get_remote(
            'virsh net-destroy default; virsh net-undefine default',
            self.hosts),
        TaktukPut(self.hosts, [xml_path], remote_location='/root/'),
        self.fact.get_remote(
            'virsh net-define /root/' + xml_name + ' ; '
            'virsh net-start default; virsh net-autostart default;',
            self.hosts),
    ]
    self._actions_hosts(SequentialActions(steps).run())
def _enable_bridge(self, name='br0'):
    """Make sure every host has a bridge called ``name``.

    We need a bridge to have automatic DHCP configuration for the VM.
    Hosts that have no bridge, or a bridge with another name (which is
    destroyed first), receive a shell script that rewrites
    /etc/network/interfaces and brings the bridge up. The hosts are then
    scanned with nmap every 20 seconds, at most 20 times, until the number
    of hosts up equals the number of scanned addresses.

    :param name: name of the bridge to create (default ``'br0'``)
    """
    logger.detail('Configuring the bridge')
    hosts_br = self._get_bridge(self.hosts)
    nobr_hosts = []
    for host, br in hosts_br.items():
        if br is None:
            logger.debug('No bridge on host %s', style.host(host))
            nobr_hosts.append(host)
        elif br != name:
            logger.debug('Wrong bridge on host %s, destroying it',
                         style.host(host))
            SshProcess('ip link set ' + br + ' down ; brctl delbr ' + br,
                       host).run()
            nobr_hosts.append(host)
        else:
            # Log the actual bridge name, not the literal string 'name'
            logger.debug('Bridge %s is present on host %s',
                         style.emph(name), style.host(host))

    # A real list is required: it is measured with len() and joined below
    nobr_hosts = [x.address if isinstance(x, Host) else x
                  for x in nobr_hosts]
    if len(nobr_hosts) > 0:
        logger.debug('Creating bridge on %s', hosts_list(nobr_hosts))
        script = (
            'export br_if=`ip route |grep default |cut -f 5 -d " "`; \n' +
            'ifdown $br_if ; \n' +
            'sed -i "s/$br_if inet dhcp/$br_if inet manual/g" /etc/network/interfaces ; \n' +
            'sed -i "s/auto $br_if//g" /etc/network/interfaces ; \n' +
            'echo " " >> /etc/network/interfaces ; \n' +
            'echo "auto ' + name + '" >> /etc/network/interfaces ; \n' +
            'echo "iface ' + name + ' inet dhcp" >> /etc/network/interfaces ; \n' +
            'echo " bridge_ports $br_if" >> /etc/network/interfaces ; \n' +
            'echo " bridge_stp off" >> /etc/network/interfaces ; \n' +
            'echo " bridge_maxwait 0" >> /etc/network/interfaces ; \n' +
            'echo " bridge_fd 0" >> /etc/network/interfaces ; \n' +
            'ifup ' + name)
        fd, br_script = mkstemp(dir='/tmp/', prefix='create_br_')
        with fdopen(fd, 'w') as f:
            f.write(script)

        self.fact.get_fileput(nobr_hosts, [br_script]).run()
        self.fact.get_remote('nohup sh ' + br_script.split('/')[-1],
                             nobr_hosts).run()

        logger.debug('Waiting for network restart')
        if_up = False
        nmap_tries = 0
        while (not if_up) and nmap_tries < 20:
            sleep(20)
            nmap_tries += 1
            nmap = Process('nmap ' + ' '.join(nobr_hosts) + ' -p 22').run()
            for line in nmap.stdout.split('\n'):
                if 'Nmap done' in line:
                    # Summary line: compare the number of scanned addresses
                    # (3rd word) with the number of hosts up (6th word,
                    # without its leading parenthesis)
                    if_up = line.split()[2] == line.split()[5].replace('(', '')
        if if_up:
            logger.debug('Network has been restarted')
        else:
            # Do not claim success when the hosts never came back
            logger.warning('Network restart not confirmed after %s nmap tries',
                           nmap_tries)
    logger.detail('All hosts have the bridge %s', style.emph(name))
def _configure_apt(self):
    """Install sources.list, preferences and apt.conf in /etc/apt on the hosts.

    The Debian release used in the files is ``self.debian_name``. The three
    files are written to local temporary files, pushed to /etc/apt/ on
    ``self.hosts``, renamed there to their final names, and the local
    temporary files are finally removed.
    """
    logger.detail('Configuring APT')

    sources = ''.join([
        'deb http://ftp.debian.org/debian %s main contrib non-free\n' % self.debian_name,
        'deb http://ftp.debian.org/debian %s-backports main contrib non-free\n' % self.debian_name,
        'deb http://security.debian.org/ %s/updates main contrib non-free\n' % self.debian_name,
    ])
    pins = ''.join([
        'Package: * \nPin: release a=%s \nPin-Priority: 900\n\n' % self.debian_name,
        'Package: * \nPin: release a=%s-backports \nPin-Priority: 875\n\n' % self.debian_name,
    ])
    apt_options = 'APT::Acquire::Retries=20;\n'

    # Write each file locally, in the order: sources.list, preferences,
    # apt.conf
    staged = []
    for prefix, content in (('sources.list_', sources),
                            ('preferences_', pins),
                            ('apt.conf_', apt_options)):
        fd, tmp_path = mkstemp(dir='/tmp/', prefix=prefix)
        with fdopen(fd, 'w') as out:
            out.write(content)
        staged.append(tmp_path)
    tmpsource, tmppref, tmpaptconf = staged

    TaktukPut(self.hosts, [tmpsource, tmppref, tmpaptconf],
              remote_location='/etc/apt/').run()

    # On the hosts, give the uploaded files their final names
    rename_cmd = ('cd /etc/apt && '
                  'mv ' + tmpsource.split('/')[-1] + ' sources.list &&'
                  'mv ' + tmppref.split('/')[-1] + ' preferences &&'
                  'mv ' + tmpaptconf.split('/')[-1] + ' apt.conf')
    renamed = self.fact.get_remote(rename_cmd, self.hosts).run()
    self._actions_hosts(renamed)

    Local('rm ' + ' '.join([tmpsource, tmppref, tmpaptconf])).run()
def __get_site(site):
    """Collect the API data of one site with one worker thread per request.

    A thread is started for the site attributes, one for the site network,
    and two per cluster (cluster attributes and cluster hosts). Once all of
    them have been joined, their results are stored on the current thread
    as ``site_data``, ``network_data``, ``cluster_data`` and ``host_data``
    (the last two being dicts keyed by cluster).
    """
    logger.detail(site)
    site_worker = threading.Thread(target=__get_site_attrs, args=(site, ))
    site_worker.start()
    network_worker = threading.Thread(target=__get_site_network,
                                      args=(site, ))
    network_worker.start()

    cluster_workers = {}
    host_workers = {}
    for cluster in _get_site_clusters_uncached(site):
        worker = threading.Thread(target=__get_cluster_attrs,
                                  args=(site, cluster))
        worker.start()
        cluster_workers[cluster] = worker
        worker = threading.Thread(target=__get_host_attrs,
                                  args=(site, cluster))
        worker.start()
        host_workers[cluster] = worker

    # Wait for every worker before reading the results
    pending = [site_worker, network_worker]
    pending.extend(cluster_workers.values())
    pending.extend(host_workers.values())
    for worker in pending:
        worker.join()

    # Results are handed over as attributes of the current thread
    current = threading.currentThread()
    current.site_data = site_worker.site_data
    current.network_data = network_worker.network_data
    current.cluster_data = {}
    current.host_data = {}
    for cluster in cluster_workers:
        current.cluster_data[cluster] = cluster_workers[cluster].cluster_data
        current.host_data[cluster] = host_workers[cluster].host_data
def _libvirt_uniquify(self):
    """Give each host its own libvirt host_uuid and restart libvirtd.

    On every host of ``self.hosts`` a fresh uuid is generated with
    ``uuidgen``, written in place of the ``host_uuid`` line of
    /etc/libvirt/libvirtd.conf, and the libvirtd service is restarted.
    """
    logger.detail('Making libvirt host unique')
    steps = [
        'uuid=`uuidgen` ',
        '&& sed -i "s/.*host_uuid.*/host_uuid=\\"${uuid}\\"/g" ',
        '/etc/libvirt/libvirtd.conf ',
        '&& service libvirtd restart',
    ]
    uniquify_cmd = ''.join(steps)
    logger.debug(uniquify_cmd)
    self.fact.get_remote(uniquify_cmd, self.hosts).run()
def _libvirt_check_service(self):
    """Make sure an /etc/init.d/libvirtd script exists on the hosts.

    When /etc/init.d/libvirtd is missing but /etc/init.d/libvirt-bin
    exists, a symbolic link named libvirtd is created to it. The remote
    command echoes 0 when libvirtd was already there and 1 when neither
    script exists. The finished action is passed to ``_actions_hosts``.
    """
    logger.detail('Checking libvirt service name')
    shell_lines = (
        "if [ ! -e /etc/init.d/libvirtd ]; ",
        " then if [ -e /etc/init.d/libvirt-bin ]; ",
        " then ln -s /etc/init.d/libvirt-bin /etc/init.d/libvirtd; ",
        " else echo 1; ",
        " fi; ",
        "else echo 0; fi",
    )
    service_check = self.fact.get_remote(''.join(shell_lines),
                                         self.hosts).run()
    self._actions_hosts(service_check)
def _read_api_cache(cache_dir):
    """Load the cached API data stored as files in cache_dir.

    One file is read for each of 'network', 'sites', 'clusters', 'hosts'
    and 'hierarchy'; the file path is ``cache_dir`` directly followed by
    that name. Returns a dict mapping each of these names to the object
    loaded from its file.
    """
    cached = {}
    logger.detail('Reading data from cache ...')
    for entry in ('network', 'sites', 'clusters', 'hosts', 'hierarchy'):
        with open(cache_dir + entry, 'rb') as cache_file:
            cached[entry] = load(cache_file)
    return cached
def configure_apt_proxy(vms):
    """Override apt proxy-guess on the VMs, using their host as proxy.

    The VMs are grouped by their ``'host'`` value; for every host, one
    remote command run on the ``'ip'`` of its VMs writes
    /etc/apt/apt.conf.d/proxy-guess so that it points to port 9999 of that
    host. All the commands are run in parallel.
    """
    # host -> list of the IPs of the VMs it runs
    hosts_vms = {}
    for vm in vms:
        hosts_vms.setdefault(vm['host'], []).append(vm['ip'])

    proxy_actions = []
    for server, clients in hosts_vms.iteritems():
        proxy = Host(server)
        logger.detail('Configuring %s as APT proxy for %s',
                      style.host(proxy.address), ','.join(clients))
        write_conf = (" echo 'Acquire::http::Proxy \"http://" + proxy.address +
                      ":9999\" ; ' > /etc/apt/apt.conf.d/proxy-guess")
        proxy_actions.append(TaktukRemote(write_conf, clients))
    ParallelActions(proxy_actions).run()
def create_disks(vms):
    """Return an action to create the disks for the VMs on the hosts.

    The disk command of each VM comes from ``cmd_disk_real`` when its
    ``'real_file'`` value is true and from ``cmd_disk_qcow2`` otherwise.
    The commands of the VMs sharing a host are concatenated, and a single
    TaktukRemote over all the hosts is returned (not run).
    """
    logger.detail(', '.join([vm['id'] for vm in sorted(vms)]))
    # NOTE: this local must keep the name hosts_cmds, it is referenced by
    # name in the '{{...}}' command template given to TaktukRemote below.
    hosts_cmds = {}
    for vm in vms:
        build_cmd = cmd_disk_real if vm['real_file'] else cmd_disk_qcow2
        cmd = build_cmd(vm)
        logger.detail(vm['id'] + ': ' + cmd)
        host = vm['host']
        if host in hosts_cmds:
            hosts_cmds[host] = hosts_cmds[host] + cmd
        else:
            hosts_cmds[host] = cmd
    logger.debug(pformat(hosts_cmds.values()))
    return TaktukRemote('{{hosts_cmds.values()}}', list(hosts_cmds.keys()))
def configure_apt_proxy(vms):
    """Override apt proxy-guess on the VMs, using their host as proxy.

    The VMs are grouped by their ``'host'`` value; for every host, one
    remote command run on the ``'ip'`` of its VMs writes
    /etc/apt/apt.conf.d/proxy-guess so that it points to port 9999 of that
    host. All the commands are run in parallel.
    """
    # host -> list of the IPs of the VMs it runs
    hosts_vms = {}
    for vm in vms:
        hosts_vms.setdefault(vm['host'], []).append(vm['ip'])

    proxy_actions = []
    for server, clients in hosts_vms.iteritems():
        proxy = Host(server)
        logger.detail('Configuring %s as APT proxy for %s',
                      style.host(proxy.address), ','.join(clients))
        write_conf = (" echo 'Acquire::http::Proxy \"http://" + proxy.address +
                      ":9999\" ; ' > /etc/apt/apt.conf.d/proxy-guess")
        proxy_actions.append(TaktukRemote(write_conf, clients))
    ParallelActions(proxy_actions).run()
def install_vms(vms):
    """Return an action to install the VM on the hosts.

    A ``virt-install`` command is built for every VM from its ``'id'``,
    ``'mac'``, ``'mem'``, ``'hdd'``, ``'n_cpu'`` and ``'cpuset'`` values,
    with an extra tap network interface when ``'tap'`` is set. The commands
    of the VMs sharing a host are concatenated.

    :param vms: list of VM dicts
    :return: a TaktukRemote over all the hosts (not run)
    """
    # Sort the ids rather than the dicts themselves: dicts have no ordering
    # on Python 3 and sorted(vms) would raise TypeError there.
    logger.detail(', '.join(sorted(vm['id'] for vm in vms)))
    # NOTE: this local must keep the name hosts_cmds, it is referenced by
    # name in the '{{...}}' command template given to TaktukRemote below.
    hosts_cmds = {}
    for vm in vms:
        cmd = ('virt-install -d --import --connect qemu:///system ' +
               '--nographics --noautoconsole --noreboot --name=' + vm['id'] +
               ' ' + '--network network=default,mac=' + vm['mac'] +
               ' --ram=' + str(vm['mem']) +
               ' --disk path=/tmp/' + vm['id'] +
               '.qcow2,device=disk,bus=virtio,format=qcow2,size=' +
               str(vm['hdd']) + ',cache=none ' +
               '--vcpus=' + str(vm['n_cpu']) + ' --cpuset=' + vm['cpuset'])
        if vm['tap']:
            # The leading space keeps this option separate from the
            # --cpuset value that ends the base command.
            cmd += ' --network tap,script=no,ifname=' + vm['tap']
        cmd += ' ; '
        hosts_cmds[vm['host']] = cmd if not vm['host'] in hosts_cmds \
            else hosts_cmds[vm['host']] + cmd
    return TaktukRemote('{{hosts_cmds.values()}}', list(hosts_cmds.keys()))
def _is_cache_old_and_reachable(cache_dir):
    """Tell whether the cache must be refreshed from the API.

    The commit stored in the ``api_commit`` file of ``cache_dir`` is
    compared with the ``'version'`` reported by the API.

    Returns True when no cached commit can be read or when it differs from
    the API version; returns False when both are equal or when the API
    cannot be queried (the cache is then used as is).
    """
    try:
        with open(cache_dir + 'api_commit') as f:
            local_commit = f.readline()
    except Exception:
        # Best effort: any failure to read the file means "no usable cache"
        logger.detail('No commit version found')
        return True
    try:
        api_commit = get_resource_attributes('')['version']
    except Exception:
        logger.warning('Unable to check API, reverting to cache')
        return False
    # Compare with the version fetched above instead of querying the API a
    # second time outside of the try block.
    if local_commit != api_commit:
        logger.info('Cache is outdated, will retrieve the latest commit')
        return True
    else:
        logger.detail('Already at the latest commit')
        return False
def get_job_by_name(job_name, sites=None):
    """Look for a current job whose name is job_name.

    The OAR jobs of the current oargrid jobs are examined first, then the
    current OAR jobs of ``sites`` (all the Grid'5000 sites when ``sites``
    is empty).

    Returns ``(oargrid_job, None)`` for a match in an oargrid job, the
    matching entry of ``get_current_oar_jobs`` for a match in an OAR job,
    and ``(None, None)`` when nothing matches.
    """
    def has_wanted_name(oar_job):
        # oar_job[0] and oar_job[1] are passed as given to get_oar_job_info
        return get_oar_job_info(oar_job[0], oar_job[1])['name'] == job_name

    logger.detail('Looking for a job named %s', style.emph(job_name))
    if not sites:
        sites = get_g5k_sites()

    grid_jobs = get_current_oargrid_jobs()
    if len(grid_jobs) > 0:
        for grid_job in grid_jobs:
            for oar_job in get_oargrid_job_oar_jobs(grid_job):
                if has_wanted_name(oar_job):
                    logger.info('Oargridjob %s found !', style.emph(grid_job))
                    return grid_job, None

    for oar_job in get_current_oar_jobs(sites):
        if has_wanted_name(oar_job):
            logger.info('Job %s found on site %s !',
                        style.emph(oar_job[0]), style.host(oar_job[1]))
            return oar_job
    return None, None
def setup_aptcacher_server(hosts, base_dir='/tmp/apt-cacher-ng'):
    """Install and configure apt-cacher-ng on the given hosts.

    The package is installed, ``log`` and ``cache`` directories are created
    under ``base_dir`` and given to the apt-cacher-ng user, then
    /etc/apt-cacher-ng/acng.conf is edited to use these directories,
    port 9999 and the proxy http://proxy:3128, and the service is
    restarted. The function stops at the first failing step after logging
    an error.

    :param hosts: hosts on which apt-cacher-ng must be installed
    :param base_dir: directory that will contain the log and cache dirs
    """
    # Build a real list: the hosts are iterated several times below
    hosts = [Host(host) for host in hosts]
    host_names = ','.join([style.host(host.address) for host in hosts])
    logger.info('Installing apt-cacher on %s', host_names)

    logger.detail('Package')
    # DEBIAN_FRONTEND (not DEBIAN_MASTER) is the variable that makes the
    # package configuration non-interactive.
    package = TaktukRemote(
        'export DEBIAN_FRONTEND=noninteractive ; apt-get update ; ' +
        'apt-get install -o Dpkg::Options::="--force-confdef" -o ' +
        'Dpkg::Options::="--force-confnew" -y apt-cacher-ng',
        hosts).run()
    if not package.ok:
        logger.error('Unable to install apt-cacher-ng on %s', host_names)
        return

    logger.detail('Directory creation')
    log_dir = base_dir + '/log'
    cache_dir = base_dir + '/cache'
    mkdirs = TaktukRemote(
        'mkdir -p ' + log_dir + '; mkdir -p ' + cache_dir +
        '; chown -R apt-cacher-ng:apt-cacher-ng ' + base_dir,
        hosts).run()
    if not mkdirs.ok:
        logger.error('Unable to create the directories')
        return

    cmd = ('sed -i "s#/var/cache/apt-cacher-ng#' + cache_dir +
           '#g" /etc/apt-cacher-ng/acng.conf ;' +
           'sed -i "s#/var/log/apt-cacher-ng#' + log_dir + '#g" ' +
           '/etc/apt-cacher-ng/acng.conf ;' +
           'sed -i "s/3142/9999/g" /etc/apt-cacher-ng/acng.conf ; ' +
           'sed -i "s?#Proxy: http://www-proxy.example.net:80?Proxy: ' +
           'http://proxy:3128?g" /etc/apt-cacher-ng/acng.conf ; ' +
           'service apt-cacher-ng restart')
    configure = TaktukRemote(cmd, hosts).run()
    if not configure.ok:
        logger.error('Unable to configure and restart the service')
        return

    logger.info('apt-cacher-ng up and running on %s', host_names)
def setup_aptcacher_server(hosts, base_dir='/tmp/apt-cacher-ng'):
    """Install and configure apt-cacher-ng on the given hosts.

    The package is installed, ``log`` and ``cache`` directories are created
    under ``base_dir`` and given to the apt-cacher-ng user, then
    /etc/apt-cacher-ng/acng.conf is edited to use these directories,
    port 9999 and the proxy http://proxy:3128, and the service is
    restarted. The function stops at the first failing step after logging
    an error.

    :param hosts: hosts on which apt-cacher-ng must be installed
    :param base_dir: directory that will contain the log and cache dirs
    """
    # Build a real list: the hosts are iterated several times below
    hosts = [Host(host) for host in hosts]
    host_names = ','.join([style.host(host.address) for host in hosts])
    logger.info('Installing apt-cacher on %s', host_names)

    logger.detail('Package')
    # DEBIAN_FRONTEND (not DEBIAN_MASTER) is the variable that makes the
    # package configuration non-interactive.
    package = TaktukRemote(
        'export DEBIAN_FRONTEND=noninteractive ; apt-get update ; ' +
        'apt-get install -o Dpkg::Options::="--force-confdef" -o ' +
        'Dpkg::Options::="--force-confnew" -y apt-cacher-ng',
        hosts).run()
    if not package.ok:
        logger.error('Unable to install apt-cacher-ng on %s', host_names)
        return

    logger.detail('Directory creation')
    log_dir = base_dir + '/log'
    cache_dir = base_dir + '/cache'
    mkdirs = TaktukRemote(
        'mkdir -p ' + log_dir + '; mkdir -p ' + cache_dir +
        '; chown -R apt-cacher-ng:apt-cacher-ng ' + base_dir,
        hosts).run()
    if not mkdirs.ok:
        logger.error('Unable to create the directories')
        return

    cmd = ('sed -i "s#/var/cache/apt-cacher-ng#' + cache_dir +
           '#g" /etc/apt-cacher-ng/acng.conf ;' +
           'sed -i "s#/var/log/apt-cacher-ng#' + log_dir + '#g" ' +
           '/etc/apt-cacher-ng/acng.conf ;' +
           'sed -i "s/3142/9999/g" /etc/apt-cacher-ng/acng.conf ; ' +
           'sed -i "s?#Proxy: http://www-proxy.example.net:80?Proxy: ' +
           'http://proxy:3128?g" /etc/apt-cacher-ng/acng.conf ; ' +
           'service apt-cacher-ng restart')
    configure = TaktukRemote(cmd, hosts).run()
    if not configure.ok:
        logger.error('Unable to configure and restart the service')
        return

    logger.info('apt-cacher-ng up and running on %s', host_names)
def wait_hosts_up(hosts, timeout=300):
    """Poll port 22 with nmap until every host is reported up.

    :param hosts: iterable of host names or ``Host`` objects (the
      ``address`` attribute of ``Host`` objects is used)
    :param timeout: maximum time to keep polling, compared against
      ``Timer.elapsed()``

    Returns True when all hosts have been seen up before the timeout,
    False otherwise. The function sleeps 3 seconds before returning.
    """
    # A real list is needed here: it is measured with len() and shrunk with
    # remove() below, which a lazy map object does not support.
    down_hosts = [x.address if isinstance(x, Host) else x for x in hosts]
    fd, hosts_file = mkstemp(dir='/tmp/', prefix='hosts_')
    with fdopen(fd, 'w') as f:
        f.write('\n' + '\n'.join(down_hosts))
    timer = Timer()
    try:
        while len(down_hosts) > 0 and timer.elapsed() < timeout:
            nmap = Process("nmap -v -oG - -i %s -p 22 |grep Host|grep Status"
                           % (hosts_file, ), shell=True).run()
            logger.debug('timer: %s \nnmap output: \n%s', timer.elapsed(),
                         nmap.stdout.strip())
            for line in nmap.stdout.strip().split('\n'):
                fields = line.split()
                # Skip empty or truncated lines (e.g. no nmap output at all)
                # and hosts that nmap explicitly reports as down.
                if len(fields) < 3 or 'Down' in fields:
                    continue
                s = fields[2]
                # The host name is the text between the parentheses
                host = s[s.find("(") + 1:s.find(")")]
                if host in down_hosts:
                    logger.detail('%s is up', host)
                    down_hosts.remove(host)
    finally:
        # Always remove the temporary hosts file, even if polling failed
        Process('rm ' + hosts_file).run()
    sleep(3)
    return len(down_hosts) == 0
def _create_backing_file(self, disks=None):
    """Prepare the VM backing files in /tmp on the hosts and copy the ssh keys.

    Starts the disk copy if it has not been started, waits for it to end,
    renames the copied files with an ``orig_`` prefix, then for every disk
    makes a working copy and, through qemu-nbd, mounts it to append root's
    authorized_keys and copy root's id_rsa* files.

    :param disks: disks to process; ``self.backing_files`` when empty
    """
    if not self.copy_actions:
        self._start_disk_copy(disks)
    if not self.copy_actions.ended:
        logger.info("Waiting for the end of the disks copy")
        self.copy_actions.wait()

    if isinstance(self.copy_actions, ParallelActions):
        # Keep the copied file untouched under an "orig_" name
        renames = []
        for copy in self.copy_actions.actions:
            base = copy.local_files[0].split('/')[-1]
            rename_cmd = "mv /tmp/" + base + " /tmp/orig_" + base
            renames.append(self.fact.get_remote(rename_cmd, self.hosts))
        ParallelActions(renames).run()

    if not disks:
        disks = self.backing_files
    for disk in disks:
        base = disk.split('/')[-1]
        raw_disk = '/tmp/orig_' + base
        to_disk = '/tmp/' + base
        self.fact.get_remote('cp ' + raw_disk + ' ' + to_disk,
                             self.hosts).run()
        logger.info('Copying ssh key on ' + to_disk + ' ...')
        cmd = ''.join([
            'modprobe nbd max_part=16; ',
            'qemu-nbd --connect=/dev/nbd0 ', to_disk,
            ' ; sleep 3 ; partprobe /dev/nbd0 ; ',
            'part=`fdisk -l /dev/nbd0 |grep dev|grep Linux| grep -v swap|cut -f 1 -d " "` ; ',
            'mount $part /mnt ; mkdir -p /mnt/root/.ssh ; ',
            'cat /root/.ssh/authorized_keys >> /mnt/root/.ssh/authorized_keys ; ',
            'cp -r /root/.ssh/id_rsa* /mnt/root/.ssh/ ;',
            'umount /mnt; qemu-nbd -d /dev/nbd0',
        ])
        logger.detail(cmd)
        key_copy = self.fact.get_remote(cmd, self.hosts).run()
        self._actions_hosts(key_copy)
def __get_site(site):
    """Collect the API data of one site with one worker thread per request.

    A thread is started for the site attributes, one for the site network,
    and two per cluster (cluster attributes and cluster hosts). Once all of
    them have been joined, their results are stored on the current thread
    as ``site_data``, ``network_data``, ``cluster_data`` and ``host_data``
    (the last two being dicts keyed by cluster).
    """
    logger.detail(site)
    site_worker = threading.Thread(target=__get_site_attrs, args=(site, ))
    site_worker.start()
    network_worker = threading.Thread(target=__get_site_network,
                                      args=(site, ))
    network_worker.start()

    cluster_workers = {}
    host_workers = {}
    for cluster in _get_site_clusters_uncached(site):
        worker = threading.Thread(target=__get_cluster_attrs,
                                  args=(site, cluster))
        worker.start()
        cluster_workers[cluster] = worker
        worker = threading.Thread(target=__get_host_attrs,
                                  args=(site, cluster))
        worker.start()
        host_workers[cluster] = worker

    # Wait for every worker before reading the results
    pending = [site_worker, network_worker]
    pending.extend(cluster_workers.values())
    pending.extend(host_workers.values())
    for worker in pending:
        worker.join()

    # Results are handed over as attributes of the current thread
    current = threading.currentThread()
    current.site_data = site_worker.site_data
    current.network_data = network_worker.network_data
    current.cluster_data = {}
    current.host_data = {}
    for cluster in cluster_workers:
        current.cluster_data[cluster] = cluster_workers[cluster].cluster_data
        current.host_data[cluster] = host_workers[cluster].host_data
def _write_api_cache(cache_dir, data):
    """Dump the Grid'5000 API data into the cache directory.

    ``cache_dir`` is created when missing. Every item of ``data`` is dumped
    into the file whose path is ``cache_dir`` directly followed by the item
    key, and the ``'version'`` of the first backbone network entry is
    written to the ``api_commit`` file.
    """
    if path.exists(cache_dir):
        logger.detail('Cache directory is present')
    else:
        makedirs(cache_dir)
        logger.detail('No cache found, directory created')

    logger.detail('Writing data to cache ...')
    for entry, content in data.items():
        with open(cache_dir + entry, 'wb') as cache_file:
            dump(content, cache_file)

    api_version = data['network']['backbone'][0]['version']
    with open(cache_dir + 'api_commit', 'w') as commit_file:
        commit_file.write(api_version)
def __get_site_attrs(site):
    """Store the API attributes of site as site_data on the current thread."""
    logger.detail(site + " attrs")
    site_attrs = get_resource_attributes('sites/' + site)
    threading.currentThread().site_data = site_attrs
def __get_backbone():
    """Store the items of '/network_equipments' as backbone_data on the
    current thread."""
    logger.detail("backbone network")
    equipments = get_resource_attributes('/network_equipments')
    threading.currentThread().backbone_data = equipments['items']
def wait_vms_have_started(vms, restart=True): """Scan port 22 on all vms, distributed on hosts""" # Creating file with list of VMs ip fd, tmpfile = tempfile.mkstemp(prefix='vmips') f = fdopen(fd, 'w') for vm in vms: f.write(vm['ip'] + '\n') f.close() # getting the list of host hosts = list(set([vm['host'] for vm in vms])) hosts.sort() # Pushing file on all hosts TaktukPut(hosts, [tmpfile]).run() logger.debug(pformat(hosts)) # Splitting nmap scan n_vm_scan = ceil(len(vms) / len(hosts)) + 1 cmds = [] for i in range(len(hosts)): start = str(int(i * n_vm_scan)) end = str(int((i + 1) * n_vm_scan)) cmds.append("awk 'NR>=" + start + " && NR<" + end + "' " + tmpfile.split('/')[-1] + " > nmap_file ; " + "nmap -v -oG - -i nmap_file -p 22") logger.debug('%s', pformat(cmds)) nmap = TaktukRemote('{{cmds}}', hosts) nmap_tries = 0 all_up = False started_vms = [] old_started = started_vms[:] while (not all_up) and nmap_tries < 10: sleep(15) logger.detail('nmap_tries %s', nmap_tries) nmap.run() for p in nmap.processes: for line in p.stdout.split('\n'): if 'Status' in line: split_line = line.split(' ') ip = split_line[1] state = split_line[3].strip() if state == 'Up': vm = [vm for vm in vms if vm['ip'] == ip] if len(vm) > 0: vm[0]['state'] = 'OK' started_vms = [vm for vm in vms if vm['state'] == 'OK'] all_up = len(started_vms) == len(vms) if started_vms != old_started: old_started = started_vms else: if restart: restart_vms([vm for vm in vms if vm['state'] == 'KO']) nmap_tries += 1 if nmap_tries == 1: activate_vms([vm for vm in vms if vm['state'] == 'KO']) if not all_up: logger.info(str(nmap_tries) + ': ' + str(len(started_vms)) + '/' + str(len(vms))) nmap.reset() TaktukRemote('rm ' + tmpfile.split('/')[-1], hosts).run() Process('rm ' + tmpfile).run() if all_up: logger.info('All VM have been started') return True else: logger.error('All VM have not been started') return False
def __get_site_network(site):
    """Store the network equipments of site, keyed by their 'uid', as
    network_data on the current thread."""
    logger.detail(site + " network")
    by_uid = {}
    # The (still empty) dict is published before the API is queried
    threading.currentThread().network_data = by_uid
    equipments_url = 'sites/' + site + '/network_equipments'
    for equipment in get_resource_attributes(equipments_url)['items']:
        by_uid[equipment['uid']] = equipment
def __get_backbone():
    """Store the items of '/network_equipments' as backbone_data on the
    current thread."""
    logger.detail("backbone network")
    equipments = get_resource_attributes('/network_equipments')
    threading.currentThread().backbone_data = equipments['items']
def __get_cluster_attrs(site, cluster):
    """Store the API attributes of a cluster of site as cluster_data on the
    current thread."""
    logger.detail(cluster + " attrs")
    cluster_url = 'sites/' + site + '/clusters/' + cluster
    cluster_attrs = get_resource_attributes(cluster_url)
    threading.currentThread().cluster_data = cluster_attrs
def __get_cluster_attrs(site, cluster):
    """Store the API attributes of a cluster of site as cluster_data on the
    current thread."""
    logger.detail(cluster + " attrs")
    cluster_url = 'sites/' + site + '/clusters/' + cluster
    cluster_attrs = get_resource_attributes(cluster_url)
    threading.currentThread().cluster_data = cluster_attrs
def __get_host_attrs(site, cluster):
    """Store the nodes of a cluster of site, keyed by their 'uid', as
    host_data on the current thread."""
    logger.detail(cluster + " hosts")
    by_uid = {}
    # The (still empty) dict is published before the API is queried
    threading.currentThread().host_data = by_uid
    nodes_url = 'sites/' + site + '/clusters/' + cluster + '/nodes'
    for node in get_resource_attributes(nodes_url)['items']:
        by_uid[node['uid']] = node
def _get_site_planning_API(site, site_planning, ignore_besteffort):
    """Fill site_planning with the busy slots of a site, using the API.

    ``site_planning`` is updated in place: an entry with empty 'busy' and
    'free' lists is created for every alive node whose cluster is already a
    key of ``site_planning`` (and for every vlan when the 'vlans' key is
    present), then the (start, end) slot of every waiting, launching or
    running job is appended to the 'busy' list of the resources it uses.

    :param site: name of the site
    :param site_planning: dict to fill, keyed by cluster name, plus
      optionally 'vlans' and 'subnets'
    :param ignore_besteffort: when true, jobs of the 'besteffort' queue are
      skipped

    Nothing is returned. Any exception is logged and reported by setting
    ``broken = True`` on the current thread.
    """
    try:
        # Nodes of type 'default' that are neither dead nor in maintenance
        alive_nodes = set([
            str(node['network_address'])
            for node in get_resource_attributes(
                '/sites/' + site + '/internal/oarapi/resources/details.json?limit=2^30')['items']
            if node['type'] == 'default' and node['state'] != 'Dead' and node['maintenance'] != 'YES'
        ])
        for host in alive_nodes:
            host_cluster = get_host_cluster(str(host))
            if host_cluster in site_planning:
                site_planning[host_cluster].update(
                    {host: {'busy': [], 'free': []}})
        if 'vlans' in site_planning:
            site_planning['vlans'] = {}
            for vlan in _get_vlans_API(site):
                site_planning['vlans'][vlan] = {'busy': [], 'free': []}
        # STORAGE AND SUBNETS MISSING
        # Retrieving jobs
        site_jobs = get_resource_attributes(
            '/sites/' + site + '/jobs?limit=1073741824&state=waiting,launching,running')['items']
        # Keep the 'self' link of every job that must be taken into account
        jobs_links = [
            link['href'] for job in site_jobs for link in job['links'] \
            if link['rel'] == 'self' and (ignore_besteffort == False or job['queue'] != 'besteffort')
        ]
        # One thread per job; each thread object carries the attributes
        # 'broken', 'attr' and 'ex' that are read back after join()
        # (presumably filled by _get_job_link_attr_API - TODO confirm)
        threads = []
        for link in jobs_links:
            t = Thread(target=_get_job_link_attr_API,
                       args=('/' + str(link).split('/', 2)[2], ))
            t.broken = False
            t.attr = None
            t.ex = None
            threads.append(t)
            t.start()
        for t in threads:
            t.join()
            if t.broken:
                raise t.ex
            attr = t.attr
            try:
                # Jobs not started yet (started_at == 0) use their
                # scheduled start time instead
                start_time = attr['started_at'] if attr['started_at'] != 0 else attr['scheduled_at']
                end_time = start_time + attr['walltime']
            except:
                # Job without usable time information: skip it
                continue
            start_time, end_time = _fix_job(start_time, end_time)
            nodes = attr['assigned_nodes']
            for node in nodes:
                # The cluster name is the part of the host name before the
                # first '-'
                cluster = node.split('.', 1)[0].split('-')[0]
                if cluster in site_planning and node in site_planning[cluster]:
                    site_planning[cluster][node]['busy'].append(
                        (start_time, end_time))
            # Only vlans whose number is greater than 3 are recorded
            if 'vlans' in site_planning and 'vlans' in attr['resources_by_type'] \
                    and int(attr['resources_by_type']['vlans'][0]) > 3:
                kavname = 'kavlan-' + str(
                    attr['resources_by_type']['vlans'][0])
                site_planning['vlans'][kavname]['busy'].append(
                    (start_time, end_time))
            if 'subnets' in site_planning and 'subnets' in attr[
                    'resources_by_type']:
                for subnet in attr['resources_by_type']['subnets']:
                    if subnet not in site_planning['subnets']:
                        site_planning['subnets'][subnet] = {
                            'busy': [],
                            'free': []
                        }
                    site_planning['subnets'][subnet]['busy'].append(
                        (start_time, end_time))
            # STORAGE IS MISSING
    except Exception as e:
        logger.warn(
            'error connecting to oar database / getting planning from ' + site)
        logger.detail("exception:\n" + format_exc())
        # Signal the failure through the current thread object
        currentThread().broken = True
def wait_vms_have_started(vms, restart=True): """Scan port 22 on all vms, distributed on hosts""" # Creating file with list of VMs ip fd, tmpfile = tempfile.mkstemp(prefix='vmips') f = fdopen(fd, 'w') for vm in vms: f.write(vm['ip'] + '\n') f.close() # getting the list of host hosts = list(set([vm['host'] for vm in vms])) hosts.sort() # Pushing file on all hosts TaktukPut(hosts, [tmpfile]).run() logger.debug(pformat(hosts)) # Splitting nmap scan n_vm_scan = ceil(len(vms) / len(hosts)) + 1 cmds = [] for i in range(len(hosts)): start = str(int(i * n_vm_scan)) end = str(int((i + 1) * n_vm_scan)) cmds.append("awk 'NR>=" + start + " && NR<" + end + "' " + tmpfile.split('/')[-1] + " > nmap_file ; " + "nmap -v -oG - -i nmap_file -p 22") logger.debug('%s', pformat(cmds)) nmap = TaktukRemote('{{cmds}}', hosts) nmap_tries = 0 all_up = False started_vms = [] old_started = started_vms[:] while (not all_up) and nmap_tries < 10: sleep(15) logger.detail('nmap_tries %s', nmap_tries) nmap.run() for p in nmap.processes: for line in p.stdout.split('\n'): if 'Status' in line: split_line = line.split(' ') ip = split_line[1] state = split_line[3].strip() if state == 'Up': vm = [vm for vm in vms if vm['ip'] == ip] if len(vm) > 0: vm[0]['state'] = 'OK' started_vms = [vm for vm in vms if vm['state'] == 'OK'] all_up = len(started_vms) == len(vms) if started_vms != old_started: old_started = started_vms else: if restart: restart_vms([vm for vm in vms if vm['state'] == 'KO']) nmap_tries += 1 if nmap_tries == 1: activate_vms([vm for vm in vms if vm['state'] == 'KO']) if not all_up: logger.info( str(nmap_tries) + ': ' + str(len(started_vms)) + '/' + str(len(vms))) nmap.reset() TaktukRemote('rm ' + tmpfile.split('/')[-1], hosts).run() Process('rm ' + tmpfile).run() if all_up: logger.info('All VM have been started') return True else: logger.error('All VM have not been started') return False
def _get_site_planning_PGSQL(site, site_planning, ignore_besteffort):
    """Fill ``site_planning`` from the OAR PostgreSQL database of ``site``.

    The database ``oardb.<site>.grid5000.fr`` is reached through a
    ``G5kAutoPortForwarder``. Two queries are run:

    1. the resources that are neither ``Dead`` nor in maintenance: an
       empty ``{'busy': [], 'free': []}`` entry is created for every host
       whose cluster is already a key of ``site_planning``, for every
       kavlan when ``site_planning`` has a ``'vlans'`` key and for every
       subnet when it has a ``'subnets'`` key;
    2. the ``Launching``, ``Running`` and ``Waiting`` jobs: their
       ``(start_time, end_time)`` slot, as returned by ``_fix_job``, is
       appended to the ``'busy'`` list of the resources they hold.

    :param site: name of the site, used to build the database host name

    :param site_planning: dict of the site planning, updated in place

    :param ignore_besteffort: when true, jobs whose queue is
      ``besteffort`` are left out

    Nothing is returned and no exception is propagated: on any error a
    warning is logged and ``currentThread().broken`` is set to True.
    """
    try:
        with G5kAutoPortForwarder(
                site, 'oardb.' + site + '.grid5000.fr',
                g5k_configuration['oar_pgsql_ro_port']) as (host, port):
            conn = psycopg2.connect(
                host=host, port=port,
                user=g5k_configuration['oar_pgsql_ro_user'],
                password=g5k_configuration['oar_pgsql_ro_password'],
                database=g5k_configuration['oar_pgsql_ro_db'])
            try:
                cur = conn.cursor()

                # Retrieving alive resources
                alive_sql = (
                    "SELECT DISTINCT R.type, R.network_address, R.vlan, "
                    "R.subnet_address FROM resources R "
                    "WHERE state <> 'Dead' AND R.maintenance <> 'YES';")
                cur.execute(alive_sql)
                for res_type, address, vlan, subnet in cur.fetchall():
                    if res_type == "default":
                        cluster = get_host_cluster(address)
                        if cluster in site_planning:
                            site_planning[cluster][address] = {
                                'busy': [], 'free': []}
                    if res_type in ['kavlan', 'kavlan-global'] \
                            and 'vlans' in site_planning:
                        site_planning['vlans']['kavlan-' + vlan] = {
                            'busy': [], 'free': []}
                    # The guard must test the key that is written to,
                    # i.e. 'subnets' and not 'subnet'
                    if res_type == "subnet" and 'subnets' in site_planning:
                        site_planning['subnets'][subnet] = {
                            'busy': [], 'free': []}

                # Retrieving the jobs with the resources they hold.
                # NOTE: the besteffort clause is directly followed by
                # GROUP BY, without separating whitespace; the query text
                # is kept as it was.
                jobs_sql = (
                    "SELECT J.job_id, J.state, GJP.start_time AS start_time, "
                    "GJP.start_time+MJD.moldable_walltime, "
                    "array_agg(DISTINCT R.network_address) AS hosts, "
                    "array_agg(DISTINCT R.vlan) AS vlan, "
                    "array_agg(DISTINCT R.subnet_address) AS subnets "
                    "FROM jobs J "
                    "LEFT JOIN moldable_job_descriptions MJD "
                    "ON MJD.moldable_job_id=J.job_id "
                    "LEFT JOIN gantt_jobs_predictions GJP "
                    "ON GJP.moldable_job_id=MJD.moldable_id "
                    "INNER JOIN gantt_jobs_resources AR "
                    "ON AR.moldable_job_id=MJD.moldable_id "
                    "LEFT JOIN resources R "
                    "ON AR.resource_id=R.resource_id "
                    "WHERE ( J.state='Launching' OR J.state='Running' "
                    "OR J.state='Waiting') "
                    + (" AND queue_name<>'besteffort'"
                       if ignore_besteffort else "")
                    + "GROUP BY J.job_id, GJP.start_time, "
                    "MJD.moldable_walltime ORDER BY J.start_time")
                cur.execute(jobs_sql)

                for job in cur.fetchall():
                    start_time, end_time = _fix_job(job[2], job[3])
                    slot = (start_time, end_time)

                    # Hosts: only those registered as alive above
                    for job_host in job[4]:
                        if job_host != '':
                            cluster = get_host_cluster(job_host)
                            if cluster in site_planning:
                                if job_host in site_planning[cluster]:
                                    site_planning[cluster][job_host][
                                        'busy'].append(slot)

                    if job[5][0] and 'vlans' in site_planning:
                        for vlan in job[5]:
                            if isinstance(vlan, str) and int(vlan) > 3:
                                # only routed vlan
                                site_planning['vlans'][
                                    'kavlan-' + vlan]['busy'].append(slot)

                    if 'subnets' in site_planning:
                        alive_subnets = site_planning['subnets']
                        for subnet in job[6]:
                            # array_agg also yields NULL for the resources
                            # of the job that are not subnets; as for the
                            # hosts, only subnets registered as alive get
                            # the busy slot
                            if subnet in alive_subnets:
                                alive_subnets[subnet]['busy'].append(slot)
            finally:
                conn.close()
    except Exception:
        logger.warn(
            'error connecting to oar database / getting planning from '
            + site)
        logger.detail("exception:\n" + format_exc())
        currentThread().broken = True
vms = [] for host in state.findall('.//host'): for vm in host.findall('.//vm'): vms.append({'id': vm.get('id'), 'n_cpu': int(_default_xml_value('n_cpu')), 'cpuset': _default_xml_value('cpuset'), 'mem': int(_default_xml_value('mem')), 'hdd': int(_default_xml_value('hdd')), 'backing_file': _default_xml_value('backing_file'), 'ip': _default_xml_value('ip'), 'mac': _default_xml_value('mac'), 'host': host.get('id')}) while True: logger.detail('Cleaning all VMS from XML file') for el_host in state.findall('.//host'): for vm in el_host.findall('./vm'): el_host.remove(vm) logger.info('Retrieving VMS position and load') get_vms_load = TaktukRemote("get_cpu_consumptions.sh", hosts).run() vms_loads = {} hosts_vms = {host: [] for host in hosts} for p in get_vms_load.processes: for line in p.stdout.strip().split('\n'): logger.detail(p.host.address) tmp_load = line.split(' ') logger.detail(tmp_load) try: vms_loads[tmp_load[0]] = float(tmp_load[1]) + float(tmp_load[2]) + float(tmp_load[-1])