def _get_ip_mac(self, resources): """ """ if len(resources.keys()) == 1: # mono site self.ip_mac = resources[resources.keys()[0]]['ip_mac'] self.kavlan = resources[resources.keys()[0]]['kavlan'] elif 'global' in resources: # multi site in a global kavlan self.ip_mac = resources['global']['ip_mac'] self.kavlan = resources['global']['kavlan'] self.kavlan_site = resources['global']['site'] else: # multi site in prod network self.ip_mac = { site: resource['ip_mac'] for site, resource in resources.iteritems() } if isinstance(self.ip_mac, list) and len(self.ip_mac) == 0: logger.error('No ip_range given in the resources') exit() elif isinstance(self.ip_mac, dict): for ip_mac in self.ip_mac.itervalues(): if len(ip_mac) == 0: logger.error('No ip_range given in the resources') exit()
def get_hosts_jobs(hosts, walltime, out_of_chart=False):
    """Find the first slot when the hosts are available and return a
    list of (OarSubmission, site) jobs_specs reserving exactly them.

    :param hosts: list of hosts (Host objects or host names)

    :param walltime: duration of reservation

    :param out_of_chart: if True, also consider out-of-charter slots

    Returns None when no common slot can be found.
    """
    hosts = map(lambda x: x.address if isinstance(x, Host) else x, hosts)
    planning = get_planning(elements=hosts, out_of_chart=out_of_chart)
    limits = _slots_limits(planning)
    walltime = get_seconds(walltime)
    # hoisted out of the triple-nested loop: the cluster list does not
    # change during the search
    g5k_clusters = get_g5k_clusters()
    for limit in limits:
        all_host_free = True
        for site_planning in planning.itervalues():
            for cluster, cluster_planning in site_planning.iteritems():
                if cluster in g5k_clusters:
                    for host_planning in cluster_planning.itervalues():
                        # a host is free if one of its free slots covers
                        # the whole interval [limit, limit + walltime]
                        host_free = any(
                            free_slot[0] <= limit and
                            free_slot[1] >= limit + walltime
                            for free_slot in host_planning['free'])
                        if not host_free:
                            all_host_free = False
        if all_host_free:
            startdate = limit
            break
    else:
        logger.error('Unable to find a slot for %s', hosts)
        return None
    jobs_specs = []
    for site in planning.keys():
        site_hosts = map(get_host_longname,
                         filter(lambda h: get_host_site(h) == site, hosts))
        if not site_hosts:
            # robustness: never emit an empty "nodes=0" submission for a
            # site that holds none of the requested hosts
            continue
        sub_res = "{host in ('" + "','".join(site_hosts) + \
            "')}/nodes=" + str(len(site_hosts))
        jobs_specs.append((OarSubmission(resources=sub_res,
                                         reservation_date=startdate), site))
    return jobs_specs
def get_hosts_jobs(hosts, walltime, out_of_chart=False):
    """Find the first slot when the hosts are available and return a list
    of jobs_specs

    :param hosts: list of hosts

    :param walltime: duration of reservation
    """
    hosts = map(lambda h: h.address if isinstance(h, Host) else h, hosts)
    planning = get_planning(elements=hosts, out_of_chart=out_of_chart)
    limits = _slots_limits(planning)
    walltime = get_seconds(walltime)
    for candidate in limits:
        every_host_free = True
        for site_plan in planning.itervalues():
            for cluster, cluster_plan in site_plan.iteritems():
                if cluster not in get_g5k_clusters():
                    continue
                for host_plan in cluster_plan.itervalues():
                    # does one free slot cover [candidate, candidate+walltime]?
                    covered = False
                    for slot in host_plan['free']:
                        if slot[0] <= candidate \
                                and slot[1] >= candidate + walltime:
                            covered = True
                    if not covered:
                        every_host_free = False
        if every_host_free:
            startdate = candidate
            break
    else:
        # loop exhausted without break: no common slot exists
        logger.error('Unable to find a slot for %s', hosts)
        return None
    jobs_specs = []
    for site in planning.keys():
        site_hosts = [get_host_longname(h) for h in hosts
                      if get_host_site(h) == site]
        sub_res = "{host in ('" + "','".join(site_hosts) + \
            "')}/nodes=" + str(len(site_hosts))
        jobs_specs.append((OarSubmission(resources=sub_res,
                                         reservation_date=startdate), site))
    return jobs_specs
def _get_ip_mac(self, resources):
    """Fill self.ip_mac (and kavlan attributes) from the reservation
    resources, then abort if any ip range is empty."""
    if len(resources.keys()) == 1:
        # single-site reservation
        only_site = resources.keys()[0]
        self.ip_mac = resources[only_site]['ip_mac']
        self.kavlan = resources[only_site]['kavlan']
    elif 'global' in resources:
        # several sites sharing a global kavlan
        glob = resources['global']
        self.ip_mac = glob['ip_mac']
        self.kavlan = glob['kavlan']
        self.kavlan_site = glob['site']
    else:
        # several sites on the production network
        self.ip_mac = dict((site, res['ip_mac'])
                           for site, res in resources.iteritems())
    # Abort when a range turns out to be empty
    if isinstance(self.ip_mac, list) and len(self.ip_mac) == 0:
        logger.error('No ip_range given in the resources')
        exit()
    elif isinstance(self.ip_mac, dict):
        for ranges in self.ip_mac.itervalues():
            if len(ranges) == 0:
                logger.error('No ip_range given in the resources')
                exit()
def _update_hosts_state(self, hosts_ok, hosts_ko):
    """Record the OK/KO state of hosts in the XML state tree, drop KO
    hosts from self.hosts and abort when no usable host remains."""
    for state_value, host_group in (('OK', hosts_ok), ('KO', hosts_ko)):
        for host in host_group:
            if not host:
                continue
            if isinstance(host, Host):
                # normalize Host objects to their address string
                host = host.address
            self.state.find(".//host/[@id='" + host + "']").set(
                'state', state_value)
            if state_value == 'KO':
                # a KO host is removed from the usable pool
                self.hosts.remove(host)
    if len(self.hosts) == 0:
        logger.error('No hosts available, because %s are KO',
                     hosts_list(hosts_ko))
        exit()
    if self.vms:
        # re-distribute the VMs on the remaining hosts
        distribute_vms(self.vms, self.hosts, self.distribution)
        self._set_vms_ip_mac()
def _update_hosts_state(self, hosts_ok, hosts_ko):
    """Mark hosts as OK/KO in the XML state tree, remove KO hosts from
    self.hosts and redistribute the VMs; exits when no host remains."""
    # Mark every reachable host as OK in the XML state tree.
    for host in hosts_ok:
        if host:
            if isinstance(host, Host):
                # Normalize Host objects to their address string
                host = host.address
            self.state.find(".//host/[@id='" + host + "']").set(
                'state', 'OK')
    # Mark unreachable hosts as KO and remove them from the usable pool.
    for host in hosts_ko:
        if host:
            if isinstance(host, Host):
                host = host.address
            self.state.find(".//host/[@id='" + host + "']").set(
                'state', 'KO')
            self.hosts.remove(host)
    # Without any usable host the deployment cannot proceed.
    if len(self.hosts) == 0:
        logger.error('No hosts available, because %s are KO',
                     hosts_list(hosts_ko))
        exit()
    # Re-distribute the VMs on the remaining hosts.
    if self.vms:
        distribute_vms(self.vms, self.hosts, self.distribution)
        self._set_vms_ip_mac()
def _check_xml_elements(self, xml, resources, strict=False):
    """Check that the sites/clusters/hosts described in the XML state
    match the reserved resources.

    :param xml: the XML element describing the state

    :param resources: the resources dict (kept for interface
        compatibility; not read directly by this method)

    :param strict: if True the host lists must match exactly, otherwise
        only the number of hosts per cluster is compared and the XML
        host ids are rewritten to the reserved host names

    Returns True when the XML is consistent with the resources.
    """
    def _hosts_per_cluster(host_list):
        # Count hosts by cluster, e.g. {'taurus': 3, 'hercule': 2}
        count = {}
        for host in host_list:
            cluster = get_host_cluster(host)
            count[cluster] = count.get(cluster, 0) + 1
        return count

    sites, clusters, hosts = self._get_xml_elements(xml)
    ok = True
    if not sites == self.sites:
        logger.error('List of sites from resources differs from infile' +
                     '\n resource %s \n infile %s', self.sites, sites)
        ok = False
    if not clusters == self.clusters:
        logger.error('List of clusters from resources differs from infile' +
                     '\n resource %s \n infile %s', self.clusters, clusters)
        ok = False
    if strict:
        if not hosts == self.hosts:
            logger.error('List of hosts from resources differs from infile' +
                         '\n resource %s \n infile %s', self.hosts, hosts)
            ok = False
    else:
        # Loose check: the same number of hosts per cluster is enough
        # (the two copy-pasted counting loops are now one helper)
        if not _hosts_per_cluster(self.hosts) == _hosts_per_cluster(hosts):
            logger.error('List of hosts from resources differs from infile' +
                         '\n resource %s \n infile %s', self.hosts, hosts)
            ok = False
        else:
            # Rename the XML hosts with the actually reserved host ids
            for i in range(len(hosts)):
                el_host = xml.find(".//host/[@id='" + hosts[i] + "']")
                el_host.attrib['id'] = self.hosts[i]
    return ok
def setup_aptcacher_server(hosts, base_dir='/tmp/apt-cacher-ng'):
    """Install and configure apt-cacher on one server

    :param hosts: the hosts on which apt-cacher-ng is installed

    :param base_dir: directory holding the cache and the logs
    """
    hosts = map(Host, hosts)
    logger.info('Installing apt-cacher on %s',
                ','.join([style.host(host.address) for host in hosts]))
    logger.detail('Package')
    package = TaktukRemote(
        'export DEBIAN_MASTER=noninteractive ; apt-get update ; ' +
        'apt-get install -o Dpkg::Options::="--force-confdef" -o ' +
        'Dpkg::Options::="--force-confnew" -y apt-cacher-ng',
        hosts).run()
    if not package.ok:
        # bug fix: the original message left the %s placeholder unfilled
        logger.error('Unable to install apt-cacher-ng on %s',
                     ','.join([host.address for host in hosts]))
        return
    logger.detail('Directory creation')
    log_dir = base_dir + '/log'
    cache_dir = base_dir + '/cache'
    mkdirs = TaktukRemote(
        'mkdir -p ' + log_dir +
        '; mkdir -p ' + cache_dir +
        '; chown -R apt-cacher-ng:apt-cacher-ng ' + base_dir,
        hosts).run()
    if not mkdirs.ok:
        logger.error('Unable to create the directories')
        return
    # Point apt-cacher-ng at our directories, change its port and proxy,
    # then restart the service.
    cmd = 'sed -i "s#/var/cache/apt-cacher-ng#' + cache_dir + \
        '#g" /etc/apt-cacher-ng/acng.conf ;' + \
        'sed -i "s#/var/log/apt-cacher-ng#' + log_dir + '#g" ' + \
        '/etc/apt-cacher-ng/acng.conf ;' + \
        'sed -i "s/3142/9999/g" /etc/apt-cacher-ng/acng.conf ; ' + \
        'sed -i "s?#Proxy: http://www-proxy.example.net:80?Proxy: ' + \
        'http://proxy:3128?g" /etc/apt-cacher-ng/acng.conf ; ' + \
        'service apt-cacher-ng restart'
    configure = TaktukRemote(cmd, hosts).run()
    if not configure.ok:
        logger.error('Unable to configure and restart the service')
        return
    logger.info('apt-cacher-ng up and running on %s',
                ','.join([style.host(host.address) for host in hosts]))
def setup_aptcacher_server(hosts, base_dir='/tmp/apt-cacher-ng'):
    """Install and configure apt-cacher on one server

    :param hosts: the hosts on which apt-cacher-ng is installed

    :param base_dir: directory holding the cache and the logs
    """
    hosts = map(Host, hosts)
    logger.info('Installing apt-cacher on %s',
                ','.join([style.host(host.address) for host in hosts]))
    logger.detail('Package')
    package = TaktukRemote(
        'export DEBIAN_MASTER=noninteractive ; apt-get update ; ' +
        'apt-get install -o Dpkg::Options::="--force-confdef" -o ' +
        'Dpkg::Options::="--force-confnew" -y apt-cacher-ng',
        hosts).run()
    if not package.ok:
        # bug fix: the original message left the %s placeholder unfilled
        logger.error('Unable to install apt-cacher-ng on %s',
                     ','.join([host.address for host in hosts]))
        return
    logger.detail('Directory creation')
    log_dir = base_dir + '/log'
    cache_dir = base_dir + '/cache'
    mkdirs = TaktukRemote(
        'mkdir -p ' + log_dir +
        '; mkdir -p ' + cache_dir +
        '; chown -R apt-cacher-ng:apt-cacher-ng ' + base_dir,
        hosts).run()
    if not mkdirs.ok:
        logger.error('Unable to create the directories')
        return
    # Point apt-cacher-ng at our directories, change its port and proxy,
    # then restart the service.
    cmd = 'sed -i "s#/var/cache/apt-cacher-ng#' + cache_dir + \
        '#g" /etc/apt-cacher-ng/acng.conf ;' + \
        'sed -i "s#/var/log/apt-cacher-ng#' + log_dir + '#g" ' + \
        '/etc/apt-cacher-ng/acng.conf ;' + \
        'sed -i "s/3142/9999/g" /etc/apt-cacher-ng/acng.conf ; ' + \
        'sed -i "s?#Proxy: http://www-proxy.example.net:80?Proxy: ' + \
        'http://proxy:3128?g" /etc/apt-cacher-ng/acng.conf ; ' + \
        'service apt-cacher-ng restart'
    configure = TaktukRemote(cmd, hosts).run()
    if not configure.ok:
        logger.error('Unable to configure and restart the service')
        return
    logger.info('apt-cacher-ng up and running on %s',
                ','.join([style.host(host.address) for host in hosts]))
def main():
    """Configure and launch the DHT injector on the service node, then
    collect the failure counts from its CSV log."""
    args = parser.parse_args()
    login = ''
    try:
        login = os.getlogin()
    except OSError:
        login = args.user  # TODO make it more robust, if arg.user is empty
    # Retrieve the right number of lines
    try:
        nodesFile = open(args.nodes_address_file)
        nodesInfos = [next(nodesFile) for x in range(args.nbNodes)]
    except IOError as e:
        # bug fix: the original concatenated string applied .format only
        # to the last literal, so {0}/errno was never substituted
        logger.error("I/O error(%s) on %s: %s",
                     e.errno, args.nodes_address_file, e.strerror)
        sys.exit()
    if len(nodesInfos) < int(args.nbNodes):
        logger.error("There is no enough addresses in the file (%d requested/%d available)"
                     % (args.nbNodes, len(nodesInfos)))
        sys.exit()
    hosts = [s.strip().split(':')[0] for s in nodesInfos]
    service_node = str(args.service_node)
    logger.info("Killing injector processes in service node %s" % service_node)
    cmd = 'pkill -9 -f dhtinjector.jar ; rm -rf ~/SLOTH-EXP-TMP/INJECTOR_HOME/dhtinjector-log-* ;'
    launch_sloths = Remote(cmd, service_node,
                           connection_params={'user': login}).run()
    logger.info("Putting node addresses file %s into service node %s"
                % (args.nodes_address_file, service_node))
    cp = TaktukPut(service_node, [str(args.nodes_address_file)],
                   remote_location=str(args.nodes_address_file)).run()
    # Per-experiment file names
    injectorLogFileBase = 'injectorLog_' + str(args.experimentId) + '_' + str(args.dataMode)
    injectorLogFile = injectorLogFileBase + '.csv'
    checkFile = 'summary_' + str(args.experimentId) + '_' + str(args.dataMode) + '.log'
    dhtLogFile = 'dhtinjector_log_' + str(args.experimentId) + '_' + str(args.dataMode) + '.log'
    failuresFile = 'failures_' + str(args.experimentId) + '_' + str(args.dataMode) + '.log'
    #
    # Mandatory replacements in injector.properties file
    #
    cmdLines = ['cd ~/SLOTH-EXP-TMP/INJECTOR_HOME/.'
                , 'sed "s/peers.number.*/peers.number = ' + str(args.nbNodes) + '/g" ./config/injector.properties.template > /tmp/injector.properties'
                , 'cp /tmp/injector.properties ./config/injector.properties'
                , 'sed "s/injection.mode.*/injection.mode = in_vivo/g" ./config/injector.properties > /tmp/injector.properties'
                , 'cp /tmp/injector.properties ./config/injector.properties'
                , 'sed "s:dht.peersaddress.*:dht.peersaddress = "' + args.nodes_address_file + '":g" ./config/injector.properties > /tmp/injector.properties'
                , 'cp /tmp/injector.properties ./config/injector.properties'
                ]
    #
    # Optional replacements in injector.properties file, according to
    # command-line arguments. The nine identical copy-pasted stanzas of
    # the original are now driven by one (value, property-name) table.
    #
    optional_props = [
        (args.duration, 'injector.duration'),
        (args.nbObjects, 'objects.number'),
        (args.objectMaxSize, 'object.maxsize'),
        (args.getPeriod, 'injector.getperiod'),
        (args.putPeriod, 'injector.putperiod'),
        (args.removalPeriod, 'injector.removalperiod'),
        (args.crashPeriod, 'injector.crashperiod'),
        (args.removalDuration, 'injector.removalduration'),
        (args.crashDuration, 'injector.crashduration'),
    ]
    for value, prop in optional_props:
        if value is not None:
            cmdLines = cmdLines + [
                'sed "s/' + prop + '.*/' + prop + ' = ' + str(value) +
                '/g" ./config/injector.properties > /tmp/injector.properties'
                , 'cp /tmp/injector.properties ./config/injector.properties']
    #
    # Command to actually run the injector, loading the
    # ./config/injector.properties file
    #
    cmdLines = cmdLines + ['java -DdataMode=' + str(args.dataMode) +
                           ' -DexperimentId=' + str(args.experimentId) +
                           ' -jar target/scala-2.10/dhtinjector.jar 2>&1 > ' +
                           dhtLogFile + ' 0<&- 2>&-']
    cmd = ";".join(cmdLines)
    logger.info("%s/executing command %s" % (service_node, "\n".join(cmdLines)))
    launch_sloths = Remote(cmd, service_node,
                           connection_params={'user': login}).run()
    #
    # Execute remote commands to obtain total count of failures
    #
    cmdLines = ['cd ~/SLOTH-EXP-TMP/INJECTOR_HOME/.'
                , 'mv ./injectorLog.csv ' + injectorLogFile + ' 2>&1 > /tmp/errorFile'
                , './querycsv.py -i ' + injectorLogFile + ' -o ' + failuresFile + ' "SELECT * FROM ' + injectorLogFileBase + ' WHERE status == \\\"FAILURE\\\""'
                # Grab last lines from dhtLogFile with the summary data
                , 'tail -n 23 ' + dhtLogFile + ' > ' + checkFile
                , './querycsv.py -i ' + injectorLogFile + " 'SELECT COUNT(*) AS total_failures FROM " + injectorLogFileBase + " WHERE status == \"FAILURE\"' >>" + checkFile
                , './querycsv.py -i ' + injectorLogFile + " 'SELECT COUNT(*) AS get_failures FROM " + injectorLogFileBase + " WHERE status == \"FAILURE\" and operation == \"Get()\"' >> " + checkFile
                , './querycsv.py -i ' + injectorLogFile + " 'SELECT COUNT(*) AS put_failures FROM " + injectorLogFileBase + " WHERE status == \"FAILURE\" and operation == \"Put()\"' >> " + checkFile
                ]
    cmd = ";".join(cmdLines)
    logger.info("%s/executing command %s" % (service_node, "\n" + "\n".join(cmdLines)))
    launch_sloths = Remote(cmd, service_node,
                           connection_params={'user': login}).run()
    logger.info("The injector has been launched.")
def wait_vms_have_started(vms, restart=True): """Scan port 22 on all vms, distributed on hosts""" # Creating file with list of VMs ip fd, tmpfile = tempfile.mkstemp(prefix='vmips') f = fdopen(fd, 'w') for vm in vms: f.write(vm['ip'] + '\n') f.close() # getting the list of host hosts = list(set([vm['host'] for vm in vms])) hosts.sort() # Pushing file on all hosts TaktukPut(hosts, [tmpfile]).run() logger.debug(pformat(hosts)) # Splitting nmap scan n_vm_scan = ceil(len(vms) / len(hosts)) + 1 cmds = [] for i in range(len(hosts)): start = str(int(i * n_vm_scan)) end = str(int((i + 1) * n_vm_scan)) cmds.append("awk 'NR>=" + start + " && NR<" + end + "' " + tmpfile.split('/')[-1] + " > nmap_file ; " + "nmap -v -oG - -i nmap_file -p 22") logger.debug('%s', pformat(cmds)) nmap = TaktukRemote('{{cmds}}', hosts) nmap_tries = 0 all_up = False started_vms = [] old_started = started_vms[:] while (not all_up) and nmap_tries < 10: sleep(15) logger.detail('nmap_tries %s', nmap_tries) nmap.run() for p in nmap.processes: for line in p.stdout.split('\n'): if 'Status' in line: split_line = line.split(' ') ip = split_line[1] state = split_line[3].strip() if state == 'Up': vm = [vm for vm in vms if vm['ip'] == ip] if len(vm) > 0: vm[0]['state'] = 'OK' started_vms = [vm for vm in vms if vm['state'] == 'OK'] all_up = len(started_vms) == len(vms) if started_vms != old_started: old_started = started_vms else: if restart: restart_vms([vm for vm in vms if vm['state'] == 'KO']) nmap_tries += 1 if nmap_tries == 1: activate_vms([vm for vm in vms if vm['state'] == 'KO']) if not all_up: logger.info(str(nmap_tries) + ': ' + str(len(started_vms)) + '/' + str(len(vms))) nmap.reset() TaktukRemote('rm ' + tmpfile.split('/')[-1], hosts).run() Process('rm ' + tmpfile).run() if all_up: logger.info('All VM have been started') return True else: logger.error('All VM have not been started') return False
def wait_vms_have_started(vms, restart=True): """Scan port 22 on all vms, distributed on hosts""" # Creating file with list of VMs ip fd, tmpfile = tempfile.mkstemp(prefix='vmips') f = fdopen(fd, 'w') for vm in vms: f.write(vm['ip'] + '\n') f.close() # getting the list of host hosts = list(set([vm['host'] for vm in vms])) hosts.sort() # Pushing file on all hosts TaktukPut(hosts, [tmpfile]).run() logger.debug(pformat(hosts)) # Splitting nmap scan n_vm_scan = ceil(len(vms) / len(hosts)) + 1 cmds = [] for i in range(len(hosts)): start = str(int(i * n_vm_scan)) end = str(int((i + 1) * n_vm_scan)) cmds.append("awk 'NR>=" + start + " && NR<" + end + "' " + tmpfile.split('/')[-1] + " > nmap_file ; " + "nmap -v -oG - -i nmap_file -p 22") logger.debug('%s', pformat(cmds)) nmap = TaktukRemote('{{cmds}}', hosts) nmap_tries = 0 all_up = False started_vms = [] old_started = started_vms[:] while (not all_up) and nmap_tries < 10: sleep(15) logger.detail('nmap_tries %s', nmap_tries) nmap.run() for p in nmap.processes: for line in p.stdout.split('\n'): if 'Status' in line: split_line = line.split(' ') ip = split_line[1] state = split_line[3].strip() if state == 'Up': vm = [vm for vm in vms if vm['ip'] == ip] if len(vm) > 0: vm[0]['state'] = 'OK' started_vms = [vm for vm in vms if vm['state'] == 'OK'] all_up = len(started_vms) == len(vms) if started_vms != old_started: old_started = started_vms else: if restart: restart_vms([vm for vm in vms if vm['state'] == 'KO']) nmap_tries += 1 if nmap_tries == 1: activate_vms([vm for vm in vms if vm['state'] == 'KO']) if not all_up: logger.info( str(nmap_tries) + ': ' + str(len(started_vms)) + '/' + str(len(vms))) nmap.reset() TaktukRemote('rm ' + tmpfile.split('/')[-1], hosts).run() Process('rm ' + tmpfile).run() if all_up: logger.info('All VM have been started') return True else: logger.error('All VM have not been started') return False
def distribute_vms(vms, hosts, distribution='round-robin'):
    """Distribute the virtual machines on the hosts.

    :param vms: a list of VMs dicts which host key will be updated

    :param hosts: a list of hosts

    :param distribution: a string defining the distribution type:
     'round-robin', 'concentrated', 'n_by_hosts', 'random'
    """
    logger.debug('Initial virtual machines distribution \n%s',
                 "\n".join([vm['id'] + ": " + str(vm['host']) for vm in vms]))
    if distribution in ['round-robin', 'concentrated', 'random']:
        # attr maps host -> available resources (at least 'RAM' and 'CPU')
        attr = get_CPU_RAM_FLOPS(hosts)
        dist_hosts = hosts[:]
        iter_hosts = cycle(dist_hosts)
        host = iter_hosts.next()
        for vm in vms:
            remaining = attr[host].copy()
            # Skip hosts that cannot hold this VM: a VM consumes
            # vm['mem'] MB of RAM and n_cpu/3 CPU units
            while remaining['RAM'] - vm['mem'] <= 0 \
                    or remaining['CPU'] - vm['n_cpu'] / 3 <= 0:
                # Full host: withdraw it from the distribution pool
                dist_hosts.remove(host)
                if len(dist_hosts) == 0:
                    # No host can accept the VM: report totals and abort
                    req_mem = sum([vm['mem'] for vm in vms])
                    req_cpu = sum([vm['n_cpu'] for vm in vms]) / 3
                    logger.error('Not enough ressources ! \n' + 'RAM'.rjust(20)
                                 + 'CPU'.rjust(10) + '\n' + 'Needed'.ljust(15)
                                 + '%s Mb'.ljust(15) + '%s \n'
                                 + 'Available'.ljust(15) + '%s Mb'.ljust(15)
                                 + '%s \n' + 'Maximum number of VM is %s',
                                 req_mem, req_cpu, attr['TOTAL']['RAM'],
                                 attr['TOTAL']['CPU'],
                                 style.emph(str(get_max_vms(hosts, vm['mem']))))
                    exit()
                # Restart the cycle on the reduced pool
                iter_hosts = cycle(dist_hosts)
                host = iter_hosts.next()
                remaining = attr[host].copy()
            vm['host'] = host
            # Book the resources on the chosen host
            remaining['RAM'] -= vm['mem']
            remaining['CPU'] -= vm['n_cpu'] / 3
            attr[host] = remaining.copy()
            if distribution == 'round-robin':
                host = iter_hosts.next()
                remaining = attr[host].copy()
            if distribution == 'random':
                # Advance the cycle a random number of steps
                for i in range(randint(0, len(dist_hosts))):
                    host = iter_hosts.next()
                    remaining = attr[host].copy()
    elif distribution == 'n_by_hosts':
        # Fixed number of VMs per host; excess VMs are dropped below
        n_by_host = int(len(vms) / len(hosts))
        i_vm = 0
        for host in hosts:
            for i in range(n_by_host):
                vms[i_vm]['host'] = host
                i_vm += 1
        if len(vms) % len(hosts) != 0:
            logger.warning('Reducing number of VMs to have %s by host',
                           style.emph(n_by_host))
            vms[:] = vms[0:n_by_host * len(hosts)]
    else:
        logger.debug('No valid distribution given')
    logger.debug('Final virtual machines distribution \n%s',
                 "\n".join([vm['id'] + ": " + str(vm['host']) for vm in vms]))
# Top-level driver: parse the requested resources and compute the
# possible reservation slots with a Planning object.
logger.info('Program: %s', set_style(prog, 'emph'))
if options.plots:
    if 'grid5000.fr' in getfqdn():
        # Plotting is not available on the frontends
        options.plots = False
        logger.warning('Plots are disabled on Grid5000 frontend until the migration to Wheezy')
# Build {element_uid: n_nodes} from the "elem1:n,elem2:n,..." option
resources = {}
for element in options.resources.split(','):
    if ':' in element:
        element_uid, n_nodes = element.split(':')
    elif options.mode != 'free':
        # without an explicit count, 0 means "all nodes" outside free mode
        element_uid, n_nodes = element, 0
    else:
        logger.error('You must specify the number of host element:n_nodes when using free mode')
        exit()
    resources[element_uid] = int(n_nodes)
planning = Planning(resources, oar_date_to_unixts(options.startdate),
                    oar_date_to_unixts(options.enddate),
                    options.kavlan_global)
planning.compute()
if options.plots:
    draw_gantt(planning.planning)
planning.compute_slots(options.walltime)
def main():
    """Launch the Sloth DHT peers on the hosts listed in the nodes
    address file, delaying each peer according to the hash of its id."""
    args = parser.parse_args()
    otherFlags = "--known-peers-file " + args.nodes_address_file
    if args.no_stabilization:
        otherFlags += " --no-stabilization"
    login = str(os.getlogin())
    logger.info("Running Experiment: %d with user %s" % (args.experimentId, login))
    # Retrieve the right number of lines
    try:
        nodesFile = open(args.nodes_address_file)
        nodesInfos = [next(nodesFile) for x in range(args.nbNodes)]
        nodesFile.close()
    except IOError as e:
        # NOTE(review): .format only binds to the last string literal
        # here, so {0}/errno is never substituted — likely a bug
        logger.error("I/O error({0}) on " + args.nodes_address_file + \
                     ": {1}".format(e.errno, e.strerror))
        sys.exit(1)
    if len(nodesInfos) != args.nbNodes:
        logger.error("There is no enough addresses in the file")
        sys.exit(1)
    # Each line is host:akkaport:httpport
    hosts = [s.strip().split(':')[0] for s in nodesInfos]
    akkaports = [s.strip().split(':')[1] for s in nodesInfos]
    httpports = [s.strip().split(':')[2] for s in nodesInfos]
    flags = ['-fd'] * args.nbNodes
    #@ Build delay according to the peer ID
    logger.info("Constructing hashes of peer ids")
    hp_shas = [((h + ':' + p), int(hashlib.sha1(h + ':' + p).hexdigest(), 16))
               for (h, p) in zip(hosts, akkaports)]
    hp_shas.sort(key=lambda t: t[1])
    logger.info("Creating sorted peers list and corresponding delays")
    # Each peer starts 0.5s after the previous one in sha order
    sorted_peers = [h for (h, sha) in hp_shas]
    delays = [sorted_peers.index(h + ':' + p) * .5
              for (h, p) in zip(hosts, akkaports)]
    logger.info("Setting -ifd flags for initial peer")
    # The peer with delay 0 bootstraps the ring
    index = delays.index(0)
    flags[index] = '-ifd'
    ## Overwrite the nodes address file
    logger.info("Overwriting nodes address file, now with sorted peers")
    nodesFile = open(args.nodes_address_file, 'w')
    nhosts = [s.strip().split(':')[0] for s in sorted_peers]
    nakkaports = [s.strip().split(':')[1] for s in sorted_peers]
    for (h, p) in zip(nhosts, nakkaports):
        # http port convention: akka port + 5000
        nodesFile.write("%s:%s:%d\n" % (h, p, int(p) + 5000))
    nodesFile.close()
    logger.info("Initial peer: %s:%s with flags %s"
                % (hosts[index], httpports[index], flags[index]))
    # Copy the known address file
    filtered_hosts = list(set(hosts))
    logger.info("Putting addresses file %s into hosts %s"
                % (args.nodes_address_file, filtered_hosts))
    cp = TaktukPut(filtered_hosts, [str(args.nodes_address_file)],
                   remote_location=str(args.nodes_address_file)).run()
    rm_tmp_cmd = '; '.join([
        ## 'rm -rf /tmp/sloth'
        ## , 'mkdir -p /tmp/sloth/%d' % args.experimentId
        'mkdir -p /tmp/sloth/%d' % args.experimentId
    ])
    logger.info("Recreating /tmp/sloth/%d folder on hosts %s"
                % (args.experimentId, filtered_hosts))
    TaktukRemote(rm_tmp_cmd, filtered_hosts,
                 connection_params={'user': login}).run()
    # {{...}} placeholders are substituted per host by execo
    startNodeCmd = ' '.join([
        './startNode.sh ' + args.dataMode + ' {{akkaports}}'
        , str(args.experimentId) + ' --mode ' + args.dataMode
        , '--port {{akkaports}}'
        , '--http-port {{httpports}} {{flags}} ' + otherFlags
        , '2>&1 > /tmp/sloth/' + str(args.experimentId) +
          '/sloth_launcher_{{akkaports}}_' + args.dataMode +
          '.log 0<&- 2>&- &'
    ])
    cmd = '; '.join([
        'cd ~/SLOTH-EXP-TMP/SLOTH_HOME'
        , 'sleep {{delays}}'
        , startNodeCmd
    ])
    logger.info("Launching peers with command: %s" % cmd)
    # Keep a local trace of the exact per-host commands
    remoteCmdsFile = open("remote_cmds_%d.info" % args.experimentId, 'w')
    logger.info("Writing peer-specific commands into %s" % remoteCmdsFile)
    for h in hosts:
        remoteCmdsFile.write(remote_substitute(cmd, [Host(h) for h in hosts],
                                               hosts.index(h),
                                               (globals(), locals())))
        remoteCmdsFile.write("\n")
    remoteCmdsFile.close()
    logger.info("Launching peers... this may take a while ...")
    launch_sloths = TaktukRemote(cmd, hosts,
                                 connection_params={'user': login}).run()
    # Count the peers that started successfully
    p_nb = 0;
    for peer in launch_sloths.processes:
        if not peer.ok:
            logger.error(peer.host)
            logger.error(peer.stdout)
            logger.error(peer.stderr)
        else:
            p_nb = p_nb + 1
    logger.info("%d Peers have been launched" % (p_nb))
    if p_nb != args.nbNodes:
        logger.error("Unfortunately you requested %d peers, so bye bye (you can try to relaunch it with %d peers, it can run ...)" % (args.nbNodes, p_nb))
        sys.exit(1)
def get_planning(elements=['grid5000'], vlan=False, subnet=False,
                 storage=False, out_of_chart=False, starttime=None,
                 endtime=None, ignore_besteffort=True, queues='default'):
    """Retrieve the planning of the elements (site, cluster) and others
    resources. Element planning structure is
    ``{'busy': [(123456,123457), ... ], 'free': [(123457,123460), ... ]}.``

    :param elements: a list of Grid'5000 elements ('grid5000', <site>,
      <cluster>)

    :param vlan: a boolean to ask for KaVLAN computation

    :param subnet: a boolean to ask for subnets computation

    :param storage: a boolean to ask for sorage computation

    :param out_of_chart: if True, consider that days outside weekends
      are busy

    :param starttime: start of time period for which to compute the
      planning, defaults to now + 1 minute

    :param endtime: end of time period for which to compute the planning,
      defaults to 4 weeks from now

    :param ignore_besteffort: True by default, to consider the resources
      with besteffort jobs as available

    :param queues: list of oar queues for which to get the planning

    Return a dict whose keys are sites, whose values are dict whose keys
    are cluster, subnets, kavlan or storage, whose values are planning
    dicts, whose keys are hosts, subnet address range, vlan number or
    chunk id planning respectively.
    """
    # NOTE: the mutable default for elements is safe here: it is only
    # read or rebound, never mutated.
    # Default window: [now + 1 minute, now + 4 weeks]
    if not starttime:
        starttime = int(time() + timedelta_to_seconds(timedelta(minutes=1)))
    starttime = int(get_unixts(starttime))
    if not endtime:
        endtime = int(starttime +
                      timedelta_to_seconds(timedelta(weeks=4, minutes=1)))
    endtime = int(get_unixts(endtime))
    # Resolve the sites covered by the requested elements
    if 'grid5000' in elements:
        sites = elements = get_g5k_sites()
    else:
        sites = list(
            set([site for site in elements if site in get_g5k_sites()] + [
                get_cluster_site(cluster) for cluster in elements
                if cluster in get_g5k_clusters(queues=queues)
            ] + [
                get_host_site(host) for host in elements
                if host in get_g5k_hosts()
                or get_host_shortname(host) in get_g5k_hosts()
            ]))
    if len(sites) == 0:
        logger.error('Wrong elements given: %s' % (elements,))
        return None
    # Skeleton: planning[site][cluster][host] filled by the retrievers
    planning = {}
    for site in sites:
        planning[site] = {}
        for cluster in get_site_clusters(site, queues=queues):
            planning[site][cluster] = {}
    for site in sites:
        if vlan:
            planning[site].update({'vlans': {}})
        if subnet:
            planning[site].update({'subnets': {}})
        if storage:
            planning[site].update({'storage': {}})
    # Fill the busy periods from the configured backend
    if _retrieve_method == 'API':
        _get_planning_API(planning, ignore_besteffort)
    elif _retrieve_method == 'PostgreSQL':
        _get_planning_PGSQL(planning, ignore_besteffort)
    if out_of_chart:
        _add_charter_to_planning(planning, starttime, endtime)
    # Normalize each element planning: sorted, merged, truncated busy
    # periods, then deduce the free periods
    for site_pl in planning.values():
        for res_pl in site_pl.values():
            for el_planning in res_pl.values():
                el_planning['busy'].sort()
                _merge_el_planning(el_planning['busy'])
                _trunc_el_planning(el_planning['busy'], starttime, endtime)
                _fill_el_planning_free(el_planning, starttime, endtime)
    # cleaning: drop hosts/clusters that were not requested (iterate the
    # original dict, delete from the deep copy)
    real_planning = deepcopy(planning)
    for site, site_pl in planning.items():
        for cl, cl_pl in site_pl.items():
            if cl in ['vlans']:
                continue
            keep_cluster = False
            for h in cl_pl:
                if not (get_host_site(h) in elements or
                        get_host_cluster(h) in elements or
                        get_host_shortname(h) in elements or
                        h in elements):
                    del real_planning[site][cl][h]
                else:
                    keep_cluster = True
            if not keep_cluster:
                del real_planning[site][cl]
    return real_planning
def distribute_vms(vms, hosts, distribution='round-robin'):
    """Distribute the virtual machines on the hosts.

    :param vms: a list of VMs dicts which host key will be updated

    :param hosts: a list of hosts

    :param distribution: a string defining the distribution type:
     'round-robin', 'concentrated', 'n_by_hosts', 'random'
    """
    logger.debug('Initial virtual machines distribution \n%s',
                 "\n".join([vm['id'] + ": " + str(vm['host']) for vm in vms]))
    if distribution in ['round-robin', 'concentrated', 'random']:
        # attr maps host -> available resources (at least 'RAM' and 'CPU')
        attr = get_CPU_RAM_FLOPS(hosts)
        dist_hosts = hosts[:]
        iter_hosts = cycle(dist_hosts)
        host = iter_hosts.next()
        for vm in vms:
            remaining = attr[host].copy()
            # Skip hosts that cannot hold this VM: a VM consumes
            # vm['mem'] MB of RAM and n_cpu/3 CPU units
            while remaining['RAM'] - vm['mem'] <= 0 \
                    or remaining['CPU'] - vm['n_cpu'] / 3 <= 0:
                # Full host: withdraw it from the distribution pool
                dist_hosts.remove(host)
                if len(dist_hosts) == 0:
                    # No host can accept the VM: report totals and abort
                    req_mem = sum([vm['mem'] for vm in vms])
                    req_cpu = sum([vm['n_cpu'] for vm in vms]) / 3
                    logger.error('Not enough ressources ! \n' + 'RAM'.rjust(20)
                                 + 'CPU'.rjust(10) + '\n' + 'Needed'.ljust(15)
                                 + '%s Mb'.ljust(15) + '%s \n'
                                 + 'Available'.ljust(15) + '%s Mb'.ljust(15)
                                 + '%s \n' + 'Maximum number of VM is %s',
                                 req_mem, req_cpu, attr['TOTAL']['RAM'],
                                 attr['TOTAL']['CPU'],
                                 style.emph(str(get_max_vms(hosts, vm['mem']))))
                    exit()
                # Restart the cycle on the reduced pool
                iter_hosts = cycle(dist_hosts)
                host = iter_hosts.next()
                remaining = attr[host].copy()
            vm['host'] = host
            # Book the resources on the chosen host
            remaining['RAM'] -= vm['mem']
            remaining['CPU'] -= vm['n_cpu'] / 3
            attr[host] = remaining.copy()
            if distribution == 'round-robin':
                host = iter_hosts.next()
                remaining = attr[host].copy()
            if distribution == 'random':
                # Advance the cycle a random number of steps
                for i in range(randint(0, len(dist_hosts))):
                    host = iter_hosts.next()
                    remaining = attr[host].copy()
    elif distribution == 'n_by_hosts':
        # Fixed number of VMs per host; excess VMs are dropped below
        n_by_host = int(len(vms) / len(hosts))
        i_vm = 0
        for host in hosts:
            for i in range(n_by_host):
                vms[i_vm]['host'] = host
                i_vm += 1
        if len(vms) % len(hosts) != 0:
            logger.warning('Reducing number of VMs to have %s by host',
                           style.emph(n_by_host))
            vms[:] = vms[0:n_by_host * len(hosts)]
    else:
        logger.debug('No valid distribution given')
    logger.debug('Final virtual machines distribution \n%s',
                 "\n".join([vm['id'] + ": " + str(vm['host']) for vm in vms]))
def add_equip(self, equip, site):
    """Add a network equipment of `site` to the graph.

    Looks the equipment up in the cached API data (`self.data['network']`),
    adds a node for it if not already present, then walks every
    linecard/port to add the edges towards hosts, other equipment and
    renater nodes.  Recurses into neighbouring switches/routers via
    `self.add_equip(uid, site)`.
    """
    # Equipment must be described in the API data, otherwise skip it.
    if equip not in self.data['network'][site]:
        logger.warn('Equipment %s not described in API' % (equip, ))
        return
    data = self.data['network'][site][equip]
    # Only recurse into neighbours the first time this equipment is seen;
    # when the node already exists we only (re-)add edges.
    if self.has_node(equip):
        recurse = False
    else:
        logger.debug('Adding equipment %s', equip)
        self.add_node(equip, kind=data['kind'],
                      backplane=data['backplane_bps'])
        recurse = True
    lc_data = data['linecards']
    # With several linecards, each linecard becomes an intermediate node
    # between the equipment and its peers.
    multiple_linecards = self._equip_uses_multiple_linecards(equip, site)
    equip_bw = data['backplane_bps']
    for i_lc, lc in enumerate(lc_data):
        lc_node = _get_linecard_name(equip, i_lc)
        if 'ports' in lc:
            for i_port, port in enumerate(lc['ports']):
                if 'uid' in port:
                    uid = _parse_port_uid(port['uid'])
                    if not self._is_in_api(site, uid):
                        do_once(
                            (site, uid), logger.warn,
                            'unable to get kind of %s in %s, is it in g5k api?'
                            % (uid, site))
                        continue
                    # Determine the kind of the peer: the port description
                    # takes precedence, cross-checked against the API node
                    # kind (kind2).
                    kind = port.get('kind')
                    kind2 = self._get_node_kind(site, uid)
                    if not kind:
                        kind = kind2
                        if kind != 'node':
                            do_once(
                                (equip, i_lc, i_port), logger.warn,
                                'missing kind in port %s:%s %s of %s, using %s from %s'
                                % (i_lc, i_port, port, equip, kind, uid))
                    elif not kind2:
                        logger.warn('missing kind in %s' % (uid, ))
                    elif kind != kind2:
                        # On mismatch, the API node kind (kind2) wins.
                        logger.warn(
                            'mismatching kind %s in port %s:%s %s of %s and kind %s from %s. Using %s'
                            % (kind, i_lc, i_port, port, equip, kind2, uid,
                               kind2))
                        kind = kind2
                    if not kind:
                        logger.error('unable to find kind of %s' % (uid, ))
                    # Port rate overrides the linecard rate when present.
                    port_bw = lc['rate'] if 'rate' not in port else port[
                        'rate']
                    if kind == 'virtual':
                        # in this situation, we don't know what
                        # kind is the target equipment, we need to
                        # discover it
                        if uid in self.data['network'][site]:
                            pass
                        elif uid in self.data['hosts']:
                            kind = 'virtual-node'
                            logger.warn(
                                'virtual link from %s(%s:%s %s) to node %s'
                                % (equip, i_lc, i_port, port, uid))
                        else:
                            pass
                    if self.has_node(uid):
                        # Peer node already present: only add the edges.
                        if kind in ['node', 'virtual-node']:
                            # Connect every adapter of the host that is
                            # plugged on this equipment; edge activity
                            # follows the adapter 'mounted' state.
                            for e in self.get_host_adapters(uid):
                                if e['switch'] == equip:
                                    if multiple_linecards:
                                        # equip -> linecard -> host
                                        self._checked_add_linecard(
                                            lc_node,
                                            lc.get('backplane_bps',
                                                   data['backplane_bps']))
                                        self._checked_add_edge(
                                            equip, lc_node,
                                            _unique_link_key(
                                                equip, lc_node),
                                            bandwidth=equip_bw,
                                            active=True)
                                        self._checked_add_edge(
                                            lc_node, uid,
                                            _unique_link_key(
                                                lc_node,
                                                uid + '-' + e['device']),
                                            bandwidth=port_bw,
                                            active=e['mounted'])
                                    else:
                                        # Direct equip -> host edge, capped
                                        # by the equipment backplane.
                                        self._checked_add_edge(
                                            equip, uid,
                                            _unique_link_key(
                                                equip,
                                                uid + '-' + e['device']),
                                            bandwidth=min(
                                                port_bw, equip_bw),
                                            active=e['mounted'])
                        elif kind in ['switch', 'router'] and recurse:
                            if multiple_linecards:
                                # equip -> linecard, then linecard -> peer
                                # (or the peer's own target linecard).
                                self._checked_add_linecard(
                                    lc_node,
                                    lc.get('backplane_bps',
                                           data['backplane_bps']))
                                self._checked_add_edge(equip, lc_node,
                                                       _unique_link_key(
                                                           equip, lc_node),
                                                       bandwidth=equip_bw,
                                                       active=True)
                                target_lc, target_port = self._get_target_lc_and_port(
                                    equip, i_lc, i_port, site)
                                if not target_lc is None:
                                    if self._equip_uses_multiple_linecards(
                                            uid, site):
                                        self._checked_add_edge(
                                            lc_node,
                                            _get_linecard_name(
                                                uid, target_lc),
                                            _unique_link_key(
                                                lc_node,
                                                _get_linecard_name(
                                                    uid, target_lc)),
                                            bandwidth=port_bw,
                                            active=True)
                                    else:
                                        self._checked_add_edge(
                                            lc_node, uid,
                                            _unique_link_key(lc_node, uid),
                                            bandwidth=port_bw,
                                            active=True)
                                else:
                                    logger.error(
                                        'unable to find the target linecard of link between %s(%s:%s %s) and %s. Skipping this link!'
                                        % (equip, i_lc, i_port, port, uid))
                            else:
                                # Single linecard: direct equip -> peer
                                # edge (or towards the peer's linecard).
                                target_lc, target_port = self._get_target_lc_and_port(
                                    equip, i_lc, i_port, site)
                                if not target_lc is None:
                                    if self._equip_uses_multiple_linecards(
                                            uid, site):
                                        self._checked_add_edge(
                                            equip,
                                            _get_linecard_name(
                                                uid, target_lc),
                                            _unique_link_key(
                                                equip,
                                                _get_linecard_name(
                                                    uid, target_lc)),
                                            bandwidth=min(
                                                port_bw, equip_bw),
                                            active=True)
                                    else:
                                        self._checked_add_edge(
                                            equip, uid,
                                            _unique_link_key(equip, uid),
                                            bandwidth=min(
                                                port_bw, equip_bw),
                                            active=True)
                                else:
                                    logger.error(
                                        'unable to find the target linecard of link between %s(%s:%s %s) and %s. Skipping this link!'
                                        % (equip, i_lc, i_port, port, uid))
                    # NOTE(review): the two branches below run for every
                    # port, regardless of the has_node(uid) test above;
                    # the _checked_add_* helpers presumably make repeated
                    # adds idempotent — confirm against their definitions.
                    if 'renater' in uid:
                        # if uid != 'renater-' + site:
                        #     logger.error('renater node in %s has name %s which is not of the form renater-%s. Forcing to renater-%s' % (site, uid, site, site))
                        #     uid = 'renater-' + site
                        self.add_node(uid, kind='renater')
                        if multiple_linecards:
                            self._checked_add_linecard(
                                lc_node,
                                lc.get('backplane_bps',
                                       data['backplane_bps']))
                            self._checked_add_edge(equip, lc_node,
                                                   _unique_link_key(
                                                       equip, lc_node),
                                                   bandwidth=equip_bw,
                                                   active=True)
                            self._checked_add_edge(lc_node, uid,
                                                   _unique_link_key(
                                                       lc_node, uid),
                                                   bandwidth=port_bw,
                                                   active=True)
                        else:
                            self._checked_add_edge(
                                equip, uid,
                                _unique_link_key(equip, uid),
                                bandwidth=min(port_bw, equip_bw),
                                active=True)
                    elif kind in ['switch', 'router']:
                        # Neighbouring equipment not handled above: add the
                        # linecard edge if needed and recurse into it.
                        if multiple_linecards:
                            self._checked_add_linecard(
                                lc_node,
                                lc.get('backplane_bps',
                                       data['backplane_bps']))
                            self._checked_add_edge(equip, lc_node,
                                                   _unique_link_key(
                                                       equip, lc_node),
                                                   bandwidth=equip_bw,
                                                   active=True)
                        if recurse:
                            self.add_equip(uid, site)
from os import environ, mkdir from execo import logger from networkx import is_isomorphic from networkx.readwrite import json_graph from execo_g5k.topology import g5k_graph, treemap from argparse import ArgumentParser from execo_g5k.api_utils import get_g5k_sites parser = ArgumentParser(prog=sys.argv[0], description='Update topology maps on the' + ' Grid\'5000 wiki') parser.add_argument('site', help='Choose the site') args = parser.parse_args() site = args.site if site not in get_g5k_sites(): logger.error('%s is not a valid G5K site') _json_dir = environ['HOME'] + '/.execo/topology/' try: mkdir(_json_dir) except: pass logger.setLevel('WARNING') g = g5k_graph([site]) logger.setLevel('INFO') try: with open(_json_dir + site + '.json', 'r') as infile: old_json = json.load(infile) g_old = json_graph.node_link_graph(old_json) if is_isomorphic(g, g_old):
import datetime from numpy import array, median import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt from execo import TaktukRemote, logger, default_connection_params default_connection_params['user'] = '******' logger.info('Measuring boot time') # Reading VMs list for directory given in arg run_dir = sys.argv[1] if not run_dir: logger.error('No directory specified') exit() vms = [] f = open(run_dir + '/vms.list') for line in f: tmp = line.split() vms.append({'id': tmp[1], 'ip': tmp[0]}) f.close() # Measuring boot_duration now = time.time() get_uptime = TaktukRemote('cat /proc/uptime', [vm['ip'] for vm in vms]).run() boot_time = {} for p in get_uptime.processes: