def test_create_reservation_different_site(self):
    """Two clusters resolved to two sites must give one job spec per site."""
    # The mocked lookup hands out one site per call, in call order.
    api.get_cluster_site = mock.Mock()
    api.get_cluster_site.side_effect = ['mysite', 'myothersite']

    resources = {
        'a': {
            'controller': 1,
            'compute': 1,
            'network': 1,
        },
        'b': {
            'compute': 10
        }
    }
    provider_section = {
        'name': 'test',
        'vlans': {}
    }
    conf = {
        'resources': resources,
        'provider': provider_section
    }

    jobs_specs = G5k()._create_reservation(conf)

    # Cluster 'a' needs 1 + 1 + 1 nodes, cluster 'b' needs 10.
    spec_a = (OarSubmission("{cluster='a'}/nodes=3", name='test'),
              'mysite')
    spec_b = (OarSubmission("{cluster='b'}/nodes=10", name='test'),
              'myothersite')
    self.equalsJobSpecs([spec_a, spec_b], jobs_specs)
def run_xp(self):
    """Run the LU benchmark once for every remaining combination.

    A combination asking for more cores than ``self.n_nodes`` nodes of
    its cluster provide is skipped. Otherwise nodes are reserved, the
    benchmark is launched through ``mpirun`` from the first node, and
    the combination is marked done on success or cancelled on failure.
    The OAR job is always deleted once the attempt is over.
    """
    while len(self.sweeper.get_remaining()) > 0:
        comb = self.sweeper.get_next()
        wanted_cores = comb['n_core']
        cluster = comb['cluster']

        # Core count per node is read from the first host of the cluster.
        cores_per_node = get_host_attributes(
            cluster + '-1')['architecture']['smt_size']
        if wanted_cores > cores_per_node * self.n_nodes:
            self.sweeper.skip(comb)
            continue

        logger.info('Processing new combination %s' % (comb,))
        site = get_cluster_site(cluster)
        submission = OarSubmission(
            resources="{cluster='%s'}/nodes=%s" % (cluster, self.n_nodes),
            job_type='allow_classic_ssh',
            walltime='0:10:00')
        jobs = oarsub([(submission, site)])
        job = jobs[0]

        if job[0]:
            try:
                wait_oar_job_start(*job)
                nodes = get_oar_job_nodes(*job)
                host_list = ",".join([node.address for node in nodes])
                mpi_opts = get_mpi_opts(cluster)
                bench_cmd = (
                    'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i'
                    % (host_list, wanted_cores, mpi_opts,
                       comb['size'], wanted_cores))
                lu_bench = SshProcess(bench_cmd, nodes[0])
                output_path = self.result_dir + '/' + slugify(comb) + '.out'
                lu_bench.stdout_handlers.append(output_path)
                lu_bench.run()
                if lu_bench.ok:
                    logger.info("comb ok: %s" % (comb,))
                    self.sweeper.done(comb)
                    # The finally clause still releases the job.
                    continue
            finally:
                oardel(jobs)

        # Reached when the submission failed or the benchmark was not ok.
        logger.info("comb NOT ok: %s" % (comb,))
        self.sweeper.cancel(comb)
def test_create_reservation_vlan_different_site(self):
    """A vlan requested on another site must yield its own job spec."""
    # Every cluster lookup answers 'mysite'; the vlan targets another site.
    api.get_cluster_site = mock.Mock(return_value='mysite')

    resources = {
        'a': {
            'controller': 1,
            'compute': 1,
            'network': 1,
        }
    }
    provider_section = {
        'name': 'test',
        'vlans': {
            'myothersite': "{type='kavlan'}/vlan=1"
        },
    }
    conf = {
        'resources': resources,
        'provider': provider_section
    }

    jobs_specs = G5k()._create_reservation(conf)

    nodes_spec = (OarSubmission("{cluster='a'}/nodes=3", name='test'),
                  'mysite')
    vlan_spec = (OarSubmission("{type='kavlan'}/vlan=1", name='test'),
                 'myothersite')
    self.equalsJobSpecs([nodes_spec, vlan_spec], jobs_specs)
def get_cpu_topology(cluster, xpdir=None):
    """Return the CPU topology of a cluster as a list of per-cell CPU ids.

    The topology comes from the XML printed by ``virsh capabilities``:
    each ``cell`` element gives one inner list holding the integer ``id``
    of every ``cpu`` element found below it.

    When ``xpdir`` is given, ``<xpdir>/topo_<cluster>.xml`` is used as a
    cache: it is read if it can be parsed, and written after a fresh
    measurement. Without a usable cache, one node of the cluster is
    reserved for two minutes and queried over SSH.

    :param cluster: name of the cluster to inspect
    :param xpdir: optional directory holding the topology cache files
    :return: list of lists of CPU ids, one inner list per cell
    :raises RuntimeError: if the node reservation could not be submitted
    """
    logger.info('Determining the architecture of cluster ' +
                style.emph(cluster))
    root = None
    # Only set when a cache directory is usable, so the read and the write
    # below agree on whether a cache file exists (an empty ``xpdir`` used to
    # leave ``fname`` unbound).
    fname = None
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            root = parse(fname).getroot()
        except Exception:
            # Missing or unreadable cache is expected; fall back to a live
            # measurement. KeyboardInterrupt/SystemExit are not swallowed.
            logger.info('No cache file found, will reserve a node and ' +
                        'determine topology from virsh capabilities')

    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(
            resources="{cluster='" + cluster + "'}/nodes=1",
            walltime="0:02:00",
            job_type="allow_classic_ssh")
        ((job_id, _), ) = oarsub([(submission, frontend)])
        if not job_id:
            raise RuntimeError(
                'Unable to reserve a node on cluster %s' % (cluster,))
        try:
            wait_oar_job_start(job_id, frontend)
            host = get_oar_job_nodes(job_id, frontend)[0]
            capa = SshProcess(
                'virsh capabilities', host,
                connection_params={
                    'user': default_frontend_connection_params['user']
                }).run()
        finally:
            # Release the node even if waiting or the SSH call failed.
            oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if fname is not None:
            ElementTree(root).write(fname)

    cpu_topology = [
        [int(cpu.attrib['id']) for cpu in cell.findall('.//cpu')]
        for cell in root.findall('.//cell')
    ]
    logger.info(pformat(cpu_topology))
    return cpu_topology
def prepare_bench(self):
    """Configure and compile the benchmark, then copy it to all frontends.

    The archive is pushed to and extracted on the first involved site, a
    node is reserved there (compiling on the frontend is forbidden and
    mpif77 is needed), the suite is compiled on that node, and the result
    is rsync'ed to every other involved site.

    :return: True if both the compilation and the rsync succeeded, False
        if the node could not be reserved or any preparation step raised
    """
    logger.info("preparation: configure and compile benchmark")
    # the involved sites. We will do the compilation on the first of these.
    sites = list(set(map(get_cluster_site, self.parameters['cluster'])))
    build_site = sites[0]
    # generate the bench compilation configuration
    bench_list = '\n'.join([
        'lu\t%s\t%s' % (size, n_core)
        for n_core in self.parameters['n_core']
        for size in self.parameters['size']
    ])
    # Reserving a node because compiling on the frontend is forbidden
    # and because we need mpif77
    jobs = oarsub([(OarSubmission(resources="nodes=1",
                                  job_type='allow_classic_ssh',
                                  walltime='0:10:00'), build_site)])
    if not jobs[0][0]:
        # Without a job there is nothing to compile on; the code below
        # would otherwise hit unbound ``nodes`` / ``compilation``.
        logger.error("unable to reserve a node on %s" % (build_site,))
        return False
    try:
        logger.info("copying bench archive to %s" % (build_site,))
        Put([build_site], ['NPB3.3-MPI.tar.bz2']).run()
        logger.info("extracting bench archive on %s" % (build_site,))
        Remote('tar -xjf NPB3.3-MPI.tar.bz2', [build_site]).run()
        logger.info("waiting job start %s" % (jobs[0],))
        wait_oar_job_start(*jobs[0], prediction_callback=pred_cb)
        logger.info("getting nodes of %s" % (jobs[0],))
        nodes = get_oar_job_nodes(*jobs[0])
        logger.info("configure bench compilation")
        Remote('echo "%s" > ~/NPB3.3-MPI/config/suite.def' % bench_list,
               nodes).run()
        logger.info("compil bench")
        compilation = Remote('cd NPB3.3-MPI && make clean && make suite',
                             nodes).run()
        logger.info("compil finished")
    except Exception:
        # Keep the traceback so the failing step can be identified.
        logger.exception("unable to compile bench")
        return False
    finally:
        oardel(jobs)
    # Copying binaries to all other frontends
    frontends = sites[1:]
    rsync = Remote('rsync -avuP ~/NPB3.3-MPI/ {{frontends}}:NPB3.3-MPI',
                   [get_host_site(nodes[0])] * len(frontends))
    rsync.run()
    return compilation.ok and rsync.ok
def get_hosts_jobs(hosts, walltime, out_of_chart=False):
    """Find the first slot when the hosts are available and return
    a list of jobs_specs

    A candidate start date is accepted when every host of the planning has
    at least one free slot covering ``[start, start + walltime]``.

    :param hosts: list of hosts (``Host`` objects or host names)

    :param walltime: duration of reservation

    :param out_of_chart: forwarded to ``get_planning``

    :return: a list of ``(OarSubmission, site)`` tuples, one per site of
        the planning, or None when no common slot exists
    """
    # Real lists (not map/filter objects) so the hosts can be traversed
    # several times and counted on Python 2 and Python 3 alike.
    hosts = [h.address if isinstance(h, Host) else h for h in hosts]
    planning = get_planning(elements=hosts, out_of_chart=out_of_chart)
    limits = _slots_limits(planning)
    walltime = get_seconds(walltime)
    # Loop-invariant: looked up once instead of once per cluster per limit.
    known_clusters = get_g5k_clusters()

    def free_during(host_planning, start):
        # True when one free slot spans the whole requested window.
        return any(slot[0] <= start and slot[1] >= start + walltime
                   for slot in host_planning['free'])

    for limit in limits:
        if all(free_during(host_planning, limit)
               for site_planning in planning.values()
               for cluster, cluster_planning in site_planning.items()
               if cluster in known_clusters
               for host_planning in cluster_planning.values()):
            startdate = limit
            break
    else:
        logger.error('Unable to find a slot for %s', hosts)
        return None

    jobs_specs = []
    for site in planning:
        site_hosts = [get_host_longname(h) for h in hosts
                      if get_host_site(h) == site]
        sub_res = ("{host in ('" + "','".join(site_hosts) + "')}/nodes=" +
                   str(len(site_hosts)))
        jobs_specs.append((OarSubmission(resources=sub_res,
                                         reservation_date=startdate), site))

    return jobs_specs
def get_nodes(self, comb):
    """Submit a besteffort job for ``comb`` and build the host list.

    The node count is ``comb['cores']`` divided by the ``smt_size`` of
    the cluster's first host, with a minimum of one node. The job id and
    frontend are stored on ``self.oar_job_id`` and ``self.frontend``;
    ``self.hosts`` receives every node repeated ``smt_size`` times.
    """
    logger.info('Performing submission')
    cluster = comb['cluster']
    host_attrs = get_host_attributes(cluster + '-1')
    n_core = host_attrs['architecture']['smt_size']
    node_count = max(1, comb['cores'] / n_core)

    submission = OarSubmission(
        resources="nodes=%d" % (node_count, ),
        sql_properties="cluster='%s'" % cluster,
        job_type="besteffort",
        name="l2c_fft_eval")
    site = get_cluster_site(cluster)
    first_job = oarsub([(submission, site)])[0]
    self.oar_job_id, self.frontend = first_job

    logger.info("Waiting for job start")
    wait_oar_job_start(self.oar_job_id, self.frontend)

    logger.info("Retrieving hosts list")
    nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
    # One entry per core: each node appears n_core times in a row.
    per_core_hosts = []
    for node in nodes:
        per_core_hosts.extend([node] * n_core)
    self.hosts = per_core_hosts
def _create_reservation(self, conf):
    """Build the OAR job specs, one per site, from the configuration.

    Each cluster contributes a ``{cluster='X'}/nodes=N`` criterion to its
    site, N being the sum of its role counts; each configured vlan adds
    its criterion to its own site. Criteria of a site are joined with
    ``+`` into a single submission.

    :param conf: dict with a ``resources`` and a ``provider`` section
    :return: list of ``(OarSubmission, site)`` tuples
    """
    provider_conf = conf['provider']
    per_site = {}

    # NOTE(msimonin): Traverse all cluster demands in alphebetical order
    # test_create_reservation_different_site needs to know the order
    ordered_demands = sorted(conf["resources"].items(),
                             key=operator.itemgetter(0))
    for cluster, roles in ordered_demands:
        site = api.get_cluster_site(cluster)
        role_counts = map(int, roles.values())
        nb_nodes = reduce(operator.add, role_counts)
        per_site.setdefault(site, []).append(
            "{cluster='%s'}/nodes=%s" % (cluster, nb_nodes))

    for site, vlan in provider_conf["vlans"].items():
        per_site.setdefault(site, []).append(vlan)

    # Compute the specification for the reservation
    jobs_specs = []
    for site, site_criteria in per_site.items():
        submission = OarSubmission(resources='+'.join(site_criteria),
                                   name=provider_conf["name"])
        jobs_specs.append((submission, site))

    logging.info("Criteria for the reservation: %s" % pf(jobs_specs))
    return jobs_specs
def _make_reservation(self):
    """Submit a new oargrid reservation built from ``self.config``.

    Criteria (the ``oarsub -l`` arguments) are grouped per site: one
    node-count criterion per cluster plus the configured vlans. On
    success the grid job id is stored in ``self.gridjob``; when no job
    is created the process exits with status 26.
    """
    per_site = {}

    # Number of nodes per cluster, filed under the cluster's site.
    for cluster, roles in self.config["resources"].items():
        site = get_cluster_site(cluster)
        role_counts = map(int, roles.values())
        nb_nodes = reduce(operator.add, role_counts)
        per_site.setdefault(site, []).append(
            "{cluster='%s'}/nodes=%s" % (cluster, nb_nodes))

    for site, vlan in self.config["vlans"].items():
        per_site.setdefault(site, []).append(vlan)

    # Compute the specification for the reservation
    jobs_specs = []
    for site, site_criteria in per_site.items():
        submission = OarSubmission(resources='+'.join(site_criteria),
                                   name=self.config["name"])
        jobs_specs.append((submission, site))
    logger.info("Criteria for the reservation: %s" % pf(jobs_specs))

    # Make the reservation
    reservation_date = self.config['reservation']
    walltime = self.config['walltime'].encode('ascii', 'ignore')
    gridjob, _ = EX5.oargridsub(
        jobs_specs,
        reservation_date=reservation_date,
        walltime=walltime,
        job_type='deploy'
    )

    # TODO - move this upper to not have a side effect here
    if gridjob is None:
        logger.error("No oar job was created.")
        sys.exit(26)

    self.gridjob = gridjob
    logger.info("Using new oargrid job %s" % style.emph(self.gridjob))
logger.error("Unable to deploy & compile Bench.") oardel(jobs) return False logger.info("Benchs completed. Deleting jobs.") oardel(jobs) return True if __name__ == "__main__": args = sys.argv[1:] prop = args[0] site = args[1] folder = args[2] if len(args) >= 5: date = args[3] walltime = args[4] submission = OarSubmission(resources = "nodes=1", job_type = 'deploy', walltime = walltime, reservation_date = date, sql_properties = prop) launch_bench(submission, site, folder) else: walltime = args[3] submission = OarSubmission(resources = "nodes=1", job_type = 'deploy', walltime = walltime, sql_properties = prop) launch_bench(submission, site, folder)
def run(self):
    """Deploy every combination of the sweep and log the outcome.

    For each combination: record the kadeploy version printed on the
    site frontend, reserve ``n_nodes`` nodes (plus a kavlan when asked),
    deploy the requested environment with all deployment output sent to
    ``<result_dir>/<slug>/trace``, and log whether the combination is
    OK, KO or partially deployed. The OAR job is always destroyed and
    the combination is always marked done.
    """
    sweeper = self.create_paramsweeper()

    while True:
        comb = sweeper.get_next()
        if not comb:
            break
        slug = slugify(comb)
        comb_dir = self.result_dir + '/' + slug
        if not os.path.isdir(comb_dir):
            os.mkdir(comb_dir)
        comb_file = comb_dir + '/trace'
        g5k_configuration['kadeploy3'] = comb['version']
        logger.info('Treating combination %s', pformat(comb))
        get_version = SshProcess(
            comb['version'] + ' -v',
            comb['site'],
            connection_params=default_frontend_connection_params).run()
        logger.info(get_version.stdout)

        resources = ""
        if comb['kavlan']:
            resources += "{type='kavlan'}/vlan=1+"
        resources += "nodes=" + str(comb['n_nodes'])
        sub = OarSubmission(resources=resources,
                            job_type='deploy',
                            walltime="0:30:00",
                            name='Kadeploy_Tests')
        logger.info('Performing submission of %s on site %s',
                    resources, comb['site'])
        jobs = oarsub([(sub, comb['site'])])
        job_id, frontend = jobs[0]

        # Reset both results for every combination. ``undeployed`` stays
        # None when no deployment ran, so a failed submission can neither
        # reuse the previous combination's result nor hit an unbound name.
        deployed, undeployed = [], None

        if job_id:
            try:
                logger.info('Waiting for job to start')
                wait_oar_job_start(job_id, frontend)
                hosts = get_oar_job_nodes(job_id, frontend)
                logger.info('Deployment of %s',
                            ' '.join([host.address for host in hosts]))
                kavlan = get_oar_job_kavlan(job_id, frontend)
                if kavlan:
                    logger.info('In kavlan %s', kavlan)
                deployment = Deployment(hosts, env_name=comb['env'],
                                        vlan=kavlan)
                deployed, undeployed = deploy(deployment,
                                              stdout_handlers=[comb_file],
                                              stderr_handlers=[comb_file])
            finally:
                logger.info('Destroying job %s on %s', str(job_id),
                            frontend)
                oardel([(job_id, frontend)])

        if undeployed is None:
            # The submission failed: nothing was deployed.
            logger.error('%s is KO', slug)
        elif len(undeployed) == 0:
            logger.info('%s is OK', slug)
        elif len(deployed) == 0:
            logger.error('%s is KO', slug)
        else:
            logger.warning('%s encountered problems with some hosts',
                           slug)

        sweeper.done(comb)
def get_jobs_specs(resources, excluded_elements=None, name=None):
    """ Generate the several job specifications from the dict of resources and
    the blacklisted elements

    :param resources: a dict, whose keys are Grid'5000 element and values the
      corresponding number of n_nodes

    :param excluded_elements: a list of elements that won't be used

    :param name: the name of the jobs that will be given

    :return: a list of ``(OarSubmission, site)`` tuples, one per site for
      which at least one resource criterion was generated
    """
    jobs_specs = []
    if excluded_elements is None:
        excluded_elements = []

    # Loop-invariant lookups, fetched once instead of once per element.
    g5k_sites = get_g5k_sites()
    g5k_clusters = get_g5k_clusters(queues=None)

    # Creating the list of sites used
    sites = []
    real_resources = resources.copy()
    for resource in resources:
        if resource in g5k_sites and resource not in sites:
            sites.append(resource)
        if resource in g5k_clusters and resource not in excluded_elements:
            site = get_cluster_site(resource)
            if site not in sites:
                sites.append(site)
            if site not in real_resources:
                real_resources[site] = 0

    # Checking if we need a Kavlan, a KaVLAN global or none
    get_kavlan = 'kavlan' in resources
    if get_kavlan:
        kavlan = 'kavlan'
        n_sites = 0
        for resource in real_resources:
            if resource in sites:
                n_sites += 1
            if n_sites > 1:
                # More than one site involved: a global vlan is required.
                kavlan += '-global'
                break

    # Excluded elements that are neither a cluster nor a site are treated
    # as hosts and grouped by site. Keyed on the site itself: the previous
    # test on the literal 'site' made every host replace the earlier ones.
    blacklisted_hosts = {}
    known_elements = g5k_clusters + g5k_sites
    for element in excluded_elements:
        if element not in known_elements:
            blacklisted_hosts.setdefault(
                get_host_site(element), []).append(element)

    base_sql = '{'
    end_sql = '}/'

    for site in sites:
        sub_resources = ''
        # Adding a KaVLAN if needed: a local vlan goes to the first site, a
        # global one to the site named in resources['kavlan']. Only one
        # vlan is requested overall.
        if get_kavlan:
            if 'global' not in kavlan or site in resources['kavlan']:
                sub_resources = "{type='" + kavlan + "'}/vlan=1+"
                get_kavlan = False

        # Creating blacklist SQL string for hosts
        site_blacklist = blacklisted_hosts.get(site, [])
        host_blacklist = len(site_blacklist) > 0
        str_hosts = ''.join([
            "host not in ('" + get_host_longname(host) + "') and "
            for host in site_blacklist
        ])

        # Adding the clusters blacklist
        str_clusters = str_hosts
        cl_blacklist = False
        for cluster in get_site_clusters(site, queues=None):
            if cluster in resources and resources[cluster] > 0:
                # Single '/' before 'nodes=' in both cases; the host
                # blacklist variant used to produce '}//nodes='.
                sub_resources += (base_sql + str_hosts + "cluster='" +
                                  cluster + "'}")
                sub_resources += "/nodes=" + str(resources[cluster]) + '+'
            if cluster in excluded_elements:
                str_clusters += "cluster not in ('" + cluster + "') and "
                cl_blacklist = True

        # Generating the site blacklist string from host and cluster
        # blacklist; [:-4] drops the trailing "and ".
        str_site = ''
        if host_blacklist or cl_blacklist:
            blacklist = str_clusters if cl_blacklist else str_hosts
            str_site = base_sql + blacklist[:-4] + end_sql

        if real_resources[site] > 0:
            sub_resources += (str_site + "nodes=" +
                              str(real_resources[site]) + '+')

        if sub_resources != '':
            # [:-1] drops the trailing '+'.
            jobs_specs.append((OarSubmission(resources=sub_resources[:-1],
                                             name=name), site))

    return jobs_specs
def run(self):
    """Run the experiment

    Loads the JSON configuration named by ``self.args[0]``, reserves an
    OAR job on the configured site (or reuses the reservation given in
    the options), deploys the first ``nb_experiment_nodes`` nodes and,
    unless ``already_configured`` is set, installs and configures OAR on
    them: the first sorted node gets the server packages, the others the
    ``oar-node`` package. Every workload of the sweep is then replayed
    from the first node with ``oar_replay.py``.

    Any exception is printed and drops into an ``ipdb`` session. The job
    is deleted at the end only when it was submitted by this run.
    """
    already_configured = self.options.already_configured
    reservation_job_id = int(self.options.reservation_id) \
        if self.options.reservation_id is not None else None
    is_a_test = self.options.is_a_test

    if is_a_test:
        logger.warn('THIS IS A TEST! This run will use only a few '
                    'resources')

    # make the result folder writable for all
    os.chmod(self.result_dir, 0o777)
    # Import configuration
    with open(self.args[0]) as config_file:
        config = json.load(config_file)
    # backup configuration
    copy(self.args[0], self.result_dir)

    site = config["grid5000_site"]
    resources = config["resources"]
    nb_experiment_nodes = config["nb_experiment_nodes"]
    walltime = str(config["walltime"])
    env_name = config["kadeploy_env_name"]
    workloads = config["workloads"]
    # check if workloads exists (Suppose that the same NFS mount point
    # is present on the remote and the local environment
    for workload_file in workloads:
        # Opening is only an existence/readability check: it raises if the
        # file cannot be opened.
        with open(workload_file):
            pass
        # copy the workloads files to the results dir
        copy(workload_file, self.result_dir)

    # define the workloads parameters
    self.parameters = {
        'workload_filename': workloads
    }
    logger.info('Workloads: {}'.format(workloads))

    # define the iterator over the parameters combinations
    self.sweeper = ParamSweeper(os.path.join(self.result_dir, "sweeps"),
                                sweep(self.parameters))

    # Due to previous (using -c result_dir) run skip some combination
    logger.info('Skipped parameters:' +
                '{}'.format(str(self.sweeper.get_skipped())))

    logger.info('Number of parameters combinations {}'.format(
        str(len(self.sweeper.get_remaining()))))
    logger.info('combinations {}'.format(
        str(self.sweeper.get_remaining())))

    # Reuse the reservation given on the command line, otherwise submit a
    # new deploy job.
    if reservation_job_id is not None:
        jobs = [(reservation_job_id, site)]
    else:
        jobs = oarsub([(OarSubmission(resources=resources,
                                      job_type='deploy',
                                      walltime=walltime), site)])
    job_id, site = jobs[0]
    if job_id:
        try:
            logger.info("waiting job start %s on %s" % (job_id, site))
            wait_oar_job_start(
                job_id, site, prediction_callback=prediction_callback)
            logger.info("getting nodes of %s on %s" % (job_id, site))
            nodes = get_oar_job_nodes(job_id, site)
            # sort the nodes
            nodes = sorted(nodes, key=lambda node: node.address)
            # get only the necessary nodes under the switch
            if nb_experiment_nodes > len(nodes):
                raise RuntimeError('The number of given node in the '
                                   'reservation ({}) do not match the '
                                   'requested resources '
                                   '({})'.format(len(nodes),
                                                 nb_experiment_nodes))
            nodes = nodes[:nb_experiment_nodes]
            logger.info("deploying nodes: {}".format(str(nodes)))
            deployed, undeployed = deploy(
                Deployment(nodes, env_name=env_name),
                check_deployed_command=already_configured)
            if undeployed:
                logger.warn("NOT deployed nodes: {}".format(
                    str(undeployed)))
                raise RuntimeError('Deployement failed')

            if not already_configured:

                # install OAR
                install_cmd = "apt-get update; apt-get install -y "
                node_packages = "oar-node"
                logger.info("installing OAR nodes: {}".format(
                    str(nodes[1:])))
                install_oar_nodes = Remote(
                    install_cmd + node_packages,
                    nodes[1:],
                    connection_params={'user': '******'})
                # Started without waiting, so the compute nodes install
                # while the server node is being set up below.
                install_oar_nodes.start()

                server_packages = (
                    "oar-server oar-server-pgsql oar-user "
                    "oar-user-pgsql postgresql python3-pip "
                    "libjson-perl postgresql-server-dev-all")
                install_oar_sched_cmd = """
                mkdir -p /opt/oar_sched; \
                cd /opt/oar_sched; \
                git clone https://github.com/oar-team/oar3.git; \
                cd oar3; \
                git checkout dce942bebc2; \
                pip3 install -e .; \
                cd /usr/lib/oar/schedulers; \
                ln -s /usr/local/bin/kamelot; \
                pip3 install psycopg2
                """
                logger.info("installing OAR server node: {}".format(
                    str(nodes[0])))
                install_master = SshProcess(
                    install_cmd + server_packages + ";" +
                    install_oar_sched_cmd,
                    nodes[0],
                    connection_params={'user': '******'})
                install_master.run()
                # Join the node installation started above.
                install_oar_nodes.wait()

                if not install_master.ok:
                    Report(install_master)

                # sed script rewriting /etc/oar/oar.conf; the only format
                # placeholder is {result_dir}, used for the log file path.
                configure_oar_cmd = """
                sed -i \
                    -e 's/^\(DB_TYPE\)=.*/\\1="Pg"/' \
                    -e 's/^\(DB_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(DB_PORT\)=.*/\\1="5432"/' \
                    -e 's/^\(DB_BASE_PASSWD\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_LOGIN\)=.*/\\1="oar"/' \
                    -e 's/^\(DB_BASE_PASSWD_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(DB_BASE_LOGIN_RO\)=.*/\\1="oar_ro"/' \
                    -e 's/^\(SERVER_HOSTNAME\)=.*/\\1="localhost"/' \
                    -e 's/^\(SERVER_PORT\)=.*/\\1="16666"/' \
                    -e 's/^\(LOG_LEVEL\)\=\"2\"/\\1\=\"3\"/' \
                    -e 's#^\(LOG_FILE\)\=.*#\\1="{result_dir}/oar.log"#' \
                    -e 's/^\(JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD\=\"cpuset\".*\)/#\\1/' \
                    -e 's/^#\(CPUSET_PATH\=\"\/oar\".*\)/\\1/' \
                    -e 's/^\(FINAUD_FREQUENCY\)\=.*/\\1="0"/' \
                    /etc/oar/oar.conf
                """.format(result_dir=self.result_dir)
                configure_oar = Remote(configure_oar_cmd, nodes,
                                       connection_params={'user': '******'})
                configure_oar.run()
                logger.info("OAR is configured on all nodes")

                # Configure server
                create_db = "oar-database --create --db-is-local"
                config_oar_sched = (
                    "oarnotify --remove-queue default;"
                    "oarnotify --add-queue default,1,kamelot")
                start_oar = "systemctl start oar-server.service"
                logger.info("configuring OAR database: {}".format(
                    str(nodes[0])))
                config_master = SshProcess(
                    create_db + ";" + config_oar_sched + ";" + start_oar,
                    nodes[0],
                    connection_params={'user': '******'})
                config_master.run()

                # propagate SSH keys
                # The server node's /var/lib/oar/.ssh is fetched into a
                # local /tmp/.ssh and then pushed to the other nodes.
                logger.info("configuring OAR SSH")
                oar_key = "/tmp/.ssh"
                Process('rm -rf ' + oar_key).run()
                Process(
                    'scp -o BatchMode=yes -o PasswordAuthentication=no '
                    '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null '
                    '-o ConnectTimeout=20 -rp -o User=root ' +
                    nodes[0].address + ":/var/lib/oar/.ssh"
                    ' ' + oar_key).run()
                # Get(nodes[0], "/var/lib/oar/.ssh", [oar_key], connection_params={'user': '******'}).run()
                Put(nodes[1:], [oar_key], "/var/lib/oar/",
                    connection_params={
                        'user': '******'
                    }).run()

                # Declare the OAR properties, then register every compute
                # node as a resource with fixed cpu/core/cpuset/mem values.
                add_resources_cmd = """
                oarproperty -a cpu || true; \
                oarproperty -a core || true; \
                oarproperty -c -a host || true; \
                oarproperty -a mem || true; \
                """
                for node in nodes[1:]:
                    add_resources_cmd = add_resources_cmd + "oarnodesetting -a -h {node} -p host={node} -p cpu=1 -p core=4 -p cpuset=0 -p mem=16; \\\n".format(
                        node=node.address)

                add_resources = SshProcess(
                    add_resources_cmd,
                    nodes[0],
                    connection_params={'user': '******'})
                add_resources.run()

                if add_resources.ok:
                    logger.info("oar is now configured!")
                else:
                    raise RuntimeError(
                        "error in the OAR configuration: Abort!")

            # TODO back up the OAR configuration

            # Do the replay
            logger.info('begining the replay')
            while len(self.sweeper.get_remaining()) > 0:
                combi = self.sweeper.get_next()
                workload_file = os.path.basename(
                    combi['workload_filename'])
                oar_replay = SshProcess(
                    script_path + "/oar_replay.py " +
                    combi['workload_filename'] + " " +
                    self.result_dir + " oar_gant_" + workload_file,
                    nodes[0])
                oar_replay.stdout_handlers.append(self.result_dir + '/' +
                                                  workload_file + '.out')
                logger.info("replaying workload: {}".format(combi))
                oar_replay.run()
                if oar_replay.ok:
                    logger.info("Replay workload OK: {}".format(combi))
                    self.sweeper.done(combi)
                else:
                    logger.info("Replay workload NOT OK: {}".format(combi))
                    self.sweeper.cancel(combi)
                    raise RuntimeError("error in the OAR replay: Abort!")

        # NOTE(review): the bare except also catches KeyboardInterrupt and
        # opens an interactive debugger instead of propagating the error;
        # this looks deliberate for manual runs - confirm before running
        # this script unattended.
        except:
            traceback.print_exc()
            ipdb.set_trace()

        finally:
            # Test runs pause here so the nodes can be inspected before
            # the job is released.
            if is_a_test:
                ipdb.set_trace()
            # A reservation supplied by the user is left untouched.
            if reservation_job_id is None:
                logger.info("delete job: {}".format(jobs))
                oardel(jobs)