def grid_reload_from_id(gridjob):
    """Reload the nodes, vlans and subnets attached to an oargrid job.

    Args:
        gridjob: the oargrid job id (int or int-like string).

    Returns:
        tuple: (nodes, vlans, subnets) where vlans is a list of
        {"site", "vlan_id"} dicts and subnets a list of per-site subnet
        description dicts.
    """
    logger.info("Reloading the resources from oargrid job %s", gridjob)
    gridjob = int(gridjob)
    nodes = ex5.get_oargrid_job_nodes(gridjob)
    vlans = []
    subnets = []
    for job_id, site in ex5.get_oargrid_job_oar_jobs(gridjob):
        for vlan_id in ex5.get_oar_job_kavlan(job_id, site):
            vlans.append({"site": site, "vlan_id": vlan_id})
        # NOTE(msimonin): this currently returned only one subnet
        # even if several are reserved
        # We'll need to patch execo the same way it has been patched for vlans
        ipmac, info = ex5.get_oar_job_subnets(job_id, site)
        if not ipmac:
            logger.debug("No subnet information found for this job")
            continue
        subnet = {"site": site, "ipmac": ipmac}
        subnet.update(info)
        # Mandatory key when it comes to concretize resources
        subnet["network"] = info["ip_prefix"]
        subnets.append(subnet)
    return nodes, vlans, subnets
def concretize_resources(resources, gridjob, reservation_type):
    """Bind the abstract resource description to the concrete job resources.

    Args:
        resources: the resources description (dict with "machines" and
            "networks" entries).
        gridjob: the oar or oargrid job id.
        reservation_type: "oar" for a single-site oar job, anything else
            is treated as an oargrid job.

    Raises:
        ValueError: when the site of an oar job cannot be determined
            unambiguously from the resources description.
    """
    if reservation_type == "oar":
        nodes = ex5.get_oar_job_nodes(gridjob)
    else:
        nodes = ex5.get_oargrid_job_nodes(gridjob)
    concretize_nodes(resources, nodes)
    if reservation_type == "oar":
        # This block is in charge of detecting the site of the oar reservation
        site_candidates = []
        for machine_description in resources.get("machines", []):
            cluster = machine_description.get("cluster")
            site_candidates.append(ex5.get_cluster_site(cluster))
        for network_description in resources.get("networks", []):
            site_candidates.append(network_description.get("site", "unknown"))
        if len(set(site_candidates)) == 1:
            site = site_candidates[0]
        else:
            # BUG FIX: the original raised a plain string, which is a
            # TypeError on Python 3 (string exceptions were removed).
            raise ValueError(
                "Could not detect the g5k site of the oarjob %s" % gridjob)
        job_sites = [(gridjob, site)]
    else:
        job_sites = ex5.get_oargrid_job_oar_jobs(gridjob)
    vlans = []
    for (job_id, site) in job_sites:
        vlan_ids = ex5.get_oar_job_kavlan(job_id, site)
        vlans.extend([{
            "site": site,
            "vlan_id": vlan_id
        } for vlan_id in vlan_ids])
    concretize_networks(resources, vlans)
def _get_jobs_and_vlans(self, conf):
    """Get the hosts from an existing job (if any) or from a new job.

    This will perform a reservation if necessary.

    Args:
        conf: the whole configuration dict; reads conf['provider'] and
            conf['resources'].

    Returns:
        tuple: (jobs, vlans, nodes) where jobs is a list of (site, job_id),
        vlans a list of (site, kavlan_id) and nodes the sorted host list.
    """
    provider_conf = conf['provider']
    # Look if there is a running job or make a new reservation
    gridjob, _ = EX5.planning.get_job_by_name(provider_conf['name'])
    if gridjob is None:
        gridjob = self._make_reservation(conf)
    else:
        logging.info("Using running oargrid job %s" % gridjob)
    # Wait for the job to start
    EX5.wait_oargrid_job_start(gridjob)
    nodes = sorted(EX5.get_oargrid_job_nodes(gridjob),
                   key=lambda n: n.address)
    # Checking the number of nodes given
    # the distribution policy
    self._check_nodes(nodes=nodes,
                      resources=conf['resources'],
                      mode=provider_conf['role_distribution'])
    # vlans information
    job_sites = EX5.get_oargrid_job_oar_jobs(gridjob)
    jobs = []
    vlans = []
    for (job_id, site) in job_sites:
        jobs.append((site, job_id))
        # BUG FIX: the original queried the kavlan twice per job; reuse
        # the first answer instead of issuing a redundant remote call.
        vlan_id = EX5.get_oar_job_kavlan(job_id, site)
        if vlan_id is not None:
            vlans.append((site, vlan_id))
    return (jobs, vlans, nodes)
def grid_reload_from_id(gridjob):
    """Reload nodes, vlans and subnets from an existing oargrid job.

    Delegates the per-(job, site) network discovery to
    get_network_info_from_job_id, accumulating into the vlans/subnets lists.
    """
    logger.info("Reloading the resources from oargrid job %s", gridjob)
    gridjob = int(gridjob)
    nodes = ex5.get_oargrid_job_nodes(gridjob)
    vlans, subnets = [], []
    for job_id, site in ex5.get_oargrid_job_oar_jobs(gridjob):
        vlans, subnets = get_network_info_from_job_id(job_id,
                                                      site,
                                                      vlans,
                                                      subnets)
    return nodes, vlans, subnets
def _get_job(self):
    """Get the hosts from an existing job (if any) or from a new job.

    This will perform a reservation if necessary.

    Side effects: sets self.gridjob, self.nodes, self.start_date,
    self.user, self.jobs and self.vlans.
    """
    # Look if there is a running job or make a new reservation
    self.gridjob, _ = EX5.planning.get_job_by_name(self.config['name'])
    if self.gridjob is None:
        self._make_reservation()
    else:
        logging.info("Using running oargrid job %s" % self.gridjob)
    # Wait for the job to start
    EX5.wait_oargrid_job_start(self.gridjob)
    # NOTE: a retry loop (MAX_ATTEMPTS) used to live here but was already
    # commented out as dead code; it has been removed.
    self.nodes = sorted(EX5.get_oargrid_job_nodes(self.gridjob),
                        key=lambda n: n.address)
    # XXX check already done into `_deploy`.
    self._check_nodes(nodes=self.nodes,
                      resources=self.config['resources'],
                      mode=self.config['role_distribution'])
    # XXX(Ad_rien_) Start_date is never used, deadcode? - August
    # 11th 2016
    # BUG FIX: the original fetched the oargrid job info twice (once for
    # start_date, once for user); one call is enough.
    job_info = EX5.get_oargrid_job_info(self.gridjob)
    self.start_date = job_info.get('start_date')
    self.user = job_info.get('user')
    # vlans information
    job_sites = EX5.get_oargrid_job_oar_jobs(self.gridjob)
    self.jobs = []
    self.vlans = []
    for (job_id, site) in job_sites:
        self.jobs.append((site, job_id))
        # BUG FIX: query the kavlan once per job instead of twice.
        vlan_id = EX5.get_oar_job_kavlan(job_id, site)
        if vlan_id is not None:
            self.vlans.append((site, vlan_id))
def concretize_resources(resources, gridjob):
    """Bind the abstract resources description to an oargrid job.

    Concretizes first the nodes, then the kavlan networks found on every
    (oar job, site) pair of the oargrid job.
    """
    concretize_nodes(resources, ex5.get_oargrid_job_nodes(gridjob))
    vlans = []
    for job_id, site in ex5.get_oargrid_job_oar_jobs(gridjob):
        for vlan_id in ex5.get_oar_job_kavlan(job_id, site):
            vlans.append({"site": site, "vlan_id": vlan_id})
    concretize_networks(resources, vlans)
def get_job(self):
    """Get the hosts from an existing job (if any) or from a new job.

    This will perform a reservation if necessary.

    Side effects: sets self.gridjob, self.nodes, self.start_date,
    self.user, self.jobs and self.vlans.

    Returns:
        the oargrid job id.
    """
    # Look if there is a running job or make a new reservation
    self.gridjob, _ = EX5.planning.get_job_by_name(self.config['name'])
    if self.gridjob is None:
        self._make_reservation()
    else:
        logger.info("Using running oargrid job %s" %
                    style.emph(self.gridjob))
    # Wait for the job to start
    EX5.wait_oargrid_job_start(self.gridjob)
    # Nodes may not be available right after the job starts: retry a few
    # times (bounded by MAX_ATTEMPTS).
    attempts = 0
    self.nodes = None
    while self.nodes is None and attempts < MAX_ATTEMPTS:
        self.nodes = sorted(EX5.get_oargrid_job_nodes(self.gridjob),
                            key=lambda n: n.address)
        attempts += 1
    check_nodes(nodes=self.nodes,
                resources=self.config['resources'],
                mode=self.config['role_distribution'])
    # TODO - Start_date is never used, deadcode ? Ad_rien_ - August 11th 2016
    # BUG FIX: the original fetched the oargrid job info twice (once for
    # start_date, once for user); one call is enough.
    job_info = EX5.get_oargrid_job_info(self.gridjob)
    self.start_date = job_info.get('start_date')
    self.user = job_info.get('user')
    # vlans information
    job_sites = EX5.get_oargrid_job_oar_jobs(self.gridjob)
    self.jobs = []
    self.vlans = []
    for (job_id, site) in job_sites:
        self.jobs.append((site, job_id))
        # BUG FIX: query the kavlan once per job instead of twice.
        vlan_id = EX5.get_oar_job_kavlan(job_id, site)
        if vlan_id is not None:
            self.vlans.append((site, vlan_id))
    return self.gridjob
def get_oargrid_job_vm5k_resources(oargrid_job_id):
    """Retrieve the hosts list and (ip, mac) list by sites from an
    oargrid_job_id and return the resources dict needed by vm5k_deployment,
    with kavlan-global if used in the oargrid job.

    Args:
        oargrid_job_id: the oargrid job id (int or int-like string).

    Returns:
        dict: per-site resources, plus a 'global' entry when a global
        kavlan was reserved.
    """
    oargrid_job_id = int(oargrid_job_id)
    logger.info('Waiting job start')
    wait_oargrid_job_start(oargrid_job_id)
    resources = get_oar_job_vm5k_resources(
        [(oar_job_id, site)
         for oar_job_id, site in get_oargrid_job_oar_jobs(oargrid_job_id)])
    kavlan_global = None
    # BUG FIX: dict.iteritems() is Python 2 only; .items() behaves the
    # same here and works on both Python 2 and 3.
    for site, res in resources.items():
        # A kavlan id >= 10 marks the global vlan (per the docstring above);
        # stop at the first one found.
        if res['kavlan'] >= 10:
            kavlan_global = {'kavlan': res['kavlan'],
                             'ip_mac': resources[site]['ip_mac'],
                             'site': site}
            break
    if kavlan_global:
        resources['global'] = kavlan_global
    return resources
def get_job_by_name(job_name, sites=None):
    """Find a running job by name, looking at oargrid jobs first.

    Args:
        job_name: the job name to look for.
        sites: optional list of sites to search; defaults to every g5k site.

    Returns:
        (gridjob_id, None) when an oargrid job matches, (job_id, site)
        when a plain oar job matches, (None, None) otherwise.
    """
    logger.detail('Looking for a job named %s', style.emph(job_name))
    if not sites:
        sites = get_g5k_sites()
    # First pass: current oargrid jobs (their sub oar jobs carry the name).
    for g_job in get_current_oargrid_jobs():
        for oar_job in get_oargrid_job_oar_jobs(g_job):
            info = get_oar_job_info(oar_job[0], oar_job[1])
            if info['name'] == job_name:
                logger.info('Oargridjob %s found !', style.emph(g_job))
                return g_job, None
    # Second pass: plain oar jobs on every candidate site.
    for job in get_current_oar_jobs(sites):
        info = get_oar_job_info(job[0], job[1])
        if info['name'] == job_name:
            logger.info('Job %s found on site %s !',
                        style.emph(job[0]),
                        style.host(job[1]))
            return job
    return None, None
def main():
    """Deploy the nodes of one or several G5K jobs and prepare them for a
    SLOTH experiment (package install, ssh keys, peers/service-node files).

    Reads job ids from the command line (either --grid-job-id or a list of
    "site:job_id" strings), deploys a jessie-x64-nfs environment on every
    node, configures the nodes, copies SLOTH/INJECTOR trees to the site
    frontends and writes hosts.info / peers.info / service_node.info.
    """
    copy_outputs('config.log', 'config.log')
    args = parser.parse_args()
    whoami = os.getlogin()
    logger.info('whoami: %s', whoami)
    jobids = []
    # NOTE(review): `== None` should be `is None` (PEP 8); kept as-is here.
    if args.grid_job_id == None:
        jobids = args.job_ids
    else:
        # Expand an oargrid job into its per-site oar jobs, "site:job_id".
        grid_job_id = int(args.grid_job_id[0])
        jobids = ["%s:%d" % (site, job_id)
                  for job_id, site in get_oargrid_job_oar_jobs(grid_job_id)]
    logger.info('Using jobs %s', style.emph(' '.join(jobids)))
    # Split every "site:job_id" entry into the pieces used below.
    sites = [j.strip().split(':')[0] for j in jobids]
    frontends = [str('frontend.'+s) for s in sites]
    oar_ids = [j.strip().split(':')[1] for j in jobids]
    # (job_id, site) tuples, the argument order expected by get_oar_job_nodes.
    jobids_list=[(int(j.strip().split(':')[1]),str(j.strip().split(':')[0])) for j in jobids]
    # print sites
    # print oar_ids
    # print jobids_list
    # print frontends
    logger.info("Get list of associated nodes")
    nodes = [ job_nodes for job in jobids_list for job_nodes in get_oar_job_nodes(*job) ]
    # logger.info('%s', hosts_list(nodes))
    logger.info('%s', nodes)
    logger.info("Deploying %i nodes" % (len(nodes),))
    deployed, undeployed = deploy(Deployment(nodes, env_name = "jessie-x64-nfs"))
    logger.info("%i deployed, %i undeployed" % (len(deployed), len(undeployed)))
    ## Configure Host OSes
    logger.info('Finalize node customization')
    # use root to connect on the host
    default_connection_params['user'] = '******'
    ## Copy local .ssh to remote nodes:
    logger.info('Copy ssh entries into root of each node')
    Put(nodes, ['/home/'+whoami+'/.ssh/id_rsa','/home/'+whoami+'/.ssh/id_rsa.pub'],'.ssh/.').run()
    ## Install missing packages
    logger.info('| - Install Packages')
    # NOTE(review): "DEBIAN_MASTER" looks like a typo for DEBIAN_FRONTEND and
    # "Dpkgtions::=" for "Dpkg::Options::=" — confirm before relying on these
    # apt options; the command string is kept byte-identical here.
    install_packages = TaktukRemote('export DEBIAN_MASTER=noninteractive ; export https_proxy="https://proxy:3128"; apt-get -o Acquire::Check-Valid-Until=false update && apt-get install -y --force-yes python-pip lynx openjdk-8-jdk uuid-runtime cpufrequtils kanif -o Acquire::Check-Valid-Until=false -o Dpkgtions::="--force-confdef" -o Dpkgtions::="--force-confold" ; pip install tabview', nodes).run()
    ## Fix ulimit and related stuffs
    logger.info('| - set limit related stuffs')
    # Raise kernel limits (threads, map count, pids) for the experiment load.
    cmd = 'ulimit -c unlimited; sysctl -w vm.max_map_count=331072 ; echo 120000 > /proc/sys/kernel/threads-max ; echo 600000 > /proc/sys/vm/max_map_count ; echo 200000 > /proc/sys/kernel/pid_max'
    TaktukRemote(cmd, nodes).run()
    ## Copy the DHT-EXP hierarchy to the remote site
    logger.info('Copy sloth and injector files on each NFS server involved in the experiment')
    TaktukRemote('mkdir -p ~/SLOTH-EXP-TMP/', frontends, connection_params={'user': str(whoami)}).run()
    TaktukPut(frontends, ['./SLOTH_HOME' ],'./SLOTH-EXP-TMP/.', connection_params={'user': str(whoami)}).run()
    TaktukPut(frontends, ['./INJECTOR_HOME' ], './SLOTH-EXP-TMP/.', connection_params={'user': str(whoami)}).run()
    ## Prepare the address file for the sloth peers (please remind that the last node is dedicated for the injector
    logger.info('Prepare the peers list')
    f1 = open('./hosts.info', 'w')
    f2 = open('./peers.info', 'w')
    logger.info('enumerate %s', enumerate(nodes[:-1]))
    # One peers.info line per hardware thread of every node (except the last,
    # reserved for the injector); ports start at 3000/8000 and increase.
    i = 0
    for j, node in enumerate(nodes[:-1]):
        f1.write("%s\n" % (node.address))
        for cores in range(get_host_attributes(node)['architecture']['smt_size']):
            f2.write("%s:%d:%d\n" % (node.address, 3000 + i, 8000 + i))
            i = i + 1
    f1.close()
    f2.close()
    # The last node is the service/injector node.
    f = open('./service_node.info', 'w')
    f.write("%s" % nodes[-1].address)
    f.close()
    logger.info("Nodes are now ready, you should launch ./runExperiment.sh ... from the lyon frontend")
    logger.info("The list of sloth peers is in ./peers.info")
    logger.info("The service node is in ./service_node.info")
    logger.info("The injector will run on %s" % nodes[-1].address)
    logger.info("The usual(max) command should be : ./runExperiment.sh in_vivo %d %s/peers.info %s" % (i,os.getcwd(),nodes[-1].address))