def oargriddel(job_ids, frontend_connection_params = None, timeout = False):
    """Delete oargrid jobs.

    Ignores any error, so you can delete inexistant jobs, already
    deleted jobs, or jobs that you don't own. Those deletions will be
    ignored.

    :param job_ids: iterable of oar grid job ids.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``g5k_configuration['default_timeout']``. None means no
      timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    deletions = []
    for job_id in job_ids:
        proc = get_process("oargriddel %i" % (job_id,),
                           host = get_frontend_host(),
                           connection_params = make_connection_params(
                               frontend_connection_params,
                               default_frontend_connection_params))
        proc.timeout = timeout
        proc.nolog_exit_code = True  # failures are deliberately ignored
        proc.pty = True
        deletions.append(proc)
    # start every process first, then wait for each, so that all the
    # deletions run in parallel
    for proc in deletions:
        proc.start()
    for proc in deletions:
        proc.wait()
def get_oargrid_job_oar_jobs(oargrid_job_id = None, frontend_connection_params = None, timeout = False):
    """Return a list of tuples (oar job id, site), the list of individual
    oar jobs which make an oargrid job.

    :param oargrid_job_id: the oargrid job id.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    process = get_process("oargridstat %i" % (oargrid_job_id,),
                          host = get_frontend_host(),
                          connection_params = make_connection_params(frontend_connection_params,
                                                                     default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    if process.ok:
        job_specs = []
        # raw string: \w and \d are invalid escape sequences in a normal
        # string literal (SyntaxWarning since python 3.12)
        for m in re.finditer(r"^\t(\w+) --> (\d+)", process.stdout, re.MULTILINE):
            site = m.group(1)
            # oargridstat may print a name that is not a site: map it to
            # its site in that case
            if site not in get_g5k_sites():
                site = get_cluster_site(site)
            job_specs.append((int(m.group(2)), site))
        return job_specs
    else:
        raise ProcessesFailed([process])
def get_oargrid_job_nodes(oargrid_job_id, frontend_connection_params = None, timeout = False):
    """Return an iterable of `execo.host.Host` containing the hosts of an
    oargrid job.

    :param oargrid_job_id: the oargrid job id.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    # run the first oargridstat form, falling back to the second one if
    # the first fails (shell ||)
    process = get_process("oargridstat -wl %i 2>/dev/null || oargridstat -l %i 2>/dev/null" % (oargrid_job_id, oargrid_job_id),
                          host = get_frontend_host(),
                          connection_params = make_connection_params(frontend_connection_params,
                                                                     default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    if process.ok:
        # raw string: \S is an invalid escape sequence in a normal
        # string literal (SyntaxWarning since python 3.12)
        host_addresses = re.findall(r"(\S+)", process.stdout, re.MULTILINE)
        # deduplicate through a set
        return list(set([ Host(host_address) for host_address in host_addresses ]))
    else:
        raise ProcessesFailed([process])
def get_oargrid_job_nodes(oargrid_job_id, frontend_connection_params=None, timeout=False):
    """Return an iterable of `execo.host.Host` containing the hosts of an
    oargrid job.

    :param oargrid_job_id: the oargrid job id.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    # run the first oargridstat form, falling back to the second one if
    # the first fails (shell ||)
    process = get_process(
        "oargridstat -wl %i 2>/dev/null || oargridstat -l %i 2>/dev/null" %
        (oargrid_job_id, oargrid_job_id),
        host=get_frontend_host(),
        connection_params=make_connection_params(
            frontend_connection_params, default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    if process.ok:
        # raw string: \S is an invalid escape sequence in a normal
        # string literal (SyntaxWarning since python 3.12)
        host_addresses = re.findall(r"(\S+)", process.stdout, re.MULTILINE)
        # deduplicate through a set
        return list(
            set([Host(host_address) for host_address in host_addresses]))
    else:
        raise ProcessesFailed([process])
def _init_processes(self):
    # Build one kadeploy process per frontend, each deploying every host
    # of this deployment that belongs to that frontend, and store them
    # in self.processes.
    self.processes = []
    self._unique_hosts = get_hosts_set(self.deployment.hosts)
    # group the hosts to deploy by frontend
    frontends = dict()
    for host in self._unique_hosts:
        frontend = _get_host_frontend(host)
        if frontend in frontends:
            frontends[frontend].append(host)
        else:
            frontends[frontend] = [host]
    # one lifecycle notification expected per frontend process
    lifecycle_handler = ActionNotificationProcessLH(self, len(frontends))
    deploy_stdout_handler = _KadeployStdoutHandler()
    for frontend in frontends:
        # common command line, then one -m option per host of this frontend
        kadeploy_command = self.deployment._get_common_kadeploy_command_line()
        for host in frontends[frontend]:
            kadeploy_command += " -m %s" % (host.address,)
        p = get_process(kadeploy_command,
                        host = get_frontend_host(frontend),
                        connection_params = make_connection_params(self.frontend_connection_params,
                                                                   default_frontend_connection_params))
        p.pty = True
        p.timeout = self.timeout
        p.stdout_handlers.append(deploy_stdout_handler)
        # wrap user-supplied handlers so output is prefixed per frontend
        p.stdout_handlers.extend([ FrontendPrefixWrapper(h)
                                   for h in singleton_to_collection(self._stdout_handlers) ])
        p.stderr_handlers.extend([ FrontendPrefixWrapper(h)
                                   for h in singleton_to_collection(self._stderr_handlers) ])
        p.lifecycle_handlers.append(lifecycle_handler)
        # per-process bookkeeping attributes, presumably read back when
        # collecting deployment results -- TODO confirm against callers
        p.frontend = frontend
        p.kadeploy_hosts = [ host.address for host in frontends[frontend] ]
        p.deployed_hosts = set()
        p.undeployed_hosts = set()
        p.kadeployer = self
        self.processes.append(p)
def oargriddel(job_ids, frontend_connection_params=None, timeout=False):
    """Delete oargrid jobs.

    Ignores any error, so you can delete inexistant jobs, already
    deleted jobs, or jobs that you don't own. Those deletions will be
    ignored.

    :param job_ids: iterable of oar grid job ids.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``g5k_configuration['default_timeout']``. None means no
      timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    del_processes = []
    for jid in job_ids:
        process = get_process("oargriddel %i" % (jid,),
                              host=get_frontend_host(),
                              connection_params=make_connection_params(
                                  frontend_connection_params,
                                  default_frontend_connection_params))
        process.timeout = timeout
        # deletion errors are deliberately ignored, so don't log them
        process.nolog_exit_code = True
        process.pty = True
        del_processes.append(process)
    # launch all deletions in parallel, then wait for completion
    for process in del_processes:
        process.start()
    for process in del_processes:
        process.wait()
def oarsub(job_specs, frontend_connection_params = None, timeout = False, abort_on_error = False):
    """Submit jobs.

    :param job_specs: iterable of tuples (execo_g5k.oar.OarSubmission,
      frontend) with None for default frontend

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for submitting. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.

    :param abort_on_error: default False. If True, raises an exception
      on any error. If False, will returned the list of job got, even
      if incomplete (some frontends may have failed to answer).

    Returns a list of tuples (oarjob id, frontend), with frontend ==
    None for default frontend. If submission error, oarjob id ==
    None. The returned list matches, in the same order, the job_specs
    parameter.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    processes = []
    for (spec, frontend) in job_specs:
        oarsub_cmdline = get_oarsub_commandline(spec)
        p = get_process(oarsub_cmdline,
                        host = get_frontend_host(frontend),
                        connection_params = make_connection_params(frontend_connection_params,
                                                                   default_frontend_connection_params))
        p.timeout = timeout
        p.pty = True
        p.frontend = frontend
        processes.append(p)
    oar_job_ids = []
    if not processes:
        return oar_job_ids
    # run all submissions in parallel
    for process in processes:
        process.start()
    for process in processes:
        process.wait()
    failed_processes = []
    for process in processes:
        job_id = None
        if process.ok:
            # raw string: \d is an invalid escape sequence in a normal
            # string literal (SyntaxWarning since python 3.12)
            mo = re.search(r"^OAR_JOB_ID=(\d+)\s*$", process.stdout, re.MULTILINE)
            if mo is not None:
                job_id = int(mo.group(1))
        if job_id is None:
            failed_processes.append(process)
        oar_job_ids.append((job_id, process.frontend))
    if failed_processes and abort_on_error:
        raise ProcessesFailed(failed_processes)
    else:
        return oar_job_ids
def get_oar_job_kavlan(oar_job_id=None, frontend=None, frontend_connection_params=None, timeout=False):
    """Return the list of vlan ids of a job (if any).

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' in os.environ:
            oar_job_id = os.environ['OAR_JOB_ID']
        else:
            raise ValueError(
                "no oar job id given and no OAR_JOB_ID environment variable found"
            )
    countdown = Timer(timeout)
    # wait for the job to be started before querying its vlans
    wait_oar_job_start(oar_job_id, frontend, frontend_connection_params,
                       countdown.remaining())
    process = get_process('kavlan -j %s -V ' % oar_job_id,
                          host=get_frontend_host(frontend),
                          connection_params=make_connection_params(
                              frontend_connection_params,
                              default_frontend_connection_params))
    process.timeout = countdown.remaining()
    process.pty = True
    process.ignore_exit_code = True  # kavlan exit code != 0 if request
    process.nolog_exit_code = True  # is for a job without a vlan
    # reservation
    process.run()
    if process.ok:
        try:
            return [
                int(x)
                for x in process.stdout.strip().rstrip().split('\r\n')
            ]
        except ValueError:
            # narrowed from a bare except: only int() can raise here.
            # Handles cases where the job has no kavlan resource or when
            # kavlan isn't available.
            return []
    else:
        raise ProcessesFailed([process])
def get_oargrid_job_info(oargrid_job_id=None, frontend_connection_params=None, timeout=False):
    """Return a dict with informations about an oargrid job.

    :param oargrid_job_id: the oargrid job id.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.

    Hash returned contains these keys:

    - ``start_date``: unix timestamp of job's start date

    - ``walltime``: job's walltime in seconds

    - ``user``: job's user
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    process = get_process("oargridstat %i" % (oargrid_job_id,),
                          host=get_frontend_host(),
                          connection_params=make_connection_params(
                              frontend_connection_params,
                              default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    job_info = dict()
    # raw strings: \d and \S are invalid escape sequences in normal
    # string literals (SyntaxWarning since python 3.12)
    start_date_result = re.search(
        r"start date : (\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d)",
        process.stdout, re.MULTILINE)
    if start_date_result:
        start_date = oar_date_to_unixts(start_date_result.group(1))
        job_info['start_date'] = start_date
    walltime_result = re.search(r"walltime : (\d+:\d?\d:\d?\d)",
                                process.stdout, re.MULTILINE)
    if walltime_result:
        walltime = oar_duration_to_seconds(walltime_result.group(1))
        job_info['walltime'] = walltime
    user_result = re.search(r"user : (\S+)", process.stdout, re.MULTILINE)
    if user_result:
        user = user_result.group(1)
        job_info['user'] = user
    return job_info
def get_current_oargrid_jobs(start_between=None, end_between=None, frontend_connection_params=None, timeout=False):
    """Return a list of current active oargrid job ids.

    :param start_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose start date is in between these endpoints.

    :param end_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose end date is in between these endpoints.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if start_between:
        start_between = [get_unixts(t) for t in start_between]
    if end_between:
        end_between = [get_unixts(t) for t in end_between]
    process = get_process("oargridstat",
                          host=get_frontend_host(),
                          connection_params=make_connection_params(
                              frontend_connection_params,
                              default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    if process.ok:
        # raw string: \d is an invalid escape sequence in a normal
        # string literal (SyntaxWarning since python 3.12)
        jobs = re.findall(r"Reservation # (\d+):", process.stdout,
                          re.MULTILINE)
        oargrid_job_ids = [int(j) for j in jobs]
        if start_between or end_between:
            filtered_job_ids = []
            for job in oargrid_job_ids:
                # pass timeout by keyword: get_oargrid_job_info's second
                # positional parameter is frontend_connection_params, so
                # passing timeout positionally put it in the wrong slot
                info = get_oargrid_job_info(job, timeout=timeout)
                if (_date_in_range(info['start_date'], start_between)
                        and _date_in_range(
                            info['start_date'] + info['walltime'],
                            end_between)):
                    filtered_job_ids.append(job)
            oargrid_job_ids = filtered_job_ids
        return oargrid_job_ids
    else:
        raise ProcessesFailed([process])
def get_oar_job_nodes(oar_job_id=None, frontend=None, frontend_connection_params=None, timeout=False):
    """Return an iterable of `execo.host.Host` containing the hosts of an
    oar job.

    This method waits for the job start (the list of nodes isn't fixed
    until the job start).

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' in os.environ:
            oar_job_id = os.environ['OAR_JOB_ID']
        else:
            raise ValueError(
                "no oar job id given and no OAR_JOB_ID environment variable found"
            )
    countdown = Timer(timeout)
    # the node list isn't fixed before the job starts
    wait_oar_job_start(oar_job_id, frontend, frontend_connection_params,
                       countdown.remaining())
    # raw string: \| is an invalid escape sequence in a normal string
    # literal (SyntaxWarning since python 3.12); the backslash must
    # reach grep unchanged
    process = get_process(
        r"(oarstat -sj %(oar_job_id)i | grep 'Running\|Terminated\|Error') > /dev/null 2>&1 && oarstat -pj %(oar_job_id)i | oarprint host -f -"
        % {'oar_job_id': oar_job_id},
        host=get_frontend_host(frontend),
        connection_params=make_connection_params(
            frontend_connection_params, default_frontend_connection_params))
    process.timeout = countdown.remaining()
    process.shell = process.pty = True
    process.run()
    if process.ok:
        host_addresses = re.findall(r"(\S+)", process.stdout, re.MULTILINE)
        return [Host(host_address) for host_address in host_addresses]
    else:
        raise ProcessesFailed([process])
def get_current_oargrid_jobs(start_between = None, end_between = None, frontend_connection_params = None, timeout = False):
    """Return a list of current active oargrid job ids.

    :param start_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose start date is in between these endpoints.

    :param end_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose end date is in between these endpoints.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if start_between:
        start_between = [ get_unixts(t) for t in start_between ]
    if end_between:
        end_between = [ get_unixts(t) for t in end_between ]
    process = get_process("oargridstat",
                          host = get_frontend_host(),
                          connection_params = make_connection_params(frontend_connection_params,
                                                                     default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    if process.ok:
        # raw string: \d is an invalid escape sequence in a normal
        # string literal (SyntaxWarning since python 3.12)
        jobs = re.findall(r"Reservation # (\d+):", process.stdout, re.MULTILINE)
        oargrid_job_ids = [ int(j) for j in jobs ]
        if start_between or end_between:
            filtered_job_ids = []
            for job in oargrid_job_ids:
                # pass timeout by keyword: get_oargrid_job_info's second
                # positional parameter is frontend_connection_params, so
                # passing timeout positionally put it in the wrong slot
                info = get_oargrid_job_info(job, timeout = timeout)
                if (_date_in_range(info['start_date'], start_between)
                    and _date_in_range(info['start_date'] + info['walltime'], end_between)):
                    filtered_job_ids.append(job)
            oargrid_job_ids = filtered_job_ids
        return oargrid_job_ids
    else:
        raise ProcessesFailed([process])
def get_oar_job_kavlan(oar_job_id = None, frontend = None, frontend_connection_params = None, timeout = False):
    """Return the list of vlan ids of a job (if any).

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' in os.environ:
            oar_job_id = os.environ['OAR_JOB_ID']
        else:
            raise ValueError("no oar job id given and no OAR_JOB_ID environment variable found")
    countdown = Timer(timeout)
    # wait for the job to be started before querying its vlans
    wait_oar_job_start(oar_job_id, frontend, frontend_connection_params,
                       countdown.remaining())
    process = get_process(
        'kavlan -j %s -V ' % oar_job_id,
        host = get_frontend_host(frontend),
        connection_params = make_connection_params(
            frontend_connection_params,
            default_frontend_connection_params))
    process.timeout = countdown.remaining()
    process.pty = True
    process.ignore_exit_code = True  # kavlan exit code != 0 if request
    process.nolog_exit_code = True   # is for a job without a vlan
                                     # reservation
    process.run()
    if process.ok:
        try:
            return [
                int(x)
                for x in process.stdout.strip().rstrip().split('\r\n')
            ]
        except ValueError:
            # narrowed from a bare except: only int() can raise here.
            # Handles cases where the job has no kavlan resource or when
            # kavlan isn't available.
            return []
    else:
        raise ProcessesFailed([process])
def get_oargrid_job_info(oargrid_job_id = None, frontend_connection_params = None, timeout = False):
    """Return a dict with informations about an oargrid job.

    :param oargrid_job_id: the oargrid job id.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.

    Hash returned contains these keys:

    - ``start_date``: unix timestamp of job's start date

    - ``walltime``: job's walltime in seconds

    - ``user``: job's user
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    process = get_process("oargridstat %i" % (oargrid_job_id,),
                          host = get_frontend_host(),
                          connection_params = make_connection_params(frontend_connection_params,
                                                                     default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    job_info = dict()
    # raw strings: \d and \S are invalid escape sequences in normal
    # string literals (SyntaxWarning since python 3.12)
    start_date_result = re.search(r"start date : (\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d)",
                                  process.stdout, re.MULTILINE)
    if start_date_result:
        start_date = oar_date_to_unixts(start_date_result.group(1))
        job_info['start_date'] = start_date
    walltime_result = re.search(r"walltime : (\d+:\d?\d:\d?\d)",
                                process.stdout, re.MULTILINE)
    if walltime_result:
        walltime = oar_duration_to_seconds(walltime_result.group(1))
        job_info['walltime'] = walltime
    user_result = re.search(r"user : (\S+)", process.stdout, re.MULTILINE)
    if user_result:
        user = user_result.group(1)
        job_info['user'] = user
    return job_info
def get_oar_job_nodes(oar_job_id = None, frontend = None, frontend_connection_params = None, timeout = False):
    """Return an iterable of `execo.host.Host` containing the hosts of an
    oar job.

    This method waits for the job start (the list of nodes isn't fixed
    until the job start).

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' in os.environ:
            oar_job_id = os.environ['OAR_JOB_ID']
        else:
            raise ValueError("no oar job id given and no OAR_JOB_ID environment variable found")
    countdown = Timer(timeout)
    # the node list isn't fixed before the job starts
    wait_oar_job_start(oar_job_id, frontend, frontend_connection_params,
                       countdown.remaining())
    # raw string: \| is an invalid escape sequence in a normal string
    # literal (SyntaxWarning since python 3.12); the backslash must
    # reach grep unchanged
    process = get_process(r"(oarstat -sj %(oar_job_id)i | grep 'Running\|Terminated\|Error') > /dev/null 2>&1 && oarstat -pj %(oar_job_id)i | oarprint host -f -" % {'oar_job_id': oar_job_id},
                          host = get_frontend_host(frontend),
                          connection_params = make_connection_params(frontend_connection_params,
                                                                     default_frontend_connection_params))
    process.timeout = countdown.remaining()
    process.pty = True
    process.run()
    if process.ok:
        host_addresses = re.findall(r"(\S+)", process.stdout, re.MULTILINE)
        return [ Host(host_address) for host_address in host_addresses ]
    else:
        raise ProcessesFailed([process])
def _init_processes(self):
    # Build one kadeploy process per frontend, each deploying every host
    # of this deployment that belongs to that frontend, and store them
    # in self.processes.
    self.processes = []
    self._unique_hosts = get_hosts_set(self.deployment.hosts)
    # group the hosts to deploy by frontend
    frontends = dict()
    for host in self._unique_hosts:
        frontend = _get_host_frontend(host)
        if frontend in frontends:
            frontends[frontend].append(host)
        else:
            frontends[frontend] = [host]
    # one lifecycle notification expected per frontend process
    lifecycle_handler = ActionNotificationProcessLH(self, len(frontends))
    deploy_stdout_handler = _KadeployStdoutHandler()
    for frontend in frontends:
        # common command line, then one -m option per host of this frontend
        kadeploy_command = self.deployment._get_common_kadeploy_command_line(
        )
        for host in frontends[frontend]:
            kadeploy_command += " -m %s" % (host.address, )
        p = get_process(kadeploy_command,
                        host=get_frontend_host(frontend),
                        connection_params=make_connection_params(
                            self.frontend_connection_params,
                            default_frontend_connection_params))
        p.pty = True
        p.timeout = self.timeout
        p.stdout_handlers.append(deploy_stdout_handler)
        # wrap user-supplied handlers so output is prefixed per frontend
        p.stdout_handlers.extend([
            FrontendPrefixWrapper(h)
            for h in singleton_to_collection(self._stdout_handlers)
        ])
        p.stderr_handlers.extend([
            FrontendPrefixWrapper(h)
            for h in singleton_to_collection(self._stderr_handlers)
        ])
        p.lifecycle_handlers.append(lifecycle_handler)
        # per-process bookkeeping attributes, presumably read back when
        # collecting deployment results -- TODO confirm against callers
        p.frontend = frontend
        p.kadeploy_hosts = [host.address for host in frontends[frontend]]
        p.deployed_hosts = set()
        p.undeployed_hosts = set()
        p.kadeployer = self
        self.processes.append(p)
def get_oargrid_job_oar_jobs(oargrid_job_id=None, frontend_connection_params=None, timeout=False):
    """Return a list of tuples (oar job id, site), the list of individual
    oar jobs which make an oargrid job.

    :param oargrid_job_id: the oargrid job id.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    process = get_process("oargridstat %i" % (oargrid_job_id,),
                          host=get_frontend_host(),
                          connection_params=make_connection_params(
                              frontend_connection_params,
                              default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    if process.ok:
        job_specs = []
        # raw string: \w and \d are invalid escape sequences in a normal
        # string literal (SyntaxWarning since python 3.12)
        for m in re.finditer(r"^\t(\w+) --> (\d+)", process.stdout,
                             re.MULTILINE):
            site = m.group(1)
            # oargridstat may print a name that is not a site: map it to
            # its site in that case
            if site not in get_g5k_sites():
                site = get_cluster_site(site)
            job_specs.append((int(m.group(2)), site))
        return job_specs
    else:
        raise ProcessesFailed([process])
def get_oar_job_subnets(oar_job_id=None, frontend=None, frontend_connection_params=None, timeout=False):
    """Return a tuple containing an iterable of tuples (IP, MAC) and a
    dict containing the subnet parameters of the reservation (if any).

    subnet parameters dict has keys: 'ip_prefix', 'broadcast',
    'netmask', 'gateway', 'network', 'dns_hostname', 'dns_ip'.

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' in os.environ:
            oar_job_id = os.environ['OAR_JOB_ID']
        else:
            raise ValueError(
                "no oar job id given and no OAR_JOB_ID environment variable found"
            )
    countdown = Timer(timeout)
    # subnet information isn't available before the job starts
    wait_oar_job_start(oar_job_id, frontend, frontend_connection_params,
                       countdown.remaining())
    # Get ip adresses. Raw strings: \| is an invalid escape sequence in
    # a normal string literal (SyntaxWarning since python 3.12); the
    # backslash must reach grep unchanged.
    process_ip = get_process(
        r"(oarstat -sj %(oar_job_id)i | grep 'Running\|Terminated\|Error') > /dev/null 2>&1 && g5k-subnets -i -m -j %(oar_job_id)i"
        % {'oar_job_id': oar_job_id},
        host=get_frontend_host(frontend),
        connection_params=make_connection_params(
            frontend_connection_params, default_frontend_connection_params))
    process_ip.timeout = countdown.remaining()
    process_ip.shell = process_ip.pty = True
    process_ip.run()
    # Get network parameters
    process_net = get_process(
        r"(oarstat -sj %(oar_job_id)i | grep 'Running\|Terminated\|Error') > /dev/null 2>&1 && g5k-subnets -a -j %(oar_job_id)i"
        % {'oar_job_id': oar_job_id},
        host=get_frontend_host(frontend),
        connection_params=make_connection_params(
            frontend_connection_params, default_frontend_connection_params))
    process_net.timeout = countdown.remaining()
    process_net.shell = process_net.pty = True
    process_net.run()
    if process_net.ok and process_ip.ok:
        subnet_addresses = re.findall(r"(\S+)\s+(\S+)", process_ip.stdout,
                                      re.MULTILINE)
        process_net_out = process_net.stdout.rstrip().split('\t')
        network_params = dict()
        # g5k-subnets -a prints exactly 7 tab-separated fields
        if len(process_net_out) == 7:
            network_params = {
                "ip_prefix": process_net_out[0],
                "broadcast": process_net_out[1],
                "netmask": process_net_out[2],
                "gateway": process_net_out[3],
                "network": process_net_out[4],
                "dns_hostname": process_net_out[5],
                "dns_ip": process_net_out[6]
            }
        return (subnet_addresses, network_params)
    else:
        raise ProcessesFailed(
            [p for p in [process_net, process_ip] if not p.ok])
def get_oar_job_subnets(oar_job_id = None, frontend = None, frontend_connection_params = None, timeout = False):
    """Return a tuple containing an iterable of tuples (IP, MAC) and a
    dict containing the subnet parameters of the reservation (if any).

    subnet parameters dict has keys: 'ip_prefix', 'broadcast',
    'netmask', 'gateway', 'network', 'dns_hostname', 'dns_ip'.

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use
      ``execo_g5k.config.g5k_configuration['default_timeout']``. None
      means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' in os.environ:
            oar_job_id = os.environ['OAR_JOB_ID']
        else:
            raise ValueError("no oar job id given and no OAR_JOB_ID environment variable found")
    countdown = Timer(timeout)
    # subnet information isn't available before the job starts
    wait_oar_job_start(oar_job_id, frontend, frontend_connection_params,
                       countdown.remaining())
    # Get ip adresses. Raw strings: \| is an invalid escape sequence in
    # a normal string literal (SyntaxWarning since python 3.12); the
    # backslash must reach grep unchanged.
    process_ip = get_process(
        r"(oarstat -sj %(oar_job_id)i | grep 'Running\|Terminated\|Error') > /dev/null 2>&1 && g5k-subnets -i -m -j %(oar_job_id)i" % {'oar_job_id': oar_job_id},
        host = get_frontend_host(frontend),
        connection_params = make_connection_params(
            frontend_connection_params,
            default_frontend_connection_params))
    process_ip.timeout = countdown.remaining()
    process_ip.pty = True
    process_ip.run()
    # Get network parameters
    process_net = get_process(
        r"(oarstat -sj %(oar_job_id)i | grep 'Running\|Terminated\|Error') > /dev/null 2>&1 && g5k-subnets -a -j %(oar_job_id)i" % {'oar_job_id': oar_job_id},
        host = get_frontend_host(frontend),
        connection_params = make_connection_params(
            frontend_connection_params,
            default_frontend_connection_params))
    process_net.timeout = countdown.remaining()
    process_net.pty = True
    process_net.run()
    if process_net.ok and process_ip.ok:
        subnet_addresses = re.findall(r"(\S+)\s+(\S+)", process_ip.stdout,
                                      re.MULTILINE)
        process_net_out = process_net.stdout.rstrip().split('\t')
        network_params = dict()
        # g5k-subnets -a prints exactly 7 tab-separated fields
        if len(process_net_out) == 7:
            network_params = {
                "ip_prefix": process_net_out[0],
                "broadcast": process_net_out[1],
                "netmask": process_net_out[2],
                "gateway": process_net_out[3],
                "network": process_net_out[4],
                "dns_hostname": process_net_out[5],
                "dns_ip": process_net_out[6]
            }
        return (subnet_addresses, network_params)
    else:
        raise ProcessesFailed([ p for p in [process_net, process_ip] if not p.ok ])
def get_oar_job_info(oar_job_id=None, frontend=None, frontend_connection_params=None, timeout=False, nolog_exit_code=False, nolog_timeout=False, nolog_error=False):
    """Return a dict with informations about an oar job.

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``execo_g5k.config.g5k_configuration['default_timeout']``.
      None means no timeout.

    Hash returned may contain these keys:

    - ``start_date``: unix timestamp of job's start date

    - ``walltime``: job's walltime (seconds)

    - ``scheduled_start``: unix timestamp of job's start prediction
      (may change between invocations)

    - ``state``: job state. Possible states: 'Waiting', 'Hold',
      'toLaunch', 'toError', 'toAckReservation', 'Launching',
      'Running', 'Suspended', 'Resuming', 'Finishing', 'Terminated',
      'Error', see table jobs, column state, in oar documentation
      http://oar.imag.fr/sources/2.5/docs/documentation/OAR-DOCUMENTATION-ADMIN/#jobs

    - ``name``: job name

    But no info may be available as long as the job is not scheduled.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' in os.environ:
            # convert to int: the %i format below rejects the raw
            # string taken from the environment
            oar_job_id = int(os.environ['OAR_JOB_ID'])
        else:
            raise ValueError(
                "no oar job id given and no OAR_JOB_ID environment variable found"
            )
    process = get_process("oarstat -fj %i" % (oar_job_id, ),
                          host=get_frontend_host(frontend),
                          connection_params=make_connection_params(
                              frontend_connection_params,
                              default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.nolog_exit_code = nolog_exit_code
    process.nolog_timeout = nolog_timeout
    process.nolog_error = nolog_error
    process.run()
    # scrape each field independently from the oarstat -f output; any
    # field missing from the output is simply absent from the dict
    job_info = dict()
    start_date_result = re.search(
        r"^\s*startTime = (\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d)\s*$",
        process.stdout, re.MULTILINE)
    if start_date_result:
        start_date = oar_date_to_unixts(start_date_result.group(1))
        job_info['start_date'] = start_date
    walltime_result = re.search(r"^\s*walltime = (\d+:\d?\d:\d?\d)\s*$",
                                process.stdout, re.MULTILINE)
    if walltime_result:
        walltime = oar_duration_to_seconds(walltime_result.group(1))
        job_info['walltime'] = walltime
    scheduled_start_result = re.search(
        r"^\s*scheduledStart = (\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d)\s*$",
        process.stdout, re.MULTILINE)
    if scheduled_start_result:
        scheduled_start = oar_date_to_unixts(scheduled_start_result.group(1))
        job_info['scheduled_start'] = scheduled_start
    state_result = re.search(r"^\s*state = (\w*)\s*$", process.stdout,
                             re.MULTILINE)
    if state_result:
        job_info['state'] = state_result.group(1)
    name_result = re.search(r"^\s*name = ([ \t\S]*)\s*$", process.stdout,
                            re.MULTILINE)
    if name_result:
        job_info['name'] = name_result.group(1)
    return job_info
def oargridsub(job_specs, reservation_date = None, walltime = None, job_type = None, queue = None, directory = None, additional_options = None, frontend_connection_params = None, timeout = False):
    """Submit oargrid jobs.

    :param job_specs: iterable of tuples (OarSubmission,
      clusteralias). Reservation date, walltime, queue, directory,
      project, additional_options, command of the OarSubmission are
      ignored.

    :param reservation_date: grid job reservation date. Default: now.

    :param walltime: grid job walltime.

    :param job_type: type of job for all clusters: deploy, besteffort,
      cosystem, checkpoint, timesharing.

    :param queue: oar queue to use.

    :param directory: directory where the reservation will be
      launched.

    :param additional_options: passed directly to oargridsub on the
      command line.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``execo_g5k.config.g5k_configuration['default_timeout']``.
      None means no timeout.

    Returns a tuple (oargrid_job_id, ssh_key), or (None, None) if
    error.

    Note that, as oargrid does not handle correctly quoting sql
    clauses enclosed inside braces, this function tries to
    automatically overcome this limitation by adding some, with the
    right escaping (backslashes). Also, note that oargrid's command
    line parser does not handle correctly commas in sql clauses
    enclosed inside braces, as it considers it as a rdef
    separator. This prevents, for example, using comma separated list
    values for ``NOT IN`` clauses.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    oargridsub_cmdline = get_oargridsub_commandline(job_specs,
                                                    reservation_date,
                                                    walltime, job_type,
                                                    queue, directory,
                                                    additional_options)
    process = get_process(oargridsub_cmdline,
                          host = get_frontend_host(),
                          connection_params = make_connection_params(
                              frontend_connection_params,
                              default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    job_id = None
    ssh_key = None
    if process.ok:
        # parse the grid reservation id and the ssh key path from the
        # oargridsub output
        mo = re.search(r"^\[OAR_GRIDSUB\] Grid reservation id = (\d+)\s*$",
                       process.stdout, re.MULTILINE)
        if mo is not None:
            job_id = int(mo.group(1))
        mo = re.search(r"^\[OAR_GRIDSUB\] SSH KEY : (\S*)\s*$",
                       process.stdout, re.MULTILINE)
        if mo is not None:
            ssh_key = mo.group(1)
    # a submission without a parsed job id is a failure, even if the
    # process itself exited ok
    if job_id is not None:
        return (job_id, ssh_key)
    else:
        return (None, None)
def get_current_oar_jobs(frontends=None, start_between=None, end_between=None, frontend_connection_params=None, timeout=False, abort_on_error=False):
    """Return a list of current active oar job ids.

    The list contains tuples (oarjob id, frontend).

    :param frontends: an iterable of frontends to connect to. A
      frontend with value None means default frontend. If frontends ==
      None, means get current oar jobs only for default frontend.

    :param start_between: a tuple (low, high) of endpoints. Filters
      and returns only jobs whose start date is in between these
      endpoints.

    :param end_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose end date is in between these endpoints.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``execo_g5k.config.g5k_configuration['default_timeout']``.
      None means no timeout.

    :param abort_on_error: default False. If True, raises an exception
      on any error. If False, will returned the list of job got, even
      if incomplete (some frontends may have failed to answer).
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if start_between:
        start_between = [get_unixts(t) for t in start_between]
    if end_between:
        end_between = [get_unixts(t) for t in end_between]
    processes = []
    if frontends is None:
        frontends = [None]
    for frontend in frontends:
        p = get_process("oarstat -u",
                        host=get_frontend_host(frontend),
                        connection_params=make_connection_params(
                            frontend_connection_params,
                            default_frontend_connection_params))
        p.timeout = timeout
        p.pty = True
        p.frontend = frontend
        processes.append(p)
    oar_job_ids = []
    if len(processes) == 0:
        return oar_job_ids
    # query all frontends in parallel, then wait for all answers
    for process in processes:
        process.start()
    for process in processes:
        process.wait()
    failed_processes = []
    for process in processes:
        if process.ok:
            # each job line of oarstat -u starts with the numeric job id
            jobs = re.findall(r"^(\d+)\s", process.stdout, re.MULTILINE)
            oar_job_ids.extend([(int(jobid), process.frontend)
                                for jobid in jobs])
        else:
            failed_processes.append(process)
    if len(failed_processes) > 0 and abort_on_error:
        raise ProcessesFailed(failed_processes)
    else:
        if start_between or end_between:
            filtered_job_ids = []
            for jobfrontend in oar_job_ids:
                info = get_oar_job_info(jobfrontend[0], jobfrontend[1],
                                        frontend_connection_params, timeout,
                                        nolog_exit_code=True,
                                        nolog_timeout=True,
                                        nolog_error=True)
                # per get_oar_job_info, start_date / walltime may be
                # absent while the job is not scheduled: skip such jobs
                # instead of raising KeyError
                if 'start_date' not in info or 'walltime' not in info:
                    continue
                if (_date_in_range(info['start_date'], start_between)
                    and _date_in_range(
                        info['start_date'] + info['walltime'],
                        end_between)):
                    filtered_job_ids.append(jobfrontend)
            oar_job_ids = filtered_job_ids
        return oar_job_ids
def oarsub(job_specs, frontend_connection_params=None, timeout=False, abort_on_error=False):
    """Submit jobs.

    :param job_specs: iterable of tuples (execo_g5k.oar.OarSubmission,
      frontend) with None for default frontend

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for submitting. Default is False, which
      means use ``execo_g5k.config.g5k_configuration['default_timeout']``.
      None means no timeout.

    :param abort_on_error: default False. If True, raises an exception
      on any error. If False, will returned the list of job got, even
      if incomplete (some frontends may have failed to answer).

    Returns a list of tuples (oarjob id, frontend), with frontend ==
    None for default frontend. If submission error, oarjob id ==
    None. The returned list matches, in the same order, the job_specs
    parameter.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    processes = []
    for (spec, frontend) in job_specs:
        oarsub_cmdline = get_oarsub_commandline(spec)
        p = get_process(oarsub_cmdline,
                        host=get_frontend_host(frontend),
                        connection_params=make_connection_params(
                            frontend_connection_params,
                            default_frontend_connection_params))
        p.timeout = timeout
        p.pty = True
        p.frontend = frontend
        processes.append(p)
    oar_job_ids = []
    if len(processes) == 0:
        return oar_job_ids
    # submit on all frontends in parallel, then wait for all answers
    for process in processes:
        process.start()
    for process in processes:
        process.wait()
    failed_processes = []
    for process in processes:
        job_id = None
        if process.ok:
            mo = re.search(r"^OAR_JOB_ID=(\d+)\s*$", process.stdout,
                           re.MULTILINE)
            if mo is not None:
                job_id = int(mo.group(1))
        # a submission whose output contains no job id is a failure,
        # even if the process exited ok; it still yields a (None,
        # frontend) entry so the result matches job_specs order
        if job_id is None:
            failed_processes.append(process)
        oar_job_ids.append((job_id, process.frontend))
    if len(failed_processes) > 0 and abort_on_error:
        raise ProcessesFailed(failed_processes)
    else:
        return oar_job_ids
def oargridsub(job_specs, reservation_date=None, walltime=None, job_type=None, queue=None, directory=None, additional_options=None, frontend_connection_params=None, timeout=False):
    """Submit oargrid jobs.

    NOTE(review): this is a duplicate of an identical oargridsub
    defined earlier in this file; this later definition shadows the
    earlier one. Consider removing one of the two.

    :param job_specs: iterable of tuples (OarSubmission,
      clusteralias). Reservation date, walltime, queue, directory,
      project, additional_options, command of the OarSubmission are
      ignored.

    :param reservation_date: grid job reservation date. Default: now.

    :param walltime: grid job walltime.

    :param job_type: type of job for all clusters: deploy, besteffort,
      cosystem, checkpoint, timesharing.

    :param queue: oar queue to use.

    :param directory: directory where the reservation will be
      launched.

    :param additional_options: passed directly to oargridsub on the
      command line.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``execo_g5k.config.g5k_configuration['default_timeout']``.
      None means no timeout.

    Returns a tuple (oargrid_job_id, ssh_key), or (None, None) if
    error.

    Note that, as oargrid does not handle correctly quoting sql
    clauses enclosed inside braces, this function tries to
    automatically overcome this limitation by adding some, with the
    right escaping (backslashes). Also, note that oargrid's command
    line parser does not handle correctly commas in sql clauses
    enclosed inside braces, as it considers it as a rdef
    separator. This prevents, for example, using comma separated list
    values for ``NOT IN`` clauses.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    oargridsub_cmdline = get_oargridsub_commandline(job_specs,
                                                    reservation_date,
                                                    walltime, job_type,
                                                    queue, directory,
                                                    additional_options)
    process = get_process(oargridsub_cmdline,
                          host=get_frontend_host(),
                          connection_params=make_connection_params(
                              frontend_connection_params,
                              default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    job_id = None
    ssh_key = None
    if process.ok:
        # parse the grid reservation id and the ssh key path from the
        # oargridsub output
        mo = re.search(r"^\[OAR_GRIDSUB\] Grid reservation id = (\d+)\s*$",
                       process.stdout, re.MULTILINE)
        if mo is not None:
            job_id = int(mo.group(1))
        mo = re.search(r"^\[OAR_GRIDSUB\] SSH KEY : (\S*)\s*$",
                       process.stdout, re.MULTILINE)
        if mo is not None:
            ssh_key = mo.group(1)
    # a submission without a parsed job id is a failure, even if the
    # process itself exited ok
    if job_id is not None:
        return (job_id, ssh_key)
    else:
        return (None, None)
def get_oar_job_info(oar_job_id = None, frontend = None, frontend_connection_params = None, timeout = False, nolog_exit_code = False, nolog_timeout = False, nolog_error = False):
    """Return a dict with informations about an oar job.

    NOTE(review): this is a duplicate of an identical
    get_oar_job_info defined earlier in this file; this later
    definition shadows the earlier one. Consider removing one of the
    two.

    :param oar_job_id: the oar job id. If None given, will try to get
      it from ``OAR_JOB_ID`` environment variable.

    :param frontend: the frontend of the oar job. If None given, use
      default frontend

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``execo_g5k.config.g5k_configuration['default_timeout']``.
      None means no timeout.

    Hash returned may contain these keys:

    - ``start_date``: unix timestamp of job's start date

    - ``walltime``: job's walltime (seconds)

    - ``scheduled_start``: unix timestamp of job's start prediction
      (may change between invocations)

    - ``state``: job state. Possible states: 'Waiting', 'Hold',
      'toLaunch', 'toError', 'toAckReservation', 'Launching',
      'Running', 'Suspended', 'Resuming', 'Finishing', 'Terminated',
      'Error', see table jobs, column state, in oar documentation
      http://oar.imag.fr/sources/2.5/docs/documentation/OAR-DOCUMENTATION-ADMIN/#jobs

    - ``name``: job name

    But no info may be available as long as the job is not scheduled.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if oar_job_id is None:
        if 'OAR_JOB_ID' in os.environ:
            # convert to int: the %i format below rejects the raw
            # string taken from the environment
            oar_job_id = int(os.environ['OAR_JOB_ID'])
        else:
            raise ValueError("no oar job id given and no OAR_JOB_ID environment variable found")
    process = get_process("oarstat -fj %i" % (oar_job_id,),
                          host = get_frontend_host(frontend),
                          connection_params = make_connection_params(
                              frontend_connection_params,
                              default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.nolog_exit_code = nolog_exit_code
    process.nolog_timeout = nolog_timeout
    process.nolog_error = nolog_error
    process.run()
    # scrape each field independently from the oarstat -f output; any
    # field missing from the output is simply absent from the dict
    job_info = dict()
    start_date_result = re.search(
        r"^\s*startTime = (\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d)\s*$",
        process.stdout, re.MULTILINE)
    if start_date_result:
        start_date = oar_date_to_unixts(start_date_result.group(1))
        job_info['start_date'] = start_date
    walltime_result = re.search(r"^\s*walltime = (\d+:\d?\d:\d?\d)\s*$",
                                process.stdout, re.MULTILINE)
    if walltime_result:
        walltime = oar_duration_to_seconds(walltime_result.group(1))
        job_info['walltime'] = walltime
    scheduled_start_result = re.search(
        r"^\s*scheduledStart = (\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d)\s*$",
        process.stdout, re.MULTILINE)
    if scheduled_start_result:
        scheduled_start = oar_date_to_unixts(scheduled_start_result.group(1))
        job_info['scheduled_start'] = scheduled_start
    state_result = re.search(r"^\s*state = (\w*)\s*$", process.stdout,
                             re.MULTILINE)
    if state_result:
        job_info['state'] = state_result.group(1)
    name_result = re.search(r"^\s*name = ([ \t\S]*)\s*$", process.stdout,
                            re.MULTILINE)
    if name_result:
        job_info['name'] = name_result.group(1)
    return job_info
def get_current_oar_jobs(frontends = None, start_between = None, end_between = None, frontend_connection_params = None, timeout = False, abort_on_error = False):
    """Return a list of current active oar job ids.

    The list contains tuples (oarjob id, frontend).

    NOTE(review): this is a duplicate of an identical
    get_current_oar_jobs defined earlier in this file; this later
    definition shadows the earlier one. Consider removing one of the
    two.

    :param frontends: an iterable of frontends to connect to. A
      frontend with value None means default frontend. If frontends ==
      None, means get current oar jobs only for default frontend.

    :param start_between: a tuple (low, high) of endpoints. Filters
      and returns only jobs whose start date is in between these
      endpoints.

    :param end_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose end date is in between these endpoints.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``execo_g5k.config.g5k_configuration['default_timeout']``.
      None means no timeout.

    :param abort_on_error: default False. If True, raises an exception
      on any error. If False, will returned the list of job got, even
      if incomplete (some frontends may have failed to answer).
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if start_between:
        start_between = [ get_unixts(t) for t in start_between ]
    if end_between:
        end_between = [ get_unixts(t) for t in end_between ]
    processes = []
    if frontends is None:
        frontends = [ None ]
    for frontend in frontends:
        p = get_process("oarstat -u",
                        host = get_frontend_host(frontend),
                        connection_params = make_connection_params(
                            frontend_connection_params,
                            default_frontend_connection_params))
        p.timeout = timeout
        p.pty = True
        p.frontend = frontend
        processes.append(p)
    oar_job_ids = []
    if len(processes) == 0:
        return oar_job_ids
    # query all frontends in parallel, then wait for all answers
    for process in processes:
        process.start()
    for process in processes:
        process.wait()
    failed_processes = []
    for process in processes:
        if process.ok:
            # each job line of oarstat -u starts with the numeric job id
            jobs = re.findall(r"^(\d+)\s", process.stdout, re.MULTILINE)
            oar_job_ids.extend([ (int(jobid), process.frontend)
                                 for jobid in jobs ])
        else:
            failed_processes.append(process)
    if len(failed_processes) > 0 and abort_on_error:
        raise ProcessesFailed(failed_processes)
    else:
        if start_between or end_between:
            filtered_job_ids = []
            for jobfrontend in oar_job_ids:
                info = get_oar_job_info(jobfrontend[0], jobfrontend[1],
                                        frontend_connection_params, timeout,
                                        nolog_exit_code = True,
                                        nolog_timeout = True,
                                        nolog_error = True)
                # per get_oar_job_info, start_date / walltime may be
                # absent while the job is not scheduled: skip such jobs
                # instead of raising KeyError
                if 'start_date' not in info or 'walltime' not in info:
                    continue
                if (_date_in_range(info['start_date'], start_between)
                    and _date_in_range(info['start_date'] + info['walltime'],
                                       end_between)):
                    filtered_job_ids.append(jobfrontend)
            oar_job_ids = filtered_job_ids
        return oar_job_ids