def getStatsAsXmlString(self):
    """
    Serialize the collected per-process resource statistics as an
    ascii-encoded XML string (bytes).

    Returns a ``<resource_usage>`` element.  When ``self.pid_stats`` is
    empty the element only carries a ``noStatsRecorded="true"`` attribute.
    Any error during serialization is logged and reported through a
    ``noStatsRecorded="Exception"`` attribute instead of propagating.
    """
    local_document = xml.Document()
    resource_stat_xml = local_document.createElement("resource_usage")

    # No entries in the stats dict: signal that explicitly and bail out.
    if not self.pid_stats:
        resource_stat_xml.setAttribute("noStatsRecorded", "true")
        return resource_stat_xml.toxml(encoding="ascii")

    try:
        for idx, (key, value) in enumerate(self.pid_stats.items()):
            if value:  # only emit processes that actually have samples
                child_pid = add_child(resource_stat_xml, "process")
                child_pid.setAttribute("idx", str(idx))
                # The first entry should contain the exec name;
                # only needed once per process.
                child_pid.setAttribute("executable", str(value[0][0]))
                child_pid.setAttribute("pid", str(key))
                for entry in value:
                    if "MEM" in str(entry[6]):  # this is the default value
                        continue
                    data_point = add_child(child_pid, "data_point")
                    data_point.setAttribute("timestamp", str(entry[1]))
                    data_point.setAttribute("read_bytes", str(entry[2]))
                    data_point.setAttribute("write_bytes", str(entry[3]))
                    data_point.setAttribute("cancelled_bytes", str(entry[4]))
                    data_point.setAttribute("cpu", str(entry[5]))
                    data_point.setAttribute("mem", str(entry[6]))
    except Exception:
        # Was a bare 'except:': narrowed so SystemExit/KeyboardInterrupt
        # still propagate while any data error is still reported in-band.
        self.logger.error("monitoring statistic recording failed")
        resource_stat_xml.setAttribute("noStatsRecorded", "Exception")
        return resource_stat_xml.toxml(encoding="ascii")

    return resource_stat_xml.toxml(encoding="ascii")
def test_get_child(self):
    """get_child must return the *first* child element matching the name."""
    local_document = xml.Document()
    head = local_document.createElement("head")
    child = xmllogging.add_child(head, "child")
    second_child = xmllogging.add_child(head, "second_child")
    third_child = xmllogging.add_child(head, "child")

    # call the function
    returned_child = get_child(head, "child")

    # test output (typo fixed: "dit not" -> "did not")
    self.assertTrue(
        returned_child == child,
        "get_child did not return the first child matching the name")
    self.assertTrue(
        returned_child != third_child,
        "get_child did not return the first child matching the name")
def test_get_child(self):
    """get_child must return the *first* child element matching the name."""
    local_document = xml.Document()
    head = local_document.createElement("head")
    child = xmllogging.add_child(head, "child")
    second_child = xmllogging.add_child(head, "second_child")
    third_child = xmllogging.add_child(head, "child")

    # call the function
    returned_child = get_child(head, "child")

    # test output (typo fixed: "dit not" -> "did not")
    self.assertTrue(
        returned_child == child,
        "get_child did not return the first child matching the name")
    self.assertTrue(
        returned_child != third_child,
        "get_child did not return the first child matching the name")
def getStatsAsXmlString(self):
    """
    Return the collected resource statistics as an ascii XML string (str).

    Data is cleaned and labeled according to the metric used.  The
    ``<resource_usage>`` root element carries the owning recipe pid; when
    no statistics were collected (or serialization fails) the element
    carries a ``noStatsRecorded`` attribute instead of ``process`` children.
    """
    local_document = xml.Document()
    resource_stat_xml = local_document.createElement("resource_usage")
    resource_stat_xml.setAttribute("node_recipe_pid", str(self.owner_pid))

    # No entries in the stats dict: signal that explicitly and bail out.
    if not self.pid_stats:
        resource_stat_xml.setAttribute("noStatsRecorded", "true")
        return resource_stat_xml.toxml(encoding="ascii").decode('ascii')

    try:
        # TODO: The returned values are not in order and the owner PID
        # might not be printed with idx 0. Maybe print separately.
        for idx, (key, value) in enumerate(self.pid_stats.items()):
            if value:  # only emit processes that actually have samples
                child_pid = add_child(resource_stat_xml, "process")
                child_pid.setAttribute("idx", str(idx))
                # The first entry should contain the exec name;
                # only needed once per process.
                child_pid.setAttribute("executable", str(value[0][0]))
                child_pid.setAttribute("pid", str(key))
                for entry in value:
                    # TODO: probably no longer needed with updated bash
                    # script
                    if "MEM" in str(entry[6]):  # this is the default value
                        continue
                    data_point = add_child(child_pid, "data_point")
                    data_point.setAttribute("timestamp", str(entry[1]))
                    data_point.setAttribute("read_bytes", str(entry[2]))
                    data_point.setAttribute("write_bytes", str(entry[3]))
                    data_point.setAttribute("cancelled_bytes", str(entry[4]))
                    data_point.setAttribute("cpu", str(entry[5]))
                    data_point.setAttribute("mem", str(entry[6]))
    except Exception:
        # Was a bare 'except:': narrowed so SystemExit/KeyboardInterrupt
        # still propagate.  'warn' is a deprecated alias of 'warning'.
        self.logger.warning("monitoring statistic recording failed")
        resource_stat_xml.setAttribute("noStatsRecorded", "Exception")

    # Single exit point: the previous code returned the identical
    # expression from both the except handler and the function end
    # (resolves the old "coalesce these two returns" TODO).
    return resource_stat_xml.toxml(encoding="ascii").decode('ascii')
def test_add_child(self):
    """add_child must append exactly one correctly named child and return it."""
    local_document = xml.Document()
    head = local_document.createElement("head")
    returned_node = xmllogging.add_child(head, "child")

    # failure message grammar fixed: "add more then one" -> "added more than one"
    self.assertTrue(
        len(head.childNodes) == 1,
        "add_child added more than one child")
    self.assertTrue(
        head.childNodes[0].tagName == "child",
        "add_child added a child with an incorrect name")
    self.assertTrue(
        returned_node == head.childNodes[0],
        "add_child should return the created node")
def test_get_child_not_found(self):
    """get_child must return None when no child matches the requested name."""
    local_document = xml.Document()
    head = local_document.createElement("head")
    child = xmllogging.add_child(head, "child")

    # call the function with a name that is not present
    returned_child = get_child(head, "does_not_exist")

    # assertIsNone reports the offending value on failure and avoids the
    # non-idiomatic '== None' comparison (None checks should use identity)
    self.assertIsNone(
        returned_child,
        "when no children are found get_child should return None")
def test_get_child_not_found(self):
    """get_child must return None when no child matches the requested name."""
    local_document = xml.Document()
    head = local_document.createElement("head")
    child = xmllogging.add_child(head, "child")

    # call the function with a name that is not present
    returned_child = get_child(head, "does_not_exist")

    # assertIsNone reports the offending value on failure and avoids the
    # non-idiomatic '== None' comparison (None checks should use identity)
    self.assertIsNone(
        returned_child,
        "when no children are found get_child should return None")
def test_add_child(self):
    """add_child must append exactly one correctly named child and return it."""
    local_document = xml.Document()
    head = local_document.createElement("head")
    returned_node = xmllogging.add_child(head, "child")

    # failure message grammar fixed: "add more then one" -> "added more than one"
    self.assertTrue(
        len(head.childNodes) == 1,
        "add_child added more than one child")
    self.assertTrue(
        head.childNodes[0].tagName == "child",
        "add_child added a child with an incorrect name")
    self.assertTrue(
        returned_node == head.childNodes[0],
        "add_child should return the created node")
def _schedule_jobs(self, jobs, max_per_node = None):
    """
    Schedule a series of compute jobs. Blocks until completion.

    :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
    :param max_per_node: maximum number of simultaneous jobs on any given node
    :type max_per_node: integer or none
    :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
    """
    threadpool = []
    jobpool = {}
    # Fall back to the configuration file when no explicit limit is given.
    if not max_per_node and self.config.has_option('remote', 'max_per_node'):
        max_per_node = self.config.getint('remote', 'max_per_node')
    limiter = ProcessLimiter(max_per_node)
    # Shared event used to abort all dispatch threads at once.
    killswitch = threading.Event()

    if max_per_node:
        self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

    # Start the job server and dispatch every job in its own thread;
    # threadwatcher joins them and sets the killswitch on failure.
    with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
        self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
        for job_id, job in enumerate(jobs):
            jobpool[job_id] = job
            threadpool.append(
                threading.Thread(
                    target = job.dispatch,
                    args = (
                        self.logger, self.config, limiter, job_id,
                        jobhost, jobport, self.error, killswitch
                    )
                )
            )
        threadwatcher(threadpool, self.logger, killswitch)

    # A set killswitch means one of the jobs asked to abort the pipeline.
    if killswitch.isSet():
        raise PipelineQuit()

    # Add information regarding specific nodes to an xml node.
    self.logger.debug("Adding node_logging_information")
    local_document = xml.Document()
    node_durations = local_document.createElement("nodes")
    for job_id, job in enumerate(jobs):
        # Test if the duration is there
        if "job_duration" in job.results:
            child_node_duration = add_child(node_durations, "job")
            child_node_duration.setAttribute("job_id", str(job_id))
            child_node_duration.setAttribute("duration",
                                             str(job.results["job_duration"]))

            # return code if present (Not there on error)
            if "returncode" in job.results:
                child_node_duration.setAttribute(
                    "returncode", str(job.results['returncode']))
            else:
                child_node_duration.setAttribute(
                    "returncode", str(-1))

    # manually add the result xml as an ingredient output.
    # this allows backward compatible logging: If not read an additional
    # output does not matter
    self.outputs._fields["return_xml"] = ingredient.StringField(
        help = "XML return data.")
    # NOTE(review): toxml(encoding=...) returns bytes on Python 3 — confirm
    # StringField accepts that (a sibling variant of this method decodes).
    self.outputs["return_xml"] = node_durations.toxml(encoding = "ascii")

    return jobpool
def _schedule_jobs(self, jobs, max_per_node = None):
    """
    Schedule a series of compute jobs. Blocks until completion.

    :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
    :param max_per_node: maximum number of simultaneous jobs on any given node
    :type max_per_node: integer or none
    :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
    """
    threadpool = []
    jobpool = {}
    # Fall back to the configuration file when no explicit limit is given.
    if not max_per_node and self.config.has_option('remote', 'max_per_node'):
        max_per_node = self.config.getint('remote', 'max_per_node')
    limiter = ProcessLimiter(max_per_node)
    # Shared event used to abort all dispatch threads at once.
    killswitch = threading.Event()

    if max_per_node:
        self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

    # External cluster stuff:
    # derive the per-job hosts from the batch system in use, if any.
    try:
        method = self.config.get('remote', 'method')
    except:  # NOTE(review): bare except — any config failure means "no method"
        method = None

    redistribute_hosts = False
    # JURECA SLURM
    if method == 'slurm_srun':
        # Run 'hostname' on every allocated node to learn the host list.
        nodeliststr = []
        hargs = ['srun', 'hostname']
        proc = Popen(hargs, False, stdout=PIPE, stderr=None)
        tup = proc.communicate()
        nodeliststr = tup[0].rstrip('\n').split('\n')
        # remove duplicates. order not important
        nodeliststr = list(set(nodeliststr))
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True
    # Hertfordshire cluster
    elif method == 'pbs_ssh':
        # special case - get the list of nodes from the pbs job
        nodeliststr = []
        try:
            filename = os.environ['PBS_NODEFILE']
        except KeyError:
            self.logger.error('PBS_NODEFILE not found.')
            raise PipelineQuit()
        with open(filename, 'r') as file:
            for line in file:
                node_name = line.split()[0]
                if node_name not in nodeliststr:
                    nodeliststr.append(node_name)
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True
    # get hostlist from slurm, but start jobs via ssh
    elif method == 'slurm_ssh':
        try:
            hostlist = os.environ['SLURM_JOB_NODELIST']
        except KeyError:
            self.logger.error('SLURM_JOB_NODELIST not found. You must have a slurm reservation!')
            raise PipelineQuit()
        nodeliststr = expand_slurm_hostlist(hostlist)
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True
    # generic case, node-names in an env-variable
    elif method == 'ssh_generic':
        nodeliststr = []
        try:
            env_name = self.config.get('remote', 'nodelist_variable')
        except:  # NOTE(review): bare except — falls back to the default name
            env_name = 'PIPELINE_NODES'
        try:
            nodes = os.environ[env_name]
        except KeyError:
            self.logger.error('Env-variable \"'+env_name+'\" not found.')
            raise PipelineQuit()
        # Accept a bracketed, comma-separated list, e.g. "[host1, host2]".
        nodeliststr = [node.strip() for node in nodes.strip('[] ').split(',')]
        # remove duplicates. order not important
        nodeliststr = list(set(nodeliststr))
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # re-distribute the hosts if requested
    if redistribute_hosts:
        # equal distribution
        total = len(jobs)
        # when nodes crash? length of slurm_nodelist and env slurm_nnodes
        # dont match anymore
        nnodes = len(nodeliststr)
        # round robin
        nodelist = []
        for i in range(total):
            nodelist.append(nodeliststr[i%nnodes])
        for i, job in enumerate(jobs):
            job.host = nodelist[i]

    # Start the job server and dispatch every job in its own thread;
    # threadwatcher joins them and sets the killswitch on failure.
    with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
        self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
        for job_id, job in enumerate(jobs):
            jobpool[job_id] = job
            threadpool.append(
                threading.Thread(
                    target = job.dispatch,
                    args = (
                        self.logger, self.config, limiter, job_id,
                        jobhost, jobport, self.error, killswitch
                    )
                )
            )
        threadwatcher(threadpool, self.logger, killswitch)

    # A set killswitch means one of the jobs asked to abort the pipeline.
    if killswitch.isSet():
        raise PipelineQuit()

    # Add information regarding specific nodes to an xml node.
    self.logger.debug("Adding node_logging_information")
    local_document = xml.Document()
    node_durations = local_document.createElement("nodes")
    for job_id, job in enumerate(jobs):
        # Test if the duration is there
        # fixme the name of node_durations is not logical
        if "job_duration" in job.results:
            child_node_duration = add_child(node_durations, "job")
            child_node_duration.setAttribute("job_id", str(job_id))
            child_node_duration.setAttribute("job_host", str(job.host))
            child_node_duration.setAttribute("duration",
                                             str(job.results["job_duration"]))

            # return code if present (Not there on error)
            if "returncode" in job.results:
                child_node_duration.setAttribute(
                    "returncode", str(job.results['returncode']))
            else:
                child_node_duration.setAttribute(
                    "returncode", str(-1))

            ## If there is 'node level' resource logging available
            if "monitor_stats" in job.results:
                return_node = xml.parseString(
                    job.results['monitor_stats']).documentElement
                child_node_duration.appendChild(return_node)

    # manually add the result xml as an ingredient output.
    # this allows backward compatible logging: If not read an additional
    # output does not matter
    self.outputs._fields["return_xml"] = ingredient.StringField(
        help = "XML return data.")
    # NOTE(review): toxml(encoding=...) returns bytes on Python 3 — confirm
    # StringField accepts that (a sibling variant of this method decodes).
    self.outputs["return_xml"] = node_durations.toxml(encoding = "ascii")

    return jobpool
def _schedule_jobs(self, jobs, max_per_node=None):
    """
    Schedule a series of compute jobs. Blocks until completion.

    :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
    :param max_per_node: maximum number of simultaneous jobs on any given node
    :type max_per_node: integer or none
    :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
    """
    threadpool = []
    jobpool = {}
    # Fall back to the configuration file when no explicit limit is given.
    if not max_per_node and self.config.has_option('remote', 'max_per_node'):
        max_per_node = self.config.getint('remote', 'max_per_node')
    limiter = ProcessLimiter(max_per_node)
    # Shared event used to abort all dispatch threads at once.
    killswitch = threading.Event()

    if max_per_node:
        self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

    # External cluster stuff:
    # derive the per-job hosts from the batch system in use, if any.
    try:
        method = self.config.get('remote', 'method')
    except:  # NOTE(review): bare except — any config failure means "no method"
        method = None

    redistribute_hosts = False
    # JURECA SLURM
    if method == 'slurm_srun':
        # Run 'hostname' on every allocated node to learn the host list.
        nodeliststr = []
        hargs = ['srun', 'hostname']
        proc = Popen(hargs, False, stdout=PIPE, stderr=None)
        tup = communicate_returning_strings(proc)
        nodeliststr = tup[0].rstrip('\n').split('\n')
        # remove duplicates. order not important
        nodeliststr = list(set(nodeliststr))
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True
    # Hertfordshire cluster
    elif method == 'pbs_ssh':
        # special case - get the list of nodes from the pbs job
        nodeliststr = []
        try:
            filename = os.environ['PBS_NODEFILE']
        except KeyError:
            self.logger.error('PBS_NODEFILE not found.')
            raise PipelineQuit()
        with open(filename, 'r') as file:
            for line in file:
                node_name = line.split()[0]
                if node_name not in nodeliststr:
                    nodeliststr.append(node_name)
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True
    # get hostlist from slurm, but start jobs via ssh
    elif method == 'slurm_ssh':
        try:
            hostlist = os.environ['SLURM_JOB_NODELIST']
        except KeyError:
            self.logger.error(
                'SLURM_JOB_NODELIST not found. You must have a slurm reservation!'
            )
            raise PipelineQuit()
        nodeliststr = expand_slurm_hostlist(hostlist)
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True
    # generic case, node-names in an env-variable
    elif method == 'ssh_generic':
        nodeliststr = []
        try:
            env_name = self.config.get('remote', 'nodelist_variable')
        except:  # NOTE(review): bare except — falls back to the default name
            env_name = 'PIPELINE_NODES'
        try:
            nodes = os.environ[env_name]
        except KeyError:
            self.logger.error('Env-variable \"' + env_name + '\" not found.')
            raise PipelineQuit()
        # Accept a bracketed, comma-separated list, e.g. "[host1, host2]".
        nodeliststr = [
            node.strip() for node in nodes.strip('[] ').split(',')
        ]
        # remove duplicates. order not important
        nodeliststr = list(set(nodeliststr))
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # re-distribute the hosts if requested
    if redistribute_hosts:
        # equal distribution
        total = len(jobs)
        # when nodes crash? length of slurm_nodelist and env slurm_nnodes
        # dont match anymore
        nnodes = len(nodeliststr)
        # round robin
        nodelist = []
        for i in range(total):
            nodelist.append(nodeliststr[i % nnodes])
        for i, job in enumerate(jobs):
            job.host = nodelist[i]

    # Start the job server and dispatch every job in its own thread;
    # threadwatcher joins them and sets the killswitch on failure.
    with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
        self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
        for job_id, job in enumerate(jobs):
            jobpool[job_id] = job
            threadpool.append(
                threading.Thread(target=job.dispatch,
                                 args=(self.logger, self.config, limiter,
                                       job_id, jobhost, jobport, self.error,
                                       killswitch)))
        threadwatcher(threadpool, self.logger, killswitch)

    # A set killswitch means one of the jobs asked to abort the pipeline.
    if killswitch.isSet():
        raise PipelineQuit()

    # Add information regarding specific nodes to an xml node.
    self.logger.debug("Adding node_logging_information")
    local_document = xml.Document()
    node_durations = local_document.createElement("nodes")
    for job_id, job in enumerate(jobs):
        # Test if the duration is there
        # fixme the name of node_durations is not logical
        if "job_duration" in job.results:
            child_node_duration = add_child(node_durations, "job")
            child_node_duration.setAttribute("job_id", str(job_id))
            child_node_duration.setAttribute("job_host", str(job.host))
            child_node_duration.setAttribute(
                "duration", str(job.results["job_duration"]))

            # return code if present (Not there on error)
            if "returncode" in job.results:
                child_node_duration.setAttribute(
                    "returncode", str(job.results['returncode']))
            else:
                child_node_duration.setAttribute("returncode", str(-1))

            ## If there is 'node level' resource logging available
            if "monitor_stats" in job.results:
                return_node = xml.parseString(
                    job.results['monitor_stats']).documentElement
                child_node_duration.appendChild(return_node)

    # manually add the result xml as an ingredient output.
    # this allows backward compatible logging: If not read an additional
    # output does not matter
    self.outputs._fields["return_xml"] = ingredient.StringField(
        help="XML return data.")
    # Decode so a str (not bytes) is stored in the StringField output.
    self.outputs["return_xml"] = node_durations.toxml(
        encoding="ascii").decode('ascii')

    return jobpool
def _schedule_jobs(self, jobs, max_per_node=None):
    """
    Schedule a series of compute jobs. Blocks until completion.

    :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
    :param max_per_node: maximum number of simultaneous jobs on any given node
    :type max_per_node: integer or none
    :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
    """
    threadpool = []
    jobpool = {}
    # Fall back to the configuration file when no explicit limit is given.
    if not max_per_node and self.config.has_option('remote', 'max_per_node'):
        max_per_node = self.config.getint('remote', 'max_per_node')
    limiter = ProcessLimiter(max_per_node)
    # Shared event used to abort all dispatch threads at once.
    killswitch = threading.Event()

    if max_per_node:
        self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

    # Start the job server and dispatch every job in its own thread;
    # threadwatcher joins them and sets the killswitch on failure.
    with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
        self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
        for job_id, job in enumerate(jobs):
            jobpool[job_id] = job
            threadpool.append(
                threading.Thread(target=job.dispatch,
                                 args=(self.logger, self.config, limiter,
                                       job_id, jobhost, jobport, self.error,
                                       killswitch)))
        threadwatcher(threadpool, self.logger, killswitch)

    # A set killswitch means one of the jobs asked to abort the pipeline.
    if killswitch.isSet():
        raise PipelineQuit()

    # Add information regarding specific nodes to an xml node.
    self.logger.debug("Adding node_logging_information")
    local_document = xml.Document()
    node_durations = local_document.createElement("nodes")
    for job_id, job in enumerate(jobs):
        # Test if the duration is there
        # fixme the name of node_durations is not logical
        if "job_duration" in job.results:
            child_node_duration = add_child(node_durations, "job")
            child_node_duration.setAttribute("job_id", str(job_id))
            child_node_duration.setAttribute("job_host", str(job.host))
            child_node_duration.setAttribute(
                "duration", str(job.results["job_duration"]))

            # return code if present (Not there on error)
            if "returncode" in job.results:
                child_node_duration.setAttribute(
                    "returncode", str(job.results['returncode']))
            else:
                child_node_duration.setAttribute("returncode", str(-1))

            ## If there is 'node level' resource logging available
            if "monitor_stats" in job.results:
                return_node = xml.parseString(
                    job.results['monitor_stats']).documentElement
                child_node_duration.appendChild(return_node)

    # manually add the result xml as an ingredient output.
    # this allows backward compatible logging: If not read an additional
    # output does not matter
    self.outputs._fields["return_xml"] = ingredient.StringField(
        help="XML return data.")
    # NOTE(review): toxml(encoding=...) returns bytes on Python 3 — confirm
    # StringField accepts that (a sibling variant of this method decodes).
    self.outputs["return_xml"] = node_durations.toxml(encoding="ascii")

    return jobpool