예제 #1
0
 def findMyJob(self):
     """Return the first job reported for the user running this process.

     The user name is the output of the whoami command (trailing whitespace
     removed). It is passed to self.findJobsFromUser, and only the FIRST
     element of the returned list is kept.

     NOTE(review): the layout of the returned tuple, and the placeholder
     returned when no job is found, are whatever findJobsFromUser
     produces - confirm against that method.
     WARNING - Jobs in any other state than RUNNING are presumably ignored
     (that filtering, if any, happens in findJobsFromUser)"""

     whoami_output = runCmd('whoami')
     jobs = self.findJobsFromUser(whoami_output.rstrip())
     return jobs[0]
예제 #2
0
    def nodesetToHosts(self, nodeset):
        """Expand a nodeset expression into the list of names it denotes.

        Runs 'nodeset -e <nodeset>' through runCmd and splits its output on
        whitespace.

        Return a list of strings, or [] when the command fails or prints
        nothing.
        """

        try:
            # split() with no argument drops leading/trailing whitespace and
            # returns [] (not ['']) when the output is empty
            return runCmd('nodeset -e ' + nodeset).split()
        except Exception:
            # Best effort: any failure of the command means "no hosts".
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            return []
예제 #3
0
    def nodesetToHosts(self, nodeset):
        """Expand the nodeset to a list of hosts

        Runs 'nodeset -e <nodeset>' through runCmd and splits its output on
        whitespace.

        Return a list of strings, or [] when the command fails or prints
        nothing.
        """

        try:
            hosts = runCmd('nodeset -e ' + nodeset).split()
        except Exception:
            # Best effort: a failing command is reported as "no hosts".
            # Catching Exception rather than everything keeps
            # KeyboardInterrupt and SystemExit propagating.
            return []

        # No-argument split() already ignored surrounding whitespace and gives
        # an empty list for an empty output
        return hosts
예제 #4
0
    def findJobFromId(self, jobid):
        """Call squeue for the RUNNING job jobid and return its fields as a tuple

        jobid is converted with str() before being put on the command line.

        Return the '@'-separated fields printed by squeue, each one stripped.
        With the format used here (%R@%u@%A) that is presumably
        (nodelist, user, jobid) - confirm against the squeue manual.
        Return ("","","") when squeue fails or prints nothing.
        """

        cmd = 'squeue -t RUNNING -j ' + str(jobid) + ' --noheader -o %.16R@%.15u@%.7A'
        try:
            output = runCmd(cmd)
        except Exception:
            # Exception instead of a bare except: do not swallow Ctrl-C
            return ("", "", "")

        # An output made only of whitespace / a lone newline also means "no job"
        output = output.strip()
        if not output:
            return ("", "", "")

        # "host[0-4]@   user@  jobid" ==> ("host[0-4]", "user", "jobid")
        return tuple(field.strip() for field in output.split('@'))
예제 #5
0
    def __callNumactl(self):
        """Call numactl --show, detecting reserved sockets and physical cores

        Return a two-element list [l_sockets, physcpubind]:
          - l_sockets   : the integers found on the 'nodebind:' line
          - physcpubind : the integers found on the 'physcpubind:' line
        A list is empty when the corresponding line is missing from the
        output or carries no value.
        """

        output = runCmd("numactl --show").split('\n')

        # Defaults, so that a missing line gives an empty list instead of an
        # UnboundLocalError at the return statement
        l_sockets = []
        physcpubind = []

        # nodebind: 4 5 6 => [4,5,6]
        for line in output:
            if line.startswith('nodebind:'):
                # split() without argument tolerates repeated spaces and an
                # empty value
                l_sockets = [int(v) for v in line.rpartition(':')[2].split()]
            elif line.startswith('physcpubind:'):
                physcpubind = [int(v) for v in line.rpartition(':')[2].split()]

        return [l_sockets, physcpubind]
예제 #6
0
    def findJobsFromUser(self, user):
        """Call squeue and return a list of tuples corresponding to the jobs running for this user

        Each tuple holds the '@'-separated fields of one squeue output line,
        each field stripped. With the format used here (%R@%u@%A) that is
        presumably (nodelist, user, jobid) - confirm against the squeue manual.

        Return [("","","")] when squeue fails or reports no job.
        """

        cmd = 'squeue -t RUNNING -u ' + user + ' --noheader -o %.16R@%.15u@%.7A'
        try:
            output = runCmd(cmd)
        except Exception:
            # Exception instead of a bare except: do not swallow Ctrl-C
            return [("", "", "")]

        # "host[0-4]@   user@  jobid" ==> ("host[0-4]", "user", "jobid")
        # Blank lines are skipped: the newline ending the output would
        # otherwise produce a spurious ('',) entry
        jobs = [
            tuple(field.strip() for field in line.split('@'))
            for line in output.splitlines()
            if line.strip()
        ]

        return jobs or [("", "", "")]
예제 #7
0
    def __callNumactl(self):
        """Call numactl --hardware, detecting sockets and core addressing

        Return the result of flatten() applied to the list of per-node core
        lists, i.e. the concatenation of the 'node X cpus:' lines in node
        order (NOTE(review): flatten is defined elsewhere - presumably it
        returns a flat list of int).
        """

        per_node_cores = []
        output = runCmd("numactl --hardware").split('\n')

        # Looking for lines (in this order !)
        # node 0 cpus: 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
        # node 1 cpus: 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31
        # A line is accepted only if it describes the next expected node
        sock_cnt = 0
        for line in output:
            if line.startswith('node ' + str(sock_cnt) + ' cpus:'):
                cores = line.partition(':')[2]
                # split() without argument copes with a node that has no cpu
                # at all (empty list) and with repeated spaces, where
                # split(' ') would feed '' to int()
                per_node_cores.append([int(c) for c in cores.split()])
                sock_cnt += 1

        return flatten(per_node_cores)
예제 #8
0
    def __initDataStructures(self):
        '''Init self.__pid2jobid, self.__core2jobid and self._job2tag
           Explore the /sys/fs/cgroup/cpuset pseudo filesystem

           Nothing is done if both self.__pid2jobid and self.__core2jobid
           are already set. Otherwise:
             - self.__pid2jobid  maps each pid (str, as read from
               cgroup.procs) to its jobid (str)
             - self.__core2jobid maps each core (as returned by
               self.nodesetToHosts) to its jobid (str)
             - self._job2tag     maps each jobid owning at least one core to
               a tag 1, 2, ... given in sorted jobid order
           Jobs for which squeue raises PlacementException are ignored.'''

        # If the data already exist, there is nothing to build
        if self.__pid2jobid is not None and self.__core2jobid is not None:
            return

        pid2jobid = {}
        core2jobid = {}

        # Looking for /sys/fs/cgroup/cpuset/slurm/uid_xxx/job_yyyyyy/step_batch
        top_dir = "/sys/fs/cgroup/cpuset/slurm/"

        # This is a cache, avoiding more squeue than necessary:
        # jobid => True if squeue knows the job, False if not
        known_jobs = {}
        for root, _dirs, _files in os.walk(top_dir, topdown=False):
            if not os.path.basename(root).startswith('step_'):
                continue

            # root is .../slurm/uid_xxx/job_yyyyyy/step_zzz
            job_dir = os.path.basename(os.path.dirname(root))  # => job_yyyyyy
            jobid = job_dir.replace('job_', '')  # => yyyyyy

            # If we do not know yet, execute squeue
            if jobid not in known_jobs:
                try:
                    # We do not need the output, only the exit status
                    runCmd(f'squeue -j {jobid} -o ""')
                except PlacementException:
                    known_jobs[jobid] = False
                else:
                    known_jobs[jobid] = True

            # If the job is not recognized by slurm, ignore it: this is an
            # old trace. The cached answer is used for every step directory
            # of the job, not only for the first one.
            if not known_jobs[jobid]:
                continue

            # The pids are in the file cgroup.procs
            with open(root + '/cgroup.procs', 'r') as infile:
                for line in infile:
                    pid = line.strip()
                    if pid != '':
                        pid2jobid[pid] = jobid

            # The cores are in the file cpuset.cpus
            with open(root + '/cpuset.cpus', 'r') as infile:
                for line in infile:
                    cpuset = line.strip()
                    if cpuset != '':
                        # Nearly same format for the cpusets as for the nodesets !
                        for core in self.nodesetToHosts('[' + cpuset + ']'):
                            core2jobid[core] = jobid

        # build the map self._job2tag: tags start at 1, in sorted jobid order
        job2tag = {
            jobid: tag
            for tag, jobid in enumerate(sorted(set(core2jobid.values())), start=1)
        }

        self.__pid2jobid = pid2jobid
        self.__core2jobid = core2jobid
        self._job2tag = job2tag