Example #1
class poe(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def isLocalProcess(self):
        return True

    def getName(self):
        return 'poe'

    def getHPCResources(self, partition, max_nodes=None, min_nodes=2, min_walltime_m=30):
        return None

    def submitJob(self, globalWorkingDir, globalYodaDir, localWorkingDir, queue, repo, mppwidth, mppnppn, walltime, nodes, localSetup=None, cpuPerNode=None, dumpEventOutputs=False):

        submit_script = "#!/bin/bash -l" + "\n"
        submit_script += "#@ tasks_per_node = 1" + "\n"
        submit_script += "source /etc/profile.d/modules.sh" + "\n"
        submit_script += "module load mpi4py" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        submit_script += "source ${VO_ATLAS_SW_DIR}/local/setup-yampl.sh" + "\n"
        submit_script += "export PYTHONPATH=/cvmfs/atlas.cern.ch/repo/sw/local/noarch/python-yampl/1.0/lib.linux-x86_64-2.6:$PYTHONPATH" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        #submit_script += "export CMTEXTRATAGS=ATLAS,useDBRelease" + "\n"
        submit_script += "env" + "\n"

        # submit_script += "srun -N " + str(nodes) + " python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>yoda_stdout.txt 2>yoda_stderr.txt"
        # submit_script += "srun -N " + str(nodes) + " python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir
        # submit_script += "mpirun --host "+nodelist+" python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir

        # submit_script += "python " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>" + globalYodaDir+ "/yoda_stdout.txt 2>" + globalYodaDir+ "/yoda_stderr.txt"
        submit_script += "poe parrot_run python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" --outputDir=" + os.path.dirname(globalYodaDir) + " --dumpEventOutputs"
        self.__log.debug("POE submit script: %s" % submit_script)
        # hpcJob = subprocess.Popen(submit_script, stdout=sys.stdout, stderr=sys.stdout, shell=True)
        yoda_stdout = open(os.path.join(globalYodaDir, 'yoda_stdout.txt'), 'a')
        yoda_stderr = open(os.path.join(globalYodaDir, 'yoda_stderr.txt'), 'a')
        hpcJob = subprocess.Popen(submit_script, stdout=yoda_stdout, stderr=yoda_stderr, shell=True)

        t1 = time.time()
        i = 20
        while (hpcJob and hpcJob.poll() is None):
            if i == 0:
                self.__log.debug("Yoda process is running")
                i = 20
            time.sleep(30)
            i -= 1

        self.__log.debug("Yoda process terminated")
        self.__log.debug("Yoda process return code: %s" % hpcJob.returncode)

        return 0, None

    def poll(self, jobid):
        return None
Example #2
 def __init__(self, logFileName):
     self.__log = Logger(logFileName)
     self.__failedPollTimes = 0
Example #3
class arc(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def isLocalProcess(self):
        return True

    def getName(self):
        return 'arc'

    def getHPCResources(self, partition, max_nodes=None, min_nodes=2, min_walltime_m=30):
        return None

    def convertNodeList(self, nodelist):
        try:
            if '[' in nodelist:
                numNames = []
                tmp = nodelist
                preName, numList = tmp.split('[')
                numList,postName = numList.split(']')
                for items in numList.split(","):
                    if not '-' in items:
                        numNames.append(preName + items + postName)
                    else:
                        start, end = items.split('-')
                        numLen = len(start)
                        for i in range(int(start), int(end) + 1):
                            num = str(i).zfill(numLen)
                            numNames.append(preName + str(num) + postName)
                return ','.join(numNames)
            else:
                return nodelist
        except:
            self.__log.debug(traceback.format_exc())
            return nodelist

    def submitJob(self, globalWorkingDir, globalYodaDir, localWorkingDir, queue, repo, mppwidth, mppnppn, walltime, nodes, localSetup=None, cpuPerNode=None):

        nodelist = ""
        if 'SLURM_NODELIST' in os.environ:
            nodelist = os.environ['SLURM_NODELIST']
        elif 'PBS_NODELIST' in os.environ:
            nodelist = os.environ['PBS_NODELIST']
        nodelist = self.convertNodeList(nodelist)

        submit_script = "#!/bin/bash -l" + "\n"
        # submit_script += "module load mpi4py openmpi-ccm" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        submit_script += "source ${VO_ATLAS_SW_DIR}/local/setup-yampl.sh" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        submit_script += "env" + "\n"

        # submit_script += "srun -N " + str(nodes) + " python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>yoda_stdout.txt 2>yoda_stderr.txt"
        # submit_script += "srun -N " + str(nodes) + " python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir
        # submit_script += "mpirun --host "+nodelist+" python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir

        # submit_script += "mpirun -bynode python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>yoda_stdout.txt 2>yoda_stderr.txt"
        # submit_script += "python " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>" + globalYodaDir+ "/yoda_stdout.txt 2>" + globalYodaDir+ "/yoda_stderr.txt"
        submit_script += "python " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" --nonMPIMode"
        self.__log.debug("ARC submit script: %s" % submit_script)
        # hpcJob = subprocess.Popen(submit_script, stdout=sys.stdout, stderr=sys.stdout, shell=True)
        yoda_stdout = open(os.path.join(globalYodaDir, 'yoda_stdout.txt'), 'a')
        yoda_stderr = open(os.path.join(globalYodaDir, 'yoda_stderr.txt'), 'a')
        hpcJob = subprocess.Popen(submit_script, stdout=yoda_stdout, stderr=yoda_stderr, shell=True)

        i = 20
        while (hpcJob and hpcJob.poll() is None):
            if i == 0:
                self.__log.debug("Yoda process is running%s")
                i = 20
            time.sleep(30)
            i -= 1

        self.__log.debug("Yoda process terminated")
        self.__log.debug("Yoda process return code: %s" % hpcJob.returncode)

        return 0, None

    def poll(self, jobid):
        return None
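
For reference, convertNodeList above expands SLURM-style bracketed host ranges (e.g. from $SLURM_NODELIST) into an explicit comma-separated list, falling back to the raw input on any parse error. A minimal standalone sketch of the same expansion logic (the function name and sample nodelist below are illustrative only):

def expand_nodelist(nodelist):
    # "nid[00012-00014,00021]" -> "nid00012,nid00013,nid00014,nid00021";
    # plain names pass through unchanged.
    if '[' not in nodelist:
        return nodelist
    pre_name, rest = nodelist.split('[')
    num_list, post_name = rest.split(']')
    names = []
    for item in num_list.split(','):
        if '-' not in item:
            names.append(pre_name + item + post_name)
        else:
            start, end = item.split('-')
            width = len(start)
            for i in range(int(start), int(end) + 1):
                names.append(pre_name + str(i).zfill(width) + post_name)
    return ','.join(names)

print(expand_nodelist("nid[00012-00014,00021]"))
# nid00012,nid00013,nid00014,nid00021
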
Example #4
 def __init__(self, logFileName):
     self.__log = Logger(logFileName)
     self.__failedPollTimes = 0
Example #5
class slurm(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def getHPCResources(self,
                        partition,
                        max_nodes=None,
                        min_nodes=2,
                        min_walltime_m=30):
        # copied from RunJobEdison
        #cmd = 'showbf -p %s' % partition
        cmd = 'sinfo '
        self.__log.info("Executing command: '%s'" % cmd)
        res_tuple = runcommand(cmd)
        self.__log.info("Executing command output: %s" % str(res_tuple))
        showbf_str = ""
        if res_tuple[0] == 0:
            showbf_str = res_tuple[1]

        res = {}
        self.__log.info("Available resources in %s  partition" % partition)
        self.__log.info(showbf_str)
        if showbf_str:
            shobf_out = showbf_str.splitlines()
            self.__log.info("Fitted resources")
            for l in shobf_out[2:]:
                d = l.split()
                nodes = int(d[2])

                if nodes < int(min_nodes):
                    continue

                if not d[3] == 'INFINITY':
                    wal_time_arr = d[3].split(":")
                    if len(wal_time_arr) < 4:
                        wal_time_sec = int(wal_time_arr[0]) * (60 * 60) + int(
                            wal_time_arr[1]) * 60 + int(wal_time_arr[2])
                        if wal_time_sec > 24 * 3600:
                            wal_time_sec = 24 * 3600
                    else:
                        wal_time_sec = 24 * 3600
                        #if nodes > 1:
                        #    nodes = nodes - 1
                else:
                    wal_time_sec = 12 * 3600

                # Fitting Hopper policy
                # https://www.nersc.gov/users/computational-systems/hopper/running-jobs/queues-and-policies/
                nodes = max_nodes if max_nodes and nodes > max_nodes else nodes

                if nodes < 682 and wal_time_sec > 48 * 3600:
                    wal_time_sec = 48 * 3600
                elif nodes < 4096 and wal_time_sec > 36 * 3600:
                    wal_time_sec = 36 * 3600
                elif nodes < 5462 and wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600
                elif wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600

                if wal_time_sec < int(min_walltime_m) * 60:
                    continue

                self.__log.info(
                    "Nodes: %s, Walltime (str): %s, Walltime (min) %s" %
                    (nodes, d[3], wal_time_sec / 60))

                res.update({nodes: wal_time_sec})
        else:
            self.__log.info(
                "No availble resources. Default values will be used.")
        self.__log.info("Get resources: %s" % res)

        return res

    def submitJob(self,
                  globalWorkingDir,
                  globalYodaDir,
                  localWorkingDir,
                  queue,
                  repo,
                  mppwidth,
                  mppnppn,
                  walltime,
                  nodes,
                  localSetup=None,
                  cpuPerNode=None,
                  dumpEventOutputs=False):
        submit_script = "#!/bin/bash -l" + "\n"
        if queue == 'premium':
            submit_script += "#SBATCH -p regular\n"
            submit_script += "#SBATCH --qos=premium\n"
        elif queue == "scavenger":
            submit_script += "#SBATCH -p regular\n"
            submit_script += "#SBATCH --qos=scavenger\n"
        elif queue == "low":
            submit_script += "#SBATCH -p regular\n"
            submit_script += "#SBATCH --qos=low\n"
        else:
            submit_script += "#SBATCH -p " + queue + "\n"

        if repo:
            submit_script += "#SBATCH -A " + repo + "\n"
        # submit_script += "#SBATCH -n " + str(mppwidth) + "\n"
        submit_script += "#SBATCH -N " + str(nodes) + "\n"
        submit_script += "#SBATCH --signal=SIGUSR1@60\n"
        submit_script += "#SBATCH -t " + walltime + "\n"
        submit_script += "#SBATCH --ntasks-per-node=1\n"
        submit_script += "#SBATCH --cpus-per-task=" + str(cpuPerNode) + "\n"
        submit_script += "#SBATCH -J ES_job" + "\n"
        submit_script += "#SBATCH -o athena_stdout.txt" + "\n"
        submit_script += "#SBATCH -e athena_stderr.txt" + "\n"
        submit_script += "cd $SBATCH_O_WORKDIR" + "\n"
        submit_script += "module load mpi4py" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        #submit_script += "source /project/projectdirs/atlas/sw/python-yampl/setup.sh" + "\n"
        #submit_script += "export PYTHONPATH=/project/projectdirs/atlas/sw/python-yampl/python-yampl/1.0/lib.linux-x86_64-2.6:$PYTHONPATH" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        #submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/boto/lib/python2.6/site-packages:$PYTHONPATH\n"
        #submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/external:$PYTHONPATH\n"
        #submit_script += "export LD_LIBRARY_PATH=/project/projectdirs/atlas/sw/python-yampl/yampl/1.0/lib:$LD_LIBRARY_PATH" + "\n"
        #submit_script += "export X509_USER_PROXY=/global/homes/w/wguan/x509up_u23959" + "\n"
        #submit_script += "export X509_CERT_DIR=/project/projectdirs/atlas/pilot/grid_env/external/grid-security/certificates" + "\n"
        submit_script += "env" + "\n"
        # submit_script += "module avail" + "\n"
        # submit_script += "module list" + "\n"

        #submit_script += "srun -n " + str(nodes) + " -N " + str(mppnppn) + " python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+""
        submit_script += "srun -N " + str(
            nodes
        ) + " python-mpi " + os.path.join(
            globalWorkingDir, "HPC/HPCJob.py"
        ) + " --globalWorkingDir=" + globalYodaDir + " --localWorkingDir=" + localWorkingDir
        if dumpEventOutputs:
            submit_script += " --dumpEventOutputs"
        ###cmd = "mpiexec -n 2 python " + os.path.join(self.__globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+self.__globalWorkingDir+" --localWorkingDir="+self.__localWorkingDir+"&"
        self.__submit_file = os.path.join(globalYodaDir, 'submit_script')
        handle = open(self.__submit_file, 'w')
        handle.write(submit_script)
        handle.close()

        self.__log.info("submit script:\n%s" % submit_script)
        cmd = "sbatch " + self.__submit_file
        self.__log.info("submitting HPC job: %s" % cmd)
        status, output = runcommand(cmd)
        self.__log.info("submitting HPC job: (status: %s, output: %s)" %
                        (status, output))
        self.__jobid = None
        if status == 0:
            self.__jobid = output.replace("\n", "").split(" ")[-1]
            return 0, self.__jobid
        return -1, None

    def poll(self, jobid):
        # poll the job in HPC. update it
        cmd = "scontrol show job " + jobid
        self.__log.info("polling HPC job: %s" % cmd)
        status, output = runcommand(cmd)
        # self.__log.info("polling HPC job: (status: %s, output: %s)" %(status, output))
        if status == 0:
            self.__failedPollTimes = 0
            state = None
            lines = output.split("\n")
            for line in lines:
                line = line.strip()
                if line.startswith('JobState'):
                    state = line.split(" ")[0].split("=")[1]

            if state == "COMPLETED":
                self.__log.info("HPC job complete")
                return "Complete"
            if state == "RUNNING":
                self.__log.info("HPC job is running")
                return "Running"
            if state == "PENDING":
                self.__log.info("HPC job is pending")
                return "Queue"
            if state == "FAILED":
                self.__log.info("HPC job is failed")
                return "Failed"
            if state == "CANCELLED":
                self.__log.info("HPC job is cancelled")
                return "Failed"
            if state == "TIMEOUT":
                self.__log.info("HPC job is timed out")
                return "Failed"
            self.__log.info("HPC job is in unknown state")
            return 'Unknown'
        else:
            self.__log.info("polling HPC job: (status: %s, output: %s)" %
                            (status, output))
            if 'Invalid job id specified' in output:
                self.__log.info("Unknown Job Id. Set Job Complete.")
                return "Complete"
            else:
                self.__failedPollTimes += 1
                self.__log.error(
                    'Failing HPC job because the polling command has failed ' +
                    str(self.__failedPollTimes) + ' times.')
                return 'Unknown'
        return 'Unknown'

    def delete(self, jobid):
        command = "scancel " + jobid
        status, output = runcommand(command)
        self.__log.debug("Run Command: %s " % command)
        self.__log.debug("Status: %s, Output: %s" % (status, output))
Example #6
class pbs(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def getHPCResources(self, partition, max_nodes=None, min_nodes=2, min_walltime_m=30):
        # copied from RunJobEdison
        cmd = 'showbf -p %s' % partition
        self.__log.info("Executing command: '%s'" % cmd)
        res_tuple = commands.getstatusoutput(cmd)
        self.__log.info("Executing command output: %s" % str(res_tuple))
        showbf_str = ""
        if res_tuple[0] == 0:
            showbf_str = res_tuple[1]

        res = {}
        self.__log.info("Available resources in %s  partition" % partition)
        self.__log.info(showbf_str)
        if showbf_str:
            shobf_out = showbf_str.splitlines()
            self.__log.info("Fitted resources")
            for l in shobf_out[2:]:
                d = l.split()
                nodes = int(d[2])

                if nodes < int(min_nodes):
                    continue

                if not d[3] == 'INFINITY':
                    wal_time_arr =  d[3].split(":")
                    if len(wal_time_arr) < 4:
                        wal_time_sec = int(wal_time_arr[0])*(60*60) + int(wal_time_arr[1])*60 + int(wal_time_arr[2])
                        if wal_time_sec > 24 * 3600:
                            wal_time_sec = 24 * 3600
                    else:
                        wal_time_sec = 24 * 3600
                        #if nodes > 1:
                        #    nodes = nodes - 1
                else:
                    wal_time_sec = 12 * 3600

                # Fitting Hopper policy
                # https://www.nersc.gov/users/computational-systems/hopper/running-jobs/queues-and-policies/
                nodes = max_nodes if max_nodes and nodes > max_nodes else nodes


                if nodes < 682 and wal_time_sec > 48 * 3600:
                    wal_time_sec = 48 * 3600
                elif nodes < 4096 and wal_time_sec > 36 * 3600:
                    wal_time_sec = 36 * 3600
                elif nodes < 5462 and wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600
                elif wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600

                if wal_time_sec < int(min_walltime_m) * 60:
                    continue

                self.__log.info("Nodes: %s, Walltime (str): %s, Walltime (min) %s" % (nodes, d[3], wal_time_sec/60 ))

                res.update({nodes:wal_time_sec})
        else:
            self.__log.info("No availble resources. Default values will be used.")
        self.__log.info("Get resources: %s" % res)

        return res

    def submitJob(self, globalWorkingDir, globalYodaDir, localWorkingDir, queue, repo, mppwidth, mppnppn, walltime, nodes, localSetup=None, cpuPerNode=None, dumpEventOutputs=False):
        submit_script = "#!/bin/bash -l" + "\n"
        submit_script += "#PBS -q " + queue + "\n"
        if repo:
            submit_script += "#PBS -A " + repo + "\n"
        submit_script += "#PBS -l mppwidth=" + str(mppwidth) + "\n"
        #submit_script += "#PBS -l mppnppn=" + str(mppnppn) + "\n"
        submit_script += "#PBS -l walltime=" + walltime + "\n"
        submit_script += "#PBS -N ES_job" + "\n"
        submit_script += "#PBS -j oe" + "\n"
        submit_script += "#PBS -o athena_stdout.txt" + "\n"
        submit_script += "#PBS -e athena_stderr.txt" + "\n"
        submit_script += "cd $PBS_O_WORKDIR" + "\n"
        submit_script += "module load mpi4py" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        #submit_script += "source /project/projectdirs/atlas/sw/python-yampl/setup.sh" + "\n"
        submit_script += "export PYTHONPATH=/project/projectdirs/atlas/sw/python-yampl/python-yampl/1.0/lib.linux-x86_64-2.6:$PYTHONPATH" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/boto/lib/python2.6/site-packages:$PYTHONPATH\n"
        submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/external:$PYTHONPATH\n"
        submit_script += "export LD_LIBRARY_PATH=/project/projectdirs/atlas/sw/python-yampl/yampl/1.0/lib:$LD_LIBRARY_PATH" + "\n"
        submit_script += "export X509_USER_PROXY=/global/homes/w/wguan/x509up_u23959" + "\n"
        submit_script += "export X509_CERT_DIR=/project/projectdirs/atlas/pilot/grid_env/external/grid-security/certificates" + "\n"
        submit_script += "env" + "\n"

        #submit_script += "aprun -n " + str(nodes) + " -N " + str(mppnppn) + " -d " + str(ATHENA_PROC_NUMBER) + " -cc none python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalWorkingDir+" --localWorkingDir="+localWorkingDir+""
        submit_script += "aprun -n " + str(nodes) + " -N " + str(mppnppn) + " -cc none python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+""
        ###cmd = "mpiexec -n 2 python " + os.path.join(self.__globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+self.__globalWorkingDir+" --localWorkingDir="+self.__localWorkingDir+"&"
        self.__submit_file = os.path.join(globalYodaDir, 'submit_script')
        handle = open(self.__submit_file, 'w')
        handle.write(submit_script)
        handle.close()

        self.__log.info("submit script:\n%s" % submit_script)
        cmd = "qsub " + self.__submit_file
        self.__log.info("submitting HPC job: %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        self.__log.info("submitting HPC job: (status: %s, output: %s)" %(status, output))
        self.__jobid = None
        if status == 0:
            self.__jobid = output.replace("\n", "").split(".")[0]
            return 0, self.__jobid
        return -1, None

    def poll(self, jobid):
        # poll the job in HPC. update it
        cmd = "qstat " + jobid
        self.__log.info("polling HPC job: %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        #self.__log.info("polling HPC job: (status: %s, output: %s)" %(status, output))
        if status == 0:
            self.__failedPollTimes = 0
            state = None
            lines = output.split("\n")
            for line in lines:
                line = line.strip()
                if line.startswith(jobid):
                    state = line.split()[-2]

            if state == "C":
                self.__log.info("HPC job complete")
                return "Complete"
            if state == "R":
                return "Running"
            if state == "Q":
                return "Queue"
        else:
            self.__log.info("polling HPC job: (status: %s, output: %s)" %(status, output))
            if 'Unknown Job Id Error' in output:
                self.__log.info("Unknown Job Id. Set Job Complete.")
                return "Complete"
            else:
                self.__failedPollTimes += 1
                if self.__failedPollTimes > 5:
                    return "Failed"
                else:
                    return 'Unknown'

    def delete(self, jobid):
        command = "qdel " + jobid
        status, output = commands.getstatusoutput(command)
        self.__log.debug("Run Command: %s " % command)
        self.__log.debug("Status: %s, Output: %s" % (status, output))
Example #7
File: pbs.py Project: vokac/pilot
class pbs(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def getHPCResources(self,
                        partition,
                        max_nodes=None,
                        min_nodes=2,
                        min_walltime_m=30):
        # copied from RunJobEdison
        cmd = 'showbf -p %s' % partition
        self.__log.info("Executing command: '%s'" % cmd)
        res_tuple = commands.getstatusoutput(cmd)
        self.__log.info("Executing command output: %s" % str(res_tuple))
        showbf_str = ""
        if res_tuple[0] == 0:
            showbf_str = res_tuple[1]

        res = {}
        self.__log.info("Available resources in %s  partition" % partition)
        self.__log.info(showbf_str)
        if showbf_str:
            shobf_out = showbf_str.splitlines()
            self.__log.info("Fitted resources")
            for l in shobf_out[2:]:
                d = l.split()
                nodes = int(d[2])

                if nodes < int(min_nodes):
                    continue

                if not d[3] == 'INFINITY':
                    wal_time_arr = d[3].split(":")
                    if len(wal_time_arr) < 4:
                        wal_time_sec = int(wal_time_arr[0]) * (60 * 60) + int(
                            wal_time_arr[1]) * 60 + int(wal_time_arr[2])
                        if wal_time_sec > 24 * 3600:
                            wal_time_sec = 24 * 3600
                    else:
                        wal_time_sec = 24 * 3600
                        #if nodes > 1:
                        #    nodes = nodes - 1
                else:
                    wal_time_sec = 12 * 3600

                # Fitting Hopper policy
                # https://www.nersc.gov/users/computational-systems/hopper/running-jobs/queues-and-policies/
                nodes = max_nodes if max_nodes and nodes > max_nodes else nodes

                if nodes < 682 and wal_time_sec > 48 * 3600:
                    wal_time_sec = 48 * 3600
                elif nodes < 4096 and wal_time_sec > 36 * 3600:
                    wal_time_sec = 36 * 3600
                elif nodes < 5462 and wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600
                elif wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600

                if wal_time_sec < int(min_walltime_m) * 60:
                    continue

                self.__log.info(
                    "Nodes: %s, Walltime (str): %s, Walltime (min) %s" %
                    (nodes, d[3], wal_time_sec / 60))

                res.update({nodes: wal_time_sec})
        else:
            self.__log.info(
                "No availble resources. Default values will be used.")
        self.__log.info("Get resources: %s" % res)

        return res

    def submitJob(self,
                  globalWorkingDir,
                  globalYodaDir,
                  localWorkingDir,
                  queue,
                  repo,
                  mppwidth,
                  mppnppn,
                  walltime,
                  nodes,
                  localSetup=None,
                  cpuPerNode=None,
                  dumpEventOutputs=False):
        submit_script = "#!/bin/bash -l" + "\n"
        submit_script += "#PBS -q " + queue + "\n"
        if repo:
            submit_script += "#PBS -A " + repo + "\n"
        submit_script += "#PBS -l mppwidth=" + str(mppwidth) + "\n"
        #submit_script += "#PBS -l mppnppn=" + str(mppnppn) + "\n"
        submit_script += "#PBS -l walltime=" + walltime + "\n"
        submit_script += "#PBS -N ES_job" + "\n"
        submit_script += "#PBS -j oe" + "\n"
        submit_script += "#PBS -o athena_stdout.txt" + "\n"
        submit_script += "#PBS -e athena_stderr.txt" + "\n"
        submit_script += "cd $PBS_O_WORKDIR" + "\n"
        submit_script += "module load mpi4py" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        #submit_script += "source /project/projectdirs/atlas/sw/python-yampl/setup.sh" + "\n"
        submit_script += "export PYTHONPATH=/project/projectdirs/atlas/sw/python-yampl/python-yampl/1.0/lib.linux-x86_64-2.6:$PYTHONPATH" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/boto/lib/python2.6/site-packages:$PYTHONPATH\n"
        submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/external:$PYTHONPATH\n"
        submit_script += "export LD_LIBRARY_PATH=/project/projectdirs/atlas/sw/python-yampl/yampl/1.0/lib:$LD_LIBRARY_PATH" + "\n"
        submit_script += "export X509_USER_PROXY=/global/homes/w/wguan/x509up_u23959" + "\n"
        submit_script += "export X509_CERT_DIR=/project/projectdirs/atlas/pilot/grid_env/external/grid-security/certificates" + "\n"
        submit_script += "env" + "\n"

        #submit_script += "aprun -n " + str(nodes) + " -N " + str(mppnppn) + " -d " + str(ATHENA_PROC_NUMBER) + " -cc none python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalWorkingDir+" --localWorkingDir="+localWorkingDir+""
        submit_script += "aprun -n " + str(
            nodes
        ) + " -N " + str(mppnppn) + " -cc none python-mpi " + os.path.join(
            globalWorkingDir, "HPC/HPCJob.py"
        ) + " --globalWorkingDir=" + globalYodaDir + " --localWorkingDir=" + localWorkingDir + ""
        ###cmd = "mpiexec -n 2 python " + os.path.join(self.__globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+self.__globalWorkingDir+" --localWorkingDir="+self.__localWorkingDir+"&"
        self.__submit_file = os.path.join(globalYodaDir, 'submit_script')
        handle = open(self.__submit_file, 'w')
        handle.write(submit_script)
        handle.close()

        self.__log.info("submit script:\n%s" % submit_script)
        cmd = "qsub " + self.__submit_file
        self.__log.info("submitting HPC job: %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        self.__log.info("submitting HPC job: (status: %s, output: %s)" %
                        (status, output))
        self.__jobid = None
        if status == 0:
            self.__jobid = output.replace("\n", "").split(".")[0]
            return 0, self.__jobid
        return -1, None

    def poll(self, jobid):
        # poll the job in HPC. update it
        cmd = "qstat " + jobid
        self.__log.info("polling HPC job: %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        #self.__log.info("polling HPC job: (status: %s, output: %s)" %(status, output))
        if status == 0:
            self.__failedPollTimes = 0
            state = None
            lines = output.split("\n")
            for line in lines:
                line = line.strip()
                if line.startswith(jobid):
                    state = line.split()[-2]

            if state == "C":
                self.__log.info("HPC job complete")
                return "Complete"
            if state == "R":
                return "Running"
            if state == "Q":
                return "Queue"
        else:
            self.__log.info("polling HPC job: (status: %s, output: %s)" %
                            (status, output))
            if 'Unknown Job Id Error' in output:
                self.__log.info("Unknown Job Id. Set Job Complete.")
                return "Complete"
            else:
                self.__failedPollTimes += 1
                if self.__failedPollTimes > 5:
                    return "Failed"
                else:
                    return 'Unknown'

    def delete(self, jobid):
        command = "qdel " + jobid
        status, output = commands.getstatusoutput(command)
        self.__log.debug("Run Command: %s " % command)
        self.__log.debug("Status: %s, Output: %s" % (status, output))
Example #8
class poe(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def isLocalProcess(self):
        return True

    def getName(self):
        return 'poe'

    def getHPCResources(self,
                        partition,
                        max_nodes=None,
                        min_nodes=2,
                        min_walltime_m=30):
        return None

    def submitJob(self,
                  globalWorkingDir,
                  globalYodaDir,
                  localWorkingDir,
                  queue,
                  repo,
                  mppwidth,
                  mppnppn,
                  walltime,
                  nodes,
                  localSetup=None,
                  cpuPerNode=None,
                  dumpEventOutputs=False):

        submit_script = "#!/bin/bash -l" + "\n"
        submit_script += "#@ tasks_per_node = 1" + "\n"
        submit_script += "source /etc/profile.d/modules.sh" + "\n"
        submit_script += "module load mpi4py" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        submit_script += "source ${VO_ATLAS_SW_DIR}/local/setup-yampl.sh" + "\n"
        submit_script += "export PYTHONPATH=/cvmfs/atlas.cern.ch/repo/sw/local/noarch/python-yampl/1.0/lib.linux-x86_64-2.6:$PYTHONPATH" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        #submit_script += "export CMTEXTRATAGS=ATLAS,useDBRelease" + "\n"
        submit_script += "env" + "\n"

        # submit_script += "srun -N " + str(nodes) + " python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>yoda_stdout.txt 2>yoda_stderr.txt"
        # submit_script += "srun -N " + str(nodes) + " python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir
        # submit_script += "mpirun --host "+nodelist+" python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir

        # submit_script += "python " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>" + globalYodaDir+ "/yoda_stdout.txt 2>" + globalYodaDir+ "/yoda_stderr.txt"
        submit_script += "poe parrot_run python-mpi " + os.path.join(
            globalWorkingDir, "HPC/HPCJob.py"
        ) + " --globalWorkingDir=" + globalYodaDir + " --localWorkingDir=" + localWorkingDir + " --outputDir=" + os.path.dirname(
            globalYodaDir)
        self.__log.debug("POE submit script: %s" % submit_script)
        # hpcJob = subprocess.Popen(submit_script, stdout=sys.stdout, stderr=sys.stdout, shell=True)
        yoda_stdout = open(os.path.join(globalYodaDir, 'yoda_stdout.txt'), 'a')
        yoda_stderr = open(os.path.join(globalYodaDir, 'yoda_stderr.txt'), 'a')
        hpcJob = subprocess.Popen(submit_script,
                                  stdout=yoda_stdout,
                                  stderr=yoda_stderr,
                                  shell=True)

        t1 = time.time()
        i = 20
        while (hpcJob and hpcJob.poll() is None):
            if i == 0:
                self.__log.debug("Yoda process is running")
                i = 20
            time.sleep(30)
            i -= 1

        self.__log.debug("Yoda process terminated")
        self.__log.debug("Yoda process return code: %s" % hpcJob.returncode)

        return 0, None

    def poll(self, jobid):
        return None
Example #9
class mpi(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def getHPCResources(self,
                        partition,
                        max_nodes=None,
                        min_nodes=2,
                        min_walltime_m=30):
        return None

    def convertNodeList(self, nodelist):
        try:
            if '[' in nodelist:
                numNames = []
                tmp = nodelist
                preName, numList = tmp.split('[')
                numList, postName = numList.split(']')
                for items in numList.split(","):
                    if not '-' in items:
                        numNames.append(preName + items + postName)
                    else:
                        start, end = items.split('-')
                        numLen = len(start)
                        for i in range(int(start), int(end) + 1):
                            num = str(i).zfill(numLen)
                            numNames.append(preName + str(num) + postName)
                return ','.join(numNames)
            else:
                return nodelist
        except:
            self.__log.debug(traceback.format_exc())
            return nodelist

    def submitJob(self,
                  globalWorkingDir,
                  globalYodaDir,
                  localWorkingDir,
                  queue,
                  repo,
                  mppwidth,
                  mppnppn,
                  walltime,
                  nodes,
                  localSetup=None,
                  cpuPerNode=None,
                  dumpEventOutputs=False):

        nodelist = ""
        if 'SLURM_NODELIST' in os.environ:
            nodelist = os.environ['SLURM_NODELIST']
        elif 'PBS_NODELIST' in os.environ:
            nodelist = os.environ['PBS_NODELIST']
        nodelist = self.convertNodeList(nodelist)

        submit_script = "#!/bin/bash -l" + "\n"
        submit_script += "module load mpi4py openmpi-ccm" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        submit_script += "env" + "\n"

        # submit_script += "srun -N " + str(nodes) + " python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>yoda_stdout.txt 2>yoda_stderr.txt"
        submit_script += "srun -N " + str(
            nodes
        ) + " python-mpi " + os.path.join(
            globalWorkingDir, "HPC/HPCJob.py"
        ) + " --globalWorkingDir=" + globalYodaDir + " --localWorkingDir=" + localWorkingDir
        # submit_script += "mpirun --host "+nodelist+" python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir

        # submit_script += "mpirun -bynode python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>yoda_stdout.txt 2>yoda_stderr.txt"
        # submit_script += "python " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+" 1>" + globalYodaDir+ "/yoda_stdout.txt 2>" + globalYodaDir+ "/yoda_stderr.txt"
        # submit_script += "python " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+""
        self.__log.debug("ARC submit script: %s" % submit_script)
        # hpcJob = subprocess.Popen(submit_script, stdout=sys.stdout, stderr=sys.stdout, shell=True)
        yoda_stdout = open(os.path.join(globalYodaDir, 'yoda_stdout.txt'), 'a')
        yoda_stderr = open(os.path.join(globalYodaDir, 'yoda_stderr.txt'), 'a')
        hpcJob = subprocess.Popen(submit_script,
                                  stdout=yoda_stdout,
                                  stderr=yoda_stderr,
                                  shell=True)

        while (hpcJob and hpcJob.poll() is None):
            self.__log.debug("Yoda process is running%s")
            time.sleep(30)

        self.__log.debug("Yoda process terminated")
        self.__log.debug("Yoda process return code: %s" % hpcJob.returncode)

        return 0, None

    def poll(self, jobid):
        return None
Example #10
class slurm(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def getHPCResources(self, partition, max_nodes=None, min_nodes=2, min_walltime_m=30):
        # copied from RunJobEdison
        # cmd = 'showbf -p %s' % partition
        cmd = "sinfo "
        self.__log.info("Executing command: '%s'" % cmd)
        res_tuple = runcommand(cmd)
        self.__log.info("Executing command output: %s" % str(res_tuple))
        showbf_str = ""
        if res_tuple[0] == 0:
            showbf_str = res_tuple[1]

        res = {}
        self.__log.info("Available resources in %s  partition" % partition)
        self.__log.info(showbf_str)
        if showbf_str:
            shobf_out = showbf_str.splitlines()
            self.__log.info("Fitted resources")
            for l in shobf_out[2:]:
                d = l.split()
                nodes = int(d[2])

                if nodes < int(min_nodes):
                    continue

                if not d[3] == "INFINITY":
                    wal_time_arr = d[3].split(":")
                    if len(wal_time_arr) < 4:
                        wal_time_sec = (
                            int(wal_time_arr[0]) * (60 * 60) + int(wal_time_arr[1]) * 60 + int(wal_time_arr[2])
                        )
                        if wal_time_sec > 24 * 3600:
                            wal_time_sec = 24 * 3600
                    else:
                        wal_time_sec = 24 * 3600
                        # if nodes > 1:
                        #    nodes = nodes - 1
                else:
                    wal_time_sec = 12 * 3600

                # Fitting Hopper policy
                # https://www.nersc.gov/users/computational-systems/hopper/running-jobs/queues-and-policies/
                nodes = max_nodes if max_nodes and nodes > max_nodes else nodes

                if nodes < 682 and wal_time_sec > 48 * 3600:
                    wal_time_sec = 48 * 3600
                elif nodes < 4096 and wal_time_sec > 36 * 3600:
                    wal_time_sec = 36 * 3600
                elif nodes < 5462 and wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600
                elif wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600

                if wal_time_sec < int(min_walltime_m) * 60:
                    continue

                self.__log.info("Nodes: %s, Walltime (str): %s, Walltime (min) %s" % (nodes, d[3], wal_time_sec / 60))

                res.update({nodes: wal_time_sec})
        else:
            self.__log.info("No availble resources. Default values will be used.")
        self.__log.info("Get resources: %s" % res)

        return res

    def submitJob(
        self,
        globalWorkingDir,
        globalYodaDir,
        localWorkingDir,
        queue,
        repo,
        mppwidth,
        mppnppn,
        walltime,
        nodes,
        localSetup=None,
        cpuPerNode=None,
        dumpEventOutputs=False,
    ):
        submit_script = "#!/bin/bash -l" + "\n"
        if queue == "premium":
            submit_script += "#SBATCH -p regular\n"
            submit_script += "#SBATCH --qos=premium\n"
        elif queue == "scavenger":
            submit_script += "#SBATCH -p regular\n"
            submit_script += "#SBATCH --qos=scavenger\n"
        elif queue == "low":
            submit_script += "#SBATCH -p regular\n"
            submit_script += "#SBATCH --qos=low\n"
        else:
            submit_script += "#SBATCH -p " + queue + "\n"

        if repo:
            submit_script += "#SBATCH -A " + repo + "\n"
        # submit_script += "#SBATCH -n " + str(mppwidth) + "\n"
        submit_script += "#SBATCH -N " + str(nodes) + "\n"
        submit_script += "#SBATCH --signal=SIGUSR1@60\n"
        submit_script += "#SBATCH -t " + walltime + "\n"
        submit_script += "#SBATCH --ntasks-per-node=1\n"
        submit_script += "#SBATCH --cpus-per-task=" + str(cpuPerNode) + "\n"
        submit_script += "#SBATCH -J ES_job" + "\n"
        submit_script += "#SBATCH -o athena_stdout.txt" + "\n"
        submit_script += "#SBATCH -e athena_stderr.txt" + "\n"
        submit_script += "cd $SBATCH_O_WORKDIR" + "\n"
        submit_script += "module load mpi4py" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        # submit_script += "source /project/projectdirs/atlas/sw/python-yampl/setup.sh" + "\n"
        # submit_script += "export PYTHONPATH=/project/projectdirs/atlas/sw/python-yampl/python-yampl/1.0/lib.linux-x86_64-2.6:$PYTHONPATH" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        # submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/boto/lib/python2.6/site-packages:$PYTHONPATH\n"
        # submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/external:$PYTHONPATH\n"
        # submit_script += "export LD_LIBRARY_PATH=/project/projectdirs/atlas/sw/python-yampl/yampl/1.0/lib:$LD_LIBRARY_PATH" + "\n"
        # submit_script += "export X509_USER_PROXY=/global/homes/w/wguan/x509up_u23959" + "\n"
        # submit_script += "export X509_CERT_DIR=/project/projectdirs/atlas/pilot/grid_env/external/grid-security/certificates" + "\n"
        submit_script += "env" + "\n"
        # submit_script += "module avail" + "\n"
        # submit_script += "module list" + "\n"

        # submit_script += "srun -n " + str(nodes) + " -N " + str(mppnppn) + " python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+globalYodaDir+" --localWorkingDir="+localWorkingDir+""
        submit_script += (
            "srun -N "
            + str(nodes)
            + " python-mpi "
            + os.path.join(globalWorkingDir, "HPC/HPCJob.py")
            + " --globalWorkingDir="
            + globalYodaDir
            + " --localWorkingDir="
            + localWorkingDir
        )
        if dumpEventOutputs:
            submit_script += " --dumpEventOutputs"
        ###cmd = "mpiexec -n 2 python " + os.path.join(self.__globalWorkingDir, "HPC/HPCJob.py") + " --globalWorkingDir="+self.__globalWorkingDir+" --localWorkingDir="+self.__localWorkingDir+"&"
        self.__submit_file = os.path.join(globalYodaDir, "submit_script")
        handle = open(self.__submit_file, "w")
        handle.write(submit_script)
        handle.close()

        self.__log.info("submit script:\n%s" % submit_script)
        cmd = "sbatch " + self.__submit_file
        self.__log.info("submitting HPC job: %s" % cmd)
        status, output = runcommand(cmd)
        self.__log.info("submitting HPC job: (status: %s, output: %s)" % (status, output))
        self.__jobid = None
        if status == 0:
            self.__jobid = output.replace("\n", "").split(" ")[-1]
            return 0, self.__jobid
        return -1, None

    def poll(self, jobid):
        # poll the job in HPC. update it
        cmd = "scontrol show job " + jobid
        self.__log.info("polling HPC job: %s" % cmd)
        status, output = runcommand(cmd)
        # self.__log.info("polling HPC job: (status: %s, output: %s)" %(status, output))
        if status == 0:
            self.__failedPollTimes = 0
            state = None
            lines = output.split("\n")
            for line in lines:
                line = line.strip()
                if line.startswith("JobState"):
                    state = line.split(" ")[0].split("=")[1]

            if state == "COMPLETED":
                self.__log.info("HPC job complete")
                return "Complete"
            if state == "RUNNING":
                self.__log.info("HPC job is running")
                return "Running"
            if state == "PENDING":
                self.__log.info("HPC job is pending")
                return "Queue"
            if state == "FAILED":
                self.__log.info("HPC job is failed")
                return "Failed"
            if state == "CANCELLED":
                self.__log.info("HPC job is cancelled")
                return "Failed"
            if state == "TIMEOUT":
                self.__log.info("HPC job is timed out")
                return "Failed"
            self.__log.info("HPC job is in unknown state")
            return "Unknown"
        else:
            self.__log.info("polling HPC job: (status: %s, output: %s)" % (status, output))
            if "Invalid job id specified" in output:
                self.__log.info("Unknown Job Id. Set Job Complete.")
                return "Complete"
            else:
                self.__failedPollTimes += 1
                self.__log.error(
                    "Failing HPC job because the polling command has failed " + str(self.__failedPollTimes) + " times."
                )
                return "Unknown"
        return "Unknown"

    def delete(self, jobid):
        command = "scancel " + jobid
        status, output = runcommand(command)
        self.__log.debug("Run Command: %s " % command)
        self.__log.debug("Status: %s, Output: %s" % (status, output))