Exemplo n.º 1
0
def _squeueTimeToHours(timeField):
    """Convert a squeue time field to a number of hours.

    The field is read as ``[days-][hours:]minutes:seconds``.

    Raises:
        ValueError: if one of the components is not a number.
        IndexError: if fewer than two ':'-separated components are present.
    """
    hours = 0.0
    if '-' in timeField:
        daysAndClock = timeField.split('-')
        hours += float(daysAndClock[0]) * 24
        clock = daysAndClock[1].split(':')
    else:
        clock = timeField.split(':')
    seconds = float(clock[-1])
    minutes = float(clock[-2])
    if len(clock) > 2:
        hours += float(clock[-3])
    return hours + minutes / 60.0 + seconds / 3600.0


def getListOfJobStates(jobName, username=None, detailed=True):
    """Return a JobStatus object for every job reported by squeue.

    Args:
        jobName: only jobs with this name are returned; None returns all.
        username: accepted for interface compatibility, not used here
            (the query always runs for the invoking user).
        detailed: accepted for interface compatibility, not used here.

    Returns:
        List of JobStatus objects with status and CPU time (in hours) set.

    Raises:
        batchelor.BatchelorException: if squeue fails or its output cannot
            be parsed.
    """
    command = "squeue --clusters=serial -u $(whoami) -l -h"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("squeue failed (stderr: '" +
                                           stderr + "')")

    waitingStates = ('PENDING', 'SUSPENDED', 'COMPLETING', 'COMPLETED',
                     'COMPLETI')
    errorStates = ('CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL')

    jobStates = []
    for line in stdout.split('\n'):
        if line.startswith("CLUSTER: serial"):
            continue
        lineSplit = line.split()
        if not lineSplit:
            # blank line (e.g. trailing newline or no jobs at all)
            continue
        try:
            currentJobId = int(lineSplit[0])
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of squeue output to get job id failed.")
        if len(lineSplit) < 6:
            # columns used below: 2 = name, 4 = state, 5 = time
            raise batchelor.BatchelorException(
                "parsing of squeue output failed. ({0})".format(line))
        currentJobStatus = JobStatus(currentJobId)

        # status: record the raw name first, then refine if it is known
        status = lineSplit[4]
        currentJobStatus.setStatus(JobStatus.kUnknown, name=status)
        if status == 'RUNNING':
            currentJobStatus.setStatus(JobStatus.kRunning)
        elif status in waitingStates:
            currentJobStatus.setStatus(JobStatus.kWaiting)
        elif status in errorStates:
            currentJobStatus.setStatus(JobStatus.kError)
        else:
            # single-argument form prints the same text on Python 2 and 3
            print("Unknown job status " + status)

        # time
        try:
            totalTime = _squeueTimeToHours(lineSplit[5])
        except (ValueError, IndexError):
            raise batchelor.BatchelorException(
                "parsing of squeue output to get time information failed. ({0})"
                .format(lineSplit[5]))
        currentJobStatus.setCpuTime(totalTime, 0)

        # name filter
        if jobName is None or lineSplit[2] == jobName:
            jobStates.append(currentJobStatus)

    return jobStates
Exemplo n.º 2
0
def submitJob(config, command, outputFile, jobName, wd=None):
    """Write a SLURM batch script for ``command`` and submit it with sbatch.

    If the option "max_active_jobs" is configured, the call blocks (polling
    every 90 seconds) until fewer jobs than that limit are active.

    Args:
        config: config parser providing the options of this submodule
            ("header_file", "wall_clock_limit", "memory" and optionally
            "max_active_jobs").
        command: shell code appended to the end of the batch script.
        outputFile: path passed to ``#SBATCH -o``.
        jobName: job name passed to ``#SBATCH -J``; None omits the line.
        wd: working directory of the job; defaults to the current directory.

    Returns:
        The job id as int.

    Raises:
        batchelor.BatchelorException: if sbatch fails or its output cannot
            be parsed.
    """
    section = submoduleIdentifier()

    # check if only a certain amount of active jobs is allowed
    if config.has_option(section, "max_active_jobs"):
        max_active_jobs = int(config.get(section, "max_active_jobs"))
        i = 0
        waitTime = 90  # seconds, i.e. 1.5 min between two polls
        while len(getListOfActiveJobs(None)) >= max_active_jobs:
            if i == 0:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
            time.sleep(waitTime)
            i += 1
        if i > 0:
            sys.stdout.write("\r")

    if wd is None:
        wd = os.getcwd()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        headerFileName = batchelor._getRealPath(
            config.get(section, "header_file"))
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#SBATCH -D " + wd + "\n")
            tempFile.write("#SBATCH -o " + outputFile + "\n")
            tempFile.write("#SBATCH --time=" +
                           config.get(section, "wall_clock_limit") + "\n")
            tempFile.write("#SBATCH --mem=" + config.get(section, "memory") +
                           "\n")
            if jobName is not None:
                tempFile.write("#SBATCH -J " + jobName + "\n")
            tempFile.write("#SBATCH --get-user-env \n")
            tempFile.write("#SBATCH --export=NONE \n")
            tempFile.write("#SBATCH --clusters=serial \n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already has its own shebang line
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        (returncode, stdout, stderr) = batchelor.runCommand("sbatch " +
                                                            fileName)
    finally:
        # remove the temporary script on every exit path
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" +
                                           stderr + "')")
    # the job id is taken from the fourth whitespace-separated token
    try:
        return int(stdout.split()[3])
    except (IndexError, ValueError):
        raise batchelor.BatchelorException(
            'parsing output of sbatch to get job id failed.')
Exemplo n.º 3
0
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None):
    """Submit ``command`` to LSF with bsub and return the job id.

    The configured header file is copied to a temporary script, ``command``
    is appended, and the script is fed to bsub on stdin.

    Args:
        config: config parser providing "header_file", "queue", "type",
            "pool" and optionally "memory" for this submodule.
        command: shell code appended to the header file.
        outputFile: path passed to ``bsub -o``.
        jobName: job name; may be None. For array jobs a random 7-letter
            name is generated when no name is given.
        wd: working directory passed to ``bsub -cwd``; None omits it.
        arrayStart, arrayEnd, arrayStep: if arrayStart is not None, the job
            is submitted as array ``name[start-end:step]``.

    Returns:
        The job id as int.

    Raises:
        batchelor.BatchelorException: if bsub fails or its output cannot be
            parsed.
    """
    section = submoduleIdentifier()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand(
            "cp " + batchelor._getRealPath(config.get(section, "header_file"))
            + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        if arrayStart is not None:
            if not jobName:
                jobName = ''.join(random.sample(string.ascii_lowercase, 7))
            jobName = jobName + "[" + str(arrayStart) + "-" + str(
                arrayEnd) + ":" + str(arrayStep) + "]"
        cmnd = "bsub "
        cmnd += "" if jobName is None else ("-J " + jobName + " ")
        cmnd += "-o " + outputFile + " "
        cmnd += "-q " + config.get(section, "queue") + " "
        # -cwd is an option of bsub itself, so it has to be given outside
        # of the quoted -R resource requirement string
        cmnd += "-cwd '{0}' ".format(wd) if wd else ""
        cmnd += "-R '"
        cmnd += " select[type=" + config.get(section, "type") + "]"
        cmnd += " rusage[pool=" + config.get(section, "pool") + "]"
        # the memory requirement is optional
        try:
            memory = config.get(section, "memory")
        except ConfigParser.NoOptionError:
            memory = None
        if memory is not None:
            cmnd += " rusage[mem=" + memory + "]"
            cmnd += " select[maxmem>" + memory + "]"
        cmnd += _getExcludedHostsString(config)
        cmnd += "' "
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # remove the temporary script on every exit path
        batchelor.runCommand('rm -f ' + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("bsub failed (stderr: '" + stderr +
                                           "')")

    # example output: Job <533476534> is submitted to queue <1nd>.
    # NOTE: lstrip() removes a set of characters, not a prefix; this is
    # fine here because the id consists of digits only.
    jobId = stdout.lstrip("Job <")
    jobId = jobId[:jobId.find(">")]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException(
            'parsing of bsub output to get job id failed.')
Exemplo n.º 4
0
def getListOfActiveJobs(jobName):
    """Return the ids of the invoking user's jobs as reported by llq.

    Args:
        jobName: if None, all job ids are returned (each id once). Otherwise
            the long listing (``llq -m``) is scanned and only jobs whose
            "Job Name" equals ``jobName`` are returned.

    Returns:
        List of job ids (int).

    Raises:
        batchelor.BatchelorException: if llq fails or its output cannot be
            parsed.
    """
    if jobName is None:
        command = "llq -u `whoami`"
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("llq failed (stderr: '" +
                                               stderr + "')")
        if stdout == "llq: There is currently no job status to report.":
            return []
        jobList = []
        try:
            # the first two and the last two lines are not job rows
            for row in stdout.split('\n')[2:-2]:
                fields = row.split()
                if not fields:
                    continue
                item = fields[0]
                # the id is the part between the first and the last dot
                jobId = int(item[item.find(".") + 1:item.rfind(".")])
                if jobId not in jobList:
                    jobList.append(jobId)
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of llq output to get job id failed.")
        return jobList

    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        command = "llq -u `whoami` -m &> " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("llq failed (stderr: '" +
                                               stderr + "')")
        jobList = []
        currentJobId = -1
        with open(fileName, 'r') as llqOutput:
            for line in llqOutput:
                line = line.rstrip('\n')
                if line.startswith("===== Job Step mgmt."):
                    try:
                        currentJobId = int(line[line.find(".") +
                                                1:line.rfind(".")])
                    except ValueError:
                        raise batchelor.BatchelorException(
                            "parsing of llq output to get job id failed.")
                # collapse runs of whitespace before matching the key
                line = ' '.join(line.split())
                if line.startswith("Job Name: "):
                    if currentJobId < 0:
                        raise batchelor.BatchelorException(
                            "parsing of llq output failed, got job name before job id."
                        )
                    if line[10:] == jobName:
                        jobList.append(currentJobId)
        return jobList
    finally:
        # remove the temporary file on every exit path
        batchelor.runCommand("rm -f " + fileName)
Exemplo n.º 5
0
def submitJob(config, command, outputFile, jobName):
	"""Write a LoadLeveler job command file and submit it with llsubmit.

	Args:
		config: config parser providing "header_file", "group",
			"notification", "notify_user", "node_usage", "wall_clock_limit",
			"resources" and "job_type" for this submodule.
		command: shell code appended to the end of the job script.
		outputFile: path written to the ``#@ output`` keyword.
		jobName: written to ``#@ job_name``; None omits the line.

	Returns:
		The job id as int.

	Raises:
		batchelor.BatchelorException: if llsubmit fails or its output cannot
			be parsed.
	"""
	section = submoduleIdentifier()
	(fileDescriptor, fileName) = tempfile.mkstemp()
	os.close(fileDescriptor)
	try:
		headerFileName = batchelor._getRealPath(config.get(section, "header_file"))
		with open(fileName, 'w') as tempFile:
			tempFile.write("#!/bin/bash\n\n")
			tempFile.write("#@ group = " + config.get(section, "group") + "\n")
			tempFile.write("#@ output = " + outputFile + "\n")
			tempFile.write("#@ notification = " + config.get(section, "notification") + "\n")
			tempFile.write("#@ notify_user = " + config.get(section, "notify_user") + "\n")
			tempFile.write("#@ node_usage = " + config.get(section, "node_usage") + "\n")
			tempFile.write("#@ wall_clock_limit = " + config.get(section, "wall_clock_limit") + "\n")
			tempFile.write("#@ resources = " + config.get(section, "resources") + "\n")
			tempFile.write("#@ job_type = " + config.get(section, "job_type") + "\n")
			# NOTE(review): "class" is filled from the "job_type" option as well;
			# confirm that this is intended and not a copy-and-paste slip.
			tempFile.write("#@ class = " + config.get(section, "job_type") + "\n")
			if jobName is not None:
				tempFile.write("#@ job_name = " + jobName + "\n")
			tempFile.write("#@ queue\n\n\n")
			with open(headerFileName, 'r') as headerFile:
				for line in headerFile:
					# the script already has its own shebang line
					if line.startswith("#!"):
						continue
					tempFile.write(line)
			tempFile.write("\n\n")
			tempFile.write(command)
		(returncode, stdout, stderr) = batchelor.runCommand("llsubmit - < " + fileName)
	finally:
		# remove the temporary script on every exit path
		batchelor.runCommand("rm -f " + fileName)
	if returncode != 0:
		raise batchelor.BatchelorException("llsubmit failed (stderr: '" + stderr + "')")
	# example output stdout:
	# llsubmit: The job "mgmt.12309" has been submitted.
	#
	# example output stderr:
	#
	# llsubmit: Stdin job command file written to "/tmp/loadlx_stdin.27558.CdoVxX".
	#
	# INFO: Project: pr83mo
	# INFO: Project's Expiration Date:    2015-01-31
	# INFO: Budget:                     Total [cpuh]        Used [cpuh]      Credit [cpuh]
	# INFO:                                  1350000      1011028 (75%)       338972 (25%)
	#
	# llsubmit: Processed command file through Submit Filter: "/lrz/loadl/filter/submit_filter_c2pap.pl".
	firstLine = stdout.split("\n")[0]
	# the id is the text between '"mgmt.' and the closing quote
	jobId = firstLine[firstLine.find('"mgmt.') + 6:firstLine.rfind('"')]
	try:
		return int(jobId)
	except ValueError:
		raise batchelor.BatchelorException('parsing of llsubmit output to get job id failed.')
Exemplo n.º 6
0
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None):
    """Submit ``command`` with qsub and return the job id.

    The configured header file is copied to a temporary script, ``command``
    is appended, and the script is fed to qsub on stdin.

    Args:
        config: config parser providing "header_file", "project", "queue"
            and "memory" for this submodule.
        command: shell code appended to the header file.
        outputFile: path passed to ``qsub -o``.
        jobName: passed to ``qsub -N``; None omits the option.
        wd: not supported by this backend; any true value raises.
        arrayStart, arrayEnd, arrayStep: if arrayStart is not None, the job
            is submitted as array job (``-t start-end:step``).

    Returns:
        The job id as int.

    Raises:
        batchelor.BatchelorException: if ``wd`` is given, if qsub fails, or
            if its output cannot be parsed.
    """
    if wd:
        raise batchelor.BatchelorException(
            "Choosing the working directory is not jet implemented for {0}".
            format(submoduleIdentifier()))

    section = submoduleIdentifier()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand(
            "cp " + batchelor._getRealPath(config.get(section, "header_file"))
            + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        cmnd = "qsub "
        cmnd += "-j y "
        cmnd += "" if jobName is None else ("-N " + jobName + " ")
        if arrayStart is not None:
            cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(
                arrayStep) + " "
        cmnd += "-o " + outputFile + " "
        cmnd += "-P " + config.get(section, "project") + " "
        cmnd += "-q " + config.get(section, "queue") + " "
        cmnd += "-l h_vmem=" + config.get(section, "memory") + " "
        cmnd += _getExcludedHostsString(config)
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # remove the temporary script on every exit path
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr +
                                           "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # NOTE: lstrip() removes a set of characters, not a prefix; this is fine
    # here because the id that follows consists of digits only.
    if arrayStart is not None:
        jobId = stdout.lstrip("Your job-array ")
        jobId = jobId[:jobId.find('.')]
    else:
        jobId = stdout.lstrip("Your job ")
        jobId = jobId[:jobId.find(' ')]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException(
            'parsing of qsub output to get job id failed.')
Exemplo n.º 7
0
def getListOfActiveJobs(jobName):
    """Return the job ids listed by bjobs.

    Args:
        jobName: if not None, only jobs with this name are queried
            (``bjobs -J``).

    Returns:
        List of job ids (int); empty if bjobs prints nothing.

    Raises:
        batchelor.BatchelorException: if bjobs fails or a job id cannot be
            parsed.
    """
    command = "bjobs"
    if jobName is not None:
        command = command + " -J " + jobName
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("bjobs failed (stderr: '" + stderr +
                                           "')")
    if stdout == "":
        return []
    # the first line is the table header; blank lines carry no job
    jobRows = [row for row in stdout.split('\n')[1:] if row.strip()]
    try:
        return [int(row.split()[0]) for row in jobRows]
    except ValueError:
        raise batchelor.BatchelorException(
            "parsing of bjobs output to get job id failed.")
Exemplo n.º 8
0
def resetErrorJobs(jobName):
    """Clear the error state of all jobs returned by getListOfErrorJobs.

    Args:
        jobName: forwarded to getListOfErrorJobs.

    Returns:
        True once every job has been cleared.

    Raises:
        batchelor.BatchelorException: if the qmod output for a job does not
            contain the confirmation message.
    """
    for jobId in getListOfErrorJobs(jobName):
        command = "qmod -cj " + str(jobId)
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        # success is detected from the message, not from the return code
        if 'cleared error state of job' not in stdout:
            raise batchelor.BatchelorException("qmod failed (stderr: '" +
                                               stderr + "')")
    return True
Exemplo n.º 9
0
def deleteJobs(jobIds):
	"""Cancel the given jobs with one ``llcancel`` call.

	Each id is passed as ``mgmt.<id>``. Returns True (also when ``jobIds``
	is empty, in which case nothing is executed) and raises
	batchelor.BatchelorException if llcancel exits with a non-zero code.
	"""
	if not jobIds:
		return True
	jobSteps = ["mgmt." + str(jobId) for jobId in jobIds]
	command = " ".join(["llcancel"] + jobSteps)
	(exitCode, _out, errText) = batchelor.runCommand(command)
	if exitCode == 0:
		return True
	raise batchelor.BatchelorException("llcancel failed (stderr: '" + errText + "')")
Exemplo n.º 10
0
def deleteJobs(jobIds):
    """Kill the given jobs with one ``bkill`` call.

    Returns True (also when ``jobIds`` is empty, in which case nothing is
    executed). A non-zero exit code raises batchelor.BatchelorException
    unless stderr reports that the job has already finished.
    """
    if not jobIds:
        return True
    command = " ".join(["bkill"] + [str(jobId) for jobId in jobIds])
    (exitCode, _out, errText) = batchelor.runCommand(command)
    if exitCode != 0 and 'Job has already finished' not in errText:
        raise batchelor.BatchelorException("bkill failed (stderr: '" +
                                           errText + "')")
    return True
Exemplo n.º 11
0
def deleteJobs(jobIds):
    """Remove the given jobs with one ``condor_rm`` call.

    Returns True (also when ``jobIds`` is empty, in which case nothing is
    executed). A non-zero exit code raises batchelor.BatchelorException
    unless stderr reports that not all matching jobs could be found/removed.
    """
    if not jobIds:
        return True
    command = " ".join(["condor_rm"] + [str(jobId) for jobId in jobIds])
    (exitCode, _out, errText) = batchelor.runCommand(command)
    tolerated = 'Couldn\'t find/remove all jobs matching constraint'
    if exitCode != 0 and tolerated not in errText:
        raise batchelor.BatchelorException("condor_rm failed (stderr: '" +
                                           errText + "')")
    return True
Exemplo n.º 12
0
def getListOfErrorJobs(jobName):
    """Return the ids of active jobs that qstat lists in state "Eqw".

    Args:
        jobName: forwarded to getListOfActiveJobs to select the jobs that
            are considered.

    Returns:
        List of job ids (int).

    Raises:
        batchelor.BatchelorException: if qstat fails or a job id cannot be
            parsed.
    """
    activeJobIds = set(getListOfActiveJobs(jobName))
    command = "qstat"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr +
                                           "')")
    listOfErrorJobs = []
    # the first two lines are the table header
    for line in stdout.split('\n')[2:]:
        lineList = line.split()
        if not lineList:
            # blank line, e.g. produced by a trailing newline
            continue
        try:
            jobId = int(lineList[0])
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of qstat output to get job id failed.")
        if jobId not in activeJobIds:
            continue
        # the fifth column holds the job state
        if len(lineList) > 4 and lineList[4] == "Eqw":
            listOfErrorJobs.append(jobId)
    return listOfErrorJobs
Exemplo n.º 13
0
def getListOfActiveJobs(jobName):
    """Return the cluster ids of the jobs listed by condor_q.

    condor_q is asked to print one ``<ClusterId>.<ProcId>`` entry per line.
    Entries with ProcId 0 are reduced to the cluster id; any other entry is
    reported as a parse error.

    Args:
        jobName: if true, only jobs whose JobBatchName equals this name are
            queried.

    Returns:
        List of job ids (int); empty if condor_q prints nothing.

    Raises:
        batchelor.BatchelorException: if condor_q fails or an entry cannot
            be converted to an int.
    """
    command = "condor_q   -format \"%d.\" ClusterId -format \"%d\n\" ProcId "
    if jobName:
        command += "-constraint 'JobBatchName == \"{0}\"' ".format(jobName)
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_q failed (stderr: '" +
                                           stderr + "')")
    if stdout == "":
        return []
    procSuffix = ".0"
    jobs = []
    for entry in stdout.split('\n'):
        fields = entry.split()
        if not fields:
            continue
        jobString = fields[0]
        # Remove exactly the ".0" suffix. str.rstrip(".0") would strip every
        # trailing '.' and '0' character and turn "1230.0" into "123".
        if jobString.endswith(procSuffix):
            jobString = jobString[:-len(procSuffix)]
        try:
            jobs.append(int(jobString))
        except ValueError:
            raise batchelor.BatchelorException(
                "Cannot parse return of condor_q (stdout: '" + stdout + "')")
    return jobs
Exemplo n.º 14
0
def getListOfActiveJobs(jobName):
    """Return the job ids reported by qstat.

    Args:
        jobName: if None, the ids are taken from the plain ``qstat`` table.
            Otherwise ``qstat -xml -j <jobName>`` is evaluated; its output
            is split into one file per XML document with awk and each
            document is parsed for ``JB_job_number`` elements.

    Returns:
        List of job ids (int).

    Raises:
        batchelor.BatchelorException: if qstat fails or its output cannot
            be parsed.
    """
    if jobName is None:
        command = "qstat"
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("qstat failed (stderr: '" +
                                               stderr + "')")
        if stdout == "":
            return []
        # the first two lines are the table header; blank lines carry no job
        jobRows = [row for row in stdout.split('\n')[2:] if row.strip()]
        try:
            return [int(row.split()[0]) for row in jobRows]
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of qstat output to get job id failed.")

    command = "qstat -j " + jobName
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        # the first stderr line, without its last character, identifies the
        # "no such job" case, which is not an error for the caller
        firstErrorLine = stderr.split('\n')[0]
        if firstErrorLine[:-1] == "Following jobs do not exist or permissions are not sufficient:":
            return []
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr +
                                           "')")

    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        command = "qstat -xml -j " + jobName + " > " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("qstat failed (stderr: '" +
                                               stderr + "')")
        # start a new "<fileName><n>.awkOut" file at every XML declaration
        batchelor.runCommand(
            "awk '/<\\?xml version='\"'\"'1.0'\"'\"'\\?>/{n++}{print >\"" +
            fileName + "\" n \".awkOut\" }' " + fileName)
    finally:
        # remove the temporary file on every exit path
        batchelor.runCommand("rm -f " + fileName)

    xmlFiles = glob.glob(fileName + "*.awkOut")
    jobIds = []
    try:
        for xmlFile in xmlFiles:
            root = ElementTree.parse(xmlFile).getroot()
            for child in root[0]:
                jobIdList = child.findall("JB_job_number")
                if len(jobIdList) != 1:
                    raise batchelor.BatchelorException(
                        "parsing xml from qstat failed")
                try:
                    jobId = int(jobIdList[0].text)
                except ValueError:
                    raise batchelor.BatchelorException(
                        "parsing int from xml from qstat failed")
                jobIds.append(jobId)
    finally:
        # remove the split files even if parsing failed
        for xmlFile in xmlFiles:
            batchelor.runCommand("rm -f " + xmlFile)
    return jobIds
Exemplo n.º 15
0
    def run(self):
        """Worker loop: take job ids from ``queue`` and run their commands.

        For every id the matching entry of ``jobs`` is marked as running, its
        command is written to a temporary file and executed with
        ``self.shell``; stdout and stderr go to the job's output file. After
        the process has ended the entry is removed from ``jobs``.

        ``queue``, ``guard``, ``jobs`` and ``aux`` are not defined in this
        method. NOTE(review): ``guard`` looks like the lock protecting
        ``jobs``/``aux`` and ``aux[1]`` like a shutdown flag -- confirm
        against their definitions.

        Raises:
            batchelor.BatchelorException: if a job has finished but is no
                longer in ``jobs``.
        """
        while True:
            try:
                # wake up every 2 seconds to check the exit condition below
                jobId = queue.get(timeout=2)
            except Queue.Empty:
                with guard:
                    if aux[1]:
                        break
                    else:
                        continue

            with guard:
                knownJobIds = [job.jobId for job in jobs]
                if not jobId in knownJobIds:
                    # might actually happen if a job is deleted
                    # NOTE(review): queue.task_done() is not called on this
                    # path -- confirm that nothing waits on queue.join().
                    continue
                i = knownJobIds.index(jobId)
                jobs[i].running = True
                outputFile = jobs[i].outputFile
                command = jobs[i].command
                # delete=False: the file has to survive close() so that the
                # shell can read it; it is unlinked after the process ended
                cmdFile = tempfile.NamedTemporaryFile(delete=False)
                for line in command:
                    cmdFile.write(line)
                cmdFile.close()

                logFile = open(outputFile, "w")
                # the child is put into its own process group
                p = subprocess.Popen([self.shell, cmdFile.name],
                                     stdout=logFile,
                                     stderr=subprocess.STDOUT,
                                     preexec_fn=lambda: os.setpgid(0, 0))
                # the process handle is stored on the job entry while the
                # guard is still held
                jobs[i].runningProcess = p

            # wait outside of the guard
            p.wait()

            logFile.close()
            os.unlink(cmdFile.name)
            with guard:
                # look the job up again by id before deleting it
                knownJobIds = [job.jobId for job in jobs]
                if not jobId in knownJobIds:
                    raise batchelor.BatchelorException(
                        "Job ID {0} finished, but already removed from list of jobs."
                        .format(jobId))
                i = knownJobIds.index(jobId)
                del jobs[i]
            queue.task_done()
Exemplo n.º 16
0
    def run(self):
        """Worker loop: take job ids from ``queue`` and run their commands.

        For every id the matching entry of ``jobs`` is marked as running, its
        command is written to a temporary file and executed with
        ``self.shell``; stdout and stderr go to the job's output file. After
        the command has ended the entry is removed from ``jobs``.

        ``queue``, ``guard``, ``jobs`` and ``aux`` are not defined in this
        method. NOTE(review): ``guard`` looks like the lock protecting
        ``jobs``/``aux`` and ``aux[1]`` like a shutdown flag -- confirm
        against their definitions.

        Raises:
            batchelor.BatchelorException: if a job has finished but is no
                longer in ``jobs``.
        """
        while True:
            try:
                # wake up every 2 seconds to check the exit condition below
                jobId = queue.get(timeout=2)
            except Queue.Empty:
                with guard:
                    if aux[1]:
                        break
                    else:
                        continue

            with guard:
                # Look the job up by id. This also works when ``jobs`` is
                # empty, where an index loop would leave its index unbound.
                job = next((j for j in jobs if j.jobId == jobId), None)
                if job is None:
                    # might actually happen if a job is deleted
                    # NOTE(review): queue.task_done() is not called on this
                    # path -- confirm that nothing waits on queue.join().
                    continue
                job.running = True
                outputFile = job.outputFile
                command = job.command

            # delete=False: the file has to survive close() so that the
            # shell can read it; it is unlinked below
            cmdFile = tempfile.NamedTemporaryFile(delete=False)
            try:
                for line in command:
                    cmdFile.write(line)
                cmdFile.close()

                with open(outputFile, "w") as logFile:
                    # the child is put into its own process group
                    subprocess.call([self.shell, cmdFile.name],
                                    stdout=logFile,
                                    stderr=subprocess.STDOUT,
                                    preexec_fn=lambda: os.setpgid(0, 0))
            finally:
                # remove the command file even if running it failed
                cmdFile.close()
                os.unlink(cmdFile.name)

            with guard:
                for index, job in enumerate(jobs):
                    if job.jobId == jobId:
                        del jobs[index]
                        break
                else:
                    raise batchelor.BatchelorException(
                        "Job ID {0} finished, but already removed from list of jobs."
                        .format(jobId))
            queue.task_done()
Exemplo n.º 17
0
def submitJobs(config, newJobs):
	"""Submit several jobs in parallel and return their job ids.

	Every entry of ``newJobs`` is a list of submit arguments; ``config`` is
	prepended to it and the result is handed to ``_wrapSubmitJob`` in a
	multiprocessing pool. The pool size is the option
	"parallel_submissions", capped at the number of jobs; a negative value
	means one process per job.

	Returns the list produced by ``pool.map`` (empty if there are no jobs).
	Raises batchelor.BatchelorException if the option is not an int.
	"""
	nJobs = len(newJobs)
	if nJobs == 0:
		return []

	poolJobsArgs = [[config] + job for job in newJobs]

	optionValue = config.get(submoduleIdentifier(), "parallel_submissions")
	try:
		nParallelSubmissions = int(optionValue)
	except ValueError:
		raise batchelor.BatchelorException('option \'parallel_submissions\' in config file does not seem to be an int.')
	# negative: no limit, i.e. as many processes as jobs
	nParallelSubmissions = nJobs if nParallelSubmissions < 0 else min(nParallelSubmissions, nJobs)

	pool = multiprocessing.Pool(processes=nParallelSubmissions)
	# chunk size 1: jobs are handed to the workers one at a time
	jobIds = pool.map(_wrapSubmitJob, poolJobsArgs, 1)
	pool.close()
	pool.join()

	return jobIds
def _submitJob(config, command, outputFile, jobName, wd=None, nTasks=None):
    """Write a SLURM batch script for ``command`` and submit it with sbatch.

    If the option "max_active_jobs" is configured, the call blocks (polling
    every 90 seconds) until fewer jobs than that limit are active. A failing
    job query is treated like a full queue.

    Args:
        config: config parser providing the options of this submodule
            ("header_file", "wall_clock_limit", "clusters" and, depending on
            the cluster and on ``nTasks``, "memory", "partition" and
            "n_tasks_per_node"; optionally "max_active_jobs").
        command: shell code appended to the end of the batch script.
        outputFile: path passed to ``#SBATCH -o``.
        jobName: job name passed to ``#SBATCH -J``; None omits the line.
        wd: working directory of the job; defaults to the current directory.
        nTasks: number of tasks; None omits the task-related lines.

    Returns:
        The job id as int.

    Raises:
        batchelor.BatchelorException: if sbatch fails or its output cannot
            be parsed.
    """
    section = submoduleIdentifier()

    # check if only a certain amount of active jobs is allowed
    if config.has_option(section, "max_active_jobs"):
        max_active_jobs = int(config.get(section, "max_active_jobs"))
        i = 0
        waitTime = 90  # seconds, i.e. 1.5 min between two polls
        while True:
            try:
                nRunningJobs = len(getListOfActiveJobs(None))
            except batchelor.BatchelorException:
                # query failed: assume the queue is full and try again later
                nRunningJobs = max_active_jobs
            if nRunningJobs < max_active_jobs:
                break
            if i == 0:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
            time.sleep(waitTime)
            i += 1
        if i > 0:
            sys.stdout.write("\r")

    if wd is None:
        wd = os.getcwd()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        headerFileName = batchelor._getRealPath(
            config.get(section, "header_file"))
        with open(fileName, 'w') as tempFile:
            clusters = config.get(section, "clusters")
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#SBATCH -D " + wd + "\n")
            tempFile.write("#SBATCH -o " + outputFile + "\n")
            tempFile.write("#SBATCH --time=" +
                           config.get(section, "wall_clock_limit") + "\n")
            if clusters != 'mpp3':
                tempFile.write("#SBATCH --mem-per-cpu=" +
                               config.get(section, "memory") + "\n")
            if jobName is not None:
                tempFile.write("#SBATCH -J " + jobName + "\n")
            tempFile.write("#SBATCH --get-user-env \n")
            tempFile.write("#SBATCH --export=NONE \n")
            if nTasks is not None:
                if clusters != 'mpp3':
                    tempFile.write("#SBATCH --ntasks={0:d} \n".format(nTasks))
                else:
                    # on mpp3 whole nodes are requested: nTasks / 64,
                    # rounded up
                    tempFile.write("#SBATCH --nodes={0:d} \n".format(
                        (nTasks + 63) // 64))
                tempFile.write("#SBATCH --ntasks-per-node={0} \n".format(
                    config.get(section, "n_tasks_per_node")))
            tempFile.write("#SBATCH --clusters={0}\n".format(clusters))
            if clusters not in ['cm2_tiny', 'mpp3']:
                tempFile.write("#SBATCH --partition={0}\n\n".format(
                    config.get(section, "partition")))
            if clusters == 'cm2' or clusters == 'c2pap':
                # NOTE(review): the qos is filled from the "partition"
                # option -- confirm that this is intended.
                tempFile.write("#SBATCH --qos={0}\n\n".format(
                    config.get(section, "partition")))
            tempFile.write("module load slurm_setup \n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already has its own shebang line
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        (returncode, stdout, stderr) = batchelor.runCommand("sbatch " +
                                                            fileName)
    finally:
        # remove the temporary script on every exit path
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" +
                                           stderr + "')")
    # the job id is taken from the fourth whitespace-separated token
    try:
        return int(stdout.split()[3])
    except (IndexError, ValueError):
        raise batchelor.BatchelorException(
            'parsing output of sbatch to get job id failed.')
Exemplo n.º 19
0
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None):
    """Submit ``command`` with condor_submit and return the cluster id.

    A submit description and an executable script (configured header file
    plus ``command``) are created in ``<cwd>/.log``; both are registered for
    removal at interpreter exit.

    Args:
        config: config parser providing "header_file", "memory", "disk" and
            "flavour" for this submodule.
        command: shell code appended to the header file.
        outputFile: base path for the job's output; "<outputFile>.condor"
            and "<outputFile>.err" are used for log and error. If empty or
            None, the three lines are omitted.
        jobName: passed as ``-batch-name``; a false value omits the option.
        wd: if given, forwarded to batchelor.runCommand as ``wd``.
        arrayStart, arrayEnd, arrayStep: not supported; any non-None value
            raises.

    Returns:
        The job id as int.

    Raises:
        batchelor.BatchelorException: for array jobs, if the submit
            directory contains whitespace, if condor_submit fails, or if its
            output cannot be parsed.
    """
    if arrayStart is not None or arrayEnd is not None or arrayStep is not None:
        raise batchelor.BatchelorException(
            "Array jobs are not (yet) implementet for CERNs HTCondor system")

    filesDir = os.path.join(os.getcwd(), '.log')
    if " " in filesDir:
        raise batchelor.BatchelorException(
            "Cannot handle submit directories with whitespaces")

    if not os.path.exists(filesDir):
        os.makedirs(filesDir)
    (fileDescriptor, submitFileName) = tempfile.mkstemp(dir=filesDir,
                                                        prefix='submitFiles_',
                                                        suffix='.submit')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove(submitFileName))
    (fileDescriptor, scriptFileName) = tempfile.mkstemp(dir=filesDir,
                                                        prefix='scriptFiles_',
                                                        suffix='.sh')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove(scriptFileName))
    # 0o755 is accepted by Python >= 2.6 and by Python 3
    os.chmod(scriptFileName, 0o755)

    section = submoduleIdentifier()
    batchelor.runCommand(
        "cp " + batchelor._getRealPath(config.get(section, "header_file")) +
        " " + scriptFileName)
    with open(scriptFileName, 'a') as scriptFile:
        scriptFile.write(command)
    with open(submitFileName, 'w') as submitFile:
        submitFile.write("executable = {0}\n".format(scriptFileName))
        # test before abspath(): abspath() of an empty path is the current
        # directory, which is always true
        if outputFile:
            outputFile = os.path.abspath(outputFile)
            submitFile.write("output = {0}\n".format(outputFile))
            submitFile.write("log = {0}.condor\n".format(outputFile))
            submitFile.write("error = {0}.err\n".format(outputFile))

        submitFile.write(
            "should_transfer_files = NO\n")  # Disable file transport
        submitFile.write("request_cpus  = 1\n")
        submitFile.write("request_memory = {0}\n".format(
            config.get(section, "memory")))
        submitFile.write("request_disk = {0}\n".format(
            config.get(section, "disk")))
        submitFile.write("+JobFlavour = \"{0}\"\n".format(
            config.get(section, "flavour")))
        submitFile.write("queue 1\n")
    cmnd = "condor_submit '{0}'".format(submitFileName)
    if jobName:
        cmnd += " -batch-name {0} ".format(jobName)
    kwargs = {}
    if wd:
        kwargs['wd'] = wd
    (returncode, stdout, stderr) = batchelor.runCommand(cmnd, **kwargs)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_submit failed (stderr: '" +
                                           stderr + "')")
    # the id is the sixth token of the second output line, followed by '.'
    try:
        return int(stdout.split('\n')[1].split()[5].rstrip("."))
    except (IndexError, ValueError):
        raise batchelor.BatchelorException(
            'parsing of condor_submit output to get job id failed.')
Exemplo n.º 20
0
def getListOfJobStates(select_jobIDs, username):
    """Return the states of the jobs listed by ``qstat``.

    Args:
        select_jobIDs: collection of job ids to report; ``None`` reports
            every job found in the ``qstat`` output.
        username: if not ``None``, passed to ``qstat -u`` to restrict the
            listing to that user.

    Returns:
        A list of JobStatus objects, one per selected job. Jobs in state
        ``r``/``hr`` are queried again with ``qstat -xml -j`` to obtain CPU
        time and memory usage; a running job that has vanished by the time
        of that second query is left out of the result.

    Raises:
        batchelor.BatchelorException: if ``qstat`` fails or its output
            cannot be parsed.
    """
    # get list of all jobs
    if username is None:
        command = "qstat"
    else:
        command = "qstat -u {0}".format(username)

    (returncode, stdout, stderr) = batchelor.runCommand(command)

    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr +
                                           "')")

    if stdout == "":
        return []

    # The first two lines of the table are skipped (header lines); column 0
    # is the job id, column 4 the state. Every line is split only once.
    jobs = []
    try:
        for line in stdout.split('\n')[2:]:
            fields = line.split()
            if not fields:
                # blank line (e.g. a trailing newline): nothing to parse
                continue
            jobs.append((int(fields[0]), fields[4]))
    except (ValueError, IndexError):
        raise batchelor.BatchelorException(
            "parsing of qstat output to get job id failed.")

    # a set makes the membership test below O(1) per job
    selected = None if select_jobIDs is None else set(select_jobIDs)

    list_of_states = []

    for jobID, state in jobs:
        if selected is not None and jobID not in selected:
            continue

        job_status = JobStatus(jobID)
        # default: unknown status that still carries the raw qstat state
        job_status.setStatus(JobStatus.kUnknown, name=state)

        if state in ('qw', 'hqw'):
            job_status.setStatus(JobStatus.kWaiting)

        elif state == 't':
            job_status.setStatus(JobStatus.kTransmitting)

        elif state in ('d', 'dr', 'dt'):
            job_status.setStatus(JobStatus.kDeletion)

        elif state == 'Eq':
            job_status.setStatus(JobStatus.kError)

        elif state in ('r', 'hr'):
            # get detailed job information
            command = "qstat -xml -j {0}".format(jobID)
            (returncode, details, stderr) = batchelor.runCommand(command)
            if returncode != 0:
                raise batchelor.BatchelorException(
                    "qstat failed (stderr: '" + stderr + "')")
            if 'unknown_jobs' in details:
                # the job has been ended between the qstat command and now
                continue
            _applyQstatJobDetails(job_status, details, jobID)

        list_of_states.append(job_status)

    return list_of_states


def _applyQstatJobDetails(job_status, qstat_xml, jobID):
    """Update ``job_status`` from the output of ``qstat -xml -j <jobID>``.

    Each task entry that has a ``JAT_task_number`` switches the status to
    running; its ``cpu`` usage is stored divided by 3600 and its ``vmem``
    usage divided by 1024**3 (presumably seconds -> hours and
    bytes -> GiB; confirm against the qstat documentation).

    Raises:
        batchelor.BatchelorException: if ``qstat_xml`` is not well-formed XML.
    """
    try:
        root = ElementTree.fromstring(qstat_xml)
    except ElementTree.ParseError as e:
        raise batchelor.BatchelorException(
            "xml-parser could not parse output of qstat -xml -j {0}: {1}"
            .format(jobID, e))

    for child in root[0]:
        for task in child.findall('JB_ja_tasks'):
            for sublist in task.findall('ulong_sublist'):
                task_number = sublist.findall('JAT_task_number')
                if not task_number:
                    continue
                task_number = int(task_number[0].text)
                job_status.setStatus(JobStatus.kRunning)
                for usage_list in sublist.findall('JAT_scaled_usage_list'):
                    for scaled in usage_list.findall('scaled'):
                        name = scaled.findall('UA_name')[0].text
                        value = scaled.findall('UA_value')[0].text
                        if name == 'cpu':
                            job_status.setCpuTime(
                                float(value) / 3600.0, task_number)
                        elif name == 'vmem':
                            job_status.setMemoryUsage(
                                float(value) / (1024.0)**3, task_number)
Exemplo n.º 21
0
def getListOfJobStates(jobName, username=None, detailed=True):
    """Return the states of the current user's jobs as reported by ``llq``.

    Args:
        jobName: only jobs with exactly this name are returned; ``None``
            returns every job found in the ``llq`` output.
        username: not used; the query is always made for the user returned
            by ``whoami``.
        detailed: if true, ``llq`` is additionally called with ``-x``
            (presumably required for the memory / CPU time lines; confirm
            against the llq documentation).

    Returns:
        A list of JobStatus objects for the matching jobs.

    Raises:
        batchelor.BatchelorException: if ``llq`` fails or its output cannot
            be parsed.
    """
    if detailed:
        command = "llq -u `whoami` -m -x"
    else:
        command = "llq -u `whoami` -m"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("llq failed (stderr: '" + stderr +
                                           "')")

    # (line prefix, description used in error messages)
    knownFields = (("Job Name: ", "job name"),
                   ("Step Virtual Memory: ", "virtual memory"),
                   ("Status: ", "status"),
                   ("Step User Time: ", "user time"))

    jobStates = []
    currentJobId = -1
    currentJobStatus = None
    for line in stdout.split('\n'):
        line = line.rstrip('\n')
        if line.startswith("===== Job Step mgmt."):
            try:
                # the job id is the text between the first and the last dot
                currentJobId = int(line[line.find(".") + 1:line.rfind(".")])
                currentJobStatus = JobStatus(currentJobId)
            except ValueError:
                raise batchelor.BatchelorException(
                    "parsing of llq output to get job id failed.")

        # collapse runs of whitespace so the prefixes below match reliably
        line = ' '.join(line.split())

        prefix = None
        description = None
        for candidate, text in knownFields:
            if line.startswith(candidate):
                prefix, description = candidate, text
                break
        if prefix is None:
            continue

        if currentJobId < 0:
            raise batchelor.BatchelorException(
                "parsing of llq output failed, got {0} before job id.".format(
                    description))

        # Cut the prefix off by length. str.lstrip(prefix) would strip a
        # *set of characters* and e.g. turn "Submission Error" into
        # "bmission Error".
        value = line[len(prefix):]

        if description == "job name":
            if jobName is None or value == jobName:
                jobStates.append(currentJobStatus)

        elif description == "virtual memory":
            try:
                parsed = value.split()
                memory = float(parsed[0]) * _kMemoryUnits[parsed[1]]
            except (ValueError, IndexError, KeyError):
                raise batchelor.BatchelorException(
                    "parsing of llq output to get virtual memory failed.")
            currentJobStatus.setMemoryUsage(memory, 0)

        elif description == "status":
            # default: unknown status that still carries the raw llq text
            currentJobStatus.setStatus(JobStatus.kUnknown, name=value)
            if value == 'Running':
                currentJobStatus.setStatus(JobStatus.kRunning)
            elif value in ('I', 'Idle', 'Pending'):
                currentJobStatus.setStatus(JobStatus.kWaiting)
            elif value in ('Submission Error', 'Terminated', 'Removed',
                           'Remove Pending'):
                currentJobStatus.setStatus(JobStatus.kError)

        else:  # user time, formatted as hours:minutes:seconds
            parts = value.split(':')
            try:
                hours = float(parts[0])
                minutes = float(parts[1])
                seconds = float(parts[2])
            except (ValueError, IndexError):
                raise batchelor.BatchelorException(
                    "parsing of llq output to get user time failed.")
            currentJobStatus.setCpuTime(
                hours + minutes / 60.0 + seconds / 3600.0, 0)

    return jobStates
Exemplo n.º 22
0
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None,
              priority=None,
              ompNumThreads=None):
    """Submit ``command`` via ``qsub`` and return the id of the new job.

    The configured header file is copied to a temporary script, ``command``
    is appended to it and the script is fed to ``qsub`` on stdin. The
    temporary script is removed afterwards, also when submission fails.

    Args:
        config: configuration object; the options ``header_file``,
            ``memory``, ``arch`` and optionally ``shortqueue`` /
            ``longqueue`` are read from the section named by
            ``submoduleIdentifier()``.
        command: shell code appended to the header file.
        outputFile: path passed to ``qsub -o``; must not be located in the
            home folder.
        jobName: passed to ``qsub -N`` unless ``None``.
        wd: working directory (``qsub -wd``); ``/tmp/`` if empty. Must not
            be located in the home folder.
        arrayStart, arrayEnd, arrayStep: if ``arrayStart`` is not ``None``
            an array job ``-t start-end:step`` is submitted.
        priority: if truthy, mapped linearly to a ``qsub -p`` value and
            clamped from below at -1023 (the formula maps -1 -> -1024 and
            +1 -> 1024 before clamping). A priority of 0 adds no ``-p``.
        ompNumThreads: if not ``None``, exported as ``OMP_NUM_THREADS`` in
            the script and requested with ``-pe mt``.

    Returns:
        The job id as an int.

    Raises:
        batchelor.BatchelorException: if a path lies in the home folder,
            ``qsub`` fails, or the job id cannot be parsed from its output.
    """

    # some checks of the job-settings
    home = os.path.realpath(os.path.expanduser('~'))
    if wd and home in os.path.realpath(wd):
        raise batchelor.BatchelorException(
            "The given working-directory is in your home-folder which is no allowed at E18: '{0}'"
            .format(wd))

    if home in os.path.realpath(outputFile):
        raise batchelor.BatchelorException(
            "The given output-file is in your home-folder which is no allowed at E18: '{0}'"
            .format(outputFile))

    if priority:
        priority = max(int(-1024 + 2048 * (priority + 1.0) / 2.0), -1023)

    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(
            config.get(submoduleIdentifier(), "header_file")) + " " +
                             fileName)
        with open(fileName, 'a') as scriptFile:
            if ompNumThreads is not None:
                scriptFile.write(
                    "export OMP_NUM_THREADS={0}\n".format(ompNumThreads))
            scriptFile.write(command)

        enabled = [1, "1", "TRUE", "true", "True"]
        section = submoduleIdentifier()

        cmnd = "qsub "
        cmnd += "-j y "
        cmnd += "-b no "
        cmnd += "-m n "
        cmnd += "" if jobName is None else ("-N " + jobName + " ")
        if arrayStart is not None:
            cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(
                arrayStep) + " "
        cmnd += "-o '" + outputFile + "' "
        cmnd += "-wd '" + ("/tmp/" if not wd else wd) + "' "
        if config.has_option(section, "shortqueue") and config.get(
                section, "shortqueue") in enabled:
            cmnd += "-l short=1 "
        elif config.has_option(section, "longqueue") and config.get(
                section, "longqueue") in enabled:
            cmnd += "-l long=1 "
        else:
            cmnd += "-l medium=1 "
        cmnd += "-l h_pmem=" + config.get(section, "memory") + " "
        cmnd += "-l arch=" + config.get(section, "arch") + " "
        cmnd += _getExcludedHostsString(config)
        cmnd += "-p {0} ".format(priority) if priority else ""
        cmnd += "-pe mt {0} ".format(
            ompNumThreads) if ompNumThreads is not None else ""
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # Always clean up the temporary script, including on the error
        # paths. Like the former "rm -f", a failure to delete is ignored.
        try:
            os.remove(fileName)
        except OSError:
            pass

    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr +
                                           "')")

    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # For array jobs the text starts with "Your job-array " and the id is
    # taken up to the first '.'.
    if arrayStart is not None:
        prefix, terminator = "Your job-array ", '.'
    else:
        prefix, terminator = "Your job ", ' '
    # Locate the literal prefix; str.lstrip(prefix) would strip a set of
    # characters rather than the prefix itself.
    start = stdout.find(prefix)
    if start < 0:
        raise batchelor.BatchelorException(
            'parsing of qsub output to get job id failed.')
    jobId = stdout[start + len(prefix):].split(terminator, 1)[0]
    try:
        jobId = int(jobId)
    except ValueError:
        raise batchelor.BatchelorException(
            'parsing of qsub output to get job id failed.')
    return jobId
Exemplo n.º 23
0
def getListOfErrorJobs(jobName=None):
    """Placeholder that always fails.

    Args:
        jobName: ignored.

    Raises:
        batchelor.BatchelorException: unconditionally, with the message
            "not implemented".
    """
    message = "not implemented"
    raise batchelor.BatchelorException(message)