示例#1
0
def submitJob(config, command, outputFile, jobName, arrayStart = None, arrayEnd = None, arrayStep = None):
	"""Submit a job script through qsub and return its job id.

	A temporary script is created by copying the configured "header_file"
	and appending `command`; the script is passed to qsub on stdin.

	Parameters:
	config     -- configuration object; the options "header_file", "project",
	              "queue" and "memory" are read from the section named by
	              submoduleIdentifier()
	command    -- text appended to the copied header file
	outputFile -- value of qsub's -o option
	jobName    -- value of qsub's -N option; the option is omitted for None
	arrayStart, arrayEnd, arrayStep -- if arrayStart is not None, the option
	              "-t <arrayStart>-<arrayEnd>:<arrayStep>" is added

	Returns the job id as int.
	Raises batchelor.BatchelorException if qsub returns a non-zero exit code
	or no job id can be found in its output.
	"""
	# function-scope import so that the block does not depend on the
	# module's import list
	import re

	(fileDescriptor, fileName) = tempfile.mkstemp()
	os.close(fileDescriptor)
	try:
		batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
		with open(fileName, 'a') as scriptFile:
			scriptFile.write(command)
		cmnd = "qsub "
		cmnd += "-j y "
		cmnd += "" if jobName is None else ("-N " + jobName + " ")
		if arrayStart is not None:
			cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + " "
		cmnd += "-o " + outputFile + " "
		cmnd += "-P " + config.get(submoduleIdentifier(), "project") + " "
		cmnd += "-q " + config.get(submoduleIdentifier(), "queue") + " "
		cmnd += "-l h_vmem=" + config.get(submoduleIdentifier(), "memory") + " "
		cmnd += _getExcludedHostsString(config)
		cmnd += "< " + fileName
		(returncode, stdout, stderr) = batchelor.runCommand(cmnd)
	finally:
		# the temporary script is removed on every path, also when the
		# submission fails
		batchelor.runCommand("rm -f " + fileName)
	if returncode != 0:
		raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
	# example output: "Your job 1601905 ("J2415c980b8") has been submitted"
	# str.lstrip() removes a set of characters, not a prefix, so the id is
	# extracted with an explicit pattern instead
	match = re.search(r"Your job(?:-array)? (\d+)", stdout)
	if match is None:
		raise batchelor.BatchelorException('parsing of qsub output to get job id failed.')
	return int(match.group(1))
示例#2
0
def submitJob(config, command, outputFile, jobName, wd=None):
    """Submit a job script through sbatch and return its job id.

    If the option "max_active_jobs" is configured, the call blocks and polls
    getListOfActiveJobs(None) every 90 seconds until fewer jobs than that
    limit are active.

    The submitted script consists of "#SBATCH" directives, the lines of the
    configured "header_file" (lines starting with "#!" are dropped) and
    finally `command`.

    Parameters:
    config     -- configuration object; "header_file", "wall_clock_limit",
                  "memory" and optionally "max_active_jobs" are read from the
                  section named by submoduleIdentifier()
    command    -- text written at the end of the script
    outputFile -- value of the "#SBATCH -o" directive
    jobName    -- value of the "#SBATCH -J" directive; omitted for None
    wd         -- value of the "#SBATCH -D" directive; defaults to os.getcwd()

    Returns the job id as int.
    Raises batchelor.BatchelorException if sbatch returns a non-zero exit
    code or its output cannot be parsed.
    """
    # check if only a certain amount of active jobs is allowed
    if config.has_option(submoduleIdentifier(), "max_active_jobs"):
        max_active_jobs = int(
            config.get(submoduleIdentifier(), "max_active_jobs"))
        i = 0
        waitTime = 90  # seconds, i.e. 1.5 min between two polls
        while len(getListOfActiveJobs(None)) >= max_active_jobs:
            if i == 0:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
            time.sleep(waitTime)
            i += 1
        if i > 0:
            sys.stdout.write("\r")

    if wd is None:
        wd = os.getcwd()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        headerFileName = batchelor._getRealPath(
            config.get(submoduleIdentifier(), "header_file"))
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#SBATCH -D " + wd + "\n")
            tempFile.write("#SBATCH -o " + outputFile + "\n")
            tempFile.write("#SBATCH --time=" +
                           config.get(submoduleIdentifier(), "wall_clock_limit") +
                           "\n")
            tempFile.write("#SBATCH --mem=" +
                           config.get(submoduleIdentifier(), "memory") + "\n")
            if jobName is not None:
                tempFile.write("#SBATCH -J " + jobName + "\n")
            tempFile.write("#SBATCH --get-user-env \n")
            tempFile.write("#SBATCH --export=NONE \n")
            tempFile.write("#SBATCH --clusters=serial \n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already starts with its own interpreter line
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        cmnd = "sbatch " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # remove the temporary script even if writing it or sbatch failed
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" +
                                           stderr + "')")
    # the job id is the fourth whitespace-separated token of sbatch's stdout;
    # output with fewer tokens is reported as a parsing failure as well
    try:
        jobId = int(stdout.split()[3])
    except (ValueError, IndexError):
        raise batchelor.BatchelorException(
            'parsing output of sbatch to get job id failed.')
    return jobId
示例#3
0
def submitJob(config, command, outputFile, jobName, wd = None, arrayStart = None, arrayEnd = None, arrayStep = None, priority=None, ompNumThreads=None):
	"""Submit a job script through qsub and return its job id.

	A temporary script is created by copying the configured "header_file",
	optionally appending an "export OMP_NUM_THREADS=..." line, and appending
	`command`; the script is passed to qsub on stdin.

	Parameters:
	config        -- configuration object; "header_file", "memory", "arch" and
	                 optionally "shortqueue" / "longqueue" are read from the
	                 section named by submoduleIdentifier()
	command       -- text appended to the script
	outputFile    -- value of qsub's -o option; must not be inside the home
	                 directory
	jobName       -- value of qsub's -N option; omitted for None
	wd            -- value of qsub's -wd option ("/tmp/" if not given); must
	                 not be inside the home directory
	arrayStart, arrayEnd, arrayStep -- if arrayStart is not None, the option
	                 "-t <arrayStart>-<arrayEnd>:<arrayStep>" is added
	priority      -- if truthy, converted with
	                 int(-1024 + 2048 * (priority + 1) / 2), limited to at
	                 least -1023, and passed to qsub's -p option
	                 (presumably expected in [-1, 1] -- TODO confirm)
	ompNumThreads -- if not None, exported as OMP_NUM_THREADS in the script
	                 and passed as "-pe mt <ompNumThreads>"

	Returns the job id as int.
	Raises batchelor.BatchelorException if wd or outputFile lie in the home
	directory, if qsub returns a non-zero exit code, or if no job id can be
	found in its output.
	"""
	# function-scope import so that the block does not depend on the
	# module's import list
	import re

	# some checks of the job-settings
	homeDir = os.path.realpath(os.path.expanduser('~'))

	def isInsideHome(path):
		# compare complete leading path components; a plain substring test
		# would also reject e.g. "/home/user2" for the home "/home/user"
		realPath = os.path.realpath(path)
		return realPath == homeDir or realPath.startswith(homeDir.rstrip(os.sep) + os.sep)

	if wd and isInsideHome(wd):
		raise batchelor.BatchelorException("The given working-directory is in your home-folder which is no allowed at E18: '{0}'".format(wd))

	if isInsideHome(outputFile):
		raise batchelor.BatchelorException("The given output-file is in your home-folder which is no allowed at E18: '{0}'".format(outputFile))

	if priority:
		priority = max(int(-1024 + 2048 * (priority+1.0)/2.0), -1023)

	(fileDescriptor, fileName) = tempfile.mkstemp()
	os.close(fileDescriptor)
	try:
		batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
		with open(fileName, 'a') as scriptFile:
			if ompNumThreads is not None:
				scriptFile.write("export OMP_NUM_THREADS={0}\n".format(ompNumThreads))
			scriptFile.write(command)
		cmnd = "qsub "
		cmnd += "-j y "
		cmnd += "-b no "
		cmnd += "-m n "
		cmnd += "" if jobName is None else ("-N " + jobName + " ")
		if arrayStart is not None:
			cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + " "
		cmnd += "-o '" + outputFile + "' "
		cmnd += "-wd '" + ("/tmp/" if not wd else wd) + "' "
		# queue selection: "shortqueue" wins over "longqueue", medium is the default
		if config.has_option(submoduleIdentifier(), "shortqueue") and config.get(submoduleIdentifier(), "shortqueue") in [1, "1", "TRUE", "true", "True"]:
			cmnd += "-l short=1 "
		elif config.has_option(submoduleIdentifier(), "longqueue") and config.get(submoduleIdentifier(), "longqueue") in [1, "1", "TRUE", "true", "True"]:
			cmnd += "-l long=1 "
		else:
			cmnd += "-l medium=1 "
		cmnd += "-l h_pmem=" + config.get(submoduleIdentifier(), "memory") + " "
		cmnd += "-l arch=" + config.get(submoduleIdentifier(), "arch") + " "
		cmnd += _getExcludedHostsString(config)
		cmnd += "-p {0} ".format(priority) if priority else ""
		cmnd += "-pe mt {0} ".format(ompNumThreads) if ompNumThreads is not None else ""
		cmnd += "< " + fileName
		(returncode, stdout, stderr) = batchelor.runCommand(cmnd)
	finally:
		# the temporary script is removed on every path, also when the
		# submission fails
		batchelor.runCommand("rm -f " + fileName)
	if returncode != 0:
		raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
	# example output: "Your job 1601905 ("J2415c980b8") has been submitted"
	# str.lstrip() removes a set of characters, not a prefix, so the id is
	# extracted with an explicit pattern instead
	match = re.search(r"Your job(?:-array)? (\d+)", stdout)
	if match is None:
		raise batchelor.BatchelorException('parsing of qsub output to get job id failed.')
	return int(match.group(1))
示例#4
0
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None):
    """Submit a job script through bsub and return its job id.

    A temporary script is created by copying the configured "header_file"
    and appending `command`; the script is passed to bsub on stdin.

    Parameters:
    config     -- configuration object; "header_file", "queue", "type", "pool"
                  and optionally "memory" are read from the section named by
                  submoduleIdentifier()
    command    -- text appended to the copied header file
    outputFile -- value of bsub's -o option
    jobName    -- value of bsub's -J option; omitted for None
    wd         -- if given, passed to bsub's -cwd option
    arrayStart, arrayEnd, arrayStep -- if arrayStart is not None,
                  "[<arrayStart>-<arrayEnd>:<arrayStep>]" is appended to the
                  job name; a missing or empty job name is replaced by seven
                  random lowercase letters first

    Returns the job id as int.
    Raises batchelor.BatchelorException if bsub returns a non-zero exit code
    or no job id can be found in its output.
    """
    # function-scope import so that the block does not depend on the
    # module's import list
    import re

    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(
            config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        if arrayStart is not None:
            if not jobName:
                jobName = ''.join(random.sample(string.ascii_lowercase, 7))
            jobName = jobName + "[" + str(arrayStart) + "-" + str(
                arrayEnd) + ":" + str(arrayStep) + "]"
        cmnd = "bsub "
        # quoted, because the brackets of an array specification would
        # otherwise be subject to file-name expansion by the shell
        cmnd += "" if jobName is None else ("-J '" + jobName + "' ")
        cmnd += "-o " + outputFile + " "
        cmnd += "-q " + config.get(submoduleIdentifier(), "queue") + " "
        # -cwd is an option of its own and has to precede the quoted
        # resource-requirement string instead of being placed inside it
        cmnd += "-cwd '{0}' ".format(wd) if wd else ""
        cmnd += "-R '"
        cmnd += " select[type=" + config.get(submoduleIdentifier(), "type") + "]"
        cmnd += " rusage[pool=" + config.get(submoduleIdentifier(), "pool") + "]"
        try:
            memory = config.get(submoduleIdentifier(), "memory")
            cmnd += " rusage[mem=" + memory + "]"
            cmnd += " select[maxmem>" + memory + "]"
        except ConfigParser.NoOptionError:
            # "memory" is optional; without it no memory requirement is set
            pass
        # the returned string ends up inside the -R '...' argument
        cmnd += _getExcludedHostsString(config)
        cmnd += "' "
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # the temporary script is removed on every path, also when the
        # submission fails
        batchelor.runCommand('rm -f ' + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("bsub failed (stderr: '" + stderr +
                                           "')")
    # example output: Job <533476534> is submitted to queue <1nd>.
    # str.lstrip() removes a set of characters, not a prefix, so the id is
    # extracted with an explicit pattern instead
    match = re.search(r"Job <(\d+)>", stdout)
    if match is None:
        raise batchelor.BatchelorException(
            'parsing of bsub output to get job id failed.')
    return int(match.group(1))
示例#5
0
def getListOfActiveJobs(jobName):
    """Return the ids of the jobs listed by llq for the current user.

    Parameters:
    jobName -- None to list all jobs; otherwise only jobs whose
               "Job Name:" entry in the output of "llq -m" equals jobName
               are returned

    Returns a list of int job ids. For jobName None each id appears once.
    Raises batchelor.BatchelorException if llq returns a non-zero exit code
    or its output cannot be parsed.
    """
    if jobName is None:
        command = "llq -u `whoami`"
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("llq failed (stderr: '" +
                                               stderr + "')")
        if stdout == "llq: There is currently no job status to report.":
            return []
        # the first two and the last two lines are not job entries
        # (presumably table header and summary -- TODO confirm)
        stringList = [job.split()[0] for job in stdout.split('\n')[2:-2]]
        jobList = []
        try:
            for item in stringList:
                # the job id is the text between the first and the last '.'
                jobId = int(item[item.find(".") + 1:item.rfind(".")])
                if jobId not in jobList:
                    jobList.append(jobId)
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of llq output to get job id failed.")
        return jobList
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        command = "llq -u `whoami` -m &> " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("llq failed (stderr: '" +
                                               stderr + "')")
        jobList = []
        currentJobId = -1
        with open(fileName, 'r') as llqOutput:
            for line in llqOutput:
                # only strip the line break; slicing off the last character
                # would damage a final line that has no trailing newline
                line = line.rstrip('\n')
                if line.startswith("===== Job Step mgmt."):
                    try:
                        currentJobId = int(line[line.find(".") +
                                                1:line.rfind(".")])
                    except ValueError:
                        raise batchelor.BatchelorException(
                            "parsing of llq output to get job id failed.")
                # collapse runs of whitespace before matching the key
                line = ' '.join(line.split())
                if line.startswith("Job Name: "):
                    if currentJobId < 0:
                        raise batchelor.BatchelorException(
                            "parsing of llq output failed, got job name before job id."
                        )
                    name = line[10:]
                    if name == jobName:
                        jobList.append(currentJobId)
    finally:
        # single cleanup point for every exit path
        batchelor.runCommand("rm -f " + fileName)
    return jobList
def submitJob(config, command, outputFile, jobName, wd = None):
	"""Submit a job script through sbatch and return its job id.

	If the option "max_active_jobs" is configured, the call blocks and polls
	getListOfActiveJobs(None) every 90 seconds until fewer jobs than that
	limit are active.

	The submitted script consists of "#SBATCH" directives, the lines of the
	configured "header_file" (lines starting with "#!" are dropped) and
	finally `command`.

	Parameters:
	config     -- configuration object; "header_file", "wall_clock_limit",
	              "memory" and optionally "max_active_jobs" are read from the
	              section named by submoduleIdentifier()
	command    -- text written at the end of the script
	outputFile -- value of the "#SBATCH -o" directive
	jobName    -- value of the "#SBATCH -J" directive; omitted for None
	wd         -- value of the "#SBATCH -D" directive; defaults to os.getcwd()

	Returns the job id as int.
	Raises batchelor.BatchelorException if sbatch returns a non-zero exit
	code or its output cannot be parsed.
	"""
	# check if only a certain amount of active jobs is allowed
	if config.has_option(submoduleIdentifier(), "max_active_jobs"):
		max_active_jobs = int(config.get(submoduleIdentifier(), "max_active_jobs"))
		i = 0
		waitTime = 90  # seconds, i.e. 1.5 min between two polls
		while len(getListOfActiveJobs(None)) >= max_active_jobs:
			if i == 0:
				sys.stdout.write("Waiting for free slots")
				sys.stdout.flush()
			time.sleep(waitTime)
			i += 1
		if i > 0:
			sys.stdout.write("\r")

	if wd is None:
		wd = os.getcwd()
	(fileDescriptor, fileName) = tempfile.mkstemp()
	os.close(fileDescriptor)
	try:
		headerFileName = batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file"))
		with open(fileName, 'w') as tempFile:
			tempFile.write("#!/bin/bash\n\n")
			tempFile.write("#SBATCH -D " + wd + "\n")
			tempFile.write("#SBATCH -o " + outputFile + "\n")
			tempFile.write("#SBATCH --time=" + config.get(submoduleIdentifier(), "wall_clock_limit") + "\n")
			tempFile.write("#SBATCH --mem=" + config.get(submoduleIdentifier(), "memory") + "\n")
			if jobName is not None:
				tempFile.write("#SBATCH -J " + jobName + "\n")
			tempFile.write("#SBATCH --get-user-env \n")
			tempFile.write("#SBATCH --export=NONE \n")
			tempFile.write("#SBATCH --clusters=serial \n\n\n")
			with open(headerFileName, 'r') as headerFile:
				for line in headerFile:
					# the script already starts with its own interpreter line
					if line.startswith("#!"):
						continue
					tempFile.write(line)
			tempFile.write("\n\n")
			tempFile.write(command)
		cmnd = "sbatch " + fileName
		(returncode, stdout, stderr) = batchelor.runCommand(cmnd)
	finally:
		# remove the temporary script even if writing it or sbatch failed
		batchelor.runCommand("rm -f " + fileName)
	if returncode != 0:
		raise batchelor.BatchelorException("sbatch failed (stderr: '" + stderr + "')")
	# the job id is the fourth whitespace-separated token of sbatch's stdout;
	# output with fewer tokens is reported as a parsing failure as well
	try:
		jobId = int(stdout.split()[3])
	except (ValueError, IndexError):
		raise batchelor.BatchelorException('parsing output of sbatch to get job id failed.')
	return jobId
def submitJob(config, command, outputFile, jobName, wd = None, arrayStart = None, arrayEnd = None, arrayStep = None):
	"""Submit a job through condor_submit and return its job id.

	Two files are created in the directory ".log" below the current working
	directory: an executable script (copy of the configured "header_file"
	followed by `command`) and a submit description file referring to it.
	Both files are registered for removal at interpreter exit.

	Parameters:
	config     -- configuration object; "header_file", "memory", "disk" and
	              "flavour" are read from the section named by
	              submoduleIdentifier()
	command    -- text appended to the copied header file
	outputFile -- if non-empty, its absolute path is used for "output", and
	              with the suffixes ".condor" and ".err" for "log" and "error"
	jobName    -- if non-empty, passed via "-batch-name"
	wd         -- if given, forwarded to batchelor.runCommand as keyword "wd"
	arrayStart, arrayEnd, arrayStep -- must all be None; array jobs are not
	              supported

	Returns the job id as int.
	Raises batchelor.BatchelorException for array jobs, for a submit directory
	containing a space, if condor_submit returns a non-zero exit code, or if
	its output cannot be parsed.
	"""
	if arrayStart is not None or arrayEnd is not None or arrayStep is not None:
		raise batchelor.BatchelorException("Array jobs are not (yet) implementet for CERNs HTCondor system")

	filesDir = os.path.join(os.getcwd(), '.log')
	if " " in filesDir:
		raise batchelor.BatchelorException("Cannot handle submit directories with whitespaces")

	if not os.path.exists(filesDir):
		os.makedirs(filesDir)
	(fileDescriptor, submitFileName) = tempfile.mkstemp(dir=filesDir, prefix='submitFiles_', suffix='.submit')
	os.close(fileDescriptor)
	atexit.register(lambda: os.remove( submitFileName ))
	(fileDescriptor, scriptFileName) = tempfile.mkstemp(dir=filesDir, prefix='scriptFiles_', suffix='.sh')
	os.close(fileDescriptor)
	atexit.register(lambda: os.remove( scriptFileName ))
	# mkstemp creates the file without execute permission
	os.chmod(scriptFileName, 0o755)

	batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + scriptFileName)
	with open(scriptFileName, 'a') as scriptFile:
		scriptFile.write(command)
	with open(submitFileName, 'w') as submitFile:
		submitFile.write("executable = {0}\n".format(scriptFileName))
		# test before calling abspath(): abspath("") yields the current
		# directory and would make the check always true
		if outputFile:
			outputFile = os.path.abspath(outputFile)
			submitFile.write("output = {0}\n".format(outputFile))
			submitFile.write("log = {0}.condor\n".format(outputFile))
			submitFile.write("error = {0}.err\n".format(outputFile))

		submitFile.write("should_transfer_files = NO\n") # Disable file transport
		submitFile.write("request_cpus  = 1\n")
		submitFile.write("request_memory = {0}\n".format(config.get(submoduleIdentifier(), "memory")))
		submitFile.write("request_disk = {0}\n".format(config.get(submoduleIdentifier(), "disk")))
		submitFile.write("+JobFlavour = \"{0}\"\n".format(config.get(submoduleIdentifier(), "flavour")))
		submitFile.write("queue 1\n")
	cmnd = "condor_submit '{0}'".format(submitFileName)
	if jobName:
		cmnd += " -batch-name {0} ".format(jobName)
	kwargs = {}
	if wd:
		kwargs['wd'] = wd
	(returncode, stdout, stderr) = batchelor.runCommand(cmnd, **kwargs)
	if returncode != 0:
		raise batchelor.BatchelorException("condor_submit failed (stderr: '" + stderr + "')")
	# the job id is the sixth token of the second output line, without the
	# trailing '.'; shorter output is reported as a parsing failure as well
	try:
		jobId = int(stdout.split('\n')[1].split()[5].rstrip("."))
	except (ValueError, IndexError):
		raise batchelor.BatchelorException('parsing of condor_submit output to get job id failed.')
	return jobId
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None):
    """Submit a job script through qsub and return its job id.

    A temporary script is created by copying the configured "header_file"
    and appending `command`; the script is passed to qsub on stdin.

    Parameters:
    config     -- configuration object; the options "header_file", "project",
                  "queue" and "memory" are read from the section named by
                  submoduleIdentifier()
    command    -- text appended to the copied header file
    outputFile -- value of qsub's -o option
    jobName    -- value of qsub's -N option; the option is omitted for None
    wd         -- not supported; any truthy value raises an exception
    arrayStart, arrayEnd, arrayStep -- if arrayStart is not None, the option
                  "-t <arrayStart>-<arrayEnd>:<arrayStep>" is added

    Returns the job id as int.
    Raises batchelor.BatchelorException if wd is given, if qsub returns a
    non-zero exit code, or if no job id can be found in its output.
    """
    # function-scope import so that the block does not depend on the
    # module's import list
    import re

    if wd:
        raise batchelor.BatchelorException(
            "Choosing the working directory is not jet implemented for {0}".
            format(submoduleIdentifier()))

    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(
            config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        cmnd = "qsub "
        cmnd += "-j y "
        cmnd += "" if jobName is None else ("-N " + jobName + " ")
        if arrayStart is not None:
            cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(
                arrayStep) + " "
        cmnd += "-o " + outputFile + " "
        cmnd += "-P " + config.get(submoduleIdentifier(), "project") + " "
        cmnd += "-q " + config.get(submoduleIdentifier(), "queue") + " "
        cmnd += "-l h_vmem=" + config.get(submoduleIdentifier(), "memory") + " "
        cmnd += _getExcludedHostsString(config)
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # the temporary script is removed on every path, also when the
        # submission fails
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr +
                                           "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # str.lstrip() removes a set of characters, not a prefix, so the id is
    # extracted with an explicit pattern instead
    match = re.search(r"Your job(?:-array)? (\d+)", stdout)
    if match is None:
        raise batchelor.BatchelorException(
            'parsing of qsub output to get job id failed.')
    return int(match.group(1))
def getListOfActiveJobs(jobName):
    """Return the ids of the jobs reported by qstat.

    Parameters:
    jobName -- None to take the ids from the plain "qstat" listing; otherwise
               the ids are read from the XML output of
               "qstat -xml -j <jobName>"

    Returns a list of int job ids; an empty list if qstat prints nothing or
    reports that the named job does not exist.
    Raises batchelor.BatchelorException if qstat fails for another reason or
    its output cannot be parsed.
    """
    if jobName is None:
        command = "qstat"
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("qstat failed (stderr: '" +
                                               stderr + "')")
        if stdout == "":
            return []
        # the first two lines are skipped (presumably the table header --
        # TODO confirm); blank lines carry no job entry and are ignored
        jobList = stdout.split('\n')[2:]
        try:
            return [int(job.split()[0]) for job in jobList if job.strip()]
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of qstat output to get job id failed.")
    command = "qstat -j " + jobName
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        # the first stderr line, without its last character, is compared to
        # the message qstat prints for unknown jobs
        if stderr.split(
                '\n'
        )[0][:
             -1] == "Following jobs do not exist or permissions are not sufficient:":
            return []
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr +
                                           "')")
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        command = "qstat -xml -j " + jobName + " > " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr +
                                               "')")
        # split the output at every XML declaration into separate files
        # named "<fileName><n>.awkOut"
        batchelor.runCommand(
            "awk '/<\?xml version='\"'\"'1.0'\"'\"'\?>/{n++}{print >\"" +
            fileName + "\" n \".awkOut\" }' " + fileName)
    finally:
        # also remove the temporary file when qstat failed
        batchelor.runCommand("rm -f " + fileName)
    xmlFiles = glob.glob(fileName + "*.awkOut")
    jobIds = []
    for xmlFile in xmlFiles:
        try:
            tree = ElementTree.parse(xmlFile)
        finally:
            # the split file is not needed any more, whether or not it parsed
            batchelor.runCommand("rm -f " + xmlFile)
        root = tree.getroot()
        for child in root[0]:
            jobIdList = child.findall("JB_job_number")
            if len(jobIdList) != 1:
                raise batchelor.BatchelorException(
                    "parsing xml from qstat failed")
            try:
                jobId = int(jobIdList[0].text)
            except ValueError:
                raise batchelor.BatchelorException(
                    "parsing int from xml from qstat failed")
            jobIds.append(jobId)
    return jobIds
示例#10
0
def getListOfJobStates(jobName, username=None, detailed=True):
    """Return JobStatus objects for the current user's jobs listed by squeue.

    Parameters:
    jobName  -- only jobs with this name are returned; None returns all jobs
    username -- not used by this implementation
    detailed -- not used by this implementation

    Every output line is split at whitespace; field 0 is taken as job id,
    field 2 as job name, field 4 as state and field 5 as elapsed time in the
    form [days-][hours:]minutes:seconds. The time is stored in hours via
    setCpuTime(hours, 0).

    Returns a list of JobStatus objects.
    Raises batchelor.BatchelorException if squeue returns a non-zero exit
    code or its output cannot be parsed.
    """
    command = "squeue --clusters=serial -u $(whoami) -l -h"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("squeue failed (stderr: '" +
                                           stderr + "')")
    jobStates = []
    for line in stdout.split('\n'):
        if line.startswith("CLUSTER: serial"):
            continue
        lineSplit = line.split()
        # blank lines (e.g. empty output) contain no job entry
        if not lineSplit:
            continue
        try:
            currentJobId = int(lineSplit[0])
            currentJobStatus = JobStatus(currentJobId)

            # name
            name = lineSplit[2]
            if name == jobName or jobName is None:
                jobStates.append(currentJobStatus)

            # status
            status = lineSplit[4]
            currentJobStatus.setStatus(JobStatus.kUnknown, name=status)
            if status == 'RUNNING':
                currentJobStatus.setStatus(JobStatus.kRunning)
            elif status in ('PENDING', 'SUSPENDED', 'COMPLETING', 'COMPLETED', 'COMPLETI'):
                currentJobStatus.setStatus(JobStatus.kWaiting)
            elif status in ('CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL'):
                currentJobStatus.setStatus(JobStatus.kError)
            else:
                # single string argument: same output with the print
                # statement and the print function
                print("Unknown job status " + status)

            # time
            time_str = lineSplit[5]
            try:
                hours = 0.0
                if '-' in time_str:
                    time_str = time_str.split('-')
                    hours += float(time_str[0]) * 24
                    time_str = time_str[1].split(':')
                else:
                    time_str = time_str.split(':')
                seconds = float(time_str[-1])
                minutes = float(time_str[-2])
                if (len(time_str) > 2):
                    hours += float(time_str[-3])
                total_time = hours + minutes / 60.0 + seconds / 3600.0
                currentJobStatus.setCpuTime(total_time, 0)
            except (ValueError, IndexError):
                # IndexError: time field without any ':' separator
                raise batchelor.BatchelorException(
                    "parsing of squeue output to get time information failed. ({0})"
                    .format(lineSplit[5]))
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of squeue output to get job id failed.")

    return jobStates
def resetErrorJobs(jobName):
	"""Clear the error state of all jobs returned by getListOfErrorJobs(jobName).

	"qmod -cj <id>" is run once per job.

	Returns True.
	Raises batchelor.BatchelorException as soon as the output of qmod does
	not contain 'cleared error state of job'.
	"""
	for jobId in getListOfErrorJobs(jobName):
		command = "qmod -cj " + str(jobId)
		(returncode, stdout, stderr) = batchelor.runCommand(command)
		# membership test instead of comparing find()'s result by identity
		if 'cleared error state of job' not in stdout:
			raise batchelor.BatchelorException("qmod failed (stderr: '" + stderr + "')")
	return True
示例#12
0
def resetErrorJobs(jobName):
    """Clear the error state of all jobs returned by getListOfErrorJobs(jobName).

    "qmod -cj <id>" is run once per job.

    Returns True.
    Raises batchelor.BatchelorException as soon as the output of qmod does
    not contain 'cleared error state of job'.
    """
    for jobId in getListOfErrorJobs(jobName):
        command = "qmod -cj " + str(jobId)
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        # membership test instead of comparing find()'s result by identity
        if 'cleared error state of job' not in stdout:
            raise batchelor.BatchelorException("qmod failed (stderr: '" +
                                               stderr + "')")
    return True
示例#13
0
def getListOfActiveJobs(jobName):
	"""Return the ids of the jobs listed by llq for the current user.

	Parameters:
	jobName -- None to list all jobs; otherwise only jobs whose
	           "Job Name:" entry in the output of "llq -m" equals jobName
	           are returned

	Returns a list of int job ids. For jobName None each id appears once.
	Raises batchelor.BatchelorException if llq returns a non-zero exit code
	or its output cannot be parsed.
	"""
	if jobName is None:
		command = "llq -u `whoami`"
		(returncode, stdout, stderr) = batchelor.runCommand(command)
		if returncode != 0:
			raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")
		if stdout == "llq: There is currently no job status to report.":
			return []
		# the first two and the last two lines are not job entries
		# (presumably table header and summary -- TODO confirm)
		stringList = [ job.split()[0] for job in stdout.split('\n')[2:-2] ]
		jobList = []
		try:
			for item in stringList:
				# the job id is the text between the first and the last '.'
				jobId = int(item[item.find(".")+1:item.rfind(".")])
				if jobId not in jobList:
					jobList.append(jobId)
		except ValueError:
			raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
		return jobList
	(fileDescriptor, fileName) = tempfile.mkstemp()
	os.close(fileDescriptor)
	try:
		command = "llq -u `whoami` -m &> " + fileName
		(returncode, stdout, stderr) = batchelor.runCommand(command)
		if returncode != 0:
			raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")
		jobList = []
		currentJobId = -1
		with open(fileName, 'r') as llqOutput:
			for line in llqOutput:
				# only strip the line break; slicing off the last character
				# would damage a final line that has no trailing newline
				line = line.rstrip('\n')
				if line.startswith("===== Job Step mgmt."):
					try:
						currentJobId = int(line[line.find(".")+1:line.rfind(".")])
					except ValueError:
						raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
				# collapse runs of whitespace before matching the key
				line = ' '.join(line.split())
				if line.startswith("Job Name: "):
					if currentJobId < 0:
						raise batchelor.BatchelorException("parsing of llq output failed, got job name before job id.")
					name = line[10:]
					if name == jobName:
						jobList.append(currentJobId)
	finally:
		# single cleanup point for every exit path
		batchelor.runCommand("rm -f " + fileName)
	return jobList
def getListOfJobStates(jobName, username = None, detailed = True):
	"""Return JobStatus objects for the current user's jobs listed by squeue.

	Parameters:
	jobName  -- only jobs with this name are returned; None returns all jobs
	username -- not used by this implementation
	detailed -- not used by this implementation

	Every output line is split at whitespace; field 0 is taken as job id,
	field 2 as job name, field 4 as state and field 5 as elapsed time in the
	form [days-][hours:]minutes:seconds. The time is stored in hours via
	setCpuTime(hours, 0).

	Returns a list of JobStatus objects.
	Raises batchelor.BatchelorException if squeue returns a non-zero exit
	code or its output cannot be parsed.
	"""
	command = "squeue --clusters=serial -u $(whoami) -l -h"
	(returncode, stdout, stderr) = batchelor.runCommand(command)
	if returncode != 0:
		raise batchelor.BatchelorException("squeue failed (stderr: '" + stderr + "')")
	jobStates = []
	for line in stdout.split('\n'):
		if line.startswith("CLUSTER: serial"):
			continue
		lineSplit = line.split()
		# blank lines (e.g. empty output) contain no job entry
		if not lineSplit:
			continue
		try:
			currentJobId = int(lineSplit[0])
			currentJobStatus = JobStatus(currentJobId)

			# name
			name = lineSplit[2]
			if name == jobName or jobName is None:
				jobStates.append(currentJobStatus)

			# status
			status = lineSplit[4]
			currentJobStatus.setStatus(JobStatus.kUnknown, name = status)
			if status == 'RUNNING':
				currentJobStatus.setStatus(JobStatus.kRunning)
			elif status in ('PENDING', 'SUSPENDED', 'COMPLETING', 'COMPLETED', 'COMPLETI'):
				currentJobStatus.setStatus(JobStatus.kWaiting)
			elif status in ('CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL'):
				currentJobStatus.setStatus(JobStatus.kError)
			else:
				# single string argument: same output with the print
				# statement and the print function
				print("Unknown job status " + status)

			# time
			time_str = lineSplit[5]
			try:
				hours = 0.0
				if '-' in time_str:
					time_str = time_str.split('-')
					hours += float(time_str[0])*24
					time_str = time_str[1].split(':')
				else:
					time_str = time_str.split(':')
				seconds = float(time_str[-1])
				minutes = float(time_str[-2])
				if len(time_str) > 2:
					hours += float(time_str[-3])
				total_time = hours + minutes / 60.0 + seconds / 3600.0
				currentJobStatus.setCpuTime(total_time, 0)
			except (ValueError, IndexError):
				# IndexError: time field without any ':' separator
				raise batchelor.BatchelorException("parsing of squeue output to get time information failed. ({0})".format(lineSplit[5]))
		except ValueError:
			raise batchelor.BatchelorException("parsing of squeue output to get job id failed.")

	return jobStates
示例#15
0
def deleteJobs(jobIds):
	"""Cancel the given jobs with one llcancel call.

	Every id is passed to llcancel as "mgmt.<id>". Nothing is executed for
	an empty or missing id list.

	Returns True.
	Raises batchelor.BatchelorException if llcancel returns a non-zero
	exit code.
	"""
	if jobIds:
		targets = "".join(" mgmt." + str(jobId) for jobId in jobIds)
		(returncode, stdout, stderr) = batchelor.runCommand("llcancel" + targets)
		if returncode != 0:
			raise batchelor.BatchelorException("llcancel failed (stderr: '" + stderr + "')")
	return True
示例#16
0
def deleteJobs(jobIds):
	"""Cancel the given jobs with one llcancel call.

	Every id is passed to llcancel as "mgmt.<id>". Nothing is executed for
	an empty or missing id list.

	Returns True.
	Raises batchelor.BatchelorException if llcancel returns a non-zero
	exit code.
	"""
	if jobIds:
		targets = "".join(" mgmt." + str(jobId) for jobId in jobIds)
		(returncode, stdout, stderr) = batchelor.runCommand("llcancel" + targets)
		if returncode != 0:
			raise batchelor.BatchelorException("llcancel failed (stderr: '" + stderr + "')")
	return True
def submitJob(config, command, outputFile, jobName, wd = None):
	"""Submit a job script through llsubmit and return its job id.

	The submitted script consists of "#@" directives, the lines of the
	configured "header_file" (lines starting with "#!" are dropped), the
	line "exec 2>&1" and finally `command`. It is passed to llsubmit on
	stdin.

	Parameters:
	config     -- configuration object; "header_file", "group",
	              "notification", "notify_user", "node_usage",
	              "wall_clock_limit", "resources" and "job_type" are read
	              from the section named by submoduleIdentifier()
	command    -- text written at the end of the script
	outputFile -- used for both the "output" and the "error" directive
	jobName    -- value of the "job_name" directive; omitted for None
	wd         -- not supported; any truthy value raises an exception

	Returns the job id as int.
	Raises batchelor.BatchelorException if wd is given, if llsubmit returns
	a non-zero exit code, or if its output cannot be parsed.
	"""
	if wd:
		raise batchelor.BatchelorException("Choosing the working directory is not jet implemented for {0}".format(submoduleIdentifier()))

	(fileDescriptor, fileName) = tempfile.mkstemp()
	os.close(fileDescriptor)
	try:
		headerFileName = batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file"))
		with open(fileName, 'w') as tempFile:
			tempFile.write("#!/bin/bash\n\n")
			tempFile.write("#@ group = " + config.get(submoduleIdentifier(), "group") + "\n")
			tempFile.write("#@ output = " + outputFile + "\n")
			tempFile.write("#@ error = " + outputFile + "\n")
			tempFile.write("#@ notification = " + config.get(submoduleIdentifier(), "notification") + "\n")
			tempFile.write("#@ notify_user = " + config.get(submoduleIdentifier(), "notify_user") + "\n")
			tempFile.write("#@ node_usage = " + config.get(submoduleIdentifier(), "node_usage") + "\n")
			tempFile.write("#@ wall_clock_limit = " + config.get(submoduleIdentifier(), "wall_clock_limit") + "\n")
			tempFile.write("#@ resources = " + config.get(submoduleIdentifier(), "resources") + "\n")
			tempFile.write("#@ job_type = " + config.get(submoduleIdentifier(), "job_type") + "\n")
			# NOTE(review): "class" is filled from the "job_type" option as
			# well -- confirm that this is intended
			tempFile.write("#@ class = " + config.get(submoduleIdentifier(), "job_type") + "\n")
			if jobName is not None:
				tempFile.write("#@ job_name = " + jobName + "\n")
			tempFile.write("#@ queue\n\n\n")
			with open(headerFileName, 'r') as headerFile:
				for line in headerFile:
					# the script already starts with its own interpreter line
					if line.startswith("#!"):
						continue
					tempFile.write(line)
			tempFile.write("\n\n")
			tempFile.write("exec 2>&1\n")
			tempFile.write("\n")
			tempFile.write(command)
		cmnd = "llsubmit - < " + fileName
		(returncode, stdout, stderr) = batchelor.runCommand(cmnd)
	finally:
		# single cleanup point for every exit path
		batchelor.runCommand("rm -f " + fileName)
	if returncode != 0:
		raise batchelor.BatchelorException("llsubmit failed (stderr: '" + stderr + "')")
	# example output stdout:
	# llsubmit: The job "mgmt.12309" has been submitted.
	#
	# example output stderr:
	#
	# llsubmit: Stdin job command file written to "/tmp/loadlx_stdin.27558.CdoVxX".
	#
	# INFO: Project: pr83mo
	# INFO: Project's Expiration Date:    2015-01-31
	# INFO: Budget:                     Total [cpuh]        Used [cpuh]      Credit [cpuh]
	# INFO:                                  1350000      1011028 (75%)       338972 (25%)
	#
	# llsubmit: Processed command file through Submit Filter: "/lrz/loadl/filter/submit_filter_c2pap.pl".
	jobId = stdout.split("\n")[0]
	# take the text between '"mgmt.' and the last '"' of the first line
	jobId = jobId[jobId.find('"mgmt.')+6:jobId.rfind('"')]
	try:
		jobId = int(jobId)
	except ValueError:
		raise batchelor.BatchelorException('parsing of llsubmit output to get job id failed.')
	return jobId
def deleteJobs(jobIds):
	"""Kill the given jobs with one bkill call.

	Nothing is executed for an empty or missing id list. A failing bkill
	whose stderr contains 'Job has already finished' is tolerated.

	Returns True.
	Raises batchelor.BatchelorException for any other bkill failure.
	"""
	if jobIds:
		killCommand = "bkill" + "".join(' ' + str(jobId) for jobId in jobIds)
		(returncode, stdout, stderr) = batchelor.runCommand(killCommand)
		if returncode != 0 and 'Job has already finished' not in stderr:
			raise batchelor.BatchelorException("bkill failed (stderr: '" + stderr + "')")
	return True
def deleteJobs(jobIds):
	"""Remove the given jobs with one condor_rm call.

	Nothing is executed for an empty or missing id list. A failing condor_rm
	whose stderr contains "Couldn't find/remove all jobs matching constraint"
	is tolerated.

	Returns True.
	Raises batchelor.BatchelorException for any other condor_rm failure.
	"""
	if jobIds:
		removeCommand = "condor_rm" + "".join(' ' + str(jobId) for jobId in jobIds)
		(returncode, stdout, stderr) = batchelor.runCommand(removeCommand)
		if returncode != 0 and 'Couldn\'t find/remove all jobs matching constraint' not in stderr:
			raise batchelor.BatchelorException("condor_rm failed (stderr: '" + stderr + "')")
	return True
示例#20
0
def deleteJobs(jobIds):
    """Remove the given jobs with one condor_rm call.

    Nothing is executed for an empty or missing id list. A failing condor_rm
    whose stderr contains "Couldn't find/remove all jobs matching constraint"
    is tolerated.

    Returns True.
    Raises batchelor.BatchelorException for any other condor_rm failure.
    """
    if jobIds:
        removeCommand = "condor_rm" + "".join(
            ' ' + str(jobId) for jobId in jobIds)
        (returncode, stdout, stderr) = batchelor.runCommand(removeCommand)
        if returncode != 0 and 'Couldn\'t find/remove all jobs matching constraint' not in stderr:
            raise batchelor.BatchelorException("condor_rm failed (stderr: '" +
                                               stderr + "')")
    return True
示例#21
0
def deleteJobs(jobIds):
    """Kill the given jobs with one bkill call.

    Nothing is executed for an empty or missing id list. A failing bkill
    whose stderr contains 'Job has already finished' is tolerated.

    Returns True.
    Raises batchelor.BatchelorException for any other bkill failure.
    """
    if jobIds:
        killCommand = "bkill" + "".join(' ' + str(jobId) for jobId in jobIds)
        (returncode, stdout, stderr) = batchelor.runCommand(killCommand)
        if returncode != 0 and 'Job has already finished' not in stderr:
            raise batchelor.BatchelorException("bkill failed (stderr: '" +
                                               stderr + "')")
    return True
def submitJob(config, command, outputFile, jobName, wd = None, arrayStart = None, arrayEnd = None, arrayStep = None):
	"""Submit `command` through `bsub` and return the job id as int.

	The configured header file is copied to a temporary script, `command`
	is appended to it and the script is fed to `bsub` on stdin.

	config: config object providing "header_file", "queue", "type", "pool"
	        and optionally "memory" in the section named by
	        submoduleIdentifier().
	command: text appended to the copied header file.
	outputFile: passed to `bsub -o`.
	jobName: passed to `bsub -J`; may be None. For array jobs a random
	         7-letter name is generated when it is None or empty.
	wd: optional working directory, passed to `bsub -cwd`.
	arrayStart, arrayEnd, arrayStep: when arrayStart is not None the job
	         name gets the suffix "[start-end:step]".

	Raises batchelor.BatchelorException if bsub fails or its output cannot
	be parsed. The temporary script is removed in every case.
	"""
	(fileDescriptor, fileName) = tempfile.mkstemp()
	os.close(fileDescriptor)
	try:
		batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
		with open(fileName, 'a') as scriptFile:
			scriptFile.write(command)
		if arrayStart is not None:
			if not jobName:
				# ascii_lowercase exists on Python 2 and 3 (string.lowercase is Python 2 only)
				jobName = ''.join(random.sample(string.ascii_lowercase, 7))
			jobName = jobName + "[" + str(arrayStart) + "-" +  str(arrayEnd) + ":" + str(arrayStep) + "]"
		cmnd = "bsub "
		cmnd += "" if jobName is None else ("-J " + jobName + " ")
		cmnd += "-o " + outputFile + " "
		cmnd += "-q " + config.get(submoduleIdentifier(), "queue") + " "
		# -cwd is an option of its own; it must not end up inside the quoted
		# resource-requirement string that follows -R
		cmnd += "-cwd '{0}' ".format(wd) if wd else ""
		cmnd += "-R '"
		cmnd += " select[type=" + config.get(submoduleIdentifier(), "type") + "]"
		cmnd += " rusage[pool=" + config.get(submoduleIdentifier(), "pool") + "]"
		try:
			cmnd += " rusage[mem=" + config.get(submoduleIdentifier(), "memory") + "]"
			cmnd += " select[maxmem>" + config.get(submoduleIdentifier(), "memory") + "]"
		except ConfigParser.NoOptionError:
			# "memory" is optional; without it no memory requirement is added
			pass
		cmnd += _getExcludedHostsString(config)
		cmnd += "' "
		cmnd += "< " + fileName
		(returncode, stdout, stderr) = batchelor.runCommand(cmnd)
	finally:
		batchelor.runCommand('rm -f ' + fileName)
	if returncode != 0:
		raise batchelor.BatchelorException("bsub failed (stderr: '" + stderr + "')")
	# example output: Job <533476534> is submitted to queue <1nd>.
	jobId = stdout.lstrip()
	if jobId.startswith("Job <"):
		jobId = jobId[len("Job <"):]
	jobId = jobId[:jobId.find(">")]
	try:
		jobId = int(jobId)
	except ValueError:
		raise batchelor.BatchelorException('parsing of bsub output to get job id failed.')
	return jobId
def getListOfActiveJobs(jobName):
	"""Return the ids of the jobs reported by `qstat`.

	jobName: None to take all jobs from the plain `qstat` listing,
	         otherwise a job name handed to `qstat -j` / `qstat -xml -j`.

	Returns a list of int job ids; empty when qstat prints nothing or
	reports that the named jobs do not exist.
	Raises batchelor.BatchelorException if qstat fails or its output cannot
	be parsed.
	"""
	if jobName is None:
		(returncode, stdout, stderr) = batchelor.runCommand("qstat")
		if returncode != 0:
			raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
		if stdout == "":
			return []
		# the first two output lines are skipped before reading the job ids
		jobList = stdout.split('\n')[2:]
		try:
			return [ int(job.split()[0]) for job in jobList ]
		except ValueError:
			raise batchelor.BatchelorException("parsing of qstat output to get job id failed.")
	command = "qstat -j " + jobName
	(returncode, stdout, stderr) = batchelor.runCommand(command)
	if returncode != 0:
		# the last character of the first stderr line is dropped before comparing
		if stderr.split('\n')[0][:-1] == "Following jobs do not exist:":
			return []
		raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
	(fileDescriptor, fileName) = tempfile.mkstemp()
	os.close(fileDescriptor)
	try:
		command = "qstat -xml -j " + jobName + " > " + fileName
		(returncode, stdout, stderr) = batchelor.runCommand(command)
		if returncode != 0:
			raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
		# the dump may hold several xml documents; awk starts a new
		# "<fileName><n>.awkOut" file at every xml declaration
		batchelor.runCommand("awk '/<\\?xml version='\"'\"'1.0'\"'\"'\\?>/{n++}{print >\"" + fileName + "\" n \".awkOut\" }' " + fileName)
	finally:
		# remove the raw dump even when qstat failed
		batchelor.runCommand("rm -f " + fileName)
	xmlFiles = glob.glob(fileName + "*.awkOut")
	jobIds = []
	for xmlFile in xmlFiles:
		try:
			root = ElementTree.parse(xmlFile).getroot()
		finally:
			# the split file is no longer needed, whether parsing worked or not
			batchelor.runCommand("rm -f " + xmlFile)
		for child in root[0]:
			jobIdList = child.findall("JB_job_number")
			if len(jobIdList) != 1:
				raise batchelor.BatchelorException("parsing xml from qstat failed")
			try:
				jobId = int(jobIdList[0].text)
			except ValueError:
				raise batchelor.BatchelorException("parsing int from xml from qstat failed")
			jobIds.append(jobId)
	return jobIds
def getListOfActiveJobs(jobName):
	"""Return the job ids listed by `bjobs` as a list of ints.

	jobName: None to list all jobs, otherwise the name passed to
	         `bjobs -J`.

	Returns [] when bjobs prints nothing. Raises
	batchelor.BatchelorException if bjobs exits non-zero or a job id is
	not an integer.
	"""
	if jobName is None:
		command = "bjobs"
	else:
		command = "bjobs" + " -J " + jobName
	(returncode, stdout, stderr) = batchelor.runCommand(command)
	if returncode != 0:
		raise batchelor.BatchelorException("bjobs failed (stderr: '" + stderr + "')")
	if stdout == "":
		return []
	jobIds = []
	try:
		# the first output line is skipped; the job id is the first column
		for outputLine in stdout.split('\n')[1:]:
			jobIds.append(int(outputLine.split()[0]))
	except ValueError:
		raise batchelor.BatchelorException("parsing of bjobs output to get job id failed.")
	return jobIds
示例#25
0
def submitJob(config, command, outputFile, jobName):
	"""Build a job script with `#@` directives and submit it via `llsubmit`.

	The script consists of the `#@` directive block, the configured header
	file (without its `#!` lines) and finally `command`.

	config: config object providing "header_file", "group", "notification",
	        "notify_user", "node_usage", "wall_clock_limit", "resources"
	        and "job_type" in the section named by submoduleIdentifier().
	command: text written at the end of the script.
	outputFile: written as the `output` directive.
	jobName: written as the `job_name` directive unless it is None.

	Returns the job id as int. Raises batchelor.BatchelorException if
	llsubmit fails or its output cannot be parsed. The temporary script is
	removed in every case.
	"""
	(fileDescriptor, fileName) = tempfile.mkstemp()
	os.close(fileDescriptor)
	try:
		headerFileName = batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file"))
		with open(fileName, 'w') as tempFile:
			tempFile.write("#!/bin/bash\n\n")
			tempFile.write("#@ group = " + config.get(submoduleIdentifier(), "group") + "\n")
			tempFile.write("#@ output = " + outputFile + "\n")
			tempFile.write("#@ notification = " + config.get(submoduleIdentifier(), "notification") + "\n")
			tempFile.write("#@ notify_user = " + config.get(submoduleIdentifier(), "notify_user") + "\n")
			tempFile.write("#@ node_usage = " + config.get(submoduleIdentifier(), "node_usage") + "\n")
			tempFile.write("#@ wall_clock_limit = " + config.get(submoduleIdentifier(), "wall_clock_limit") + "\n")
			tempFile.write("#@ resources = " + config.get(submoduleIdentifier(), "resources") + "\n")
			tempFile.write("#@ job_type = " + config.get(submoduleIdentifier(), "job_type") + "\n")
			# NOTE(review): "class" is filled from the "job_type" option as well - confirm this is intended
			tempFile.write("#@ class = " + config.get(submoduleIdentifier(), "job_type") + "\n")
			if jobName is not None:
				tempFile.write("#@ job_name = " + jobName + "\n")
			tempFile.write("#@ queue\n\n\n")
			with open(headerFileName, 'r') as headerFile:
				for line in headerFile:
					# the script already starts with its own shebang line
					if line.startswith("#!"):
						continue
					tempFile.write(line)
			tempFile.write("\n\n")
			tempFile.write(command)
		cmnd = "llsubmit - < " + fileName
		(returncode, stdout, stderr) = batchelor.runCommand(cmnd)
	finally:
		batchelor.runCommand("rm -f " + fileName)
	if returncode != 0:
		raise batchelor.BatchelorException("llsubmit failed (stderr: '" + stderr + "')")
	# example output stdout:
	# llsubmit: The job "mgmt.12309" has been submitted.
	#
	# stderr carries additional informational lines (stdin file name,
	# project budget, submit filter) that are not parsed.
	jobId = stdout.split("\n")[0]
	# take the digits between '"mgmt.' and the closing quote
	jobId = jobId[jobId.find('"mgmt.')+6:jobId.rfind('"')]
	try:
		jobId = int(jobId)
	except ValueError:
		raise batchelor.BatchelorException('parsing of llsubmit output to get job id failed.')
	return jobId
示例#26
0
def getListOfActiveJobs(jobName):
    """List the ids of the jobs shown by `bjobs`.

    jobName -- restricts the listing via `bjobs -J <jobName>`; None lists
               all jobs.

    Returns a list of ints ([] for empty output). Raises
    batchelor.BatchelorException when bjobs fails or a job id cannot be
    converted to int.
    """
    commandParts = ["bjobs"]
    if jobName is not None:
        commandParts.append(" -J " + jobName)
    (returncode, stdout, stderr) = batchelor.runCommand("".join(commandParts))
    if returncode != 0:
        raise batchelor.BatchelorException("bjobs failed (stderr: '" + stderr +
                                           "')")
    if stdout == "":
        return []
    # drop the first line, then read the first column of every row
    rows = stdout.split('\n')[1:]
    try:
        firstColumns = [row.split()[0] for row in rows]
        return [int(column) for column in firstColumns]
    except ValueError:
        raise batchelor.BatchelorException(
            "parsing of bjobs output to get job id failed.")
def getListOfActiveJobs(jobName):
	"""Return the cluster ids of the jobs listed by `condor_q`.

	condor_q is asked to print one "<ClusterId>.<ProcId>" entry per line.

	jobName: when truthy, only jobs whose JobBatchName equals it are
	         listed (via -constraint).

	Returns a list of int ids ([] for empty output). Raises
	batchelor.BatchelorException if condor_q fails or an entry cannot be
	converted to int.
	"""
	command = "condor_q   -format \"%d.\" ClusterId -format \"%d\n\" ProcId "
	if jobName:
		command += "-constraint 'JobBatchName == \"{0}\"' ".format(jobName)
	(returncode, stdout, stderr) = batchelor.runCommand(command)
	if returncode != 0:
		raise batchelor.BatchelorException("condor_q failed (stderr: '" + stderr + "')")
	if stdout == "":
		return []
	jobs = []
	for job in stdout.split('\n'):
		job = job.split()
		if len(job) > 0:
			entry = job[0]
			# drop exactly one trailing ".0"; rstrip(".0") would also eat
			# trailing zeros of the cluster id itself (1230.0 -> 123)
			if entry.endswith(".0"):
				entry = entry[:-2]
			try:
				jobs.append(int(entry))
			except ValueError:
				raise batchelor.BatchelorException("Cannot parse return of condor_q (stdout: '" + stdout + "')")
	return jobs
def getListOfRunningJobs(jobName):
	"""Return the ids of active jobs whose `qstat` state column is "r".

	jobName: forwarded to getListOfActiveJobs() to select the jobs of
	         interest.

	Only jobs returned by getListOfActiveJobs(jobName) are considered; of
	those, the ones whose fifth whitespace-separated qstat column equals
	"r" are returned (list of ints).
	Raises batchelor.BatchelorException if qstat fails or a job id cannot
	be converted to int.
	"""
	activeJobIds = getListOfActiveJobs(jobName)
	(returncode, stdout, stderr) = batchelor.runCommand("qstat")
	if returncode != 0:
		raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
	runningJobIds = []
	# the first two output lines are skipped
	for qstatLine in stdout.split('\n')[2:]:
		fields = qstatLine.split()
		try:
			jobId = int(fields[0])
		except ValueError:
			raise batchelor.BatchelorException("parsing of qstat output to get job id failed.")
		if jobId in activeJobIds and fields[4] == "r":
			runningJobIds.append(jobId)
	return runningJobIds
def getListOfErrorJobs(jobName):
    """Return the ids of active jobs whose `qstat` state column is "Eqw".

    jobName -- forwarded to getListOfActiveJobs() to select the jobs of
               interest.

    Of the jobs returned by getListOfActiveJobs(jobName), those whose
    fifth whitespace-separated qstat column equals "Eqw" are returned as
    a list of ints. Raises batchelor.BatchelorException if qstat fails or
    a job id cannot be converted to int.
    """
    activeJobIds = getListOfActiveJobs(jobName)
    (returncode, stdout, stderr) = batchelor.runCommand("qstat")
    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr +
                                           "')")
    errorJobIds = []
    # the first two output lines are skipped
    for qstatLine in stdout.split('\n')[2:]:
        columns = qstatLine.split()
        try:
            jobId = int(columns[0])
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of qstat output to get job id failed.")
        if jobId in activeJobIds:
            if columns[4] == "Eqw":
                errorJobIds.append(jobId)
    return errorJobIds
示例#30
0
def getListOfActiveJobs(jobName):
    """Return the cluster ids of the jobs listed by `condor_q`.

    condor_q is asked to print one "<ClusterId>.<ProcId>" entry per line.

    jobName -- when truthy, only jobs whose JobBatchName equals it are
               listed (via -constraint).

    Returns a list of int ids ([] for empty output). Raises
    batchelor.BatchelorException if condor_q fails or an entry cannot be
    converted to int.
    """
    command = "condor_q   -format \"%d.\" ClusterId -format \"%d\n\" ProcId "
    if jobName:
        command += "-constraint 'JobBatchName == \"{0}\"' ".format(jobName)
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_q failed (stderr: '" +
                                           stderr + "')")
    if stdout == "":
        return []
    jobs = []
    for job in stdout.split('\n'):
        job = job.split()
        if len(job) > 0:
            entry = job[0]
            # drop exactly one trailing ".0"; rstrip(".0") would also eat
            # trailing zeros of the cluster id itself (1230.0 -> 123)
            if entry.endswith(".0"):
                entry = entry[:-2]
            try:
                jobs.append(int(entry))
            except ValueError:
                raise batchelor.BatchelorException(
                    "Cannot parse return of condor_q (stdout: '" + stdout +
                    "')")
    return jobs
示例#31
0
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None):
    """Submit `command` through `condor_submit` and return the job id.

    A shell script (configured header file + `command`) and a submit
    description file are created in "<current dir>/.log"; both are
    registered for removal at interpreter exit.

    config -- config object providing "header_file", "memory", "disk" and
              "flavour" in the section named by submoduleIdentifier().
    command -- text appended to the copied header file.
    outputFile -- if truthy, its absolute path is used for the "output",
                  "log" (+ ".condor") and "error" (+ ".err") entries.
    jobName -- if truthy, passed as -batch-name.
    wd -- if truthy, forwarded to batchelor.runCommand as `wd`.
    arrayStart, arrayEnd, arrayStep -- must all be None; array jobs are
              rejected.

    Returns the job id as int. Raises batchelor.BatchelorException for
    array jobs, for a submit directory containing whitespace, when
    condor_submit fails, or when its output cannot be parsed.
    """
    if arrayStart is not None or arrayEnd is not None or arrayStep is not None:
        raise batchelor.BatchelorException(
            "Array jobs are not (yet) implementet for CERNs HTCondor system")

    filesDir = os.path.join(os.getcwd(), '.log')
    if " " in filesDir:
        raise batchelor.BatchelorException(
            "Cannot handle submit directories with whitespaces")

    if not os.path.exists(filesDir):
        os.makedirs(filesDir)
    (fileDescriptor, submitFileName) = tempfile.mkstemp(dir=filesDir,
                                                        prefix='submitFiles_',
                                                        suffix='.submit')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove(submitFileName))
    (fileDescriptor, scriptFileName) = tempfile.mkstemp(dir=filesDir,
                                                        prefix='scriptFiles_',
                                                        suffix='.sh')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove(scriptFileName))
    # 0o755 is accepted by Python 2.6+ and 3 (the bare 0755 form is not)
    os.chmod(scriptFileName, 0o755)

    batchelor.runCommand("cp " + batchelor._getRealPath(
        config.get(submoduleIdentifier(), "header_file")) + " " +
                         scriptFileName)
    with open(scriptFileName, 'a') as scriptFile:
        scriptFile.write(command)
    with open(submitFileName, 'w') as submitFile:
        submitFile.write("executable = {0}\n".format(scriptFileName))
        # test before abspath(): abspath() never returns an empty string,
        # so checking afterwards could not skip anything
        if outputFile:
            outputFile = os.path.abspath(outputFile)
            submitFile.write("output = {0}\n".format(outputFile))
            submitFile.write("log = {0}.condor\n".format(outputFile))
            submitFile.write("error = {0}.err\n".format(outputFile))

        submitFile.write(
            "should_transfer_files = NO\n")  # Disable file transport
        submitFile.write("request_cpus  = 1\n")
        submitFile.write("request_memory = {0}\n".format(
            config.get(submoduleIdentifier(), "memory")))
        submitFile.write("request_disk = {0}\n".format(
            config.get(submoduleIdentifier(), "disk")))
        submitFile.write("+JobFlavour = \"{0}\"\n".format(
            config.get(submoduleIdentifier(), "flavour")))
        submitFile.write("queue 1\n")
    cmnd = "condor_submit '{0}'".format(submitFileName)
    if jobName:
        cmnd += " -batch-name {0} ".format(jobName)
    kwargs = {}
    if wd:
        kwargs['wd'] = wd
    (returncode, stdout, stderr) = batchelor.runCommand(cmnd, **kwargs)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_submit failed (stderr: '" +
                                           stderr + "')")
    try:
        # the id is the sixth word of the second output line, minus a
        # trailing "."
        jobId = int(stdout.split('\n')[1].split()[5].rstrip("."))
    except (ValueError, IndexError):
        raise batchelor.BatchelorException(
            'parsing of condor_submit output to get job id failed.')
    return jobId
示例#32
0
def getListOfJobStates(jobName, username=None, detailed=True):
    """Parse `llq` output into a list of JobStatus objects.

    jobName -- only jobs whose "Job Name:" equals it are returned; None
               selects every job.
    username -- accepted but not used; llq is always run for `whoami`.
    detailed -- adds the -x flag to the llq call when True.

    A JobStatus is created at every "===== Job Step mgmt.<id>." line and
    added to the result when the matching "Job Name:" line is seen; later
    "Status:", "Step Virtual Memory:" and "Step User Time:" lines update
    that object.

    Raises batchelor.BatchelorException if llq fails or a value cannot be
    parsed.
    """
    if detailed:
        command = "llq -u `whoami` -m -x"
    else:
        command = "llq -u `whoami` -m"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("llq failed (stderr: '" + stderr +
                                           "')")
    jobStates = []
    currentJobId = -1
    currentJobStatus = None
    for line in stdout.split('\n'):
        line = line.rstrip('\n')
        if line.startswith("===== Job Step mgmt."):
            try:
                # the id sits between the first and the last "." of the line
                currentJobId = int(line[line.find(".") + 1:line.rfind(".")])
                currentJobStatus = JobStatus(currentJobId)
            except ValueError:
                raise batchelor.BatchelorException(
                    "parsing of llq output to get job id failed.")
        # collapse runs of whitespace so the prefixes below match exactly
        line = ' '.join(line.split())

        if line.startswith("Job Name: "):
            if currentJobId < 0:
                raise batchelor.BatchelorException(
                    "parsing of llq output failed, got job name before job id."
                )
            name = line[len("Job Name: "):]
            if jobName is None or name == jobName:
                jobStates.append(currentJobStatus)
        elif line.startswith("Step Virtual Memory: "):
            if currentJobId < 0:
                raise batchelor.BatchelorException(
                    "parsing of llq output failed, got job name before job id."
                )
            try:
                # slice the prefix off; lstrip() would strip a character set
                parsed = line[len("Step Virtual Memory: "):].split()
                currentJobStatus.setMemoryUsage(
                    float(parsed[0]) * _kMemoryUnits[parsed[1]], 0)
            except ValueError:
                raise batchelor.BatchelorException(
                    "parsing of llq output to get job id failed.")
        elif line.startswith("Status: "):
            if currentJobId < 0:
                raise batchelor.BatchelorException(
                    "parsing of llq output failed, got job name before job id."
                )
            # lstrip("Status: ") would also eat leading S/t/a/u/s characters
            # of the value ("Submission Error" -> "bmission Error")
            status = line[len("Status: "):]
            currentJobStatus.setStatus(JobStatus.kUnknown, name=status)
            if status == 'Running':
                currentJobStatus.setStatus(JobStatus.kRunning)
            elif status in ('I', 'Idle', 'Pending'):
                currentJobStatus.setStatus(JobStatus.kWaiting)
            elif status in ('Submission Error', 'Terminated', 'Removed',
                            'Remove Pending'):
                currentJobStatus.setStatus(JobStatus.kError)

        elif line.startswith("Step User Time: "):
            if currentJobId < 0:
                raise batchelor.BatchelorException(
                    "parsing of llq output failed, got job name before job id."
                )
            time_str = line[len("Step User Time: "):].split(':')
            try:
                hours = float(time_str[0])
                minutes = float(time_str[1])
                seconds = float(time_str[2])
                # cpu time is handed over in hours
                total_time = hours + minutes / 60.0 + seconds / 3600.0
                currentJobStatus.setCpuTime(total_time, 0)
            except ValueError:
                raise batchelor.BatchelorException(
                    "parsing of llq output to get job id failed.")

    return jobStates
示例#33
0
def getListOfJobStates(select_jobIDs, username):
    """Build a JobStatus object for every selected job listed by `qstat`.

    select_jobIDs -- container of job ids to report; None selects all.
    username -- if not None, passed to `qstat -u`.

    Each job starts as JobStatus.kUnknown, carrying the raw qstat state
    as name, and is refined from the state column. For jobs in state 'r'
    or 'hr', `qstat -xml -j <id>` is queried to fill cpu time and memory
    usage per task; a job that has disappeared in the meantime
    ('unknown_jobs' in the output) is left out of the result.

    Returns a list of JobStatus objects ([] for empty qstat output).
    Raises batchelor.BatchelorException if qstat fails or its output
    cannot be parsed.
    """
    # get list of all jobs
    if username is None:
        command = "qstat"
    else:
        command = "qstat -u {0}".format(username)

    (returncode, stdout, stderr) = batchelor.runCommand(command)

    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr +
                                           "')")

    if stdout == "":
        return []

    # the first two output lines are skipped
    jobList = stdout.split('\n')[2:]

    try:
        jobIDs = [int(job.split()[0]) for job in jobList]
        jobStates = [job.split()[4] for job in jobList]
    except ValueError:
        raise batchelor.BatchelorException(
            "parsing of qstat output to get job id failed.")

    list_of_states = []

    for jobID, jobState in zip(jobIDs, jobStates):
        if select_jobIDs is not None and jobID not in select_jobIDs:
            continue

        job_status = JobStatus(jobID)
        job_status.setStatus(JobStatus.kUnknown, name=jobState)

        if jobState in ('qw', 'hqw'):
            job_status.setStatus(JobStatus.kWaiting)

        elif jobState == 't':
            job_status.setStatus(JobStatus.kTransmitting)

        elif jobState in ('d', 'dr', 'dt'):
            job_status.setStatus(JobStatus.kDeletion)

        elif jobState in ('Eq', 'Eqw'):
            # 'Eqw' is the state getListOfErrorJobs() treats as an error
            job_status.setStatus(JobStatus.kError)

        elif jobState in ('r', 'hr'):

            # get detailed job information
            command = "qstat -xml -j {0}".format(jobID)
            (returncode, xmlOutput, stderr) = batchelor.runCommand(command)
            if returncode != 0:
                raise batchelor.BatchelorException(
                    "qstat failed (stderr: '" + stderr + "')")
            if 'unknown_jobs' in xmlOutput:
                # the job has been ended between the qstat command and now
                continue
            try:
                root = ElementTree.fromstring(xmlOutput)
                for child in root[0]:
                    for task in child.findall('JB_ja_tasks'):
                        for sublist in task.findall('ulong_sublist'):
                            task_numbers = sublist.findall('JAT_task_number')
                            if not task_numbers:
                                continue
                            task_number = int(task_numbers[0].text)
                            job_status.setStatus(JobStatus.kRunning)
                            for usage_list in sublist.findall(
                                    'JAT_scaled_usage_list'):
                                for scaled in usage_list.findall('scaled'):
                                    name = scaled.findall('UA_name')[0].text
                                    value = scaled.findall('UA_value')[0].text
                                    if name == 'cpu':
                                        # value / 3600 (seconds to hours, presumably)
                                        job_status.setCpuTime(
                                            float(value) / 3600.0, task_number)
                                    elif name == 'vmem':
                                        # value / 1024**3
                                        job_status.setMemoryUsage(
                                            float(value) / (1024.0)**3,
                                            task_number)
            except ElementTree.ParseError as e:
                # use the ParseError of the parser that is actually in scope
                raise batchelor.BatchelorException(
                    "xml-parser could not parse output of qstat -xml -j {0}: {1}"
                    .format(jobID, e))

        list_of_states.append(job_status)

    return list_of_states
def getListOfJobStates(select_jobIDs, username):
	"""Build a JobStatus object for every selected job listed by `qstat`.

	select_jobIDs: container of job ids to report; None selects all.
	username: if not None, passed to `qstat -u`.

	Each job starts as JobStatus.kUnknown, carrying the raw qstat state as
	name, and is refined from the state column. For jobs in state 'r' or
	'hr', `qstat -xml -j <id>` is queried to fill cpu time and memory usage
	per task; a job that has disappeared in the meantime ('unknown_jobs'
	in the output) is left out of the result.

	Returns a list of JobStatus objects ([] for empty qstat output).
	Raises batchelor.BatchelorException if qstat fails or its output
	cannot be parsed.
	"""
	# get list of all jobs
	if username is None:
		command = "qstat"
	else:
		command = "qstat -u {0}".format(username)

	(returncode, stdout, stderr) = batchelor.runCommand(command)

	if returncode != 0:
		raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")

	if stdout == "":
		return []

	# the first two output lines are skipped
	jobList = stdout.split('\n')[2:]

	try:
		jobIDs = [ int(job.split()[0]) for job in jobList ]
		jobStates = [ job.split()[4] for job in jobList ]
	except ValueError:
		raise batchelor.BatchelorException("parsing of qstat output to get job id failed.")

	list_of_states = []

	for jobID, jobState in zip(jobIDs, jobStates):
		if select_jobIDs is not None and jobID not in select_jobIDs:
			continue

		job_status = JobStatus(jobID)
		job_status.setStatus(JobStatus.kUnknown, name = jobState)

		if jobState in ('qw', 'hqw'):
			job_status.setStatus(JobStatus.kWaiting)

		elif jobState == 't':
			job_status.setStatus(JobStatus.kTransmitting)

		elif jobState in ('d', 'dr', 'dt'):
			job_status.setStatus(JobStatus.kDeletion)

		elif jobState in ('Eq', 'Eqw'):
			# 'Eqw' is the state getListOfErrorJobs() treats as an error
			job_status.setStatus(JobStatus.kError)

		elif jobState in ('r', 'hr'):

			# get detailed job information
			command = "qstat -xml -j {0}".format(jobID)
			(returncode, xmlOutput, stderr) = batchelor.runCommand(command)
			if returncode != 0:
				raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
			if 'unknown_jobs' in xmlOutput:
				# the job has been ended between the qstat command and now
				continue
			try:
				root = ElementTree.fromstring(xmlOutput)
				for child in root[0]:
					for task in child.findall('JB_ja_tasks'):
						for sublist in task.findall('ulong_sublist'):
							task_numbers = sublist.findall('JAT_task_number')
							if not task_numbers:
								continue
							task_number = int(task_numbers[0].text)
							job_status.setStatus(JobStatus.kRunning)
							for usage_list in sublist.findall('JAT_scaled_usage_list'):
								for scaled in usage_list.findall('scaled'):
									name = scaled.findall('UA_name')[0].text
									value = scaled.findall('UA_value')[0].text
									if name == 'cpu':
										# value / 3600 (seconds to hours, presumably)
										job_status.setCpuTime(float(value) / 3600.0, task_number)
									elif name == 'vmem':
										# value / 1024**3
										job_status.setMemoryUsage(float(value) / (1024.0)**3, task_number)
			except ElementTree.ParseError as e:
				# use the ParseError of the parser that is actually in scope
				raise batchelor.BatchelorException("xml-parser could not parse output of qstat -xml -j {0}: {1}".format(jobID, e))

		list_of_states.append(job_status)

	return list_of_states
示例#35
0
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None,
              priority=None,
              ompNumThreads=None):
    """Submit `command` through `qsub` and return the job id as int.

    The configured header file is copied to a temporary script, an
    optional OMP_NUM_THREADS export and `command` are appended, and the
    script is fed to `qsub` on stdin.

    config -- config object providing "header_file", "memory", "arch" and
              optionally "shortqueue" / "longqueue" in the section named
              by submoduleIdentifier().
    command -- text appended to the script.
    outputFile -- passed to `qsub -o`; must not lie in the home folder.
    jobName -- passed to `qsub -N` unless None.
    wd -- working directory (`qsub -wd`); "/tmp/" when not given. Must
          not lie in the home folder.
    arrayStart, arrayEnd, arrayStep -- when arrayStart is not None,
          "-t start-end:step" is added.
    priority -- if truthy, mapped to an integer via
                max(int(1024 * priority), -1023) and passed to `qsub -p`
                when the result is non-zero.
    ompNumThreads -- if not None, exported as OMP_NUM_THREADS in the
                script and passed to `qsub -pe mt`.

    Raises batchelor.BatchelorException if wd or outputFile is inside the
    home folder, if qsub fails, or if its output cannot be parsed. The
    temporary script is removed in every case.
    """
    # some checks of the job-settings
    # NOTE(review): count() matches the home path anywhere in the string,
    # not only as a prefix
    if wd and os.path.realpath(wd).count(
            os.path.realpath(os.path.expanduser('~'))):
        raise batchelor.BatchelorException(
            "The given working-directory is in your home-folder which is no allowed at E18: '{0}'"
            .format(wd))

    if os.path.realpath(outputFile).count(
            os.path.realpath(os.path.expanduser('~'))):
        raise batchelor.BatchelorException(
            "The given output-file is in your home-folder which is no allowed at E18: '{0}'"
            .format(outputFile))

    if priority:
        priority = max(int(-1024 + 2048 * (priority + 1.0) / 2.0), -1023)

    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(
            config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            if ompNumThreads is not None:
                scriptFile.write(
                    "export OMP_NUM_THREADS={0}\n".format(ompNumThreads))
            scriptFile.write(command)
        cmnd = "qsub "
        cmnd += "-j y "
        cmnd += "-b no "
        cmnd += "-m n "
        cmnd += "" if jobName is None else ("-N " + jobName + " ")
        if arrayStart is not None:
            cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(
                arrayStep) + " "
        cmnd += "-o '" + outputFile + "' "
        cmnd += "-wd '" + ("/tmp/" if not wd else wd) + "' "
        enabledValues = [1, "1", "TRUE", "true", "True"]
        if config.has_option(submoduleIdentifier(), "shortqueue") and config.get(
                submoduleIdentifier(), "shortqueue") in enabledValues:
            cmnd += "-l short=1 "
        elif config.has_option(submoduleIdentifier(), "longqueue") and config.get(
                submoduleIdentifier(), "longqueue") in enabledValues:
            cmnd += "-l long=1 "
        else:
            cmnd += "-l medium=1 "
        cmnd += "-l h_pmem=" + config.get(submoduleIdentifier(), "memory") + " "
        cmnd += "-l arch=" + config.get(submoduleIdentifier(), "arch") + " "
        cmnd += _getExcludedHostsString(config)
        cmnd += "-p {0} ".format(priority) if priority else ""
        cmnd += "-pe mt {0} ".format(
            ompNumThreads) if ompNumThreads is not None else ""
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # the script has been consumed (or submission failed); remove it
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr +
                                           "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    if arrayStart is not None:
        # for array jobs the id is followed by "."
        prefix, terminator = "Your job-array ", '.'
    else:
        prefix, terminator = "Your job ", ' '
    jobId = stdout.lstrip()
    # remove the literal prefix; lstrip(prefix) would strip a character set
    if jobId.startswith(prefix):
        jobId = jobId[len(prefix):]
    jobId = jobId[:jobId.find(terminator)]
    try:
        jobId = int(jobId)
    except ValueError:
        raise batchelor.BatchelorException(
            'parsing of qsub output to get job id failed.')
    return jobId
示例#36
0
def getListOfJobStates(jobName, username = None, detailed = True):
	"""Parse `llq` output into a list of JobStatus objects.

	jobName: only jobs whose "Job Name:" equals it are returned; None
	         selects every job.
	username: accepted but not used; llq is always run for `whoami`.
	detailed: adds the -x flag to the llq call when True.

	A JobStatus is created at every "===== Job Step mgmt.<id>." line and
	added to the result when the matching "Job Name:" line is seen; later
	"Status:", "Step Virtual Memory:" and "Step User Time:" lines update
	that object.

	Raises batchelor.BatchelorException if llq fails or a value cannot be
	parsed.
	"""
	if detailed:
		command = "llq -u `whoami` -m -x"
	else:
		command = "llq -u `whoami` -m"
	(returncode, stdout, stderr) = batchelor.runCommand(command)
	if returncode != 0:
		raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")
	jobStates = []
	currentJobId = -1
	currentJobStatus = None
	for line in stdout.split('\n'):
		line = line.rstrip('\n')
		if line.startswith("===== Job Step mgmt."):
			try:
				# the id sits between the first and the last "." of the line
				currentJobId = int(line[line.find(".")+1:line.rfind(".")])
				currentJobStatus = JobStatus(currentJobId)
			except ValueError:
				raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
		# collapse runs of whitespace so the prefixes below match exactly
		line = ' '.join(line.split())

		if line.startswith("Job Name: "):
			if currentJobId < 0:
				raise batchelor.BatchelorException("parsing of llq output failed, got job name before job id.")
			name = line[len("Job Name: "):]
			if jobName is None or name == jobName:
				jobStates.append(currentJobStatus)
		elif line.startswith("Step Virtual Memory: "):
			if currentJobId < 0:
				raise batchelor.BatchelorException("parsing of llq output failed, got job name before job id.")
			try:
				# slice the prefix off; lstrip() would strip a character set
				parsed = line[len("Step Virtual Memory: "):].split()
				currentJobStatus.setMemoryUsage( float(parsed[0]) * _kMemoryUnits[parsed[1]], 0)
			except ValueError:
				raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
		elif line.startswith("Status: "):
			if currentJobId < 0:
				raise batchelor.BatchelorException("parsing of llq output failed, got job name before job id.")
			# lstrip("Status: ") would also eat leading S/t/a/u/s characters
			# of the value ("Submission Error" -> "bmission Error")
			status = line[len("Status: "):]
			currentJobStatus.setStatus(JobStatus.kUnknown, name = status)
			if status == 'Running':
				currentJobStatus.setStatus(JobStatus.kRunning)
			elif status in ('I', 'Idle', 'Pending'):
				currentJobStatus.setStatus(JobStatus.kWaiting)
			elif status in ('Submission Error', 'Terminated', 'Removed', 'Remove Pending'):
				currentJobStatus.setStatus(JobStatus.kError)

		elif line.startswith("Step User Time: "):
			if currentJobId < 0:
				raise batchelor.BatchelorException("parsing of llq output failed, got job name before job id.")
			time_str = line[len("Step User Time: "):].split(':')
			try:
				hours = float(time_str[0])
				minutes = float(time_str[1])
				seconds = float(time_str[2])
				# cpu time is handed over in hours
				total_time = hours + minutes / 60.0 + seconds / 3600.0
				currentJobStatus.setCpuTime(total_time, 0)
			except ValueError:
				raise batchelor.BatchelorException("parsing of llq output to get job id failed.")

	return jobStates
def _submitJob(config, command, outputFile, jobName, wd=None, nTasks=None):
    """Write a SLURM batch script for ``command`` and submit it via ``sbatch``.

    If the submodule's config section defines ``max_active_jobs``, submission
    is delayed (polling every 90 seconds) until the number of active jobs
    reported by ``getListOfActiveJobs`` drops below that limit.

    The script is assembled from ``#SBATCH`` directives derived from the
    config, followed by the contents of the configured header file (without
    its shebang lines) and finally ``command``.

    Parameters:
        config: config object providing ``has_option``/``get`` for the
            section named by ``submoduleIdentifier()``.
        command: shell text appended at the end of the batch script.
        outputFile: path passed to ``#SBATCH -o``.
        jobName: job name passed to ``#SBATCH -J``; omitted if None.
        wd: working directory for ``#SBATCH -D``; defaults to the current
            working directory.
        nTasks: number of tasks; if None, no task/node directives are written.

    Returns:
        The job id as an int, taken from the fourth whitespace-separated
        token of sbatch's stdout.

    Raises:
        batchelor.BatchelorException: if sbatch exits with a non-zero return
            code or its output cannot be parsed into a job id.
    """
    section = submoduleIdentifier()

    # check if only a certain amount of active jobs is allowed
    if config.has_option(section, "max_active_jobs"):
        max_active_jobs = int(config.get(section, "max_active_jobs"))
        i = 0
        waitTime = 90  # seconds between two polls (1.5 min)
        while True:
            try:
                nRunningJobs = len(getListOfActiveJobs(None))
            except batchelor.BatchelorException:
                # The query itself failed: treat the queue as full so that we
                # wait and retry instead of aborting the submission.
                nRunningJobs = max_active_jobs
            if nRunningJobs < max_active_jobs:
                break
            if i == 0:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
            time.sleep(waitTime)
            i += 1
        if i > 0:
            sys.stdout.write("\r")

    if wd is None:
        wd = os.getcwd()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        headerFileName = batchelor._getRealPath(
            config.get(section, "header_file"))
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#SBATCH -D " + wd + "\n")
            tempFile.write("#SBATCH -o " + outputFile + "\n")
            tempFile.write("#SBATCH --time=" +
                           config.get(section, "wall_clock_limit") + "\n")
            clusters = config.get(section, "clusters")
            # no per-cpu memory request is written for the 'mpp3' cluster
            if clusters != 'mpp3':
                tempFile.write("#SBATCH --mem-per-cpu=" +
                               config.get(section, "memory") + "\n")
            if jobName is not None:
                tempFile.write("#SBATCH -J " + jobName + "\n")
            tempFile.write("#SBATCH --get-user-env \n")
            tempFile.write("#SBATCH --export=NONE \n")
            if nTasks is not None:
                if clusters != 'mpp3':
                    tempFile.write("#SBATCH --ntasks={0:d} \n".format(nTasks))
                else:
                    # on 'mpp3' request nodes instead of tasks:
                    # ceil(nTasks / 64) nodes
                    tempFile.write("#SBATCH --nodes={0:d} \n".format(
                        (nTasks + 63) // 64))
                tempFile.write("#SBATCH --ntasks-per-node={0} \n".format(
                    config.get(section, "n_tasks_per_node")))
            tempFile.write("#SBATCH --clusters={0}\n".format(clusters))
            if clusters not in ['cm2_tiny', 'mpp3']:
                tempFile.write("#SBATCH --partition={0}\n\n".format(
                    config.get(section, "partition")))
            if clusters in ('cm2', 'c2pap'):
                # NOTE(review): the QOS is taken from the "partition" option;
                # presumably both share the same name on these clusters.
                tempFile.write("#SBATCH --qos={0}\n\n".format(
                    config.get(section, "partition")))
            tempFile.write("module load slurm_setup \n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already starts with its own shebang
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        (returncode, stdout, stderr) = batchelor.runCommand(
            "sbatch " + fileName)
    finally:
        # always remove the temporary script, also if writing or submitting
        # raised, so failed submissions do not leave files behind
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" +
                                           stderr + "')")
    try:
        # the job id is expected as the fourth token of sbatch's output
        jobId = int(stdout.split()[3])
    except (IndexError, ValueError):
        # IndexError: output shorter than expected; ValueError: not a number
        raise batchelor.BatchelorException(
            'parsing output of sbatch to get job id failed.')
    return jobId