Exemplo n.º 1
0
 def prepare(self, verbosity=0, *args, **kwargs):
     """
     Prepare the job through the parent class, then store the config and
     restrict permissions on the job directory.

     After the parent ``prepare`` has run, ``store_config()`` is called and
     ``chmod 750 -R`` is applied to ``self.directory`` through ``run_cmds``
     using ``self.queue``.

     :param verbosity: forwarded to the parent ``prepare``.
     :param args: extra positional arguments forwarded to the parent.
     :param kwargs: extra keyword arguments forwarded to the parent.
     :return: whatever the parent ``prepare`` returned.
     """
     parent_status = super(ShJobSingle, self).prepare(
         verbosity=verbosity, *args, **kwargs)
     self.store_config()
     # Mode 750: full access for the owner, read/execute for the group,
     # nothing for others; -R applies it recursively.
     permission_cmd = 'chmod 750 -R "{0:s}"'.format(self.directory)
     run_cmds((permission_cmd, ), queue=self.queue)
     return parent_status
Exemplo n.º 2
0
 def __init__(self, jobs=None, partition=None, summary_func=None):
     """
     Set up the queue for one partition and try to discover its time limit.

     The time limit starts out as '07-00:00:00'. When a ``sinfo`` executable
     is found, ``sinfo -l`` is run for the partition and the third
     whitespace-separated field of the third output line replaces the
     default, provided it looks like ``[days-]hh:mm:ss``.

     :param jobs: forwarded to the parent constructor.
     :param partition: partition name; a falsy value selects 'thchem'.
     :param summary_func: forwarded to the parent constructor.
     """
     self.partition = partition or 'thchem'
     super(SlurmQueue, self).__init__(jobs=jobs, summary_func=summary_func)
     self.time_limit = '07-00:00:00'
     if not find_executable('sinfo'):
         return
     sinfo_cmd = 'sinfo -l --partition {0:s}'.format(self.partition)
     sinfo_out = run_cmds([sinfo_cmd], queue=self)
     if not sinfo_out:
         return
     # The first two lines are skipped (presumably the date line and the
     # column header of the long listing).
     rows = sinfo_out[0].splitlines()[2:]
     if not rows:
         return
     limit = rows[0].split()[2]
     # Only accept values shaped like a duration; anything else keeps the
     # default set above.
     if findall(r'^[-\d]*\d+:\d+:\d+$', limit):
         self.time_limit = limit
Exemplo n.º 3
0
    def run_cmd(self, job, cmd):
        """
        See Queue.run_cmd(), but run everything on local machine.

        The command is started in the background with ``nohup`` from inside
        ``job.directory``; its combined output goes to ``all_output.txt``.

        :param job: the job to start; ``job.directory`` must be set.
        :param cmd: the shell command that runs the job.
        :return: the last entry of the command output, converted to an
            integer and back to a string (the pid echoed by the shell).
        :raises CmdException: if running the commands produced no output.
        """
        assert job.directory
        cmds = [
            'cd \'{0:s}\';'.format(job.directory),
            'nohup {0:s} &> all_output.txt &'.format(cmd),
            # Same runtime value as before; the backslash is now escaped
            # explicitly so Python does not warn about an invalid escape.
            'echo "$\\!"'  # pid
        ]
        outp = run_cmds(cmds, queue=self)
        if not outp:
            # Report the job that failed, not the queue object.
            raise self.CmdException(
                'job {0:} could not be started'.format(job))
        return str(int(outp[-1]))
Exemplo n.º 4
0
    def processes(self, node):
        """
        Get processes on specific node and cache them.

        Runs ``ps ux`` through ``run_cmds``, skips the first output line and
        stores one ``{'pid': ..., 'node': ...}`` dict per remaining line in
        ``self.process_list[node]``. The pid is the second
        whitespace-separated field of each line.

        :param node: key of the cache entry; also stored in every record.
        :return: the list stored at ``self.process_list[node]``.
        """
        self._log('loading processes for %s' % node, level=3)
        # Start from an empty cache entry before querying.
        self.process_list[node] = []
        ps_output = run_cmds(['ps ux'], queue=self)
        rows = ps_output[0].splitlines()[1:]
        for row in rows:
            fields = row.split()
            record = {'pid': int(fields[1]), 'node': node}
            self.process_list[node].append(record)
        return self.process_list[node]
Exemplo n.º 5
0
    def run_cmd(self, job, cmd):
        """
        Start an individual job by means of queueing a shell command.

        A ``qsub`` command line is built for the job and run after changing
        into ``job.directory``. The job id is parsed from the second entry
        of the command output.

        :param job: the job to queue; ``job.directory`` must be set. When
            ``job.force_node`` is truthy the queue becomes
            ``<qname>@<force_node>``.
        :param cmd: shell command to run; it is wrapped in single quotes
            for ``bash -c``.
        :return: the queue id of the submitted job as an integer.
        :raises CmdException: if there is no output for the submit command
            or no job id can be found in it.
        """
        self._test_qstat()
        queue = self.qname
        if job.force_node:
            queue = '{0:s}@{1:s}'.format(self.qname, job.force_node)
            self._log('job {0:s} forced queue {1:s}'.format(job, queue),
                      level=2)
        assert job.directory
        subcmd = [
            'qsub',  # wait in line
            '-b',
            'y',  # it's a binary
            '-cwd',  # use the current working directory
            '-q',
            queue,  # which que to wait in
            '-N',
            job.name,  # name of the job
            #'-l slots={0:d}'.format(job.weight), # number of slots = weight of job
            #check this; maybe it's threads rather than processes
            '-e',
            join(job.directory, 'qsub.err'),  # error directory for the queue
            '-o',
            join(job.directory, 'qsub.out'),  # output directory for the queue
            'bash -c \'%s\'' % cmd,  # the actual command (single quotes!)
        ]
        cmds = [
            'cd \'%s\'' % job.directory,
            ' '.join(subcmd),
        ]
        outp = run_cmds(cmds, queue=self)
        self._log(cmds[-1], level=3)
        # Guard the length as well, so a single-entry result is reported as
        # a CmdException instead of an IndexError.
        if not outp or len(outp) < 2 or not outp[1]:
            raise self.CmdException(
                'job %s could not be started (output is empty)' % job)
        # Check for a match before indexing; indexing an empty result would
        # raise IndexError and bypass the intended CmdException.
        matches = findall(r'Your job (\d+) \("[^"]+"\) has been submitted',
                          outp[1])
        if not matches:
            raise self.CmdException('job %s id could not be found in "%s"' %
                                    (job, outp[1]))
        return int(matches[0])
Exemplo n.º 6
0
    def run_cmd(self, job, cmd):
        """
        Start an individual job by means of queueing a shell command.

        An ``sbatch`` command line is built from the job's name, directory
        and weight, and run after changing into ``job.directory``. The job
        id is parsed from the second entry of the command output.

        Node selection: a truthy ``job.force_node`` pins the job to that
        node (and disables requeueing); otherwise a non-blank
        ``EXCLUDE_NODES`` environment variable is passed to ``--exclude``.

        Niceness: ``job.niceness`` is used when truthy, 100 when the
        attribute is missing, and no ``--nice`` flag is added when it is
        falsy.

        :param job: the job to queue; ``job.directory`` must be set.
        :param cmd: shell command to run; it is wrapped in single quotes.
        :return: the Slurm job id as an integer.
        :raises CmdException: if there is no output for the submit command
            or no job id can be found in it.
        """
        self.test_slurm()
        assert job.directory
        node_flags = ()
        comment = 'batch: {0:s}; job: {1:s}; weight: {2:d}'.format(
            job.batch_name, job.name, job.weight)
        if job.force_node:
            node_flags += (
                '--nodelist',
                str(job.force_node),
                '--no-requeue',
            )
            comment = '{0:s}; forced to node: {1:s}'.format(
                comment, job.force_node)
        elif 'EXCLUDE_NODES' in environ and environ['EXCLUDE_NODES'].strip():
            node_flags += (
                '--exclude',
                '"{0:}"'.format(environ['EXCLUDE_NODES']),
            )
            comment = '{0:s}; excl: {1:s}'.format(
                comment, environ['EXCLUDE_NODES'].strip())
        # NOTE(review): newer Slurm releases name this option --chdir
        # instead of --workdir -- confirm against the cluster's version.
        core_flags = (
            'sbatch',
            '--job-name',
            '"{0:s}"'.format(job.name),
            '--partition',
            str(self.partition),
            '--workdir',
            '"{0:s}"'.format(job.directory),
            '--time',
            self.time_limit,
            '--mem',
            '{0:d}G'.format(2 + job.weight),
            '--ntasks',
            '1',  # different tasks can be on different nodes
            '--cpus-per-task',
            str(max(min(job.weight // 1, 10), 1)),  # clamp to 1..10
            '--nodes',
            '1',
            '--output',
            '"{0:s}"'.format(join(job.directory, 'slurm.all')),
            '--error',
            '"{0:s}"'.format(join(job.directory, 'slurm.all')),
        )
        # A missing attribute defaults to 100, which is truthy, so this
        # single lookup matches the previous three getattr calls.
        niceness = getattr(job, 'niceness', 100)
        if niceness:
            node_flags += (
                '--nice={0:}'.format(niceness),  # otherwise other people can't run
            )
            comment = '{0:s}; nice: {1:}'.format(comment, niceness)
        core_flags += ('--comment', '"{0:s}"'.format(comment))
        subcmd = ' '.join(core_flags + node_flags +
                          ('\'{0:s}\''.format(cmd), ))
        cdcmd = 'cd "{0:s}"'.format(job.directory)
        outp = run_cmds((
            cdcmd,
            subcmd,
        ), queue=self)
        self._log(subcmd, level=3)
        # Guard the length as well, so a single-entry result is reported as
        # a CmdException instead of an IndexError.
        if not outp or len(outp) < 2 or not outp[1]:
            raise self.CmdException(
                'job {0:s} could not be queued (output is empty)'.format(job))
        # findall yields (id, terminator) tuples; check for a match before
        # indexing so a missing id raises CmdException, not IndexError.
        matches = findall(r'Submitted batch job (\d+)(\s|$)', outp[1])
        if not matches:
            raise self.CmdException(
                'job {0:s} id could not be found in "{1:s}"'.format(
                    job, outp[1]))
        return int(matches[0][0])
Exemplo n.º 7
0
    def stop_job(self, node, pid):
        """
        Cancel a single queued job with ``scancel``.

        :param node: not used by this implementation.
        :param pid: id of the job to cancel; it is formatted with ``d``, so
            it has to be an integer.
        """
        cancel_cmd = 'scancel {0:d}'.format(pid)
        run_cmds([cancel_cmd], queue=self)
Exemplo n.º 8
0
 def _test_qstat(self):
     """
     Check that ``qstat`` can be run, and exit the program if it cannot.

     ``run_cmds`` returning ``None`` for ``qstat`` is treated as failure: a
     message is logged and ``exit()`` is called. Any other result lets the
     method return normally.
     """
     if run_cmds(['qstat'], queue=self) is not None:
         return
     self._log(
         'qstat does not work on this machine; run this code from a node that has access to the queue'
     )
     exit()
Exemplo n.º 9
0
    def stop_job(self, node, pid):
        """
        Delete a single job from the queue with ``qdel``.

        :param node: not used by this implementation.
        :param pid: id of the job to delete, interpolated with ``%s``.
        """
        delete_cmd = 'qdel %s' % pid
        run_cmds([delete_cmd], queue=self)
Exemplo n.º 10
0
    def stop_job(self, node, pid):
        """
        Kill a single job with ``kill``.

        :param node: not used by this implementation.
        :param pid: identifier given when the job was started (it could
            also be e.g. a queue number); interpolated with ``%s``.
        """
        kill_cmd = 'kill %s' % pid
        run_cmds([kill_cmd], queue=self)