Пример #1
0
 def submit(self, sdpdata, options=None):
     """Submit ``sdpdata`` through ``condor_submit`` and return the cluster id.

     The job runs ``clusterstarter.sh`` from ``self.bindir`` with
     ``worker.py -w cern <sdpdata.filename>`` as its arguments. Log, output
     and error paths are derived from ``sdpdata.filename`` via
     ``self._hide``. Entries found under
     ``sdpdata.dict['cernworld']['cluster']`` are merged into the submit
     description and override the defaults built here.

     :param sdpdata: object providing ``filename`` and ``dict``.
     :param options: not used by this implementation.
     :return: the text before the first ``.`` of the first token printed
         by ``condor_submit -terse`` (the cluster id, as a string).
     """
     submissiondict = {
         "executable": self.bindir + "clusterstarter.sh",
         "arguments": self.bindir + "worker.py -w cern " + sdpdata.filename,
         "log": self._hide(sdpdata.filename + '.condorlog'),
         "output": self._hide(sdpdata.filename + '.out'),
         "error": self._hide(sdpdata.filename + '.err'),
     }
     # A missing 'cernworld' section is treated like a missing 'cluster'
     # entry instead of failing with AttributeError on None.
     sdpdict = (sdpdata.dict.get('cernworld') or {}).get('cluster')
     if sdpdict is not None:
         # Work on a copy so the caller's configuration is not rewritten
         # as a side effect of submitting.
         cluster_options = dict(sdpdict)
         # add "+" because that's what htcondor can stomach
         for key in ('MaxRuntime', 'JobFlavour'):
             if cluster_options.get(key):
                 cluster_options['+' + key] = cluster_options.pop(key)
         submissiondict.update(cluster_options)
     op = sp.check_output(['condor_submit', '-terse'],
                          input=str(htc.Submit(submissiondict)),
                          encoding='utf-8')
     # expected output: XXXXX.0 - XXXXX.0 with XXXXX the submissionid
     return op.split()[0].split('.')[0]
Пример #2
0
    def Submit(self, count=1):
        """Queue ``count`` procs built from ``self._job_args``.

        Side effects: lower-cases the keys of ``self._job_args``, makes sure
        it has a ``log`` entry, creates ``self._schedd`` on first use, and
        stores ``self._cluster_id``, ``self._count``, ``self._log`` and
        ``self._jel``.

        Returns ``JOB_FAILURE`` when queuing raises, otherwise ``None``.
        """
        # ClassAds and the submit language ignore key case, so normalising
        # the keys is simpler than comparing case-insensitively later on.
        self._job_args = {
            key.lower(): value for key, value in self._job_args.items()
        }

        # Use the configured event log, or register a per-process default.
        fallback_log = "test-{0}.log".format(os.getpid())
        log_name = self._job_args.setdefault("log", fallback_log)
        self._log = os.path.abspath(log_name)

        Utils.TLog("Submitting job with arguments: " + str(self._job_args))
        if self._schedd is None:
            self._schedd = htcondor.Schedd()
        description = htcondor.Submit(self._job_args)
        try:
            with self._schedd.transaction() as txn:
                self._cluster_id = description.queue(txn, count)
                self._count = count
        except Exception as error:
            print("Job submission failed for an unknown error: " + str(error))
            return JOB_FAILURE

        Utils.TLog("Job submitted succeeded with cluster ID " +
                   str(self._cluster_id))

        # The event-log reader is created eagerly; self._log itself is kept
        # mostly because it may be handy in log messages.
        self._jel = JobEventLog(self._log)

        return None
Пример #3
0
def submit_dask_workers(schedd, n_workers=1):
    """Queue ``n_workers`` Dask worker jobs on ``schedd``.

    Each submitted ad is logged, and the list of ads filled in by
    ``Submit.queue`` is returned.
    """
    import htcondor

    # The return value is not used below; the call itself is preserved.
    get_schedd_address(schedd)

    description = {
        'MY.DaskWorkerName': '"htcondor--$F(MY.JobId)--"',
        'RequestCpus': '"MY.DaskWorkerCores"',
        'RequestMemory': '"floor(MY.DaskWorkerMemory / 1048576)"',
        'RequestDisk': '"floor(MY.DaskWorkerDisk / 1024)"',
        'MY.JobId': '"$(ClusterId).$(ProcId)"',
        'MY.DaskWorkerCores': '1',
        'MY.DaskWorkerMemory': '2000000000',
        'MY.DaskWorkerDisk': '2000000000',
        'use_x509userproxy': 'true',
        'Log': 'logs/dask_$(Cluster)_$(Process).log',
        'output': 'logs/dask_$(Cluster)_$(Process).out',
        'error': 'logs/dask_$(Cluster)_$(Process).err',
        'should_transfer_files': 'YES',
        'when_to_transfer_output': 'ON_EXIT_OR_EVICT',
        'Environment': "",
        'Arguments':
        "'python -m distributed.cli.dask_worker ${DASK_SCHED} --nthreads 1 --memory-limit 2.00GB --name ${USER}_${logname} --no-nanny --death-timeout 60 --worker-port 10002:10100'",
        'Executable': '"dask_worker.sh"',
    }

    worker_ads = []
    with schedd.transaction() as txn:
        htcondor.Submit(description).queue(txn, n_workers, worker_ads)

    for worker_ad in worker_ads:
        logger.info('Submitted worker: %s', worker_ad)
    return worker_ads
def submit_sleep_job():
    """Queue one ``/usr/bin/sleep 300`` job and return what ``queue`` returns."""
    description = {"Executable": "/usr/bin/sleep", "Arguments": "300"}
    sleep_job = htcondor.Submit(description)
    local_schedd = htcondor.Schedd()
    with local_schedd.transaction() as txn:
        return sleep_job.queue(txn)
Пример #5
0
    def start_workers(self,
                      n=1,
                      memory_per_worker=None,
                      disk_per_worker=None,
                      procs_per_worker=None,
                      threads_per_worker=None,
                      worker_timeout=None,
                      transfer_files=None,
                      extra_attribs=None):
        """Submit ``n`` worker jobs built from ``JOB_TEMPLATE``.

        Every per-worker setting falls back to the attribute of the same
        name on ``self`` when the argument is falsy. ``procs_per_worker`` is
        ignored (a warning is logged when it is set). ``transfer_files`` may
        be a string or an iterable of strings. ``extra_attribs`` is applied
        to the submit description last, so it can override anything above.

        The submitted ads are stored in ``self.jobs`` keyed by ``JobId``.

        Raises:
            ValueError: if ``n`` or any resolved numeric setting is below 1.
        """

        def at_least_one(raw, complaint):
            # Convert first, then validate, so bad values fail in int().
            number = int(raw)
            if number < 1:
                raise ValueError(complaint)
            return number

        n = at_least_one(n, "n must be >= 1")
        if procs_per_worker:
            self.logger.warning("Multiple processes and adaptive scaling"
                                " don't mix; ignoring procs_per_worker")
        memory_per_worker = at_least_one(
            memory_per_worker or self.memory_per_worker,
            "memory_per_worker must be >= 1 (MB)")
        disk_per_worker = at_least_one(
            disk_per_worker or self.disk_per_worker,
            "disk_per_worker must be >= 1 (KB)")
        threads_per_worker = at_least_one(
            threads_per_worker or self.threads_per_worker,
            "threads_per_worker must be >= 1")
        worker_timeout = at_least_one(
            worker_timeout or self.worker_timeout,
            "worker_timeout must be >= 1 (sec)")

        transfer_files = transfer_files or self.transfer_files
        if transfer_files and not isinstance(transfer_files, str):
            transfer_files = ', '.join(transfer_files)

        job = htcondor.Submit(JOB_TEMPLATE)
        job['MY.DaskSchedulerAddress'] = '"' + self.scheduler_address + '"'
        job['MY.DaskNProcs'] = "1"
        job['MY.DaskNThreads'] = str(threads_per_worker)
        job['RequestMemory'] = str(memory_per_worker)
        job['RequestDisk'] = str(disk_per_worker)
        job['MY.DaskSchedulerId'] = '"' + self.scheduler.id + '"'
        job['MY.DaskWorkerTimeout'] = str(worker_timeout)
        job['LogDir'] = self.logdir
        if self.script:
            # With a wrapper script the tarball always travels with the job;
            # user files and the pre-script are appended when present.
            input_files = self.worker_tarball
            if transfer_files:
                input_files = input_files + ', ' + transfer_files
            if self.pre_script:
                input_files = input_files + ', ' + self.pre_script
            job['Executable'] = self.script.name
            job['Transfer_Input_Files'] = input_files
        elif transfer_files:
            job['Transfer_Input_Files'] = transfer_files

        if extra_attribs:
            job.update(extra_attribs)

        submitted = []
        with self.schedd.transaction() as txn:
            clusterid = job.queue(txn, count=n, ad_results=submitted)
        self.logger.info("%d job(s) submitted to cluster %s." % (n, clusterid))
        for job_ad in submitted:
            self.jobs[job_ad['JobId']] = job_ad
Пример #6
0
def submit_sleeper():
    """Queue one ``/bin/sleep 20s`` job logging to ``LOGFILE``.

    Returns whatever ``Submit.queue`` returns for the transaction.
    """
    description = {
        "executable": "/bin/sleep",
        "arguments": "20s",
        'log': LOGFILE,
    }
    sleeper = htcondor.Submit(description)
    local_schedd = htcondor.Schedd()
    with local_schedd.transaction() as txn:
        cluster_id = sleeper.queue(txn)
    return cluster_id
Пример #7
0
    def start(self, jupyter_args: List[str]) -> "JupyterJobManager":
        """Submit a local-universe job running ``python -m jupyter <args>``.

        The job is tagged with the ``MARKER_KEY``/``MARKER_VALUE`` pair both
        in its environment and as a custom ``My.`` job attribute. The
        resulting cluster ID is stored on ``self.cluster_id``.

        Args:
            jupyter_args: arguments placed after ``-m jupyter``;
                ``--no-browser -y`` is always appended.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            click.ClickException: if ``self.has_running_job()`` is true.
        """
        if self.has_running_job():
            raise click.ClickException(
                "You already have a running Jupyter notebook server; "
                'use "dask-chtc jupyter status" subcommand to see its logs.')

        self.prep_log_files()

        arguments = " ".join(
            ["-m", "jupyter", *jupyter_args, "--no-browser", "-y"])
        sub = htcondor.Submit({
            "universe":
            "local",
            "JobBatchName":
            " ".join(("jupyter", *jupyter_args)),
            "executable":
            sys.executable,
            "arguments":
            arguments,
            "initialdir":
            Path.cwd(),
            "output":
            self.out.as_posix(),
            "error":
            self.err.as_posix(),
            "log":
            self.event_log.as_posix(),
            "stream_output":
            "true",
            "stream_error":
            "true",
            "getenv":
            "true",
            "environment":
            f"{MARKER_KEY}={MARKER_VALUE}",
            "transfer_executable":
            "false",
            "transfer_output_files":
            '""',
            # job_max_vacate_time doesn't actually work in local universe,
            # but might some day:
            # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=7746
            "job_max_vacate_time":
            "60",
            f"My.{MARKER_KEY}":
            MARKER_VALUE,
        })

        logger.debug(f"HTCondor job submit description:\n{sub}")

        schedd = htcondor.Schedd()
        with schedd.transaction() as txn:
            self.cluster_id = sub.queue(txn)

        logger.debug(f"Submitted job with cluster ID {self.cluster_id}")

        return self
Пример #8
0
def get_submit(submit_config):
    """Build an ``htcondor.Submit`` from a structured configuration mapping.

    ``transfer_input_files`` (an iterable of paths) is flattened to a
    comma-separated string, and ``transfer_output_remaps`` (a mapping of
    name to destination) to a quoted, semicolon-separated
    ``"name = destination"`` list.

    The caller's mapping is left untouched, so the same configuration can
    be passed more than once. Values that are absent or already strings
    are passed through unchanged.
    """
    config = dict(submit_config)

    input_files = config.get('transfer_input_files')
    if input_files is not None and not isinstance(input_files, str):
        config['transfer_input_files'] = ', '.join(input_files)

    remaps = config.get('transfer_output_remaps')
    if remaps is not None and not isinstance(remaps, str):
        config['transfer_output_remaps'] = '"{}"'.format('; '.join(
            "{} = {}".format(name, destination)
            for name, destination in remaps.items()))

    return htcondor.Submit(config)
Пример #9
0
    def submit_pythonbindings(self, njobsmax=None):
        """Submit the entries of ``self.submittables`` via the htcondor bindings.

        Each entry is a ``(submit dict, njobs)`` pair. At most ``njobsmax``
        jobs are submitted in total (``1e7`` when not given); submission
        stops once that budget is used up. One record per processed entry is
        appended to ``self.submitted``. When ``qondor.DRYMODE`` is set
        nothing is queued and the cluster id is recorded as ``0``.

        Returns ``None`` immediately when there is nothing to submit.
        """
        qondor.utils.check_proxy()
        if not self.submittables:
            return
        import htcondor

        if njobsmax is None:
            njobsmax = 1e7
        n_jobs_summed = sum(njobs for _, njobs in self.submittables)
        n_jobs_total = min(n_jobs_summed, njobsmax)
        logger.info("Submitting all jobs; %s out of %s", n_jobs_total, n_jobs_summed)
        schedd = qondor.schedd.get_best_schedd()
        n_jobs_todo = n_jobs_total
        ads = []
        with qondor.utils.switchdir(self.rundir), \
                qondor.schedd._transaction(schedd) as transaction:
            # A single Submit object is reused; each entry overwrites the
            # keys it defines.
            submit_object = htcondor.Submit()
            for sub_orig, njobs in self.submittables:
                # Shallow copy: the formatted environment replaces the entry
                # in the copy only, sub_orig keeps its dict-valued one.
                sub = sub_orig.copy()
                sub["environment"] = qondor.schedd.format_env_htcondor(
                    sub["environment"]
                )
                njobs = min(njobs, n_jobs_todo)
                n_jobs_todo -= njobs
                for key, value in sub.items():
                    submit_object[key] = value
                new_ads = []
                if qondor.DRYMODE:
                    cluster_id = 0
                    n_reported = njobs
                else:
                    cluster_id = int(
                        submit_object.queue(transaction, njobs, new_ads)
                    )
                    n_reported = len(new_ads)
                logger.warning(
                    "Submitted %s jobs for i_cluster %s (%s) to htcondor cluster %s",
                    n_reported,
                    sub_orig["environment"]["QONDORICLUSTER"],
                    sub_orig["environment"]["QONDORCLUSTERNAME"],
                    cluster_id,
                )
                ads.extend(new_ads)
                proc_ids = [ad["ProcId"] for ad in new_ads]
                self.submitted.append(
                    (sub_orig, cluster_id, len(new_ads), proc_ids)
                )
                if n_jobs_todo == 0:
                    break
        # Only the cluster id of the last processed entry is reported here.
        logger.info(
            "Summary: Submitted %s jobs to cluster %s", n_jobs_total, cluster_id
        )
Пример #10
0
def job_node_kwargs(node):
    """Return keyword arguments describing ``node`` as a job node.

    The submit description is parsed from the text of ``node.file``; the
    remaining values are copied from the node's attributes.
    """
    return {
        "name": node.name,
        "submit_description": htcondor.Submit(node.file.read_text()),
        "dir": node.dir,
        "noop": node.noop,
        "done": node.done,
        "pre": node.pre,
        "post": node.post,
    }
Пример #11
0
def test_save_and_load_submit(tmpdir):
    """A submit description saved with ``htio`` loads back with the same key."""
    target = Path(tmpdir.mkdir('save_and_load_submit_test_dir'))
    original = htcondor.Submit({'foo': 'bar'})

    htio.save_submit(target, original)
    restored = htio.load_submit(target)

    assert restored['foo'] == original['foo']
Пример #12
0
def test_save_and_load_submit(tmpdir):
    """A submit description saved with ``htio`` loads back with the same key."""
    target = Path(tmpdir.mkdir("save_and_load_submit_test_dir"))
    original = htcondor.Submit({"foo": "bar"})

    htio.save_submit(target, original)
    restored = htio.load_submit(target)

    assert restored["foo"] == original["foo"]
Пример #13
0
def make_outer_dag(
    dest_dir,
    requirements,
    source_dir,
    test_mode,
    transfer_manifest_path,
    unique_id,
    working_dir,
):
    """Build the outer DAG: a ``calc_work`` layer followed by an ``inner`` sub-DAG.

    ``calc_work`` runs the ``generate`` command and has a POST script that
    invokes this file with ``write_subdag``. The ``inner`` sub-DAG is read
    from ``working_dir / "inner.dag"`` and has a POST script that invokes
    this file with ``analyze``.

    Optional flags that do not apply are passed to the scripts as empty
    strings rather than being left out.
    """

    # Only import htcondor.dags submit-side
    import htcondor.dags as dags

    outer_dag = dags.DAG()

    test_flag = '--test-mode' if test_mode else ''
    calc_work_description = htcondor.Submit({
        "output": "calc_work.out",
        "error": "calc_work.err",
        "log": "calc_work.log",
        "arguments": "generate {} {}".format(source_dir, test_flag),
        "should_transfer_files": "yes",
        **shared_submit_descriptors(unique_id, requirements),
    })

    if requirements is not None:
        requirements_arg = "--requirements_file=requirements.txt"
    else:
        requirements_arg = ""
    if unique_id is not None:
        unique_id_arg = "--unique-id={}".format(unique_id)
    else:
        unique_id_arg = ""

    write_subdag_script = dags.Script(
        executable=THIS_FILE,
        arguments=[
            "write_subdag",
            source_dir,
            "source_manifest.txt",
            dest_dir,
            "destination_manifest.txt",
            transfer_manifest_path,
            requirements_arg,
            unique_id_arg,
            "--test-mode" if test_mode else "",
        ],
    )

    calc_work_layer = outer_dag.layer(
        name="calc_work",
        submit_description=calc_work_description,
        post=write_subdag_script,
    )

    calc_work_layer.child_subdag(
        name="inner",
        dag_file=working_dir / "inner.dag",
        post=dags.Script(executable=THIS_FILE,
                         arguments=["analyze", transfer_manifest_path]),
    )

    return outer_dag
Пример #14
0
    def condorSubmit(skimdir, dirname, filename, index):
        """Submit one skim job for ``filename`` and retry until it is accepted.

        The job runs ``produceSkim.sh`` from ``$CMSSW_BASE`` with ``filename``
        and the skim file name ``<dirname>_<index>.root`` as arguments. The
        skim file is remapped to ``<skimdir>/<dirname>/`` on transfer, and
        log/output/error files go to ``<skimdir>/<dirname>/log/``.

        Raises ``KeyError`` if ``CMSSW_BASE`` is not set in the environment.
        """
        # Condor class
        job = htcondor.Submit()
        schedd = htcondor.Schedd()

        skimFilename = dirname + "_{}.root".format(index)

        # Condor configuration
        job["executable"] = "{}/src/ttjet/nanoskimmer/batch/produceSkim.sh".format(
            os.environ["CMSSW_BASE"])

        job["getenv"] = "True"

        job["arguments"] = " ".join([filename, skimFilename])
        job["universe"] = "vanilla"

        job["should_transfer_files"] = "YES"
        job["transfer_input_files"] = ",".join([
            os.environ["CMSSW_BASE"] + "/src/ttjet",
            os.environ["CMSSW_BASE"] + "/src/x509"
        ])

        job["log"] = "{}/{}/log/job_$(Cluster).log".format(skimdir, dirname)
        job["output"] = "{}/{}/log/job_$(Cluster).out".format(skimdir, dirname)
        job["error"] = "{}/{}/log/job_$(Cluster).err".format(skimdir, dirname)

        job["when_to_transfer_output"] = "ON_EXIT"
        job["transfer_output_remaps"] = '"' + '{outFile} = {skimDir}/{dirName}/{outFile}'.format(
            outFile=skimFilename, skimDir=skimdir, dirName=dirname) + '"'

        def submit(schedd, job):
            with schedd.transaction() as txn:
                job.queue(txn)
                # Single-argument call form prints the same text on
                # Python 2 and Python 3.
                print("Submit job for file {}".format(filename))

        # Aggressively submit: keep retrying until the schedd accepts the job.
        while True:
            try:
                submit(schedd, job)
                break
            except Exception:
                # Retrying on failure is deliberate. Catching Exception
                # rather than everything lets Ctrl-C and SystemExit end
                # the loop.
                continue
Пример #15
0
        def prepareSubmission(self, cpu, memory, disk, jobID, jobName, command):
            """Build the ``htcondor.Submit`` object for one job.

            ``cpu`` is rounded up to a whole number; ``memory`` and ``disk``
            are divided by 1024 and requested with a ``KB`` suffix. The
            command is run as ``/bin/sh -c '<command>'``.

            Extra submit parameters are read from the ``TOIL_HTCONDOR_PARAMS``
            environment variable as ``name=value`` pairs separated by ``;``.
            Empty segments (an empty variable, a trailing ``;``) are ignored.

            :raises ValueError: if an extra parameter has no ``=`` or names a
                parameter that is already set.
            :return: the ``htcondor.Submit`` object.
            """

            # Convert resource requests
            cpu = int(math.ceil(cpu)) # integer CPUs
            memory = float(memory)/1024 # memory in KB
            disk = float(disk)/1024 # disk in KB

            # NOTE: formatStdOutErrPath() puts files in the Toil workflow directory, which defaults
            # to being in the system temporary directory ($TMPDIR, /tmp) which is unlikely to be on
            # a shared filesystem. So to make this work we need to set should_transfer_files = Yes
            # in the submit file, so that HTCondor will write the standard output/error files on the
            # compute node, then transfer back once the job has completed.
            stdoutfile = self.boss.formatStdOutErrPath(jobID, 'htcondor', '$(cluster)', 'std_output')
            stderrfile = self.boss.formatStdOutErrPath(jobID, 'htcondor', '$(cluster)', 'std_error')

            condorlogfile = self.boss.formatStdOutErrPath(jobID, 'htcondor', '$(cluster)', 'job_events')

            # Execute the entire command as /bin/sh -c "command"
            # TODO: Transfer the jobStore directory if using a local file store with a relative path.
            submit_parameters = {
                'executable': '/bin/sh',
                'transfer_executable': 'False',
                'arguments': '''"-c '{0}'"'''.format(command).encode('utf-8'),    # Workaround for HTCondor Python bindings Unicode conversion bug
                'environment': self.getEnvString(),
                'getenv': 'True',
                'should_transfer_files': 'Yes',   # See note above for stdoutfile, stderrfile
                'output': stdoutfile,
                'error': stderrfile,
                'log': condorlogfile,
                'request_cpus': '{0}'.format(cpu),
                'request_memory': '{0:.3f}KB'.format(memory),
                'request_disk': '{0:.3f}KB'.format(disk),
                'leave_in_queue': '(JobStatus == 4)',
                '+IsToilJob': 'True',
                '+ToilJobID': '{0}'.format(jobID),
                '+ToilJobName': '"{0}"'.format(jobName),
                '+ToilJobKilled': 'False',
            }

            # Extra parameters for HTCondor
            extra_parameters = os.getenv('TOIL_HTCONDOR_PARAMS')
            if extra_parameters is not None:
                logger.debug("Extra HTCondor parameters added to submit file from TOIL_HTCONDOR_PARAMS env. variable: {}".format(extra_parameters))
                for parameter_value in extra_parameters.split(';'):
                    if not parameter_value.strip():
                        # Nothing between two separators, or after a trailing ';'.
                        continue
                    if '=' not in parameter_value:
                        raise ValueError("Extra parameter is not of the form name=value: {}".format(parameter_value))
                    parameter, value = parameter_value.split('=', 1)
                    parameter = parameter.strip()
                    value = value.strip()
                    # Refuse to override a parameter that is already set,
                    # including one set by an earlier extra parameter.
                    if parameter in submit_parameters:
                        raise ValueError("Some extra parameters are incompatible: {}".format(extra_parameters))

                    submit_parameters[parameter] = value

            # Return the Submit object
            return htcondor.Submit(submit_parameters)
Пример #16
0
    def submit(self, description, count=1, itemdata=None):
        """Queue jobs for ``description`` and return a handle to the cluster.

        ``description`` is copied into a plain dict before being turned into
        an ``htcondor.Submit``. ``count`` and ``itemdata`` are forwarded to
        ``Submit.queue_with_itemdata``. The transaction runs on the local
        schedd inside ``self.use_config()``.
        """
        submit_description = htcondor.Submit(dict(description))
        logger.debug(
            "Submitting jobs with description:\n{}\nCount: {}\nItemdata: {}".
            format(submit_description, count, itemdata))
        with self.use_config():
            local_schedd = self.get_local_schedd()
            with local_schedd.transaction() as txn:
                submit_result = submit_description.queue_with_itemdata(
                    txn, count, itemdata)
                logger.debug("Got submit result:\n{}".format(submit_result))

        return handles.ClusterHandle(self, submit_result)
Пример #17
0
def __submit_one(txn, job_cfg, cfg):
    """Queue ``job_cfg`` inside ``txn`` and return a record describing the job.

    The record holds the integer cluster id, the batch kind, the originating
    config file, the three log paths taken from ``job_cfg`` and the initial
    ``Status.CREATED`` status.
    """
    queued = htcondor.Submit(job_cfg).queue(txn)
    return {
        "batch_id": int(queued),
        "batch": Batch.condor,
        "config_file": cfg,
        "stderr_log": job_cfg['error'],
        "stdout_log": job_cfg['output'],
        "job_log": job_cfg['log'],
        "status": Status.CREATED,
    }
Пример #18
0
 def __submit_python(self, jsd, n):
     """
     Submit jobs through the htcondor python bindings.

     :param JobSubmissionDescription jsd: submission description; its
         ``items()`` are handed to ``htcondor.Submit``
     :param int n: number of jobs to submit
     :return int: the clusterid of jobs submitted
     """
     description = htcondor.Submit(jsd.items())
     with self.schedd.transaction() as txn:
         return description.queue(txn, n)
Пример #19
0
 def submit_with_python(self, jdl_list, use_spool=False):
     """Submit every JDL in ``jdl_list`` in one transaction.

     Each JDL is parsed by a throw-away ``htcondor.Submit`` into a plain
     dict; the dicts are then used as itemdata for an empty Submit object,
     one job per item. ``use_spool`` is not used (see the TODO below).

     Returns ``(batchIDs_list, errStr)`` where the ids have the form
     ``<cluster>.<proc>``. A ``RuntimeError`` during submission is logged
     and re-raised.
     """
     tmpLog = core_utils.make_logger(
         baseLogger,
         'submissionHost={0}'.format(self.submissionHost),
         method_name='CondorJobSubmit.submit_with_python')
     tmpLog.debug('Start')

     errStr = ''
     batchIDs_list = []
     # Parse every JDL into a mapping using dummy submit objects
     jdl_map_list = [dict(htcondor.Submit(jdl).items()) for jdl in jdl_list]

     submit_obj = htcondor.Submit()
     try:
         with self.schedd.transaction() as txn:
             # TODO: Currently spool is not supported in htcondor.Submit ...
             outcome = submit_obj.queue_with_itemdata(
                 txn, 1, iter(jdl_map_list))
             clusterid = outcome.cluster()
             first_proc = outcome.first_proc()
             num_proc = outcome.num_procs()
             for procid in range(first_proc, first_proc + num_proc):
                 batchIDs_list.append('{0}.{1}'.format(clusterid, procid))
     except RuntimeError as e:
         errStr = '{0}: {1}'.format(e.__class__.__name__, e)
         tmpLog.error('submission failed: {0}'.format(errStr))
         raise

     if batchIDs_list:
         n_jobs = len(batchIDs_list)
         tmpLog.debug('submitted {0} jobs: {1}'.format(
             n_jobs, ' '.join(batchIDs_list)))
     elif not errStr:
         tmpLog.error('submitted nothing')
     tmpLog.debug('Done')

     return (batchIDs_list, errStr)
Пример #20
0
def condor_submit_process(mp_queue, host, jdl_map_list):
    """
    Submit ``jdl_map_list`` to condor; meant to run in a separate process.

    ``host`` is either ``'LOCAL'``/``'None'`` (use the default collector and
    schedd) or ``"<schedd>,<pool>"``. The result
    ``(batchIDs_list, errStr)`` is put on ``mp_queue``; ids have the form
    ``<cluster>.<proc>``.
    """
    # NOTE(review): tmpLog is not defined inside this function, so it has to
    # exist at module level; confirm.
    errStr = ''
    batchIDs_list = []

    # Work out schedd and pool name from the host string
    condor_schedd = None
    condor_pool = None
    if host in ('LOCAL', 'None'):
        tmpLog.debug(
            'submissionHost is {0}, treated as local schedd. Skipped'.format(
                host))
    else:
        try:
            condor_schedd, condor_pool = host.split(',')[0:2]
        except ValueError:
            # Fewer than two comma-separated fields: fall back to defaults.
            tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(host))

    # Locate the schedd through the collector
    try:
        collector = (htcondor.Collector(condor_pool)
                     if condor_pool else htcondor.Collector())
        if condor_schedd:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd,
                                        condor_schedd)
        else:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd)
        schedd = htcondor.Schedd(scheddAd)
    except Exception as e:
        errStr = 'create condor collector and schedd failed; {0}: {1}'.format(
            e.__class__.__name__, e)
    else:
        submit_obj = htcondor.Submit()
        try:
            with schedd.transaction() as txn:
                # TODO: Currently spool is not supported in htcondor.Submit ...
                outcome = submit_obj.queue_with_itemdata(
                    txn, 1, iter(jdl_map_list))
                clusterid = outcome.cluster()
                first_proc = outcome.first_proc()
                num_proc = outcome.num_procs()
                for procid in range(first_proc, first_proc + num_proc):
                    batchIDs_list.append('{0}.{1}'.format(clusterid, procid))
        except RuntimeError as e:
            errStr = 'submission failed; {0}: {1}'.format(
                e.__class__.__name__, e)
    mp_queue.put((batchIDs_list, errStr))
Пример #21
0
    def start(self, jupyter_args: List[str]) -> "JupyterJobManager":
        """Submit a local-universe job running ``python -m jupyter <args>``.

        ``--no-browser -y`` is always appended to ``jupyter_args``. The job
        carries a ``My.<MARKER>`` attribute set to ``true``, and the
        resulting cluster ID is stored on ``self.cluster_id``.

        Returns ``self``; raises ``click.ClickException`` when
        ``self.has_running_job()`` is true.
        """
        if self.has_running_job():
            raise click.ClickException(
                'You already have a running Jupyter notebook server; try the "status" subcommand to see it.'
            )

        self.prep_log_files()

        command_line = ["-m", "jupyter", *jupyter_args, "--no-browser", "-y"]
        description = {
            "universe": "local",
            "JobBatchName": " ".join(("jupyter", *jupyter_args)),
            "executable": sys.executable,
            "arguments": " ".join(command_line),
            "initialdir": Path.cwd(),
            "output": self.out.as_posix(),
            "error": self.err.as_posix(),
            "log": self.event_log.as_posix(),
            "stream_output": "true",
            "stream_error": "true",
            "getenv": "true",
            "transfer_executable": "false",
            "transfer_output_files": '""',
            f"My.{MARKER}": "true",
        }
        sub = htcondor.Submit(description)

        logger.debug(f"HTCondor job submit description:\n{sub}")

        local_schedd = htcondor.Schedd()
        with local_schedd.transaction() as txn:
            self.cluster_id = sub.queue(txn)

        logger.debug(f"Submitted job with cluster ID {self.cluster_id}")

        return self
Пример #22
0
def Submit(jobdesc, log, appjobid, schedd):
    """Queue ``jobdesc`` on ``schedd`` and return what ``queue`` returns.

    When the module-level ``queuelist`` is empty, an error mentioning
    ``appjobid`` is logged on ``log`` and ``None`` is returned without
    submitting anything.
    """
    global queuelist

    if len(queuelist) == 0:
        log.error("%s: no cluster free for submission" % appjobid)
        return None

    # Going through htcondor.Submit needs condor >= 8.5.8, but it is what
    # makes $() variable expansion work.
    description = htcondor.Submit(dict(jobdesc))
    with schedd.transaction() as txn:
        return description.queue(txn)
Пример #23
0
def htcondor_submit(sub, njobs=1, submission_dir='.'):
    """
    Submit the submission dict `sub` to the best scheduler.

    The submission happens with `submission_dir` as the working directory.
    Returns the integer cluster id and the list of class ads filled in by
    the queue call.
    """
    import htcondor
    best_schedd = qondor.get_best_schedd(renew=True)
    ad = []
    with qondor.utils.switchdir(submission_dir):
        submit_object = htcondor.Submit(sub)
        with best_schedd.transaction() as txn:
            cluster_id = int(submit_object.queue(txn, njobs, ad))
    return cluster_id, ad
Пример #24
0
def submit(MHc, Mh):
    """Queue one HTCondor job that produces an LHE file for a mass point.

    ``MHc`` and ``Mh`` are interpolated into the output directory, the
    output file name and the name of the SLHA card shipped with the job.
    The output directory (and its ``log`` subdirectory) is created before
    submission. Submission is retried until it succeeds.
    """
    job = htcondor.Submit()
    schedd = htcondor.Schedd()

    lhefile = "unweighted_events.lhe"
    outdir = "/nfs/dust/cms/user/{}/Signal/Hc+hTol4b_MHc{}_Mh{}/LHE/".format(
        os.environ["USER"], MHc, Mh)
    # The timestamp (with the dot removed) keeps repeated submissions for the
    # same mass point from overwriting each other's output file.
    outfile = "Hc+hTol4b_MHc{}_Mh{}_{}.lhe".format(
        MHc, Mh,
        str(time.time()).replace(".", ""))

    # Create <outdir>/log (and therefore <outdir>) without going through a
    # shell, so unusual characters in the path cannot be misinterpreted.
    logdir = os.path.join(outdir, "log")
    if not os.path.isdir(logdir):
        os.makedirs(logdir)

    ##Condor configuration
    job["executable"] = "{}/src/ChargedHiggs/MCproduction/batch/produceLHE.sh".format(
        os.environ["CMSSW_BASE"])
    job["universe"] = "vanilla"

    job["should_transfer_files"] = "YES"
    # NOTE(review): the card extension is ".shla" while the directory is
    # named "SLHA" -- confirm the file name on disk.
    job["transfer_input_files"] = ",".join([
        os.environ["CMSSW_BASE"] +
        "/src/ChargedHiggs/MCproduction/MG5_aMC_v2_6_4",
        os.environ["CMSSW_BASE"] + "/src/command.txt",
        os.environ["CMSSW_BASE"] +
        "/src/ChargedHiggs/MCproduction/SLHA/Hc+hTol4b_MHc{}_Mh{}.shla".format(
            MHc, Mh)
    ])

    # NOTE(review): these paths are relative, whereas the log directory
    # created above lives under outdir -- confirm the submit directory has
    # its own log/ subdirectory.
    job["log"] = "log/job_$(Cluster).log"
    job["output"] = "log/job_$(Cluster).out"
    job["error"] = "log/job_$(Cluster).err"

    job["when_to_transfer_output"] = "ON_EXIT"
    job["transfer_output_remaps"] = '"' + '{} = {}/{}'.format(
        lhefile, outdir, outfile) + '"'

    ##Aggressively submit your jobs: retry until the schedd accepts the job.
    while True:
        try:
            with schedd.transaction() as txn:
                job.queue(txn)
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and
            # SystemExit propagate so the loop can still be interrupted.
            continue

        print("Submit job for LHE file production")
        break
Пример #25
0
    def _coreExecution(self, handler, particles):
        """Submit one HTCondor job per particle and wait until all have left the queue.

        The executable named in ``self.submitf['darwin']['executable']`` is
        resolved against ``handler.optdir``; the process exits with status 1
        if it does not exist. For every particle the ``arguments`` and
        ``initialdir`` entries of the ``htcondor`` section are rewritten
        before the section is submitted. The resulting cluster ids are
        stored in ``self.ids``.

        The queue is then polled every ``self.refresh_rate`` seconds until no
        submitted cluster is listed any more. When the section sets
        ``should_transfer_files`` to ``YES`` the job ads are spooled at
        submission and the output is retrieved at the end.
        """
        schedd = htcondor.Schedd()
        conf = self.submitf

        executable = conf['darwin']['executable']
        executable_path = os.path.join(handler.optdir, executable)
        conf['htcondor']['executable'] = executable_path
        if not os.path.exists(executable_path):
            logger.error('executable "{}" not found'.format(executable_path))
            sys.exit(1)

        # The loop below only rewrites 'arguments' and 'initialdir', so the
        # transfer setting can be evaluated once.
        transfer_files = (
            'should_transfer_files' in conf['htcondor']
            and conf['htcondor']['should_transfer_files'] == 'YES')

        # secure the job id from condor
        self.ids = []
        for p in particles:
            arguments = p.coordinate.format()
            conf['htcondor']['arguments'] = ' '.join(
                '-{} {}'.format(k, v) for k, v in arguments.items())
            conf['htcondor']['initialdir'] = handler.particlepath(p.name)

            # get redirect of htcondor submit file to a dict
            sub = htcondor.Submit(dict(conf.items('htcondor')))
            with schedd.transaction() as txn:
                ads = []
                clusterid = sub.queue(txn, ad_results=ads)
                self.ids.append(clusterid)

                if transfer_files:
                    schedd.spool(ads)

        if not self.ids:
            # Nothing was submitted; an empty constraint must not be sent to
            # the schedd, so there is nothing to wait for.
            return

        req = ' || '.join(
            '(ClusterId == {})'.format(cid) for cid in self.ids)
        proj = ['ClusterId', 'JobStatus']

        while True:
            # Exhaust the query iterator and count how many of our jobs the
            # schedd still lists.
            remaining = sum(
                1 for _ in schedd.xquery(requirements=req, projection=proj))
            if remaining == 0:
                break
            time.sleep(self.refresh_rate)

        if transfer_files:
            for clusterid in self.ids:
                # Use the schedd the jobs were spooled to, and actually
                # substitute the cluster id into the constraint.
                schedd.retrieve('ClusterId == {}'.format(clusterid))
Пример #26
0
        def prepareSubmission(self, cpu, memory, disk, jobID, jobName,
                              command):
            """Build the ``htcondor.Submit`` object for one job.

            ``cpu`` is rounded up to a whole number, ``memory`` and ``disk``
            are divided by 1024 and requested with a ``KB`` suffix, and
            ``command`` is wrapped so that it runs as ``/bin/sh -c 'command'``.

            Extra submit parameters are read from the ``TOIL_HTCONDOR_PARAMS``
            environment variable as ``name = value`` pairs separated by
            ``;``. Blank entries (e.g. from a trailing ``;``) are ignored.

            Raises:
                ValueError: if an extra entry has no ``name = value`` form,
                    or names a parameter that is already set.
            """
            # Convert resource requests
            cpu = int(math.ceil(cpu))  # integer CPUs
            memory = float(memory) / 1024  # memory in KB
            disk = float(disk) / 1024  # disk in KB

            # Execute the entire command as /bin/sh -c "command"
            # TODO: Transfer the jobStore directory if using a local file store with a relative path.
            submit_parameters = {
                'executable': '/bin/sh',
                'transfer_executable': 'False',
                # Workaround for HTCondor Python bindings Unicode conversion bug
                'arguments': '''"-c '{0}'"'''.format(command).encode('utf-8'),
                'environment': self.getEnvString(),
                'getenv': 'True',
                'request_cpus': '{0}'.format(cpu),
                'request_memory': '{0:.3f}KB'.format(memory),
                'request_disk': '{0:.3f}KB'.format(disk),
                'leave_in_queue': '(JobStatus == 4)',
                '+IsToilJob': 'True',
                '+ToilJobID': '{0}'.format(jobID),
                '+ToilJobName': '"{0}"'.format(jobName),
                '+ToilJobKilled': 'False',
            }

            # Extra parameters for HTCondor
            extra_parameters = os.getenv('TOIL_HTCONDOR_PARAMS')
            if extra_parameters is not None:
                logger.debug(
                    "Extra HTCondor parameters added to submit file from TOIL_HTCONDOR_PARAMS env. variable: {}"
                    .format(extra_parameters))
                for parameter_value in extra_parameters.split(';'):
                    if not parameter_value.strip():
                        # Tolerate "a = b;" and "a = b;;c = d".
                        continue

                    parameter, separator, value = parameter_value.partition(
                        '=')
                    parameter = parameter.strip()
                    value = value.strip()
                    if not separator or not parameter:
                        raise ValueError(
                            "Malformed entry {!r} in TOIL_HTCONDOR_PARAMS "
                            "(expected 'name = value'): {}".format(
                                parameter_value, extra_parameters))
                    if parameter in submit_parameters:
                        raise ValueError(
                            "Some extra parameters are incompatible: {}".
                            format(extra_parameters))

                    submit_parameters[parameter] = value

            # Return the Submit object
            return htcondor.Submit(submit_parameters)
Пример #27
0
    def test_client_metrics(self):
        """Check that every client metric is present in Graphite.

        If the collector reports no startd, a sleep job is queued first and
        the test waits 60 seconds. Each metric is then fetched from the
        Graphite render endpoint and must exist, carry datapoints, be tagged
        with the requested path and contain at least one non-zero value.
        """
        collector = htcondor.Collector()
        startds = collector.locateAll(htcondor.DaemonTypes.Startd)
        if len(startds) == 0:
            # Submitting some sleep jobs
            sleep_job = htcondor.Submit({
                "executable": "/bin/sleep",
                "arguments": "5m",
                "request_memory": "500"
            })
            local_schedd = htcondor.Schedd()
            with local_schedd.transaction() as txn:
                sleep_job.queue(txn, 1)

            # Waiting for the glideins to start
            time.sleep(60)

        client_id = 'pyglideinpyglideinclient'
        partition = 'Cluster'
        metric_names = (
            'glideins.launched',
            'glideins.running',
            'glideins.idle',
            'glideins.avg_idle_time',
            'glideins.min_idle_time',
            'glideins.max_idle_time',
        )
        for metric in metric_names:
            path = '.'.join(
                [self.metrics_namespace, client_id, partition, metric])
            render_url = 'http://{}/render?target={}'.format(
                self.metrics_graphite_server, path) + '&format=json&from=-5min'
            series = requests.get(render_url).json()

            self.assertTrue(len(series) > 0,
                            msg='{} client metric not found'.format(path))
            if not len(series) > 0:
                continue

            first = series[0]
            self.assertTrue(len(first['datapoints']) > 0,
                            msg='No datapoints found for {}.'.format(path))
            self.assertTrue(first['tags']['name'] == path,
                            msg='Metrics mismatch for {}.'.format(path))
            # Every datapoint is inspected; one non-zero value is enough.
            has_nonzero = any(
                [point[0] != 0.0 for point in first['datapoints']])
            self.assertTrue(
                has_nonzero,
                msg='Add datapoints are zero for {}.'.format(path))
Пример #28
0
    def create(self, htcondor_job):
        """
        Submit a job & return the job id

        ``htcondor_job`` is passed to ``htcondor.Submit`` and one job is
        queued through ``self._schedd``. Returns the value returned by
        ``queue``, or ``None`` when that value is falsy or when any
        exception is raised during submission. A failure is logged with its
        traceback instead of being discarded.
        """
        import logging

        try:
            sub = htcondor.Submit(htcondor_job)
            with self._schedd.transaction() as txn:
                cluster_id = sub.queue(txn, 1)
        except Exception:
            # Callers only see None, so the log is the sole record of why
            # the submission failed.
            logging.getLogger(__name__).exception(
                "HTCondor job submission failed")
            return None

        return cluster_id or None
Пример #29
0
    def submit(self):
        '''Write a job file per prepID and, unless dry-running, submit it to HTCondor.'''
        campaigns = self._get_prepid_campaign_map()

        for campaign_name, prepids in campaigns.items():
            # Per-campaign directory for the jobs' stdout/stderr/log files
            outdir = mcmtest_path(
                f'output/{self.request_name}/{campaign_name}')
            if not os.path.exists(outdir):
                os.makedirs(outdir)

            # Per-campaign directory for the generated job files
            jobfiledir = mcmtest_path(
                f'job_files/{self.request_name}/{campaign_name}')
            if not os.path.exists(jobfiledir):
                os.makedirs(jobfiledir)

            for prepid in prepids:
                # Executable arguments and per-prepID output files; the
                # shared settings dict is updated in place.
                self.submission_settings.update({
                    'arguments': ' '.join(['$(proxy_path)', prepid]),
                    'output': pjoin(outdir, f'out_{prepid}.txt'),
                    'log': pjoin(outdir, f'log_{prepid}.txt'),
                    'error': pjoin(outdir, f'err_{prepid}.txt'),
                })

                description = htcondor.Submit(self.submission_settings)

                # The job file is written even for a dry run
                jobfile = pjoin(jobfiledir, f'job_{prepid}.jdl')
                with open(jobfile, 'w+') as handle:
                    handle.write(str(description) + '\nqueue 1\n')

                if self.dryrun:
                    continue

                jobid = condor_submit(jobfile)
                print(f'Submitted job: {prepid}, Job ID: {jobid}')

        if self.dryrun:
            print('Dry run requested. No submissions will be made.')
            print('PrepID and campaign information:')
            pprint(campaigns)
Пример #30
0
 def launch(self, job_id):
     """Submit the job stored under ``job_id`` to HTCondor.

     The job record is read from ``self.db``. The environment variables
     listed for its job type are exported into this process, and a submit
     description is built from the record's ``spec`` (executable, output,
     error and log paths). The outcome is recorded through
     ``self.update_job``.

     Returns:
         A ``(status, msg, cluster_id)`` tuple. ``status`` is ``STATUS_OK``
         unless the value returned by ``queue`` is not an ``int``, in which
         case it is ``STATUS_ERROR`` and ``msg`` describes the failure.

     Exceptions raised by the database lookup or by the HTCondor bindings
     are not caught here.
     """
     status = STATUS_OK
     msg = ''
     cluster_id = ''
     # Get the job info from the database
     job_info = self.db.get_job_info(job_id)
     spec = job_info['spec']
     # Load environment variables required for the job
     env = spec['env']
     executable = spec['executable']
     log_path = spec['log']
     out_path = spec['output']
     err_path = spec['error']
     for envvar in self.job_types[job_info['type']]['env']:
         # Exported into this process; the job inherits it via "getenv".
         os.environ[envvar] = '{}'.format(env[envvar])
     # The three paths are handed to HTCondor as *files*, so create their
     # parent directories rather than a directory at the log path itself.
     for file_path in (log_path, out_path, err_path):
         parent_dir = os.path.dirname(file_path)
         if parent_dir:
             os.makedirs(parent_dir, exist_ok=True)
     job_spec = {
         # the program to run on the execute node
         "executable": executable,
         # anything the job prints to standard output will end up in this file
         "output": out_path,
         # anything the job prints to standard error will end up in this file
         "error": err_path,
         # this file will contain a record of what happened to the job
         "log": log_path,
         "getenv": "True",
     }
     # Submit the HTCondor job
     htcondor_job = htcondor.Submit(job_spec)
     # get the Python representation of the scheduler
     htcondor_schedd = htcondor.Schedd()
     # open a transaction, represented by `txn`
     with htcondor_schedd.transaction() as txn:
         # queues one job in the current transaction; returns job's ClusterID
         cluster_id = htcondor_job.queue(txn)
     if not isinstance(cluster_id, int):
         msg = 'Error submitting Condor job'
         status = STATUS_ERROR
         self.update_job(job_id, updates={
             'msg': msg,
         })
     else:
         self.update_job(job_id,
                         updates={
                             'cluster_id': cluster_id,
                             'status': 'submitted',
                         })
     return status, msg, cluster_id