示例#1
0
    def try_to_complete(self):
        """
        Wrap the task up early once it already counts as complete.

        Does nothing when min_completion_fraction exceeds 1 - 1e-3 (i.e.
        full completion is requested) or when self.complete() is falsy.
        Otherwise:
          - every job from get_running_condor_jobs() is removed with
            Utils.condor_rm,
          - every output from get_uncompleted_outputs() is deleted with
            `rm` (a condor job may have written it in the meantime),
          - self.io_mapping is replaced by the entries of get_io_mapping()
            whose output is not among the deleted names.
        """
        # Nothing to force if everything must finish anyway, or if the task
        # is not complete by the min_completion_fraction standard yet
        if self.min_completion_fraction > 1. - 1.e-3 or not self.complete():
            return

        # Kill the tail jobs that are still in the queue
        for running_job in self.get_running_condor_jobs():
            tail_id = running_job["ClusterId"]
            Utils.condor_rm([tail_id])
            self.logger.info("Tail condor job {} removed".format(tail_id))

        stale_names = [
            unfinished.get_name()
            for unfinished in self.get_uncompleted_outputs()
        ]

        # NOTE(review): mapping outputs are tested for membership in a list of
        # name strings; presumably the output type compares equal to its
        # name -- confirm against the output class.
        kept_pairs = [[inputs, output]
                      for inputs, output in self.get_io_mapping()
                      if output not in stale_names]

        for stale_name in stale_names:
            Utils.do_cmd("rm {}".format(stale_name))
            self.logger.info("Tail root file {} removed".format(stale_name))

        self.io_mapping = kept_pairs
示例#2
0
    def handle_condor_job(self,
                          this_job_dict,
                          out,
                          fake=False,
                          remove_running_x_hours=48.0,
                          remove_held_x_hours=5.0):
        """
        Inspect one condor job, update the status of its output and remove
        the job if it has been running or held for too long.

        Parameters:
            this_job_dict: dictionary of condor job information. "ClusterId"
                and "EnteredCurrentStatus" are required; "JobStatus"
                (treated as "I" when missing) and "HoldReason" are optional.
            out: File object for the job's output; its status is updated
                through out.set_status().
            fake: when True, Utils.condor_rm is never called, but the
                returned action_type still reports the removal.
            remove_running_x_hours: a running job older than this many hours
                in its current status is removed.
            remove_held_x_hours: a held job older than this many hours in
                its current status is removed.

        Returns:
            action_type string, one of "RUNNING", "LONG_RUNNING_REMOVED",
            "IDLE", "HELD", "HELD_AND_REMOVED" or "UNKNOWN" (status was
            none of "R", "I", "H").
        """
        cluster_id = "{}".format(this_job_dict["ClusterId"])
        # Look the status up once; a missing status counts as idle
        job_status = this_job_dict.get("JobStatus", "I")
        # abs() keeps the age non-negative even if the recorded timestamp is
        # ahead of the local clock
        hours_since = abs(time.time() -
                          int(this_job_dict["EnteredCurrentStatus"])) / 3600.

        action_type = "UNKNOWN"
        # Default: a job with an unrecognized status still has its output
        # marked as running
        out.set_status(Constants.RUNNING)

        if job_status == "R":
            self.logger.debug(
                "Job {0} for ({1}) running for {2:.1f} hrs".format(
                    cluster_id, out, hours_since))
            action_type = "RUNNING"
            out.set_status(Constants.RUNNING)

            if hours_since > remove_running_x_hours:
                # Report the actual (configurable) threshold rather than a
                # hard-coded duration
                self.logger.debug(
                    "Job {0} for ({1}) removed for running for more than {2} hrs"
                    .format(cluster_id, out, remove_running_x_hours))
                if not fake:
                    Utils.condor_rm([cluster_id])
                action_type = "LONG_RUNNING_REMOVED"

        elif job_status == "I":
            self.logger.debug("Job {0} for ({1}) idle for {2:.1f} hrs".format(
                cluster_id, out, hours_since))
            action_type = "IDLE"
            out.set_status(Constants.IDLE)

        elif job_status == "H":
            self.logger.debug(
                "Job {0} for ({1}) held for {2:.1f} hrs with hold reason: {3}".
                format(cluster_id, out, hours_since,
                       this_job_dict.get("HoldReason", "???")))
            action_type = "HELD"
            out.set_status(Constants.HELD)

            if hours_since > remove_held_x_hours:
                self.logger.info(
                    "Job {0} for ({1}) removed for excessive hold time".format(
                        cluster_id, out))
                if not fake:
                    Utils.condor_rm([cluster_id])
                action_type = "HELD_AND_REMOVED"

        return action_type
示例#3
0
    def test_condor_submission_and_status(self):
        basedir = "/tmp/{0}/metis/condor_test/".format(os.getenv("USER"))
        Utils.do_cmd("mkdir -p {0}".format(basedir))

        with open("{0}/temp_test.sh".format(basedir), "w") as fhout:
            fhout.write("""#!/usr/bin/env bash
echo "--- begin header output ---"
echo "hostname: $(hostname)"
echo "uname -a: $(uname -a)"
echo "time: $(date +%s)"
echo "args: $@"
echo "ls -l output"
ls -l
# logging every 45 seconds gives ~100kb log file/3 hours
dstat -cdngytlmrs --float --nocolor -T --output dsout.csv 45 >& /dev/null &
echo "--- end header output ---"

# run main job stuff
sleep 60s

echo "--- begin dstat output ---"
cat dsout.csv
echo "--- end dstat output ---"
kill %1 # kill dstat

echo "ls -l output"
ls -l
                        """)
        Utils.do_cmd("chmod a+x {0}/temp_test.sh".format(basedir))

        success, cluster_id = Utils.condor_submit(
            executable=basedir + "temp_test.sh",
            arguments=["cat", 10, "foo"],
            inputfiles=[],
            logdir=basedir,
            selection_pairs=[["MyVar1", "METIS_TEST"],
                             ["MyVar2", "METIS_TEST2"]])

        jobs = Utils.condor_q(selection_pairs=[["MyVar1", "METIS_TEST"],
                                               ["MyVar2", "METIS_TEST2"]])
        found_job = len(jobs) >= 1

        Utils.condor_rm([cluster_id])

        self.assertEqual(success, True)
        self.assertEqual(found_job, True)