示例#1
0
  def __init__(self, argv):
    """Load the job and machine ClassAds and derive per-job settings.

    The ClassAd files are located through the ``_CONDOR_JOB_AD`` and
    ``_CONDOR_MACHINE_AD`` environment variables.  If either one cannot be
    opened or parsed, a critical message is logged and the process exits
    with status 1.

    Args:
      argv: Argument vector; element 0 is dropped and the remainder is kept
        as the command (both as a list and as a space-joined string).
    """

    try:
      # Use a context manager so the file handle is closed deterministically
      # instead of being left to the garbage collector.
      with open(os.environ['_CONDOR_JOB_AD']) as job_ad_file:
        self.classad = classad.parseOne(job_ad_file)
    except Exception as e:
      log.critical("Unable to open classad from environment variable _CONDOR_JOB_AD: {0}".format(e))
      sys.exit(1)

    try:
      with open(os.environ['_CONDOR_MACHINE_AD']) as machine_ad_file:
        self.machine_ad = classad.parseOne(machine_ad_file)
    except Exception as e:
      log.critical("Unable to open machinead from environment variable _CONDOR_MACHINE_AD: {0}".format(e))
      sys.exit(1)

    self.cmd_orig = argv[1:]
    self.cmd = ' '.join(self.cmd_orig)

    self.app = self.classad['HMDCApplicationName']
    self.use_xpra = self.classad['HMDCUseXpra']
    self.localjobdir = self.classad['LocalJobDir'].eval()
    # Per-application output log inside the local job directory.
    self.app_log = "{0}/{1}.out.txt".format(
        self.localjobdir,
        self.app)
    self.__BASENAME__ = os.path.basename(__file__)

    # The machine ad's Memory value is multiplied by 1024 * 1024 here,
    # i.e. it is treated as a MiB count -- TODO confirm against the pool.
    self.memory_bytes = (int(self.machine_ad['Memory']) * 1024) * 1024
示例#2
0
    def run(self):
        '''Watch the job directory and log file statistics until told to exit.

        Reads ``.job.ad`` from the watch directory, creates a per-job CSV log
        under ``LOG_DIR`` and, once per second until the exit event is set,
        appends a row ``[timestamp, number_of_files, total_size]`` computed
        from ``self.stat_monitors``.

        Returns early (after writing a message to stderr) if the log file
        cannot be created.
        '''
        observer = Observer()
        observer.schedule(self, self.__watchdir, recursive=True)

        # Close the job ad file as soon as it has been parsed.
        with open(self.__watchdir + "/.job.ad", "r") as job_ad_file:
            job_ad = classad.parseOne(job_ad_file)
        jobdate = datetime.datetime.fromtimestamp(
            int(job_ad['JobStartDate'])).strftime('%Y-%m-%d %H:%M:%S')

        logname = ''.join([LOG_DIR, job_ad['Owner'], ".",
                           job_ad['UidDomain'], ".", str(job_ad['QDate']),
                           ".", str(job_ad['ClusterId']),
                           ".", str(job_ad['ProcId']), ".log"])
        try:
            # NOTE(review): binary mode suits the Python 2 csv module; under
            # Python 3 csv.writer needs a text-mode file -- confirm target.
            logfile = open(logname, "wb")
        except IOError:
            sys.stderr.write("Problem creating logfile {0}".format(logname))
            return

        # try/finally guarantees the log file is closed even when the
        # monitoring loop raises.
        try:
            logwriter = csv.writer(logfile)
            logwriter.writerow([job_ad['User'], jobdate])
            observer.start()
            while not self.__exit.is_set():
                time.sleep(1)
                files = 0
                file_space = 0
                # Iterate over a copy: the set may change while we stat.
                for item in self.stat_monitors.copy():
                    try:
                        file_space += os.path.getsize(item)
                    except OSError:
                        pass      # File has been deleted during our loop
                    # Deleted files are still counted, as before.
                    files += 1
                logwriter.writerow([int(time.time()), files, file_space])
        finally:
            logfile.close()
def main(job_classad = classad.parseOne(sys.stdin.read())):
  """Decide whether a running interactive job should be preempted.

  Args:
    job_classad: The job ClassAd.  NOTE(review): the default is evaluated
      once, when this function is defined, so standard input is consumed at
      import time rather than at call time.

  Returns:
    0 when the job is not interactive, is not running, or its idle state
    cannot be determined; otherwise the result of ``check_if_preempt``.
  """

  # Jobs without HMDCInteractive (or with a false value) are ignored.
  try:
    if not job_classad['HMDCInteractive']:
      return 0
  except Exception:
    return 0

  # If the job isn't currently running, we don't care.
  # The previous test `2 > status > 2` could never be true, so this early
  # exit was unreachable.
  if job_classad['JobStatus'] != 2:
    log.info('Job is no longer running, exiting.')
    return 0

  job_id = "{0}.{1}".format(
      str(job_classad['ClusterId']),
      str(job_classad['ProcId']))

  log.info ('Job {0}: Running.'.format(job_id))

  try:
    is_job_idle = HMDCCondor()._collector.query(htcondor.AdTypes.Any,
        'JobId =?= "{0}"'.format(job_id),
        ['JobCpuIsIdle'])[0]['JobCpuIsIdle'].eval()
    log.info('Job {0}: Idle? {1}'.format(job_id, is_job_idle))
  except Exception:
    # Best effort: a missing ad or attribute means "do nothing".
    log.info('Job {0}: Unable to evaluate JobCpuIsIdle'.format(job_id))
    return 0

  return check_if_preempt(job_classad, update_job(job_id,
    is_job_idle, job_classad))
示例#4
0
def main(args):
    """Run the job executable inside the parrot package named in the job ad.

    Sets up the environment, reads ``.job.ad`` from the current directory,
    and runs ``parrot_package_run`` on ``condor_exec.exe`` with ``args``.
    Exits with status 1 if the package directory does not exist.  When the
    command succeeds the mount list file is removed.

    Args:
        args: Arguments forwarded unchanged to ``condor_exec.exe``.
    """
    os.environ["HOME"] = HOME_DIR
    os.environ["USER"] = PARROT_USER
    # Parrot variables
    os.environ["PARROT_ALLOW_SWITCHING_CVMFS_REPOSITORIES"] = "yes"
    os.environ["LD_LIBRARY_PATH"] = (os.getenv("LD_LIBRARY_PATH", "") +
                                     ":/usr/local/cctools/lib")
    os.environ["PATH"] = (os.getenv("PATH", "") + ":/usr/local/sbin:" +
                          "/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" +
                          "/usr/local/cctools/bin")

    with open(".job.ad", "r") as job_ad_file:
        job_ad = classad.parseOne(job_ad_file)
    pkg_dir = PKG_HOST_DIR + job_ad['parrotRun']

    if not os.path.isdir(pkg_dir):
        sys.stderr.write("Invalid package specified")
        sys.exit(1)

    # Pass an argument vector instead of a shell string so that job
    # arguments containing spaces or shell metacharacters reach the
    # executable verbatim and cannot be interpreted by a shell.
    parrot_cmd = ["parrot_package_run", "-p", pkg_dir,
                  os.getcwd() + "/condor_exec.exe"] + list(args)
    retval = call(parrot_cmd)

    if retval != 0:
        sys.stderr.write("An error with parrot_pacakge run_occured")
    else:
        os.remove(MOUNTLIST_NAME)  # Don't want condor to transfer the mountlist on exit
示例#5
0
 def classad_to_dict(text):
     """Parse ClassAd text into a dict keyed by lower-cased attribute names.

     Each attribute is evaluated through ``eval``; when that raises
     ``TypeError`` the attribute is stored as returned by item lookup.
     """
     parsed = classad.parseOne(text)
     result = {}
     for name in parsed.keys():
         try:
             value = parsed.eval(name)
         except TypeError:
             value = parsed[name]
         result[name.lower()] = value
     return result
示例#6
0
def classad_parse(inputstr):
    """Turn ``inputstr`` into a classad.

    ``classad.parseOne`` is preferred when the bindings provide it
    (HTCondor 8.3+); older bindings (HTCondor 8.2) only offer
    ``classad.parse``, which 8.3 deprecates.

    """
    if not hasattr(classad, 'parseOne'):
        return classad.parse(inputstr)
    return classad.parseOne(inputstr)
示例#7
0
 def test_parse_one(self):
     """Exercise ``classad.parseOne`` on old- and new-style ad text.

     Inputs given without an explicit parser must pass ``one_ad_verify``;
     forcing the parser that does not match the text's syntax is expected
     to yield an ad with no attributes.
     """
     parse = classad.parseOne
     self.one_ad_verify(parse("foo = 1\nbar = 2"))
     self.one_ad_verify(parse("[foo = 1; bar = 2]"))
     # Old-style text handed to the new-style parser.
     self.assertEqual(len(parse("foo = 1", classad.Parser.New)), 0)
     self.one_ad_verify(parse("foo = 1\nbar = 2\n"))
     self.one_ad_verify(parse("foo = 1\nbar = 1\n\nbar = 2\n"))
     # New-style text handed to the old-style parser.
     self.assertEqual(len(parse("[foo = 1]", classad.Parser.Old)), 0)
     self.one_ad_verify(parse("[foo = 1; bar = 1;] [bar = 2]"))
     self.one_ad_verify(parse("-------\nfoo = 1\nbar = 2\n\n"))
def main():
  """Remove the local job directory of an Xpra job.

  Reads the job ClassAd from standard input and passes it to
  ``is_xpra_job``.  Any failure while reading, parsing or checking the ad
  results in a return value of 0.

  Returns:
    The result of ``remove_dir`` when clean-up ran; 0 in every other case.
  """

  try:
    job_ad = classad.parseOne(sys.stdin.read())
    ad = is_xpra_job(job_ad)
  except Exception:
    return 0

  if ad == False:
    # `ad` is False in this branch, so it cannot be subscripted; take the
    # cluster id from the parsed job ad instead.
    log.info("Job {0} has HMDCUseXpra == False. No clean-up required.".
        format(int(job_ad['ClusterId'])))
    return 0

  try:
    return remove_dir(ad['LocalJobDir'].eval(), ad['ClusterId'])
  except Exception as e:
    log.critical("Encountered exception while removing LocalJobDir: {0}".format(e))
    return 0
示例#9
0
    def _wait_for_ready(self, timeout=120):
        """Poll ``condor_who`` until the pool reports ready, or time out.

        Args:
            timeout: Maximum number of seconds to keep polling.

        Returns:
            ``self``, after setting ``self.state`` to
            ``PersonalPoolState.READY``.

        Raises:
            TimeoutError: If readiness is not observed within ``timeout``.
        """
        daemons = self._daemons()
        master_log_path = self._master_log

        logger.debug("Starting up daemons for {}, waiting for: {}".format(
            self, " ".join(sorted(daemons))))

        start = time.time()
        while time.time() - start < timeout:
            time_to_give_up = int(timeout - (time.time() - start))

            # condor_who cannot be used before the master log exists.
            if not master_log_path.exists():
                logger.debug(
                    "MASTER_LOG at {} does not yet exist for {}, retrying in 1 seconds (giving up in {} seconds)."
                    .format(master_log_path, self, time_to_give_up))
                time.sleep(1)
                continue

            who_command = shlex.split(
                "condor_who -wait:10 'IsReady && STARTD_State =?= \"Ready\"'"
            )
            who = self.run_command(who_command, )
            if who.stdout.strip() == "":
                logger.debug(
                    "condor_who stdout was unexpectedly blank for {}, retrying in 1 second (giving up in {} seconds). condor_who stderr:\n{}"
                    .format(self, time_to_give_up, who.stderr))
                time.sleep(1)
                continue

            who_ad = classad.parseOne(who.stdout)

            # Ready means: the ad says so, the startd is "Ready", and every
            # expected daemon is reported "Alive".
            pool_ready = (who_ad.get("IsReady")
                          and who_ad.get("STARTD_State") == "Ready")
            if pool_ready and all(
                    who_ad.get(name) == "Alive" for name in daemons):
                self.state = PersonalPoolState.READY
                return self

            logger.debug(
                "{} is waiting for daemons to be ready (giving up in {} seconds)"
                .format(self, time_to_give_up))

        raise TimeoutError("Standup for {} failed".format(self))
示例#10
0
 def testScheddSubmitMany(self):
     """Submit ten procs from ``tests/submit.ad`` and check the job output.

     Polls the queue for up to 60 iterations until no ads for the cluster
     remain, then expects the output file to contain ``hello world``.
     """
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     # Context managers close the files instead of leaking the handles.
     with open("tests/submit.ad") as submit_file:
         ad = classad.parseOne(submit_file)
     ads = []
     cluster = schedd.submit(ad, 10, False, ads)
     for i in range(60):
         ads = list(schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"]))
         if not ads:
             break
         # Nudge the schedd on every other iteration.
         if i % 2 == 0:
             schedd.reschedule()
         time.sleep(1)
     with open(output_file) as result_file:
         self.assertEqual(result_file.read(), "hello world\n")
示例#11
0
 def testScheddQueryPoll(self):
     """Run two named queries through ``htcondor.poll`` until the queue drains.

     Submits ten procs from ``submit.ad`` and, for up to 60 iterations,
     polls both query iterators, checking that each ready query carries one
     of the two expected tags.
     """
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     # Close the submit description once it has been parsed.
     with open("submit.ad") as submit_file:
         ad = classad.parseOne(submit_file)
     ads = []
     cluster = schedd.submit(ad, 10, False, ads)
     constraint = "ClusterId == %d" % cluster
     for i in range(60):
         ads_iter = schedd.xquery(constraint, ["JobStatus"], name="query1")
         ads_iter2 = schedd.xquery(constraint, ["JobStatus"], name="query2")
         ads = []
         for query in htcondor.poll([ads_iter, ads_iter2]):
             self.assertTrue(query.tag() in ["query1", "query2"])
             ads += query.nextAdsNonBlocking()
         if not ads:
             break
         if i % 2 == 0:
             schedd.reschedule()
示例#12
0
    def who(self) -> classad.ClassAd:
        """
        Run ``condor_who -quick`` and return its output
        as a :class:`classad.ClassAd`.
        An empty ad is returned when the output cannot be parsed, or when
        the parsed ad does not look like a real who ad.
        """
        result = self.run_command(["condor_who", "-quick"])

        try:
            ad = classad.parseOne(result.stdout)
            # A parsed ad without a MASTER key means condor_who printed its
            # special post-shutdown message; treat that as "nothing there".
            return ad if "MASTER" in ad else classad.ClassAd()
        except Exception:
            return classad.ClassAd()
示例#13
0
 def testScheddSubmitMany2(self):
     """Submit two groups of five procs via ``submitMany`` and check ``foo``.

     Procs with ``ProcId`` below 5 must carry ``foo == 1`` and the others
     ``foo == 2``.  Polls for up to 60 iterations until the cluster leaves
     the queue, then expects the output file to contain ``hello world``.
     """
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     # Context managers close the files instead of leaking the handles.
     with open("submit.ad") as submit_file:
         ad = classad.parseOne(submit_file)
     ads = []
     cluster = schedd.submitMany(ad, [({'foo': 1}, 5), ({'foo': 2}, 5)], False, ads)
     for i in range(60):
         ads = list(schedd.xquery("ClusterId == %d" % cluster, ["JobStatus", 'ProcId', 'foo']))
         # Distinct loop name so the submit ad above is not shadowed.
         for job in ads:
             expected_foo = 1 if job['ProcId'] < 5 else 2
             self.assertEqual(job['foo'], expected_foo)
         if not ads:
             break
         if i % 2 == 0:
             schedd.reschedule()
         time.sleep(1)
     with open(output_file) as result_file:
         self.assertEqual(result_file.read(), "hello world\n")
示例#14
0
 def testScheddQueryPoll(self):
     """Run two named queries through ``htcondor.poll`` until the queue drains.

     Submits ten procs from ``tests/submit.ad`` and, for up to 60
     iterations, polls both query iterators, checking that each ready query
     carries one of the two expected tags.
     """
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     # Close the submit description once it has been parsed.
     with open("tests/submit.ad") as submit_file:
         ad = classad.parseOne(submit_file)
     ads = []
     cluster = schedd.submit(ad, 10, False, ads)
     constraint = "ClusterId == %d" % cluster
     for i in range(60):
         ads_iter = schedd.xquery(constraint, ["JobStatus"], name="query1")
         ads_iter2 = schedd.xquery(constraint, ["JobStatus"], name="query2")
         ads = []
         for query in htcondor.poll([ads_iter, ads_iter2]):
             self.assertTrue(query.tag() in ["query1", "query2"])
             ads += query.nextAdsNonBlocking()
         if not ads:
             break
         if i % 2 == 0:
             schedd.reschedule()
示例#15
0
def read_from_file(filename):
    """Read condor classads from file.

    A generator that yields condor job dicts.  Records are delimited by
    lines starting with ``***``; a record that fails to parse or convert is
    skipped.  Text after the last delimiter is not yielded.

    Args:
        filename (str): filename to read; names ending in ``.gz`` are
            opened with ``gzip.open``.
    """
    # NOTE(review): gzip.open defaults to binary mode, so under Python 3 the
    # str comparisons below would fail for .gz input -- confirm the target
    # interpreter before relying on the gzip path.
    with (gzip.open(filename)
          if filename.endswith('.gz') else open(filename)) as f:
        entry = ''
        # Iterate lazily rather than loading every line with readlines().
        for line in f:
            if not line.startswith('***'):
                entry += line + '\n'
                continue
            record, entry = entry, ''
            try:
                job = classad_to_dict(classad.parseOne(record))
            except Exception:
                # Best effort: drop records that cannot be parsed.
                continue
            # Yield outside the try block.  A bare `except:` around the
            # yield swallowed GeneratorExit, which makes closing the
            # generator early fail with RuntimeError.
            yield job
示例#16
0
 def testScheddSubmitMany2(self):
     """Submit two groups of five procs via ``submitMany`` and check ``foo``.

     Procs with ``ProcId`` below 5 must carry ``foo == 1`` and the others
     ``foo == 2``.  Polls for up to 60 iterations until the cluster leaves
     the queue, then expects the output file to contain ``hello world``.
     """
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     # Context managers close the files instead of leaking the handles.
     with open("tests/submit.ad") as submit_file:
         ad = classad.parseOne(submit_file)
     ads = []
     cluster = schedd.submitMany(ad, [({'foo': 1}, 5), ({'foo': 2}, 5)], False, ads)
     for i in range(60):
         ads = list(schedd.xquery("ClusterId == %d" % cluster, ["JobStatus", 'ProcId', 'foo']))
         # assertEqual replaces the assertEquals alias, which no longer
         # exists in current unittest releases.
         for job in ads:
             expected_foo = 1 if job['ProcId'] < 5 else 2
             self.assertEqual(job['foo'], expected_foo)
         if not ads:
             break
         if i % 2 == 0:
             schedd.reschedule()
         time.sleep(1)
     with open(output_file) as result_file:
         self.assertEqual(result_file.read(), "hello world\n")
示例#17
0
 def testScheddSubmitSpool(self):
     """Submit one spooled job, wait for it, retrieve output and remove it.

     Polls for up to 60 iterations until the job's ``JobStatus`` is 4,
     then retrieves the sandbox, removes the job, and expects the queue to
     be empty and the output file to contain ``hello world``.
     """
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     # Context managers close the files instead of leaking the handles.
     with open("submit.ad") as submit_file:
         ad = classad.parseOne(submit_file)
     result_ads = []
     cluster = schedd.submit(ad, 1, True, result_ads)
     schedd.spool(result_ads)
     constraint = "ClusterId == %d" % cluster
     for i in range(60):
         ads = schedd.query(constraint, ["JobStatus"])
         self.assertEqual(len(ads), 1)
         if ads[0]["JobStatus"] == 4:
             break
         if i % 5 == 0:
             schedd.reschedule()
         time.sleep(1)
     schedd.retrieve(constraint)
     schedd.act(htcondor.JobAction.Remove, ["%d.0" % cluster])
     ads = schedd.query(constraint, ["JobStatus"])
     self.assertEqual(len(ads), 0)
     with open(output_file) as result_file:
         self.assertEqual(result_file.read(), "hello world\n")
示例#18
0
    # so we catch all exceptions, try to write to the outfile if we can
    # and always exit -1 on error.
    #
    # Exiting -1 without an outfile thus means one of two things:
    # 1. Couldn't parse arguments.
    # 2. Couldn't open outfile for writing.

    try:
        args = parse_args()
    except Exception:
        sys.exit(-1)

    try:
        try:
            scratch_dir = Path.cwd()
            job_ad = classad.parseOne((scratch_dir / ".job.ad").read_text())
            out, err = scratch_dir / job_ad["Out"], scratch_dir / job_ad["Err"]
            with out.open(mode="a") as out_file, err.open(
                    mode="a") as err_file:
                with contextlib.redirect_stdout(
                        out_file), contextlib.redirect_stderr(err_file):
                    print("\n------  TRANSFER PLUGIN OUTPUT  ------\n")
                    print("\n------  TRANSFER PLUGIN ERROR   ------\n",
                          file=sys.stderr)
                    main(args)
        except FileNotFoundError:
            main(args)
    except Exception as e:
        tb = traceback.format_exc().replace("\n", " ")
        write_dict_to_file_as_ad(
            {
from datetime import datetime
# Import classad

__BASENAME__ = os.path.basename(__file__)


# Quick and dirty debug function, please replace.
def debug(will_debug, _fd, message):
  """Write a timestamped ``message`` to ``_fd`` when ``will_debug`` is truthy.

  Returns True if the line was written and False otherwise.
  """
  if not will_debug:
    return False
  # NOTE(review): "%s" is not one of Python's documented strftime
  # directives; its result depends on the platform C library -- confirm
  # that this is the intended timestamp format.
  stamp = datetime.utcnow().strftime("%Y%m%d %s")
  line = "[{0}] {1}\n".format(stamp, message)
  _fd.write(line)
  return True

# Parse the job ClassAd supplied on standard input.
job_classad = classad.parseOne(sys.stdin.read())
# Home directory of the current user: uid -> account name -> passwd entry.
home = pwd.getpwnam(pwd.getpwuid(os.getuid())[0]).pw_dir

# 'HMDCNewSubmit'

# Both attributes must be readable from the job ad; if either lookup fails
# the hook has nothing to do and exits with status 0.
try:
  hmdc_new_submit = job_classad['HMDCNewSubmit']
  hmdc_interactive_job = job_classad['HMDCInteractive']
except:
  sys.exit(0)

# Continue only when neither attribute compares equal to False.
if hmdc_new_submit == False or hmdc_interactive_job == False:
  sys.exit(0)

# Should we debug this hook?
try:
示例#20
0
    def testTransaction(self):
        """Check commit and abort behaviour of schedd transactions.

        Covers, in order: a plain committed transaction, a nested
        transaction that commits, a plain transaction aborted by an
        exception, and a nested transaction aborted by an exception.  After
        each abort the previously committed attribute values must still be
        in place.  Finally the job is removed.
        """
        self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
        output_file = os.path.join(testdir, "test.out")
        log_file = os.path.join(testdir, "test.log")
        for stale_path in (output_file, log_file):
            if os.path.exists(stale_path):
                os.unlink(stale_path)
        schedd = htcondor.Schedd()
        # Close the submit description once it has been parsed.
        with open("submit_sleep.ad") as submit_file:
            ad = classad.parseOne(submit_file)
        result_ads = []
        cluster = schedd.submit(ad, 1, True, result_ads)

        job_spec = ["%d.0" % cluster]
        constraint = "ClusterId == %d" % cluster
        nested_flags = (htcondor.TransactionFlags.NonDurable
                        | htcondor.TransactionFlags.ShouldLog)

        # 1. Plain transaction that commits.
        with schedd.transaction():
            schedd.edit(job_spec, 'foo', classad.Literal(1))
            schedd.edit(job_spec, 'bar', classad.Literal(2))
        ads = schedd.query(constraint, ["JobStatus", 'foo', 'bar'])
        self.assertEqual(len(ads), 1)
        self.assertEqual(ads[0]['foo'], 1)
        self.assertEqual(ads[0]['bar'], 2)

        # 2. Nested transaction that commits.
        with schedd.transaction():
            schedd.edit(job_spec, 'baz', classad.Literal(3))
            with schedd.transaction(nested_flags, True):
                schedd.edit(job_spec, 'foo', classad.Literal(4))
                schedd.edit(job_spec, 'bar', classad.Literal(5))
        ads = schedd.query(constraint, ["JobStatus", 'foo', 'bar', 'baz'])
        self.assertEqual(len(ads), 1)
        self.assertEqual(ads[0]['foo'], 4)
        self.assertEqual(ads[0]['bar'], 5)
        self.assertEqual(ads[0]['baz'], 3)

        # 3. Plain transaction aborted by an exception: edits must not stick.
        # Only Exception subclasses are handled, so anything else (for
        # example KeyboardInterrupt) still propagates.
        try:
            with schedd.transaction():
                schedd.edit(job_spec, 'foo', classad.Literal(6))
                schedd.edit(job_spec, 'bar', classad.Literal(7))
                raise Exception("force abort")
        except Exception as e:
            self.assertEqual(str(e), "force abort")
        ads = schedd.query(constraint, ["JobStatus", 'foo', 'bar'])
        self.assertEqual(len(ads), 1)
        self.assertEqual(ads[0]['foo'], 4)
        self.assertEqual(ads[0]['bar'], 5)

        # 4. Nested transaction aborted by an exception in the outer block.
        try:
            with schedd.transaction():
                schedd.edit(job_spec, 'baz', classad.Literal(8))
                with schedd.transaction(nested_flags, True):
                    schedd.edit(job_spec, 'foo', classad.Literal(9))
                    schedd.edit(job_spec, 'bar', classad.Literal(10))
                raise Exception("force abort")
        except Exception as e:
            self.assertEqual(str(e), "force abort")
        ads = schedd.query(constraint, ["JobStatus", 'foo', 'bar', 'baz'])
        self.assertEqual(len(ads), 1)
        self.assertEqual(ads[0]['foo'], 4)
        self.assertEqual(ads[0]['bar'], 5)
        self.assertEqual(ads[0]['baz'], 3)

        # A removed job may persist in the queue for a short time, but its
        # JobStatus will be 3 (REMOVED)
        schedd.act(htcondor.JobAction.Remove, job_spec)
        ads = schedd.query("ClusterId == %d && JobStatus != 3" % cluster,
                           ["JobStatus"])
        self.assertEqual(len(ads), 0)
示例#21
0
 def __init__(self, result):
     """Keep ``result[0]`` as the status and parse ``result[1]`` as a ClassAd."""
     self.status = result[0]
     raw_ad = result[1]
     self.ad = classad.parseOne(raw_ad)
示例#22
0
 def run_next_task():
   """Announce the attach step and start the 'attach_app' dispatcher."""
   # The application name shown in the progress window comes from the job ad.
   app_name = classad.parseOne(ad)['HMDCApplicationName']
   task_label = "Attaching to job {0}".format(app_name)
   self.progress_bar_window.start_task(task_label)
   self.dispatcher = RCEGraphicalTaskDispatcher('attach_app', self.rceapps, self.jobid, ad)
   self.dispatcher.start()
def test_parse_one_ad_from_file_like_object(ad_file):
    """``parseOne`` accepts an open file object and reads the ad from it."""
    # Close the file once parsing is done rather than leaking the handle.
    with ad_file.open(mode="r") as f:
        ad = classad.parseOne(f)

    assert ad["foo"] == "wiz"
示例#24
0
                "_id": id
            }}) + "\n"
        body += ad + "\n"
    print es.bulk(body=body)['took']


for fname in sys.argv[1:]:
    print "Processing file", fname
    fp = os.popen("condor_history -file %s -l" % fname)

    count = 0
    ad = ''
    ads = []
    for line in fp.xreadlines():
        if line == "\n":
            job_ad = classad.parseOne(ad)
            if not job_ad:
                continue
            #print job_ad
            json_ad = convert_to_json(job_ad)
            #print es.index(index=idx, doc_type="job", body=json_ad, id=job_ad["GlobalJobId"])
            ads.append((job_ad["GlobalJobId"], json_ad))
            count += 1
            ad = ''
            if len(ads) == 100:
                post_ads(ads)
                ads = []
        ad += line
    if ad:
        job_ad = classad.parseOne(line)
        if 'GlobalJobId' in job_ad:
示例#25
0
 def test_load_classad_from_file_v2(self):
     """Parse ``tests/test.ad`` from a file object and check its attributes."""
     # Close the file once parsing is done rather than leaking the handle.
     with open("tests/test.ad") as ad_file:
         ad = classad.parseOne(ad_file)
     self.assertEqual(ad["foo"], "bar")
     self.assertEqual(ad["baz"], classad.Value.Undefined)
     # "bar" is not an attribute of the ad, so item lookup must fail.
     self.assertRaises(KeyError, ad.__getitem__, "bar")
示例#26
0
 def test_load_classad_from_file_v2(self):
     """Parse ``tests/test.ad`` from a file object and check its attributes."""
     # Close the file once parsing is done rather than leaking the handle.
     with open("tests/test.ad") as ad_file:
         ad = classad.parseOne(ad_file)
     self.assertEqual(ad["foo"], "bar")
     self.assertEqual(ad["baz"], classad.Value.Undefined)
     # "bar" is not an attribute of the ad, so item lookup must fail.
     self.assertRaises(KeyError, ad.__getitem__, "bar")
示例#27
0
 def redo_sites(self, new_submit_text, crab_retry, use_resubmit_info):
     """
     Re-define the set of sites where the job can run on by taking into account
     any site-white-list and site-black-list.

     Args:
         new_submit_text: Submit-file text built so far; the site related
             lines are added to it.
         crab_retry: Current retry number of the job.
         use_resubmit_info: When true, take the black- and whitelists from
             ``self.resubmit_info`` instead of from the task ad.

     Returns:
         The updated submit-file text.

     Exits the process with ``self.prejob_exit_code`` set to 1 when no site
     is left to run on.
     """
     ## If there is an automatic site blacklist, add it to the Job.<job_id>.submit
     ## content.
     automatic_siteblacklist = self.calculate_blacklist()
     if automatic_siteblacklist:
         self.task_ad[
             'CRAB_SiteAutomaticBlacklist'] = automatic_siteblacklist
         new_submit_text += '+CRAB_SiteAutomaticBlacklist = %s\n' % str(
             self.task_ad.lookup('CRAB_SiteAutomaticBlacklist'))
     ## Get the site black- and whitelists either from the task ad or from
     ## self.resubmit_info.
     siteblacklist = set()
     sitewhitelist = set()
     if not use_resubmit_info:
         if 'CRAB_SiteBlacklist' in self.task_ad:
             siteblacklist = set(self.task_ad['CRAB_SiteBlacklist'])
         if 'CRAB_SiteWhitelist' in self.task_ad:
             sitewhitelist = set(self.task_ad['CRAB_SiteWhitelist'])
     else:
         ## Start from the previous retry and walk back to the most recent
         ## retry number that has stored information.
         inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
         while inkey not in self.resubmit_info and int(inkey) > 0:
             inkey = str(int(inkey) - 1)
         siteblacklist = set(self.resubmit_info[inkey].get(
             'site_blacklist', []))
         sitewhitelist = set(self.resubmit_info[inkey].get(
             'site_whitelist', []))
     ## Save the current site black- and whitelists in self.resubmit_info for the
     ## current job retry number.
     outkey = str(crab_retry)
     if outkey not in self.resubmit_info:
         self.resubmit_info[outkey] = {}
     self.resubmit_info[outkey]['site_blacklist'] = list(siteblacklist)
     self.resubmit_info[outkey]['site_whitelist'] = list(sitewhitelist)
     ## Add the current site black- and whitelists to the Job.<job_id>.submit
     ## content.
     if siteblacklist:
         new_submit_text += '+CRAB_SiteBlacklist = {"%s"}\n' % (
             '", "'.join(siteblacklist))
     else:
         new_submit_text += '+CRAB_SiteBlacklist = {}\n'
     if sitewhitelist:
         new_submit_text += '+CRAB_SiteWhitelist = {"%s"}\n' % (
             '", "'.join(sitewhitelist))
     else:
         new_submit_text += '+CRAB_SiteWhitelist = {}\n'
     ## Get the list of available sites (the sites where this job could run).
     ## Data locations are only provided by site.ad.json; they stay None when
     ## the site.ad file is used.
     datasites = None
     if os.path.exists("site.ad.json"):
         with open("site.ad.json") as fd:
             site_info = json.load(fd)
         group = site_info[self.job_id]
         available = set(site_info['group_sites'][str(group)])
         datasites = set(site_info['group_datasites'][str(group)])
     else:
         with open("site.ad") as fd:
             site_ad = classad.parseOne(fd)
         available = set(site_ad['Job%s' % (self.job_id)])
     ## Take the intersection between the available sites and the site whitelist.
     ## This is the new set of available sites.
     if sitewhitelist:
         available &= sitewhitelist
     ## Remove from the available sites the ones that are in the site blacklist,
     ## unless they are also in the site whitelist (i.e. never blacklist something
     ## on the whitelist).
     siteblacklist.update(automatic_siteblacklist)
     available -= (siteblacklist - sitewhitelist)
     if not available:
         self.logger.error(
             "Can not submit since DESIRED_Sites list is empty")
         self.prejob_exit_code = 1
         sys.exit(self.prejob_exit_code)
     ## Add DESIRED_SITES to the Job.<job_id>.submit content.
     new_submit_text = '+DESIRED_SITES="%s"\n%s' % (",".join(available),
                                                    new_submit_text)
     ## Only emit the data locations when they are known.  Referencing them
     ## unconditionally raised NameError whenever site.ad was used.
     if datasites is not None:
         new_submit_text = '+DESIRED_CMSDataLocations="%s"\n%s' % (
             ",".join(datasites), new_submit_text)
     return new_submit_text
def test_parse_one_ad_from_string(ad_string):
    """``parseOne`` accepts ad text given as a plain string."""
    parsed = classad.parseOne(ad_string)
    foo_value = parsed["foo"]

    assert foo_value == "wiz"
示例#29
0
    body = ''
    for id, ad in ads:
        body += json.dumps({"index": {"_index": idx, "_type": "job", "_id": id}}) + "\n"
        body += ad + "\n"
    print es.bulk(body=body)['took']

for fname in sys.argv[1:]:
    print "Processing file", fname
    fp = os.popen("condor_history -file %s -l" % fname)

    count = 0
    ad = ''
    ads = []
    for line in fp.xreadlines():
        if line == "\n":
            job_ad = classad.parseOne(ad)
            if not job_ad:
                continue
            #print job_ad
            json_ad = convert_to_json(job_ad)
            #print es.index(index=idx, doc_type="job", body=json_ad, id=job_ad["GlobalJobId"])
            ads.append((job_ad["GlobalJobId"], json_ad))
            count += 1
            ad = ''
            if len(ads) == 100:
                post_ads(ads)
                ads = []
        ad += line
    if ad:
        job_ad = classad.parseOne(line)
        if 'GlobalJobId' in job_ad:
 def __init__(self, result):
   """Keep ``result[0]`` as the status and parse ``result[1]`` as a ClassAd."""
   self.status = result[0]
   raw_ad = result[1]
   self.ad = classad.parseOne(raw_ad)
示例#31
0
    def executeInternal(self, *args):
        """Estimate a new job splitting from finished jobs and submit a subdag.

        Positional arguments:
            args[0]: stage name; the code handles 'processing' and 'tail'.
            args[1]: completion threshold, converted with ``int``.
            args[2]: prefix used in the subdag file name and, for the tail
                stage, passed as the parent to ``createSubdag``.

        Returns:
            4 if the "completion" threshold is not reached,
            1 if splitting or DAG creation raises ``TaskWorkerException``,
            0 otherwise (including "nothing to process for completion").
        """
        self.stage = args[0]
        self.completion = int(args[1])
        self.prefix = args[2]

        self.setupLog()

        self.statusCacheInfo = {
        }  # Will be filled with the status from the status cache

        self.readJobStatus()
        completed = set(self.completedJobs(stage=self.stage))
        # Not enough completed jobs yet: signal the caller with code 4.
        if len(completed) < self.completion:
            return 4

        self.readProcessedJobs()
        unprocessed = completed - self.processedJobs
        # `estimates` is the set of jobs whose throughput files are read below.
        estimates = copy.copy(unprocessed)
        self.logger.info("jobs remaining to process: %s",
                         ", ".join(sorted(unprocessed)))
        # In the tail stage, when every candidate job failed, fall back to
        # the completed jobs of the processing stage for the estimate.
        if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
            estimates = set(
                self.completedJobs(stage='processing', processFailed=False))
        # NOTE(review): this repeats the log line above with the same
        # `unprocessed` value -- possibly `estimates` was meant; confirm.
        self.logger.info("jobs remaining to process: %s",
                         ", ".join(sorted(unprocessed)))

        # The TaskWorker saves some files that now we are gonna read
        # NOTE(review): pickle.load is only safe on trusted files -- confirm
        # these are always written by the TaskWorker itself.
        with open('datadiscovery.pkl', 'rb') as fd:
            dataset = pickle.load(fd)  # Output from the discovery process
        with open('taskinformation.pkl', 'rb') as fd:
            task = pickle.load(
                fd
            )  # A dictionary containing information about the task as in the Oracle DB
        with open('taskworkerconfig.pkl', 'rb') as fd:
            config = pickle.load(fd)  # Task worker configuration

        # need to use user proxy as credential for talking with cmsweb
        config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
        config.TaskWorker.cmskey = os.environ.get('X509_USER_PROXY')
        config.TaskWorker.envForCMSWEB = newX509env(
            X509_USER_CERT=config.TaskWorker.cmscert,
            X509_USER_KEY=config.TaskWorker.cmskey)

        # need to get username from classAd to setup for Rucio access
        # NOTE(review): the file object passed to parseOne is never closed
        # explicitly.
        task_ad = classad.parseOne(open(os.environ['_CONDOR_JOB_AD']))
        username = task_ad['CRAB_UserHN']
        config.Services.Rucio_account = username

        # need the global black list
        config.TaskWorker.scratchDir = './scratchdir'
        if not os.path.exists(config.TaskWorker.scratchDir):
            os.makedirs(config.TaskWorker.scratchDir)
        from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
        banSites = CRAB3BanDestinationSites(config, self.logger)
        with config.TaskWorker.envForCMSWEB:
            banSites.execute()

        # Read the automatic_splitting/throughputs/0-N files where the PJ
        # saved the EventThroughput
        # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
        # and the average size of the output per event
        sumEventsThr = 0
        sumEventsSize = 0
        count = 0
        for jid in estimates:
            if jid in self.failedJobs:
                continue
            fn = "automatic_splitting/throughputs/{0}".format(jid)
            with open(fn) as fd:
                throughput, eventsize = json.load(fd)
                sumEventsThr += throughput
                sumEventsSize += eventsize
                count += 1
        # NOTE(review): `count` stays 0 when `estimates` is empty or holds
        # only failed jobs, in which case these divisions raise
        # ZeroDivisionError -- confirm whether that can happen here.
        eventsThr = sumEventsThr / count
        eventsSize = sumEventsSize / count

        self.logger.info("average throughput for %s jobs: %s evt/s", count,
                         eventsThr)
        self.logger.info("average eventsize for %s jobs: %s bytes", count,
                         eventsSize)

        # Upper bound on events per job derived from the maximum output size;
        # 0 disables the cap below.
        maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum',
                          5 * 1000**3)
        maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

        runtime = task['tm_split_args'].get('minutes_per_job', -1)
        # NOTE(review): `target` is only assigned for the 'processing' and
        # 'tail' stages; any other stage value would leave it unbound.
        if self.stage == "processing":
            # Build in a 33% error margin in the runtime to not create too
            # many tails. This essentially moves the peak to lower
            # runtimes and cuts off less of the job distribution tail.
            # NOTE(review): the factor below is 0.75 -- confirm it matches
            # the 33% quoted above.
            target = int(0.75 * runtime)
        elif self.stage == 'tail':
            target = int(
                max(
                    getattr(config.TaskWorker,
                            'automaticTailRuntimeMinimumMins', 45),
                    getattr(config.TaskWorker, 'automaticTailRuntimeFraction',
                            0.2) * runtime))
        # `target` is in minutes, `eventsThr` is in events/second!
        events = int(target * eventsThr * 60)
        if events > maxEvents and maxEvents > 0:
            self.logger.info(
                "reduced the target event count from %s to %s to obey output size",
                events, maxEvents)
            events = int(maxEvents)
        # NOTE(review): dict(task) is a shallow copy, so the nested
        # 'tm_split_args' dict is shared and the assignment below also
        # changes task['tm_split_args'] -- confirm this is intended.
        splitTask = dict(task)
        splitTask['tm_split_algo'] = 'EventAwareLumiBased'
        splitTask['tm_split_args']['events_per_job'] = events

        if self.stage == 'tail' and not self.adjustLumisForCompletion(
                splitTask, unprocessed):
            self.logger.info("nothing to process for completion")
            self.saveProcessedJobs(unprocessed)
            return 0

        # Disable retries for processing: every lumi is attempted to be
        # processed once in processing, thrice in the tails -> four times.
        # That should be enough "retries"
        #
        # See note in DagmanCreator about getting this from the Task DB
        if self.stage == "processing":
            config.TaskWorker.numAutomJobRetries = 0

        # Split the dataset with the adjusted task; failure returns 1.
        try:
            splitter = Splitter(config, crabserver=None)
            split_result = splitter.execute(dataset, task=splitTask)
            self.logger.info("Splitting results:")
            for g in split_result.result[0]:
                msg = "Created jobgroup with length {0}".format(
                    len(g.getJobs()))
                self.logger.info(msg)
        except TaskWorkerException as e:
            retmsg = "Splitting failed with:\n{0}".format(e)
            self.logger.error(retmsg)
            #            self.set_dashboard_state('FAILED')
            return 1
        # Create the subdag from the split result and submit it; failure
        # returns 1.
        try:
            parent = self.prefix if self.stage == 'tail' else None
            rucioClient = getNativeRucioClient(config=config,
                                               logger=self.logger)
            creator = DagmanCreator(config,
                                    crabserver=None,
                                    rucioClient=rucioClient)
            with config.TaskWorker.envForCMSWEB:
                creator.createSubdag(split_result.result,
                                     task=task,
                                     parent=parent,
                                     stage=self.stage)
            self.submitSubdag(
                'RunJobs{0}.subdag'.format(self.prefix),
                getattr(config.TaskWorker, 'maxIdle', MAX_IDLE_JOBS),
                getattr(config.TaskWorker, 'maxPost', MAX_POST_JOBS),
                self.stage)
        except TaskWorkerException as e:
            retmsg = "DAG creation failed with:\n{0}".format(e)
            self.logger.error(retmsg)
            #            self.set_dashboard_state('FAILED')
            return 1
        # Record the jobs handled in this pass.
        self.saveProcessedJobs(unprocessed)
        return 0
示例#32
0
def main(args):
    """Upload every cached URL transfer with the plugin that supports it.

    Each file in ``TRANSFER_PLUGIN_CACHE`` is unpickled into an
    ``(output_file, destination)`` pair. For every pair the protocol of the
    destination is determined, the first built-in file-transfer plugin that
    advertises that protocol is selected, and the plugin is invoked with
    ``-infile``/``-outfile``/``-upload``.

    Args:
        args: Mapping that provides ``args["outfile"]``, the path the result
            ad is written (or moved) to.

    Side effects:
        Writes a success ad to ``args["outfile"]`` when there is nothing to
        do or when every transfer succeeded. On the first plugin failure the
        plugin's own output ad (if it produced one) is moved to
        ``args["outfile"]`` and the process exits with status ``-1``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; this presumably
    # only reads entries written by this plugin itself -- confirm that the
    # cache directory is not writable by untrusted parties.
    transfers = []
    for cached in Path(TRANSFER_PLUGIN_CACHE).iterdir():
        # Close each cache file as soon as it has been read instead of
        # leaving the handle to the garbage collector.
        with cached.open("rb") as fh:
            transfers.append(pickle.load(fh))

    print(f"Found {len(transfers)} URL transfers to process.\n")

    success_ad = {
        "TransferSuccess": True,
        "TransferFileName": "",
        "TransferUrl": "",
    }

    if not transfers:
        print("Nothing to do!")
        write_dict_to_file_as_ad(success_ad, args["outfile"])
        return

    builtin_plugins = htcondor.param["FILETRANSFER_PLUGINS"].split(", ")

    # Ask every plugin (in reverse configuration order) which methods it
    # supports; the insertion order of this dict is the search order.
    available_methods = {}
    for plugin in reversed(builtin_plugins):
        probe = subprocess.run([plugin, "-classad"], stdout=subprocess.PIPE)
        plugin_ad = classad.parseOne(probe.stdout.decode("utf-8"))
        available_methods[plugin] = plugin_ad["SupportedMethods"].split(",")

    print("Available plugins and methods (in search order):")
    for k, v in available_methods.items():
        print(f"{k} => {v}")
    print()

    deferred_transfers = []
    for output_file, destination in transfers:
        protocol = determine_protocol(destination)
        plugin = find_first_plugin(available_methods, protocol)
        print(
            f"Will transfer {output_file} to {destination} using protocol {protocol} implemented by plugin {plugin}"
        )
        deferred_transfers.append(
            DeferredTransfer(output_file=output_file,
                             destination=destination,
                             plugin=plugin))

    # TODO: group transfers by plugin

    working = Path(USER_URL_TRANSFER_WORKING)
    working.mkdir(parents=True, exist_ok=True)
    for transfer in deferred_transfers:
        infile = working / f"{transfer.id}.in"
        outfile = working / f"{transfer.id}.out"

        # The plugin reads its instructions from an ad in the input file.
        infile.write_text(
            str(
                classad.ClassAd({
                    "LocalFileName": str(transfer.output_file),
                    "Url": transfer.destination,
                })))

        cmd = [
            transfer.plugin,
            "-infile",
            str(infile),
            "-outfile",
            str(outfile),
            "-upload",
        ]
        print(f"Invoking {' '.join(cmd)}")
        run_plugin = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        if run_plugin.returncode != 0:
            print(
                f"Plugin {transfer.plugin} failed! Its return code was {run_plugin.returncode}"
            )
            print("Captured stdout:")
            print(run_plugin.stdout.decode())
            print("Captured stderr:")
            print(run_plugin.stderr.decode())

            # Hand the plugin's own result ad back to the caller. A plugin
            # that died early may not have written one; in that case still
            # exit with the failure status instead of raising
            # FileNotFoundError from rename().
            if outfile.exists():
                outfile.rename(Path(args["outfile"]))
            else:
                print(f"Plugin {transfer.plugin} did not write {outfile}")
            sys.exit(-1)

        print(
            f"Transferred {transfer.output_file} to {transfer.destination} successfully!"
        )

    write_dict_to_file_as_ad(success_ad, args["outfile"])
# Import classad

# Final path component (file name without directory) of this script.
__BASENAME__ = os.path.basename(__file__)


# Quick and dirty debug function, please replace.
def debug(will_debug, _fd, message):
    """Write a timestamped ``message`` line to ``_fd`` if debugging is on.

    Args:
        will_debug: Truthy to emit the message, falsy to do nothing.
        _fd: Writable text stream; only touched when ``will_debug`` is truthy.
        message: Text placed after the ``[timestamp]`` prefix.

    Returns:
        ``True`` when the line was written, ``False`` otherwise.
    """
    if not will_debug:
        return False

    # NOTE(review): "%s" is not a standard strftime directive; what it
    # expands to depends on the platform C library -- confirm it is intended.
    stamp = datetime.utcnow().strftime("%Y%m%d %s")
    line = "[{0}] {1}\n".format(stamp, message)
    _fd.write(line)
    return True


# Parse the job ClassAd supplied on standard input.
job_classad = classad.parseOne(sys.stdin.read())
# Home directory of the account this process runs as (uid -> name -> entry).
home = pwd.getpwnam(pwd.getpwuid(os.getuid())[0]).pw_dir

# 'HMDCNewSubmit'

# Both attributes must be present in the job ad; if either is missing there
# is nothing to do, so exit successfully. Only the missing-attribute case is
# caught -- a bare "except:" would also hide unrelated errors and swallow
# SystemExit / KeyboardInterrupt.
try:
    hmdc_new_submit = job_classad['HMDCNewSubmit']
    hmdc_interactive_job = job_classad['HMDCInteractive']
except KeyError:
    sys.exit(0)

# Explicit comparison against False is kept on purpose: the attribute values
# come from the ClassAd and are not guaranteed to be plain Python booleans,
# so "not value" could behave differently.
if hmdc_new_submit == False or hmdc_interactive_job == False:
    sys.exit(0)

# Should we debug this hook?
try:
示例#34
0
def main():
    """
    Prepare the task's working directory before its DAG is (re)submitted.

    Steps, in order:

    1. Set up logging and read the job ad from the file named by the
       ``_CONDOR_JOB_AD`` environment variable. If the variable is unset or
       the file does not exist, exit with status 0.
    2. Build a ``CRABRest`` client from the ad and call ``checkTaskInfo``.
    3. If ``WEB_DIR`` does not exist yet (first run for this task), create
       the web directory, upload its URL (up to three attempts; exit with
       status 1 if all fail) and save the proxied web directory.
    4. Clear the automatic blacklist.
    5. If the ad carries ``CRAB_ResubmitList``, adjust the POST script exit
       status in the DAG node logs and the maximum retries of the adjusted
       jobs. For the ``Automatic`` split algorithm the tail DAGs are held
       while this happens and released afterwards.
    6. If the ad carries ``CRAB_SiteAdUpdate``, merge it into ``site.ad``.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(
            os.environ["_CONDOR_JOB_AD"]):
        printLog(
            "Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist"
        )
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" %
             os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOne(fd)
    printLog("Parsed ad: %s" % ad)

    # instantiate a server object to talk with crabserver
    host = ad['CRAB_RestHost']
    dbInstance = ad['CRAB_DbInstance']
    cert = ad['X509UserProxy']
    crabserver = CRABRest(host, cert, cert, retry=3, userAgent='CRABSchedd')
    crabserver.setDbInstance(dbInstance)

    checkTaskInfo(crabserver, ad)

    # is this the first time this script runs for this task ? (it runs at each resubmit as well !)
    if not os.path.exists('WEB_DIR'):
        makeWebDir(ad)
        printLog(
            "Webdir has been set up. Uploading the webdir URL to the REST")

        retries = 0
        exitCode = 1
        maxRetries = 3
        while retries < maxRetries and exitCode != 0:
            exitCode = uploadWebDir(crabserver, ad)
            if exitCode != 0:
                # Linear back-off: 0s, 20s, 40s after successive failures.
                time.sleep(retries * 20)
            retries += 1
        if exitCode != 0:
            printLog(
                "Exiting AdjustSites because the webdir upload failed %d times."
                % maxRetries)
            sys.exit(1)
        printLog(
            "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir"
            % exitCode)

        saveProxiedWebdir(crabserver, ad)
        printLog("Proxied webdir saved")

    printLog(
        "Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    # resubmitJobIds ends up as one of:
    #   []            -> nothing to resubmit
    #   list of str   -> explicit job ids (duplicates removed)
    #   True          -> the ad value was not iterable
    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(
        ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log",
                            "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                # The file is opened in its own "with" so the handle is
                # closed once the lock has been released.
                with open(fn, 'a') as logFd:
                    with htcondor.lock(logFd, htcondor.LockType.WriteLock):
                        adjustedJobIds.extend(
                            adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(
                    adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parseOne(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")