Example #1
def Drain(self):
    # Bring up a collector and a startd for the test pool.
    self.launch_daemons(["COLLECTOR", "STARTD"])
    output_file = os.path.abspath(os.path.join(testdir, "test.out"))
    if os.path.exists(output_file):
        os.unlink(output_file)
    coll = htcondor.Collector()
    # Wait up to ~10 s for the startd to register with the collector.
    for i in range(10):
        ads = coll.locateAll(htcondor.DaemonTypes.Startd)
        if len(ads) > 0:
            break
        time.sleep(1)
    startd = htcondor.Startd(ads[0])
    # Request a fast drain, then cancel it via the returned request id.
    drain_id = startd.drainJobs(htcondor.DrainTypes.Fast)
    startd.cancelDrainJobs(drain_id)
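
For context, a minimal standalone sketch of the same drain/cancel round trip outside the test harness; it assumes a locally configured pool and the htcondor Python bindings, and the ten-second wait is illustrative.

import time

import htcondor

coll = htcondor.Collector()  # collector from the local HTCondor config
ads = []
for _ in range(10):  # wait up to ~10 s for a startd to register
    ads = coll.locateAll(htcondor.DaemonTypes.Startd)
    if ads:
        break
    time.sleep(1)

if ads:
    startd = htcondor.Startd(ads[0])
    # Request a fast drain, then cancel it using the returned request id.
    drain_id = startd.drainJobs(htcondor.DrainTypes.Fast)
    startd.cancelDrainJobs(drain_id)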
Example #2
def process_startd(
    start_time, since, checkpoint_queue, startd_ad, args, metadata=None
):
    """
    Given a startd, process its entire set of history since the last checkpoint.
    """
    last_completion = since["EnteredCurrentStatus"]
    since_str = f"""(GlobalJobId == "{since['GlobalJobId']}") && (EnteredCurrentStatus == {since['EnteredCurrentStatus']})"""
    my_start = time.time()
    if utils.time_remaining(start_time) < 0:
        message = (
            "No time remaining to process %s history; exiting." % startd_ad["Machine"]
        )
        logging.error(message)
        utils.send_email_alert(
            args.email_alerts, "spider history timeout warning", message
        )
        return since

    metadata = metadata or {}
    startd = htcondor.Startd(startd_ad)
    logging.info(
        "Querying %s for history",
        startd_ad["Machine"]
    )
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    timed_out = False
    if not args.read_only and args.es_feed_startd_history:
        es = elastic.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = startd.history("True", [], since=since_str)
        else:
            history_iter = []

        for job_ad in history_iter:
            try:
                dict_ad = convert.to_json(job_ad, return_dict=True)
            except Exception as e:
                message = f"Failure when converting document on {startd_ad['Machine']} history: {e}"
                exc = traceback.format_exc()
                message += f"\n{exc}"
                logging.warning(message)
                if not sent_warnings:
                    utils.send_email_alert(
                        args.email_alerts,
                        "spider history document conversion error",
                        message,
                    )
                    sent_warnings = True

                continue

            idx = elastic.get_index(
                index_time(args.es_index_date_attr, job_ad),
                template=args.es_index_name,
                update_es=(args.es_feed_startd_history and not args.read_only),
            )
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((convert.unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only and args.es_feed_startd_history:
                    elastic.post_ads(es.handle, idx, ad_list, metadata=metadata)
                logging.debug(
                    "...posting %d ads from %s (process_startd)",
                    len(ad_list),
                    startd_ad["Machine"],
                )
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion is not None and job_completion > last_completion:
                last_completion = job_completion
                since = {
                    "GlobalJobId": job_ad.get("GlobalJobId"),
                    "EnteredCurrentStatus": job_ad.get("EnteredCurrentStatus"),
                }

            if utils.time_remaining(start_time) < 0:
                message = f"History crawler on {startd_ad['Machine']} has been running for more than {utils.TIMEOUT_MINS:d} minutes; exiting."
                logging.error(message)
                utils.send_email_alert(
                    args.email_alerts, "spider history timeout warning", message
                )
                timed_out = True
                break

            if args.process_max_documents and count >= args.process_max_documents:
                logging.warning(
                    "Aborting after %d documents (--process_max_documents option)",
                    args.process_max_documents,
                )
                break

    except RuntimeError:
        message = "Failed to query startd for job history: %s" % startd_ad["Machine"]
        exc = traceback.format_exc()
        message += f"\n{exc}"
        logging.error(message)

    except Exception as exn:
        message = f"Failure when processing startd history query on {startd_ad['Machine']}: {str(exn)}"
        exc = traceback.format_exc()
        message += f"\n{exc}"
        logging.exception(message)
        utils.send_email_alert(
            args.email_alerts, "spider startd history query error", message
        )

    # Post the remaining ads
    for idx, ad_list in list(buffered_ads.items()):
        if ad_list:
            logging.debug(
                "...posting remaining %d ads from %s " "(process_startd)",
                len(ad_list),
                startd_ad["Machine"],
            )
            if not args.read_only and args.es_feed_startd_history:
                elastic.post_ads(es.handle, idx, ad_list, metadata=metadata)

    total_time = (time.time() - my_start) / 60.0
    total_upload /= 60.0
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    logging.warning(
        "Startd %-25s history: response count: %5d; last completion %s; query time %.2f min; upload time %.2f min",
        startd_ad["Machine"],
        count,
        last_formatted,
        total_time - total_upload,
        total_upload,
    )

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((startd_ad["Machine"], since))

    return since
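
A hedged sketch of how process_startd might be driven; the checkpoint shape is taken from the code above, while the empty starting checkpoint and the fields carried on the args namespace are assumptions.

import multiprocessing
import time

import htcondor

def crawl_startd_history(args):
    coll = htcondor.Collector()
    checkpoint_queue = multiprocessing.Queue()
    start_time = time.time()
    for startd_ad in coll.locateAll(htcondor.DaemonTypes.Startd):
        # Hypothetical default for a machine with no stored checkpoint yet.
        since = {"GlobalJobId": "Unknown", "EnteredCurrentStatus": 0}
        process_startd(start_time, since, checkpoint_queue, startd_ad, args)
    # Collect the per-machine checkpoints pushed by process_startd.
    checkpoints = {}
    while not checkpoint_queue.empty():
        machine, since = checkpoint_queue.get()
        checkpoints[machine] = since
    return checkpoints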
Example #3
def process_startd(start_time,
                   since,
                   checkpoint_queue,
                   startd_ad,
                   args,
                   metadata=None):
    """
    Given a startd, process its entire set of history since the last checkpoint.
    """
    my_start = time.time()
    metadata = metadata or {}
    metadata["condor_history_source"] = "startd"
    metadata["condor_history_runtime"] = int(my_start)
    metadata["condor_history_host_version"] = startd_ad.get(
        "CondorVersion", "UNKNOWN")
    metadata["condor_history_host_platform"] = startd_ad.get(
        "CondorPlatform", "UNKNOWN")
    metadata["condor_history_host_machine"] = startd_ad.get(
        "Machine", "UNKNOWN")
    metadata["condor_history_host_name"] = startd_ad.get("Name", "UNKNOWN")
    last_completion = since["EnteredCurrentStatus"]
    since_str = f"""(GlobalJobId == "{since['GlobalJobId']}") && (EnteredCurrentStatus == {since['EnteredCurrentStatus']})"""

    max_ads = args.startd_history_max_ads  # number of history entries to read
    if max_ads > 10000:
        logging.debug(
            "Note that the number of history ads returned per startd is also "
            "capped by the daemon's HISTORY_HELPER_MAX_HISTORY config setting."
        )
        logging.info(
            "Setting startd_history_max_ads too high can cause condor_adstash to break!"
        )

    startd = htcondor.Startd(startd_ad)
    logging.info(
        f"Querying {startd_ad['Machine']} for history since: {since_str}")
    buffered_ads = {}
    count = 0
    total_upload = 0
    timed_out = False
    if not args.read_only:
        es = elastic.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = startd.history(
                requirements="true",
                projection=[],
                match=max_ads,  # default=10000
                since=since_str)
        else:
            history_iter = []

        for job_ad in history_iter:
            try:
                dict_ad = convert.to_json(job_ad, return_dict=True)
            except Exception as e:
                message = f"Failure when converting document on {startd_ad['Machine']} history: {e}"
                exc = traceback.format_exc()
                message += f"\n{exc}"
                logging.warning(message)
                continue

            idx = elastic.get_index(args.es_index_name)
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((convert.unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only:
                    elastic.post_ads(es.handle,
                                     idx,
                                     ad_list,
                                     metadata=metadata)
                logging.debug(
                    f"Posting {len(ad_list)} ads from {startd_ad['Machine']} (process_startd)"
                )
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion is not None and job_completion > last_completion:
                last_completion = job_completion
                since = {
                    "GlobalJobId": job_ad.get("GlobalJobId"),
                    "EnteredCurrentStatus": job_ad.get("EnteredCurrentStatus"),
                }

            if utils.time_remaining(my_start,
                                    args.startd_history_timeout) <= 0:
                message = f"History crawler on {startd_ad['Machine']} has been running for more than {args.startd_history_timeout} seconds; pushing last ads and exiting."
                logging.error(message)
                timed_out = True
                break

    except RuntimeError:
        message = f"Failed to query startd {startd_ad['Machine']} for job history"
        logging.exception(message)
        return since

    except Exception:
        message = f"Failure when processing startd history query on {startd_ad['Machine']}"
        logging.exception(message)
        return since

    # Post the remaining ads
    for idx, ad_list in list(buffered_ads.items()):
        if ad_list:
            logging.debug(
                f"Posting remaining {len(ad_list)} ads from {startd_ad['Machine']} (process_startd)"
            )
            if not args.read_only:
                elastic.post_ads(es.handle, idx, ad_list, metadata=metadata)

    total_time = (time.time() - my_start) / 60.0
    total_upload /= 60.0
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S")
    logging.info(
        f"Startd {startd_ad['Machine']} history: response count: {count}; last job {last_formatted}; query time {total_time - total_upload:.2f} min; upload time {total_upload:.2f} min"
    )
    if count >= max_ads:
        logging.warning(
            f"Max ads ({max_ads}) was reached "
            f"for {startd_ad['Machine']}, some history may be missing!")

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((startd_ad["Machine"], since))

    return since
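
For reference, a sketch of the history call in isolation: the since expression stops the history scan at the first ad it matches, which is what makes the checkpoint above work. This assumes a reachable startd; the match limit of 100 is arbitrary.

import htcondor

coll = htcondor.Collector()
startd_ad = coll.locateAll(htcondor.DaemonTypes.Startd)[0]
startd = htcondor.Startd(startd_ad)

# Checkpoint values as used above; these defaults are illustrative.
since = {"GlobalJobId": "Unknown", "EnteredCurrentStatus": 0}
since_str = (
    f"""(GlobalJobId == "{since['GlobalJobId']}") && """
    f"(EnteredCurrentStatus == {since['EnteredCurrentStatus']})"
)
for job_ad in startd.history("true", [], match=100, since=since_str):
    print(job_ad.get("GlobalJobId"), job_ad.get("EnteredCurrentStatus"))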