Example #1
def process_docs(
    docs,
    reduce_data=True,
    pool_name="UNKNOWN",
    feed_es=True,
    es_index="cms-test-k8s",
    metadata=None,
    feed_es_only_completed=True,
    start_time=None,
):
    """Celery task which processes the documents to a suitable format send to AMQ and ES.

    Sends converted ClassAd json to AMQ by running `amq_post_ads` function
    and also to ES if required by calling `post_ads_es` task.

    Args:
        docs (list[dict]): Iterable with the jobs' ClassAds.
        reduce_data (bool): Should we slim down the records of running/pending jobs? Related to
            `keep_full_queue_data`; see the `send_data` method: `reduce_data=not keep_full_queue_data`.
        pool_name (str): Pool of the source schedd
        feed_es (bool): Should we send the data to ES?
        es_index (str): Elasticsearch index prefix
        metadata (dict): dictionary with the additional metadata (used only for ES documents).
        feed_es_only_completed (bool): Should we send only the completed/removed documents to ES,
            or all of them? Defaults to True.
        start_time (float): Comes from celery_spider_cms -> query_schedd -> send_data

    Returns:
        tuple: (successful tasks, total tasks, elapsed time), as returned by
            `htcondor_es.amq.amq_post_ads`; an empty list if no documents were converted.

    """
    converted_docs = []
    #: list(tuple(str, dict)): unique_doc_id and converted ClassAd dict pairs.
    es_docs = []
    for doc in docs:
        try:
            c_doc = convert_to_json(
                doc,
                return_dict=True,
                reduce_data=reduce_data,
                pool_name=pool_name,
                start_time=start_time,
            )
            if c_doc:
                converted_docs.append(
                    (unique_doc_id(c_doc),
                     convert_dates_to_millisecs(c_doc.copy())))
            if (feed_es and c_doc
                    and (not feed_es_only_completed
                         or c_doc["Status"] in ("Completed", "Removed"))):
                es_docs.append((unique_doc_id(c_doc), c_doc))
        except Exception as e:
            print("WARNING: Error on convert_to_json: {} {}".format(
                str(e), str(traceback.format_exc())))
            continue
    if es_docs:
        post_ads_es.si(es_docs, es_index, metadata).apply_async()
    return amq_post_ads(converted_docs,
                        metadata=metadata) if converted_docs else []
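
A minimal usage sketch for the task above, assuming `process_docs` is importable from its Celery task module (the import path and the sample ClassAd fields are assumptions, not part of the original code):

# Hypothetical smoke test for process_docs; the import path and the sample
# ClassAd fields are assumptions.
from htcondor_es.celery.tasks import process_docs

sample_docs = [
    {"GlobalJobId": "schedd.example.com#123.0#1700000000",
     "JobStatus": 4, "QDate": 1700000000},
]
# Call synchronously while testing; in production the task would be queued
# with process_docs.delay(...) or process_docs.apply_async(...).
result = process_docs(sample_docs, pool_name="Volunteer", feed_es=False)
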
Example #2
def process_schedd(starttime, last_completion, checkpoint_queue, schedd_ad,
                   args):
    """
    Given a schedd, process its entire history since the last checkpoint.
    """
    my_start = time.time()
    if time_remaining(starttime) < 0:
        message = ("No time remaining to process %s history; exiting." %
                   schedd_ad['Name'])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms history timeout warning", message)
        return last_completion

    schedd = htcondor.Schedd(schedd_ad)
    history_query = classad.ExprTree("EnteredCurrentStatus >= %d" %
                                     last_completion)
    logging.info("Querying %s for history: %s.  "
                 "%.1f minutes of ads", schedd_ad["Name"], history_query,
                 (time.time() - last_completion) / 60.)
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    timed_out = False
    if not args.read_only:
        if args.feed_es:
            es = htcondor_es.es.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = schedd.history(history_query, [], 10000)
        else:
            history_iter = []

        for job_ad in history_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad, return_dict=True)
            except Exception as e:
                message = (
                    "Failure when converting document on %s history: %s" %
                    (schedd_ad["Name"], str(e)))
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms history document conversion error",
                        message)
                    sent_warnings = True

            if not dict_ad:
                continue

            idx = htcondor_es.es.get_index(job_ad["QDate"],
                                           template=args.es_index_template,
                                           update_es=(args.feed_es
                                                      and not args.read_only))
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only:
                    if args.feed_es:
                        data_for_es = [(id_, json.dumps(dict_ad))
                                       for id_, dict_ad in ad_list]
                        htcondor_es.es.post_ads(es=es.handle,
                                                idx=idx,
                                                ads=data_for_es)
                    if args.feed_amq:
                        data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                        for id_, dict_ad in ad_list]
                        htcondor_es.amq.post_ads(data_for_amq)
                logging.debug("...posting %d ads from %s (process_schedd)",
                              len(ad_list), schedd_ad["Name"])
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            # Find the most recent job and use that date as the new
            # last_completion date
            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion and job_completion > last_completion:
                last_completion = job_completion

            if time_remaining(starttime) < 0:
                message = ("History crawler on %s has been running for "
                           "more than %d minutes; exiting." %
                           (schedd_ad["Name"], TIMEOUT_MINS))
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)
                timed_out = True
                break

    except RuntimeError:
        logging.error("Failed to query schedd for job history: %s",
                      schedd_ad["Name"])

    except Exception as exn:
        message = ("Failure when processing schedd history query on %s: %s" %
                   (schedd_ad["Name"], str(exn)))
        logging.exception(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd history query error", message)

    # Post the remaining ads
    for idx, ad_list in buffered_ads.items():
        if ad_list:
            logging.debug(
                "...posting remaining %d ads from %s "
                "(process_schedd)", len(ad_list), schedd_ad["Name"])
            if not args.read_only:
                if args.feed_es:
                    htcondor_es.es.post_ads(es=es.handle,
                                            idx=idx,
                                            ads=[(id_, json.dumps(dict_ad))
                                                 for id_, dict_ad in ad_list])
                if args.feed_amq:
                    data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                    for id_, dict_ad in ad_list]
                    htcondor_es.amq.post_ads(data_for_amq)

    total_time = (time.time() - my_start) / 60.
    total_upload /= 60.
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S")
    logging.warning(
        "Schedd %-25s history: response count: %5d; last completion %s; "
        "query time %.2f min; upload time %.2f min", schedd_ad["Name"], count,
        last_formatted, total_time - total_upload, total_upload)

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((schedd_ad["Name"], last_completion))

    return last_completion
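
`time_remaining()` and `TIMEOUT_MINS` are not shown in these examples; a plausible sketch consistent with how they are used above (the budget value and the exact signature are assumptions):

import time

TIMEOUT_MINS = 11  # overall crawl budget in minutes; the actual value is an assumption

def time_remaining(starttime, timeout=TIMEOUT_MINS * 60):
    """Seconds left in the crawl budget; negative once the budget is spent."""
    return timeout - (time.time() - starttime)
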
Example #3
def query_schedd_queue(starttime, schedd_ad, queue, args):
    my_start = time.time()
    pool_name = schedd_ad.get("CMS_Pool", "Unknown")
    logging.info("Querying %s queue for jobs.", schedd_ad["Name"])
    if time_remaining(starttime) < 10:
        message = ("No time remaining to run queue crawler on %s; "
                   "exiting." % schedd_ad["Name"])
        logging.error(message)
        send_email_alert(args.email_alerts, "spider_cms queue timeout warning",
                         message)
        return

    count_since_last_report = 0
    count = 0
    cpu_usage = resource.getrusage(resource.RUSAGE_SELF).ru_utime
    queue.put(schedd_ad["Name"], timeout=time_remaining(starttime))

    schedd = htcondor.Schedd(schedd_ad)
    sent_warnings = False
    batch = []
    # Query for a snapshot of the running/idle/held jobs, but include
    # completed jobs only if they changed within the last crawl window.
    _completed_since = starttime - (TIMEOUT_MINS + 1) * 60
    query = ("""
         (JobStatus < 3 || JobStatus > 4 
         || EnteredCurrentStatus >= %(completed_since)d
         || CRAB_PostJobLastUpdate >= %(completed_since)d
         ) && (CMS_Type != "DONOTMONIT")
         """ % {
        "completed_since": _completed_since
    })
    try:
        query_iter = schedd.xquery(
            requirements=query) if not args.dry_run else []
        for job_ad in query_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(
                    job_ad,
                    return_dict=True,
                    reduce_data=not args.keep_full_queue_data,
                    pool_name=pool_name)
            except Exception as e:
                message = "Failure when converting document on %s queue: %s" % (
                    schedd_ad["Name"],
                    str(e),
                )
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms queue document conversion error",
                        message,
                    )
                    sent_warnings = True

            if not dict_ad:
                continue

            batch.append((unique_doc_id(dict_ad), dict_ad))
            count += 1
            count_since_last_report += 1

            if not args.dry_run and len(batch) == args.query_queue_batch_size:
                if time_remaining(starttime) < 10:
                    message = ("Queue crawler on %s has been running for "
                               "more than %d minutes; exiting" %
                               (schedd_ad["Name"], TIMEOUT_MINS))
                    logging.error(message)
                    send_email_alert(args.email_alerts,
                                     "spider_cms queue timeout warning",
                                     message)
                    break
                queue.put(batch, timeout=time_remaining(starttime))
                batch = []
                if count_since_last_report >= 1000:
                    cpu_usage_now = resource.getrusage(
                        resource.RUSAGE_SELF).ru_utime
                    cpu_usage = cpu_usage_now - cpu_usage
                    processing_rate = count_since_last_report / cpu_usage
                    cpu_usage = cpu_usage_now
                    logging.info(
                        "Processor for %s has processed %d jobs "
                        "(%.1f jobs per CPU-second)",
                        schedd_ad["Name"],
                        count,
                        processing_rate,
                    )
                    count_since_last_report = 0

            if args.max_documents_to_process and count > args.max_documents_to_process:
                logging.warning(
                    "Aborting after %d documents (--max_documents_to_process option)"
                    % args.max_documents_to_process)
                break

    except RuntimeError as e:
        logging.error("Failed to query schedd %s for jobs: %s",
                      schedd_ad["Name"], str(e))
    except Exception as e:
        message = "Failure when processing schedd queue query on %s: %s" % (
            schedd_ad["Name"],
            str(e),
        )
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd queue query error", message)
        traceback.print_exc()

    if batch:  # send remaining docs
        queue.put(batch, timeout=time_remaining(starttime))
        batch = []

    queue.put(schedd_ad["Name"], timeout=time_remaining(starttime))
    total_time = (time.time() - my_start) / 60.0
    logging.warning(
        "Schedd %-25s queue: response count: %5d; "
        "query time %.2f min; ",
        schedd_ad["Name"],
        count,
        total_time,
    )

    return count
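
Note the queue protocol in the example above: the schedd name is put on the queue once before the first batch and once after the last one, so a listener can bracket each schedd's stream. A hedged consumer sketch (the function and the counting scheme are assumptions):

# Hypothetical listener: a str item is a schedd-name sentinel (first
# occurrence opens the stream, second closes it); a list item is a batch
# of (unique_doc_id, dict_ad) pairs.
def listen(queue, n_schedds):
    sentinels_seen = {}
    finished = 0
    while finished < n_schedds:
        item = queue.get()
        if isinstance(item, str):
            sentinels_seen[item] = sentinels_seen.get(item, 0) + 1
            if sentinels_seen[item] == 2:
                finished += 1
        else:
            for doc_id, dict_ad in item:
                pass  # upload or buffer the converted ad here
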
Example #4
def process_schedd(starttime,
                   last_completion,
                   checkpoint_queue,
                   schedd_ad,
                   args,
                   metadata=None):
    """
    Given a schedd, process its entire history since the last checkpoint.
    """
    my_start = time.time()
    pool_name = schedd_ad.get("CMS_Pool", "Unknown")
    if time_remaining(starttime) < 0:
        message = ("No time remaining to process %s history; exiting." %
                   schedd_ad["Name"])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms history timeout warning", message)
        return last_completion

    metadata = metadata or {}
    schedd = htcondor.Schedd(schedd_ad)
    _q = """
        ( EnteredCurrentStatus >= %(last_completion)d 
        || CRAB_PostJobLastUpdate >= %(last_completion)d )
        && (CMS_Type != "DONOTMONIT")
        """
    history_query = classad.ExprTree(_q % {"last_completion": last_completion})
    logging.info(
        "Querying %s for history: %s.  "
        "%.1f minutes of ads",
        schedd_ad["Name"],
        history_query,
        (time.time() - last_completion) / 60.0,
    )
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    timed_out = False
    if not args.read_only:
        if args.feed_es:
            es = htcondor_es.es.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = schedd.history(history_query, [], 10000)
        else:
            history_iter = []

        for job_ad in history_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad,
                                          return_dict=True,
                                          pool_name=pool_name)
            except Exception as e:
                message = "Failure when converting document on %s history: %s" % (
                    schedd_ad["Name"],
                    str(e),
                )
                exc = traceback.format_exc()
                message += "\n{}".format(exc)
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms history document conversion error",
                        message,
                    )
                    sent_warnings = True

            if not dict_ad:
                continue

            idx = htcondor_es.es.get_index(
                job_ad["QDate"],
                template=args.es_index_template,
                update_es=(args.feed_es and not args.read_only),
            )
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only:
                    if args.feed_es:
                        htcondor_es.es.post_ads(es.handle,
                                                idx,
                                                ad_list,
                                                metadata=metadata)
                    if args.feed_amq:
                        data_for_amq = [(id_,
                                         convert_dates_to_millisecs(dict_ad))
                                        for id_, dict_ad in ad_list]
                        htcondor_es.amq.post_ads(data_for_amq,
                                                 metadata=metadata)

                logging.debug(
                    "...posting %d ads from %s (process_schedd)",
                    len(ad_list),
                    schedd_ad["Name"],
                )
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            # Find the most recent job and use that date as the new
            # last_completion date
            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion and job_completion > last_completion:
                last_completion = job_completion

            if time_remaining(starttime) < 0:
                message = ("History crawler on %s has been running for "
                           "more than %d minutes; exiting." %
                           (schedd_ad["Name"], TIMEOUT_MINS))
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)
                timed_out = True
                break

            if args.max_documents_to_process and count > args.max_documents_to_process:
                logging.warning(
                    "Aborting after %d documents (--max_documents_to_process option)"
                    % args.max_documents_to_process)
                break

    except RuntimeError:
        message = "Failed to query schedd for job history: %s" % schedd_ad[
            "Name"]
        exc = traceback.format_exc()
        message += "\n{}".format(exc)
        logging.error(message)

    except Exception as exn:
        message = "Failure when processing schedd history query on %s: %s" % (
            schedd_ad["Name"],
            str(exn),
        )
        exc = traceback.format_exc()
        message += "\n{}".format(exc)
        logging.exception(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd history query error", message)

    # Post the remaining ads
    for idx, ad_list in list(buffered_ads.items()):
        if ad_list:
            logging.debug(
                "...posting remaining %d ads from %s "
                "(process_schedd)",
                len(ad_list),
                schedd_ad["Name"],
            )
            if not args.read_only:
                if args.feed_es:
                    htcondor_es.es.post_ads(es.handle,
                                            idx,
                                            ad_list,
                                            metadata=metadata)
                if args.feed_amq:
                    data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                    for id_, dict_ad in ad_list]
                    htcondor_es.amq.post_ads(data_for_amq, metadata=metadata)

    total_time = (time.time() - my_start) / 60.0
    total_upload /= 60.0
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S")
    logging.warning(
        "Schedd %-25s history: response count: %5d; last completion %s; "
        "query time %.2f min; upload time %.2f min",
        schedd_ad["Name"],
        count,
        last_formatted,
        total_time - total_upload,
        total_upload,
    )

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((schedd_ad["Name"], last_completion))

    return last_completion
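
The `(name, last_completion)` tuples placed on `checkpoint_queue` must be persisted by a consumer; a hedged sketch of such a writer, assuming a simple JSON checkpoint file (the file name and format are assumptions):

import json

def update_checkpoint(name, last_completion, checkpoint_file="checkpoint.json"):
    """Persist the newest EnteredCurrentStatus timestamp per schedd."""
    try:
        with open(checkpoint_file) as fd:
            checkpoint = json.load(fd)
    except (IOError, ValueError):
        checkpoint = {}  # first run or unreadable file: start fresh
    # Only ever move the checkpoint forward.
    if last_completion > checkpoint.get(name, 0):
        checkpoint[name] = last_completion
        with open(checkpoint_file, "w") as fd:
            json.dump(checkpoint, fd)
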
Example #5
def process_schedd_queue(starttime, schedd_ad, queue, args):
    my_start = time.time()
    logging.info("Querying %s queue for jobs.", schedd_ad["Name"])
    if time_remaining(starttime) < 10:
        message = ("No time remaining to run queue crawler on %s; "
                   "exiting." % schedd_ad['Name'])
        logging.error(message)
        send_email_alert(args.email_alerts, "spider_cms queue timeout warning",
                         message)
        return

    count_since_last_report = 0
    count = 0
    cpu_usage = resource.getrusage(resource.RUSAGE_SELF).ru_utime
    queue.put(schedd_ad['Name'], timeout=time_remaining(starttime))

    schedd = htcondor.Schedd(schedd_ad)
    sent_warnings = False
    batch = []
    try:
        query_iter = schedd.xquery() if not args.dry_run else []
        for job_ad in query_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(
                    job_ad,
                    return_dict=True,
                    reduce_data=not args.keep_full_queue_data)
            except Exception as e:
                message = ("Failure when converting document on %s queue: %s" %
                           (schedd_ad["Name"], str(e)))
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms queue document conversion error", message)
                    sent_warnings = True

            if not dict_ad:
                continue

            batch.append((unique_doc_id(dict_ad), dict_ad))
            count += 1
            count_since_last_report += 1

            if not args.dry_run and len(batch) == args.query_queue_batch_size:
                if time_remaining(starttime) < 10:
                    message = ("Queue crawler on %s has been running for "
                               "more than %d minutes; exiting" %
                               (schedd_ad['Name'], TIMEOUT_MINS))
                    logging.error(message)
                    send_email_alert(args.email_alerts,
                                     "spider_cms queue timeout warning",
                                     message)
                    break
                queue.put(batch, timeout=time_remaining(starttime))
                batch = []
                if count_since_last_report >= 1000:
                    cpu_usage_now = resource.getrusage(
                        resource.RUSAGE_SELF).ru_utime
                    cpu_usage = cpu_usage_now - cpu_usage
                    processing_rate = count_since_last_report / cpu_usage
                    cpu_usage = cpu_usage_now
                    logging.info(
                        "Processor for %s has processed %d jobs "
                        "(%.1f jobs per CPU-second)", schedd_ad['Name'], count,
                        processing_rate)
                    count_since_last_report = 0

    except RuntimeError as e:
        logging.error("Failed to query schedd %s for jobs: %s",
                      schedd_ad["Name"], str(e))