def process_docs(
    docs,
    reduce_data=True,
    pool_name="UNKNOWN",
    feed_es=True,
    es_index="cms-test-k8s",
    metadata=None,
    feed_es_only_completed=True,
    start_time=None,
):
    """Celery task which converts the documents into a format suitable for sending to AMQ and ES.

    Sends the converted ClassAd JSON to AMQ via the `amq_post_ads` function and,
    if required, to ES by calling the `post_ads_es` task.

    Args:
        docs (list[dict]): iterable with the jobs' ClassAds.
        reduce_data (bool): Should we slim down the running/pending job records?
            Related to `keep_full_queue_data`; see the `send_data` method:
            `reduce_data=not keep_full_queue_data`.
        pool_name (str): Pool of the source schedd.
        feed_es (bool): Should we send the data to ES?
        es_index (str): Elasticsearch index prefix.
        metadata (dict): Dictionary with additional metadata (used only for ES documents).
        feed_es_only_completed (bool): Send only the completed/removed documents to ES
            instead of all of them. Defaults to True.
        start_time (float): Comes from celery_spider_cms -> query_schedd -> send_data.

    Returns:
        htcondor_es.amq.amq_post_ads: a tuple of (successful tasks, total tasks, elapsed time).
    """
    converted_docs = []  #: list(tuple(str, dict)): unique_doc_id and converted ClassAd dict pairs.
    es_docs = []
    for doc in docs:
        try:
            c_doc = convert_to_json(
                doc,
                return_dict=True,
                reduce_data=reduce_data,
                pool_name=pool_name,
                start_time=start_time,
            )
            if c_doc:
                converted_docs.append(
                    (unique_doc_id(c_doc), convert_dates_to_millisecs(c_doc.copy())))
            if (feed_es and c_doc and
                    (not feed_es_only_completed or
                     c_doc["Status"] in ("Completed", "Removed"))):
                es_docs.append((unique_doc_id(c_doc), c_doc))
        except Exception as e:
            print("WARNING: Error on convert_to_json: {} {}".format(
                str(e), str(traceback.format_exc())))
            continue
    if es_docs:
        post_ads_es.si(es_docs, es_index, metadata).apply_async()
    return amq_post_ads(converted_docs, metadata=metadata) if converted_docs else []
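# Illustrative sketch (not part of the original module): how process_docs might be
# invoked directly with a couple of already-queried ClassAd dicts. The field names,
# metadata payload, and pool name below are assumptions for the example; real documents
# come from the schedd queries and carry many more attributes. Note that the call still
# posts to AMQ, so valid broker credentials are needed for it to do anything useful.
def _example_process_docs_call():
    sample_docs = [
        {"GlobalJobId": "schedd1#123.0#1700000000", "Status": "Completed", "QDate": 1700000000},
        {"GlobalJobId": "schedd1#124.0#1700000100", "Status": "Running", "QDate": 1700000100},
    ]
    return process_docs(
        sample_docs,
        reduce_data=True,
        pool_name="Global",
        feed_es=False,  # skip ES in this sketch; only post to AMQ
        metadata={"spider_source": "condor_history"},
        start_time=time.time(),
    )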
def process_schedd(starttime, last_completion, checkpoint_queue, schedd_ad, args):
    """
    Given a schedd, process its entire set of history since last checkpoint.
    """
    my_start = time.time()
    if time_remaining(starttime) < 0:
        message = ("No time remaining to process %s history; exiting." %
                   schedd_ad['Name'])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms history timeout warning", message)
        return last_completion

    schedd = htcondor.Schedd(schedd_ad)
    history_query = classad.ExprTree("EnteredCurrentStatus >= %d" % last_completion)
    logging.info("Querying %s for history: %s. "
                 "%.1f minutes of ads",
                 schedd_ad["Name"], history_query,
                 (time.time() - last_completion) / 60.)
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    timed_out = False
    if not args.read_only:
        if args.feed_es:
            es = htcondor_es.es.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = schedd.history(history_query, [], 10000)
        else:
            history_iter = []

        for job_ad in history_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad, return_dict=True)
            except Exception as e:
                message = ("Failure when converting document on %s history: %s" %
                           (schedd_ad["Name"], str(e)))
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms history document conversion error", message)
                    sent_warnings = True

            if not dict_ad:
                continue

            idx = htcondor_es.es.get_index(job_ad["QDate"],
                                           template=args.es_index_template,
                                           update_es=(args.feed_es and not args.read_only))
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only:
                    if args.feed_es:
                        data_for_es = [(id_, json.dumps(dict_ad))
                                       for id_, dict_ad in ad_list]
                        htcondor_es.es.post_ads(es=es.handle, idx=idx,
                                                ads=data_for_es)
                    if args.feed_amq:
                        data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                        for id_, dict_ad in ad_list]
                        htcondor_es.amq.post_ads(data_for_amq)
                logging.debug("...posting %d ads from %s (process_schedd)",
                              len(ad_list), schedd_ad["Name"])
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            # Find the most recent job and use that date as the new
            # last_completion date
            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion > last_completion:
                last_completion = job_completion

            if time_remaining(starttime) < 0:
                message = ("History crawler on %s has been running for "
                           "more than %d minutes; exiting." %
                           (schedd_ad["Name"], TIMEOUT_MINS))
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)
                timed_out = True
                break

    except RuntimeError:
        logging.error("Failed to query schedd for job history: %s",
                      schedd_ad["Name"])
    except Exception as exn:
        message = ("Failure when processing schedd history query on %s: %s" %
                   (schedd_ad["Name"], str(exn)))
        logging.exception(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd history query error", message)

    # Post the remaining ads
    for idx, ad_list in buffered_ads.items():
        if ad_list:
            logging.debug("...posting remaining %d ads from %s "
                          "(process_schedd)",
                          len(ad_list), schedd_ad["Name"])
            if not args.read_only:
                if args.feed_es:
                    htcondor_es.es.post_ads(es=es.handle, idx=idx,
                                            ads=[(id_, json.dumps(dict_ad))
                                                 for id_, dict_ad in ad_list])
                if args.feed_amq:
                    data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                    for id_, dict_ad in ad_list]
                    htcondor_es.amq.post_ads(data_for_amq)

    total_time = (time.time() - my_start) / 60.
    total_upload /= 60.
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S")
    logging.warning(
        "Schedd %-25s history: response count: %5d; last completion %s; "
        "query time %.2f min; upload time %.2f min",
        schedd_ad["Name"], count, last_formatted,
        total_time - total_upload, total_upload)

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((schedd_ad["Name"], last_completion))

    return last_completion
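# Illustrative sketch (assumption, not the original driver): one way the history crawler
# above could be run over a list of schedd ads while collecting the per-schedd checkpoints
# it puts on checkpoint_queue. The `checkpoints` dict, the 12-hour fallback lookback, and
# the sequential loop are choices made only for this example; `args` is the parsed CLI
# namespace that the real spider_cms driver already builds.
def _example_drive_history(schedd_ads, args, checkpoints):
    import multiprocessing

    starttime = time.time()
    checkpoint_queue = multiprocessing.Queue()
    for schedd_ad in schedd_ads:
        last_completion = checkpoints.get(schedd_ad["Name"], starttime - 12 * 3600)
        process_schedd(starttime, last_completion, checkpoint_queue, schedd_ad, args)
    # Drain the (schedd name, last_completion) pairs written by process_schedd.
    while not checkpoint_queue.empty():
        name, completion = checkpoint_queue.get()
        checkpoints[name] = completion
    return checkpoints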
def query_schedd_queue(starttime, schedd_ad, queue, args):
    my_start = time.time()
    pool_name = schedd_ad.get("CMS_Pool", "Unknown")
    logging.info("Querying %s queue for jobs.", schedd_ad["Name"])
    if time_remaining(starttime) < 10:
        message = ("No time remaining to run queue crawler on %s; "
                   "exiting." % schedd_ad["Name"])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms queue timeout warning", message)
        return

    count_since_last_report = 0
    count = 0
    cpu_usage = resource.getrusage(resource.RUSAGE_SELF).ru_utime
    queue.put(schedd_ad["Name"], timeout=time_remaining(starttime))

    schedd = htcondor.Schedd(schedd_ad)
    sent_warnings = False
    batch = []

    # Query for a snapshot of the running/idle/held jobs, but only the completed
    # jobs that changed within the last period of time.
    _completed_since = starttime - (TIMEOUT_MINS + 1) * 60
    query = ("""
        (JobStatus < 3 || JobStatus > 4
         || EnteredCurrentStatus >= %(completed_since)d
         || CRAB_PostJobLastUpdate >= %(completed_since)d)
        && (CMS_Type != "DONOTMONIT")
        """ % {"completed_since": _completed_since})
    try:
        query_iter = schedd.xquery(requirements=query) if not args.dry_run else []
        for job_ad in query_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(
                    job_ad,
                    return_dict=True,
                    reduce_data=not args.keep_full_queue_data,
                    pool_name=pool_name)
            except Exception as e:
                message = "Failure when converting document on %s queue: %s" % (
                    schedd_ad["Name"],
                    str(e),
                )
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms queue document conversion error",
                        message,
                    )
                    sent_warnings = True

            if not dict_ad:
                continue

            batch.append((unique_doc_id(dict_ad), dict_ad))
            count += 1
            count_since_last_report += 1

            if not args.dry_run and len(batch) == args.query_queue_batch_size:
                if time_remaining(starttime) < 10:
                    message = ("Queue crawler on %s has been running for "
                               "more than %d minutes; exiting" %
                               (schedd_ad["Name"], TIMEOUT_MINS))
                    logging.error(message)
                    send_email_alert(args.email_alerts,
                                     "spider_cms queue timeout warning", message)
                    break
                queue.put(batch, timeout=time_remaining(starttime))
                batch = []

            if count_since_last_report >= 1000:
                cpu_usage_now = resource.getrusage(resource.RUSAGE_SELF).ru_utime
                cpu_usage = cpu_usage_now - cpu_usage
                processing_rate = count_since_last_report / cpu_usage
                cpu_usage = cpu_usage_now
                logging.info(
                    "Processor for %s has processed %d jobs "
                    "(%.1f jobs per CPU-second)",
                    schedd_ad["Name"],
                    count,
                    processing_rate,
                )
                count_since_last_report = 0

            if args.max_documents_to_process and count > args.max_documents_to_process:
                logging.warning(
                    "Aborting after %d documents (--max_documents_to_process option)"
                    % args.max_documents_to_process)
                break

    except RuntimeError as e:
        logging.error("Failed to query schedd %s for jobs: %s",
                      schedd_ad["Name"], str(e))
    except Exception as e:
        message = "Failure when processing schedd queue query on %s: %s" % (
            schedd_ad["Name"],
            str(e),
        )
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd queue query error", message)
        traceback.print_exc()

    if batch:  # send remaining docs
        queue.put(batch, timeout=time_remaining(starttime))
        batch = []

    queue.put(schedd_ad["Name"], timeout=time_remaining(starttime))
    total_time = (time.time() - my_start) / 60.0
    logging.warning(
        "Schedd %-25s queue: response count: %5d; "
        "query time %.2f min; ",
        schedd_ad["Name"],
        count,
        total_time,
    )
    return count
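# Illustrative sketch (assumption, not the original listener): query_schedd_queue speaks a
# simple protocol on `queue` -- it puts the schedd name once before the batches and once
# again after the last batch. A consumer can therefore treat the second occurrence of a
# name as "this schedd is done". The `handle_batch` callback below is hypothetical.
def _example_drain_queue(queue, n_schedds, handle_batch):
    finished = 0
    seen = set()
    while finished < n_schedds:
        item = queue.get()
        if isinstance(item, str):  # schedd name sentinel
            if item in seen:
                finished += 1  # second occurrence: this schedd finished its batches
            else:
                seen.add(item)
        else:
            handle_batch(item)  # list of (unique_doc_id, dict_ad) tuples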
def process_schedd(starttime, last_completion, checkpoint_queue, schedd_ad, args,
                   metadata=None):
    """
    Given a schedd, process its entire set of history since last checkpoint.
    """
    my_start = time.time()
    pool_name = schedd_ad.get("CMS_Pool", "Unknown")
    if time_remaining(starttime) < 0:
        message = ("No time remaining to process %s history; exiting." %
                   schedd_ad["Name"])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms history timeout warning", message)
        return last_completion

    metadata = metadata or {}
    schedd = htcondor.Schedd(schedd_ad)
    _q = """
        (EnteredCurrentStatus >= %(last_completion)d
         || CRAB_PostJobLastUpdate >= %(last_completion)d)
        && (CMS_Type != "DONOTMONIT")
        """
    history_query = classad.ExprTree(_q % {"last_completion": last_completion})
    logging.info(
        "Querying %s for history: %s. "
        "%.1f minutes of ads",
        schedd_ad["Name"],
        history_query,
        (time.time() - last_completion) / 60.0,
    )
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    timed_out = False
    if not args.read_only:
        if args.feed_es:
            es = htcondor_es.es.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = schedd.history(history_query, [], 10000)
        else:
            history_iter = []

        for job_ad in history_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad, return_dict=True,
                                          pool_name=pool_name)
            except Exception as e:
                message = "Failure when converting document on %s history: %s" % (
                    schedd_ad["Name"],
                    str(e),
                )
                exc = traceback.format_exc()
                message += "\n{}".format(exc)
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms history document conversion error",
                        message,
                    )
                    sent_warnings = True

            if not dict_ad:
                continue

            idx = htcondor_es.es.get_index(
                job_ad["QDate"],
                template=args.es_index_template,
                update_es=(args.feed_es and not args.read_only),
            )
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only:
                    if args.feed_es:
                        htcondor_es.es.post_ads(es.handle, idx, ad_list,
                                                metadata=metadata)
                    if args.feed_amq:
                        data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                        for id_, dict_ad in ad_list]
                        htcondor_es.amq.post_ads(data_for_amq, metadata=metadata)
                logging.debug(
                    "...posting %d ads from %s (process_schedd)",
                    len(ad_list),
                    schedd_ad["Name"],
                )
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            # Find the most recent job and use that date as the new
            # last_completion date
            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion > last_completion:
                last_completion = job_completion

            if time_remaining(starttime) < 0:
                message = ("History crawler on %s has been running for "
                           "more than %d minutes; exiting." %
                           (schedd_ad["Name"], TIMEOUT_MINS))
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)
                timed_out = True
                break

            if args.max_documents_to_process and count > args.max_documents_to_process:
                logging.warning(
                    "Aborting after %d documents (--max_documents_to_process option)"
                    % args.max_documents_to_process)
                break

    except RuntimeError:
        message = "Failed to query schedd for job history: %s" % schedd_ad["Name"]
        exc = traceback.format_exc()
        message += "\n{}".format(exc)
        logging.error(message)
    except Exception as exn:
        message = "Failure when processing schedd history query on %s: %s" % (
            schedd_ad["Name"],
            str(exn),
        )
        exc = traceback.format_exc()
        message += "\n{}".format(exc)
        logging.exception(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd history query error", message)

    # Post the remaining ads
    for idx, ad_list in list(buffered_ads.items()):
        if ad_list:
            logging.debug(
                "...posting remaining %d ads from %s "
                "(process_schedd)",
                len(ad_list),
                schedd_ad["Name"],
            )
            if not args.read_only:
                if args.feed_es:
                    htcondor_es.es.post_ads(es.handle, idx, ad_list,
                                            metadata=metadata)
                if args.feed_amq:
                    data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                    for id_, dict_ad in ad_list]
                    htcondor_es.amq.post_ads(data_for_amq, metadata=metadata)

    total_time = (time.time() - my_start) / 60.0
    total_upload /= 60.0
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S")
    logging.warning(
        "Schedd %-25s history: response count: %5d; last completion %s; "
        "query time %.2f min; upload time %.2f min",
        schedd_ad["Name"],
        count,
        last_formatted,
        total_time - total_upload,
        total_upload,
    )

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((schedd_ad["Name"], last_completion))

    return last_completion
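# Illustrative sketch: what the incremental history constraint built above selects.
# `classad` is the same module already imported by this file; the attribute values and
# the helper name are made up for the example.
def _example_history_constraint(last_completion=1700000000):
    expr = classad.ExprTree(
        "(EnteredCurrentStatus >= %(last_completion)d"
        " || CRAB_PostJobLastUpdate >= %(last_completion)d)"
        " && (CMS_Type != \"DONOTMONIT\")" % {"last_completion": last_completion})
    ad = classad.ClassAd()
    ad["EnteredCurrentStatus"] = last_completion + 60  # finished after the checkpoint
    ad["CRAB_PostJobLastUpdate"] = 0
    ad["CMS_Type"] = "analysis"
    ad["_selected"] = expr
    return ad.eval("_selected")  # True -> this job would be picked up by the crawler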
def process_schedd_queue(starttime, schedd_ad, queue, args):
    my_start = time.time()
    logging.info("Querying %s queue for jobs.", schedd_ad["Name"])
    if time_remaining(starttime) < 10:
        message = ("No time remaining to run queue crawler on %s; "
                   "exiting." % schedd_ad['Name'])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms queue timeout warning", message)
        return

    count_since_last_report = 0
    count = 0
    cpu_usage = resource.getrusage(resource.RUSAGE_SELF).ru_utime
    queue.put(schedd_ad['Name'], timeout=time_remaining(starttime))

    schedd = htcondor.Schedd(schedd_ad)
    sent_warnings = False
    batch = []
    try:
        query_iter = schedd.xquery() if not args.dry_run else []
        for job_ad in query_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(
                    job_ad,
                    return_dict=True,
                    reduce_data=not args.keep_full_queue_data)
            except Exception as e:
                message = ("Failure when converting document on %s queue: %s" %
                           (schedd_ad["Name"], str(e)))
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms queue document conversion error", message)
                    sent_warnings = True

            if not dict_ad:
                continue

            batch.append((unique_doc_id(dict_ad), dict_ad))
            count += 1
            count_since_last_report += 1

            if not args.dry_run and len(batch) == args.query_queue_batch_size:
                if time_remaining(starttime) < 10:
                    message = ("Queue crawler on %s has been running for "
                               "more than %d minutes; exiting" %
                               (schedd_ad['Name'], TIMEOUT_MINS))
                    logging.error(message)
                    send_email_alert(args.email_alerts,
                                     "spider_cms queue timeout warning", message)
                    break
                queue.put(batch, timeout=time_remaining(starttime))
                batch = []

            if count_since_last_report >= 1000:
                cpu_usage_now = resource.getrusage(resource.RUSAGE_SELF).ru_utime
                cpu_usage = cpu_usage_now - cpu_usage
                processing_rate = count_since_last_report / cpu_usage
                cpu_usage = cpu_usage_now
                logging.info("Processor for %s has processed %d jobs "
                             "(%.1f jobs per CPU-second)",
                             schedd_ad['Name'], count, processing_rate)
                count_since_last_report = 0

    except RuntimeError as e:
        logging.error("Failed to query schedd %s for jobs: %s",
                      schedd_ad["Name"], str(e))
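# Illustrative sketch (assumption): every crawler above uses time_remaining() as a budget
# check against TIMEOUT_MINS. A minimal version consistent with those call sites could
# look like the following; the real helper lives elsewhere in spider_cms and may differ.
def _example_time_remaining(starttime):
    # Seconds left before the crawler exceeds its TIMEOUT_MINS budget; negative once over.
    return starttime + TIMEOUT_MINS * 60 - time.time()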