def Drain(self):
    """Locate a startd, request a fast drain, then cancel the drain request."""
    self.launch_daemons(["COLLECTOR", "STARTD"])
    output_file = os.path.abspath(os.path.join(testdir, "test.out"))
    if os.path.exists(output_file):
        os.unlink(output_file)
    coll = htcondor.Collector()
    # Wait up to ~10 seconds for the startd to register with the collector
    for i in range(10):
        ads = coll.locateAll(htcondor.DaemonTypes.Startd)
        if len(ads) > 0:
            break
        time.sleep(1)
    startd = htcondor.Startd(ads[0])
    drain_id = startd.drainJobs(htcondor.DrainTypes.Fast)
    startd.cancelDrainJobs(drain_id)
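# A minimal standalone sketch of the same drain round-trip outside the test
# harness, assuming a reachable collector and at least one startd in the pool.
# The retry loop and the DrainTypes.Fast / cancelDrainJobs calls mirror the
# test above; the error handling is illustrative, not part of the original.
import time

import htcondor


def drain_and_cancel(retries=10):
    coll = htcondor.Collector()
    ads = []
    for _ in range(retries):
        # locateAll returns one ad per startd currently known to the collector
        ads = coll.locateAll(htcondor.DaemonTypes.Startd)
        if ads:
            break
        time.sleep(1)
    if not ads:
        raise RuntimeError("No startd found in the pool")
    startd = htcondor.Startd(ads[0])
    # Request a fast drain, then immediately cancel it using the returned id
    drain_id = startd.drainJobs(htcondor.DrainTypes.Fast)
    startd.cancelDrainJobs(drain_id)


if __name__ == "__main__":
    drain_and_cancel()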
def process_startd(
    start_time, since, checkpoint_queue, startd_ad, args, metadata=None
):
    """
    Given a startd, process its entire set of history since last checkpoint.
    """
    last_completion = since["EnteredCurrentStatus"]
    since_str = f"""(GlobalJobId == "{since['GlobalJobId']}") && (EnteredCurrentStatus == {since['EnteredCurrentStatus']})"""
    my_start = time.time()
    if utils.time_remaining(start_time) < 0:
        message = (
            "No time remaining to process %s history; exiting." % startd_ad["Machine"]
        )
        logging.error(message)
        utils.send_email_alert(
            args.email_alerts, "spider history timeout warning", message
        )
        return since

    metadata = metadata or {}
    startd = htcondor.Startd(startd_ad)
    logging.info("Querying %s for history", startd_ad["Machine"])
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    timed_out = False
    if not args.read_only and args.es_feed_startd_history:
        es = elastic.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = startd.history("True", [], since=since_str)
        else:
            history_iter = []

        for job_ad in history_iter:
            try:
                dict_ad = convert.to_json(job_ad, return_dict=True)
            except Exception as e:
                message = f"Failure when converting document on {startd_ad['Machine']} history: {e}"
                exc = traceback.format_exc()
                message += f"\n{exc}"
                logging.warning(message)
                if not sent_warnings:
                    utils.send_email_alert(
                        args.email_alerts,
                        "spider history document conversion error",
                        message,
                    )
                    sent_warnings = True
                continue

            idx = elastic.get_index(
                index_time(args.es_index_date_attr, job_ad),
                template=args.es_index_name,
                update_es=(args.es_feed_startd_history and not args.read_only),
            )
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((convert.unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only and args.es_feed_startd_history:
                    elastic.post_ads(es.handle, idx, ad_list, metadata=metadata)
                logging.debug(
                    "...posting %d ads from %s (process_startd)",
                    len(ad_list),
                    startd_ad["Machine"],
                )
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1
            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion > last_completion:
                last_completion = job_completion
                since = {
                    "GlobalJobId": job_ad.get("GlobalJobId"),
                    "EnteredCurrentStatus": job_ad.get("EnteredCurrentStatus"),
                }

            if utils.time_remaining(start_time) < 0:
                message = f"History crawler on {startd_ad['Machine']} has been running for more than {utils.TIMEOUT_MINS:d} minutes; exiting."
                logging.error(message)
                utils.send_email_alert(
                    args.email_alerts, "spider history timeout warning", message
                )
                timed_out = True
                break

            if args.process_max_documents and count > args.process_max_documents:
                logging.warning(
                    "Aborting after %d documents (--process_max_documents option)"
                    % args.process_max_documents
                )
                break

    except RuntimeError:
        message = "Failed to query startd for job history: %s" % startd_ad["Machine"]
        exc = traceback.format_exc()
        message += f"\n{exc}"
        logging.error(message)
    except Exception as exn:
        message = f"Failure when processing startd history query on {startd_ad['Machine']}: {str(exn)}"
        exc = traceback.format_exc()
        message += f"\n{exc}"
        logging.exception(message)
        utils.send_email_alert(
            args.email_alerts, "spider startd history query error", message
        )

    # Post the remaining ads
    for idx, ad_list in list(buffered_ads.items()):
        if ad_list:
            logging.debug(
                "...posting remaining %d ads from %s (process_startd)",
                len(ad_list),
                startd_ad["Machine"],
            )
            if not args.read_only:
                if args.es_feed_startd_history:
                    elastic.post_ads(es.handle, idx, ad_list, metadata=metadata)

    total_time = (time.time() - my_start) / 60.0
    total_upload /= 60.0
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    logging.warning(
        "Startd %-25s history: response count: %5d; last completion %s; "
        "query time %.2f min; upload time %.2f min",
        startd_ad["Machine"],
        count,
        last_formatted,
        total_time - total_upload,
        total_upload,
    )

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((startd_ad["Machine"], since))

    return since
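# A hedged sketch of how the function above might be driven, not part of the
# source: locate every startd via the collector, seed a "match nothing yet"
# checkpoint, run process_startd per machine, and gather the updated
# checkpoints it pushes. Fan-out across processes (and the real checkpoint
# format) is assumed; the loop here is sequential for clarity.
import queue
import time

import htcondor


def crawl_startd_history(args):
    start_time = time.time()
    # process_startd only calls .put() on this, so a plain queue works here;
    # a multi-process crawler would presumably use multiprocessing.Queue.
    checkpoint_queue = queue.Queue()

    coll = htcondor.Collector()
    startd_ads = coll.locateAll(htcondor.DaemonTypes.Startd)

    # Hypothetical initial checkpoint for machines that were never crawled
    default_since = {"GlobalJobId": "unknown", "EnteredCurrentStatus": 0}

    for startd_ad in startd_ads:
        process_startd(start_time, dict(default_since), checkpoint_queue, startd_ad, args)

    # Keep the newest checkpoint pushed for each machine
    checkpoints = {}
    while True:
        try:
            machine, since = checkpoint_queue.get_nowait()
        except queue.Empty:
            break
        checkpoints[machine] = since
    return checkpoints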
def process_startd(start_time, since, checkpoint_queue, startd_ad, args, metadata=None):
    """
    Given a startd, process its entire set of history since last checkpoint.
    """
    my_start = time.time()

    metadata = metadata or {}
    metadata["condor_history_source"] = "startd"
    metadata["condor_history_runtime"] = int(my_start)
    metadata["condor_history_host_version"] = startd_ad.get("CondorVersion", "UNKNOWN")
    metadata["condor_history_host_platform"] = startd_ad.get("CondorPlatform", "UNKNOWN")
    metadata["condor_history_host_machine"] = startd_ad.get("Machine", "UNKNOWN")
    metadata["condor_history_host_name"] = startd_ad.get("Name", "UNKNOWN")

    last_completion = since["EnteredCurrentStatus"]
    since_str = f"""(GlobalJobId == "{since['GlobalJobId']}") && (EnteredCurrentStatus == {since['EnteredCurrentStatus']})"""

    max_ads = args.startd_history_max_ads  # number of history entries to read
    if max_ads > 10000:
        logging.debug(
            "Note that the maximum number of history ads returned is also limited by the remote daemon's config (HISTORY_HELPER_MAX_HISTORY)."
        )
        logging.info(
            "Note that setting startd_history_max_ads too high can cause condor_adstash to break!"
        )

    startd = htcondor.Startd(startd_ad)
    logging.info(f"Querying {startd_ad['Machine']} for history since: {since_str}")
    buffered_ads = {}
    count = 0
    total_upload = 0
    timed_out = False
    if not args.read_only:
        es = elastic.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = startd.history(
                requirements="true",
                projection=[],
                match=max_ads,  # default=10000
                since=since_str,
            )
        else:
            history_iter = []

        for job_ad in history_iter:
            try:
                dict_ad = convert.to_json(job_ad, return_dict=True)
            except Exception as e:
                message = f"Failure when converting document on {startd_ad['Machine']} history: {e}"
                exc = traceback.format_exc()
                message += f"\n{exc}"
                logging.warning(message)
                continue

            idx = elastic.get_index(args.es_index_name)
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((convert.unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only:
                    elastic.post_ads(es.handle, idx, ad_list, metadata=metadata)
                logging.debug(
                    f"Posting {len(ad_list)} ads from {startd_ad['Machine']} (process_startd)"
                )
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1
            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion > last_completion:
                last_completion = job_completion
                since = {
                    "GlobalJobId": job_ad.get("GlobalJobId"),
                    "EnteredCurrentStatus": job_ad.get("EnteredCurrentStatus"),
                }

            if utils.time_remaining(my_start, args.startd_history_timeout) <= 0:
                message = f"History crawler on {startd_ad['Machine']} has been running for more than {args.startd_history_timeout} seconds; pushing last ads and exiting."
                logging.error(message)
                timed_out = True
                break

    except RuntimeError:
        message = f"Failed to query startd {startd_ad['Machine']} for job history"
        logging.exception(message)
        return since
    except Exception:
        message = f"Failure when processing startd history query on {startd_ad['Machine']}"
        logging.exception(message)
        return since

    # Post the remaining ads
    for idx, ad_list in list(buffered_ads.items()):
        if ad_list:
            logging.debug(
                f"Posting remaining {len(ad_list)} ads from {startd_ad['Machine']} (process_startd)"
            )
            if not args.read_only:
                elastic.post_ads(es.handle, idx, ad_list, metadata=metadata)

    total_time = (time.time() - my_start) / 60.0
    total_upload /= 60.0
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    logging.info(
        f"Startd {startd_ad['Machine']} history: response count: {count}; last job {last_formatted}; query time {total_time - total_upload:.2f} min; upload time {total_upload:.2f} min"
    )
    if count >= max_ads:
        logging.warning(
            f"Max ads ({max_ads}) was reached for {startd_ad['Machine']}, some history may be missing!"
        )

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((startd_ad["Machine"], since))

    return since
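# Both variants above only push a (Machine, since) checkpoint after a clean,
# un-timed-out pass, so a consumer can safely persist it for the next crawl.
# A minimal sketch of such a consumer, assuming a JSON checkpoint file; the
# file name and helper function are hypothetical, not taken from the source.
import json
import os
import queue


def save_checkpoints(checkpoint_queue, path="startd_checkpoint.json"):
    # Start from whatever the previous run checkpointed, if anything
    checkpoints = {}
    if os.path.exists(path):
        with open(path) as fd:
            checkpoints = json.load(fd)

    # Fold in the (machine, since) tuples pushed by process_startd
    while True:
        try:
            machine, since = checkpoint_queue.get_nowait()
        except queue.Empty:
            break
        checkpoints[machine] = since

    # Write to a temp file and rename so a crash never truncates the checkpoint
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as fd:
        json.dump(checkpoints, fd, indent=2)
    os.rename(tmp_path, path)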