def necromancer(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param worker_number: The number of the worker (thread).
    :param total_workers: The total number of workers (threads).
    :param chunk_size: The size of the chunk to process.
    :param once: Run only once.
    """
    sleep_time = 60
    while not graceful_stop.is_set():
        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=chunk_size, worker_number=worker_number, total_workers=total_workers)
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info('Thread [%i/%i] : Working on %s:%s on %s' % (worker_number, total_workers, scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                # Declare the file lost if it has no replicas, or if its only replica is on the bad RSE
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info('Thread [%i/%i] : File %s:%s has no other replicas, it will be marked as lost' % (worker_number, total_workers, scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException, e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
                else:
                    logging.info('Thread [%i/%i] : File %s:%s can be recovered. Available sources : %s' % (worker_number, total_workers, scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException, e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
            logging.info('Thread [%i/%i] : It took %s seconds to process %s replicas' % (worker_number, total_workers, str(time.time() - stime), str(len(replicas))))

def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """
    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():
        hb = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])
        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=bulk, thread=hb['assign_thread'], total_threads=hb['nr_threads'])
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                # Declare the file lost if it has no replicas, or if its only replica is on the bad RSE
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info(prepend_str + 'File %s:%s has no other replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException, error:
                        logging.info(prepend_str + '%s' % (str(error)))
                else:
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s' % (scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException, error:
                        logging.info(prepend_str + '%s' % (str(error)))
            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))

def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """
    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        stime = time.time()
        replicas = []
        try:
            replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))
                list_replicas = get_replicas_state(scope=scope, name=name)
                # Declare the file lost if no replica is in AVAILABLE or TEMPORARY_UNAVAILABLE state
                if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                    logging.info(prepend_str + 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))
                else:
                    rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                    unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))
            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logging.info(prepend_str + 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=1000000, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logging.debug(prepend_str + 'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logging.debug(prepend_str + ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logging.info(prepend_str + 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()

            tottime = time.time() - stime
            if len(replicas) == bulk:
                logging.info(prepend_str + 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
            elif tottime < sleep_time:
                logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                time.sleep(sleep_time - tottime)
                continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')

def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        stime = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')
        logger(logging.DEBUG, 'Starting new cycle')

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get('necromancer', 'max_evaluator_backlog_count')
            max_evaluator_backlog_count = int(max_evaluator_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get('necromancer', 'max_evaluator_backlog_duration')
            max_evaluator_backlog_duration = int(max_evaluator_backlog_duration)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            evaluator_backlog_count, evaluator_backlog_duration = get_evaluation_backlog(expiration_time=sleep_time)
            if max_evaluator_backlog_count and \
               evaluator_backlog_count and \
               max_evaluator_backlog_duration and \
               evaluator_backlog_duration and \
               evaluator_backlog_count > max_evaluator_backlog_count and \
               evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and evaluator_backlog_count and evaluator_backlog_count > max_evaluator_backlog_count:
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and evaluator_backlog_duration and evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue

        # Check how many bad replicas are queued
        try:
            max_bad_replicas_backlog_count = config_get('necromancer', 'max_bad_replicas_backlog_count')
            max_bad_replicas_backlog_count = int(max_bad_replicas_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_bad_replicas_backlog_count = None
        bad_replicas_backlog = REGION.get('bad_replicas_backlog')
        if bad_replicas_backlog is NO_VALUE:
            bad_replicas_backlog = get_bad_replicas_backlog()
            REGION.set('bad_replicas_backlog', bad_replicas_backlog)
        tot_bad_files = sum([bad_replicas_backlog[key] for key in bad_replicas_backlog])
        list_of_rses = list()
        # If too many replicas, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1:
            logger(logging.INFO, 'Backlog of bad replicas too big. Apply some sharing between different RSEs')
            rses = list()
            cnt = 0
            for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=True):
                rses.append({'id': key})
                cnt += bad_replicas_backlog[key]
                if cnt >= bulk:
                    list_of_rses.append(rses)
                    rses = list()
                    cnt = 0
        else:
            list_of_rses.append(None)

        tot_processed = 0
        if tot_bad_files == 0:
            logger(logging.INFO, 'No bad replicas to process.')
        else:
            ttime = time.time()
            replicas = []
            try:
                for rses in list_of_rses:
                    replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses)
                    for replica in replicas:
                        scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                        logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse))
                        list_replicas = get_replicas_state(scope=scope, name=name)
                        if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                            logger(logging.INFO, 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                            try:
                                update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                                monitor.record_counter(name='necromancer.badfiles.lostfile')
                            except (DatabaseException, DatabaseError) as error:
                                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                                else:
                                    logger(logging.ERROR, str(error))
                        else:
                            rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                            unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                            logger(logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                            try:
                                update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                                monitor.record_counter(name='necromancer.badfiles.recovering')
                            except (DatabaseException, DatabaseError) as error:
                                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                                else:
                                    logger(logging.ERROR, str(error))
                    tot_processed += len(replicas)
                    logger(logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - ttime), str(len(replicas))))
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        elif tot_processed == 0 or tot_bad_files == 0:
            daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
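
The RSE-sharding step used in the version above (and kept in the next one) is the least obvious piece of the loop: RSE ids are grouped into shards whose bad-file counts add up to at least `bulk`, so that `list_bad_replicas` can be called per shard instead of once globally. The following self-contained sketch isolates just that logic; the backlog dictionary is made-up sample data standing in for the result of `get_bad_replicas_backlog()`, and the sort order follows the version above (largest backlog first; the later version sorts the other way).

# Illustration only: sample data is invented, not taken from a real backlog.
def shard_backlog(bad_replicas_backlog, bulk):
    """Group RSE ids into shards holding at least `bulk` bad files each.

    Mirrors the inline loop above: RSEs are taken in decreasing backlog order and
    accumulated until the running count reaches `bulk`; any trailing RSEs that do
    not reach `bulk` are not included in this pass, as in the original loop.
    """
    list_of_rses = []
    rses = []
    cnt = 0
    for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=True):
        rses.append({'id': key})
        cnt += bad_replicas_backlog[key]
        if cnt >= bulk:
            list_of_rses.append(rses)
            rses = []
            cnt = 0
    return list_of_rses


if __name__ == '__main__':
    sample_backlog = {'rse_a': 120, 'rse_b': 60, 'rse_c': 50}  # hypothetical counts
    print(shard_backlog(sample_backlog, bulk=100))
    # [[{'id': 'rse_a'}], [{'id': 'rse_b'}, {'id': 'rse_c'}]]
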
def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get('necromancer', 'max_evaluator_backlog_count')
            max_evaluator_backlog_count = int(max_evaluator_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get('necromancer', 'max_evaluator_backlog_duration')
            max_evaluator_backlog_duration = int(max_evaluator_backlog_duration)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog(expiration_time=sleep_time)
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[0] and backlog[0] > max_evaluator_backlog_count:
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[1] and backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue

        # Check how many bad replicas are queued
        try:
            max_bad_replicas_backlog_count = config_get('necromancer', 'max_bad_replicas_backlog_count')
            max_bad_replicas_backlog_count = int(max_bad_replicas_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_bad_replicas_backlog_count = None
        bad_replicas_backlog = get_bad_replicas_backlog()
        tot_bad_files = sum([bad_replicas_backlog[key] for key in bad_replicas_backlog])
        list_of_rses = list()
        # If too many replicas, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1:
            logger(logging.INFO, 'Backlog of bad replicas too big. Apply some sharing between different RSEs')
            rses = list()
            cnt = 0
            for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=False):
                rses.append({'id': key})
                cnt += bad_replicas_backlog[key]
                if cnt >= bulk:
                    list_of_rses.append(rses)
                    rses = list()
                    cnt = 0
        else:
            list_of_rses.append(None)

        stime = time.time()
        replicas = []
        try:
            for rses in list_of_rses:
                replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses)
                for replica in replicas:
                    scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                    logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse))
                    list_replicas = get_replicas_state(scope=scope, name=name)
                    if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                        logger(logging.INFO, 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                        try:
                            update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                            monitor.record_counter(name='necromancer.badfiles.lostfile')
                        except DatabaseException as error:
                            logger(logging.WARNING, str(error))
                    else:
                        rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                        unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                        logger(logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                        try:
                            update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                            monitor.record_counter(name='necromancer.badfiles.recovering')
                        except DatabaseException as error:
                            logger(logging.WARNING, str(error))
                logger(logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logger(logging.INFO, 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=1000000, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logger(logging.DEBUG, 'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logger(logging.DEBUG, ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logger(logging.INFO, 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()

            if len(replicas) == bulk:
                logger(logging.INFO, 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
            else:
                daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
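
The worker above is a loop meant to be driven by a daemon entry point. The following minimal sketch shows how such an entry point could spawn worker threads and trigger a graceful shutdown; the run/stop names and their defaults are assumptions for illustration only, not necessarily the module's actual interface, and GRACEFUL_STOP refers to the module-level event the worker loop already checks.

# Illustration only: hypothetical entry point around the necromancer() worker above.
import threading


def stop(signum=None, frame=None):
    """Signal handler sketch: ask every worker loop to exit at its next GRACEFUL_STOP check."""
    GRACEFUL_STOP.set()


def run(threads=1, bulk=100, once=False, sleep_time=60):
    """Spawn one necromancer worker per requested thread and wait for them to finish."""
    workers = [threading.Thread(target=necromancer,
                                kwargs={'thread': i, 'bulk': bulk,
                                        'once': once, 'sleep_time': sleep_time})
               for i in range(threads)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()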