Exemplo n.º 1
0
def rule_injector(once=False, sleep_time=60):
    """
    Main loop to check for asynchronous creation of replication rules
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-inectors have the correct worker number on the next try
    executable = 'judge-injector'
    heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)
    prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)
            prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_injected_rules(total_workers=heartbeat['nr_threads'],
                                       worker_number=heartbeat['assign_thread'],
                                       limit=100,
                                       blocked_rules=[key for key in paused_rules])
            logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            if not rules and not once:
                logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules)))
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    logger(logging.INFO, 'Injecting rule %s' % rule_id)
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        inject_rule(rule_id=rule_id, logger=logger)
                        logger(logging.DEBUG, 'injection of %s took %f' % (rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            record_counter('rule.judge.exceptions.LocksDetected')
                            logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logger(logging.ERROR, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except (RSEWriteBlocked) as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logger(logging.WARNING, 'RSEWriteBlocked for rule %s' % rule_id)
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logger(logging.WARNING, 'ReplicationRuleCreationTemporaryFailed for rule %s' % rule_id)
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RuleNotFound:
                        pass
                    except InsufficientAccountLimit:
                        # A rule with InsufficientAccountLimit on injection hangs there potentially forever
                        # It should be marked as SUSPENDED
                        logger(logging.INFO, 'Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % rule_id)
                        update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'})

        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
Exemplo n.º 2
0
def rule_injector(once=False):
    """
    Main loop to check for asynchronous creation of replication rules
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-inectors have the correct worker number on the next try
    live(executable='rucio-judge-injector',
         hostname=hostname,
         pid=pid,
         thread=current_thread,
         older_than=2 * 60 * 60)
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable='rucio-judge-injector',
                             hostname=hostname,
                             pid=pid,
                             thread=current_thread,
                             older_than=2 * 60 * 60)

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_injected_rules(
                total_workers=heartbeat['nr_threads'] - 1,
                worker_number=heartbeat['assign_thread'],
                limit=100,
                blacklisted_rules=[key for key in paused_rules])
            logging.debug(
                'rule_injector[%s/%s] index query time %f fetch size is %d' %
                (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1,
                 time.time() - start, len(rules)))

            if not rules and not once:
                logging.debug(
                    'rule_injector[%s/%s] did not get any work (paused_rules=%s)'
                    % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1,
                       str(len(paused_rules))))
                graceful_stop.wait(60)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    logging.info('rule_injector[%s/%s]: Injecting rule %s' %
                                 (heartbeat['assign_thread'],
                                  heartbeat['nr_threads'] - 1, rule_id))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        inject_rule(rule_id=rule_id)
                        logging.debug(
                            'rule_injector[%s/%s]: injection of %s took %f' %
                            (heartbeat['assign_thread'],
                             heartbeat['nr_threads'] - 1, rule_id,
                             time.time() - start))
                    except (DatabaseException, DatabaseError), e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow(
                            ) + timedelta(seconds=randint(60, 600))
                            record_counter(
                                'rule.judge.exceptions.LocksDetected')
                            logging.warning(
                                'rule_injector[%s/%s]: Locks detected for %s' %
                                (heartbeat['assign_thread'],
                                 heartbeat['nr_threads'] - 1, rule_id))
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' %
                                           e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' %
                                           e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' %
                                           e.__class__.__name__)
                    except RSEBlacklisted, e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(
                            seconds=randint(60, 600))
                        logging.warning(
                            'rule_injector[%s/%s]: RSEBlacklisted for rule %s'
                            % (heartbeat['assign_thread'],
                               heartbeat['nr_threads'] - 1, rule_id))
                        record_counter('rule.judge.exceptions.%s' %
                                       e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed, e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(
                            seconds=randint(60, 600))
                        logging.warning(
                            'rule_injector[%s/%s]: ReplicationRuleCreationTemporaryFailed for rule %s'
                            % (heartbeat['assign_thread'],
                               heartbeat['nr_threads'] - 1, rule_id))
                        record_counter('rule.judge.exceptions.%s' %
                                       e.__class__.__name__)
Exemplo n.º 3
0
def run_once(paused_rules, heartbeat_handler, **_kwargs):
    worker_number, total_workers, logger = heartbeat_handler.live()

    try:
        start = time.time()

        # Refresh paused rules
        iter_paused_rules = deepcopy(paused_rules)
        for key in iter_paused_rules:
            if datetime.utcnow() > paused_rules[key]:
                del paused_rules[key]

        rules = get_injected_rules(total_workers=total_workers,
                                   worker_number=worker_number,
                                   limit=100,
                                   blocked_rules=[key for key in paused_rules])
        logger(
            logging.DEBUG, 'index query time %f fetch size is %d' %
            (time.time() - start, len(rules)))

        if not rules:
            logger(
                logging.DEBUG, 'did not get any work (paused_rules=%s)' %
                str(len(paused_rules)))
            return

        for rule in rules:
            _, _, logger = heartbeat_handler.live()
            rule_id = rule[0]
            logger(logging.INFO, 'Injecting rule %s' % rule_id)
            if graceful_stop.is_set():
                break
            try:
                start = time.time()
                inject_rule(rule_id=rule_id, logger=logger)
                logger(
                    logging.DEBUG,
                    'injection of %s took %f' % (rule_id, time.time() - start))
            except (DatabaseException, DatabaseError) as e:
                if match('.*ORA-00054.*', str(e.args[0])):
                    paused_rules[rule_id] = datetime.utcnow() + timedelta(
                        seconds=randint(60, 600))
                    record_counter('rule.judge.exceptions.{exception}',
                                   labels={'exception': 'LocksDetected'})
                    logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                elif match('.*QueuePool.*', str(e.args[0])):
                    logger(logging.WARNING, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}',
                                   labels={'exception': e.__class__.__name__})
                elif match('.*ORA-03135.*', str(e.args[0])):
                    logger(logging.WARNING, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}',
                                   labels={'exception': e.__class__.__name__})
                else:
                    logger(logging.ERROR, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}',
                                   labels={'exception': e.__class__.__name__})
            except (RSEWriteBlocked) as e:
                paused_rules[rule_id] = datetime.utcnow() + timedelta(
                    seconds=randint(60, 600))
                logger(logging.WARNING,
                       'RSEWriteBlocked for rule %s' % rule_id)
                record_counter('rule.judge.exceptions.{exception}',
                               labels={'exception': e.__class__.__name__})
            except ReplicationRuleCreationTemporaryFailed as e:
                paused_rules[rule_id] = datetime.utcnow() + timedelta(
                    seconds=randint(60, 600))
                logger(
                    logging.WARNING,
                    'ReplicationRuleCreationTemporaryFailed for rule %s' %
                    rule_id)
                record_counter('rule.judge.exceptions.{exception}',
                               labels={'exception': e.__class__.__name__})
            except RuleNotFound:
                pass
            except InsufficientAccountLimit:
                # A rule with InsufficientAccountLimit on injection hangs there potentially forever
                # It should be marked as SUSPENDED
                logger(
                    logging.INFO,
                    'Marking rule %s as SUSPENDED due to InsufficientAccountLimit'
                    % rule_id)
                update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'})

    except (DatabaseException, DatabaseError) as e:
        if match('.*QueuePool.*', str(e.args[0])):
            logger(logging.WARNING, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}',
                           labels={'exception': e.__class__.__name__})
        elif match('.*ORA-03135.*', str(e.args[0])):
            logger(logging.WARNING, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}',
                           labels={'exception': e.__class__.__name__})
        else:
            logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}',
                           labels={'exception': e.__class__.__name__})
    except Exception as e:
        logger(logging.CRITICAL, 'Exception', exc_info=True)
        record_counter('rule.judge.exceptions.{exception}',
                       labels={'exception': e.__class__.__name__})