Example #1
class Ha:

    def __init__(self, patroni):
        self.patroni = patroni
        self.state_handler = patroni.postgresql
        self.dcs = patroni.dcs
        self.cluster = None
        self.old_cluster = None
        self._async_executor = AsyncExecutor()

    def load_cluster_from_dcs(self):
        cluster = self.dcs.get_cluster()

        # We want to keep the state of cluster when it was healthy
        if not cluster.is_unlocked() or not self.old_cluster:
            self.old_cluster = cluster
        self.cluster = cluster

    def acquire_lock(self):
        return self.dcs.attempt_to_acquire_leader()

    def update_lock(self):
        ret = self.dcs.update_leader()
        if ret:
            try:
                self.dcs.write_leader_optime(self.state_handler.last_operation())
            except:
                pass
        return ret

    def has_lock(self):
        lock_owner = self.cluster.leader and self.cluster.leader.name
        logger.info('Lock owner: %s; I am %s', lock_owner, self.state_handler.name)
        return lock_owner == self.state_handler.name

    def touch_member(self):
        data = {
            'conn_url': self.state_handler.connection_string,
            'api_url': self.patroni.api.connection_string,
            'state': self.state_handler.state,
            'role': self.state_handler.role,
            'tags': self.patroni.tags
        }
        if data['state'] in ['running', 'restarting', 'starting']:
            try:
                data['xlog_location'] = self.state_handler.xlog_position()
            except:
                pass
        self.dcs.touch_member(json.dumps(data, separators=(',', ':')))

    def copy_backup_from_leader(self, leader):
        if self.state_handler.bootstrap(leader):
            logger.info('bootstrapped from leader')
        else:
            self.state_handler.stop('immediate')
            self.state_handler.remove_data_directory()
            logger.error('failed to bootstrap from leader')

    def bootstrap(self):
        if not self.cluster.is_unlocked():  # cluster already has leader
            self._async_executor.schedule('bootstrap from leader')
            self._async_executor.run_async(self.copy_backup_from_leader, args=(self.cluster.leader, ))
            return 'trying to bootstrap from leader'
        elif not self.cluster.initialize and not self.patroni.nofailover:  # no initialize key
            if self.dcs.initialize(create_new=True):  # race for initialization
                try:
                    self.state_handler.bootstrap()
                    self.dcs.initialize(create_new=False, sysid=self.state_handler.sysid)
                except:  # initdb or start failed
                    # remove initialization key and give a chance to other members
                    logger.info("removing initialize key after failed attempt to initialize the cluster")
                    self.dcs.cancel_initialization()
                    self.state_handler.stop('immediate')
                    self.state_handler.move_data_directory()
                    raise
                self.dcs.take_leader()
                return 'initialized a new cluster'
            else:
                return 'failed to acquire initialize lock'
        else:
            return 'waiting for leader to bootstrap'

    def recover(self):
        has_lock = self.has_lock()

        # try to see if we are the former master that crashed. If so - we likely need to run pg_rewind
        # in order to join the former standby being promoted.
        pg_controldata = self.state_handler.controldata()
        if not has_lock and pg_controldata and\
                pg_controldata.get('Database cluster state', '') == 'in production':  # crashed master
            self.state_handler.require_rewind()

        # XXX: follow the leader calls stop, which might take quite some time.
        # perhaps we should run sync asynchronously
        # (we still need the exit code from follow_the_leader)
        ret = self.state_handler.follow_the_leader(None if has_lock else self.cluster.leader, recovery=True)
        if not ret:
            if not has_lock:
                return 'failed to start postgres'
            self.dcs.delete_leader()
            self.dcs.reset_cluster()
            return 'removed leader key after trying and failing to start postgres'
        if not has_lock:
            return 'started as a secondary'
        logger.info('started as readonly because i had the session lock')
        self.load_cluster_from_dcs()

    def follow_the_leader(self, demote_reason, follow_reason, refresh=True):
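        # refresh the cluster view from DCS only when `refresh` is truthy (short-circuit "and")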
        refresh and self.load_cluster_from_dcs()
        ret = demote_reason if self.state_handler.is_leader() else follow_reason
        leader = self.cluster.leader
        leader = None if (leader and leader.name) == self.state_handler.name else leader
        if not self.state_handler.check_recovery_conf(leader):
            self._async_executor.schedule('changing primary_conninfo and restarting')
            self._async_executor.run_async(self.state_handler.follow_the_leader, (leader, ))
        return ret

    def enforce_master_role(self, message, promote_message):
        if self.state_handler.is_leader() or self.state_handler.role == 'master':
            return message
        else:
            self.state_handler.promote()
            self.touch_member()
            return promote_message

    @staticmethod
    def fetch_node_status(member):
        """This function perform http get request on member.api_url and fetches its status
        :returns: tuple(`member`, reachable, in_recovery, xlog_location)

        reachable - `!False` if the node is not reachable or is not responding with correct JSON
        in_recovery - `!True` if pg_is_in_recovery() == true
        xlog_location - value of `replayed_location` or `location` from JSON, dependin on its role.
        tags - dictionary with values of different tags (i.e. nofailover)
        """

        try:
            response = requests.get(member.api_url, timeout=2, verify=False)
            logger.info('Got response from %s %s: %s', member.name, member.api_url, response.content)
            json = response.json()
            is_master = json['role'] == 'master'
            xlog_location = json['xlog']['location' if is_master else 'replayed_location']
            tags = json.get('tags', dict())
            return (member, True, not is_master, xlog_location, tags)
        except:
            logging.exception('request failed: GET %s', member.api_url)
        return (member, False, None, 0, {})

    def fetch_nodes_statuses(self, members):
        pool = ThreadPool(len(members))
        results = pool.map(self.fetch_node_status, members)  # Run API calls on members in parallel
        pool.close()
        pool.join()
        return results

    def _is_healthiest_node(self, members, check_replication_lag=True):
        """This method tries to determine whether I am healthy enough to became a new leader candidate or not."""

        if self.state_handler.is_leader():
            return True

        if self.patroni.nofailover is True:
            return False

        if check_replication_lag and not self.state_handler.check_replication_lag(self.cluster.last_leader_operation):
            return False  # Too far behind last reported xlog location on master

        # Prepare list of nodes to run check against
        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]

        if members:
            my_xlog_location = self.state_handler.xlog_position()
            for member, reachable, in_recovery, xlog_location, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):  # If the node is unreachable it's not healthy
                    if not in_recovery:
                        logger.warning('Master (%s) is still alive', member.name)
                        return False
                    if my_xlog_location < xlog_location:
                        return False
        return True

    def is_failover_possible(self, members):
        ret = False
        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]
        if members:
            for member, reachable, in_recovery, xlog_location, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):
                    ret = True  # TODO: check xlog_location
                elif not reachable:
                    logger.info('Member %s is not reachable', member.name)
                elif tags.get('nofailover', False):
                    logger.info('Member %s is not allowed to promote', member.name)
        else:
            logger.warning('manual failover: members list is empty')
        return ret

    def manual_failover_process_no_leader(self):
        failover = self.cluster.failover
        if failover.member:  # manual failover to specific member
            if failover.member == self.state_handler.name:  # manual failover to me
                return True

            # find specific node and check that it is healthy
            members = [m for m in self.cluster.members if m.name == failover.member]
            if members:
                member, reachable, in_recovery, xlog_location, tags = self.fetch_node_status(members[0])
                if reachable and not tags.get('nofailover', False):  # node is healthy
                    logger.info('manual failover: to %s, i am %s', member.name, self.state_handler.name)
                    return False
                # we wanted to failover to specific member but it is not healthy
                if not reachable:
                    logger.warning('manual failover: member %s is unhealthy', member.name)
                elif tags.get('nofailover', False):
                    logger.warning('manual failover: member %s is not allowed to promote', member.name)

            # at this point we should consider all members as candidates for failover
            # i.e. we assume that failover.member is None

        # try to pick some other members to failover and check that they are healthy
        if failover.leader:
            if self.state_handler.name == failover.leader:  # I was the leader
                # exclude me and desired member which is unhealthy (failover.member can be None)
                members = [m for m in self.cluster.members if m.name not in (failover.member, failover.leader)]
                if self.is_failover_possible(members):  # check that there are healthy members
                    return False
                else:  # I was the leader and it looks like currently I am the only healthy member
                    return True

            # at this point we assume that our node is a candidate for a failover among all nodes except former leader

        # exclude former leader from the list (failover.leader can be None)
        members = [m for m in self.cluster.members if m.name != failover.leader]
        return self._is_healthiest_node(members, check_replication_lag=False)

    def is_healthiest_node(self):

        if self.state_handler.is_leader():  # leader is always the healthiest
            return True

        if self.patroni.nofailover:  # nofailover tag makes node always unhealthy
            return False

        if self.cluster.failover:
            return self.manual_failover_process_no_leader()

        # run usual health check
        members = {m.name: m for m in self.cluster.members + self.old_cluster.members}
        return self._is_healthiest_node(members.values())

    def demote(self, delete_leader=True):
        if delete_leader:
            self.state_handler.stop()
            self.dcs.delete_leader()
            self.touch_member()
            self.dcs.reset_cluster()
        self.state_handler.follow_the_leader(None)

    def process_manual_failover_from_leader(self):
        failover = self.cluster.failover
        if not failover.leader or failover.leader == self.state_handler.name:
            if not failover.member or failover.member != self.state_handler.name:
                members = [m for m in self.cluster.members if not failover.member or m.name == failover.member]
                if self.is_failover_possible(members):  # check that there are healthy members
                    self._async_executor.schedule('manual failover: demote')
                    self._async_executor.run_async(self.demote)
                    return 'manual failover: demoting myself'
                else:
                    logger.warning('manual failover: no healthy members found, failover is not possible')
            else:
                logger.warning('manual failover: I am already the leader, no need to failover')
        else:
            logger.warning('manual failover: leader name does not match: %s != %s',
                           self.cluster.failover.leader, self.state_handler.name)

        logger.info('Trying to clean up failover key')
        self.dcs.manual_failover('', '', self.cluster.failover.index)

    def process_unhealthy_cluster(self):
        if self.is_healthiest_node():
            if self.acquire_lock():
                if self.cluster.failover:
                    logger.info('Cleaning up failover key after acquiring leader lock...')
                    self.dcs.manual_failover('', '')
                self.dcs.get_cluster()
                return self.enforce_master_role('acquired session lock as a leader',
                                                'promoted self to leader by acquiring session lock')
            else:
                return self.follow_the_leader('demoted self after trying and failing to obtain lock',
                                              'following new leader after trying and failing to obtain lock')
        else:
            if self.patroni.nofailover:
                return self.follow_the_leader('demoting self because I am not allowed to become master',
                                              'following a different leader because I am not allowed to promote')
            return self.follow_the_leader('demoting self because i am not the healthiest node',
                                          'following a different leader because i am not the healthiest node')

    def process_healthy_cluster(self):
        if self.has_lock():
            if self.cluster.failover:
                msg = self.process_manual_failover_from_leader()
                if msg is not None:
                    return msg

            if self.update_lock():
                return self.enforce_master_role('no action.  i am the leader with the lock',
                                                'promoted self to leader because i had the session lock')
            else:
                # Either there is no connection to DCS or someone else acquired the lock
                logger.error('failed to update leader lock')
                self.load_cluster_from_dcs()
        else:
            logger.info('does not have lock')
        return self.follow_the_leader('demoting self because i do not have the lock and i was a leader',
                                      'no action.  i am a secondary and i am following a leader', False)

    def schedule(self, action):
        with self._async_executor:
            return self._async_executor.schedule(action)

    def restart_scheduled(self):
        return self._async_executor.scheduled_action == 'restart'

    def schedule_reinitialize(self):
        return self.schedule('reinitialize')

    def reinitialize_scheduled(self):
        return self._async_executor.scheduled_action == 'reinitialize'

    def restart(self):
        with self._async_executor:
            prev = self._async_executor.schedule('restart', True)
            if prev is not None:
                return (False, prev + ' already in progress')
        if self._async_executor.run(self.state_handler.restart):
            return (True, 'restarted successfully')
        else:
            return (False, 'restart failed')

    def reinitialize(self, cluster):
        self.state_handler.stop('immediate')
        self.state_handler.remove_data_directory()
        self.copy_backup_from_leader(cluster.leader)

    def process_scheduled_action(self):
        if self.reinitialize_scheduled():
            if self.cluster.is_unlocked():
                logger.error('Cluster has no leader, can not reinitialize')
                self._async_executor.reset_scheduled_action()
            elif self.has_lock():
                logger.error('I am the leader, can not reinitialize')
                self._async_executor.reset_scheduled_action()
            else:
                self._async_executor.run_async(self.reinitialize, args=(self.cluster, ))
                return 'reinitialize started'

    def handle_long_action_in_progress(self):
        if self.has_lock():
            if self.update_lock():
                return 'updated leader lock during ' + self._async_executor.scheduled_action
            else:
                return 'failed to update leader lock during ' + self._async_executor.scheduled_action
        elif self.cluster.is_unlocked():
            return 'not healthy enough for leader race'
        else:
            return self._async_executor.scheduled_action + ' in progress'

    def sysid_valid(self, sysid):
        # sysid does tv_sec << 32, where tv_sec is the number of seconds since 1970,
        # so even 1 << 32 would have 10 digits.
        return str(sysid) and len(str(sysid)) >= 10 and str(sysid).isdigit()

    def _run_cycle(self):
        try:
            self.load_cluster_from_dcs()

            self.touch_member()

            # cluster has leader key but not initialize key
            if not self.cluster.is_unlocked() and not self.sysid_valid(self.cluster.initialize) and self.has_lock():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)

            if self._async_executor.busy:
                return self.handle_long_action_in_progress()

            # currently it can trigger only reinitialize
            msg = self.process_scheduled_action()
            if msg is not None:
                return msg

            # is data directory empty?
            if self.state_handler.data_directory_empty():
                return self.bootstrap()  # new node
            # "bootstrap", but data directory is not empty
            elif not self.sysid_valid(self.cluster.initialize) and self.cluster.is_unlocked():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)
            else:
                # check if we are allowed to join
                if self.sysid_valid(self.cluster.initialize) and self.cluster.initialize != self.state_handler.sysid:
                    logger.fatal("system ID mismatch, node {0} belongs to a different cluster".
                                 format(self.state_handler.name))
                    sys.exit(1)

            # try to start dead postgres
            if not self.state_handler.is_healthy():
                msg = self.recover()
                if msg is not None:
                    return msg

            try:
                if self.cluster.is_unlocked():
                    return self.process_unhealthy_cluster()
                else:
                    return self.process_healthy_cluster()
            finally:
                self.state_handler.sync_replication_slots(self.cluster)
        except DCSError:
            logger.error('Error communicating with DCS')
            if self.state_handler.is_running() and self.state_handler.is_leader():
                self.demote(delete_leader=False)
                return 'demoted self because DCS is not accessible and i was a leader'
        except (psycopg2.Error, PostgresConnectionException):
            logger.exception('Error communicating with Postgresql. Will try again later')

    def run_cycle(self):
        with self._async_executor:
            return self._run_cycle()
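
The class above exposes a single public entry point, run_cycle(), which performs one HA decision pass and returns a human-readable status string. A minimal, hypothetical driver loop is sketched below; Patroni's real daemon also handles signals and configuration reload, and run_ha_forever/nap_time are illustrative names, not part of the project:

import logging
import time

logging.basicConfig(level=logging.INFO)


def run_ha_forever(patroni, nap_time=10):
    # `patroni` is assumed to expose .postgresql, .dcs, .api, .tags and .nofailover
    ha = Ha(patroni)                  # Ha as defined in Example #1 above
    while True:
        try:
            # one decision cycle; the returned string describes what was done
            logging.info('HA cycle result: %s', ha.run_cycle())
        except Exception:
            logging.exception('unexpected error in HA cycle')
        time.sleep(nap_time)          # assumed pause between cycles
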
Example #2
class Ha(object):
    def __init__(self, patroni):
        self.patroni = patroni
        self.state_handler = patroni.postgresql
        self.dcs = patroni.dcs
        self.cluster = None
        self.old_cluster = None
        self.recovering = False
        self._async_executor = AsyncExecutor()

    def load_cluster_from_dcs(self):
        cluster = self.dcs.get_cluster()

        # We want to keep the state of cluster when it was healthy
        if not cluster.is_unlocked() or not self.old_cluster:
            self.old_cluster = cluster
        self.cluster = cluster

    def acquire_lock(self):
        return self.dcs.attempt_to_acquire_leader()

    def update_lock(self):
        ret = self.dcs.update_leader()
        if ret and not self._async_executor.busy:
            try:
                self.dcs.write_leader_optime(
                    self.state_handler.last_operation())
            except:
                pass
        return ret

    def has_lock(self):
        lock_owner = self.cluster.leader and self.cluster.leader.name
        logger.info('Lock owner: %s; I am %s', lock_owner,
                    self.state_handler.name)
        return lock_owner == self.state_handler.name

    def touch_member(self):
        data = {
            'conn_url': self.state_handler.connection_string,
            'api_url': self.patroni.api.connection_string,
            'state': self.state_handler.state,
            'role': self.state_handler.role
        }
        if self.patroni.tags:
            data['tags'] = self.patroni.tags
        if self.state_handler.pending_restart:
            data['pending_restart'] = True
        if not self._async_executor.busy and data['state'] in ['running', 'restarting', 'starting']:
            try:
                data['xlog_location'] = self.state_handler.xlog_position()
            except:
                pass
        self.dcs.touch_member(json.dumps(data, separators=(',', ':')))

    def clone(self, clone_member=None, msg='(without leader)'):
        if self.state_handler.clone(clone_member):
            logger.info('bootstrapped %s', msg)
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            self.state_handler.follow(node_to_follow, cluster.leader, True)
        else:
            logger.error('failed to bootstrap %s', msg)
            self.state_handler.remove_data_directory()

    def bootstrap(self):
        if not self.cluster.is_unlocked():  # cluster already has leader
            clone_member = self.cluster.get_clone_member()
            member_role = 'leader' if clone_member == self.cluster.leader else 'replica'
            msg = "from {0} '{1}'".format(member_role, clone_member.name)
            self._async_executor.schedule('bootstrap {0}'.format(msg))
            self._async_executor.run_async(self.clone,
                                           args=(clone_member, msg))
            return 'trying to bootstrap {0}'.format(msg)
        # no initialize key and node is allowed to be master and has 'bootstrap' section in a configuration file
        elif not (self.cluster.initialize or self.patroni.nofailover) and 'bootstrap' in self.patroni.config:
            if self.dcs.initialize(create_new=True):  # race for initialization
                try:
                    self.state_handler.bootstrap(
                        self.patroni.config['bootstrap'])
                    self.dcs.initialize(create_new=False,
                                        sysid=self.state_handler.sysid)
                except:  # initdb or start failed
                    # remove initialization key and give a chance to other members
                    logger.info(
                        "removing initialize key after failed attempt to initialize the cluster"
                    )
                    self.dcs.cancel_initialization()
                    self.state_handler.stop('immediate')
                    self.state_handler.move_data_directory()
                    raise
                self.dcs.set_config_value(
                    json.dumps(self.patroni.config.dynamic_configuration,
                               separators=(',', ':')))
                self.dcs.take_leader()
                self.load_cluster_from_dcs()
                return 'initialized a new cluster'
            else:
                return 'failed to acquire initialize lock'
        else:
            if self.state_handler.can_create_replica_without_replication_connection():
                self._async_executor.run_async(self.clone)
                return "trying to bootstrap (without leader)"
            return 'waiting for leader to bootstrap'

    def recover(self):
        self.recovering = True
        return self.follow(
            "starting as readonly because i had the session lock",
            "starting as a secondary", True, True)

    def _get_node_to_follow(self, cluster):
        # determine the node to follow. If replicatefrom tag is set,
        # try to follow the node mentioned there, otherwise, follow the leader.
        if not self.patroni.replicatefrom or self.patroni.replicatefrom == self.state_handler.name:
            node_to_follow = cluster.leader
        else:
            node_to_follow = cluster.get_member(self.patroni.replicatefrom)

        return node_to_follow if node_to_follow and node_to_follow.name != self.state_handler.name else None

    def follow(self,
               demote_reason,
               follow_reason,
               refresh=True,
               recovery=False):
        if refresh:
            self.load_cluster_from_dcs()

        if recovery:
            ret = demote_reason if self.has_lock() else follow_reason
        else:
            ret = demote_reason if self.state_handler.is_leader() else follow_reason

        node_to_follow = self._get_node_to_follow(self.cluster)

        if not self.state_handler.check_recovery_conf(node_to_follow) or recovery:
            self._async_executor.schedule('changing primary_conninfo and restarting')
            self._async_executor.run_async(self.state_handler.follow,
                                           (node_to_follow, self.cluster.leader, recovery))
        return ret

    def enforce_master_role(self, message, promote_message):
        if self.state_handler.is_leader() or self.state_handler.role == 'master':
            return message
        else:
            self.state_handler.promote()
            self.touch_member()
            return promote_message

    @staticmethod
    def fetch_node_status(member):
        """This function perform http get request on member.api_url and fetches its status
        :returns: tuple(`member`, reachable, in_recovery, xlog_location)

        reachable - `!False` if the node is not reachable or is not responding with correct JSON
        in_recovery - `!True` if pg_is_in_recovery() == true
        xlog_location - value of `replayed_location` or `location` from JSON, dependin on its role.
        tags - dictionary with values of different tags (i.e. nofailover)
        """

        try:
            response = requests.get(member.api_url, timeout=2, verify=False)
            logger.info('Got response from %s %s: %s', member.name,
                        member.api_url, response.content)
            json = response.json()
            is_master = json['role'] == 'master'
            xlog_location = json['xlog'][
                'location' if is_master else 'replayed_location']
            tags = json.get('tags', dict())
            return (member, True, not is_master, xlog_location, tags)
        except:
            logging.exception('request failed: GET %s', member.api_url)
        return (member, False, None, 0, {})

    def fetch_nodes_statuses(self, members):
        pool = ThreadPool(len(members))
        results = pool.map(self.fetch_node_status,
                           members)  # Run API calls on members in parallel
        pool.close()
        pool.join()
        return results

    def _is_healthiest_node(self, members, check_replication_lag=True):
        """This method tries to determine whether I am healthy enough to became a new leader candidate or not."""

        if self.state_handler.is_leader():
            return True

        if self.patroni.nofailover is True:
            return False

        if check_replication_lag and not self.state_handler.check_replication_lag(
                self.cluster.last_leader_operation):
            return False  # Too far behind last reported xlog location on master

        # Prepare list of nodes to run check against
        members = [
            m for m in members if m.name != self.state_handler.name
            and not m.nofailover and m.api_url
        ]

        if members:
            my_xlog_location = self.state_handler.xlog_position()
            for member, reachable, in_recovery, xlog_location, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):  # If the node is unreachable it's not healthy
                    if not in_recovery:
                        logger.warning('Master (%s) is still alive',
                                       member.name)
                        return False
                    if my_xlog_location < xlog_location:
                        return False
        return True

    def is_failover_possible(self, members):
        ret = False
        members = [
            m for m in members if m.name != self.state_handler.name
            and not m.nofailover and m.api_url
        ]
        if members:
            for member, reachable, _, _, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):
                    ret = True  # TODO: check xlog_location
                elif not reachable:
                    logger.info('Member %s is not reachable', member.name)
                elif tags.get('nofailover', False):
                    logger.info('Member %s is not allowed to promote',
                                member.name)
        else:
            logger.warning('manual failover: members list is empty')
        return ret

    def manual_failover_process_no_leader(self):
        failover = self.cluster.failover
        if failover.candidate:  # manual failover to specific member
            if failover.candidate == self.state_handler.name:  # manual failover to me
                return True

            # find specific node and check that it is healthy
            member = self.cluster.get_member(failover.candidate,
                                             fallback_to_leader=False)
            if member:
                member, reachable, _, _, tags = self.fetch_node_status(member)
                if reachable and not tags.get('nofailover', False):  # node is healthy
                    logger.info('manual failover: to %s, i am %s', member.name,
                                self.state_handler.name)
                    return False
                # we wanted to failover to specific member but it is not healthy
                if not reachable:
                    logger.warning('manual failover: member %s is unhealthy',
                                   member.name)
                elif tags.get('nofailover', False):
                    logger.warning(
                        'manual failover: member %s is not allowed to promote',
                        member.name)

            # at this point we should consider all members as candidates for failover
            # i.e. we assume that failover.candidate is None

        # try to pick some other members to failover and check that they are healthy
        if failover.leader:
            if self.state_handler.name == failover.leader:  # I was the leader
                # exclude me and desired member which is unhealthy (failover.candidate can be None)
                members = [
                    m for m in self.cluster.members
                    if m.name not in (failover.candidate, failover.leader)
                ]
                if self.is_failover_possible(members):  # check that there are healthy members
                    return False
                else:  # I was the leader and it looks like currently I am the only healthy member
                    return True

            # at this point we assume that our node is a candidate for a failover among all nodes except former leader

        # exclude former leader from the list (failover.leader can be None)
        members = [
            m for m in self.cluster.members if m.name != failover.leader
        ]
        return self._is_healthiest_node(members, check_replication_lag=False)

    def is_healthiest_node(self):

        if self.state_handler.is_leader():  # leader is always the healthiest
            return True

        if self.patroni.nofailover:  # nofailover tag makes node always unhealthy
            return False

        if self.cluster.failover:
            return self.manual_failover_process_no_leader()

        # run usual health check
        members = {
            m.name: m
            for m in self.cluster.members + self.old_cluster.members
        }
        return self._is_healthiest_node(members.values())

    def demote(self, delete_leader=True):
        if delete_leader:
            self.state_handler.stop()
            self.dcs.delete_leader()
            self.touch_member()
            self.dcs.reset_cluster()
            sleep(2)  # Give somebody time to promote
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            self.state_handler.follow(node_to_follow, cluster.leader, True)
        else:
            self.state_handler.follow(None, None)

    def process_manual_failover_from_leader(self):
        failover = self.cluster.failover

        if failover.scheduled_at:
            # If the failover is in the far future, we shouldn't do anything and just return.
            # If the failover is in the past, we consider the value to be stale and we remove
            # the value.
            # If the value is close to now, we initiate the failover
            now = datetime.datetime.now(pytz.utc)
            try:
                delta = (failover.scheduled_at - now).total_seconds()

                if delta > self.patroni.nap_time:
                    logging.info('Awaiting failover at %s (in %.0f seconds)',
                                 failover.scheduled_at.isoformat(), delta)
                    return
                elif delta < -int(self.patroni.nap_time * 1.5):
                    logger.warning(
                        'Found a stale failover value, cleaning up: %s',
                        failover.scheduled_at)
                    self.dcs.manual_failover('',
                                             '',
                                             index=self.cluster.failover.index)
                    return

                # The value is very close to now
                sleep(max(delta, 0))
                logger.info('Manual scheduled failover at {}'.format(
                    failover.scheduled_at.isoformat()))
            except TypeError:
                logger.warning('Incorrect value of scheduled_at: %s',
                               failover.scheduled_at)

        if not failover.leader or failover.leader == self.state_handler.name:
            if not failover.candidate or failover.candidate != self.state_handler.name:
                members = [
                    m for m in self.cluster.members
                    if not failover.candidate or m.name == failover.candidate
                ]
                if self.is_failover_possible(members):  # check that there are healthy members
                    self._async_executor.schedule('manual failover: demote')
                    self._async_executor.run_async(self.demote)
                    return 'manual failover: demoting myself'
                else:
                    logger.warning(
                        'manual failover: no healthy members found, failover is not possible'
                    )
            else:
                logger.warning(
                    'manual failover: I am already the leader, no need to failover'
                )
        else:
            logger.warning(
                'manual failover: leader name does not match: %s != %s',
                self.cluster.failover.leader, self.state_handler.name)

        logger.info('Trying to clean up failover key')
        self.dcs.manual_failover('', '', index=self.cluster.failover.index)

    def process_unhealthy_cluster(self):
        if self.is_healthiest_node():
            if self.acquire_lock():
                if self.cluster.failover:
                    logger.info(
                        'Cleaning up failover key after acquiring leader lock...'
                    )
                    self.dcs.manual_failover('', '')
                self.load_cluster_from_dcs()
                return self.enforce_master_role(
                    'acquired session lock as a leader',
                    'promoted self to leader by acquiring session lock')
            else:
                return self.follow(
                    'demoted self after trying and failing to obtain lock',
                    'following new leader after trying and failing to obtain lock'
                )
        else:
            if self.patroni.nofailover:
                return self.follow(
                    'demoting self because I am not allowed to become master',
                    'following a different leader because I am not allowed to promote'
                )
            return self.follow(
                'demoting self because i am not the healthiest node',
                'following a different leader because i am not the healthiest node'
            )

    def process_healthy_cluster(self):
        if self.has_lock():
            if self.cluster.failover:
                msg = self.process_manual_failover_from_leader()
                if msg is not None:
                    return msg

            if self.update_lock():
                return self.enforce_master_role(
                    'no action.  i am the leader with the lock',
                    'promoted self to leader because i had the session lock')
            else:
                # Either there is no connection to DCS or someone else acquired the lock
                logger.error('failed to update leader lock')
                self.load_cluster_from_dcs()
        else:
            logger.info('does not have lock')
        return self.follow(
            'demoting self because i do not have the lock and i was a leader',
            'no action.  i am a secondary and i am following a leader', False)

    def schedule(self, action):
        with self._async_executor:
            return self._async_executor.schedule(action)

    def restart_scheduled(self):
        return self._async_executor.scheduled_action == 'restart'

    def schedule_reinitialize(self):
        return self.schedule('reinitialize')

    def reinitialize_scheduled(self):
        return self._async_executor.scheduled_action == 'reinitialize'

    def restart(self):
        with self._async_executor:
            prev = self._async_executor.schedule('restart', True)
            if prev is not None:
                return (False, prev + ' already in progress')
        if self._async_executor.run(self.state_handler.restart):
            return (True, 'restarted successfully')
        else:
            return (False, 'restart failed')

    def reinitialize(self, cluster):
        self.state_handler.stop('immediate')
        self.state_handler.remove_data_directory()

        clone_member = cluster.get_clone_member()
        member_role = 'leader' if clone_member == cluster.leader else 'replica'
        self.clone(clone_member,
                   "from {0} '{1}'".format(member_role, clone_member.name))

    def process_scheduled_action(self):
        if self.reinitialize_scheduled():
            if self.cluster.is_unlocked():
                logger.error('Cluster has no leader, can not reinitialize')
                self._async_executor.reset_scheduled_action()
            elif self.has_lock():
                logger.error('I am the leader, can not reinitialize')
                self._async_executor.reset_scheduled_action()
            else:
                self._async_executor.run_async(self.reinitialize,
                                               args=(self.cluster, ))
                return 'reinitialize started'

    def handle_long_action_in_progress(self):
        if self.has_lock():
            if self.update_lock():
                return 'updated leader lock during ' + self._async_executor.scheduled_action
            else:
                return 'failed to update leader lock during ' + self._async_executor.scheduled_action
        elif self.cluster.is_unlocked():
            return 'not healthy enough for leader race'
        else:
            return self._async_executor.scheduled_action + ' in progress'

    @staticmethod
    def sysid_valid(sysid):
        # sysid does tv_sec << 32, where tv_sec is the number of seconds since 1970,
        # so even 1 << 32 would have 10 digits.
        return str(sysid) and len(str(sysid)) >= 10 and str(sysid).isdigit()

    def post_recover(self):
        if not self.state_handler.is_running():
            if self.has_lock():
                self.dcs.delete_leader()
                self.dcs.reset_cluster()
                return 'removed leader key after trying and failing to start postgres'
            return 'failed to start postgres'
        return None

    def _run_cycle(self):
        try:
            self.load_cluster_from_dcs()

            self.touch_member()

            # cluster has leader key but not initialize key
            if not (self.cluster.is_unlocked() or self.sysid_valid(self.cluster.initialize)) and self.has_lock():
                self.dcs.initialize(
                    create_new=(self.cluster.initialize is None),
                    sysid=self.state_handler.sysid)

            if not (self.cluster.is_unlocked() or self.cluster.config and self.cluster.config.data) and self.has_lock():
                self.dcs.set_config_value(
                    json.dumps(self.patroni.config.dynamic_configuration,
                               separators=(',', ':')))

            if self._async_executor.busy:
                return self.handle_long_action_in_progress()

            # we've got here, so any async action has finished. Check if we tried to recover and failed
            if self.recovering:
                self.recovering = False
                msg = self.post_recover()
                if msg is not None:
                    return msg

            # currently it can trigger only reinitialize
            msg = self.process_scheduled_action()
            if msg is not None:
                return msg

            # is data directory empty?
            if self.state_handler.data_directory_empty():
                return self.bootstrap()  # new node
            # "bootstrap", but data directory is not empty
            elif not self.sysid_valid(self.cluster.initialize) and self.cluster.is_unlocked():
                self.dcs.initialize(
                    create_new=(self.cluster.initialize is None),
                    sysid=self.state_handler.sysid)
            else:
                # check if we are allowed to join
                if self.sysid_valid(self.cluster.initialize) and self.cluster.initialize != self.state_handler.sysid:
                    logger.fatal(
                        "system ID mismatch, node %s belongs to a different cluster: %s != %s",
                        self.state_handler.name, self.cluster.initialize,
                        self.state_handler.sysid)
                    sys.exit(1)

            # try to start dead postgres
            if not self.state_handler.is_healthy():
                msg = self.recover()
                if msg is not None:
                    return msg

            try:
                if self.cluster.is_unlocked():
                    return self.process_unhealthy_cluster()
                else:
                    return self.process_healthy_cluster()
            finally:
                # we might not have a valid PostgreSQL connection here if another thread
                # stops PostgreSQL, therefore, we only reload replication slots if no
                # asynchronous processes are running (should be always the case for the master)
                if not self._async_executor.busy:
                    self.state_handler.sync_replication_slots(self.cluster)
        except DCSError:
            logger.error('Error communicating with DCS')
            if self.state_handler.is_running() and self.state_handler.is_leader():
                self.demote(delete_leader=False)
                return 'demoted self because DCS is not accessible and i was a leader'
        except (psycopg2.Error, PostgresConnectionException):
            logger.exception(
                'Error communicating with PostgreSQL. Will try again later')

    def run_cycle(self):
        with self._async_executor:
            return self._run_cycle()
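
Both examples delegate long-running work (bootstrap, restart, demote) to an AsyncExecutor and gate their behavior on its busy flag and scheduled_action name. The real class lives in Patroni's codebase; the sketch below only restates the contract these call sites assume and is not the actual implementation:

import threading


class AsyncExecutor(object):
    """Minimal, illustrative stand-in for the executor used by the Ha examples.

    schedule() reserves a named action and returns the previously scheduled one
    (if any), run_async() runs a callable on a background thread, and `busy`
    stays True while an action is scheduled. Not Patroni's real implementation.
    """

    def __init__(self):
        self._lock = threading.Lock()     # taken by `with self._async_executor:`
        self.scheduled_action = None

    def __enter__(self):
        self._lock.acquire()
        return self

    def __exit__(self, *exc):
        self._lock.release()

    @property
    def busy(self):
        return self.scheduled_action is not None

    def schedule(self, action, immediately=False):
        # `immediately` is accepted for call-site compatibility and ignored here
        if self.scheduled_action is not None:
            return self.scheduled_action  # report what is already in progress
        self.scheduled_action = action
        return None

    def reset_scheduled_action(self):
        self.scheduled_action = None

    def run(self, func, args=()):
        try:
            return func(*args)
        finally:
            self.reset_scheduled_action()

    def run_async(self, func, args=()):
        threading.Thread(target=self.run, args=(func, args)).start()
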
Example #3
File: ha.py  Project: alkersan/patroni
class Ha(object):

    def __init__(self, patroni):
        self.patroni = patroni
        self.state_handler = patroni.postgresql
        self.dcs = patroni.dcs
        self.cluster = None
        self.old_cluster = None
        self.recovering = False
        self._async_executor = AsyncExecutor()

    def load_cluster_from_dcs(self):
        cluster = self.dcs.get_cluster()

        # We want to keep the state of cluster when it was healthy
        if not cluster.is_unlocked() or not self.old_cluster:
            self.old_cluster = cluster
        self.cluster = cluster

    def acquire_lock(self):
        return self.dcs.attempt_to_acquire_leader()

    def update_lock(self):
        ret = self.dcs.update_leader()
        if ret and not self._async_executor.busy:
            try:
                self.dcs.write_leader_optime(self.state_handler.last_operation())
            except:
                pass
        return ret

    def has_lock(self):
        lock_owner = self.cluster.leader and self.cluster.leader.name
        logger.info('Lock owner: %s; I am %s', lock_owner, self.state_handler.name)
        return lock_owner == self.state_handler.name

    def touch_member(self):
        data = {
            'conn_url': self.state_handler.connection_string,
            'api_url': self.patroni.api.connection_string,
            'state': self.state_handler.state,
            'role': self.state_handler.role
        }
        if self.patroni.tags:
            data['tags'] = self.patroni.tags
        if self.state_handler.pending_restart:
            data['pending_restart'] = True
        if not self._async_executor.busy and data['state'] in ['running', 'restarting', 'starting']:
            try:
                data['xlog_location'] = self.state_handler.xlog_position()
            except:
                pass
        if self.patroni.scheduled_restart:
            scheduled_restart_data = self.patroni.scheduled_restart.copy()
            scheduled_restart_data['schedule'] = scheduled_restart_data['schedule'].isoformat()
            data['scheduled_restart'] = scheduled_restart_data

        self.dcs.touch_member(json.dumps(data, separators=(',', ':')))

    def clone(self, clone_member=None, msg='(without leader)'):
        if self.state_handler.clone(clone_member):
            logger.info('bootstrapped %s', msg)
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            self.state_handler.follow(node_to_follow, cluster.leader, True)
        else:
            logger.error('failed to bootstrap %s', msg)
            self.state_handler.remove_data_directory()

    def bootstrap(self):
        if not self.cluster.is_unlocked():  # cluster already has leader
            clone_member = self.cluster.get_clone_member()
            member_role = 'leader' if clone_member == self.cluster.leader else 'replica'
            msg = "from {0} '{1}'".format(member_role, clone_member.name)
            self._async_executor.schedule('bootstrap {0}'.format(msg))
            self._async_executor.run_async(self.clone, args=(clone_member, msg))
            return 'trying to bootstrap {0}'.format(msg)
        # no initialize key and node is allowed to be master and has 'bootstrap' section in a configuration file
        elif self.cluster.initialize is None and not self.patroni.nofailover and 'bootstrap' in self.patroni.config:
            if self.dcs.initialize(create_new=True):  # race for initialization
                try:
                    self.state_handler.bootstrap(self.patroni.config['bootstrap'])
                    self.dcs.initialize(create_new=False, sysid=self.state_handler.sysid)
                except:  # initdb or start failed
                    # remove initialization key and give a chance to other members
                    logger.info("removing initialize key after failed attempt to initialize the cluster")
                    self.dcs.cancel_initialization()
                    self.state_handler.stop('immediate')
                    self.state_handler.move_data_directory()
                    raise
                self.dcs.set_config_value(json.dumps(self.patroni.config.dynamic_configuration, separators=(',', ':')))
                self.dcs.take_leader()
                self.load_cluster_from_dcs()
                return 'initialized a new cluster'
            else:
                return 'failed to acquire initialize lock'
        else:
            if self.state_handler.can_create_replica_without_replication_connection():
                self._async_executor.run_async(self.clone)
                return "trying to bootstrap (without leader)"
            return 'waiting for leader to bootstrap'

    def recover(self):
        self.recovering = True
        return self.follow("starting as readonly because i had the session lock", "starting as a secondary", True, True)

    def _get_node_to_follow(self, cluster):
        # determine the node to follow. If replicatefrom tag is set,
        # try to follow the node mentioned there, otherwise, follow the leader.
        if not self.patroni.replicatefrom or self.patroni.replicatefrom == self.state_handler.name:
            node_to_follow = cluster.leader
        else:
            node_to_follow = cluster.get_member(self.patroni.replicatefrom)

        return node_to_follow if node_to_follow and node_to_follow.name != self.state_handler.name else None

    def follow(self, demote_reason, follow_reason, refresh=True, recovery=False):
        if refresh:
            self.load_cluster_from_dcs()

        if recovery:
            ret = demote_reason if self.has_lock() else follow_reason
        else:
            ret = demote_reason if self.state_handler.is_leader() else follow_reason

        node_to_follow = self._get_node_to_follow(self.cluster)

        self.state_handler.follow(node_to_follow, self.cluster.leader, recovery, self._async_executor)

        return ret

    def enforce_master_role(self, message, promote_message):
        if self.state_handler.is_leader() or self.state_handler.role == 'master':
            return message
        else:
            self.state_handler.promote()
            self.touch_member()
            return promote_message

    @staticmethod
    def fetch_node_status(member):
        """This function perform http get request on member.api_url and fetches its status
        :returns: tuple(`member`, reachable, in_recovery, xlog_location)

        reachable - `!False` if the node is not reachable or is not responding with correct JSON
        in_recovery - `!True` if pg_is_in_recovery() == true
        xlog_location - value of `replayed_location` or `location` from JSON, dependin on its role.
        tags - dictionary with values of different tags (i.e. nofailover)
        """

        try:
            response = requests.get(member.api_url, timeout=2, verify=False)
            logger.info('Got response from %s %s: %s', member.name, member.api_url, response.content)
            json = response.json()
            is_master = json['role'] == 'master'
            xlog_location = None if is_master else json['xlog']['replayed_location']
            return (member, True, not is_master, xlog_location, json.get('tags', {}))
        except:
            logger.exception('request failed: GET %s', member.api_url)
        return (member, False, None, 0, {})

    def fetch_nodes_statuses(self, members):
        pool = ThreadPool(len(members))
        results = pool.map(self.fetch_node_status, members)  # Run API calls on members in parallel
        pool.close()
        pool.join()
        return results

    def _is_healthiest_node(self, members, check_replication_lag=True):
        """This method tries to determine whether I am healthy enough to became a new leader candidate or not."""

        if check_replication_lag and not self.state_handler.check_replication_lag(self.cluster.last_leader_operation):
            return False  # Too far behind last reported xlog location on master

        # Prepare list of nodes to run check against
        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]

        if members:
            my_xlog_location = self.state_handler.xlog_position()
            for member, reachable, in_recovery, xlog_location, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):  # If the node is unreachable it's not healthy
                    if not in_recovery:
                        logger.warning('Master (%s) is still alive', member.name)
                        return False
                    if my_xlog_location < xlog_location:
                        return False
        return True

    def is_failover_possible(self, members):
        ret = False
        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]
        if members:
            for member, reachable, _, _, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):
                    ret = True  # TODO: check xlog_location
                elif not reachable:
                    logger.info('Member %s is not reachable', member.name)
                elif tags.get('nofailover', False):
                    logger.info('Member %s is not allowed to promote', member.name)
        else:
            logger.warning('manual failover: members list is empty')
        return ret

    def manual_failover_process_no_leader(self):
        failover = self.cluster.failover
        if failover.candidate:  # manual failover to specific member
            if failover.candidate == self.state_handler.name:  # manual failover to me
                return True

            # find specific node and check that it is healthy
            member = self.cluster.get_member(failover.candidate, fallback_to_leader=False)
            if member:
                member, reachable, _, _, tags = self.fetch_node_status(member)
                if reachable and not tags.get('nofailover', False):  # node is healthy
                    logger.info('manual failover: to %s, i am %s', member.name, self.state_handler.name)
                    return False
                # we wanted to failover to specific member but it is not healthy
                if not reachable:
                    logger.warning('manual failover: member %s is unhealthy', member.name)
                elif tags.get('nofailover', False):
                    logger.warning('manual failover: member %s is not allowed to promote', member.name)

            # at this point we should consider all members as candidates for failover
            # i.e. we assume that failover.candidate is None

        # try to pick some other members to failover and check that they are healthy
        if failover.leader:
            if self.state_handler.name == failover.leader:  # I was the leader
                # exclude myself and the desired member, which turned out to be unhealthy (failover.candidate can be None)
                members = [m for m in self.cluster.members if m.name not in (failover.candidate, failover.leader)]
                if self.is_failover_possible(members):  # check that there are healthy members
                    return False
                else:  # I was the leader and it looks like currently I am the only healthy member
                    return True

            # at this point we assume that our node is a candidate for a failover among all nodes except former leader

        # exclude former leader from the list (failover.leader can be None)
        members = [m for m in self.cluster.members if m.name != failover.leader]
        return self._is_healthiest_node(members, check_replication_lag=False)
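
    # Rough decision summary for this no-leader case, assuming an illustrative key
    # with failover.leader == 'node1' and failover.candidate == 'node2':
    #   - I am node2                                  -> True (promote myself)
    #   - node2 is reachable and allowed to promote   -> False (let node2 win the race)
    #   - I am node1: True only if no other member is
    #     healthy enough to take over, otherwise False
    #   - otherwise fall back to _is_healthiest_node()
    #     over every member except node1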

    def is_healthiest_node(self):
        if self.state_handler.is_leader():  # leader is always the healthiest
            return True

        if self.patroni.nofailover:  # nofailover tag makes node always unhealthy
            return False

        if self.cluster.failover:
            return self.manual_failover_process_no_leader()

        # run usual health check
        members = {m.name: m for m in self.cluster.members + self.old_cluster.members}
        return self._is_healthiest_node(members.values())

    def demote(self, delete_leader=True):
        if delete_leader:
            self.state_handler.stop()
            self.state_handler.set_role('unknown')
            self.dcs.delete_leader()
            self.touch_member()
            self.dcs.reset_cluster()
            sleep(2)  # Give somebody time to promote
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            self.state_handler.follow(node_to_follow, cluster.leader, True)
        else:
            self.state_handler.follow(None, None)
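
    # The two call sites elsewhere in this class (see the methods below):
    #   self.demote()                     # manual failover: delete the leader key,
    #                                     # wait briefly, then follow the new leader
    #   self.demote(delete_leader=False)  # DCS unreachable: keep the key, but stop
    #                                     # running as a master locally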

    def should_run_scheduled_action(self, action_name, scheduled_at, cleanup_fn):
        if scheduled_at:
            # If the scheduled action is in the far future, we shouldn't do anything and just return.
            # If the scheduled action is in the past, we consider the value stale and remove it.
            # If the value is close to now, we initiate the scheduled action.
            # Additionally, if the scheduled action cannot be executed at all, i.e. there is an error
            # or the action is in the past, we take care of cleaning it up.
            now = datetime.datetime.now(pytz.utc)
            try:
                delta = (scheduled_at - now).total_seconds()

                if delta > self.dcs.loop_wait:
                    logger.info('Awaiting %s at %s (in %.0f seconds)',
                                action_name, scheduled_at.isoformat(), delta)
                    return False
                elif delta < -int(self.dcs.loop_wait * 1.5):
                    logger.warning('Found a stale %s value, cleaning up: %s',
                                   action_name, scheduled_at.isoformat())
                    cleanup_fn()  # cleanup is delegated to the caller-provided cleanup_fn
                    return False

                # The value is very close to now
                sleep(max(delta, 0))
                logger.info('Manually scheduled {0} at %s'.format(action_name), scheduled_at.isoformat())
                return True
            except TypeError:
                logger.warning('Incorrect value of scheduled_at: %s', scheduled_at)
                cleanup_fn()
        return False
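
    # Worked example, assuming self.dcs.loop_wait == 10 (seconds):
    #   delta > 10          -> log 'Awaiting ...' and return False (check again next cycle)
    #   delta < -15         -> the value is stale: run cleanup_fn() and return False
    #   -15 <= delta <= 10  -> sleep(max(delta, 0)) until the scheduled time, then
    #                          return True so the caller performs the action now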

    def process_manual_failover_from_leader(self):
        failover = self.cluster.failover

        if (failover.scheduled_at and not
            self.should_run_scheduled_action("failover", failover.scheduled_at, lambda:
                                             self.dcs.manual_failover('', '', index=self.cluster.failover.index))):
            return

        if not failover.leader or failover.leader == self.state_handler.name:
            if not failover.candidate or failover.candidate != self.state_handler.name:
                members = [m for m in self.cluster.members if not failover.candidate or m.name == failover.candidate]
                if self.is_failover_possible(members):  # check that there are healthy members
                    self._async_executor.schedule('manual failover: demote')
                    self._async_executor.run_async(self.demote)
                    return 'manual failover: demoting myself'
                else:
                    logger.warning('manual failover: no healthy members found, failover is not possible')
            else:
                logger.warning('manual failover: I am already the leader, no need to failover')
        else:
            logger.warning('manual failover: leader name does not match: %s != %s',
                           self.cluster.failover.leader, self.state_handler.name)

        logger.info('Trying to clean up failover key')
        self.dcs.manual_failover('', '', index=self.cluster.failover.index)
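
    # For example (illustrative names), with failover.leader == 'node1' and
    # failover.candidate == 'node2', running this on node1 demotes node1 as soon
    # as node2 is reachable and allowed to promote; once the leader key is gone,
    # node2 picks the same key up in manual_failover_process_no_leader() above.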

    def process_unhealthy_cluster(self):
        if self.is_healthiest_node():
            if self.acquire_lock():
                if self.cluster.failover:
                    logger.info('Cleaning up failover key after acquiring leader lock...')
                    self.dcs.manual_failover('', '')
                self.load_cluster_from_dcs()
                return self.enforce_master_role('acquired session lock as a leader',
                                                'promoted self to leader by acquiring session lock')
            else:
                return self.follow('demoted self after trying and failing to obtain lock',
                                   'following new leader after trying and failing to obtain lock')
        else:
            if self.patroni.nofailover:
                return self.follow('demoting self because I am not allowed to become master',
                                   'following a different leader because I am not allowed to promote')
            return self.follow('demoting self because i am not the healthiest node',
                               'following a different leader because i am not the healthiest node')

    def process_healthy_cluster(self):
        if self.has_lock():
            if self.cluster.failover:
                msg = self.process_manual_failover_from_leader()
                if msg is not None:
                    return msg

            if self.update_lock():
                return self.enforce_master_role('no action.  i am the leader with the lock',
                                                'promoted self to leader because i had the session lock')
            else:
                # Either there is no connection to DCS or someone else acquired the lock
                logger.error('failed to update leader lock')
                self.load_cluster_from_dcs()
        else:
            logger.info('does not have lock')
        return self.follow('demoting self because i do not have the lock and i was a leader',
                           'no action.  i am a secondary and i am following a leader', False)
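
    # In the steady state this reduces to: the lock holder re-checks any failover
    # key, renews the leader lock and keeps the master role, while every other
    # member ends up in the final follow(..., False) call as a streaming replica.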

    def evaluate_scheduled_restart(self):
        # restart if we need to
        restart_data = self.future_restart_scheduled()
        if restart_data:
            recent_time = self.state_handler.postmaster_start_time()
            request_time = restart_data['postmaster_start_time']
            # check if postmaster start time has changed since the last restart
            if recent_time and request_time and recent_time != request_time:
                logger.info("Cancelling scheduled restart: postgres restart has already happened at %s", recent_time)
                self.delete_future_restart()
                return None

        if (restart_data and
           self.should_run_scheduled_action('restart', restart_data['schedule'], self.delete_future_restart)):
            try:
                ret, message = self.restart(restart_data, run_async=True)
                if not ret:
                    logger.warning("Scheduled restart: %s", message)
                    return None
                return message
            finally:
                self.delete_future_restart()

    def restart_matches(self, role, postgres_version, pending_restart):
        reason_to_cancel = ""
        # checking the restart filters here seems less ugly than moving them into
        # should_run_scheduled_action
        if role and role != self.state_handler.role:
            reason_to_cancel = "host role mismatch"

        if (postgres_version and
           self.state_handler.postgres_version_to_int(postgres_version) <= int(self.state_handler.server_version)):
            reason_to_cancel = "postgres version mismatch"

        if pending_restart and not self.state_handler.pending_restart:
            reason_to_cancel = "pending restart flag is not set"

        if not reason_to_cancel:
            return True
        else:
            logger.info("not proceeding with the restart: %s", reason_to_cancel)
        return False

    def schedule(self, action, immediate=False):
        with self._async_executor:
            return self._async_executor.schedule(action, immediate)

    def schedule_future_restart(self, restart_data):
        with self._async_executor:
            if not self.patroni.scheduled_restart:
                self.patroni.scheduled_restart = restart_data
                self.touch_member()
                return True
        return False

    def delete_future_restart(self):
        ret = False
        with self._async_executor:
            if self.patroni.scheduled_restart:
                self.patroni.scheduled_restart = {}
                self.touch_member()
                ret = True
        return ret

    def future_restart_scheduled(self):
        return self.patroni.scheduled_restart.copy() if (self.patroni.scheduled_restart and
                                                         isinstance(self.patroni.scheduled_restart, dict)) else None
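
    # restart_data, as used here and in evaluate_scheduled_restart()/restart(),
    # is assumed to be a plain dict along these lines (illustrative values):
    #   {'schedule': <timezone-aware datetime>,                      # when to restart
    #    'postmaster_start_time': <current postmaster start time>,   # guards against double restarts
    #    'role': 'replica',                                          # optional filters evaluated
    #    'postgres_version': '9.5.2',                                # by restart_matches()
    #    'restart_pending': True}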

    def schedule_reinitialize(self):
        return self.schedule('reinitialize')

    def reinitialize_scheduled(self):
        return self._async_executor.scheduled_action == 'reinitialize'

    def schedule_restart(self, immediate=False):
        return self.schedule('restart', immediate)

    def restart_scheduled(self):
        return self._async_executor.scheduled_action == 'restart'

    def restart(self, restart_data=None, run_async=False):
        """ conditional and unconditional restart """
        if (restart_data and isinstance(restart_data, dict) and
            not self.restart_matches(restart_data.get('role'),
                                     restart_data.get('postgres_version'),
                                     ('restart_pending' in restart_data))):
            return (False, "restart conditions are not satisfied")

        with self._async_executor:
            prev = self.schedule_restart(immediate=(not run_async))
            if prev is not None:
                return (False, prev + ' already in progress')
            if not run_async:
                if self._async_executor.run(self.state_handler.restart):
                    return (True, 'restarted successfully')
                else:
                    return (False, 'restart failed')
            else:
                self._async_executor.run_async(self.state_handler.restart)
                return (True, "restart initiated")

    def reinitialize(self, cluster):
        self.state_handler.stop('immediate')
        self.state_handler.remove_data_directory()

        clone_member = cluster.get_clone_member()
        member_role = 'leader' if clone_member == cluster.leader else 'replica'
        self.clone(clone_member, "from {0} '{1}'".format(member_role, clone_member.name))

    def process_scheduled_action(self):
        if self.reinitialize_scheduled():
            if self.cluster.is_unlocked():
                logger.error('Cluster has no leader, can not reinitialize')
                self._async_executor.reset_scheduled_action()
            elif self.has_lock():
                logger.error('I am the leader, can not reinitialize')
                self._async_executor.reset_scheduled_action()
            else:
                self._async_executor.run_async(self.reinitialize, args=(self.cluster, ))
                return 'reinitialize started'

    def handle_long_action_in_progress(self):
        if self.has_lock():
            if self.update_lock():
                return 'updated leader lock during ' + self._async_executor.scheduled_action
            else:
                return 'failed to update leader lock during ' + self._async_executor.scheduled_action
        elif self.cluster.is_unlocked():
            return 'not healthy enough for leader race'
        else:
            return self._async_executor.scheduled_action + ' in progress'

    @staticmethod
    def sysid_valid(sysid):
        # sysid does tv_sec << 32, where tv_sec is the number of seconds since 1970,
        # so even 1 << 32 would have 10 digits.
        sysid = str(sysid)
        return len(sysid) >= 10 and sysid.isdigit()
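
    # For reference, a real PostgreSQL system identifier is a 64-bit number derived
    # from the cluster creation time, e.g. something like '6335298263670877087'
    # (19 digits), so the ">= 10 digits" check accepts it while rejecting obviously
    # bogus values such as '' or 'None'.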

    def post_recover(self):
        if not self.state_handler.is_running():
            if self.has_lock():
                self.dcs.delete_leader()
                self.dcs.reset_cluster()
                return 'removed leader key after trying and failing to start postgres'
            return 'failed to start postgres'
        return None

    def _run_cycle(self):
        try:
            self.load_cluster_from_dcs()

            self.touch_member()

            # cluster has leader key but not initialize key
            if not (self.cluster.is_unlocked() or self.sysid_valid(self.cluster.initialize)) and self.has_lock():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)

            if not (self.cluster.is_unlocked() or self.cluster.config and self.cluster.config.data) and self.has_lock():
                self.dcs.set_config_value(json.dumps(self.patroni.config.dynamic_configuration, separators=(',', ':')))

            if self._async_executor.busy:
                return self.handle_long_action_in_progress()

            # we've got here, so any async action has finished. Check if we tried to recover and failed
            if self.recovering:
                self.recovering = False
                msg = self.post_recover()
                if msg is not None:
                    return msg

            # currently it can trigger only reinitialize
            msg = self.process_scheduled_action()
            if msg is not None:
                return msg

            # is data directory empty?
            if self.state_handler.data_directory_empty():
                return self.bootstrap()  # new node
            # "bootstrap", but data directory is not empty
            elif not self.sysid_valid(self.cluster.initialize) and self.cluster.is_unlocked():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)
            else:
                # check if we are allowed to join
                if self.sysid_valid(self.cluster.initialize) and self.cluster.initialize != self.state_handler.sysid:
                    logger.fatal("system ID mismatch, node %s belongs to a different cluster: %s != %s",
                                 self.state_handler.name, self.cluster.initialize, self.state_handler.sysid)
                    sys.exit(1)

            # try to start dead postgres
            if not self.state_handler.is_healthy():
                msg = self.recover()
                if msg is not None:
                    return msg

            try:
                if self.cluster.is_unlocked():
                    return self.process_unhealthy_cluster()
                else:
                    msg = self.evaluate_scheduled_restart()
                    if msg is not None:
                        return msg
                    return self.process_healthy_cluster()
            finally:
                # we might not have a valid PostgreSQL connection here if another thread
                # stops PostgreSQL, therefore we only reload replication slots if no
                # asynchronous processes are running (this should always be the case for the master)
                if not self._async_executor.busy:
                    self.state_handler.sync_replication_slots(self.cluster)
        except DCSError:
            logger.error('Error communicating with DCS')
            if self.state_handler.is_running() and self.state_handler.is_leader():
                self.demote(delete_leader=False)
                return 'demoted self because DCS is not accessible and i was a leader'
        except (psycopg2.Error, PostgresConnectionException):
            logger.exception('Error communicating with PostgreSQL. Will try again later')

    def run_cycle(self):
        with self._async_executor:
            return self._run_cycle()
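
    # A minimal sketch of how the surrounding daemon is assumed to drive run_cycle()
    # (names below are illustrative, not Patroni's actual main loop):
    #
    #   while True:
    #       logger.info(ha.run_cycle())   # 'ha' is an instance of this class
    #       sleep(loop_wait)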