예제 #1
0
    def start(self, profile=False, master=True, slave=True):
        """Start the redis processes of the cluster.

        Every option must be a real ``bool``; otherwise an error is logged
        and the method returns without starting anything. It also returns
        without starting anything when the host connection check fails.

        :param profile: Forwarded as first argument to
            ``Center.start_redis_process``
        :param master: If exclude master cluster, set False
        :param slave: If exclude slave cluster, set False
        :raises LightningDBError: code 11 (master) or 12 (slave) when the
            alive redis count is larger than the count obtained with
            ``check_owner=True``
        """
        logger.debug("command 'cluster start'")

        # Reject non-boolean options, checking them in declaration order.
        options = (('profile', profile), ('master', master), ('slave', slave))
        for option_name, option_value in options:
            if isinstance(option_value, bool):
                continue
            template = message.get('error_option_type_not_boolean')
            logger.error(template.format(option=option_name))
            return

        center = Center()
        center.update_ip_port()
        if not center.check_hosts_connection():
            return
        center.ensure_cluster_exist()

        # Alive processes that the check_owner=True query does not count
        # are treated as a collision and abort the start.
        if master:
            alive_total = center.get_alive_master_redis_count()
            alive_owned = center.get_alive_master_redis_count(check_owner=True)
            foreign = alive_total - alive_owned
            if foreign > 0:
                lines = message.get('error_cluster_start_master_collision')
                text = '\n'.join(lines).format(count=foreign)
                raise LightningDBError(11, text)
        if slave:
            alive_total = center.get_alive_slave_redis_count()
            alive_owned = center.get_alive_slave_redis_count(check_owner=True)
            foreign = alive_total - alive_owned
            if foreign > 0:
                lines = message.get('error_cluster_start_slave_collision')
                text = '\n'.join(lines).format(count=foreign)
                raise LightningDBError(12, text)

        center.backup_server_logs(master=master, slave=slave)
        center.create_redis_data_directory()

        # equal to cluster.configure()
        center.configure_redis()
        center.sync_conf(show_result=True)

        center.start_redis_process(profile, master=master, slave=slave)
        center.wait_until_all_redis_process_up(master=master, slave=slave)
예제 #2
0
def _deploy_zero_downtime(cluster_id):
    """Install a new release on cluster ``cluster_id`` using slave failover.

    Steps, in order:

    1. Abort (error is logged) unless the alive master count reaches
       ``len(master hosts) * len(master ports)`` and slaves are enabled.
    2. Ask the user for an installer, back up the conf locally and back up
       the cluster on every slave host.
    3. On every master host create the ``.deploy.state`` marker, transfer
       the installer and run it. An ``SSHCommandError`` from the install
       step is logged and ends the deployment.
    4. Restore the conf and remove the ``.deploy.state`` marker.
    5. Restart the slave nodes, then issue a failover to every slave with
       ``cluster-node-timeout`` temporarily set to 2000 (up to 10 rounds,
       5 seconds apart). The original timeout values are always restored,
       even when a failover call raises.
    6. Restart the nodes that are slaves after the failover.

    :param cluster_id: Id of the cluster to deploy to
    :return: None; failures are logged and end the function early
    """
    logger.debug("zero downtime update cluster {}".format(cluster_id))
    center = Center()
    center.update_ip_port()
    m_hosts = center.master_host_list
    m_ports = center.master_port_list
    s_hosts = center.slave_host_list
    s_ports = center.slave_port_list
    path_of_fb = config.get_path_of_fb(cluster_id)
    cluster_path = path_of_fb['cluster_path']

    # check master alive
    m_count = len(m_hosts) * len(m_ports)
    alive_m_count = center.get_alive_master_redis_count()
    if alive_m_count < m_count:
        logger.error(message.get('error_exist_disconnected_master'))
        return

    # NOTE(review): `config.is_slave_enabled` is referenced without being
    # called. If it is a function rather than a flag/property, this check
    # can never fire -- confirm against the config module.
    if not config.is_slave_enabled:
        logger.error(message.get('error_need_to_slave'))
        return

    # select installer
    installer_path = ask_util.installer()
    installer_name = os.path.basename(installer_path)

    # backup info
    current_time = time.strftime("%Y%m%d%H%M%S", time.gmtime())
    conf_backup_dir = 'cluster_{}_conf_bak_{}'.format(cluster_id, current_time)
    cluster_backup_dir = 'cluster_{}_bak_{}'.format(cluster_id, current_time)
    local_ip = config.get_local_ip()

    # backup conf
    center.conf_backup(local_ip, cluster_id, conf_backup_dir)

    # backup cluster
    for host in s_hosts:
        # The client is not handed to cluster_backup; the connection is kept
        # as in the original flow and closed even if the backup raises.
        client = net.get_ssh(host)
        try:
            center.cluster_backup(host, cluster_id, cluster_backup_dir)
        finally:
            client.close()

    # transfer & install
    logger.info(message.get('transfer_and_execute_installer'))
    mark_cmd = 'mkdir -p {0} && touch {0}/.deploy.state'.format(cluster_path)
    for host in m_hosts:
        logger.info(' - {}'.format(host))
        client = net.get_ssh(host)
        try:
            net.ssh_execute(client=client, command=mark_cmd)
        finally:
            # Do not leak the SSH connection when the remote command fails.
            client.close()
        DeployUtil().transfer_installer(host, cluster_id, installer_path)
        try:
            DeployUtil().install(host, cluster_id, installer_name)
        except SSHCommandError as ex:
            msg = message.get('error_execute_installer')
            msg = msg.format(installer=installer_path)
            logger.error(msg)
            logger.exception(ex)
            return

    # restore conf
    center.conf_restore(local_ip, cluster_id, conf_backup_dir)

    # set deploy state complete
    # The path is looked up again after the conf restore (as the original
    # did), but only once instead of once per host.
    path_of_fb = config.get_path_of_fb(cluster_id)
    cluster_path = path_of_fb['cluster_path']
    unmark_cmd = 'rm -rf {}'.format(
        os.path.join(cluster_path, '.deploy.state')
    )
    for node in m_hosts:
        client = net.get_ssh(node)
        try:
            net.ssh_execute(client=client, command=unmark_cmd)
        finally:
            client.close()

    # restart slave
    center.stop_current_nodes(master=False, slave=True)
    center.configure_redis()
    center.sync_conf()
    center.start_current_nodes(master=False, slave=True)

    center.wait_until_all_redis_process_up()
    slaves_for_failover = center.get_slave_nodes()

    key = 'cluster-node-timeout'
    origin_m_value = center.cli_config_get(key, m_hosts[0], m_ports[0])
    origin_s_value = center.cli_config_get(key, s_hosts[0], s_ports[0])

    # Reply observed once a node has completely become master.
    already_master = "ERR You should send CLUSTER FAILOVER to a slave"
    success = False
    try:
        logger.debug('config set: cluster-node-timeout 2000')
        RedisCliConfig().set(key, '2000', all=True)

        # cluster failover (with no option)
        logger.info(message.get('failover_on_deploy'))
        logger.debug(slaves_for_failover)
        try_count = 0
        while try_count < 10:
            try_count += 1
            success = True
            for slave_addr in slaves_for_failover:
                host, port = slave_addr.split(':')
                stdout = center.run_failover("{}:{}".format(host, port))
                logger.debug("failover {}:{} {}".format(host, port, stdout))
                if stdout != already_master:
                    # In some cases, the cluster failover is not complete
                    # even if stdout is OK
                    # If redis changed to master completely,
                    # return 'ERR You should send CLUSTER FAILOVER to a slave'
                    success = False
            if success:
                break
            msg = message.get('retry').format(try_count=try_count)
            logger.info(msg)
            time.sleep(5)
    finally:
        # Always put the original timeouts back, otherwise an exception
        # during failover would leave the nodes on the temporary value.
        logger.debug('restore config: cluster-node-timeout')
        center.cli_config_set_all(key, origin_m_value, m_hosts, m_ports)
        center.cli_config_set_all(key, origin_s_value, s_hosts, s_ports)
    if not success:
        logger.error(message.get('error_redis_failover'))
        return

    # restart master (current slave)
    center.stop_current_nodes(master=False, slave=True)
    center.configure_redis(slave=False)
    center.sync_conf()
    center.start_current_nodes(master=False, slave=True)
    center.wait_until_all_redis_process_up()