Пример #1
0
def clean_up_migration(source_replica_set):
    """ Abort a shard migration and clean up the destination master

    Looks up the migration via the migration lock on the source replica set,
    removes replication from the destination master (if any is configured),
    renames orphaned dbs on the destination master so they can be dropped,
    and finally marks the migration as aborted.

    Args:
    source_replica_set - The source replica set of the migration; used to
                         find the migration lock

    Raises:
    Exception - If the destination master has orphaned dbs that are in use
    """
    migration = start_shard_migration.check_migration_lock(source_replica_set)
    destination_replica_set = migration['destination_replica_set']
    mig_lock_identifier = migration['lock_identifier']
    zk = host_utils.MysqlZookeeper()
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)

    # A failure to read slave status is treated as "no replication to
    # remove". Only catch Exception so that KeyboardInterrupt/SystemExit
    # are not silently interpreted the same way.
    try:
        mysql_lib.get_slave_status(destination_master)
        reset_repl = True
    except Exception:
        reset_repl = False

    if reset_repl:
        log.info('Taking promotion locks')
        dest_lock_identifier = mysql_failover.get_promotion_lock(
            destination_replica_set)
        log.info('Removing replication from destination master {}'
                 ''.format(destination_master))
        # Any error from reset_slave propagates, but the promotion lock is
        # always released first.
        try:
            mysql_lib.reset_slave(destination_master)
        finally:
            mysql_failover.release_promotion_lock(dest_lock_identifier)

    (orphans_tmp, orphaned_but_used_tmp, _) = \
            find_shard_mismatches.find_shard_mismatches(destination_master)

    # The results are keyed by instance; an absent key means nothing found
    orphans = (orphans_tmp[destination_master]
               if destination_master in orphans_tmp else [])
    orphaned_but_used = (orphaned_but_used_tmp[destination_master]
                         if destination_master in orphaned_but_used_tmp
                         else [])

    if orphaned_but_used:
        log.info('Orphaned but used dbs: {}'.format(
            ', '.join(orphaned_but_used)))
        raise Exception('Cowardly refusing to do anything')

    if orphans:
        log.info('Orphaned dbs: {}'.format(', '.join(orphans)))
        fix_orphaned_shards.rename_db_to_drop(destination_master, orphans)

    start_shard_migration.finish_migration_log(
        mig_lock_identifier, start_shard_migration.STATUS_ABORTED)
Пример #2
0
def mysql_failover(master, dry_run, skip_lock,
                   ignore_dr_slave, trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Shut down MySQL on the old master before promoting
                      (ignored in dry_run mode)

    Returns:
    None. In dry_run mode the process is terminated with os._exit(0) once
    the sanity checks have passed.

    Raises:
    Any exception hit during the sanity checks or the promotion is re-raised
    after a rollback of the old master and release of the promotion lock.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master, rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there any problems we roll back from the except
    try:
        # Initialized first so the rollback can always test it
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(replica_set=replica_set,
                                                       repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(replica_set,
                                                              host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info('DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # let's make sure that what we think is the master, actually is
        confirm_replica_topology(master, replicas)

        # We use master_conn as a mysql connection to the master server, if
        # it is falsy (False/None), the master is treated as dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)
        slave_conn = mysql_lib.connect_mysql(slave)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        try:
            mysql_lib.get_master_status(slave_conn)
        except mysql_lib.ReplicationError:
            log.error('New master {slave} is not setup to write replicaiton '
                      'logs!'.format(slave=slave))
            raise
        log.info('Slave/new master is setup to write replication logs')

        # dry_run promises not to change state, so never shut the master
        # down in that mode.
        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas, MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas, MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get caught in the giant try
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master_conn, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master_conn)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas, 0,
                                    timeout=MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave_conn)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming relpica servers in sync')
                confirm_max_replica_lag(replicas, MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                        replicas_synced=True,
                                        dead_master=dead_master)
    except:
        # Bare except is deliberate: roll back on anything, then re-raise
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master')
                mysql_lib.set_global_variable(master_conn, 'read_only', False)

                log.info('Clearing replication settings on old master')
                mysql_lib.reset_slave(master_conn)
        finally:
            # Release the lock even if undoing the master changes fails,
            # otherwise the replica set stays locked.
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # Past the point of no return: failures from here on fail forward
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt+1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave_conn, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave_conn)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
Пример #3
0
def mysql_failover(master, dry_run, skip_lock, ignore_dr_slave,
                   trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Shut down MySQL on the old master before promoting
                      (ignored in dry_run mode)

    Returns:
    None. In dry_run mode the process is terminated with
    os._exit(environment_specific.DRY_RUN_EXIT_CODE) once the sanity checks
    have passed.

    Raises:
    Any exception hit during the sanity checks or the promotion is re-raised
    after a rollback of the old master and release of the promotion lock.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there any problems we roll back from the except
    try:
        # Initialized first so the rollback can always test it
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set, repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info(
            'DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # master_conn is only used as an alive/dead flag in this function:
        # if it is falsy (False/None), the master is treated as dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        mysql_lib.get_master_status(slave)

        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                                    dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                    dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get caught in the giant try
            os._exit(environment_specific.DRY_RUN_EXIT_CODE)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NONE,
                                    dead_master, True,
                                    mysql_lib.NORMAL_HEARTBEAT_LAG)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming replica servers are synced')
                confirm_max_replica_lag(replicas,
                                        mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                        dead_master, True)
    except:
        # Bare except is deliberate: roll back on anything, then re-raise
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master')
                mysql_lib.set_global_variable(master, 'read_only', False)

                log.info('Clearing replication settings on old master')
                mysql_lib.reset_slave(master)
        finally:
            # Release the lock even if undoing the master changes fails,
            # otherwise the replica set stays locked.
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # Past the point of no return: failures from here on fail forward
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt + 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    # we don't really care if this fails, but we'll print a message anyway.
    try:
        environment_specific.generic_json_post(
            environment_specific.CHANGE_FEED_URL, {
                'type': 'MySQL Failover',
                'environment': replica_set,
                'description': "Failover from {m} to {s}".format(m=master,
                                                                 s=slave),
                'author': host_utils.get_user(),
                'automation': False,
                'source': "mysql_failover.py on {}".format(host_utils.HOSTNAME)
            })
    except Exception as e:
        log.warning("Failover completed, but change feed "
                    "not updated: {}".format(e))

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
Пример #4
0
def mysql_failover(master, dry_run, skip_lock, ignore_dr_slave,
                   trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existence of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Shut down MySQL on the old master before promoting
                      (ignored in dry_run mode)

    Returns:
    None. In dry_run mode the process is terminated with os._exit(0) once
    the sanity checks have passed.

    Raises:
    Any exception hit during the sanity checks or the promotion is re-raised
    after a rollback of the old master and release of the promotion lock.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master,
                                                        rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # giant try. If there any problems we roll back from the except
    try:
        # Initialized first so the rollback can always test it
        master_conn = False
        slave = zk.get_mysql_instance_from_replica_set(
            replica_set=replica_set, repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(
                replica_set, host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info(
            'DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # let's make sure that what we think is the master, actually is
        confirm_replica_topology(master, replicas)

        # We use master_conn as a mysql connection to the master server, if
        # it is falsy (False/None), the master is treated as dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)
        slave_conn = mysql_lib.connect_mysql(slave)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        try:
            mysql_lib.get_master_status(slave_conn)
        except mysql_lib.ReplicationError:
            log.error('New master {slave} is not setup to write replicaiton '
                      'logs!'.format(slave=slave))
            raise
        log.info('Slave/new master is setup to write replication logs')

        # dry_run promises not to change state, so never shut the master
        # down in that mode.
        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get caught in the giant try
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master_conn, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master_conn)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas,
                                    0,
                                    timeout=MAX_ALIVE_MASTER_SLAVE_LAG_SECONDS,
                                    dead_master=dead_master)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave_conn)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming relpica servers in sync')
                confirm_max_replica_lag(replicas,
                                        MAX_DEAD_MASTER_SLAVE_LAG_SECONDS,
                                        replicas_synced=True,
                                        dead_master=dead_master)
    except:
        # Bare except is deliberate: roll back on anything, then re-raise
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master')
                mysql_lib.set_global_variable(master_conn, 'read_only', False)

                log.info('Clearing replication settings on old master')
                mysql_lib.reset_slave(master_conn)
        finally:
            # Release the lock even if undoing the master changes fails,
            # otherwise the replica set stays locked.
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    # Past the point of no return: failures from here on fail forward
    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except:
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt = zk_write_attempt + 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave_conn, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave_conn)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)
Пример #5
0
def mysql_failover(master, dry_run, skip_lock,
                   trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    The first slave returned by host_utils.get_slaves_of_master() becomes the
    new master; the remaining slaves are re-pointed at it afterwards.

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit
    skip_lock - Do not take a promotion lock
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Shut down MySQL on the old master before promoting
                      (ignored in dry_run mode)

    Returns:
    None. The outcome (including replicas that could not be re-pointed) is
    passed to report(). In dry_run mode the process is terminated with
    os._exit(0) once the sanity checks have passed.

    Raises:
    Exception - If the master has no slaves
    Any exception hit during the sanity checks or the promotion is re-raised
    after the rollback.
    """

    log.info('Master to demote is {master}'.format(master=master))

    slaves = host_utils.get_slaves_of_master(master)
    if not slaves:
        raise Exception('No slaves found.')

    # The first slave is promoted; the rest get re-pointed after promotion
    new_master = slaves[0]
    replicas = slaves[1:]

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on {new_master}'.format(new_master=new_master))
        cluster = host_utils.get_cluster_name(new_master)
        lock_identifier = get_promotion_lock(cluster)
    else:
        lock_identifier = None

    # giant try. If there any problems we roll back from the except
    try:
        # Initialized first so the rollback can always test it
        master_conn = False
        log.info('New master is detected as {slave}'.format(slave=new_master))

        # master_conn is only used as an alive/dead flag in this function:
        # if it is falsy (False/None), the master is treated as dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, slaves)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if New master {new_master} is setup to write '
                 'replication logs'.format(new_master=new_master))
        mysql_lib.get_master_status(new_master)

        # Never shut the master down in dry_run mode
        if kill_old_master and not dry_run:
            log.info('Killing old master {}, we hope you know what you are doing'.format(master))
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master {} is considered alive'.format(master))
            dead_master = False
            confirm_max_replica_lag(slaves,
                                    mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                                    dead_master)
        else:
            log.info('Master {} is considered dead'.format(master))
            dead_master = True
            confirm_max_replica_lag(slaves,
                                    mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                    dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get caught in the giant try
            os._exit(0)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master {}'.format(master))
            mysql_lib.set_global_read_only(master)
            log.info('Confirming no writes to old master {}'.format(master))
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(slaves,
                                    mysql_lib.REPLICATION_TOLERANCE_NONE,
                                    dead_master,
                                    True,
                                    mysql_lib.NORMAL_HEARTBEAT_LAG)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=new_master))
            mysql_lib.setup_replication(new_master=new_master, new_replica=master)
        else:
            log.info('Confirming replica/new master {} has processed all replication '
                     ' logs'.format(new_master))
            confirm_no_writes(new_master)
            log.info('Looks like no writes being processed by replica {} via '
                     'replication or other means'.format(new_master))
            if len(slaves) > 1:
                log.info('Confirming replica servers are synced')
                confirm_max_replica_lag(slaves,
                                        mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                        dead_master,
                                        True)
    except:
        # Bare except is deliberate: roll back on anything, then re-raise
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master {}'.format(master))
                mysql_lib.unset_global_read_only(master)

                log.info('Clearing replication settings on old master {}'.format(master))
                mysql_lib.reset_slave(master)
        finally:
            # The lock is released even if undoing the master changes fails
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
            log.info('Rollback complete, reraising exception')
        raise

    # Re-point the remaining slaves at the new master. Failures are recorded
    # and reported rather than aborting the failover.
    failed_replicas = []
    for s in replicas:
        log.info('Setting up replication {replication} from old master ({master}) '
         'to new master ({slave})'.format(master=master,
                                          slave=new_master,
                                          replication=s))
        try:
            mysql_lib.setup_replication(new_master=new_master, new_replica=s)
        except Exception as e:
            failed_replicas.append(s)
            log.error(e)
            log.error('Setting up replication on the slave {} failed. Failing forward!'.format(s))

    log.info('Updating config')
    host_utils.swap_master_and_slave(new_master)

    log.info('Removing read_only from new master {}'.format(new_master))
    mysql_lib.unset_global_read_only(new_master)
    log.info('Removing replication configuration from new master {}'.format(new_master))
    mysql_lib.reset_slave(new_master)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)
    report(master, slaves, new_master, replicas, failed_replicas)
Пример #6
0
def finish_shard_migration(source_replica_set,
                           dry_run=False,
                           confirm_row_counts=True):
    """ Run a partial failover to finish a migration

    The migration to finish is looked up from the migration lock held on the
    source replica set. Once the sanity checks pass, both masters are set
    read_only, replication from source to destination is broken, the
    migrated dbs are renamed on the source master, the shard mapping is
    updated and the blackhole dbs are dropped on the destination master.

    Args:
    source_replica_set -  The source replica set for the migration
    dry_run - Just run verifications, do not failover shards. In this mode
              the process exits via os._exit once the checks have passed.
    confirm_row_counts - If true, confirm row counts of slaves, otherwise
                         check estimated row counts

    Raises:
    Exception - If the current security role is not allowed to migrate or
                if the migration status is not STATUS_FAILOVER_READY.
                Errors from the checks or the migration steps propagate
                once the cleanup below has been attempted.
    """
    if host_utils.get_security_role(
    ) not in environment_specific.ROLE_TO_MIGRATE:
        raise Exception(environment_specific.ROLE_ERROR_MSG)
    # Figure out what we are working on based on the lock present on the
    # source_replica_set
    migration = start_shard_migration.check_migration_lock(source_replica_set)
    if migration['status'] != start_shard_migration.STATUS_FAILOVER_READY:
        raise Exception('Migration is not ready to run')

    destination_replica_set = migration['destination_replica_set']
    source_replica_set = migration['source_replica_set']
    mig_databases = set(migration['mig_databases'].split(', '))
    non_mig_databases = set(migration['non_mig_databases'].split(', '))
    mig_lock_identifier = migration['lock_identifier']

    # Figure out some more data based on the lock
    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(source_replica_set)
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)
    log.info('Will migrate {dbs} from {s} to {d}'
             ''.format(dbs=mig_databases,
                       s=source_replica_set,
                       d=destination_replica_set))

    # State consulted by the cleanup in the finally clause; initialised
    # before the try so it is always defined when cleanup runs.
    dest_lock_identifier = None
    src_lock_identifier = None
    read_only_set = False

    try:
        if not dry_run:
            log.info('Taking promotion locks')
            dest_lock_identifier = mysql_failover.get_promotion_lock(
                destination_replica_set)
            src_lock_identifier = mysql_failover.get_promotion_lock(
                source_replica_set)

        # Check replication
        log.info('Checking replication')
        check_replication_for_migration(source_replica_set,
                                        destination_replica_set)

        # Check schema and data
        log.info('Checking schema and row counts')
        verify_schema_for_migration(source_replica_set,
                                    destination_replica_set, mig_databases,
                                    confirm_row_counts)

        log.info('Confirming that no real tables exists on blackhole dbs')
        verify_blackhole_dbs(destination_master, non_mig_databases)

        if dry_run:
            log.info('In dry_run mode, existing now')
            # os._exit terminates the process immediately, so the finally
            # clause below does not run. That is fine here: in dry_run mode
            # no promotion locks were taken and read_only was never set.
            os._exit(environment_specific.DRY_RUN_EXIT_CODE)

        log.info('Preliminary sanity checks complete, starting migration')

        log.info('Setting read_only on master servers')
        # Flag is raised before the first change so that a failure part way
        # through still triggers the read_only cleanup.
        read_only_set = True
        mysql_lib.set_global_variable(source_master, 'read_only', True)

        # We don't strictly need to set read_only on the destination_master
        # but it seems sane out of paranoia
        mysql_lib.set_global_variable(destination_master, 'read_only', True)

        log.info('Confirming no writes to old master')
        mysql_failover.confirm_no_writes(source_master)

        log.info('Waiting for replicas to be caught up')
        wait_for_repl_sync(destination_master)

        log.info('Breaking replication from source to destination')
        mysql_lib.reset_slave(destination_master)

        log.info('Renaming migrated dbs on source master')
        fix_orphaned_shards.rename_db_to_drop(source_master,
                                              mig_databases,
                                              skip_check=True)

        log.info('Modifying shard mapping')
        migrate_shard_mapping(source_replica_set, destination_replica_set,
                              mig_databases)

        log.info('Dropping blackhole dbs on destination')
        for db in non_mig_databases:
            mysql_lib.drop_db(destination_master, db)

        start_shard_migration.finish_migration_log(
            mig_lock_identifier, start_shard_migration.STATUS_FINISHED)
    finally:
        # Every cleanup step is attempted even if an earlier one fails:
        # a failure while removing read_only must not leave the promotion
        # locks held, and a failure releasing one lock must not leak the
        # other. Any exception still propagates to the caller.
        try:
            if read_only_set:
                log.info('Remove read_only settings')
                try:
                    mysql_lib.set_global_variable(source_master,
                                                  'read_only', False)
                finally:
                    mysql_lib.set_global_variable(destination_master,
                                                  'read_only', False)
        finally:
            try:
                if dest_lock_identifier:
                    log.info('Releasing destination promotion lock')
                    mysql_failover.release_promotion_lock(
                        dest_lock_identifier)
            finally:
                if src_lock_identifier:
                    log.info('Releasing source promotion lock')
                    mysql_failover.release_promotion_lock(
                        src_lock_identifier)

    # Reaching this point means nothing above raised, so the migration
    # must have worked
    log.info('To drop migrated dbs on source, wait a bit and then run:')
    log.info('/usr/local/bin/mysql_utils/fix_orphaned_shards.py -a '
             'drop -i {}'.format(source_master))
Пример #7
0
def mysql_failover(master, dry_run, skip_lock,
                   ignore_dr_slave, trust_me_its_dead, kill_old_master):
    """ Promote a new MySQL master

    The slave of the replica set that master belongs to is promoted. If
    anything fails before the promotion is committed, the changes made to
    the old master are rolled back and the exception is reraised. After the
    promotion, zk is updated, read_only is removed from the new master and
    a change feed entry is posted (best effort). If the old master is
    considered dead, a replacement host is launched.

    Args:
    master - Hostaddr object of the master instance to be demoted
    dry_run - Do not change state, just do sanity testing and exit. The
              process exits via os._exit once the sanity checks pass.
    skip_lock - Do not take a promotion lock
    ignore_dr_slave - Ignore the existance of a dr_slave
    trust_me_its_dead - Do not test to see if the master is dead
    kill_old_master - Send a mysqladmin kill command to the old master

    Returns:
    None

    Raises:
    Exception - If the slave and dr_slave are the same instance, or if a
                zk connection can not be established while the master is
                dead. Errors from the sanity checks and promotion steps are
                reraised after the rollback; a final failure to write to zk
                is reraised as well.
    """
    log.info('Master to demote is {master}'.format(master=master))

    zk = host_utils.MysqlZookeeper()
    (replica_set, _) = zk.get_replica_set_from_instance(master, rtypes=['master'])
    log.info('Replica set is detected as '
             '{replica_set}'.format(replica_set=replica_set))

    # take a lock here to make sure nothing changes underneath us
    if not skip_lock and not dry_run:
        log.info('Taking promotion lock on replica set')
        lock_identifier = get_promotion_lock(replica_set)
    else:
        lock_identifier = None

    # Initialised before the try so the rollback can always consult it
    master_conn = False

    # giant try. If there any problems we roll back from the except
    try:
        slave = zk.get_mysql_instance_from_replica_set(replica_set=replica_set,
                                                       repl_type=host_utils.REPLICA_ROLE_SLAVE)
        log.info('Slave/new master is detected as {slave}'.format(slave=slave))

        if ignore_dr_slave:
            log.info('Intentionally ignoring a dr_slave')
            dr_slave = None
        else:
            dr_slave = zk.get_mysql_instance_from_replica_set(replica_set,
                                                              host_utils.REPLICA_ROLE_DR_SLAVE)
        log.info('DR slave is detected as {dr_slave}'.format(dr_slave=dr_slave))
        if dr_slave:
            if dr_slave == slave:
                raise Exception('Slave and dr_slave appear to be the same')

            replicas = set([slave, dr_slave])
        else:
            replicas = set([slave])

        # We use master_conn as a mysql connection to the master server, if
        # it is False, the master is dead
        if trust_me_its_dead:
            master_conn = None
        else:
            master_conn = is_master_alive(master, replicas)

        # Test to see if the slave is setup for replication. If not, we are hosed
        log.info('Testing to see if Slave/new master is setup to write '
                 'replication logs')
        mysql_lib.get_master_status(slave)

        if kill_old_master and not dry_run:
            log.info('Killing old master, we hope you know what you are doing')
            mysql_lib.shutdown_mysql(master)
            master_conn = None

        if master_conn:
            log.info('Master is considered alive')
            dead_master = False
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NORMAL,
                                    dead_master)
        else:
            log.info('Master is considered dead')
            dead_master = True
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                    dead_master)

        if dry_run:
            log.info('In dry_run mode, so exiting now')
            # Using os._exit in order to not get catch in the giant try
            os._exit(environment_specific.DRY_RUN_EXIT_CODE)

        log.info('Preliminary sanity checks complete, starting promotion')

        if master_conn:
            log.info('Setting read_only on master')
            mysql_lib.set_global_variable(master, 'read_only', True)
            log.info('Confirming no writes to old master')
            # If there are writes with the master in read_only mode then the
            # promotion can not proceed.
            # A likely reason is a client has the SUPER privilege.
            confirm_no_writes(master)
            log.info('Waiting for replicas to be caught up')
            confirm_max_replica_lag(replicas,
                                    mysql_lib.REPLICATION_TOLERANCE_NONE,
                                    dead_master,
                                    True,
                                    mysql_lib.NORMAL_HEARTBEAT_LAG)
            log.info('Setting up replication from old master ({master}) '
                     'to new master ({slave})'.format(master=master,
                                                      slave=slave))
            mysql_lib.setup_replication(new_master=slave, new_replica=master)
        else:
            log.info('Starting up a zk connection to make sure we can connect')
            kazoo_client = environment_specific.get_kazoo_client()
            if not kazoo_client:
                raise Exception('Could not conect to zk')

            log.info('Confirming replica has processed all replication '
                     ' logs')
            confirm_no_writes(slave)
            log.info('Looks like no writes being processed by replica via '
                     'replication or other means')
            if len(replicas) > 1:
                log.info('Confirming replica servers are synced')
                confirm_max_replica_lag(replicas,
                                        mysql_lib.REPLICATION_TOLERANCE_LOOSE,
                                        dead_master,
                                        True)
    except:
        # Bare except is deliberate: the rollback should run for any kind of
        # interruption, and the original exception is always reraised.
        log.info('Starting rollback')
        try:
            if master_conn:
                log.info('Releasing read_only on old master')
                mysql_lib.set_global_variable(master, 'read_only', False)

                log.info('Clearing replication settings on old master')
                mysql_lib.reset_slave(master)
        finally:
            # The promotion lock is released even if the rollback steps
            # above fail, otherwise the lock would be leaked.
            if lock_identifier:
                log.info('Releasing promotion lock')
                release_promotion_lock(lock_identifier)
        log.info('Rollback complete, reraising exception')
        raise

    if dr_slave:
        try:
            mysql_lib.setup_replication(new_master=slave, new_replica=dr_slave)
        except Exception as e:
            # Past the point of no return: a broken dr_slave does not stop
            # the promotion.
            log.error(e)
            log.error('Setting up replication on the dr_slave failed. '
                      'Failing forward!')

    log.info('Updating zk')
    zk_write_attempt = 0
    while True:
        try:
            modify_mysql_zk.swap_master_and_slave(slave, dry_run=False)
            break
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and
            # SystemExit propagate immediately instead of being retried.
            if zk_write_attempt > MAX_ZK_WRITE_ATTEMPTS:
                log.info('Final failure writing to zk, bailing')
                raise
            else:
                log.info('Write to zk failed, trying again')
                zk_write_attempt += 1

    log.info('Removing read_only from new master')
    mysql_lib.set_global_variable(slave, 'read_only', False)
    log.info('Removing replication configuration from new master')
    mysql_lib.reset_slave(slave)
    if lock_identifier:
        log.info('Releasing promotion lock')
        release_promotion_lock(lock_identifier)

    log.info('Failover complete')

    # we don't really care if this fails, but we'll print a message anyway.
    try:
        environment_specific.generic_json_post(
            environment_specific.CHANGE_FEED_URL,
            {'type': 'MySQL Failover',
             'environment': replica_set,
             'description': "Failover from {m} to {s}".format(m=master, s=slave),
             'author': host_utils.get_user(),
             'automation': False,
             'source': "mysql_failover.py on {}".format(host_utils.HOSTNAME)})
    except Exception as e:
        log.warning("Failover completed, but change feed "
                    "not updated: {}".format(e))

    if not master_conn:
        log.info('As master is dead, will try to launch a replacement. Will '
                 'sleep 20 seconds first to let things settle')
        time.sleep(20)
        launch_replacement_db_host.launch_replacement_db_host(master)