Example #1
def check_replication_for_migration(source_replica_set,
                                    destination_replica_set):
    """ Confirm that replication is sane for finishing a shard migration

    Args:
    source_replica_set - Where shards are coming from
    destination_replica_set - Where shards are being sent
    """
    zk = host_utils.MysqlZookeeper()
    source_master = zk.get_mysql_instance_from_replica_set(source_replica_set)
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)
    source_slave = zk.get_mysql_instance_from_replica_set(
        source_replica_set, host_utils.REPLICA_ROLE_SLAVE)
    destination_slave = zk.get_mysql_instance_from_replica_set(
        destination_replica_set, host_utils.REPLICA_ROLE_SLAVE)

    # First we will confirm that the slave of the source is caught up;
    # this is important for row count comparisons.
    mysql_lib.assert_replication_unlagged(
        source_slave, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # Next, the slave of the destination replica set for the same reason
    mysql_lib.assert_replication_unlagged(
        destination_slave, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # Next, the destination master is relatively caught up to the source master
    mysql_lib.assert_replication_unlagged(
        destination_master, mysql_lib.REPLICATION_TOLERANCE_NORMAL)

    # We will also verify that the source master is not replicating. A scary
    # scenario is if there is some sort of ring replication going on and db
    # drops of blackhole db's would propagate to the source db.
    try:
        source_slave_status = mysql_lib.get_slave_status(source_master)
    except mysql_lib.ReplicationError:
        source_slave_status = None

    if source_slave_status:
        raise Exception('Source master is set up for replication; '
                        'this is super dangerous!')

    # We will also verify that the destination master is replicating from the
    # source master
    slave_status = mysql_lib.get_slave_status(destination_master)
    master_of_destination_master = host_utils.HostAddr(':'.join(
        (slave_status['Master_Host'], str(slave_status['Master_Port']))))
    if source_master != master_of_destination_master:
        raise Exception('Master of destination {d} is {actual} rather than '
                        'expected {expected}'
                        ''.format(d=destination_master,
                                  actual=master_of_destination_master,
                                  expected=source_master))
    log.info('Replication looks ok for migration')
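
A minimal usage sketch (hypothetical, not part of the original source): the
function raises on any problem and simply returns on success, so a caller can
invoke it directly and treat an exception as "do not finish the migration".

# Illustrative wrapper only; check_replication_for_migration is assumed to be
# importable from the surrounding migration module.
def ensure_safe_to_finish(source_replica_set, destination_replica_set):
    try:
        check_replication_for_migration(source_replica_set,
                                        destination_replica_set)
    except Exception:
        log.exception('Not safe to finish migrating from {}'
                      ''.format(source_replica_set))
        raise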
Example #2
def clean_up_migration(source_replica_set):
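    """ Clean up after an aborted shard migration

    Resets replication on the destination master if it is configured, renames
    any orphaned shard dbs so they can be dropped, and marks the migration
    log entry as aborted.

    Args:
    source_replica_set - The replica set shards were being migrated from
    """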
    migration = start_shard_migration.check_migration_lock(source_replica_set)
    destination_replica_set = migration['destination_replica_set']
    mig_lock_identifier = migration['lock_identifier']
    zk = host_utils.MysqlZookeeper()
    destination_master = zk.get_mysql_instance_from_replica_set(
        destination_replica_set)

    try:
        mysql_lib.get_slave_status(destination_master)
        reset_repl = True
    except mysql_lib.ReplicationError:
        reset_repl = False

    if reset_repl:
        log.info('Taking promotion locks')
        dest_lock_identifier = mysql_failover.get_promotion_lock(
            destination_replica_set)
        log.info('Removing replication from destination master {}'
                 ''.format(destination_master))
        try:
            mysql_lib.reset_slave(destination_master)
        finally:
            mysql_failover.release_promotion_lock(dest_lock_identifier)

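    # Check the destination master for orphaned shard dbs left over from the
    # aborted migration.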
    (orphans_tmp, orphaned_but_used_tmp, _) = \
        find_shard_mismatches.find_shard_mismatches(destination_master)

    orphans = orphans_tmp.get(destination_master, [])
    orphaned_but_used = orphaned_but_used_tmp.get(destination_master, [])

    if orphaned_but_used:
        log.info('Orphaned but used dbs: {}'.format(
            ', '.join(orphaned_but_used)))
        raise Exception('Cowardly refusing to do anything')

    if orphans:
        log.info('Orphaned dbs: {}'.format(', '.join(orphans)))
        fix_orphaned_shards.rename_db_to_drop(destination_master, orphans)

    start_shard_migration.finish_migration_log(
        mig_lock_identifier, start_shard_migration.STATUS_ABORTED)
Example #3
def is_master_alive(master, replicas):
    """ Determine if the master is alive

    The function will:
    1. Attempt to connect to the master via the mysql protocol. If successful
       the master is considered alive.
    2. If #1 fails, check the io thread of the replica instance(s). If the io
       thread is not running, the master will be considered dead. If step #1
       fails and step #2 succeeds, we are in a weird state and will throw an
       exception.

    Args:
    master - A hostaddr object for the master instance
    replicas -  A set of hostaddr objects for the replica instances

    Returns:
    A mysql connection to the master if the master is alive, False otherwise.
    """
    if len(replicas) == 0:
        raise Exception('At least one replica must be present to determine '
                        'if a master is dead')
    try:
        master_conn = mysql_lib.connect_mysql(master)
        return master_conn
    except MySQLdb.OperationalError as detail:
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        master_conn = False
        log.info('Unable to connect to current master {master} from '
                 '{hostname}, will check replica servers before declaring '
                 'the master dead'.format(master=master,
                                          hostname=host_utils.HOSTNAME))
    except:
        log.info('This is an unknown connection error. If you are very sure '
                 'that the master is dead, please put a "return False" at the '
                 'top of is_master_alive and then send rwultsch a stack trace')
        raise

    # We can not get a connection to the master, so poll the replica servers
    for replica in replicas:
        conn = mysql_lib.connect_mysql(replica)
        # If replication has not hit a timeout, a dead master can still have
        # a replica which thinks it is ok. "STOP SLAVE; START SLAVE" followed
        # by a sleep will get us truthiness.
        mysql_lib.restart_replication(conn)
        ss = mysql_lib.get_slave_status(conn)
        if ss['Slave_IO_Running'] == 'Yes':
            raise Exception('Replica {replica} thinks it can connect to '
                            'master {master}, but failover script can not. '
                            'Possible network partition!'
                            ''.format(replica=replica,
                                      master=master))
        else:
            log.info('Replica {replica} also can not connect to master '
                     '{master}.'.format(replica=replica,
                                        master=master))
    return False
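
A short usage sketch (hypothetical, not from the original source) showing the
return-value contract: a live master yields a usable connection, a dead one
yields False, and an ambiguous state raises an exception.

# Illustrative only; is_master_alive is assumed to be importable from the
# surrounding failover module.
def decide_failover(master, replicas):
    master_conn = is_master_alive(master, replicas)
    if master_conn:
        log.info('Master {} is alive, no failover needed'.format(master))
        master_conn.close()
        return False
    log.info('Master {} appears dead, proceeding with failover'.format(master))
    return True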
Example #4
def is_master_alive(master, replicas):
    """ Determine if the master is alive

    The function will:
    1. Attempt to connect to the master via the mysql protocol. If successful
       the master is considered alive.
    2. If #1 fails, check the io thread of the replica instance(s). If the io
       thread is not running, the master will be considered dead. If step #1
       fails and step #2 succeeds, we are in a weird state and will throw an
       exception.

    Args:
    master - A hostaddr object for the master instance
    replicas -  A set of hostaddr objects for the replica instances

    Returns:
    A mysql connection to the master if the master is alive, False otherwise.
    """
    if len(replicas) == 0:
        raise Exception('At least one replica must be present to determine '
                        'if a master is dead')
    try:
        master_conn = mysql_lib.connect_mysql(master)
        return master_conn
    except MySQLdb.OperationalError as detail:
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        master_conn = False
        log.info('Unable to connect to current master {master} from '
                 '{hostname}, will check replica servers before declaring '
                 'the master dead'.format(master=master,
                                          hostname=host_utils.HOSTNAME))
    except:
        log.info('This is an unknown connection error. If you are very sure '
                 'that the master is dead, please put a "return False" at the '
                 'top of is_master_alive and then send rwultsch a stack trace')
        raise

    # We can not get a connection to the master, so poll the replica servers
    for replica in replicas:
        conn = mysql_lib.connect_mysql(replica)
        # If replication has not hit a timeout, a dead master can still have
        # a replica which thinks it is ok. "STOP SLAVE; START SLAVE" followed
        # by a sleep will get us truthiness.
        mysql_lib.restart_replication(conn)
        ss = mysql_lib.get_slave_status(conn)
        if ss['Slave_IO_Running'] == 'Yes':
            raise Exception('Replica {replica} thinks it can connect to '
                            'master {master}, but failover script can not. '
                            'Possible network partition!'
                            ''.format(replica=replica, master=master))
        else:
            log.info('Replica {replica} also can not connect to master '
                     '{master}.'.format(replica=replica, master=master))
    return False
Example #5
def confirm_replicas_in_sync(replicas):
    """ Confirm that all replicas are in sync in terms of replication

    Args:
    replicas - A set of hostaddr objects

    Returns:
    True if all replicas are at the same relay master log file and execution
    position, False otherwise.
    """
    replication_progress = set()
    for replica in replicas:
        slave_status = mysql_lib.get_slave_status(replica)
        replication_progress.add(':'.join((slave_status['Relay_Master_Log_File'],
                                           str(slave_status['Exec_Master_Log_Pos']))))

    return len(replication_progress) == 1
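
A minimal sketch (an assumption, not part of the original code) of how a
failover script might wait for replicas to converge using this check:

import time

# Illustrative polling loop; confirm_replicas_in_sync is assumed to be
# importable from the surrounding failover module.
def wait_for_replicas_in_sync(replicas, timeout_seconds=60):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if confirm_replicas_in_sync(replicas):
            return
        time.sleep(1)
    raise Exception('Replicas did not reach identical replication '
                    'coordinates within {}s'.format(timeout_seconds))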
Example #6
def confirm_replica_topology(master, replicas):
    """ Confirm that replica servers are actually replicating off of a master

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instances
    """
    for replica in replicas:
        conn = mysql_lib.connect_mysql(replica)
        ss = mysql_lib.get_slave_status(conn)
        repl_master = host_utils.HostAddr(':'.join(
            (ss['Master_Host'], str(ss['Master_Port']))))
        if repl_master != master:
            raise Exception('Slave {replica} is not a replica of master '
                            '{master}, but is instead a replica of '
                            '{repl_master}'.format(replica=replica,
                                                   repl_master=repl_master,
                                                   master=master))
        else:
            log.info('Replica {replica} is replicating from expected master '
                     'server {master}'.format(replica=replica, master=master))
Example #7
def confirm_replica_topology(master, replicas):
    """ Confirm that replica servers are actually replicating off of a master

    Args:
    master - A hostaddr object for the master instance
    replicas - A set of hostaddr objects for the replica instances
    """
    for replica in replicas:
        conn = mysql_lib.connect_mysql(replica)
        ss = mysql_lib.get_slave_status(conn)
        repl_master = host_utils.HostAddr(':'.join((ss['Master_Host'],
                                                    str(ss['Master_Port']))))
        if repl_master != master:
            raise Exception('Slave {replica} is not a replica of master '
                            '{master}, but is instead a replica of '
                            '{repl_master}'.format(replica=replica,
                                                   repl_master=repl_master,
                                                   master=master))
        else:
            log.info('Replica {replica} is replicating from expected master '
                     'server {master}'.format(replica=replica,
                                              master=master))
Example #8
def main():
    description = ("MySQL checksum wrapper\n\n"
                   "Wrapper of pt-table-checksum and pt-table-sync.\n"
                   "Defaults to checksumming 1/{k}th of databases on instance.\n"
                   "If diffs are found, use pt-table-sync to measure actual "
                   "divergence,\nbut only if the number of diffs is between "
                   "--min_diffs and --max_diffs.").format(k=DB_CHECK_FRACTION)

    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-i',
                        '--instance',
                        help='Instance to act on if other than localhost:3306',
                        default=''.join((socket.getfqdn(),
                                         ':3306')))
    parser.add_argument('-a',
                        '--all',
                        help='Checksums all dbs rather than the default',
                        action='store_true',
                        default=False)
    parser.add_argument('-d',
                        '--dbs',
                        help=("Comma separated list of db's to check rather "
                              "than the default"),
                        default=False)
    parser.add_argument('-q',
                        '--quiet',
                        help=("Do not print output to stdout"),
                        action='store_true',
                        default=False)
    parser.add_argument('-m',
                        '--min_diffs',
                        help=("Do per-row check if chunk diff count is at "
                              "least this value"),
                        dest='min_diffs',
                        default=MIN_DIFFS)
    parser.add_argument('-M',
                        '--max_diffs',
                        help=("Do not do per-row check if chunk diff count "
                              "is greater than this value"),
                        dest='max_diffs',
                        default=MAX_DIFFS)
    parser.add_argument('-C',
                        '--no_create_table',
                        help=("If test.checksum_detail is missing, do "
                              "not try to create it."),
                        dest='create_table',
                        action='store_false',
                        default=True)
    parser.add_argument('-v',
                        '--verbose',
                        help=("Store raw output from PT tools in the DB?"),
                        action='store_true',
                        default=False)
    parser.add_argument('-c',
                        '--check_fraction',
                        help=('Check this fraction of databases.'),
                        default=DB_CHECK_FRACTION)

    args = parser.parse_args()
    instance = host_utils.HostAddr(args.instance)
    zk = host_utils.MysqlZookeeper()

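    # Checksums are only run from the master of a replica set.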
    if instance not in \
            zk.get_all_mysql_instances_by_type(host_utils.REPLICA_ROLE_MASTER):
        raise Exception("Instance is not a master in ZK")

    # If enabled, try to create the table that holds the checksum info.
    # If not enabled, make sure that the table exists.
    conn = mysql_lib.connect_mysql(instance, 'scriptro')
    if not mysql_lib.does_table_exist(conn, mysql_lib.METADATA_DB, CHECKSUM_TBL):
        if args.create_table:
            create_checksum_detail_table(instance)
        else:
            raise Exception("Checksum table not found.  Unable to continue."
                            "Consider not using the -C option or create it "
                            "yourself.")

    # Determine what replica set we belong to and get a list of slaves.
    replica_set = zk.get_replica_set_from_instance(instance)[0]
    slaves = set()
    for rtype in host_utils.REPLICA_ROLE_SLAVE, host_utils.REPLICA_ROLE_DR_SLAVE:
        s = zk.get_mysql_instance_from_replica_set(replica_set, rtype)
        if s:
            slaves.add(s)

    if len(slaves) == 0:
        log.info("This server has no slaves.  Nothing to do.")
        sys.exit(0)

    # before we even start this, make sure replication is OK.
    for slave in slaves:
        slave_conn = mysql_lib.connect_mysql(slave, 'scriptrw')
        ss = mysql_lib.get_slave_status(slave_conn)
        if ss['Slave_SQL_Running'] != "Yes" or ss['Slave_IO_Running'] != "Yes":
            raise Exception("Replication is NOT RUNNING on slave {s}: "
                            "SQL: {st} | IO: {it}".format(st=ss['Slave_SQL_Running'],
                                                          it=ss['Slave_IO_Running']))

    if args.dbs:
        db_to_check = set(args.dbs.split(','))
    else:
        dbs = mysql_lib.get_dbs(conn)

        if args.all:
            db_to_check = dbs
        else:
            # default behaviour, check a given DB every N days based on
            # day of year.  minimizes month-boundary issues.
            db_to_check = set()
            check_modulus = int(time.strftime("%j")) % int(args.check_fraction)
            for counter, db in enumerate(dbs):
                if counter % int(args.check_fraction) == check_modulus:
                    db_to_check.add(db)

    # Iterate through the list of DBs and check one table at a time.
    # We do it this way to ensure more coverage in case pt-table-checksum
    # loses its DB connection and errors out before completing a full scan
    # of a given database.
    #
    for db in db_to_check:
        conn = mysql_lib.connect_mysql(instance, 'scriptro')
        tables_to_check = mysql_lib.get_tables(conn, db, skip_views=True)
        for tbl in tables_to_check:
            c_cmd, c_out, c_err, c_ret = checksum_tbl(instance, db, tbl)
            if not args.quiet:
                log.info("Checksum command executed was:\n{cmd}".format(cmd=c_cmd))
                log.info("Standard out:\n{out}".format(out=c_out))
                log.info("Standard error:\n{err}".format(err=c_err))
                log.info("Return code: {ret}".format(ret=c_ret))

            # parse each line of STDOUT (there should only be one with
            # actual data).  We only care about errors, rows, chunks, and
            # skipped, since we'll need to figure out diffs separately for
            # each slave box.
            for line in c_out.split("\n"):
                results = parse_checksum_row(line)
                if results:
                    chunk_errors = int(results[1])
                    row_count = int(results[3])
                    chunk_count = int(results[4])
                    chunk_skips = int(results[5])

                    for slave in slaves:
                        rows_checked = 'NO'
                        sync_cmd = ""
                        sync_out = ""
                        sync_err = ""
                        sync_ret = -1
                        row_diffs = 0

                        elapsed_time_ms, chunk_diffs = check_one_replica(
                            slave, db, tbl)

                        # if we skipped some chunks or there were errors,
                        # this means we can't have complete information about the
                        # state of the replica. in the case of a hard error,
                        # we'll just stop.  in the case of a skipped chunk, we will
                        # treat it as a different chunk for purposes of deciding
                        # whether or not to do a more detailed analysis.
                        #
                        checkable_chunks = chunk_skips + chunk_diffs

                        if chunk_errors > 0:
                            checksum_status = 'ERRORS_IN_CHECKSUM_PROCESS'
                        elif checkable_chunks == 0:
                            checksum_status = 'GOOD'
                        else:
                            if checkable_chunks > int(args.max_diffs):
                                # too many chunk diffs, don't bother checking
                                # further.  not good.
                                checksum_status = 'TOO_MANY_CHUNK_DIFFS'
                            elif checkable_chunks < int(args.min_diffs):
                                # some diffs, but not enough that we care.
                                checksum_status = 'CHUNK_DIFFS_FOUND_BUT_OK'
                            else:
                                start_time = int(time.time()*1000)
                                rows_checked = 'YES'

                                # set the proper status - did we do a sync-based check
                                # because of explicit diffs or because of skipped chunks?
                                if chunk_diffs > 0:
                                    checksum_status = 'ROW_DIFFS_FOUND'
                                else:
                                    checksum_status = 'CHUNKS_WERE_SKIPPED'

                                sync_cmd, sync_out, sync_err, sync_ret, \
                                    row_diffs = checksum_tbl_via_sync(slave,
                                                                      db,
                                                                      tbl)

                                # Add in the time it took to do the sync.
                                elapsed_time_ms += int(time.time()*1000) - start_time

                                if not args.quiet:
                                    log.info("Sync command executed was:\n{cmd} ".format(cmd=sync_cmd))
                                    log.info("Standard out:\n {out}".format(out=sync_out))
                                    log.info("Standard error:\n {err}".format(err=sync_err))
                                    log.info("Return code: {ret}".format(ret=sync_ret))
                                    log.info("Row diffs found: {cnt}".format(cnt=row_diffs))

                        # Checksum process is complete, store the results.
                        #
                        data = {'instance': slave,
                                'master_instance': instance,
                                'db': db,
                                'tbl': tbl,
                                'elapsed_time_ms': elapsed_time_ms,
                                'chunk_count': chunk_count,
                                'chunk_errors': chunk_errors,
                                'chunk_diffs': chunk_diffs,
                                'chunk_skips': chunk_skips,
                                'row_count': row_count,
                                'row_diffs': row_diffs,
                                'rows_checked': rows_checked,
                                'checksum_status': checksum_status,
                                'checksum_cmd': None,
                                'checksum_stdout': None,
                                'checksum_stderr': None,
                                'checksum_rc': c_ret,
                                'sync_cmd': None,
                                'sync_stdout': None,
                                'sync_stderr': None,
                                'sync_rc': sync_ret}

                        if args.verbose:
                            data.update({'checksum_cmd': c_cmd,
                                         'checksum_stdout': c_out,
                                         'checksum_stderr': c_err,
                                         'sync_cmd': sync_cmd,
                                         'sync_stdout': sync_out,
                                         'sync_stderr': sync_err,
                                         'sync_rc': sync_ret})

                        write_checksum_status(instance, data)

        conn.close()
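
The example ends with main(); a minimal sketch (an assumption, not part of the
original file) of how such a script is typically wired up, including the
module-level log used throughout:

import logging

log = logging.getLogger(__name__)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()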