def log_to_retirement_queue(hostname, instance_id, activity):
    """ Add a record to the retirement queue log.

    Args:
    hostname - The hostname of the server to be acted upon
    instance_id - The aws instance id
    activity - What is the state to log
    """
    conn = mysql_lib.get_mysqlops_connections()
    cur = conn.cursor()
    # REPLACE (rather than INSERT) so that restarting the process resets
    # the clock on the replacement.
    cur.execute('REPLACE INTO mysqlops.retirement_queue '
                'SET '
                'hostname = %(hostname)s ,'
                'instance_id = %(instance_id)s, '
                'activity = %(activity)s, '
                'happened = now() ',
                {'hostname': hostname,
                 'instance_id': instance_id,
                 'activity': activity})
    log.info(cur._executed)
    conn.commit()
def get_protected_hosts(return_type='tuple'):
    """ Get data on all protected hosts.

    Args:
    return_type - Options are:
                  'set'- return a set of protected hosts
                  'tuple' - returns all data regarding protected hosts

    Returns:
    A tuple which may be empty, with entries similar to:
    ({'protecting_user': '******', 'reason': 'because',
      'hostname': 'sharddb-14-4'}, ...)
    """
    if return_type not in ('tuple', 'set'):
        raise Exception('Unsupported return_type '
                        '{return_type}'.format(return_type=return_type))

    conn = mysql_lib.get_mysqlops_connections()
    cur = conn.cursor()
    cur.execute("SELECT * FROM mysqlops.retirement_protection")
    rows = cur.fetchall()

    if return_type == 'set':
        # rows appear to come from a dict-style cursor (keyed by column name)
        return set(row['hostname'] for row in rows)
    return rows
def protect_host(hostname, reason):
    """ Cause an host to not be acted on by the retirement queue.

    Args:
    hostname - The hostname to protect
    reason - An explanation for why this host should not be retired
    """
    protecting_user = host_utils.get_user()
    if protecting_user == 'root':
        raise Exception('Can not modify retirement protection as root')

    conn = mysql_lib.get_mysqlops_connections()
    cur = conn.cursor()
    cur.execute("INSERT INTO mysqlops.retirement_protection "
                "SET "
                "hostname = %(hostname)s, "
                "reason = %(reason)s, "
                "protecting_user = %(protecting_user)s",
                {'hostname': hostname,
                 'reason': reason,
                 'protecting_user': protecting_user})
    conn.commit()
    log.info(cur._executed)
def log_to_retirement_queue(hostname, instance_id, activity):
    """ Add a record to the retirement queue log.

    Args:
    hostname - The hostname of the server to be acted upon
    instance_id - The aws instance id
    activity - What is the state to log
    """
    reporting_conn = mysql_lib.get_mysqlops_connections()
    cursor = reporting_conn.cursor()
    # REPLACE is used deliberately: re-running the process for a host
    # restarts the clock on the replacement.
    statement = ('REPLACE INTO mysqlops.retirement_queue '
                 'SET '
                 'hostname = %(hostname)s ,'
                 'instance_id = %(instance_id)s, '
                 'activity = %(activity)s, '
                 'happened = now() ')
    bind_vars = {'hostname': hostname,
                 'instance_id': instance_id,
                 'activity': activity}
    cursor.execute(statement, bind_vars)
    log.info(cursor._executed)
    reporting_conn.commit()
def is_host_in_retirement_queue(hostname):
    """ Check whether a host has an entry in the retirement queue.

    Args:
    hostname - The hostname to look for

    Returns: True if the host is present in the queue, False otherwise
    """
    conn = mysql_lib.get_mysqlops_connections()
    cur = conn.cursor()
    cur.execute("SELECT hostname "
                "FROM mysqlops.retirement_queue "
                "WHERE hostname = %(hostname)s",
                {'hostname': hostname})
    return cur.rowcount > 0
def take_migration_lock(source_replica_set, destination_replica_set,
                        mig_dbs, non_mig_dbs):
    """ Take a migration lock to ensure no other migrations are run
        concurrently against the involved replica sets.

    Args:
    source_replica_set - Which replica set to take the shards from
    destination_replica_set - Which replica set to put the shards on
    mig_dbs - The names of the databases which map to the shards which are
              being migrated
    non_mig_dbs - The names of the databases which are created with blackhole
                  tables for replication to function.

    Returns:
    a lock identifier (uuid4 string)

    Raises:
    Exception - if either replica set already has an active migration lock
    """
    conn = mysql_lib.get_mysqlops_connections()
    cursor = conn.cursor()
    # Random identifier; callers use it to update/release the lock later.
    lock_identifier = str(uuid.uuid4())
    log.info('Migration lock identifier is {}'.format(lock_identifier))

    log.info('Checking existing locks')
    # A lock on either side of the migration blocks us.
    existing_lock = check_migration_lock(source_replica_set)
    if not existing_lock:
        existing_lock = check_migration_lock(destination_replica_set)
    if existing_lock:
        log.error('Lock is already held by {}'.format(existing_lock))
        log.error('You can abort this migration by running:')
        log.error('/usr/local/bin/mysql_utils/clean_up_unfinished_migration.py {}'
                  ''.format(existing_lock['source_replica_set']))
        raise Exception('Can not take migration lock')

    params = {'lock': lock_identifier,
              'source_replica_set': source_replica_set,
              'destination_replica_set': destination_replica_set,
              # db lists are flattened into comma-separated strings
              'mig_dbs': ', '.join(mig_dbs),
              'non_mig_dbs': ', '.join(non_mig_dbs),
              'status': STATUS_IMPORTING}
    # NOTE(review): stale TODO? The statement below already uses INSERT.
    # Original note: "turn on locking checking, swich to INSERT".
    sql = ("INSERT INTO mysqlops.mysql_migration_locks "
           "SET "
           "lock_identifier = %(lock)s, "
           "lock_active = 'active', "
           "created_at = NOW(), "
           "released = NULL, "
           "source_replica_set = %(source_replica_set)s, "
           "destination_replica_set = %(destination_replica_set)s, "
           "mig_databases = %(mig_dbs)s, "
           "non_mig_databases = %(non_mig_dbs)s, "
           "status = %(status)s ")
    cursor.execute(sql, params)
    conn.commit()
    log.info(cursor._executed)
    return lock_identifier
def get_retirement_queue_servers(next_state):
    """ Pull instances in queue ready for termination.

    Args:
    next_state - The desired next state of a server. Options are constants
                 SHUTDOWN_MYSQL and TERMINATE_INSTANCE.

    Returns:
    A dict of the same form as what is returned from the cmdbs,
    keyed by hostname.

    Raises:
    Exception - on an invalid next_state, or if the CMDB returns
                suspiciously few servers (safety check).
    """
    # Map the requested transition to the state the host must currently
    # be in (previous_state) for it to be eligible.
    if next_state == SHUTDOWN_MYSQL:
        server_state = {'previous_state': RESET_STATS,
                        'next_state': SHUTDOWN_MYSQL}
    elif next_state == TERMINATE_INSTANCE:
        server_state = {'previous_state': SHUTDOWN_MYSQL,
                        'next_state': TERMINATE_INSTANCE}
    else:
        raise Exception('Invalid state param '
                        '"{next_state}"'.format(next_state=next_state))

    reporting_conn = mysql_lib.get_mysqlops_connections()
    cursor = reporting_conn.cursor()
    # Anti-join: pick hosts that logged previous_state between 1 day and
    # 3 weeks ago, and have NOT yet logged next_state. The 1-day floor
    # acts as a cooling-off period; the 3-week ceiling ages entries out.
    sql = ("SELECT t1.hostname, t1.instance_id "
           "FROM ( "
           "    SELECT hostname, instance_id "
           "    FROM mysqlops.retirement_queue "
           "    WHERE activity = %(previous_state)s "
           "    AND happened > now() - INTERVAL 3 WEEK "
           "    AND happened < now() - INTERVAL 1 DAY) t1 "
           "LEFT JOIN mysqlops.retirement_queue t2 on t1.instance_id = t2.instance_id "
           "AND t2.activity=%(next_state)s "
           "WHERE t2.hostname IS NULL;")
    cursor.execute(sql, server_state)
    instances = cursor.fetchall()

    all_servers = environment_specific.get_all_server_metadata()
    # Guard against acting on a truncated/broken CMDB response.
    if len(all_servers) < MIN_CMDB_RESULTS:
        raise Exception('CMDB returned too few results')

    ret = dict()
    for instance in instances:
        if instance['hostname'] not in all_servers:
            # Host vanished from the CMDB; drop its queue entry.
            log.error('Something killed {instance}, cleaning up '
                      'retirement queue now'.format(instance=instance))
            remove_from_retirement_queue(instance['hostname'])
        elif instance['instance_id'] != all_servers[instance['hostname']]['instance_id']:
            # Same hostname, different AWS instance id - do not touch it.
            log.error('Possibly duplicate hostname for '
                      '{hostname}!'.format(hostname=instance['hostname']))
        else:
            ret[instance['hostname']] = all_servers[instance['hostname']]
    return ret
def unprotect_host(hostname):
    """ Cause an host to able to be acted on by the retirement queue.

    Args:
    hostname - The hostname to remove from protection
    """
    conn = mysql_lib.get_mysqlops_connections()
    cur = conn.cursor()
    cur.execute("DELETE FROM mysqlops.retirement_protection "
                "WHERE hostname = %(hostname)s",
                {'hostname': hostname})
    conn.commit()
    log.info(cur._executed)
def remove_from_retirement_queue(hostname):
    """ Remove an host from the retirement queue.

    Args:
    hostname - the hostname to remove from the queue
    """
    conn = mysql_lib.get_mysqlops_connections()
    cur = conn.cursor()
    cur.execute('DELETE FROM mysqlops.retirement_queue '
                'WHERE hostname = %(hostname)s',
                {'hostname': hostname})
    log.info(cur._executed)
    conn.commit()
def release_promotion_lock(lock_identifier):
    """ Release a promotion lock.

    Args:
    lock_identifier - The lock to release
    """
    conn = mysql_lib.get_mysqlops_connections()
    cursor = conn.cursor()
    params = {'lock_identifier': lock_identifier}
    # BUG FIX: the two assignments must be comma-separated. The previous
    # 'SET lock_active = NULL AND released = NOW()' was parsed by MySQL as
    # a single assignment lock_active = (NULL AND (released = NOW())),
    # so the released timestamp was never written.
    sql = ('UPDATE mysqlops.promotion_locks '
           'SET lock_active = NULL, released = NOW() '
           'WHERE lock_identifier = %(lock_identifier)s')
    cursor.execute(sql, params)
    conn.commit()
    log.info(cursor._executed)
def auto_add_instance_to_zk(port, dry_run):
    """ Try to do right thing in adding a server to zk.

    Args:
    port - The port of replacement instance on localhost
    dry_run - If set, do not modify zk
    """
    # NOTE(review): 'instance' is constructed but not used below; it is
    # kept because HostAddr() may validate the host:port string - confirm.
    instance = host_utils.HostAddr(':'.join([host_utils.HOSTNAME, str(port)]))
    try:
        conn = mysql_lib.get_mysqlops_connections()
        log.info('Determining replacement for port {}'.format(port))
        instance_id = host_utils.get_local_instance_id()
        role = determine_replacement_role(conn, instance_id)
        log.info('Adding server as role: {role}'.format(role=role))
    # Fixed deprecated 'except Exception, e' syntax (removed in Python 3;
    # 'as' form works on Python 2.6+).
    except Exception as e:
        log.exception(e)
        raise
def get_promotion_lock(replica_set):
    """ Take a promotion lock.

    Args:
    replica_set - The replica set to take the lock against

    Returns:
    A unique identifer for the lock
    """
    lock_identifier = str(uuid.uuid4())
    log.info('Promotion lock identifier is '
             '{lock_identifier}'.format(lock_identifier=lock_identifier))

    conn = mysql_lib.get_mysqlops_connections()
    cursor = conn.cursor()

    log.info('Releasing any expired locks')
    release_expired_promotion_locks(conn)

    log.info('Checking existing locks')
    check_promotion_lock(conn, replica_set)

    log.info('Taking lock against replica set: '
             '{replica_set}'.format(replica_set=replica_set))
    # Lock self-expires after 12 hours if never explicitly released.
    cursor.execute("INSERT INTO mysqlops.promotion_locks "
                   "SET "
                   "lock_identifier = %(lock)s, "
                   "lock_active = 'active', "
                   "created_at = NOW(), "
                   "expires = NOW() + INTERVAL 12 HOUR, "
                   "released = NULL, "
                   "replica_set = %(replica_set)s, "
                   "promoting_host = %(localhost)s, "
                   "promoting_user = %(user)s ",
                   {'lock': lock_identifier,
                    'localhost': host_utils.HOSTNAME,
                    'replica_set': replica_set,
                    'user': host_utils.get_user()})
    conn.commit()
    log.info(cursor._executed)
    return lock_identifier
def get_promotion_lock(replica_set):
    """ Take a promotion lock.

    Args:
    replica_set - The replica set to take the lock against

    Returns:
    A unique identifer for the lock
    """
    identifier = str(uuid.uuid4())
    log.info('Promotion lock identifier is '
             '{lock_identifier}'.format(lock_identifier=identifier))

    conn = mysql_lib.get_mysqlops_connections()
    log.info('Releasing any expired locks')
    release_expired_promotion_locks(conn)
    log.info('Checking existing locks')
    check_promotion_lock(conn, replica_set)
    log.info('Taking lock against replica set: '
             '{replica_set}'.format(replica_set=replica_set))

    insert_args = {'lock': identifier,
                   'localhost': host_utils.HOSTNAME,
                   'replica_set': replica_set,
                   'user': host_utils.get_user()}
    # Lock automatically expires 12 hours after creation.
    insert_sql = ("INSERT INTO mysqlops.promotion_locks "
                  "SET "
                  "lock_identifier = %(lock)s, "
                  "lock_active = 'active', "
                  "created_at = NOW(), "
                  "expires = NOW() + INTERVAL 12 HOUR, "
                  "released = NULL, "
                  "replica_set = %(replica_set)s, "
                  "promoting_host = %(localhost)s, "
                  "promoting_user = %(user)s ")
    cursor = conn.cursor()
    cursor.execute(insert_sql, insert_args)
    conn.commit()
    log.info(cursor._executed)
    return identifier
def auto_add_instance_to_zk(instance, dry_run):
    """ Try to do right thing in adding a server to zk.

    Args:
    instance - The replacement instance
    dry_run - If set, do not modify zk
    """
    try:
        conn = mysql_lib.get_mysqlops_connections()
        log.info('Determining replacement for '
                 '{hostname}'.format(hostname=instance.hostname))
        server_metadata = environment_specific.get_server_metadata(instance.hostname)
        if not server_metadata:
            raise Exception('CMDB lacks knowledge of replacement host')
        instance_id = server_metadata['id']
        role = determine_replacement_role(conn, instance_id)
        log.info('Adding server as role: {role}'.format(role=role))
    # Fixed deprecated 'except Exception, e' syntax (removed in Python 3;
    # 'as' form works on Python 2.6+).
    except Exception as e:
        log.exception(e)
        raise
def update_migration_status(lock_identifier, status):
    """ Update the migration lock table.

    Args:
    lock_identifier - a lock id as returned by take_migration_lock
    status - The new status
    """
    conn = mysql_lib.get_mysqlops_connections()
    # Fixed: the original created a cursor twice and discarded the first.
    cursor = conn.cursor()
    params = {'lock': lock_identifier,
              'status': status}
    sql = ("UPDATE mysqlops.mysql_migration_locks "
           "SET "
           "status = %(status)s "
           "WHERE "
           "lock_identifier = %(lock)s ")
    cursor.execute(sql, params)
    conn.commit()
    log.info(cursor._executed)
def auto_add_instance_to_zk(instance, dry_run):
    """ Try to do right thing in adding a server to zk.

    Args:
    instance - The replacement instance
    dry_run - If set, do not modify zk
    """
    try:
        conn = mysql_lib.get_mysqlops_connections()
        log.info('Determining replacement for '
                 '{hostname}'.format(hostname=instance.hostname))
        server_metadata = environment_specific.get_server_metadata(
            instance.hostname)
        if not server_metadata:
            raise Exception('CMDB lacks knowledge of replacement host')
        instance_id = server_metadata['id']
        role = determine_replacement_role(conn, instance_id)
        log.info('Adding server as role: {role}'.format(role=role))
    # Fixed deprecated 'except Exception, e' syntax (removed in Python 3;
    # 'as' form works on Python 2.6+).
    except Exception as e:
        log.exception(e)
        raise
def protect_host(hostname, reason):
    """ Cause an host to not be acted on by the retirement queue.

    Args:
    hostname - The hostname to protect
    reason - An explanation for why this host should not be retired
    """
    # Refuse to record protection entries attributed to root.
    protecting_user = host_utils.get_user()
    if protecting_user == 'root':
        raise Exception('Can not modify retirement protection as root')

    reporting_conn = mysql_lib.get_mysqlops_connections()
    cursor = reporting_conn.cursor()
    statement = ("INSERT INTO mysqlops.retirement_protection "
                 "SET "
                 "hostname = %(hostname)s, "
                 "reason = %(reason)s, "
                 "protecting_user = %(protecting_user)s")
    bind_vars = {'hostname': hostname,
                 'reason': reason,
                 'protecting_user': protecting_user}
    cursor.execute(statement, bind_vars)
    reporting_conn.commit()
    log.info(cursor._executed)
def check_migration_lock(replica_set):
    """ Confirm there are no active locks that would block taking a
        migration lock.

    Args:
    replica_set - A name of a replica set

    Returns:
    The row describing the active lock involving replica_set, or None
    if there is no such lock.
    """
    conn = mysql_lib.get_mysqlops_connections()
    cursor = conn.cursor()
    # A lock blocks us whether the replica set is the source or the
    # destination of the in-flight migration.
    cursor.execute('SELECT lock_identifier, '
                   ' source_replica_set, '
                   ' destination_replica_set, '
                   ' mig_databases, '
                   ' non_mig_databases, '
                   ' status '
                   'FROM mysqlops.mysql_migration_locks '
                   "WHERE lock_active = 'active' AND "
                   "( source_replica_set = %(replica_set)s OR"
                   " destination_replica_set = %(replica_set)s )",
                   {'replica_set': replica_set})
    log.info(cursor._executed)
    return cursor.fetchone()
def xtrabackup_backup_instance(instance):
    """ Run a file based backup on a supplied local instance.

    Args:
    instance - A hostaddr object

    Flow: best-effort log a 'started' row to the central reporting DB,
    take a flock to serialize backups on this host, run xtrabackup,
    upload to s3, then best-effort update the reporting row.
    """
    starttime_sql = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info('Logging initial status to mysqlops')
    row_id = None
    lock_handle = None
    try:
        reporting_conn = mysql_lib.get_mysqlops_connections()
        cursor = reporting_conn.cursor()
        sql = ("INSERT INTO mysqlops.mysql_backups "
               "SET "
               "hostname = %(hostname)s, "
               "port = %(port)s, "
               "started = %(started)s, "
               "backup_type = 'xbstream' ")
        metadata = {'hostname': instance.hostname,
                    'port': instance.port,
                    'started': starttime_sql}
        cursor.execute(sql, metadata)
        # Remember the row id so the final UPDATE can find this entry.
        row_id = cursor.lastrowid
        reporting_conn.commit()
    except Exception as e:
        # Deliberately best-effort: a reporting-DB outage must not block
        # the backup itself.
        log.warning("Unable to write log entry to "
                    "mysqlopsdb001: {e}".format(e=e))
        log.warning("However, we will attempt to continue with the backup.")

    # Take a lock to prevent multiple backups from running concurrently
    try:
        log.info('Taking backup lock')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        log.info('Cleaning up old backups')
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)

        # Actually run the backup
        log.info('Running backup')
        backup_file = backup.xtrabackup_instance(instance)
        finished = time.strftime('%Y-%m-%d %H:%M:%S')

        # Upload file to s3
        log.info('Uploading file to s3')
        backup.s3_upload(backup_file)

        # Update database with additional info now that backup is done.
        if row_id is None:
            log.info("The backup is complete, but we were not able to "
                     "write to the central log DB.")
        else:
            log.info("Updating database log entry with final backup info")
            try:
                sql = ("UPDATE mysqlops.mysql_backups "
                       "SET "
                       "filename = %(filename)s, "
                       "finished = %(finished)s, "
                       "size = %(size)s "
                       "WHERE id = %(id)s")
                metadata = {'filename': backup_file,
                            'finished': finished,
                            'size': os.stat(backup_file).st_size,
                            'id': row_id}
                cursor.execute(sql, metadata)
                reporting_conn.commit()
                reporting_conn.close()
            except Exception as e:
                # Again best-effort; the backup itself already succeeded.
                log.warning("Unable to update mysqlopsdb with "
                            "backup status: {e}".format(e=e))

        # Running purge again mostly for the chmod
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)
    finally:
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
def launch_amazon_mysql_server(hostname, instance_type, vpc_security_group,
                               availability_zone, ssh_group,
                               mysql_major_version, mysql_minor_version,
                               os_flavor, dry_run, skip_name_check=False):
    """ Launch a mysql server in aws.

    Args:
    hostname - hostname of new server
    instance_type - hardware type
    vpc_security_group - VPC firewall rules.
    availability_zone - AWS availability zone
    ssh_group - What IAM/SSH zone to use
    mysql_major_version - MySQL major version. Example 5.5 or 5.6
    mysql_minor_version - Which "branch" to use. Values are 'stable',
                          'staging' and 'latest'.
    os_flavor - Which OS to target - 'precise' or 'trusty' at the moment
    dry_run - Do not actually launch a host, just show the expected config.
    skip_name_check - Do not check if a hostname has already been used or
                      log usage. The assumption is the caller has already
                      done this

    Returns:
    An amazon instance id.
    """
    # Echo every requested argument for auditability.
    args, _, _, values = inspect.getargvalues(inspect.currentframe())
    for param in args:
        log.info("Requested {param} = {value}".format(param=param,
                                                      value=values[param]))

    # Only specific security roles are allowed to provision hardware.
    if host_utils.get_security_role() not in environment_specific.ROLE_TO_LAUNCH_INSTANCE:
        raise Exception(environment_specific.ROLE_ERROR_MSG)

    config = {'key_name': environment_specific.PEM_KEY,
              'placement': availability_zone,
              'instance_profile_name': environment_specific.INSTANCE_PROFILE_NAME,
              'image_id': environment_specific.SUPPORTED_HARDWARE[instance_type]['ami'][os_flavor],
              'instance_type': instance_type}

    (subnet_name, config['subnet_id']) = get_subnet_from_sg(vpc_security_group,
                                                            availability_zone)

    # Default ssh/iam settings come from the subnet's security map.
    ssh_security = environment_specific.SSH_SECURITY_MAP[subnet_name]['ssh']
    config['instance_profile_name'] = environment_specific.SSH_SECURITY_MAP[subnet_name]['iam']
    config['security_group_ids'] = [environment_specific.VPC_SECURITY_GROUPS[vpc_security_group]]

    if ssh_group:
        # NOTE(review): this compares ssh zone names lexicographically
        # (string >=) to enforce "at least as strict" - confirm the naming
        # scheme makes that ordering meaningful.
        if ssh_group >= ssh_security and ssh_group in environment_specific.SSH_IAM_MAPPING.keys():
            ssh_security = ssh_group
            config['instance_profile_name'] = environment_specific.SSH_IAM_MAPPING[ssh_group]
        else:
            raise Exception("We are not allowed to provision a host in {0} env "
                            "with a weaker access policy than {1} it's existing or default "
                            "config".format(ssh_group, ssh_security))

    hiera_config = environment_specific.HIERA_FORMAT.format(
        ssh_security=ssh_security,
        mysql_major_version=mysql_major_version.replace('.', ''),
        mysql_minor_version=mysql_minor_version)
    if hiera_config not in environment_specific.SUPPORTED_HIERA_CONFIGS:
        raise Exception('Hiera config {hiera_config} is not supported.'
                        'Supported configs are: {supported}'
                        ''.format(hiera_config=hiera_config,
                                  supported=environment_specific.SUPPORTED_HIERA_CONFIGS))

    # cloud-init user data: puppet role plus raid setup for the new host.
    config['user_data'] = ('#cloud-config\n'
                           'pinfo_team: {pinfo_team}\n'
                           'pinfo_env: {pinfo_env}\n'
                           'pinfo_role: {hiera_config}\n'
                           'hostname: {hostname}\n'
                           'raid: true\n'
                           'raid_fs: xfs\n'
                           'raid_mount: {raid_mount}'
                           ''.format(pinfo_team=environment_specific.PINFO_TEAM,
                                     pinfo_env=environment_specific.PINFO_ENV,
                                     raid_mount=environment_specific.RAID_MOUNT,
                                     hiera_config=hiera_config,
                                     hostname=hostname))

    log.info('Config for new server:\n{config}'.format(config=config))
    conn = mysql_lib.get_mysqlops_connections()
    if not skip_name_check and not launch_replacement_db_host.is_hostname_new(hostname, conn):
        raise Exception('Hostname {hostname} has already been used!'
                        ''.format(hostname=hostname))
    if dry_run:
        log.info('In dry run mode, returning now')
        return
    else:
        conn = boto.ec2.connect_to_region(environment_specific.EC2_REGION)
        instance_id = conn.run_instances(**config).instances[0].id
        log.info('Launched instance {id}'.format(id=instance_id))
        return instance_id
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server.

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'instance_type', and
                'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and reason is not supplied an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set is {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master in zk, we refuse to run.
        # No ifs, ands, or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                     original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            # Stale replacement requests (older than the build timeout)
            # are assumed failed and are implicitly replaced again.
            age_of_replacement = datetime.datetime.now() - existing_replacement['created_at']
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info("A replacement already exists, but was launched "
                         "{days} days ago. The timeout for servers builds is "
                         "{timeout} days so we are automatically setting "
                         "replace_again.".format(days=age_of_replacement.days,
                                                 timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
        version_server = original_server
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')
        # Can't read the version from the dead host; use the replica set
        # master instead.
        version_server = zk.get_mysql_instance_from_replica_set(
            original_server.get_zk_replica_set()[0],
            repl_type=host_utils.REPLICA_ROLE_MASTER)

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))
    replacement_config = {'availability_zone': cmdb_data['location'],
                          'vpc_security_group': cmdb_data['security_groups'],
                          'hostname': find_unused_server_name(
                              original_server.get_standardized_replica_set(),
                              reporting_conn, dry_run),
                          'instance_type': cmdb_data['config.instance_type'],
                          'mysql_major_version':
                              mysql_lib.get_global_variables(version_server)['version'][0:3],
                          'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
                          'dry_run': dry_run,
                          'skip_name_check': True}

    # At this point, all our defaults should be good to go
    config_overridden = False

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    if not dead_server:
        try:
            mysql_lib.assert_replication_sanity(original_server)
        except Exception as e:
            log.info('Replication problem: {e}'.format(e=e))
            reasons.add('replication broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception(('MySQL appears to be up and no reason for '
                         'replacement is supplied. You can specify a reason '
                         'with the --reason argument'))
    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)

    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server.

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'classic_security_group',
                'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and reason is not supplied an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set is {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master in zk, we refuse to run.
        # No ifs, ands, or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                     original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            # Replacement requests older than the build timeout are assumed
            # to have failed; implicitly allow replacing again.
            age_of_replacement = datetime.datetime.now() - existing_replacement['created_at']
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info("A replacement already exists, but was launched "
                         "{days} ago. The timeout for servers builds is "
                         "{timeout} so we are automatically setting "
                         "replace_again.".format(days=age_of_replacement.days,
                                                 timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))

    replacement_config = {'availability_zone': cmdb_data['location'],
                          'hostname': find_unused_server_name(
                              original_server.get_standardized_replica_set(),
                              reporting_conn, dry_run),
                          'instance_type': cmdb_data['config.instance_type'],
                          'mysql_major_version':
                              get_master_mysql_major_version(original_server),
                          'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
                          'dry_run': dry_run,
                          'skip_name_check': True}

    if cmdb_data.pop('cloud.aws.vpc_id', None):
        # Existing server is in VPC
        replacement_config['classic_security_group'] = None
        replacement_config['vpc_security_group'] = cmdb_data['security_groups']
    else:
        # Existing server is in Classic
        replacement_config['classic_security_group'] = cmdb_data['security_groups']
        replacement_config['vpc_security_group'] = None

    # At this point, all our defaults should be good to go
    config_overridden = False
    # BUG FIX: use .get() - subscripting raised KeyError whenever the
    # caller did not pass a 'vpc_security_group' override (including the
    # default empty overrides dict) for a server living in Classic.
    if replacement_config['classic_security_group'] and \
            overrides.get('vpc_security_group'):
        # a VPC migration
        vpc_migration(replacement_config, overrides)
        reasons.add('vpc migration')
        config_overridden = True

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                # BUG FIX: record the reason BEFORE overwriting the config
                # value and use the {new} placeholder; previously the
                # message read "changing key from NEW to NEW" because the
                # format string used {old} twice and the config had already
                # been overwritten.
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')

    if not dead_server:
        slave_status = mysql_lib.calc_slave_lag(original_server)

        if slave_status['ss']['Slave_SQL_Running'] != 'Yes':
            reasons.add('sql replication thread broken')

        if slave_status['ss']['Slave_IO_Running'] != 'Yes':
            reasons.add('io replication thread broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        # BUG FIX: added the missing space before 'with' - the message
        # previously rendered as "...a reasonwith the --reason argument".
        raise Exception(('MySQL appears to be up and no reason for '
                         'replacement is supplied. You can specify a reason '
                         'with the --reason argument'))
    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)

    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
def launch_amazon_mysql_server(hostname, instance_type, vpc_security_group,
                               classic_security_group, availability_zone,
                               mysql_major_version, mysql_minor_version,
                               dry_run, skip_name_check=False):
    """ Launch a mysql server in aws

    Args:
    hostname - hostname of new server
    instance_type - hardware type
    vpc_security_group - VPC firewall rules. This or classic_security_group
                         must be supplied, but not both.
    classic_security_group - AWS classic firewall rules. See
                             vpc_security_group
    availability_zone - AWS availability zone
    mysql_major_version - MySQL major version. Example 5.5 or 5.6
    mysql_minor_version - Which "branch" to use. Values are 'stable',
                          'staging' and 'latest'.
    dry_run - Do not actually launch a host, just show the expected config.
    skip_name_check - Do not check if a hostname has already been used or
                      log usage. The assumption is the caller has already
                      done this

    Returns:
    An amazon instance id.
    """
    # Log every requested argument for auditability.
    args, _, _, values = inspect.getargvalues(inspect.currentframe())
    for param in args:
        log.info("Requested {param} = {value}".format(param=param,
                                                      value=values[param]))

    config = {'key_name': environment_specific.PEM_KEY,
              'placement': availability_zone,
              'instance_profile_name': environment_specific.INSTANCE_PROFILE_NAME,
              'image_id': environment_specific.SUPPORTED_HARDWARE[instance_type]['ami'],
              'instance_type': instance_type}

    if vpc_security_group and not classic_security_group:
        # VPC launch: subnet, ssh security tier and IAM profile are all
        # derived from the security group + availability zone.
        (subnet_name, config['subnet_id']) = \
            get_subnet_from_sg(vpc_security_group, availability_zone)
        ssh_security = environment_specific.SSH_SECURITY_MAP[subnet_name]['ssh']
        config['instance_profile_name'] = \
            environment_specific.SSH_SECURITY_MAP[subnet_name]['iam']
        config['security_group_ids'] = \
            [environment_specific.VPC_SECURITY_GROUPS[vpc_security_group]]
    elif classic_security_group and not vpc_security_group:
        config['security_groups'] = [classic_security_group]
        if classic_security_group in environment_specific.CLASSIC_SECURE_SG:
            ssh_security = environment_specific.SSH_SECURITY_SECURE
        else:
            ssh_security = environment_specific.SSH_SECURITY_DEV
        config['instance_profile_name'] = \
            environment_specific.INSTANCE_PROFILE_NAME
    else:
        raise Exception('One and only one of vpc_security_group and '
                        'classic_security_group must be specified. Received:\n'
                        'vpc_security_group: {vpc}, \n'
                        'classic_security_group: {classic_security_group}'
                        ''.format(
                            vpc=vpc_security_group,
                            classic_security_group=classic_security_group))

    hiera_config = environment_specific.HIERA_FORMAT.format(
        ssh_security=ssh_security,
        mysql_major_version=mysql_major_version.replace('.', ''),
        mysql_minor_version=mysql_minor_version)
    if hiera_config not in environment_specific.SUPPORTED_HIERA_CONFIGS:
        # BUG FIX: the two message fragments were previously concatenated
        # without a separator, producing "...supported.Supported configs...".
        raise Exception(
            'Hiera config {hiera_config} is not supported. '
            'Supported configs are: {supported}'
            ''.format(hiera_config=hiera_config,
                      supported=environment_specific.SUPPORTED_HIERA_CONFIGS))

    # cloud-config user data consumed at first boot (hostname, puppet role,
    # raid setup).
    config['user_data'] = ('#cloud-config\n'
                           'pinfo_team: {pinfo_team}\n'
                           'pinfo_env: {pinfo_env}\n'
                           'pinfo_role: {hiera_config}\n'
                           'hostname: {hostname}\n'
                           'raid: true\n'
                           'raid_fs: xfs\n'
                           'raid_mount: {raid_mount}'
                           ''.format(
                               pinfo_team=environment_specific.PINFO_TEAM,
                               pinfo_env=environment_specific.PINFO_ENV,
                               raid_mount=environment_specific.RAID_MOUNT,
                               hiera_config=hiera_config,
                               hostname=hostname))

    log.info('Config for new server:\n{config}'.format(config=config))
    conn = mysql_lib.get_mysqlops_connections()
    if not skip_name_check and not launch_replacement_db_host.is_hostname_new(
            hostname, conn):
        raise Exception('Hostname {hostname} has already been used!'
                        ''.format(hostname=hostname))
    if dry_run:
        log.info('In dry run mode, returning now')
        return
    else:
        conn = boto.ec2.connect_to_region(environment_specific.EC2_REGION)
        instance_id = conn.run_instances(**config).instances[0].id
        log.info('Launched instance {id}'.format(id=instance_id))
        return instance_id
def xtrabackup_backup_instance(instance):
    """ Run a file based backup on a supplied local instance

    Args:
    instance - A hostaddr object
    """
    starttime_sql = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info('Logging initial status to mysqlops')
    # row_id stays None if the initial log insert fails; the final
    # UPDATE below is skipped in that case.
    row_id = None
    lock_handle = None

    # Best-effort: record the backup start in the central reporting DB.
    # A failure here is logged but does not abort the backup itself.
    try:
        reporting_conn = mysql_lib.get_mysqlops_connections()
        cursor = reporting_conn.cursor()
        sql = ("INSERT INTO mysqlops.mysql_backups "
               "SET "
               "hostname = %(hostname)s, "
               "port = %(port)s, "
               "started = %(started)s, "
               "backup_type = 'xbstream' ")
        metadata = {'hostname': instance.hostname,
                    'port': instance.port,
                    'started': starttime_sql}
        cursor.execute(sql, metadata)
        row_id = cursor.lastrowid
        reporting_conn.commit()
    except Exception as e:
        log.warning("Unable to write log entry to "
                    "mysqlopsdb001: {e}".format(e=e))
        log.warning("However, we will attempt to continue with the backup.")

    # Take a lock to prevent multiple backups from running concurrently
    try:
        log.info('Taking backup lock')
        lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

        log.info('Cleaning up old backups')
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)

        # Actually run the backup
        log.info('Running backup')
        backup_file = backup.xtrabackup_instance(instance)
        finished = time.strftime('%Y-%m-%d %H:%M:%S')

        # Upload file to s3
        log.info('Uploading file to s3')
        backup.s3_upload(backup_file)

        # Update database with additional info now that backup is done.
        # Again best-effort: a reporting failure must not fail the backup.
        if row_id is None:
            log.info("The backup is complete, but we were not able to "
                     "write to the central log DB.")
        else:
            log.info("Updating database log entry with final backup info")
            try:
                sql = ("UPDATE mysqlops.mysql_backups "
                       "SET "
                       "filename = %(filename)s, "
                       "finished = %(finished)s, "
                       "size = %(size)s "
                       "WHERE id = %(id)s")
                metadata = {'filename': backup_file,
                            'finished': finished,
                            'size': os.stat(backup_file).st_size,
                            'id': row_id}
                cursor.execute(sql, metadata)
                reporting_conn.commit()
                reporting_conn.close()
            except Exception as e:
                log.warning("Unable to update mysqlopsdb with "
                            "backup status: {e}".format(e=e))

        # Running purge again, mostly for the chmod
        purge_mysql_backups.purge_mysql_backups(instance, skip_lock=True)
    finally:
        # Always release the flock, even if the backup or upload raised.
        if lock_handle:
            log.info('Releasing lock')
            host_utils.release_flock_lock(lock_handle)
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=None,
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
    original_server - A hostAddr object for the server to be replaced
    dry_run - If True, do not actually launch a replacement
    not_a_replacement - If set, don't log the replacement, therefore
                        automation won't put it into prod use.
    overrides - A dict of overrides. Available keys are
                'mysql_minor_version', 'hostname', 'vpc_security_group',
                'availability_zone', 'classic_security_group',
                'instance_type', and 'mysql_major_version'.
    reason - A description of why the host is being replaced. If the
             instance is still accessible and reason is not supplied an
             exception will be thrown.
    replace_again - If True, ignore already existing replacements.
    """
    # BUG FIX: the default used to be a shared mutable dict(); use a None
    # sentinel instead so calls can never leak state into each other.
    if overrides is None:
        overrides = dict()

    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set is {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we refuse to run. No ifs, ands,
        # or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                     original_server)
    if existing_replacement and not not_a_replacement:
        if replace_again:
            log.info('A replacement has already been requested: '
                     '{new_host}'.format(new_host=existing_replacement))
        else:
            raise Exception('A replacement already exists, but '
                            'replace_again is not True')

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))

    replacement_config = {'availability_zone': cmdb_data['location'],
                          'hostname': find_unused_server_name(
                              original_server.get_standardized_replica_set(),
                              reporting_conn, dry_run),
                          'instance_type': cmdb_data['config.instance_type'],
                          'mysql_major_version':
                              get_master_mysql_major_version(original_server),
                          'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
                          'dry_run': dry_run,
                          'skip_name_check': True}

    if cmdb_data.pop('cloud.aws.vpc_id', None):
        # Existing server is in VPC
        replacement_config['classic_security_group'] = None
        replacement_config['vpc_security_group'] = cmdb_data['security_groups']
    else:
        # Existing server is in Classic
        replacement_config['classic_security_group'] = \
            cmdb_data['security_groups']
        replacement_config['vpc_security_group'] = None

    # At this point, all our defaults should be good to go
    config_overridden = False

    # BUG FIX: use .get() here. overrides['vpc_security_group'] raised a
    # KeyError whenever the caller did not supply that key, which is the
    # common case.
    if (replacement_config['classic_security_group'] and
            overrides.get('vpc_security_group')):
        # a VPC migration
        vpc_migration(replacement_config, overrides)
        reasons.add('vpc migration')
        config_overridden = True

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                # BUG FIX: capture the old value before overwriting it,
                # and substitute {new} in the reason string (it previously
                # said 'from {old} to {old}' with the already-updated
                # value in both slots).
                old_value = replacement_config[key]
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=old_value,
                                   new=overrides[key]))
                replacement_config[key] = overrides[key]
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=old_value,
                                           new=overrides[key]))
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    # Check to see if MySQL is up on the host
    try:
        # This is not multi instance compatible. If we move to multiple
        # instances this will need to be updated
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')

    if not dead_server:
        slave_status = mysql_lib.calc_slave_lag(original_server)
        if slave_status['ss']['Slave_SQL_Running'] != 'Yes':
            reasons.add('sql replication thread broken')
        if slave_status['ss']['Slave_IO_Running'] != 'Yes':
            reasons.add('io replication thread broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception(('MySQL appears to be up and no reason for '
                         'replacement is supplied'))
    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn, cmdb_data, new_instance_id,
                             replace_again, replacement_config, reason)
def launch_amazon_mysql_server(hostname, instance_type, vpc_security_group,
                               availability_zone, mysql_major_version,
                               mysql_minor_version, dry_run,
                               skip_name_check=False):
    """ Launch a mysql server in aws

    Args:
    hostname - hostname of new server
    instance_type - hardware type
    vpc_security_group - VPC firewall rules.
    availability_zone - AWS availability zone
    mysql_major_version - MySQL major version. Example 5.5 or 5.6
    mysql_minor_version - Which "branch" to use. Values are 'stable',
                          'staging' and 'latest'.
    dry_run - Do not actually launch a host, just show the expected config.
    skip_name_check - Do not check if a hostname has already been used or
                      log usage. The assumption is the caller has already
                      done this

    Returns:
    An amazon instance id.
    """
    # Log every requested argument for auditability.
    args, _, _, values = inspect.getargvalues(inspect.currentframe())
    for param in args:
        log.info("Requested {param} = {value}".format(param=param,
                                                      value=values[param]))

    config = {'key_name': environment_specific.PEM_KEY,
              'placement': availability_zone,
              'instance_profile_name': environment_specific.INSTANCE_PROFILE_NAME,
              'image_id': environment_specific.SUPPORTED_HARDWARE[instance_type]['ami'],
              'instance_type': instance_type}

    # Subnet, ssh security tier and IAM profile are all derived from the
    # VPC security group + availability zone.
    (subnet_name, config['subnet_id']) = get_subnet_from_sg(
        vpc_security_group, availability_zone)
    ssh_security = environment_specific.SSH_SECURITY_MAP[subnet_name]['ssh']
    config['instance_profile_name'] = \
        environment_specific.SSH_SECURITY_MAP[subnet_name]['iam']
    config['security_group_ids'] = \
        [environment_specific.VPC_SECURITY_GROUPS[vpc_security_group]]

    hiera_config = environment_specific.HIERA_FORMAT.format(
        ssh_security=ssh_security,
        mysql_major_version=mysql_major_version.replace('.', ''),
        mysql_minor_version=mysql_minor_version)
    if hiera_config not in environment_specific.SUPPORTED_HIERA_CONFIGS:
        # BUG FIX: the two message fragments were previously concatenated
        # without a separator, producing "...supported.Supported configs...".
        raise Exception(
            'Hiera config {hiera_config} is not supported. '
            'Supported configs are: {supported}'
            ''.format(hiera_config=hiera_config,
                      supported=environment_specific.SUPPORTED_HIERA_CONFIGS))

    # cloud-config user data consumed at first boot (hostname, puppet role,
    # raid setup).
    config['user_data'] = ('#cloud-config\n'
                           'pinfo_team: {pinfo_team}\n'
                           'pinfo_env: {pinfo_env}\n'
                           'pinfo_role: {hiera_config}\n'
                           'hostname: {hostname}\n'
                           'raid: true\n'
                           'raid_fs: xfs\n'
                           'raid_mount: {raid_mount}'
                           ''.format(
                               pinfo_team=environment_specific.PINFO_TEAM,
                               pinfo_env=environment_specific.PINFO_ENV,
                               raid_mount=environment_specific.RAID_MOUNT,
                               hiera_config=hiera_config,
                               hostname=hostname))

    log.info('Config for new server:\n{config}'.format(config=config))
    conn = mysql_lib.get_mysqlops_connections()
    if not skip_name_check and not launch_replacement_db_host.is_hostname_new(
            hostname, conn):
        raise Exception('Hostname {hostname} has already been used!'
                        ''.format(hostname=hostname))
    if dry_run:
        log.info('In dry run mode, returning now')
        return
    else:
        conn = boto.ec2.connect_to_region(environment_specific.EC2_REGION)
        instance_id = conn.run_instances(**config).instances[0].id
        log.info('Launched instance {id}'.format(id=instance_id))
        return instance_id