def restart_maxwell_if_not_exists(instance):
    """ Start Maxwell if it isn't currently running.

    Args:
        instance: (host_utils.HostAddr): host to check

    Returns:
        none
    """
    zk = host_utils.MysqlZookeeper()
    replica_type = zk.get_replica_type_from_instance(instance)
    gvars = mysql_lib.get_global_variables(instance)
    client_id = gvars['server_uuid']
    gtid_mode = gvars.get('gtid_mode') == 'ON'
    (username, _) = mysql_lib.get_mysql_user_for_role('maxwell')
    output_target = 'file'

    # master writes to kafka, everything else writes to /dev/null,
    # at least for now.
    if instance.hostname_prefix in environment_specific.MAXWELL_TARGET_MAP \
            and replica_type == host_utils.REPLICA_ROLE_MASTER:
        output_target = 'kafka'

    # we need to rewrite the config each time, because something may
    # have changed - i.e., a failover. this is just a stopgap solution
    # pending resolution of LP-809
    mysql_cnf_builder.create_maxwell_config(client_id, instance, None,
                                            output_target, gtid_mode)

    # Check for the Maxwell PID file and then see if it belongs to Maxwell.
    maxwell_running = False
    try:
        with open(environment_specific.MAXWELL_PID, "r") as f:
            pid = f.read()

        proc = psutil.Process(int(pid))
        cmdline = proc.cmdline()
        if 'java' in cmdline and 'com.zendesk.maxwell.Maxwell' in cmdline:
            maxwell_running = True
    except (IOError, psutil.NoSuchProcess, psutil.ZombieProcess):
        # No PID file or no process matching said PID, so Maxwell is
        # definitely not running. If Maxwell is a zombie, it isn't
        # running either.
        pass

    if maxwell_running:
        log.debug('Maxwell is already running')
        return

    if instance.hostname_prefix in environment_specific.MAXWELL_TARGET_MAP:
        host_utils.manage_maxwell(instance.port)
        log.info('Started Maxwell process')
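# A minimal usage sketch for restart_maxwell_if_not_exists, assuming the
# module-level imports used above (host_utils, environment_specific, log)
# and that host_utils.HostAddr accepts a 'host:port' string, as its use
# elsewhere in this module suggests. The hostname is a hypothetical
# placeholder. In practice a call like this would run from cron or a
# supervisor loop so a crashed Maxwell daemon is restarted on the next pass.
#
#   instance = host_utils.HostAddr('exampledb-001:3306')
#   restart_maxwell_if_not_exists(instance)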
def find_gtid_for_timestamp(instance, timestamp):
    """ Find the GTID for the supplied timestamp on the specified
        instance.

    Args:
        instance: a HostAddr object
        timestamp: the timestamp to search for
    Returns:
        If the instance doesn't support GTID, return None.
        If no GTID was found in the binlogs for the supplied timestamp,
        return a blank string.
        Otherwise, return a GTID.
    """
    gvars = mysql_lib.get_global_variables(instance)

    # we are not generating GTIDs / no GTID support
    if gvars['gtid_mode'] == 'OFF' or gvars['gtid_deployment_step'] == 'ON':
        log.warning('This replica set does not currently support GTID')
        return None

    # go in reverse order, because odds are that the log we want
    # is closer to the end than the beginning.
    master_logs = list(reversed(mysql_lib.get_master_logs(instance)))
    (username, password) = mysql_lib.get_mysql_user_for_role('replication')

    for binlog in master_logs:
        # if the timestamp we want is prior to the first entry in the
        # binlog, it can't possibly be in there.
        log_start = get_binlog_start(binlog['Log_name'], instance,
                                     username, password)
        if timestamp < log_start:
            log.debug('Skipping binlog {bl} because desired {ts} < '
                      '{ls}'.format(bl=binlog['Log_name'],
                                    ts=timestamp, ls=log_start))
            continue

        # The binlog that we end up checking, if we check one at all,
        # is the first one that could possibly contain our GTID, so
        # if it isn't in this one, we're not going to find anything.
        log.debug('Checking for matching GTID in {}'.format(
            binlog['Log_name']))
        gtid = check_one_binlog(timestamp, binlog['Log_name'],
                                instance, username, password)
        if gtid:
            return gtid
        else:
            break

    log.warning("No matching GTID was found for that timestamp.")
    return ''
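# A hedged usage sketch for find_gtid_for_timestamp, e.g. when locating a
# point-in-time recovery target. The host and timestamp are hypothetical,
# and the sketch assumes timestamps are datetime objects, as the `<`
# comparison against the binlog start time above implies. The three-way
# return contract (None / '' / GTID string) is from the docstring.
#
#   import datetime
#   instance = host_utils.HostAddr('exampledb-001:3306')
#   ts = datetime.datetime(2017, 6, 1, 12, 0, 0)
#   gtid = find_gtid_for_timestamp(instance, ts)
#   if gtid is None:
#       log.info('Replica set does not support GTID')
#   elif gtid == '':
#       log.info('No GTID found for {}'.format(ts))
#   else:
#       log.info('Recovery target GTID: {}'.format(gtid))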
def disk_space_available_for_migration(instance):
    """ Check the disk space available for migrations on the data dir
        mount

    Args:
        instance - A hostaddr object

    Returns:
        The number of MB available
    """
    datadir = mysql_lib.get_global_variables(instance)['datadir']
    cmd = MIGRATION_SPACE_CMD.format(hostname=instance.hostname,
                                     datadir=datadir,
                                     disk_limit=DISK_LIMIT)
    log.info(cmd)
    out, err, ret = host_utils.shell_exec(cmd)
    return float(out.strip())
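# The MIGRATION_SPACE_CMD template and DISK_LIMIT constant are defined
# elsewhere in this module; whatever the template expands to, the float()
# parse above requires that the command print a single number of MB free.
# A hedged usage sketch with a hypothetical host and a purely illustrative
# threshold:
#
#   instance = host_utils.HostAddr('exampledb-001:3306')
#   free_mb = disk_space_available_for_migration(instance)
#   if free_mb < 50000:  # hypothetical minimum, not from this module
#       log.info('Not enough space for migration: {} MB'.format(free_mb))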
def get_master_mysql_major_version(instance):
    """ Given an instance, determine the mysql major version for the
        master of the replica set.

    Args:
        instance - a hostaddr object

    Returns - A string similar to '5.5' or '5.6'
    """
    zk = host_utils.MysqlZookeeper()
    master = zk.get_mysql_instance_from_replica_set(
        instance.get_zk_replica_set()[0],
        repl_type=host_utils.REPLICA_ROLE_MASTER)
    master_conn = mysql_lib.connect_mysql(master)
    mysql_version = mysql_lib.get_global_variables(master_conn)['version'][:3]
    return mysql_version
def get_master_mysql_major_version(instance):
    """ Given an instance, determine the mysql major version for the
        master of the replica set.

    Args:
        instance - a hostaddr object

    Returns - A string similar to '5.5' or '5.6'
    """
    zk = host_utils.MysqlZookeeper()
    master = zk.get_mysql_instance_from_replica_set(
        instance.get_zk_replica_set()[0],
        repl_type=host_utils.REPLICA_ROLE_MASTER)
    try:
        mysql_version = mysql_lib.get_global_variables(master)['version'][:3]
    except _mysql_exceptions.OperationalError:
        raise Exception('Could not connect to master server {instance} in '
                        'order to determine MySQL version to launch with. '
                        'Perhaps run this script from there? This is likely '
                        'due to firewall rules.'
                        ''.format(instance=instance.hostname))
    return mysql_version
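# Both variants above derive the major version by slicing the first three
# characters of the 'version' global, e.g. '5.6.27-76.0-log'[:3] == '5.6'.
# Note the slice assumes a single-digit major version ('10.1.21'[:3] would
# yield '10.'). A hedged usage sketch with a hypothetical host, as used
# when choosing which MySQL package a replacement host should launch with:
#
#   instance = host_utils.HostAddr('exampledb-001:3306')
#   major = get_master_mysql_major_version(instance)
#   assert major in ('5.5', '5.6', '5.7')  # illustrative sanity check only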
def launch_replacement_db_host(original_server,
                               dry_run=False,
                               not_a_replacement=False,
                               overrides=dict(),
                               reason='',
                               replace_again=False):
    """ Launch a replacement db server

    Args:
        original_server - A hostAddr object for the server to be replaced
        dry_run - If True, do not actually launch a replacement
        not_a_replacement - If set, don't log the replacement, so
                            automation won't put it into prod use.
        overrides - A dict of overrides. Available keys are
                    'mysql_minor_version', 'hostname',
                    'vpc_security_group', 'availability_zone',
                    'instance_type', and 'mysql_major_version'.
        reason - A description of why the host is being replaced. If the
                 instance is still accessible and a reason is not
                 supplied, an exception will be thrown.
        replace_again - If True, ignore already existing replacements.
    """
    reasons = set()
    if reason:
        reasons.add(reason)

    log.info('Trying to launch a replacement for host {host} which is part '
             'of replica set {replica_set}'.format(
                 host=original_server.hostname,
                 replica_set=original_server.get_zk_replica_set()[0]))

    zk = host_utils.MysqlZookeeper()
    try:
        (_, replica_type) = zk.get_replica_set_from_instance(original_server)
    except Exception:
        raise Exception('Can not replace an instance which is not in zk')
    if replica_type == host_utils.REPLICA_ROLE_MASTER:
        # If the instance is a master, we will refuse to run. No ifs, ands,
        # or buts.
        raise Exception('Can not replace an instance which is a master in zk')

    # Open a connection to MySQL Ops and check if a replacement has already
    # been requested
    reporting_conn = mysql_lib.get_mysqlops_connections()
    existing_replacement = find_existing_replacements(reporting_conn,
                                                      original_server)
    if existing_replacement and not not_a_replacement:
        log.info('A replacement has already been requested: '
                 '{re}'.format(re=existing_replacement))
        if replace_again:
            log.info('Argument replace_again is set, continuing on.')
        else:
            age_of_replacement = (datetime.datetime.now() -
                                  existing_replacement['created_at'])
            if age_of_replacement.days < SERVER_BUILD_TIMEOUT:
                raise Exception('Argument replace_again is not True but a '
                                'replacement already exists.')
            else:
                log.info("A replacement already exists, but was launched "
                         "{days} days ago. The timeout for server builds is "
                         "{timeout} days, so we are automatically setting "
                         "replace_again.".format(
                             days=age_of_replacement.days,
                             timeout=SERVER_BUILD_TIMEOUT))
                replace_again = True

    # Check to see if MySQL is up on the host
    try:
        # This is not multi-instance compatible. If we move to multiple
        # instances, this will need to be updated.
        conn = mysql_lib.connect_mysql(original_server)
        conn.close()
        dead_server = False
        version_server = original_server
    except MySQLdb.OperationalError as detail:
        dead_server = True
        (error_code, msg) = detail.args
        if error_code != mysql_lib.MYSQL_ERROR_CONN_HOST_ERROR:
            raise
        log.info('MySQL is down, assuming hardware failure')
        reasons.add('hardware failure')
        version_server = zk.get_mysql_instance_from_replica_set(
            original_server.get_zk_replica_set()[0],
            repl_type=host_utils.REPLICA_ROLE_MASTER)

    # Pull some information from cmdb.
    cmdb_data = environment_specific.get_server_metadata(
        original_server.hostname)
    if not cmdb_data:
        raise Exception('Could not find information about server to be '
                        'replaced in the cmdb')

    if 'aws_status.codes' in cmdb_data:
        reasons.add(cmdb_data['aws_status.codes'])

    log.info('Data from cmdb: {cmdb_data}'.format(cmdb_data=cmdb_data))

    replacement_config = {
        'availability_zone': cmdb_data['location'],
        'vpc_security_group': cmdb_data['security_groups'],
        'hostname': find_unused_server_name(
            original_server.get_standardized_replica_set(),
            reporting_conn, dry_run),
        'instance_type': cmdb_data['config.instance_type'],
        'mysql_major_version': mysql_lib.get_global_variables(
            version_server)['version'][0:3],
        'mysql_minor_version': DEFAULT_MYSQL_MINOR_VERSION,
        'dry_run': dry_run,
        'skip_name_check': True
    }

    # At this point, all our defaults should be good to go
    config_overridden = False

    # All other overrides
    for key in overrides.keys():
        if key not in replacement_config:
            raise Exception('Invalid override {key}'.format(key=key))

        if overrides[key]:
            if replacement_config[key] == overrides[key]:
                log.info('Override for key {key} does not modify '
                         'configuration'.format(key=key))
            else:
                log.info('Overriding {key} to value {new} from {old}'
                         ''.format(key=key,
                                   old=replacement_config[key],
                                   new=overrides[key]))
                reasons.add('changing {key} from {old} to '
                            '{new}'.format(key=key,
                                           old=replacement_config[key],
                                           new=overrides[key]))
                replacement_config[key] = overrides[key]
                config_overridden = True

    if config_overridden:
        log.info('Configuration after overrides: {replacement_config}'
                 ''.format(replacement_config=replacement_config))

    if not dead_server:
        try:
            mysql_lib.assert_replication_sanity(original_server)
        except Exception as e:
            log.info('Replication problem: {e}'.format(e=e))
            reasons.add('replication broken')

    # If we get to here and there is no reason, bail out
    if not reasons and not replacement_config['dry_run']:
        raise Exception('MySQL appears to be up and no reason for '
                        'replacement is supplied. You can specify a reason '
                        'with the --reason argument')
    reason = ', '.join(reasons)
    log.info('Reason for launch: {reason}'.format(reason=reason))

    new_instance_id = launch_amazon_mysql_server.launch_amazon_mysql_server(
        **replacement_config)
    if not (replacement_config['dry_run'] or not_a_replacement):
        log_replacement_host(reporting_conn,
                             cmdb_data,
                             new_instance_id,
                             replace_again,
                             replacement_config,
                             reason)
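# A hedged invocation sketch for launch_replacement_db_host. The hostname
# and the override value are hypothetical placeholders; dry_run=True keeps
# the sketch side-effect free, exercising the config-building and
# validation paths without actually launching a server.
#
#   server = host_utils.HostAddr('exampledb-001:3306')
#   launch_replacement_db_host(server,
#                              dry_run=True,
#                              overrides={'instance_type': 'i3.2xlarge'},
#                              reason='proactive hardware refresh')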