Example #1
    def backup_instance(self):
        """ Back up a replica instance to s3 in csv """
        host_lock_handle = None
        try:
            log.info('Backup for instance {i} started at {t}'
                     ''.format(t=str(self.timestamp), i=self.instance))
            log.info('Checking heartbeat to make sure replication is not too '
                     'lagged.')
            self.check_replication_for_backup()

            log.info('Taking host backup lock')
            host_lock_handle = host_utils.take_flock_lock(
                backup.BACKUP_LOCK_FILE)

            log.info('Setting up export directory structure')
            self.setup_and_get_tmp_path()
            log.info('Will temporarily dump inside of {path}'
                     ''.format(path=self.dump_base_path))

            log.info('Releasing any invalid shard backup locks')
            self.ensure_backup_locks_sanity()

            log.info('Deleting old expired locks')
            self.purge_old_expired_locks()

            log.info('Stopping replication SQL thread to get a snapshot')
            mysql_lib.stop_replication(self.instance,
                                       mysql_lib.REPLICATION_THREAD_SQL)

            workers = []
            for _ in range(multiprocessing.cpu_count() // 2):
                proc = multiprocessing.Process(
                    target=self.mysql_backup_csv_dbs)
                proc.daemon = True
                proc.start()
                workers.append(proc)
            # throw in a sleep to make sure all worker processes have started dumps
            time.sleep(2)
            log.info('Restarting replication')
            mysql_lib.start_replication(self.instance,
                                        mysql_lib.REPLICATION_THREAD_SQL)

            for worker in workers:
                worker.join()

            if not self.dbs_to_backup.empty():
                raise Exception('All worker processes have completed, but '
                                'work remains in the queue')

            log.info('CSV backup is complete, will run a check')
            mysql_backup_status.verify_csv_backup(self.instance.replica_type,
                                                  self.datestamp,
                                                  self.instance)
        finally:
            if host_lock_handle:
                log.info('Releasing general host backup lock')
                host_utils.release_flock_lock(host_lock_handle)
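Example #1 parallelizes the dump by fanning work out to daemonized multiprocessing.Process workers that drain a shared queue (self.dbs_to_backup), and treats a non-empty queue after join() as a sign that a worker died mid-run. A minimal, self-contained sketch of that queue-draining pattern; the dump_db worker and the database names are illustrative, not part of the backup code:

import multiprocessing

try:
    from queue import Empty          # Python 3
except ImportError:
    from Queue import Empty          # Python 2

def dump_db(dbs_to_backup):
    # Drain the shared queue; each get() claims one database to dump.
    while True:
        try:
            db = dbs_to_backup.get(block=False)
        except Empty:
            return
        print('dumping {}'.format(db))

if __name__ == '__main__':
    dbs_to_backup = multiprocessing.Queue()
    for db in ('users', 'orders', 'events'):
        dbs_to_backup.put(db)

    workers = []
    for _ in range(max(multiprocessing.cpu_count() // 2, 1)):
        proc = multiprocessing.Process(target=dump_db,
                                       args=(dbs_to_backup,))
        proc.daemon = True   # workers die with the parent process
        proc.start()
        workers.append(proc)

    for worker in workers:
        worker.join()

    # Mirror the example's sanity check: if anything is left in the
    # queue, a worker must have exited before finishing its share.
    if not dbs_to_backup.empty():
        raise Exception('workers finished but work remains in the queue')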
Example #2
    def backup_instance(self):
        """ Back up a replica instance to s3 in csv """
        host_lock_handle = None
        try:
            log.info('Backup for instance {i} started at {t}'
                     ''.format(t=str(self.timestamp),
                               i=self.instance))
            log.info('Checking heartbeat to make sure replication is not too '
                     'lagged.')
            self.check_replication_for_backup()

            log.info('Taking host backup lock')
            host_lock_handle = host_utils.take_flock_lock(backup.BACKUP_LOCK_FILE)

            log.info('Setting up export directory structure')
            self.setup_and_get_tmp_path()
            log.info('Will temporarily dump inside of {path}'
                     ''.format(path=self.dump_base_path))

            log.info('Releasing any invalid shard backup locks')
            self.ensure_backup_locks_sanity()

            log.info('Deleting old expired locks')
            self.purge_old_expired_locks()

            log.info('Stopping replication SQL thread to get a snapshot')
            mysql_lib.stop_replication(self.instance, mysql_lib.REPLICATION_THREAD_SQL)

            workers = []
            for _ in range(multiprocessing.cpu_count() // 2):
                proc = multiprocessing.Process(target=self.mysql_backup_csv_dbs)
                proc.daemon = True
                proc.start()
                workers.append(proc)
            # throw in a sleep to make sure all worker processes have started dumps
            time.sleep(2)
            log.info('Restarting replication')
            mysql_lib.start_replication(self.instance, mysql_lib.REPLICATION_THREAD_SQL)

            for worker in workers:
                worker.join()

            if not self.dbs_to_backup.empty():
                raise Exception('All worker processes have completed, but '
                                'work remains in the queue')

            log.info('CSV backup is complete, will run a check')
            mysql_backup_status.verify_csv_backup(self.instance.replica_type,
                                                  self.datestamp,
                                                  self.instance)
        finally:
            if host_lock_handle:
                log.info('Releasing general host backup lock')
                host_utils.release_flock_lock(host_lock_handle)
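Examples #1 and #2 serialize backups per host by flocking backup.BACKUP_LOCK_FILE and releasing it in the finally block, so a failed run cannot wedge the host. The helpers are not shown here, but a take_flock_lock / release_flock_lock pair is typically a thin wrapper over fcntl.flock; a rough sketch under that assumption, with an illustrative lock path:

import fcntl

BACKUP_LOCK_FILE = '/tmp/mysql_backup.lock'   # illustrative path

def take_flock_lock(lock_file):
    """ Take an exclusive, non-blocking flock; raise if another
        process on this host already holds it. """
    handle = open(lock_file, 'w')
    try:
        fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        handle.close()
        raise Exception('Another backup holds {}'.format(lock_file))
    return handle

def release_flock_lock(handle):
    """ Drop the lock and close the underlying file. """
    fcntl.flock(handle, fcntl.LOCK_UN)
    handle.close()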
Example #3
    def backup_instance(self):
        """ Back up a replica instance to s3 in csv """

        log.info('Backup for instance {i} started at {t}'
                 ''.format(t=str(self.timestamp), i=self.instance))
        log.info('Checking heartbeat to make sure replication is not too '
                 'lagged.')
        self.check_replication_for_backup()

        log.info('Taking host backup lock')
        host_lock = host_utils.bind_lock_socket(backup.CSV_BACKUP_LOCK_SOCKET)

        log.info('Setting up export directory structure')
        self.setup_and_get_tmp_path()
        log.info('Will temporarily dump inside of {path}'
                 ''.format(path=self.dump_base_path))

        log.info('Releasing any invalid shard backup locks')
        self.ensure_backup_locks_sanity()

        log.info('Deleting old expired locks')
        self.purge_old_expired_locks()

        log.info('Stopping replication SQL thread to get a snapshot')
        mysql_lib.stop_replication(self.instance,
                                   mysql_lib.REPLICATION_THREAD_SQL)

        # starting a consistent snapshot here and retrieving the thread ID
        conn = mysql_lib.connect_mysql(self.instance,
                                       backup.USER_ROLE_MYSQLDUMP)
        mysql_lib.start_consistent_snapshot(conn, read_only=True)
        cursor = conn.cursor()
        cursor.execute('SET SESSION wait_timeout=28800')
        cursor.execute("SELECT VARIABLE_VALUE AS conn_id FROM "
                       "INFORMATION_SCHEMA.SESSION_VARIABLES "
                       "WHERE VARIABLE_NAME='pseudo_thread_id'")
        self.session_id = cursor.fetchone()['conn_id']

        workers = []
        for _ in range(multiprocessing.cpu_count() // 2):
            proc = multiprocessing.Process(target=self.mysql_backup_csv_tables)
            proc.daemon = True
            proc.start()
            workers.append(proc)

        # throw in a sleep to make sure all worker processes have started dumps
        time.sleep(2)
        log.info('Restarting replication')
        mysql_lib.start_replication(self.instance,
                                    mysql_lib.REPLICATION_THREAD_SQL)

        for worker in workers:
            worker.join()

        if not (self.tables_to_backup.empty()
                and self.tables_to_retry.empty()):
            raise Exception('All worker processes have completed, but '
                            'work remains in the queue')

        log.info('CSV backup is complete, will run a check')
        self.release_expired_locks()
        mysql_backup_status.verify_csv_instance_backup(self.instance,
                                                       self.datestamp,
                                                       self.dev_bucket)
        host_utils.release_lock_socket(host_lock)
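Example #3 reworks the snapshot strategy: on top of pausing the SQL thread, it opens a consistent-snapshot transaction (presumably START TRANSACTION WITH CONSISTENT SNAPSHOT, read-only, behind mysql_lib.start_consistent_snapshot) and records that session's pseudo_thread_id so the per-table dump workers can tie their work to one point-in-time view. A rough sketch of the snapshot step over a plain MySQLdb connection, with placeholder credentials standing in for mysql_lib.connect_mysql:

import MySQLdb
import MySQLdb.cursors

# Placeholder connection parameters; the example gets its connection
# from mysql_lib.connect_mysql(self.instance, backup.USER_ROLE_MYSQLDUMP).
conn = MySQLdb.connect(host='replica.example.com', user='backup',
                       passwd='backup_password',
                       cursorclass=MySQLdb.cursors.DictCursor)
cursor = conn.cursor()

# Pin this session to a single point-in-time view of the data.
cursor.execute('START TRANSACTION WITH CONSISTENT SNAPSHOT')

# Keep the session alive for the duration of a long dump (8 hours).
cursor.execute('SET SESSION wait_timeout=28800')

# Record this session's connection id so the dump workers can
# reference (or kill) the snapshot holder.
cursor.execute("SELECT VARIABLE_VALUE AS conn_id FROM "
               "INFORMATION_SCHEMA.SESSION_VARIABLES "
               "WHERE VARIABLE_NAME='pseudo_thread_id'")
session_id = cursor.fetchone()['conn_id']

The lock change is also worth noting: bind_lock_socket presumably relies on socket addresses being exclusive per host and disappearing when their holder exits, so a crashed backup cannot leave stale lock state behind.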
Example #4
def add_replica_to_zk(instance, replica_type, dry_run):
    """ Add a replica to zk

    Args:
    instance - A hostaddr object of the replica to add to zk
    replica_type - Either 'slave' or 'dr_slave'.
    dry_run - If set, do not modify zk
    """
    try:
        if replica_type not in [host_utils.REPLICA_ROLE_DR_SLAVE,
                                host_utils.REPLICA_ROLE_SLAVE]:
            raise Exception('Invalid value "{}" for argument '
                            'replica_type'.format(replica_type))

        log.info('Instance is {}'.format(instance))
        mysql_lib.assert_replication_sanity(instance)
        mysql_lib.assert_replication_unlagged(
            instance,
            mysql_lib.REPLICATION_TOLERANCE_NORMAL)
        master = mysql_lib.get_master_from_instance(instance)

        zk_local = host_utils.MysqlZookeeper()
        kazoo_client = environment_specific.get_kazoo_client()
        if not kazoo_client:
            raise Exception('Could not get a zk connection')

        if master not in zk_local.get_all_mysql_instances_by_type(
                    host_utils.REPLICA_ROLE_MASTER):
            raise Exception('Instance {} is not a master in zk'
                            ''.format(master))

        log.info('Detected master of {instance} '
                 'as {master}'.format(instance=instance,
                                      master=master))

        replica_set = zk_local.get_replica_set_from_instance(master)
        log.info('Detected replica_set as {}'.format(replica_set))
        old_instance = zk_local.get_mysql_instance_from_replica_set(
                           replica_set,
                           repl_type=replica_type)

        if replica_type == host_utils.REPLICA_ROLE_SLAVE:
            (zk_node,
             parsed_data, version) = get_zk_node_for_replica_set(kazoo_client,
                                                                 replica_set)
            log.info('Replica set {replica_set} is held in zk_node '
                     '{zk_node}'.format(zk_node=zk_node,
                                        replica_set=replica_set))
            log.info('Existing config:')
            log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
            new_data = copy.deepcopy(parsed_data)
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['host'] = \
                instance.hostname
            new_data[replica_set][host_utils.REPLICA_ROLE_SLAVE]['port'] = \
                instance.port
            log.info('New config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                kazoo_client.set(zk_node, simplejson.dumps(new_data), version)
        elif replica_type == host_utils.REPLICA_ROLE_DR_SLAVE:
            znode_data, dr_meta = kazoo_client.get(environment_specific.DR_ZK)
            parsed_data = simplejson.loads(znode_data)
            new_data = copy.deepcopy(parsed_data)
            if replica_set in parsed_data:
                log.info('Existing dr config:')
                log.info(pprint.pformat(remove_auth(parsed_data[replica_set])))
            else:
                log.info('Replica set did not previously have a dr slave')

            new_data[replica_set] = \
                {host_utils.REPLICA_ROLE_DR_SLAVE: {'host': instance.hostname,
                                                    'port': instance.port}}
            log.info('New dr config:')
            log.info(pprint.pformat(remove_auth(new_data[replica_set])))

            if new_data == parsed_data:
                raise Exception('No change would be made to zk, '
                                'will not write new config')
            elif dry_run:
                log.info('dry_run is set, therefore not modifying zk')
            else:
                log.info('Pushing new dr configuration for '
                         '{replica_set}:'.format(replica_set=replica_set))
                kazoo_client.set(environment_specific.DR_ZK,
                                 simplejson.dumps(new_data), dr_meta.version)
        else:
            # we should raise an exception above rather than getting to here
            pass
        if not dry_run:
            log.info('Stopping replication and event scheduler on {} '
                     'being taken out of use'.format(old_instance))
            try:
                mysql_lib.stop_replication(old_instance)
                mysql_lib.stop_event_scheduler(old_instance)
            except Exception:
                log.info('Could not stop replication or event scheduler on {}'
                         ''.format(old_instance))

    except Exception as e:
        log.exception(e)
        raise
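Example #4 guards its read-modify-write against ZooKeeper with the znode version: kazoo_client.set(zk_node, simplejson.dumps(new_data), version) succeeds only if the node is still at the version fetched earlier, so two operators editing the same replica set cannot silently clobber each other. A minimal check-and-set sketch with kazoo; the host, path, and payload below are placeholders:

import simplejson
from kazoo.client import KazooClient
from kazoo.exceptions import BadVersionError

client = KazooClient(hosts='zk.example.com:2181')   # placeholder host
client.start()

# Read the current config along with its znode version.
raw, stat = client.get('/example/replica_sets')     # placeholder path
config = simplejson.loads(raw)

config['db00042'] = {'slave': {'host': 'replica42', 'port': 3306}}

try:
    # The write succeeds only if nobody changed the znode since our
    # read; otherwise ZooKeeper rejects it with BadVersionError.
    client.set('/example/replica_sets', simplejson.dumps(config),
               version=stat.version)
except BadVersionError:
    # The znode changed underneath us; re-read before retrying.
    raise

client.stop()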