Example #1
def main(config, max_backup_age=0, max_backup_count=0):
    backups_to_purge = set()
    monitoring = Monitoring(config=config.monitoring)

    try:
        logging.info('Starting purge')
        storage = Storage(config=config.storage)
        # Get all backups for the local node
        logging.info('Listing backups for {}'.format(config.storage.fqdn))
        backup_index = storage.list_backup_index_blobs()
        backups = list(
            storage.list_node_backups(fqdn=config.storage.fqdn,
                                      backup_index_blobs=backup_index))
        # list all backups to purge based on date conditions
        backups_to_purge |= set(
            backups_to_purge_by_age(backups, max_backup_age))
        # list all backups to purge based on count conditions
        backups_to_purge |= set(
            backups_to_purge_by_count(backups, max_backup_count))
        # purge all candidate backups
        purge_backups(storage, backups_to_purge)

        logging.debug('Emitting metrics')
        tags = ['medusa-node-backup', 'purge-error', 'PURGE-ERROR']
        monitoring.send(tags, 0)
    except Exception as e:
        traceback.print_exc()
        tags = ['medusa-node-backup', 'purge-error', 'PURGE-ERROR']
        monitoring.send(tags, 1)
        logging.error('This error happened during the purge: {}'.format(
            str(e)))
        sys.exit(1)
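
The two selection helpers used above combine age-based and count-based retention rules before a single purge pass. A minimal, self-contained sketch of what such selection can look like, using a hypothetical Backup tuple rather than Medusa's actual classes or helpers:

import datetime
from collections import namedtuple

# Hypothetical stand-in for a node backup; Medusa's own backup objects differ.
Backup = namedtuple('Backup', ['name', 'started'])

def purge_by_age(backups, max_backup_age):
    # Backups older than max_backup_age days are candidates; 0 disables the rule.
    if max_backup_age <= 0:
        return []
    cutoff = datetime.datetime.now() - datetime.timedelta(days=max_backup_age)
    return [b for b in backups if b.started < cutoff]

def purge_by_count(backups, max_backup_count):
    # Keep only the max_backup_count most recent backups; 0 disables the rule.
    if max_backup_count <= 0:
        return []
    ordered = sorted(backups, key=lambda b: b.started, reverse=True)
    return ordered[max_backup_count:]

backups = [Backup('old', datetime.datetime(2020, 1, 1)),
           Backup('recent', datetime.datetime.now())]
to_purge = set(purge_by_age(backups, 30)) | set(purge_by_count(backups, 1))
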
Example #2
def delete_backup(config, backup_name, all_nodes):
    backups_to_purge = list()
    monitoring = Monitoring(config=config.monitoring)

    try:
        storage = Storage(config=config.storage)
        cluster_backup = storage.get_cluster_backup(backup_name)
        backups_to_purge = cluster_backup.node_backups.values()

        if not all_nodes:
            backups_to_purge = [
                nb for nb in backups_to_purge if storage.config.fqdn in nb.fqdn
            ]

        logging.info('Deleting Backup {}...'.format(backup_name))
        purge_backups(storage, backups_to_purge)

        logging.debug('Emitting metrics')
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 0)
    except Exception as e:
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 1)
        medusa.utils.handle_exception(
            e,
            'This error happened during the delete of backup "{}": {}'.format(
                backup_name, str(e)), config)
Example #3
def handle_backup(config, backup_name_arg, stagger_time,
                  enable_md5_checks_flag, mode):
    start = datetime.datetime.now()
    backup_name = backup_name_arg or start.strftime('%Y%m%d%H%M')
    monitoring = Monitoring(config=config.monitoring)

    try:
        logging.debug(
            "Starting backup preparations with Mode: {}".format(mode))
        storage = Storage(config=config.storage)
        cassandra = Cassandra(config)

        differential_mode = False
        if mode == "differential":
            differential_mode = True

        node_backup = storage.get_node_backup(
            fqdn=config.storage.fqdn,
            name=backup_name,
            differential_mode=differential_mode)
        if node_backup.exists():
            raise IOError(
                'Error: Backup {} already exists'.format(backup_name))

        # Starting the backup
        logging.info(
            "Starting backup using Stagger: {} Mode: {} Name: {}".format(
                stagger_time, mode, backup_name))
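        # Mark the backup as in progress before starting the actual work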
        BackupMan.update_backup_status(backup_name,
                                       BackupMan.STATUS_IN_PROGRESS)
        info = start_backup(storage, node_backup, cassandra, differential_mode,
                            stagger_time, start, mode, enable_md5_checks_flag,
                            backup_name, config, monitoring)
        BackupMan.update_backup_status(backup_name, BackupMan.STATUS_SUCCESS)

        logging.debug("Done with backup, returning backup result information")
        return (info["actual_backup_duration"], info["actual_start_time"],
                info["end_time"], info["node_backup"],
                info["node_backup_cache"], info["num_files"],
                info["start_time"], info["backup_name"])

    except Exception as e:
        logging.error(
            "Issue occurred inside handle_backup Name: {} Error: {}".format(
                backup_name, str(e)))
        BackupMan.update_backup_status(backup_name, BackupMan.STATUS_FAILED)

        tags = ['medusa-node-backup', 'backup-error', backup_name]
        monitoring.send(tags, 1)
        medusa.utils.handle_exception(
            e, "Error occurred during backup: {}".format(str(e)), config)
Example #4
def delete_backup(config, backup_names, all_nodes):
    monitoring = Monitoring(config=config.monitoring)

    try:
        storage = Storage(config=config.storage)
        cluster_backups = storage.list_cluster_backups()
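        # Resolve the requested backup names into the backups that should be deleted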
        backups_to_purge = backups_to_purge_by_name(storage, cluster_backups,
                                                    backup_names, all_nodes)

        logging.info('Deleting Backup(s) {}...'.format(",".join(backup_names)))
        purge_backups(storage, backups_to_purge,
                      config.storage.backup_grace_period_in_days,
                      storage.config.fqdn)

        logging.debug('Emitting metrics')
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 0)
    except Exception as e:
        tags = ['medusa-node-backup', 'delete-error', 'DELETE-ERROR']
        monitoring.send(tags, 1)
        medusa.utils.handle_exception(
            e, 'This error happened during the delete of backup(s) "{}": {}'.
            format(",".join(backup_names), str(e)), config)
Example #5
def report_latest(config, push_metrics):
    MAX_RETRIES = 3
    SLEEP_TIME = 15
    retry = 0

    monitoring = Monitoring(config=config.monitoring)

    for retry in range(MAX_RETRIES):
        try:
            logging.debug('Trying to report about existing backups ({}/{})...'.format(
                retry + 1,
                MAX_RETRIES
            ))
            storage = Storage(config=config.storage)
            fqdn = config.storage.fqdn
            backup_index = storage.list_backup_index_blobs()
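            # Run the checks for the local node backup, the complete cluster backup and the latest cluster backup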
            check_node_backup(config, storage, fqdn, push_metrics, monitoring)
            check_complete_cluster_backup(storage, push_metrics, monitoring, backup_index)
            check_latest_cluster_backup(storage, push_metrics, monitoring, backup_index)
            break
        except Exception as e:
            if (retry + 1) < MAX_RETRIES:
                logging.debug('Report attempt {} failed, waiting {} seconds to retry'.format(
                    retry + 1,
                    SLEEP_TIME
                ))
                time.sleep(SLEEP_TIME)
                continue
            else:
                logging.error('This error happened during the check: {}'.format(e), exc_info=True)
                if push_metrics:
                    # Set latest known complete backup to ~ 10 years ago to attract the attention
                    # of the operator on the broken monitoring.
                    logging.info("Sending a big value to 'seconds-since-backup' metric to trigger alerts.")
                    long_time_flag_value = 315365400
                    tags = ['medusa-cluster-backup', 'seconds-since-backup', 'TRACKING-ERROR']
                    monitoring.send(tags, long_time_flag_value)
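
The loop above makes a fixed number of attempts with a constant sleep in between. The same pattern as a small reusable helper, shown here only as an illustrative sketch (not part of Medusa), re-raising once the attempts are exhausted:

import logging
import time

def run_with_retries(fn, max_retries=3, sleep_seconds=15):
    # Call fn() up to max_retries times; sleep between attempts and
    # re-raise the last exception if every attempt fails.
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception:
            if attempt + 1 >= max_retries:
                raise
            logging.debug('Attempt %d failed, retrying in %d seconds',
                          attempt + 1, sleep_seconds)
            time.sleep(sleep_seconds)
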
Example #6
def orchestrate(config, backup_name, seed_target, temp_dir, host_list, keep_auth, bypass_checks,
                verify, keyspaces, tables, parallel_restores, use_sstableloader=False):
    monitoring = Monitoring(config=config.monitoring)
    try:
        restore_start_time = datetime.datetime.now()
        if seed_target is None and host_list is None:
            # if no target node is provided, nor a host list file, default to the local node as seed target
            hostname_resolver = HostnameResolver(medusa.utils.evaluate_boolean(config.cassandra.resolve_ip_addresses))
            seed_target = hostname_resolver.resolve_fqdn(socket.gethostbyname(socket.getfqdn()))
            logging.warning("Seed target was not provided, using the local hostname: {}".format(seed_target))

        if seed_target is not None and host_list is not None:
            err_msg = 'You must either provide a seed target or a list of host, not both'
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        storage = Storage(config=config.storage)

        try:
            cluster_backup = storage.get_cluster_backup(backup_name)
        except KeyError:
            err_msg = 'No such backup --> {}'.format(backup_name)
            logging.error(err_msg)
            raise Exception(err_msg)

        restore = RestoreJob(cluster_backup, config, temp_dir, host_list, seed_target, keep_auth, verify,
                             parallel_restores, keyspaces, tables, bypass_checks, use_sstableloader)
        restore.execute()

        restore_end_time = datetime.datetime.now()
        restore_duration = restore_end_time - restore_start_time

        logging.debug('Emitting metrics')

        logging.info('Restore duration: {}'.format(restore_duration.seconds))
        tags = ['medusa-cluster-restore', 'restore-duration', backup_name]
        monitoring.send(tags, restore_duration.seconds)

        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics')
        logging.info('Successfully restored the cluster')

    except Exception as e:
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 1)

        logging.error('This error happened during the cluster restore: {}'.format(str(e)))
        traceback.print_exc()
        sys.exit(1)
Example #7
def orchestrate(config,
                backup_name,
                seed_target,
                temp_dir,
                host_list,
                keep_auth,
                bypass_checks,
                verify,
                keyspaces,
                tables,
                pssh_pool_size,
                use_sstableloader=False):
    monitoring = Monitoring(config=config.monitoring)
    try:
        restore_start_time = datetime.datetime.now()
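        # A provided seed target forces keep_auth off: system_auth will be restored from the backup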
        if seed_target is not None:
            keep_auth = False

        if seed_target is None and host_list is None:
            err_msg = 'You must either provide a seed target or a list of host'
            logging.error(err_msg)
            raise Exception(err_msg)

        if seed_target is not None and host_list is not None:
            err_msg = 'You must either provide a seed target or a list of host, not both'
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        if keep_auth:
            logging.info(
                'system_auth keyspace will be left untouched on the target nodes'
            )
        else:
            logging.info(
                'system_auth keyspace will be overwritten with the backup on target nodes'
            )

        storage = Storage(config=config.storage)

        try:
            cluster_backup = storage.get_cluster_backup(backup_name)
        except KeyError:
            err_msg = 'No such backup --> {}'.format(backup_name)
            logging.error(err_msg)
            raise Exception(err_msg)

        restore = RestoreJob(cluster_backup, config, temp_dir, host_list,
                             seed_target, keep_auth, verify, pssh_pool_size,
                             keyspaces, tables, bypass_checks,
                             use_sstableloader)
        restore.execute()

        restore_end_time = datetime.datetime.now()
        restore_duration = restore_end_time - restore_start_time

        logging.debug('Emitting metrics')

        logging.info('Restore duration: {}'.format(restore_duration.seconds))
        tags = ['medusa-cluster-restore', 'restore-duration', backup_name]
        monitoring.send(tags, restore_duration.seconds)

        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics')
        logging.info('Successfully restored the cluster')

    except Exception as e:
        tags = ['medusa-cluster-restore', 'restore-error', backup_name]
        monitoring.send(tags, 1)

        logging.error(
            'This error happened during the cluster restore: {}'.format(
                str(e)))
        traceback.print_exc()
        sys.exit(1)
Example #8
def main(config, backup_name_arg, stagger_time, mode):
    start = datetime.datetime.now()
    backup_name = backup_name_arg or start.strftime('%Y%m%d%H')
    monitoring = Monitoring(config=config.monitoring)

    try:
        storage = Storage(config=config.storage)
        cassandra = Cassandra(config)

        differential_mode = False
        if mode == "differential":
            differential_mode = True

        node_backup = storage.get_node_backup(
            fqdn=config.storage.fqdn,
            name=backup_name,
            differential_mode=differential_mode
        )

        if node_backup.exists():
            raise IOError('Error: Backup {} already exists'.format(backup_name))

        # Make sure that priority remains to Cassandra/limiting backups resource usage
        try:
            throttle_backup()
        except Exception:
            logging.warning("Throttling backup impossible. It's probable that ionice is not available.")

        logging.info('Saving tokenmap and schema')
        schema, tokenmap = get_schema_and_tokenmap(cassandra)

        node_backup.schema = schema
        node_backup.tokenmap = json.dumps(tokenmap)
        if differential_mode is True:
            node_backup.differential = mode
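        # Register the start of this backup in the backup index before any data is uploaded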
        add_backup_start_to_index(storage, node_backup)

        if stagger_time:
            stagger_end = start + stagger_time
            logging.info('Staggering backup run, trying until {}'.format(stagger_end))
            while not stagger(config.storage.fqdn, storage, tokenmap):
                if datetime.datetime.now() < stagger_end:
                    logging.info('Staggering this backup run...')
                    time.sleep(60)
                else:
                    raise IOError('Backups on previous nodes did not complete'
                                  ' within our stagger time.')

        actual_start = datetime.datetime.now()

        num_files, node_backup_cache = do_backup(
            cassandra, node_backup, storage, differential_mode, config, backup_name)

        end = datetime.datetime.now()
        actual_backup_duration = end - actual_start

        print_backup_stats(actual_backup_duration, actual_start, end, node_backup, node_backup_cache, num_files, start)

        update_monitoring(actual_backup_duration, backup_name, monitoring, node_backup)
        return (actual_backup_duration, actual_start, end, node_backup, node_backup_cache, num_files, start)

    except Exception as e:
        tags = ['medusa-node-backup', 'backup-error', backup_name]
        monitoring.send(tags, 1)
        medusa.utils.handle_exception(
            e,
            "This error happened during the backup: {}".format(str(e)),
            config
        )
Example #9
def orchestrate(config, backup_name_arg, seed_target, stagger,
                enable_md5_checks, mode, temp_dir, parallel_snapshots,
                parallel_uploads):
    backup = None
    backup_name = backup_name_arg or datetime.datetime.now().strftime(
        '%Y%m%d%H%M')
    monitoring = Monitoring(config=config.monitoring)
    try:
        backup_start_time = datetime.datetime.now()
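        # Sanity checks before doing any work: the local fqdn must be set and temp_dir must exist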
        if not config.storage.fqdn:
            err_msg = "The fqdn was not provided nor calculated properly."
            logging.error(err_msg)
            raise Exception(err_msg)

        if not temp_dir.is_dir():
            err_msg = '{} is not a directory'.format(temp_dir)
            logging.error(err_msg)
            raise Exception(err_msg)

        try:
            # Try to get a backup with backup_name. If it exists then we cannot take another backup with that name
            storage = Storage(config=config.storage)
            cluster_backup = storage.get_cluster_backup(backup_name)
            if cluster_backup:
                err_msg = 'Backup named {} already exists.'.format(backup_name)
                logging.error(err_msg)
                raise Exception(err_msg)
        except KeyError:
            info_msg = 'Starting backup {}'.format(backup_name)
            logging.info(info_msg)

        backup = BackupJob(config, backup_name, seed_target, stagger,
                           enable_md5_checks, mode, temp_dir,
                           parallel_snapshots, parallel_uploads)
        backup.execute()

        backup_end_time = datetime.datetime.now()
        backup_duration = backup_end_time - backup_start_time

        logging.debug('Emitting metrics')

        logging.info('Backup duration: {}'.format(backup_duration.seconds))
        tags = [
            'medusa-cluster-backup', 'cluster-backup-duration', backup_name
        ]
        monitoring.send(tags, backup_duration.seconds)

        tags = ['medusa-cluster-backup', 'cluster-backup-error', backup_name]
        monitoring.send(tags, 0)

        logging.debug('Done emitting metrics.')
        logging.info('Backup of the cluster done.')

    except Exception as e:
        tags = ['medusa-cluster-backup', 'cluster-backup-error', backup_name]
        monitoring.send(tags, 1)

        logging.error(
            'This error happened during the cluster backup: {}'.format(str(e)))
        traceback.print_exc()

        if backup is not None:
            err_msg = 'Something went wrong! Attempting to clean snapshots and exit.'
            logging.error(err_msg)

            delete_snapshot_command = ' '.join(
                backup.cassandra.delete_snapshot_command(backup.snapshot_tag))
            pssh_run_success_cleanup = backup.orchestration_uploads\
                .pssh_run(backup.hosts,
                          delete_snapshot_command,
                          hosts_variables={})
            if pssh_run_success_cleanup:
                info_msg = 'All nodes successfully cleared their snapshot.'
                logging.info(info_msg)
            else:
                err_msg_cleanup = 'Some nodes failed to clear the snapshot. Cleaning snapshots manually is recommended'
                logging.error(err_msg_cleanup)
        sys.exit(1)