示例#1
0
def build_replication_cache(context, tags, configurations, region,
                            installed_region):
    """Build a giant cache of replication-relevant snapshots for region.

    Returns a dict mapping each tag name in ``tags`` to the list of
    snapshots in ``region`` (owned by this account) carrying that tag.

    :param context: Lambda context object, used only for timeout checks
    :param tags: tag keys to collect snapshots for
    :param configurations: configuration rules (unused here, kept for API parity)
    :param region: AWS region to scan for snapshots
    :param installed_region: region holding the DynamoDB config (unused here)
    """
    LOG.debug("Building cache of replication-relevant snapshots in %s", region)

    # all replication-related snapshots will have one or the other of these tags
    found_snapshots = {}

    region_owner_ids = get_owner_id(region)
    for tag in tags:
        found_snapshots[tag] = []

        params = {
            'Filters': [{
                'Name': 'tag-key',
                'Values': [tag]
            }],
            'OwnerIds': region_owner_ids,
        }
        paginator = build_snapshot_paginator(params, region)

        for page in paginator:
            # bail out early if the Lambda is close to its deadline
            if timeout_check(context, 'perform_replication'):
                break

            # BUG FIX: was `not page and 'Snapshots' not in page`, which is
            # almost always False; we want to skip when the page is empty OR
            # is missing the 'Snapshots' key.
            if not page or 'Snapshots' not in page:
                continue

            for snapshot in page['Snapshots']:
                found_snapshots[tag].append(snapshot)
                if timeout_check(context, 'perform_replication'):
                    break

    return found_snapshots
示例#2
0
def build_replication_cache(context, tags, configurations, region,
                            installed_region):
    """Build a giant cache of replication-relevant snapshots for region.

    Returns a dict mapping each tag name in ``tags`` to the list of
    snapshots in ``region`` (owned by this account) carrying that tag.

    :param context: Lambda context object, used only for timeout checks
    :param tags: tag keys to collect snapshots for
    :param configurations: configuration rules (unused here, kept for API parity)
    :param region: AWS region to scan for snapshots
    :param installed_region: region holding the DynamoDB config (unused here)
    """
    LOG.debug("Building cache of replication-relevant snapshots in %s", region)

    # all replication-related snapshots will have one or the other of these tags
    found_snapshots = {}

    ec2 = boto3.client('ec2', region_name=region)
    region_owner_ids = get_owner_id(region)
    for tag in tags:
        found_snapshots[tag] = []

        paginator = ec2.get_paginator('describe_snapshots')
        operation_parameters = {
            'Filters': [{
                'Name': 'tag-key',
                'Values': [tag]
            }],
            'OwnerIds': region_owner_ids,
        }
        sleep(1)  # help w/ API limits
        for page in paginator.paginate(**operation_parameters):
            # bail out early if the Lambda is close to its deadline
            if timeout_check(context, 'perform_replication'):
                break

            # BUG FIX: was `not page and 'Snapshots' not in page`, which is
            # almost always False; we want to skip when the page is empty OR
            # is missing the 'Snapshots' key.
            if not page or 'Snapshots' not in page:
                continue

            for snapshot in page['Snapshots']:
                found_snapshots[tag].append(snapshot)
                if timeout_check(context, 'perform_replication'):
                    break

    return found_snapshots
示例#3
0
def perform_snapshot(context, region, installed_region='us-east-1'):
    """Check the region and instance, and see if we should take any snapshots.

    For every configured, non-ignored instance in ``region``, examine each
    attached EBS volume and create a snapshot (tagged with a DeleteOn date
    derived from the retention setting) whenever the configured frequency
    says one is due.

    :param context: Lambda context object, used for timeout checks
    :param region: AWS region to scan for instances and volumes
    :param installed_region: region holding the DynamoDB configuration table
    """
    LOG.info('Reviewing snapshots in region %s', region)

    # fetch these, in case we need to figure out what applies to an instance
    configurations = dynamo.list_configurations(context, installed_region)
    LOG.debug('Fetched all possible configuration rules from DynamoDB')

    # build a list of any IDs (anywhere) that we should ignore
    ignore_ids = utils.build_ignore_list(configurations)

    # setup some lookup tables
    cache_data = utils.build_cache_maps(context, configurations, region, installed_region)
    all_instances = cache_data['instance_id_to_data']
    instance_configs = cache_data['instance_id_to_config']
    volume_snap_recent = cache_data['volume_id_to_most_recent_snapshot_date']

    for instance_id in set(all_instances.keys()):
        # before we go do some work
        if timeout_check(context, 'perform_snapshot'):
            break

        if instance_id in ignore_ids:
            continue

        # NOTE(review): assumes every cached instance also has a cached
        # config (build_cache_maps appears to populate both together);
        # a KeyError here would indicate a cache inconsistency
        snapshot_settings = instance_configs[instance_id]

        # parse out snapshot settings
        retention, frequency = utils.parse_snapshot_settings(snapshot_settings)

        # grab the data about this instance id, if we don't already have it
        instance_data = all_instances[instance_id]

        ami_id = instance_data['ImageId']
        LOG.info('Reviewing snapshots in region %s on instance %s', region, instance_id)

        for dev in instance_data.get('BlockDeviceMappings', []):
            # before we go make a bunch more API calls
            if timeout_check(context, 'perform_snapshot'):
                break

            # we probably should have been using volume keys from one of the
            # caches here, but since we're not, we're going to have to check here too
            LOG.debug('Considering device %s', dev)
            volume_id = dev['Ebs']['VolumeId']

            if volume_id in ignore_ids:
                continue

            # find snapshots
            recent = volume_snap_recent.get(volume_id)
            # timezone-aware "now" so comparisons against AWS timestamps work
            now = datetime.datetime.now(dateutil.tz.tzutc())

            # snapshot due?
            if should_perform_snapshot(frequency, now, volume_id, recent):
                LOG.debug('Performing snapshot for %s, calculating tags', volume_id)
            else:
                LOG.debug('NOT Performing snapshot for %s', volume_id)
                continue

            # perform actual snapshot and create tag: retention + now() as a Y-M-D
            delete_on_dt = now + retention
            delete_on = delete_on_dt.strftime('%Y-%m-%d')

            # merge instance tags and volume tags onto the new snapshot
            volume_data = utils.get_volume(volume_id, region=region)
            expected_tags = utils.calculate_relevant_tags(
                instance_data.get('Tags', None),
                volume_data.get('Tags', None))

            utils.snapshot_and_tag(
                instance_id,
                ami_id,
                volume_id,
                delete_on,
                region,
                additional_tags=expected_tags)
示例#4
0
def clean_snapshot(context,
                   region,
                   default_min_snaps=5,
                   installed_region='us-east-1'):
    """Check the region see if we should clean up any snapshots.

    Deletes snapshots whose DeleteOn tag falls within the last seven days,
    unless the owning volume would drop below its configured minimum
    snapshot count (or ``default_min_snaps`` when no configuration applies).

    :param context: Lambda context object, used for timeout checks
    :param region: AWS region whose snapshots are examined
    :param default_min_snaps: minimum snapshots to retain per volume when no
        per-instance configuration supplies one
    :param installed_region: region holding the DynamoDB configuration table
    """
    LOG.info('clean_snapshot in region %s', region)

    # fetch these, in case we need to figure out what applies to an instance
    configurations = dynamo.list_configurations(context, installed_region)
    LOG.debug('Fetched all possible configuration rules from DynamoDB')

    # build a list of any IDs (anywhere) that we should ignore
    ignore_ids = utils.build_ignore_list(configurations)

    # figure out if we're in an account-wide mode where we ignore retention and
    # destroy all snapshots with a delete_on value that we want to delete
    ignore_retention_enabled = utils.ignore_retention_enabled(configurations)

    cache_data = utils.build_cache_maps(context, configurations, region,
                                        installed_region)
    instance_configs = cache_data['instance_id_to_config']
    all_volumes = cache_data['volume_id_to_instance_id']
    volume_snap_count = cache_data['volume_id_to_snapshot_count']

    # figure out what dates we want to nuke
    today = datetime.date.today()
    delete_on_values = []
    for i in range(0, 8):  # seven days ago until today
        del_date = today + timedelta(days=-i)
        delete_on_values.append(del_date.strftime('%Y-%m-%d'))

    # setup counters before we start
    deleted_count = 0

    # setup our filters
    filters = [
        {
            'Name': 'tag-key',
            'Values': ['DeleteOn']
        },
        {
            'Name': 'tag-value',
            'Values': delete_on_values
        },
    ]
    params = {'Filters': filters}

    # paginate the snapshot list
    tag_paginator = utils.build_snapshot_paginator(params, region)
    for page in tag_paginator:
        # stop if we're running out of time
        if timeout_check(context, 'clean_snapshot'):
            break

        # BUG FIX: was `not page and 'Snapshots' not in page`, which is
        # almost always False; skip a page that is empty OR missing the key
        if not page or 'Snapshots' not in page:
            continue

        for snap in page['Snapshots']:
            # stop if we're running out of time
            if timeout_check(context, 'clean_snapshot'):
                break

            # ugly comprehension to strip out a tag
            delete_on = [
                r['Value'] for r in snap['Tags'] if r.get('Key') == 'DeleteOn'
            ][0]

            # volume for snapshot
            snapshot_volume = snap['VolumeId']
            minimum_snaps = default_min_snaps

            if snapshot_volume in ignore_ids:
                continue

            try:
                # given volume id, get the instance for it
                volume_instance = all_volumes.get(snapshot_volume, None)

                # minimum required
                if volume_instance is not None:
                    snapshot_settings = instance_configs.get(
                        volume_instance, None)
                    if snapshot_settings is not None:
                        try:
                            minimum_snaps = int(
                                snapshot_settings['snapshot']['minimum'])
                        except ValueError:
                            raise Exception(
                                "Minimum number of snaps configured is not an integer."
                            )

                # current number of snapshots
                if snapshot_volume in volume_snap_count:
                    no_snaps = volume_snap_count[snapshot_volume]
                else:
                    raise Exception(
                        'Could not count snapshots, missing volume')

                # if we have less than the minimum, don't delete this one
                if no_snaps <= minimum_snaps:
                    LOG.warn('Not deleting snapshot %s from %s (%s)',
                             snap['SnapshotId'], region, delete_on)
                    LOG.warn('Only %s snapshots exist, below minimum of %s',
                             no_snaps, minimum_snaps)
                    continue

            # BUG FIX: bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; the deliberate best-effort behavior only
            # needs Exception
            except Exception:
                # if we couldn't figure out a minimum of snapshots,
                # don't clean this up -- these could be orphaned snapshots
                LOG.warn(
                    'Error analyzing snapshot %s from %s, skipping... (%s)',
                    snap['SnapshotId'], region, delete_on)

                # skip this loop iteration unless ignore_retention_enabled
                if not ignore_retention_enabled:
                    continue

            log_snapcount = volume_snap_count.get(snapshot_volume, 'unknown') \
                if volume_snap_count else None

            LOG.warn('Deleting snapshot %s from %s (%s, count=%s > %s)',
                     snap['SnapshotId'], region, delete_on, log_snapcount,
                     minimum_snaps)
            deleted_count += utils.delete_snapshot(snap['SnapshotId'], region)

    if deleted_count <= 0:
        LOG.warn('No snapshots were cleaned up for the entire region %s',
                 region)
    else:
        LOG.info(
            'Function clean_snapshots_tagged completed, deleted count: %s',
            str(deleted_count))

    LOG.info('Function clean_snapshot completed')
示例#5
0
def perform_replication(context, region, installed_region='us-east-1'):
    """Check the region and instance, and see if we should clean or create copies.

    Two phases: (1) delete local copies whose original source snapshot no
    longer exists in its source region; (2) copy local snapshots tagged
    with a replication_dst_region to that destination if no replica exists
    there yet.

    :param context: Lambda context object, used for timeout checks
    :param region: AWS region being processed
    :param installed_region: region holding the DynamoDB configuration table
    """
    LOG.info('Performing snapshot replication in region %s', region)

    # TL;DR -- always try to clean up first, before making new copies.

    # build a list of ignore IDs, just in case they are relevant here
    configurations = dynamo.list_configurations(context, installed_region)
    ignore_ids = utils.build_ignore_list(configurations)
    LOG.debug('Fetched all configured ignored IDs rules from DynamoDB')

    # 1. collect snapshots from this region
    relevant_tags = ['replication_src_region', 'replication_dst_region']
    found_snapshots = utils.build_replication_cache(context, relevant_tags,
                                                    configurations, region,
                                                    installed_region)

    # 2. evaluate snapshots that were copied to this region, if source not found, delete
    for snapshot in found_snapshots.get('replication_src_region', []):
        snapshot_id = snapshot['SnapshotId']
        snapshot_description = snapshot['Description']

        if timeout_check(context, 'perform_replication'):
            break

        if snapshot_id in ignore_ids:
            continue

        # in-flight or failed snapshots can't be safely evaluated yet
        if snapshot['State'] in ['pending', 'error']:
            LOG.warn('Skip cleaning up this snapshot ' + snapshot_id +
                     ' due to ' + snapshot['State'] + ' state: ' +
                     snapshot_description)
            continue

        LOG.info('Working on cleaning up this snapshot ' + snapshot_id +
                 ' (if needed): ' + snapshot_description)

        # what region did this come from?
        # NOTE(review): assumes the replication_src_region tag is present
        # (the cache was built by filtering on it); [0] would IndexError
        # otherwise
        tag_pairs = snapshot.get('Tags', [])
        region_tag_pair = [
            x for x in tag_pairs
            if x.get('Key', None) == 'replication_src_region'
        ]
        region_tag_value = region_tag_pair[0].get('Value')

        # what snapshot id did this come from?
        snapshotid_tag_pair = [
            x for x in tag_pairs
            if x.get('Key', None) == 'replication_snapshot_id'
        ]
        snapshotid_tag_value = snapshotid_tag_pair[0].get('Value')

        # ask the source region whether the original snapshot still exists
        ec2_source = boto3.client('ec2', region_name=region_tag_value)
        try:
            found_originals = ec2_source.describe_snapshots(
                SnapshotIds=[snapshotid_tag_value
                             ],  # we think the original snapshot id is this
                Filters=[
                    # where it gets copied to should be us
                    {
                        'Name': 'tag:replication_dst_region',
                        'Values': [region]
                    },
                ])
        except Exception as err:
            # a missing snapshot id raises rather than returning empty;
            # normalize to an empty result set
            if 'InvalidSnapshot.NotFound' in str(err):
                found_originals = {'Snapshots': []}
            else:
                raise err

        num_found = len(found_originals.get('Snapshots', []))
        if num_found > 0:
            LOG.info('Not removing this snapshot ' + snapshot_id + ' from ' +
                     region + ' since snapshot_id ' + snapshotid_tag_value +
                     ' was already found in ' + region_tag_value)
            continue

        # ax it!
        LOG.warn('Removing this snapshot ' + snapshot_id + ' from ' + region +
                 ' since snapshot_id ' + snapshotid_tag_value +
                 ' was not found in ' + region_tag_value)
        utils.delete_snapshot(snapshot_id, region)

    # 3. evaluate snapshots that should be copied from this region, if dest not found, copy and tag
    for snapshot in found_snapshots.get('replication_dst_region', []):
        snapshot_id = snapshot['SnapshotId']
        snapshot_description = snapshot['Description']

        if timeout_check(context, 'perform_replication'):
            break

        if snapshot_id in ignore_ids:
            continue

        if snapshot['State'] in ['pending', 'error']:
            LOG.warn('Skip copying this snapshot ' + snapshot_id + ' due to ' +
                     snapshot['State'] + ' state: ' + snapshot_description)
            continue

        LOG.info('Working on copying this snapshot ' + snapshot_id +
                 ' (if needed): ' + snapshot_description)

        # what region should this be mapped to?
        tag_pairs = snapshot.get('Tags', [])
        region_tag_pair = [
            x for x in tag_pairs
            if x.get('Key', None) == 'replication_dst_region'
        ]
        region_tag_value = region_tag_pair[0].get('Value')

        # does it already exist in the target region?
        ec2_destination = boto3.client('ec2', region_name=region_tag_value)
        found_replicas = ec2_destination.describe_snapshots(Filters=[
            # came from our region originally
            {
                'Name': 'tag:replication_src_region',
                'Values': [region]
            },

            # came from our snapshot originally
            {
                'Name': 'tag:replication_snapshot_id',
                'Values': [snapshot_id]
            }
        ])
        num_found = len(found_replicas.get('Snapshots', []))
        if num_found > 0:
            LOG.info('Not creating more snapshots, since snapshot_id ' +
                     snapshot_id + ' was already found in ' + region_tag_value)
            continue

        # we need to make one in the target region
        LOG.warn('Creating a new snapshot, since snapshot_id ' + snapshot_id +
                 ' was not already found in ' + region_tag_value)
        utils.copy_snapshot_and_tag(context, region, region_tag_value,
                                    snapshot_id, snapshot_description)
示例#6
0
def perform_replication(context, region, installed_region='us-east-1'):
    """Check the region and instance, and see if we should clean or create copies.

    Builds per-region snapshot-id caches up front (1a/1b) so existence
    checks in phases 2 and 3 are list lookups instead of per-snapshot API
    calls. Phase 2 deletes local copies whose source snapshot vanished;
    phase 3 copies local snapshots to their destination region if no
    replica exists there yet.

    :param context: Lambda context object, used for timeout checks
    :param region: AWS region being processed
    :param installed_region: region holding the DynamoDB configuration table
    """
    LOG.info('Performing snapshot replication in region %s', region)

    # TL;DR -- always try to clean up first, before making new copies.

    # build a list of ignore IDs, just in case they are relevant here
    configurations = dynamo.list_configurations(context, installed_region)
    ignore_ids = utils.build_ignore_list(configurations)
    LOG.debug('Fetched all configured ignored IDs rules from DynamoDB')

    # 1. collect snapshots from this region
    snap_cached_src_regions = []
    snap_cached_dst_regions = []
    src_snap_list = []
    replication_snap_list = []
    relevant_tags = ['replication_src_region', 'replication_dst_region']
    found_snapshots = utils.build_replication_cache(
        context,
        relevant_tags,
        configurations,
        region,
        installed_region
    )
    # 1a. build snapshot cache from all source regions
    for snapshot_regions in found_snapshots.get('replication_src_region', []):
        # what region did this come from?
        tag_pairs = snapshot_regions.get('Tags', [])
        region_tag_pair = [x for x in tag_pairs if x.get('Key') == 'replication_src_region']
        region_tag_value = region_tag_pair[0].get('Value')
        if region_tag_value not in snap_cached_src_regions:
            LOG.info('Caching snapshots in source region: %s', region_tag_value)
            snap_cached_src_regions.append(region_tag_value)

            ec2_source = boto3.client('ec2', region_name=region_tag_value)
            try:
                response = ec2_source.describe_snapshots(
                    Filters=[{'Name': 'tag:replication_dst_region', 'Values': [region]}]
                )
                mysnaps = response['Snapshots']
            except Exception as err:
                if 'InvalidSnapshot.NotFound' in str(err):
                    # BUG FIX: was `{'Snapshots', []}` -- a set literal
                    # containing an unhashable list, which raises TypeError.
                    # mysnaps is iterated as a list of snapshots, so the
                    # empty fallback is simply []
                    mysnaps = []
                else:
                    raise err

            for snap in mysnaps:
                src_snap_list.append(snap['SnapshotId'])

            LOG.info('Caching completed for source region: ' + region_tag_value + ': cache size: ' +
                     str(len(src_snap_list)))
            sleep(1)  # help w/ API limits

    # 1b. build snapshot cache for all destination regions
    for snapshot_regions in found_snapshots.get('replication_dst_region', []):
        # which region is destination
        tag_pairs = snapshot_regions.get('Tags', [])
        region_tag_pair = [x for x in tag_pairs if x.get('Key') == 'replication_dst_region']
        region_tag_value = region_tag_pair[0].get('Value')
        if region_tag_value not in snap_cached_dst_regions:
            LOG.info('Caching snapshots in destination region: %s', region_tag_value)
            snap_cached_dst_regions.append(region_tag_value)

            ec2_source = boto3.client('ec2', region_name=region_tag_value)
            try:
                response = ec2_source.describe_snapshots(
                    Filters=[{'Name': 'tag:replication_src_region', 'Values': [region]}]
                )
                mysnaps = response['Snapshots']
            except Exception as err:
                if 'InvalidSnapshot.NotFound' in str(err):
                    # BUG FIX: same set-literal mistake as in 1a; use an
                    # empty list so the loop below is a harmless no-op
                    mysnaps = []
                else:
                    raise err

            for snap in mysnaps:
                # ROBUSTNESS: a replica may have no tags at all; don't KeyError
                for tags in snap.get('Tags', []):
                    if tags["Key"] == 'replication_snapshot_id':
                        replication_snap_list.append(tags["Value"])

            LOG.info('Caching completed for destination region: ' + region_tag_value +
                     ': cache size: ' + str(len(replication_snap_list)))
            sleep(1)  # help w/ API limits

    # 2. evaluate snapshots that were copied to this region, if source not found, delete
    for snapshot in found_snapshots.get('replication_src_region', []):
        snapshot_id = snapshot['SnapshotId']
        snapshot_description = snapshot['Description']

        if timeout_check(context, 'perform_replication'):
            break

        if snapshot_id in ignore_ids:
            continue

        # in-flight or failed snapshots can't be safely evaluated yet
        if snapshot['State'] in ['pending', 'error']:
            LOG.warn('Skip cleaning up this snapshot ' + snapshot_id +
                     ' due to ' + snapshot['State'] + ' state: ' + snapshot_description)
            continue

        LOG.info('Working on cleaning up this snapshot ' + snapshot_id +
                 ' (if needed): ' + snapshot_description)

        # what region did this come from?
        tag_pairs = snapshot.get('Tags', [])
        region_tag_pair = [x for x in tag_pairs if x.get('Key') == 'replication_src_region']
        region_tag_value = region_tag_pair[0].get('Value')

        # what snapshot id did this come from?
        snapshotid_tag_pair = [x for x in tag_pairs if x.get('Key') == 'replication_snapshot_id']
        snapshotid_tag_value = snapshotid_tag_pair[0].get('Value')

        if snapshotid_tag_value in src_snap_list:
            LOG.info('Not removing this snapshot ' + snapshot_id + ' from ' + region +
                     ' since snapshot_id ' + snapshotid_tag_value +
                     ' was found in ' + region_tag_value)
            continue

        # ax it!
        LOG.warn('Removing this snapshot ' + snapshot_id + ' from ' + region +
                 ' since snapshot_id ' + snapshotid_tag_value +
                 ' was not found in ' + region_tag_value)
        utils.delete_snapshot(snapshot_id, region)
        sleep(2)  # help w/ API limits

    # 3. evaluate snapshots that should be copied from this region, if dest not found, copy and tag
    for snapshot in found_snapshots.get('replication_dst_region', []):
        snapshot_id = snapshot['SnapshotId']
        snapshot_description = snapshot['Description']

        if timeout_check(context, 'perform_replication'):
            break

        if snapshot_id in ignore_ids:
            continue

        if snapshot['State'] in ['pending', 'error']:
            LOG.warn('Skip copying this snapshot ' + snapshot_id +
                     ' due to ' + snapshot['State'] + ' state: ' + snapshot_description)
            continue

        LOG.info('Working on copying this snapshot ' + snapshot_id +
                 ' (if needed): ' + snapshot_description)

        # what region should this be mapped to?
        tag_pairs = snapshot.get('Tags', [])
        region_tag_pair = [x for x in tag_pairs if x.get('Key') == 'replication_dst_region']
        region_tag_value = region_tag_pair[0].get('Value')

        name_tag_pair = [x for x in tag_pairs if x.get('Key') == 'Name']
        name_tag_pair.append({})  # Adds empty dictionary to list in event no Name tag is present
        name_tag_value = name_tag_pair[0].get('Value')

        # does it already exist in the target region?
        if snapshot_id in replication_snap_list:
            LOG.info('Not creating more snapshots, since snapshot_id ' + snapshot_id +
                     ' was already found in ' + region_tag_value)
            continue

        # we need to make one in the target region
        LOG.warn('Creating a new snapshot, since snapshot_id ' + snapshot_id +
                 ' was not already found in ' + region_tag_value)
        utils.copy_snapshot_and_tag(
            context,
            region,
            region_tag_value,
            name_tag_value,
            snapshot_id,
            snapshot_description)
示例#7
0
def build_cache_maps(context, configurations, region, installed_region):
    """Build a giant cache of instances, volumes, snapshots for region.

    Returns a dict of six lookup tables: instance data/config by instance
    id, instance id by volume id, and (computed in a thread pool) snapshot
    data, per-volume snapshot counts, and most-recent snapshot dates.

    :param context: Lambda context object, used for timeout checks
    :param configurations: DynamoDB configuration rules to match against
    :param region: AWS region to inventory
    :param installed_region: region holding the DynamoDB table (unused here)
    """
    LOG.info("Building cache of instance, volume, and snapshots in %s", region)
    LOG.info("This may take a while...")
    cache_data = {
        # calculated here locally
        'instance_id_to_data': {},
        'instance_id_to_config': {},
        'volume_id_to_instance_id': {},

        # calculated w/ multiprocessing module
        'snapshot_id_to_data': {},
        'volume_id_to_snapshot_count': {},
        'volume_id_to_most_recent_snapshot_date': {},
    }

    # build an EC2 client, we're going to need it
    ec2 = boto3.client('ec2', region_name=region)

    if len(configurations) <= 0:
        LOG.info('No configurations found in %s, not building cache', region)
        return cache_data

    # populate them
    LOG.info("Retrieved %s DynamoDB configurations for caching",
             str(len(configurations)))

    # build a list of any IDs (anywhere) that we should ignore
    ignore_ids = build_ignore_list(configurations)

    for config in configurations:
        # stop if we're running out of time
        if ebs_snapper.timeout_check(context, 'build_cache_maps'):
            break

        # if it's missing the match section, ignore it
        if not validate_snapshot_settings(config):
            continue

        # build a boto3 filter to describe instances with
        configuration_matches = config['match']
        filters = convert_configurations_to_boto_filter(configuration_matches)

        # if we ended up with no boto3 filters, we bail so we don't snapshot everything
        if len(filters) <= 0:
            LOG.warn('Could not convert configuration match to a filter: %s',
                     configuration_matches)
            continue

        filters.append({
            'Name': 'instance-state-name',
            'Values': ['running', 'stopped']
        })
        instances = ec2.describe_instances(Filters=filters)
        res_list = instances.get('Reservations', [])
        random.shuffle(res_list)  # attempt to randomize order, for timeouts

        for reservation in res_list:
            inst_list = reservation.get('Instances', [])
            random.shuffle(
                inst_list)  # attempt to randomize order, for timeouts

            for instance_data in inst_list:
                instance_id = instance_data['InstanceId']

                # skip if we're ignoring this
                if instance_id in ignore_ids:
                    continue

                cache_data['instance_id_to_config'][instance_id] = config
                cache_data['instance_id_to_data'][instance_id] = instance_data
                for dev in instance_data.get('BlockDeviceMappings', []):
                    vid = dev['Ebs']['VolumeId']

                    # skip if we're ignoring this
                    if vid in ignore_ids:
                        continue

                    cache_data['volume_id_to_instance_id'][vid] = instance_id

    LOG.info("Retrieved %s instances for caching",
             str(len(cache_data['instance_id_to_data'].keys())))

    # look at each volume, get snapshots and count / most recent, and map to instance
    # BUG FIX: was `.keys()[:]`, which raises TypeError on Python 3 (dict
    # views aren't sliceable); list() copies the keys on both Python 2 and 3
    process_volumes = list(cache_data['volume_id_to_instance_id'].keys())
    LOG.info("Retrieved %s volumes for caching", str(len(process_volumes)))

    # chunk the volume ids into batches of 25 for the worker pool
    chunked_work = []
    while len(process_volumes) > 0:
        popped = process_volumes[:25]
        del process_volumes[:25]
        chunked_work.append(popped)

    LOG.debug('Split out volume work into %s lists, pulling snapshots...',
              str(len(chunked_work)))

    if len(chunked_work) > 0:
        # fan the chunks out over a small thread pool; each worker returns
        # partial maps that we merge back into cache_data
        f = functools.partial(chunk_volume_work, region)
        pool = ThreadPool(processes=4)
        results = pool.map(f, chunked_work)
        pool.close()
        pool.join()

        keys = [
            'volume_id_to_most_recent_snapshot_date',
            'volume_id_to_snapshot_count', 'snapshot_id_to_data'
        ]
        for result_chunk in results:
            for k in keys:
                cache_data[k].update(result_chunk[k])

    LOG.info("Retrieved %s snapshots for caching",
             str(len(cache_data['snapshot_id_to_data'])))

    return cache_data