import os
import sqlite3

from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

from dateutil import tz

# Package-level helpers (local_session, SessionFactory, get_records,
# get_traildb, get_es_client, fetch_events, index_events, dict_factory,
# valid_date, _timestamp_from_string) and the module logger `log` are
# provided by the surrounding modules.


def record_set(session_factory, bucket, key_prefix, start_date):
    """Retrieve all s3 records for the given policy output url
    from the given start date.
    """
    s3 = local_session(session_factory).client('s3')
    records = []
    key_count = 0

    # Start listing at the first day's resources.json.gz and page forward.
    marker = key_prefix.strip("/") + "/" + start_date.strftime(
        '%Y/%m/%d/00') + "/resources.json.gz"
    p = s3.get_paginator('list_objects').paginate(
        Bucket=bucket,
        Prefix=key_prefix.strip('/') + '/',
        Marker=marker)

    with ThreadPoolExecutor(max_workers=20) as w:
        for key_set in p:
            if 'Contents' not in key_set:
                continue
            keys = [k for k in key_set['Contents']
                    if k['Key'].endswith('resources.json.gz')]
            key_count += len(keys)
            # Submit one download per key and collect results as they complete.
            futures = map(
                lambda k: w.submit(get_records, bucket, k, session_factory),
                keys)
            for f in as_completed(futures):
                records.extend(f.result())

    log.info("Fetched %d records across %d files" % (
        len(records), key_count))
    return records
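# Usage sketch for record_set (hypothetical values): pull every
# resources.json.gz record a policy wrote since a given day. The region,
# profile, bucket, and key prefix below are illustrative placeholders,
# not values from this repository.
#
#   factory = SessionFactory('us-east-1', profile='example-profile')
#   records = record_set(
#       factory, 'example-output-bucket', 'policies/ec2-tag-compliance',
#       datetime(2018, 6, 1))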
def index_account_trails(config, account, region, date, directory):
    """Download the account's trail databases for the given date
    and index their events into ElasticSearch.
    """
    es_client = get_es_client(config)
    s3 = local_session(
        lambda: SessionFactory(
            region,
            profile=account.get('profile'),
            assume_role=account.get('role'))()).client('s3')

    bucket = account['bucket']
    key_prefix = "accounts/{}/{}/traildb".format(account['name'], region)
    marker = "{}/{}/trail.db.bz2".format(key_prefix, date)

    p = s3.get_paginator('list_objects_v2').paginate(
        Bucket=bucket,
        Prefix=key_prefix,
        StartAfter=marker,
    )

    with ThreadPoolExecutor(max_workers=20) as w:
        for key_set in p:
            if 'Contents' not in key_set:
                continue
            keys = []
            for k in key_set['Contents']:
                if (k['Key'].endswith('trail.db.bz2')
                        and valid_date(k['Key'], date)):
                    keys.append(k)
            # Download each trail database concurrently, then index its
            # events and remove the local copy.
            futures = map(
                lambda k: w.submit(
                    get_traildb, bucket, k,
                    lambda: SessionFactory(
                        region,
                        profile=account.get('profile'),
                        assume_role=account.get('role'))(),
                    directory),
                keys)
            for f in as_completed(futures):
                local_db_file = f.result()
                connection = sqlite3.connect(local_db_file)
                connection.row_factory = dict_factory
                cursor = connection.cursor()
                index_events(
                    es_client, fetch_events(cursor, config, account['name']))
                connection.close()
                try:
                    os.remove(local_db_file)
                except OSError:
                    log.warning("Failed to remove temporary file: {}".format(
                        local_db_file))
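# Usage sketch for index_account_trails (hypothetical values): index one
# account/region/day of trail data. The account keys mirror the lookups above
# ('name', 'bucket', and optional 'profile'/'role'); `config`, the date string
# format, and the scratch directory are placeholders for whatever the caller
# already uses with get_es_client / fetch_events.
#
#   account = {'name': 'dev', 'bucket': 'example-traildb-bucket',
#              'profile': 'example-profile'}
#   index_account_trails(
#       config, account, 'us-east-1', '2018/06/01', '/tmp/traildb')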
def log_entries_from_s3(session_factory, output, start, end):
    """Retrieve custodian-run log records from the policy's s3 output
    between the given start and end times.
    """
    client = local_session(session_factory).client('s3')
    key_prefix = output.key_prefix.strip('/')
    local_tz = tz.tzlocal()
    start = datetime.fromtimestamp(
        _timestamp_from_string(start) / 1000)
    end = datetime.fromtimestamp(
        _timestamp_from_string(end) / 1000).replace(tzinfo=local_tz)
    records = []
    key_count = 0
    log_filename = 'custodian-run.log.gz'

    # Start listing at the first day's log key and page forward from there.
    marker = '{}/{}/{}'.format(
        key_prefix,
        start.strftime('%Y/%m/%d/00'),
        log_filename,
    )
    p = client.get_paginator('list_objects_v2').paginate(
        Bucket=output.bucket,
        Prefix=key_prefix + '/',
        StartAfter=marker,
    )

    with ThreadPoolExecutor(max_workers=20) as w:
        for key_set in p:
            if 'Contents' not in key_set:
                continue
            log_keys = [k for k in key_set['Contents']
                        if k['Key'].endswith(log_filename)]
            keys = [k for k in log_keys if k['LastModified'] < end]
            if len(log_keys) >= 1 and len(keys) == 0:
                # There were logs, but we're now past the end date.
                break
            key_count += len(keys)
            futures = map(
                lambda k: w.submit(
                    get_records, output.bucket, k, session_factory),
                keys,
            )
            for f in as_completed(futures):
                records.extend(f.result())

    log.info('Fetched {} records across {} files'.format(
        len(records), key_count))
    return records
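# Usage sketch for log_entries_from_s3 (hypothetical values): fetch the
# custodian-run.log.gz records written between two points in time. `output`
# is assumed to be the policy's s3 output object exposing `bucket` and
# `key_prefix` as used above, and the time strings are in whatever format
# _timestamp_from_string accepts; the values shown are illustrative only.
#
#   entries = log_entries_from_s3(
#       factory, policy_output, '2018/06/01 00:00', '2018/06/02 00:00')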