class CubeServices:
    """Utility wrapper around the AWS services (S3, SQS, Lambda, Kinesis,
    DynamoDB) and the STAC client used by the cube builder.

    See the illustrative usage sketch after this class definition.
    """

    def __init__(self, url_stac=None, bucket=None):
        # session = boto3.Session(profile_name='default')
        session = boto3.Session(aws_access_key_id=AWS_KEY_ID,
                                aws_secret_access_key=AWS_SECRET_KEY)

        # ---------------------------
        # AWS infrastructure
        self.S3client = session.client('s3')
        self.SQSclient = session.client('sqs')
        self.LAMBDAclient = session.client('lambda')
        self.Kinesisclient = session.client('kinesis')
        self.dynamoDBResource = session.resource('dynamodb')
        self.QueueUrl = {}
        self.bucket_name = bucket

        # ---------------------------
        # create / get DynamoDB tables
        self.get_dynamo_tables()

        # ---------------------------
        # create / get the SQS
        self.get_queue_url()

        # ---------------------------
        # init STAC instance
        self.url_stac = url_stac
        if url_stac:
            self.stac = STAC(url_stac)

    def get_s3_prefix(self, bucket):
        # prefix = 'https://s3.amazonaws.com/{}/'.format(bucket)
        prefix = 's3://{}/'.format(bucket)
        return prefix

    ## ----------------------
    # DYNAMO DB
    def get_dynamo_tables(self):
        # Create the cubeBuilderActivities table in DynamoDB to store all activities
        self.activitiesTable = self.dynamoDBResource.Table(DYNAMO_TB_ACTIVITY)
        table_exists = False
        try:
            self.activitiesTable.creation_date_time
            table_exists = True
        except Exception:
            table_exists = False

        if not table_exists:
            self.activitiesTable = self.dynamoDBResource.create_table(
                TableName=DYNAMO_TB_ACTIVITY,
                KeySchema=[
                    {'AttributeName': 'id', 'KeyType': 'HASH'},
                    {'AttributeName': 'sk', 'KeyType': 'RANGE'},
                ],
                AttributeDefinitions=[
                    {'AttributeName': 'id', 'AttributeType': 'S'},
                    {'AttributeName': 'sk', 'AttributeType': 'S'},
                ],
                BillingMode='PAY_PER_REQUEST',
            )
            # Wait until the table exists.
            self.dynamoDBResource.meta.client.get_waiter('table_exists').wait(
                TableName=DYNAMO_TB_ACTIVITY)

        # Create the cubeBuilderActivitiesControl table in DynamoDB to manage activities completion
        self.activitiesControlTable = self.dynamoDBResource.Table(DBNAME_TB_CONTROL)
        table_exists = False
        try:
            self.activitiesControlTable.creation_date_time
            table_exists = True
        except Exception:
            table_exists = False

        if not table_exists:
            self.activitiesControlTable = self.dynamoDBResource.create_table(
                TableName=DBNAME_TB_CONTROL,
                KeySchema=[
                    {'AttributeName': 'id', 'KeyType': 'HASH'},
                ],
                AttributeDefinitions=[
                    {'AttributeName': 'id', 'AttributeType': 'S'},
                ],
                ProvisionedThroughput={
                    'ReadCapacityUnits': 2,
                    'WriteCapacityUnits': 2
                })
            # Wait until the table exists.
            self.dynamoDBResource.meta.client.get_waiter('table_exists').wait(
                TableName=DBNAME_TB_CONTROL)

        # Create the cubeBuilderProcess table in DynamoDB to track data cube processes
        self.processTable = self.dynamoDBResource.Table(DBNAME_TB_PROCESS)
        table_exists = False
        try:
            self.processTable.creation_date_time
            table_exists = True
        except Exception:
            table_exists = False

        if not table_exists:
            self.processTable = self.dynamoDBResource.create_table(
                TableName=DBNAME_TB_PROCESS,
                KeySchema=[
                    {'AttributeName': 'id', 'KeyType': 'HASH'},
                ],
                AttributeDefinitions=[
                    {'AttributeName': 'id', 'AttributeType': 'S'},
                ],
                ProvisionedThroughput={
                    'ReadCapacityUnits': 2,
                    'WriteCapacityUnits': 2
                })
            # Wait until the table exists.
            self.dynamoDBResource.meta.client.get_waiter('table_exists').wait(
                TableName=DBNAME_TB_PROCESS)
    def get_activities(self):
        # self.activitiesTable.meta.client.delete_table(TableName=DYNAMO_TB_ACTIVITY)
        return self.activitiesTable.scan()

    def get_activities_ctrl(self):
        return self.activitiesControlTable.scan()

    def get_activities_by_key(self, dynamo_key):
        return self.activitiesTable.query(
            KeyConditionExpression=Key('id').eq(dynamo_key))

    def get_activity_item(self, query):
        return self.activitiesTable.get_item(Key=query)

    def get_process_by_id(self, process_id):
        return self.processTable.query(
            KeyConditionExpression=Key('id').eq(process_id))

    def get_process_by_datacube(self, datacube):
        return self.processTable.scan(
            FilterExpression=Key('datacube').eq(datacube))

    def get_cube_meta(self, cube):
        filters = Key('data_cube').eq(cube) & Key('id').begins_with('merge')
        return self.activitiesTable.scan(FilterExpression=filters)

    def get_all_items(self, filters):
        # Scan the activities table and follow LastEvaluatedKey to paginate
        # through every matching item.
        response = self.activitiesTable.scan(FilterExpression=filters,
                                             Limit=100000000)
        items = response['Items']

        while 'LastEvaluatedKey' in response:
            response = self.activitiesTable.scan(
                FilterExpression=filters,
                ExclusiveStartKey=response['LastEvaluatedKey'])
            items.extend(response['Items'])

        return items

    def get_merges(self, data_cube: str, tile_id: str, start: str, end: str):
        """List all merge activities used to build a data cube.

        Args:
            data_cube - Data cube name
            tile_id - Tile identifier
            start - Filter start date
            end - Filter end date
        """
        expression = Key('tile_id').eq(tile_id) & Key('period_start').between(start, end) & \
            Key('period_end').between(start, end) & Key('data_cube').eq(data_cube)

        return self.get_all_items(expression)

    def get_activities_by_datacube(self, data_cube: str):
        """List all activities used to build a data cube.

        Args:
            data_cube - Data cube name
        """
        expression = Key('data_cube').eq(data_cube)

        return self.get_all_items(expression)
    def put_activity(self, activity):
        self.activitiesTable.put_item(
            Item={
                'id': activity['dynamoKey'],
                'sk': activity['sk'],
                'tile_id': activity['tileid'],
                'period_start': activity['start'],
                'period_end': activity['end'],
                'data_cube': activity['datacube'],
                'mystatus': activity['mystatus'],
                'mylaunch': activity['mylaunch'],
                'mystart': activity['mystart'],
                'myend': activity['myend'],
                'efficacy': activity['efficacy'],
                'cloudratio': activity['cloudratio'],
                'instancesToBeDone': activity['instancesToBeDone'],
                'totalInstancesToBeDone': activity['totalInstancesToBeDone'],
                'activity': json.dumps(activity),
            })
        return True

    def put_process_table(self, key, datacube_id, i_datacube_id, infos):
        self.processTable.put_item(
            Item={
                'id': key,
                'datacube_id': datacube_id,
                'irregular_datacube_id': i_datacube_id,
                'infos': infos
            })
        return True

    def put_control_table(self, key, value):
        self.activitiesControlTable.put_item(Item={
            'id': key,
            'mycount': value,
        })
        return True

    def remove_control_by_key(self, key: str):
        try:
            self.activitiesControlTable.delete_item(Key=dict(id=key))
            return True
        except Exception:
            return False

    def remove_process_by_key(self, key: str):
        try:
            self.processTable.delete_item(Key=dict(id=key))
            return True
        except Exception:
            return False

    def update_control_table(self, Key, UpdateExpression, ExpressionAttributeNames,
                             ExpressionAttributeValues, ReturnValues):
        return self.activitiesControlTable.update_item(
            Key=Key,
            UpdateExpression=UpdateExpression,
            ExpressionAttributeNames=ExpressionAttributeNames,
            ExpressionAttributeValues=ExpressionAttributeValues,
            ReturnValues=ReturnValues)

    ## ----------------------
    # SQS
    def get_queue_url(self):
        # One queue per action; create any queue that does not exist yet.
        for action in ['merge', 'blend', 'posblend', 'publish']:
            queue = '{}-{}'.format(QUEUE_NAME, action)

            if self.QueueUrl.get(action, None) is not None:
                continue

            response = self.SQSclient.list_queues()
            q_exists = False
            if 'QueueUrls' in response:
                for qurl in response['QueueUrls']:
                    if qurl.find(queue) != -1:
                        q_exists = True
                        self.QueueUrl[action] = qurl

            if not q_exists:
                self.create_queue(True, action)

        return True

    def create_queue(self, create_mapping=False, action=''):
        """Create the SQS queue for an action and, optionally, its Lambda mapping.

        As the influx of messages to a queue increases, AWS Lambda automatically
        scales up polling activity until the number of concurrent function
        executions reaches 1000, the account concurrency limit, or the
        (optional) function concurrency limit, whichever is lower. Amazon
        Simple Queue Service supports an initial burst of 5 concurrent function
        invocations and increases concurrency by 60 concurrent invocations per
        minute. So, for example, if 1000 messages arrive at the queue at once,
        only 5 will be processed in the first minute, 65 Lambdas will run
        concurrently in the second minute, and so on (see the worked example
        after this method).
        """
        # Create a SQS queue for this action
        queue = '{}-{}'.format(QUEUE_NAME, action)
        response = self.SQSclient.create_queue(
            QueueName=queue,
            Attributes={'VisibilityTimeout': '500'})
        self.QueueUrl[action] = response['QueueUrl']

        # Get attributes
        attributes = self.SQSclient.get_queue_attributes(
            QueueUrl=self.QueueUrl[action],
            AttributeNames=['All'])
        QueueArn = attributes['Attributes']['QueueArn']

        # Create Source Mapping to Maestro from queue
        if create_mapping:
            response = self.LAMBDAclient.create_event_source_mapping(
                EventSourceArn=QueueArn,
                FunctionName=LAMBDA_FUNCTION_NAME,
                Enabled=True,
                BatchSize=1)
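
    # Worked example of the ramp described in create_queue(), assuming the
    # figures quoted in its docstring (initial burst of 5 concurrent
    # invocations, +60 per minute, capped at 1000): with 1000 messages
    # enqueued at once, concurrency grows roughly as 5, 65, 125, ...
    # (5 + 60 * elapsed minutes), so the 1000-invocation ceiling is only
    # reached around the 18th minute.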

    def send_to_sqs(self, activity):
        if self.get_queue_url():
            action = activity['action']
            self.SQSclient.send_message(QueueUrl=self.QueueUrl[action],
                                        MessageBody=json.dumps(activity))

    ## ----------------------
    # Kinesis
    def put_item_kinesis(self, activity):
        activity['channel'] = 'kinesis'
        activity['db'] = 'dynamodb'

        status = self.sendToKinesis(activity)

        del activity['channel']
        del activity['db']
        return status

    def sendToKinesis(self, activity):
        self.Kinesisclient.put_record(StreamName=KINESIS_NAME,
                                      Data=json.dumps(activity),
                                      PartitionKey='dsKinesis')
        return True

    ## ----------------------
    # STAC
    def get_collection_stac(self, collection_id):
        _ = self.stac.catalog
        return self.stac.collection(collection_id)

    def _parse_stac_result(self, items, dataset, bands, quality_band):
        """Parse a STAC search result into the nested scenes dictionary."""
        scenes = dict()

        for f in items['features']:
            if f['type'] == 'Feature':
                item_id = f['id']
                date = f['properties']['datetime']

                # Get file link and name
                assets = f['assets']
                for band in bands:
                    band_obj = assets.get(band, None)
                    if not band_obj:
                        continue

                    scenes[band] = scenes.get(band, {})
                    scenes[band][dataset] = scenes[band].get(dataset, {})

                    scene = {}
                    scene['sceneid'] = item_id
                    scene['date'] = date
                    scene['band'] = band
                    scene['link'] = band_obj['href']

                    if dataset == 'MOD13Q1' and band == quality_band:
                        scene['link'] = scene['link'].replace(
                            quality_band, 'reliability')

                    # TODO: verify if scene['link'] exists
                    if date not in scenes[band][dataset]:
                        scenes[band][dataset][date] = []
                    scenes[band][dataset][date].append(scene)

        return scenes
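
    # Shape of the dictionary produced by _parse_stac_result() and consumed by
    # search_STAC() below (keys and values are illustrative):
    #
    #   scenes[band][dataset][date] = [
    #       {'sceneid': ..., 'date': ..., 'band': ..., 'link': ...},
    #       ...
    #   ]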
""" # Get DATACUBE params _ = self.stac.catalog bands = activity['bands'] datasets = activity['datasets'] bbox_feature = activity['bbox'] time = '{}/{}'.format(activity['start'], activity['end']) scenes = {} filter_opts = dict(datetime=time, intersects=bbox_feature, limit=10000) for dataset in datasets: filter_opts['collections'] = [dataset] items = self.stac.search(filter=filter_opts) scenes.update(**self._parse_stac_result(items, dataset, bands, activity['quality_band'])) if extra_catalogs: for catalog in extra_catalogs: stac_url = catalog['stac_url'] stac_token = catalog.get('token') stac_dataset = catalog['dataset'] filter_opts['collections'] = [stac_dataset] stac = STAC(stac_url, access_token=stac_token) items = stac.search(filter=filter_opts) res = self._parse_stac_result(items, stac_dataset, bands, activity['quality_band']) for band, datasets in res.items(): internal_dataset = list(datasets.keys())[0] scenes[band][dataset].update(datasets[internal_dataset]) return scenes ## ---------------------- # S3 def create_bucket(self, name, requester_pay=True): try: # Create a bucket with public access response = self.S3client.create_bucket(ACL='public-read', Bucket=name) if requester_pay: response = self.S3client.put_bucket_request_payment( Bucket=name, RequestPaymentConfiguration={'Payer': 'Requester'}) assert response['ResponseMetadata']['HTTPStatusCode'] == 200 return True except ClientError: return False return True def s3_file_exists(self, bucket_name=None, key=''): try: if not bucket_name: bucket_name = self.bucket_name return self.S3client.head_object(Bucket=bucket_name, Key=key) except ClientError: return False def get_object(self, key, bucket_name=None): return self.S3client.get_object(Bucket=bucket_name, Key=key) def delete_file_S3(self, bucket_name=None, key=''): try: if not bucket_name: bucket_name = self.bucket_name self.S3client.delete_object(Bucket=bucket_name, Key=key) except ClientError: return False return True def save_file_S3(self, bucket_name=None, key='', activity={}): if not bucket_name: bucket_name = self.bucket_name return self.S3client.put_object( Bucket=bucket_name, Key=key, Body=(bytes(json.dumps(activity).encode('UTF-8')))) def upload_file_S3(self, memfile, key, args, bucket_name=None): if not bucket_name: bucket_name = self.bucket_name return self.S3client.upload_file(memfile, Bucket=bucket_name, Key=key, ExtraArgs=args) def upload_fileobj_S3(self, memfile, key, args, bucket_name=None): if not bucket_name: bucket_name = self.bucket_name return self.S3client.upload_fileobj(memfile, Bucket=bucket_name, Key=key, ExtraArgs=args) def list_repositories(self): return [ bucket['Name'] for bucket in self.S3client.list_buckets()['Buckets'] ]


class EarthSearch(BaseProvider):
    """Define a simple abstraction of Provider for Element84.

    It was designed to download Sentinel-2 COGS from
    `Sentinel-2 Cloud-Optimized GeoTIFFs <https://registry.opendata.aws/sentinel-2-l2a-cogs/>`_.
    """

    def __init__(self, **kwargs):
        """Build STAC provider for Element84."""
        access_token = kwargs.pop('access_token', None)
        self.kwargs = kwargs
        self.api = STAC('https://earth-search.aws.element84.com/v0',
                        access_token=access_token)
        self.progress = kwargs.get('progress')

    def search(self, query, *args, **kwargs) -> List[SceneResult]:
        """Search for scenes in STAC."""
        options = dict()

        if 'start_date' in kwargs:
            options['time'] = f'{kwargs.get("start_date")}/{kwargs.get("end_date")}'

        if 'bbox' in kwargs:
            options['intersects'] = mapping(box(*kwargs['bbox']))

        options['collection'] = query

        res = self.api.search(filter=options)

        # TODO: Implement next page as iterator or check stac.py support
        return [
            SceneResult(scene_id=f['properties']['sentinel:product_id'],
                        cloud_cover=f['properties']['sentinel:cloud_cover'],
                        **f)
            for f in res['features']
        ]

    @staticmethod
    def _guess_parser(scene_id: str):
        """Get the supported parser for Scene."""
        if scene_id.startswith('S2'):
            return Sentinel2Scene(scene_id)
        return LandsatScene(scene_id)

    def download(self, scene_id: str, *args, **kwargs) -> str:
        """Download files from STAC Element 84."""
        output = kwargs['output']
        collection = kwargs['dataset']

        parsed = self._guess_parser(scene_id)

        stac_collection = self.api.collection(collection)

        product = parsed.fragments[1][-3:]
        item_id = f'{parsed.source()}_{parsed.tile_id()}_{parsed.sensing_date().strftime("%Y%m%d")}_0_{product}'

        feature = stac_collection.get_items(item_id=item_id)

        if feature.get('code'):
            raise RuntimeError(
                f'Scene {scene_id} not found for collection {collection}.')

        with TemporaryDirectory() as tmp:
            tmp_path = Path(tmp) / item_id

            for asset_name, asset in feature['assets'].items():
                self._download(asset['href'], str(tmp_path))

            shutil.move(str(tmp_path), output)

        return output

    def _download(self, link, output):
        """Download asset from STAC."""
        file_name = Path(link).name

        path = Path(output) / file_name

        response = requests.get(link, stream=True, timeout=90)

        download_stream(str(path), response, progress=self.progress)
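

# Illustrative usage (not part of the provider API): a minimal sketch of
# searching and downloading with EarthSearch. The collection name, dates,
# bounding box and output directory below are hypothetical placeholders, and
# the sketch assumes SceneResult exposes the identifier as ``scene_id``.
def _example_earth_search_usage():  # pragma: no cover
    provider = EarthSearch(progress=True)

    scenes = provider.search(
        'sentinel-s2-l2a-cogs',          # assumed collection name on Earth Search v0
        start_date='2020-01-01',
        end_date='2020-01-31',
        bbox=(-54.0, -12.0, -53.5, -11.5),
    )

    # Download only the first result; download() reads ``output`` and
    # ``dataset`` from its keyword arguments.
    for scene in scenes[:1]:
        provider.download(scene.scene_id,
                          output='/tmp/earth-search',      # hypothetical output dir
                          dataset='sentinel-s2-l2a-cogs')  # must match the collection searched

    return scenes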