Example #1
    def search_STAC(self, activity, extra_catalogs=None):
        """Search for activity in remote STAC server.

        Notes:
            By default, uses entire activity to search for data catalog.
            When the parameter ``extra_catalog`` is set, this function will seek
            into given catalogs and then merge the result as a single query server.
            It may be useful if you have a collection in different server provider.

        Args:
            activity (dict): Current activity scope with default STAC server and stac collection
                **Make sure that ``bbox`` property is a GeoJSON Feature.
            extra_catalogs (List[dict]): Extra catalogs to seek for collection. Default is None.
        """
        # Get DATACUBE params
        _ = self.stac.catalog
        bands = activity['bands']
        datasets = activity['datasets']
        bbox_feature = activity['bbox']
        time = '{}/{}'.format(activity['start'], activity['end'])

        scenes = {}
        filter_opts = dict(datetime=time, intersects=bbox_feature, limit=10000)

        for dataset in datasets:
            filter_opts['collections'] = [dataset]
            items = self.stac.search(filter=filter_opts)

            scenes.update(**self._parse_stac_result(items, dataset, bands,
                                                    activity['quality_band']))

        if extra_catalogs:
            for catalog in extra_catalogs:
                stac_url = catalog['stac_url']
                stac_token = catalog.get('token')
                stac_dataset = catalog['dataset']
                filter_opts['collections'] = [stac_dataset]

                stac = STAC(stac_url, access_token=stac_token)

                items = stac.search(filter=filter_opts)

                res = self._parse_stac_result(items, stac_dataset, bands,
                                              activity['quality_band'])

                for band, band_datasets in res.items():
                    internal_dataset = list(band_datasets.keys())[0]

                    # Merge the extra catalog scenes into the primary dataset entry
                    # (``dataset`` still holds the last value from the loop above).
                    scenes[band][dataset].update(band_datasets[internal_dataset])

        return scenes
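
A minimal usage sketch of ``search_STAC`` (not part of the original code). It assumes
``services`` is an object that already exposes this method together with a configured
``stac`` client; every name and value below is a hypothetical placeholder.

# Hypothetical activity scope; adjust bands, datasets and period to your catalog.
activity = {
    'bands': ['B04', 'B08', 'Fmask4'],
    'datasets': ['S2_L2A-1'],
    'quality_band': 'Fmask4',
    'start': '2020-01-01',
    'end': '2020-01-31',
    # ``bbox`` must be a GeoJSON Feature.
    'bbox': {
        'type': 'Feature',
        'properties': {},
        'geometry': {
            'type': 'Polygon',
            'coordinates': [[[-54.0, -12.0], [-53.9, -12.0], [-53.9, -11.9],
                             [-54.0, -11.9], [-54.0, -12.0]]],
        },
    },
}

# Optional catalog hosted by another provider (hypothetical URL and dataset name).
extra = [{'stac_url': 'https://stac.example.org', 'token': None, 'dataset': 'S2_L2A-1'}]

scenes = services.search_STAC(activity, extra_catalogs=extra)
# Result layout: scenes[band][dataset][date] -> list of scene dicts ('sceneid', 'date', 'band', 'link').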
Example #2
class CubeServices:
    def __init__(self, url_stac=None, bucket=None):
        # session = boto3.Session(profile_name='default')
        session = boto3.Session(aws_access_key_id=AWS_KEY_ID,
                                aws_secret_access_key=AWS_SECRET_KEY)

        # ---------------------------
        # AWS infrastructure
        self.S3client = session.client('s3')
        self.SQSclient = session.client('sqs')
        self.LAMBDAclient = session.client('lambda')
        self.Kinesisclient = session.client('kinesis')
        self.dynamoDBResource = session.resource('dynamodb')

        self.QueueUrl = {}
        self.bucket_name = bucket

        # ---------------------------
        # create / get DynamoDB tables
        self.get_dynamo_tables()

        # ---------------------------
        # create / get the SQS
        self.get_queue_url()

        # ---------------------------
        # init STAC instance
        self.url_stac = url_stac
        if url_stac:
            self.stac = STAC(url_stac)

    def get_s3_prefix(self, bucket):
        # prefix = 'https://s3.amazonaws.com/{}/'.format(bucket)
        prefix = 's3://{}/'.format(bucket)
        return prefix

    ## ----------------------
    # DYNAMO DB
    def get_dynamo_tables(self):
        # Create the cubeBuilderActivities table in DynamoDB to store all activities
        self.activitiesTable = self.dynamoDBResource.Table(DYNAMO_TB_ACTIVITY)
        table_exists = False
        try:
            # Accessing ``creation_date_time`` triggers a DescribeTable request and
            # raises if the table does not exist yet.
            _ = self.activitiesTable.creation_date_time
            table_exists = True
        except ClientError:
            table_exists = False

        if not table_exists:
            self.activitiesTable = self.dynamoDBResource.create_table(
                TableName=DYNAMO_TB_ACTIVITY,
                KeySchema=[{
                    'AttributeName': 'id',
                    'KeyType': 'HASH'
                }, {
                    'AttributeName': 'sk',
                    'KeyType': 'RANGE'
                }],
                AttributeDefinitions=[
                    {
                        'AttributeName': 'id',
                        'AttributeType': 'S'
                    },
                    {
                        'AttributeName': 'sk',
                        'AttributeType': 'S'
                    },
                ],
                BillingMode='PAY_PER_REQUEST',
            )
            # Wait until the table exists.
            self.dynamoDBResource.meta.client.get_waiter('table_exists').wait(
                TableName=DYNAMO_TB_ACTIVITY)

        # Create the cubeBuilderActivitiesControl table in DynamoDB to manage activities completion
        self.activitiesControlTable = self.dynamoDBResource.Table(
            DBNAME_TB_CONTROL)
        table_exists = False
        try:
            _ = self.activitiesControlTable.creation_date_time
            table_exists = True
        except ClientError:
            table_exists = False

        if not table_exists:
            self.activitiesControlTable = self.dynamoDBResource.create_table(
                TableName=DBNAME_TB_CONTROL,
                KeySchema=[
                    {
                        'AttributeName': 'id',
                        'KeyType': 'HASH'
                    },
                ],
                AttributeDefinitions=[
                    {
                        'AttributeName': 'id',
                        'AttributeType': 'S'
                    },
                ],
                ProvisionedThroughput={
                    'ReadCapacityUnits': 2,
                    'WriteCapacityUnits': 2
                })
            # Wait until the table exists.
            self.dynamoDBResource.meta.client.get_waiter('table_exists').wait(
                TableName=DBNAME_TB_CONTROL)

        # Create the process table (DBNAME_TB_PROCESS) in DynamoDB to track data cube build processes
        self.processTable = self.dynamoDBResource.Table(DBNAME_TB_PROCESS)
        table_exists = False
        try:
            _ = self.processTable.creation_date_time
            table_exists = True
        except ClientError:
            table_exists = False

        if not table_exists:
            self.processTable = self.dynamoDBResource.create_table(
                TableName=DBNAME_TB_PROCESS,
                KeySchema=[
                    {
                        'AttributeName': 'id',
                        'KeyType': 'HASH'
                    },
                ],
                AttributeDefinitions=[
                    {
                        'AttributeName': 'id',
                        'AttributeType': 'S'
                    },
                ],
                ProvisionedThroughput={
                    'ReadCapacityUnits': 2,
                    'WriteCapacityUnits': 2
                })
            # Wait until the table exists.
            self.dynamoDBResource.meta.client.get_waiter('table_exists').wait(
                TableName=DBNAME_TB_PROCESS)

    def get_activities(self):
        # self.activitiesTable.meta.client.delete_table(TableName=DYNAMO_TB_ACTIVITY)

        return self.activitiesTable.scan()

    def get_activities_ctrl(self):
        return self.activitiesControlTable.scan()

    def get_activities_by_key(self, dynamo_key):
        return self.activitiesTable.query(
            KeyConditionExpression=Key('id').eq(dynamo_key))

    def get_activity_item(self, query):
        return self.activitiesTable.get_item(Key=query)

    def get_process_by_id(self, process_id):
        return self.processTable.query(
            KeyConditionExpression=Key('id').eq(process_id))

    def get_process_by_datacube(self, datacube):
        return self.processTable.scan(
            FilterExpression=Key('datacube').eq(datacube))

    def get_cube_meta(self, cube: str):
        filters = Key('data_cube').eq(cube) & Key('id').begins_with('merge')

        return self.activitiesTable.scan(FilterExpression=filters)

    def get_all_items(self, filters):
        response = self.activitiesTable.scan(FilterExpression=filters,
                                             Limit=100000000)

        items = response['Items']
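        # DynamoDB paginates scan results (about 1 MB per page); the loop below
        # follows 'LastEvaluatedKey' / 'ExclusiveStartKey' to fetch every page.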

        while 'LastEvaluatedKey' in response:
            response = self.activitiesTable.scan(
                FilterExpression=filters,
                ExclusiveStartKey=response['LastEvaluatedKey'])
            items.extend(response['Items'])

        return items

    def get_merges(self, data_cube: str, tile_id: str, start: str, end: str):
        """List all merges activities used to build a data cube.

        Args:
            data_cube - Data cube name
            start - Filter start data
            end - Filter end data
        """
        expression = Key('tile_id').eq(tile_id) & Key('period_start').between(start, end) & \
            Key('period_end').between(start, end) & Key('data_cube').eq(data_cube)

        return self.get_all_items(expression)

    def get_activities_by_datacube(self, data_cube: str):
        """List all activities used to build a data cube.

        Args:
            data_cube - Data cube name
        """
        expression = Key('data_cube').eq(data_cube)
        return self.get_all_items(expression)

    def put_activity(self, activity):
        self.activitiesTable.put_item(
            Item={
                'id': activity['dynamoKey'],
                'sk': activity['sk'],
                'tile_id': activity['tileid'],
                'period_start': activity['start'],
                'period_end': activity['end'],
                'data_cube': activity['datacube'],
                'mystatus': activity['mystatus'],
                'mylaunch': activity['mylaunch'],
                'mystart': activity['mystart'],
                'myend': activity['myend'],
                'efficacy': activity['efficacy'],
                'cloudratio': activity['cloudratio'],
                'instancesToBeDone': activity['instancesToBeDone'],
                'totalInstancesToBeDone': activity['totalInstancesToBeDone'],
                'activity': json.dumps(activity),
            })
        return True

    def put_process_table(self, key, datacube_id, i_datacube_id, infos):
        self.processTable.put_item(
            Item={
                'id': key,
                'datacube_id': datacube_id,
                'irregular_datacube_id': i_datacube_id,
                'infos': infos
            })
        return True

    def put_control_table(self, key, value):
        self.activitiesControlTable.put_item(Item={
            'id': key,
            'mycount': value,
        })
        return True

    def remove_control_by_key(self, key: str):
        try:
            self.activitiesControlTable.delete_item(Key=dict(id=key))
            return True
        except ClientError:
            return False

    def remove_process_by_key(self, key: str):
        try:
            self.processTable.delete_item(Key=dict(id=key))
            return True
        except ClientError:
            return False

    def update_control_table(self, Key, UpdateExpression,
                             ExpressionAttributeNames,
                             ExpressionAttributeValues, ReturnValues):
        return self.activitiesControlTable.update_item(
            Key=Key,
            UpdateExpression=UpdateExpression,
            ExpressionAttributeNames=ExpressionAttributeNames,
            ExpressionAttributeValues=ExpressionAttributeValues,
            ReturnValues=ReturnValues)
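
    # A hedged usage sketch of ``update_control_table`` (hypothetical key and attribute
    # names), incrementing the completion counter atomically with an update expression:
    #
    #   self.update_control_table(
    #       Key={'id': 'example-control-key'},
    #       UpdateExpression='ADD #count :increment',
    #       ExpressionAttributeNames={'#count': 'mycount'},
    #       ExpressionAttributeValues={':increment': 1},
    #       ReturnValues='UPDATED_NEW')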

    ## ----------------------
    # SQS
    def get_queue_url(self):
        for action in ['merge', 'blend', 'posblend', 'publish']:
            queue = '{}-{}'.format(QUEUE_NAME, action)
            if self.QueueUrl.get(action, None) is not None:
                continue
            response = self.SQSclient.list_queues()
            q_exists = False
            if 'QueueUrls' in response:
                for qurl in response['QueueUrls']:
                    if qurl.find(queue) != -1:
                        q_exists = True
                        self.QueueUrl[action] = qurl
            if not q_exists:
                self.create_queue(True, action)
        return True

    def create_queue(self, create_mapping=False, action=''):
        """
        As the influx of messages to a queue increases, AWS Lambda automatically scales up 
        polling activity until the number of concurrent function executions reaches 1000, 
        the account concurrency limit, or the (optional) function concurrency limit, 
        whichever is lower. 
        Amazon Simple Queue Service supports an initial burst of 5 concurrent function invocations 
        and increases concurrency by 60 concurrent invocations per minute.

        So, for example, 
        1000 messages arrives at the queue at once, only 5 will be processed in the first minute.
        65 lambdas will run concurrently in the second minute... so on
		"""
        # Create a SQS for this experiment
        queue = '{}-{}'.format(QUEUE_NAME, action)
        response = self.SQSclient.create_queue(
            QueueName=queue, Attributes={'VisibilityTimeout': '500'})
        self.QueueUrl[action] = response['QueueUrl']
        # Get attributes
        attributes = self.SQSclient.get_queue_attributes(
            QueueUrl=self.QueueUrl[action], AttributeNames=[
                'All',
            ])
        QueueArn = attributes['Attributes']['QueueArn']

        # Create Source Mapping to Maestro from queue
        if create_mapping:
            response = self.LAMBDAclient.create_event_source_mapping(
                EventSourceArn=QueueArn,
                FunctionName=LAMBDA_FUNCTION_NAME,
                Enabled=True,
                BatchSize=1)

    def send_to_sqs(self, activity):
        if self.get_queue_url():
            action = activity['action']
            self.SQSclient.send_message(QueueUrl=self.QueueUrl[action],
                                        MessageBody=json.dumps(activity))

    ## ----------------------
    # Kinesis
    def put_item_kinesis(self, activity):
        activity['channel'] = 'kinesis'
        activity['db'] = 'dynamodb'
        status = self.sendToKinesis(activity)
        del activity['channel']
        del activity['db']
        return status

    def sendToKinesis(self, activity):
        self.Kinesisclient.put_record(StreamName=KINESIS_NAME,
                                      Data=json.dumps(activity),
                                      PartitionKey='dsKinesis')
        return True

    ## ----------------------
    # STAC
    def get_collection_stac(self, collection_id):
        _ = self.stac.catalog
        return self.stac.collection(collection_id)

    def _parse_stac_result(self, items, dataset, bands, quality_band):
        scenes = dict()

        for f in items['features']:
            if f['type'] == 'Feature':
                scene_id = f['id']
                date = f['properties']['datetime']

                # Get file link and name
                assets = f['assets']
                for band in bands:
                    band_obj = assets.get(band, None)
                    if not band_obj:
                        continue

                    scenes[band] = scenes.get(band, {})
                    scenes[band][dataset] = scenes[band].get(dataset, {})

                    scene = {}
                    scene['sceneid'] = scene_id
                    scene['date'] = date
                    scene['band'] = band
                    scene['link'] = band_obj['href']
                    if dataset == 'MOD13Q1' and band == quality_band:
                        scene['link'] = scene['link'].replace(
                            quality_band, 'reliability')

                    # TODO: verify if scene['link'] exist

                    if date not in scenes[band][dataset]:
                        scenes[band][dataset][date] = []
                    scenes[band][dataset][date].append(scene)

        return scenes
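
    # Shape of the structure returned by ``_parse_stac_result`` (illustrative values):
    #
    #   {
    #       'B04': {
    #           'S2_L2A-1': {
    #               '2020-01-01T13:21:29Z': [
    #                   {'sceneid': '...', 'date': '...', 'band': 'B04', 'link': 'https://...'},
    #               ],
    #           },
    #       },
    #   }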

    def search_STAC(self, activity, extra_catalogs=None):
        """Search for activity in remote STAC server.

        Notes:
            By default, uses entire activity to search for data catalog.
            When the parameter ``extra_catalog`` is set, this function will seek
            into given catalogs and then merge the result as a single query server.
            It may be useful if you have a collection in different server provider.

        Args:
            activity (dict): Current activity scope with default STAC server and stac collection
                **Make sure that ``bbox`` property is a GeoJSON Feature.
            extra_catalogs (List[dict]): Extra catalogs to seek for collection. Default is None.
        """
        # Get DATACUBE params
        _ = self.stac.catalog
        bands = activity['bands']
        datasets = activity['datasets']
        bbox_feature = activity['bbox']
        time = '{}/{}'.format(activity['start'], activity['end'])

        scenes = {}
        filter_opts = dict(datetime=time, intersects=bbox_feature, limit=10000)

        for dataset in datasets:
            filter_opts['collections'] = [dataset]
            items = self.stac.search(filter=filter_opts)

            scenes.update(**self._parse_stac_result(items, dataset, bands,
                                                    activity['quality_band']))

        if extra_catalogs:
            for catalog in extra_catalogs:
                stac_url = catalog['stac_url']
                stac_token = catalog.get('token')
                stac_dataset = catalog['dataset']
                filter_opts['collections'] = [stac_dataset]

                stac = STAC(stac_url, access_token=stac_token)

                items = stac.search(filter=filter_opts)

                res = self._parse_stac_result(items, stac_dataset, bands,
                                              activity['quality_band'])

                for band, band_datasets in res.items():
                    internal_dataset = list(band_datasets.keys())[0]

                    # Merge the extra catalog scenes into the primary dataset entry
                    # (``dataset`` still holds the last value from the loop above).
                    scenes[band][dataset].update(band_datasets[internal_dataset])

        return scenes

    ## ----------------------
    # S3
    def create_bucket(self, name, requester_pay=True):
        try:
            # Create a bucket with public access
            response = self.S3client.create_bucket(ACL='public-read',
                                                   Bucket=name)
            if requester_pay:
                response = self.S3client.put_bucket_request_payment(
                    Bucket=name,
                    RequestPaymentConfiguration={'Payer': 'Requester'})
            assert response['ResponseMetadata']['HTTPStatusCode'] == 200
            return True
        except ClientError:
            return False

    def s3_file_exists(self, bucket_name=None, key=''):
        try:
            if not bucket_name:
                bucket_name = self.bucket_name
            return self.S3client.head_object(Bucket=bucket_name, Key=key)
        except ClientError:
            return False

    def get_object(self, key, bucket_name=None):
        if not bucket_name:
            bucket_name = self.bucket_name
        return self.S3client.get_object(Bucket=bucket_name, Key=key)

    def delete_file_S3(self, bucket_name=None, key=''):
        try:
            if not bucket_name:
                bucket_name = self.bucket_name
            self.S3client.delete_object(Bucket=bucket_name, Key=key)
        except ClientError:
            return False
        return True

    def save_file_S3(self, bucket_name=None, key='', activity=None):
        if not bucket_name:
            bucket_name = self.bucket_name
        return self.S3client.put_object(
            Bucket=bucket_name,
            Key=key,
            Body=json.dumps(activity or {}).encode('UTF-8'))

    def upload_file_S3(self, memfile, key, args, bucket_name=None):
        if not bucket_name:
            bucket_name = self.bucket_name
        return self.S3client.upload_file(memfile,
                                         Bucket=bucket_name,
                                         Key=key,
                                         ExtraArgs=args)

    def upload_fileobj_S3(self, memfile, key, args, bucket_name=None):
        if not bucket_name:
            bucket_name = self.bucket_name
        return self.S3client.upload_fileobj(memfile,
                                            Bucket=bucket_name,
                                            Key=key,
                                            ExtraArgs=args)

    def list_repositories(self):
        return [
            bucket['Name']
            for bucket in self.S3client.list_buckets()['Buckets']
        ]
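
A hedged end-to-end sketch of ``CubeServices`` (not part of the original code). It assumes
AWS credentials and the module-level constants (AWS_KEY_ID, QUEUE_NAME, DYNAMO_TB_ACTIVITY,
and so on) are configured; the URL, bucket and keys below are placeholders.

services = CubeServices(url_stac='https://stac.example.org', bucket='my-cube-bucket')

# Persist an auxiliary JSON document on S3 and check that it exists.
services.save_file_S3(key='activities/example.json', activity={'action': 'merge'})
exists = services.s3_file_exists(key='activities/example.json')

# Dispatch a minimal (hypothetical) activity to the 'merge' queue consumed by the Lambda worker.
services.send_to_sqs({'action': 'merge', 'dynamoKey': 'example', 'sk': '2020-01-01'})

# Register a control entry used to track how many activities must complete.
services.put_control_table('example-control-key', 10)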
Example #3
class EarthSearch(BaseProvider):
    """Define a simple abstraction of Provider for Element84.

    It was designed to download Sentinel-2 COGS from
    `Sentinel-2 Cloud-Optimized GeoTIFFs <https://registry.opendata.aws/sentinel-2-l2a-cogs/>`_
    """
    def __init__(self, **kwargs):
        """Build STAC provider for Element84."""
        access_token = kwargs.pop('access_token', None)

        self.kwargs = kwargs
        self.api = STAC('https://earth-search.aws.element84.com/v0',
                        access_token=access_token)
        self.progress = kwargs.get('progress')

    def search(self, query, *args, **kwargs) -> List[SceneResult]:
        """Search for scenes in STAC."""
        options = dict()

        if 'start_date' in kwargs:
            options['time'] = f'{kwargs.get("start_date")}/{kwargs.get("end_date")}'

        if 'bbox' in kwargs:
            options['intersects'] = mapping(box(*kwargs['bbox']))

        options['collection'] = query

        res = self.api.search(filter=options)

        # TODO: Implement next page as iterator or check stac.py support
        return [
            SceneResult(scene_id=f['properties']['sentinel:product_id'],
                        cloud_cover=f['properties']['sentinel:cloud_cover'],
                        **f) for f in res['features']
        ]

    @staticmethod
    def _guess_parser(scene_id: str):
        """Get the supported parser for Scene."""
        if scene_id.startswith('S2'):
            return Sentinel2Scene(scene_id)
        return LandsatScene(scene_id)

    def download(self, scene_id: str, *args, **kwargs) -> str:
        """Download files from STAC Element 84."""
        output = kwargs['output']

        collection = kwargs['dataset']

        parsed = self._guess_parser(scene_id)

        stac_collection = self.api.collection(collection)

        product = parsed.fragments[1][-3:]

        item_id = f'{parsed.source()}_{parsed.tile_id()}_{parsed.sensing_date().strftime("%Y%m%d")}_0_{product}'

        feature = stac_collection.get_items(item_id=item_id)

        if feature.get('code'):
            raise RuntimeError(
                f'Scene {scene_id} not found for collection {collection}.')

        with TemporaryDirectory() as tmp:
            tmp_path = Path(tmp) / item_id

            for asset_name, asset in feature['assets'].items():
                self._download(asset['href'], str(tmp_path))

            shutil.move(str(tmp_path), output)

        return output

    def _download(self, link, output):
        """Download asset from STAC."""
        file_name = Path(link).name

        path = Path(output) / file_name

        response = requests.get(link, stream=True, timeout=90)

        download_stream(str(path), response, progress=self.progress)
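
A hedged usage sketch of ``EarthSearch`` (not part of the original code). The collection id
``sentinel-s2-l2a-cogs`` is assumed for Earth Search v0, and the scene id, bounding box and
output directory are placeholders for illustration only.

provider = EarthSearch(progress=False)

# Search Sentinel-2 COG scenes intersecting a bounding box within a period.
scenes = provider.search('sentinel-s2-l2a-cogs',
                         bbox=(-54.0, -12.0, -53.9, -11.9),
                         start_date='2020-01-01',
                         end_date='2020-01-31')

# Download every asset of one scene into ./data/<item_id> (scene id is a placeholder).
provider.download('S2A_MSIL2A_20200101T134211_N0213_R124_T21LYH_20200101T153000',
                  dataset='sentinel-s2-l2a-cogs',
                  output='./data')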