예제 #1
0
def process_items(self, p_walk: list, df_collections: dict, df_tiles: dict) -> None:
    '''Worker task that iterates over the `p_walk` list and processes its items.

    Args:
        p_walk: chunk of `(dir_path, metadata, assets)` tuples to process.
        df_collections: collections data as a dict; it is converted back to a
            DataFrame before being used.
        df_tiles: tiles data as a dict; it is converted back to a DataFrame
            before being used.

    Every tuple is passed to `create_item_and_get_insert_clauses`. The item
    INSERT clauses it returns are joined and executed through
    `DBFactory.factory()`; the warning/error INSERT clauses are joined and
    executed through `PostgreSQLPublisherConnection`. Both executions are
    requested with `is_transaction=True`.
    '''

    print_line()

    logger.info(f'process_items - `{len(p_walk)}` chunks have been received.')

    # an empty chunk has no first/last record to log and nothing to insert,
    # so stop here instead of raising IndexError on `p_walk[0]`
    if not p_walk:
        return

    logger.info(f'process_items - p_walk first record: {p_walk[0]}')
    logger.info(f'process_items - p_walk last record: {p_walk[-1]}')

    # convert from dict to dataframe again
    df_collections = DataFrame.from_dict(df_collections)
    df_tiles = DataFrame.from_dict(df_tiles)

    # fill pandas NaN (None, NaN, etc.) with numpy NaN
    df_collections.fillna({'grid_ref_sys_id': NaN}, inplace=True)

    logger.info('process_items - df_collections:\n'
                f"{df_collections[['id', 'name', 'grid_ref_sys_id', 'metadata', 'is_public']]}\n")
    logger.info(f'process_items - df_tiles.head():\n{df_tiles.head()}\n')

    items_insert = []
    errors_insert = []

    for dir_path, metadata, assets in p_walk:
        # create INSERT clauses based on item information
        new_items, new_errors = create_item_and_get_insert_clauses(
            dir_path, metadata, assets, df_collections, df_tiles
        )

        items_insert.extend(new_items)
        errors_insert.extend(new_errors)

    # if there are item INSERT clauses, then create a database instance
    # and insert them there
    if items_insert:
        logger.info(f'process_items - there are `{len(items_insert)}` '
                     'items to insert in the database.')
        db = DBFactory.factory()
        concatenated_inserts = ' '.join(items_insert)
        logger.info('process_items - inserting items in the database...')
        db.execute(concatenated_inserts, is_transaction=True)

    # if there are warning/error INSERT clauses, then create a publisher
    # database connection and insert them there
    if errors_insert:
        logger.info(f'process_items - there are `{len(errors_insert)}` '
                     'warnings or errors to insert in the database.')
        db = PostgreSQLPublisherConnection()
        concatenated_errors = ' '.join(errors_insert)
        logger.info('process_items - inserting task errors in the database...')
        db.execute(concatenated_errors, is_transaction=True)
예제 #2
0
 def save_the_errors_in_the_database(self):
     '''Execute the INSERT clauses collected in `self.errors_insert`.

     Does nothing when `self.errors_insert` is empty. Otherwise the clauses
     are joined with a single space and executed through a new
     `PostgreSQLPublisherConnection`, with `is_transaction=True`.
     '''
     # nothing was collected, then there is nothing to insert
     if not self.errors_insert:
         return

     connection = PostgreSQLPublisherConnection()
     joined_clauses = ' '.join(self.errors_insert)
     logger.info('Inserting PublisherWalk.errors into database...')
     connection.execute(joined_clauses, is_transaction=True)
예제 #3
0
            def check_scene_dir(scene_dir):
                '''Return `scene_dir` if it passes the sensor and date filters, else `None`.

                A folder name that `decode_scene_dir` cannot decode is recorded
                in `self.errors_insert` as an error and rejected.
                '''
                try:
                    _, sensor_dir, date_dir, time_dir = decode_scene_dir(scene_dir)
                except CDSRDecoderException as error:
                    error_clause = PostgreSQLPublisherConnection.create_task_error_insert_clause({
                        'message': error,
                        'metadata': {'folder': dir_path, 'method': 'check_scene_dir'},
                        'type': 'error'
                    })
                    self.errors_insert.append(error_clause)
                    return None

                # reject scenes from any sensor other than the selected one
                if sensor_dir != self.query['sensor']:
                    return None

                # the folder date is a string, then parse it to compare against the range
                date = datetime.strptime(date_dir, '%Y-%m-%d')

                # the folder holds the reception date and not the viewing date,
                # then a time between 0h and 5h belongs to the previous day
                if '00:00:00' <= time_dir <= '05:00:00':
                    date -= timedelta(days=1)

                # keep the scene only when its date is inside the selected range
                is_inside_range = (date >= self.query['start_date']
                                   and date <= self.query['end_date'])
                return scene_dir if is_inside_range else None
예제 #4
0
            def check_path_row_dir(path_row_dir):
                '''Return `path_row_dir` if it matches the selected path and row, else `None`.

                A folder name that `decode_path_row_dir` cannot decode is
                recorded in `self.errors_insert` as an error and rejected.
                '''
                try:
                    path, row = decode_path_row_dir(path_row_dir)
                except CDSRDecoderException as error:
                    error_clause = PostgreSQLPublisherConnection.create_task_error_insert_clause({
                        'message': error,
                        'metadata': {'folder': dir_path, 'method': 'check_path_row_dir'},
                        'type': 'error'
                    })
                    self.errors_insert.append(error_clause)
                    return None

                # a filter equal to `None` accepts any value; `path` is checked before `row`
                for key, decoded_value in (('path', path), ('row', row)):
                    selected = self.query[key]
                    if selected is not None and selected != int(decoded_value):
                        return None

                return path_row_dir
예제 #5
0
    def __generator(self):
        '''Generator that yields `(dir_path, metadata, assets)` for valid directories.

        A directory is yielded only when it is at level 6 (counted from the
        `TIFF` component of its path), contains files, has metadata that
        `decode_path` can extract and produces at least one asset.
        Empty level-6 folders are recorded in `self.errors_insert` as warnings.
        '''

        # `root_path` example: /TIFF/CBERS2B/
        root_path = f'{self.BASE_DIR}/{self.query["satellite"]}'

        for dir_path, dirs, files in walk(root_path, followlinks=True):
            # count the path components starting at `TIFF`; example of components:
            # ['TIFF', 'CBERS4A', '2020_11', 'CBERS_4A_WFI_RAW_2020_11_10.13_41_00_ETC2',
            #  '207_148_0', '2_BC_UTM_WGS84']
            tiff_position = dir_path.find('TIFF')
            dir_level = len(dir_path[tiff_position:].split(os_path_sep))

            # replace the contents of `dirs` in place with just the valid dirs
            # (assuming `walk` is `os.walk`, this restricts the folders visited next)
            dirs[:] = self.__filter_dir(dir_level, dir_path, dirs)

            # only geo processing dirs (level 6) can be yielded
            if dir_level != 6:
                continue

            # a geo processing dir without files is reported and ignored
            if not files:
                empty_folder_clause = PostgreSQLPublisherConnection.create_task_error_insert_clause({
                    'message': 'This folder is valid, but it is empty.',
                    'metadata': {'folder': dir_path},
                    'type': 'warning'
                })
                self.errors_insert.append(empty_folder_clause)
                continue

            # without enough metadata the folder is ignored
            metadata = decode_path(dir_path)
            if not metadata:
                continue

            # build the assets of each selected radiometric processing
            assets = {}
            for radio_processing in self.query['radio_processing']:
                # `SR` was selected, but this folder does not have any `SR` file
                is_sr_missing = (radio_processing == 'SR'
                                 and not is_there_sr_files_in_the_list_of_files(files))
                if is_sr_missing:
                    continue

                assets_metadata = self.satellite_metadata.get_assets_metadata(
                    metadata['satellite'], metadata['sensor'], radio_processing
                )

                # keep the radiometric processing only if valid assets were created
                created_assets = self.__create_assets_from_metadata(
                    assets_metadata, dir_path, metadata
                )
                if created_assets:
                    assets[radio_processing] = created_assets

            # a folder without one asset at least is ignored
            if not assets:
                continue

            yield dir_path, metadata, assets
예제 #6
0
    def __filter_dir(self, dir_level, dir_path, dirs):
        '''Select the entries of `dirs` to keep, based on the directory level.

        Levels:
            2: `dirs` are year_month folders; keep those inside the date range.
            3: `dirs` are scene folders; keep those with the selected sensor
               and inside the date range (all of them if no sensor is selected).
            4: `dirs` are path/row folders; keep those with the selected path/row.
            5: `dirs` are geo processing folders; keep those that start with
               any selected geo processing.
            6: sub-folders are not expected; if there are any, a warning is
               recorded and `dirs` is returned as it is.

        Any other level records a warning and returns `dirs` as it is.
        '''

        def register_task_error(message, error_type, method=None):
            # add to `self.errors_insert` an INSERT clause that describes a problem in `dir_path`
            error_metadata = {'folder': dir_path}
            if method is not None:
                error_metadata['method'] = method

            self.errors_insert.append(
                PostgreSQLPublisherConnection.create_task_error_insert_clause({
                    'message': message,
                    'metadata': error_metadata,
                    'type': error_type
                })
            )

        # level 2: I'm inside satellite folder, then the dirs are year-month folders
        if dir_level == 2:
            # `start_date` and `end_date` fields are required
            start_date = self.query['start_date']
            end_date = self.query['end_date']

            # example: 2019_01
            first_year_month = (f"{start_date.year}_"
                                f"{fill_string_with_left_zeros(str(start_date.month), 2)}")
            # example: 2020_12
            last_year_month = (f"{end_date.year}_"
                               f"{fill_string_with_left_zeros(str(end_date.month), 2)}")

            return [d for d in dirs if first_year_month <= d <= last_year_month]

        # level 3: I'm inside year-month folder, then the dirs are scene folders
        if dir_level == 3:
            # without a selected sensor, the scene dirs are not filtered
            if self.query['sensor'] is None:
                return dirs

            def check_scene_dir(scene_dir):
                try:
                    _, sensor_dir, date_dir, time_dir = decode_scene_dir(scene_dir)
                except CDSRDecoderException as error:
                    register_task_error(error, 'error', method='check_scene_dir')
                    return None

                # reject scenes from any sensor other than the selected one
                if sensor_dir != self.query['sensor']:
                    return None

                # the folder date is a string, then parse it to compare against the range
                date = datetime.strptime(date_dir, '%Y-%m-%d')

                # the folder holds the reception date and not the viewing date,
                # then a time between 0h and 5h belongs to the previous day
                if '00:00:00' <= time_dir <= '05:00:00':
                    date -= timedelta(days=1)

                # keep the scene only when its date is inside the selected range
                is_inside_range = (date >= self.query['start_date']
                                   and date <= self.query['end_date'])
                return scene_dir if is_inside_range else None

            return [d for d in dirs if check_scene_dir(d)]

        # level 4: I'm inside sensor folder, then the dirs are path/row folders
        if dir_level == 4:

            def check_path_row_dir(path_row_dir):
                try:
                    path, row = decode_path_row_dir(path_row_dir)
                except CDSRDecoderException as error:
                    register_task_error(error, 'error', method='check_path_row_dir')
                    return None

                # a filter equal to `None` accepts any value; `path` is checked before `row`
                for key, decoded_value in (('path', path), ('row', row)):
                    selected = self.query[key]
                    if selected is not None and selected != int(decoded_value):
                        return None

                return path_row_dir

            return [d for d in dirs if check_path_row_dir(d)]

        # level 5: I'm inside path/row folder, then the dirs are geo processing folders
        if dir_level == 5:
            # a dir is valid if it starts with any selected geo processing
            # `d` example: `2_BC_UTM_WGS84`
            return [
                d for d in dirs
                if any(d.startswith(gp) for gp in self.query['geo_processing'])
            ]

        # level 6: I'm inside geo processing folder, then should not have dirs inside here
        if dir_level == 6:
            if dirs:
                register_task_error(
                    'There are folders inside a geo processing directory.', 'warning'
                )

            return dirs

        # any other level is unexpected, then report it and do not filter the dirs
        register_task_error(f'Invalid `{dir_level}` directory level.', 'warning')

        return dirs
예제 #7
0
    def __create_assets_from_metadata(self, assets_matadata, dir_path, metadata):
        '''Create the assets object of a folder based on assets metadata.

        Args:
            assets_matadata: dict that maps a band common name (e.g. `quality`,
                `evi`, `ndvi`) to the template its TIFF file name ends with.
            dir_path: folder where the files are searched.
            metadata: dict with the `geo_processing` and `sensor` keys at least.

        Returns:
            A dict of assets (thumbnail, TIFF files and their JSON/XML
            companions), or `None` if a required file is missing. In that
            case an error is recorded in `self.errors_insert`.

        When more than one file matches a search, the first one in sorted
        order is used, so the result does not depend on the arbitrary order
        returned by `glob`.
        '''

        def report_error(message):
            # add to `self.errors_insert` an INSERT clause that describes an error in `dir_path`
            self.errors_insert.append(
                PostgreSQLPublisherConnection.create_task_error_insert_clause({
                    'message': message,
                    'metadata': {'folder': dir_path},
                    'type': 'error'
                })
            )

        # search for all files that end with `*.png`; sort them like the other
        # searches in this method, so that the chosen thumbnail is deterministic
        png_files = sorted(glob(f'{dir_path}/*.png'))

        if not png_files:
            report_error('There is NOT a quicklook in this folder, then it will be ignored.')
            return None

        # initialize `assets` object with the `thumbnail` key
        assets = {
            'thumbnail': {
                'href': png_files[0],
                'type': 'image/png',
                'roles': ['thumbnail']
            }
        }

        # if this folder is WFI/L4, then this folder must contain `*h5_*.json` files
        if metadata['geo_processing'] == '4' and metadata['sensor'] in ('WFI', 'AWFI'):
            # search for all files that end with `*h5_*.json`
            l4_json_files = sorted(glob(f'{dir_path}/*.h5_*.json'))

            if not l4_json_files:
                report_error(
                    'There is NOT a L4 JSON file (i.e. `*h5_*.json`) in this folder, '
                    'then it will be ignored.'
                )
                return None

            # add the L4 JSON files to the assets dict
            for l4_json_file in l4_json_files:
                # l4_json_file example:
                # '/TIFF/CBERS4A/2020_11/.../4_BC_UTM_WGS84/CBERS_4A_WFI_20201122_217_156.h5_0.json'
                # first get the file name, then get the `h5_N` part from the file name
                # the asset name should be something like `h5_N_json`
                # (e.g. either h5_0_json or h5_1_json)
                asset_name = f"{l4_json_file.split('/')[-1].split('.')[1]}_json"
                assets[asset_name] = {
                    'href': l4_json_file,
                    'type': 'application/json',
                    'roles': ['metadata']
                }

        for band, band_template in assets_matadata.items():
            # search for all TIFF files based on a template with `band_template`
            # for example: search all TIFF files that matches with '/folder/*BAND6.tif'
            tiff_files = sorted(glob(f'{dir_path}/*{band_template}'))

            if not tiff_files:
                # EVI and NDVI files are optional, then if they do not exist, do not report them
                if band in ('evi', 'ndvi'):
                    continue

                report_error(
                    'There is NOT a TIFF file in this folder that ends with the '
                    f'`{band_template}` template, then it will be ignored.'
                )
                return None

            # get just the band name from the template (e.g. `BAND6`)
            band_name = band_template.replace('.tif', '')

            # add TIFF file as an asset
            assets[band_name] = {
                'href': tiff_files[0],
                'type': 'image/tiff; application=geotiff',
                'common_name': band,
                'roles': ['data']
            }

            # evi and ndvi TIFF files have neither an XML nor a JSON file
            if band in ('evi', 'ndvi'):
                continue

            # `quality` band has a JSON file; any other band has an XML file
            if band == 'quality':
                label, extension, media_type = 'a JSON', '.json', 'application/json'
            else:
                label, extension, media_type = 'an XML', '.xml', 'application/xml'

            # search for the companion file based on a template with `band_template`
            # for example: search all XML files that matches with '/folder/*BAND6.xml'
            companion_template = band_template.replace('.tif', extension)
            companion_files = sorted(glob(f'{dir_path}/*{companion_template}'))

            if not companion_files:
                report_error(
                    f'There is NOT {label} file in this folder that ends with the '
                    f'`{companion_template}` template, then it will be ignored.'
                )
                return None

            # add the companion file as an asset (key example: `BAND6_xml`)
            assets[f'{band_name}_{extension[1:]}'] = {
                'href': companion_files[0],
                'type': media_type,
                'roles': ['metadata']
            }

        return assets
예제 #8
0
def create_item_and_get_insert_clauses(dir_path, metadata, assets, df_collections, df_tiles):
    '''Create the items of a folder and build their INSERT clauses.

    Args:
        dir_path: folder the items come from; it is added to the error metadata.
        metadata: metadata passed to `create_items` and to
            `get_tile_id_from_collection`.
        assets: assets passed to `create_items`.
        df_collections: dataframe of collections, searched by the `name` column.
        df_tiles: dataframe of tiles passed to `get_tile_id_from_collection`.

    Returns:
        A tuple `(items_insert, errors_insert)`: the INSERT clauses of the
        items whose collection was found, and the INSERT clauses of the errors
        about collections that were not found (one per collection name).
    '''
    print_line()

    items_insert, errors_insert = [], []

    logger.info(f'dir_path: {dir_path}')
    logger.info(f'metadata: {metadata}')

    # `create_items` returns a list of items (e.g. [dn_item, sr_item])
    for item in create_items(metadata, assets):
        print_line()
        logger.info(f"item[properties]: {item['properties']}")
        logger.info(f"item[collection]: {item['collection']}")

        # select the rows of the collection that has the same name as the item collection
        collection_name = item['collection']['name']
        has_same_name = df_collections['name'] == collection_name
        collection = df_collections.loc[has_same_name].reset_index(drop=True)

        # no rows means that a collection was not found by its name,
        # then save the error and ignore this item
        if collection.shape[0] == 0:
            # beginning of the message, used to check if it has already been added to the list
            sub_message = f"There is metadata to the `{collection_name}` collection"

            # prevent inserting the same message twice
            already_reported = any(sub_message in clause for clause in errors_insert)
            if not already_reported:
                errors_insert.append(
                    PostgreSQLPublisherConnection.create_task_error_insert_clause({
                        'message': (
                            f"{sub_message},"
                            ' however this collection does not exist in the database.'
                        ),
                        'metadata': {'folder': dir_path},
                        'type': 'error'
                    })
                )
            continue

        # the index was reset above, then the first row is at label 0
        collection_id = collection.at[0, 'id']
        tile_id = get_tile_id_from_collection(collection, metadata, df_tiles)

        # create INSERT clause based on item metadata
        item_clause = PostgreSQLCatalogTestConnection.create_item_insert_clause(
            item, collection_id, tile_id
        )
        logger.info(f"Adding an INSERT clause to `{item['properties']['name']}` "
                     "item in the list...\n")
        items_insert.append(item_clause)

    return items_insert, errors_insert