Example #1
    def set_deployment_status(self, input_file, message):
        """
        Write message to the Harvestmission.csv file, which is ingested by the RT pipeline.
        Update the anfog_rt.harvest_listing table after ingestion of the csv file by the RT pipeline.
        Note that to be consistent with the available message in the production DB,
        dashes need to be replaced by underscores, e.g. delayed-mode => delayed_mode
        :return: Harvestmission.csv updated with the deployment-specific status
        """
        name = os.path.basename(input_file)
        deployment = AnfogFileClassifier.get_deployment_code(input_file)
        platform = AnfogFileClassifier.get_platform(name)

        listing_path = os.path.join(self.products_dir,
                                    AnfogFileClassifier.MISSION_LISTING)
        with open(listing_path, 'w') as f:
            f.write('deployment_name, platform_type, status' + os.linesep)
            row = "%s,%s,%s" % (deployment, platform, message.replace(
                '-', '_').lower())
            f.write(row)

        product = PipelineFile(listing_path)
        product.publish_type = PipelineFilePublishType.HARVEST_ONLY
        product.check_type = PipelineFileCheckType.FORMAT_CHECK
        product.dest_path = os.path.join(self.upload_destination,
                                         os.path.basename(listing_path))
        self.file_collection.add(product)
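
A minimal sketch of the listing row this method produces, with hypothetical deployment/platform values; it only illustrates the dash-to-underscore normalisation described in the docstring, not the handler itself.

import os

def format_listing_row(deployment, platform, message):
    # Normalise the status the same way as set_deployment_status: dashes -> underscores, lower case
    status = message.replace('-', '_').lower()
    return "%s,%s,%s" % (deployment, platform, status)

header = 'deployment_name, platform_type, status' + os.linesep
row = format_listing_row('TwoRocks20200110', 'slocum_glider', 'delayed-mode')  # hypothetical values
# row == 'TwoRocks20200110,slocum_glider,delayed_mode'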
Example #2
    def preprocess(self):
        """
        Pre-processing to handle the conversion of BUFR csv files to NetCDF
        :return:
        """
        csv_file = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.CSV)

        if csv_file:
            csv_file = csv_file[0]
            csv_file.publish_type = PipelineFilePublishType.ARCHIVE_ONLY

            profiles = parse_bufr_file(csv_file.src_path)
            profiles = return_unique_profiles(
                profiles)  # check for duplicate profiles within BUFR file
            for profile in profiles:
                profile = fzf_vessel_get_info(
                    profile)  # fuzzy search finder for vessel name
                profile = xbt_line_get_info(
                    profile, self.xbt_line_vocab_url
                )  # get hard coded info per xbt line
                netcdf_filepath = netcdf_writer(
                    profile, self.temp_dir)  # convert BUFR to NetCDF

                # publish
                nc_file = PipelineFile(
                    netcdf_filepath,
                    file_update_callback=self._file_update_callback)
                nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD

                self.file_collection.add(nc_file)
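
For illustration only, deduplication of profiles could look roughly like the sketch below. This is a hypothetical sketch, not the actual return_unique_profiles implementation, and it assumes each profile is a dict carrying 'latitude', 'longitude' and 'date' keys.

def deduplicate_profiles(profiles):
    """Hypothetical sketch: drop profiles that share position and time (assumed dict keys)."""
    seen = set()
    unique = []
    for profile in profiles:
        key = (profile.get('latitude'), profile.get('longitude'), profile.get('date'))
        if key not in seen:
            seen.add(key)
            unique.append(profile)
    return unique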
Example #3
 def schedule_file_removal(self, remote_pipeline_file):
     """schedule a file to be removed."""
     filename = remote_pipeline_file.name
     file_to_remove = PipelineFile(
         filename,
         is_deletion=True,
         dest_path=self.dest_path_function(filename),
     )
     file_to_remove.publish_type = PipelineFilePublishType.DELETE_UNHARVEST
     self.file_collection.add(file_to_remove)
     logger.info(
         NRT_FILE_REMOVAL_MSG.format(file=file_to_remove,
                                     ptype=file_to_remove.publish_type))
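
dest_path_function is assumed here to be the callable the handler uses to map a file name onto its published destination; a hypothetical sketch of such a function (the prefix is illustrative only):

import os

def example_dest_path_function(filename):
    # Hypothetical mapping of an NRT file name to its published key; the real prefix differs per collection
    return os.path.join('IMOS', 'EXAMPLE_FACILITY', 'realtime', filename)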
Example #4
    def preprocess(self):
        """
        Files to be deleted, as found in the 'soop_trv_duplicate_url' WFS layer
        """
        files_to_delete = self.state_query.query_wfs_urls_for_layer(
            'soop_trv_duplicate_url')

        for f in files_to_delete:
            file_to_delete = PipelineFile(
                os.path.basename(f),
                is_deletion=True,
                dest_path=f,
                file_update_callback=self._file_update_callback)
            file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST
            self.file_collection.add(file_to_delete)
Example #5
    def preprocess(self):
        """ Preprocessing for NRT and DM files
           - NRT: generate a NetCDF files based on input text file.
             Set the input file publish_type property to 'archive'
           - DM file collection: update the check_type and publish_type properties for non-NetCDF files.
             These files are not checked or harvested, but uploaded to S3

        """
        if self.custom_params is not None and self.custom_params.get(
                'ship_callsign_ls'):
            self.ship_callsign_ls = self.custom_params['ship_callsign_ls']
        else:
            self.ship_callsign_ls = ship_callsign_list()

        # Delayed mode file submitted as a zip archive
        if self.file_extension == '.zip':
            nc_file = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.NETCDF)
            if len(nc_file) != 1:
                raise InvalidInputFileError(
                    "Expecting one netCDF file in ZIP archive '{zip}'".format(
                        zip=os.path.basename(self.input_file)))

            # first process the NetCDF file to set the destination path for the file collection
            nc = nc_file[0]
            nc.dest_path = self.dest_path(nc.src_path)
            nc_dir_path = os.path.dirname(nc.dest_path)

            # SOOP-CO2 DM and FRMAP .txt, .pdf and/or .xml files.
            # Set check type to FORMAT_CHECK and publish type to UPLOAD_ONLY
            non_nc_files = PipelineFileCollection(
                f for f in self.file_collection
                if f.file_type is not FileType.NETCDF)
            for non_nc in non_nc_files:
                non_nc.check_type = PipelineFileCheckType.FORMAT_CHECK
                non_nc.publish_type = PipelineFilePublishType.UPLOAD_ONLY
                non_nc.dest_path = os.path.join(nc_dir_path, non_nc.name)

        elif self.input_file.endswith('dat.txt'):
            # Single realtime text file (*dat.txt)
            rt_file = self.file_collection[0]
            rt_file.publish_type = PipelineFilePublishType.ARCHIVE_ONLY

            nrt_nc_file_path = soop_co2_nrt_nc_generator.process_co2_rt(
                rt_file, self.products_dir, self.ship_callsign_ls)
            nrt_nc_file = PipelineFile(nrt_nc_file_path)
            self.file_collection.add(nrt_nc_file)
            nrt_nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
Example #6
    def _cleanup_previous_version(self, product_filename):
        """Identify any previously published version(s) of the given product file and mark them for deletion.
        Ignores cases where the previous version has exactly the same file name, as this will simply be overwritten.

        :param product_filename: File name of the newly generated product
        """
        product_type = get_product_type(product_filename)
        for old_product_url in self.old_product_files.get(product_type, []):
            if os.path.basename(old_product_url) != product_filename:
                # Add the previous version as a "late deletion". It will be deleted during the handler's `publish`
                # step after (and only if) all new files have been successfully published.
                old_file = PipelineFile(
                    old_product_url,
                    dest_path=old_product_url,
                    is_deletion=True,
                    late_deletion=True,
                    file_update_callback=self._file_update_callback)
                old_file.publish_type = PipelineFilePublishType.DELETE_UNHARVEST
                self.file_collection.add(old_file)
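
get_product_type groups product files so that only same-type predecessors are removed. A hypothetical sketch of such a helper, assuming the product type is encoded as a token in the file name (the token names are illustrative only):

import re

def get_product_type_sketch(product_filename):
    # Hypothetical: pull a product-type token (e.g. 'hourly-timeseries') out of the file name
    match = re.search(r'_(aggregated-timeseries|hourly-timeseries|gridded-timeseries)_', product_filename)
    return match.group(1) if match else None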
Example #7
    def preprocess(self):
        """Check that every input file is valid according to the include/exclude regex patterns. Any non-matching
        file will be left with publish_type UNSET after the _resolve step.

        If there are any netCDF files from burst-sampling instruments in the collection, create the burst-averaged
        version of each and add them to the collection.

        :return: None
        """
        self.logger.info(
            "Checking for invalid files and adjusting check/publish properties."
        )

        invalid_files = self.file_collection.filter_by_attribute_id(
            'publish_type', PipelineFilePublishType.UNSET)
        if invalid_files:
            raise InvalidFileNameError(
                "File name(s) don't match the pattern expected for this upload location: {names}"
                .format(names=invalid_files.get_attribute_list('name')))

        # Burst-processing for FV01 files with burst-sampling global attributes
        burst_files = (self.file_collection.filter_by_attribute_id(
            'file_type',
            FileType.NETCDF).filter_by_attribute_regex('name', r'.*_FV01_'))
        for f in burst_files:
            with Dataset(f.src_path, mode='r') as D:
                has_interval = hasattr(D, 'instrument_burst_interval')
                has_duration = hasattr(D, 'instrument_burst_duration')
                is_adcp = ('DIST_ALONG_BEAMS' in D.dimensions
                           or 'HEIGHT_ABOVE_SENSOR' in D.dimensions)
            if not (has_interval and has_duration) or is_adcp:
                continue

            self.logger.info("Burst-processing {f.name}".format(f=f))
            product_path = create_burst_average_netcdf(f.src_path,
                                                       self.products_dir)
            product_file = PipelineFile(
                product_path, file_update_callback=self._file_update_callback)
            product_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
            self.file_collection.add(product_file)
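
The published product itself comes from create_burst_average_netcdf. Purely as a conceptual sketch (not that function), burst averaging groups samples into bursts by the sampling interval and averages within each group; the time/value arrays and units below are assumptions of the sketch.

import numpy as np

def burst_average_sketch(times_seconds, values, burst_interval_seconds):
    # Simplified illustration: average the samples that fall within the same burst window
    burst_index = np.floor_divide(times_seconds, burst_interval_seconds).astype(int)
    return np.array([values[burst_index == i].mean() for i in np.unique(burst_index)])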
Example #8
with open(GETFEATURE_OLD_PRODUCTS_FILE) as f:
    TEST_GETFEATURE_OLD_PRODUCTS_JSON = f.read()

with open(GETFEATURE_EMPTY_FILE) as f:
    TEST_GETFEATURE_EMPTY_JSON = f.read()

# Create collection of input files for the products
# These will be uploaded to the mocked equivalent of S3 (where the real input files will be)
features = json.loads(TEST_GETFEATURE_JSON)['features']
INPUT_FILE_COLLECTION = PipelineFileCollection()
for f in features:
    pf = PipelineFile(
            os.path.join(TEST_ROOT, os.path.basename(f['properties']['url'])),
            dest_path=f['properties']['url']
    )
    pf.publish_type = PipelineFilePublishType.UPLOAD_ONLY
    INPUT_FILE_COLLECTION.add(pf)


class TestMooringsProductsHandler(HandlerTestCase):
    def setUp(self):
        self.handler_class = MooringsProductsHandler
        upload_broker = get_storage_broker(self.config.pipeline_config['global']['upload_uri'])
        upload_broker.upload(INPUT_FILE_COLLECTION)
        super().setUp()

    @patch('aodncore.util.wfs.WebFeatureService')
    def test_all_products(self, mock_webfeatureservice):
        mock_webfeatureservice().getfeature().getvalue.side_effect = [TEST_GETFEATURE_JSON,
                                                                      TEST_GETFEATURE_OLD_PRODUCTS_JSON]
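
The side_effect list above makes successive getvalue() calls return the two canned GetFeature responses in order. The same mocking pattern in isolation, using only the standard library:

from unittest.mock import MagicMock

mock_wfs = MagicMock()
mock_wfs().getfeature().getvalue.side_effect = ['first response', 'second response']

assert mock_wfs().getfeature().getvalue() == 'first response'
assert mock_wfs().getfeature().getvalue() == 'second response'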
Example #9
    def preprocess(self):

        # If the input file is a NetCDF, create a .nc.gz and harvest/upload it.
        # Historically, files were always sent as *.nc.gz, but as of April 2021 files might be pushed as *.nc.
        # To be consistent, we transform this .nc into a .nc.gz
        if self.file_type is FileType.NETCDF:
            self.file_collection.set_publish_types(
                PipelineFilePublishType.NO_ACTION)

            gzip_path = os.path.join(self.temp_dir, self.file_basename + '.gz')
            with open(self.input_file,
                      'rb') as f_in, gzip.open(gzip_path, 'wb') as gz_out:
                gz_out.writelines(f_in)

            # publish
            self.add_to_collection(
                gzip_path, publish_type=PipelineFilePublishType.HARVEST_UPLOAD)

        if self.file_type is FileType.GZIP:
            # add the nc.gz file to the collection (not added by default)
            self.file_collection.add(self.input_file_object)
            netcdf_file_gz_collection = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.GZIP)
            netcdf_file_gz = netcdf_file_gz_collection[0]
            netcdf_file_gz.publish_type = PipelineFilePublishType.HARVEST_UPLOAD  # default

            # some GSLA files are gzipped, so gunzip them before checking them;
            # if the uploaded file is a GZIP, check that it contains exactly one NetCDF
            netcdf_collection = self.file_collection.filter_by_attribute_id(
                'file_type', FileType.NETCDF)
            if len(netcdf_collection) != 1:
                raise InvalidInputFileError(
                    "Expecting one netCDF file in GZIP archive '{gzip}'".
                    format(gzip=os.path.basename(self.input_file)))

        netcdf_file_gz = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.GZIP)[0]
        netcdf_file = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)[0]
        # set the dest_path of the gz file based on the gunzipped (NetCDF) file
        netcdf_file_gz.dest_path = self.dest_path(netcdf_file.src_path)
        # Nothing to do with *.nc. Talend can harvest *.nc.gz. Set to NO_ACTION
        netcdf_file.publish_type = PipelineFilePublishType.NO_ACTION

        # we don't know the product type (DM00 or DM01) of the file already on s3,
        # which is needed to deduce its path, so we get the product type from the
        # file in incoming
        result_previous_version_creation_date = self.get_previous_version_creation_date(
            netcdf_file.src_path)
        """ default values
        by default we push to the storage the file landed in the pipeline (ie *.nc.gz) """
        push_new_file = True
        remove_previous_version = False

        # compare creation dates with file already on storage
        if result_previous_version_creation_date:
            new_file_creation_date = get_creation_date(netcdf_file.name)
            if result_previous_version_creation_date > new_file_creation_date:
                push_new_file = False
            elif result_previous_version_creation_date == new_file_creation_date:
                push_new_file = True
            else:
                remove_previous_version = True
                previous_file_path = self.get_previous_version_object(
                    netcdf_file.src_path)

        if push_new_file:
            if GSLA_REGEX_YEARLY.match(netcdf_file.name):
                # yearly file should never be harvested
                netcdf_file_gz.publish_type = PipelineFilePublishType.UPLOAD_ONLY
        else:
            raise InvalidFileNameError(
                "file name: \"{filename}\"  creation date is older than file already on "
                "storage".format(filename=netcdf_file_gz.name))

        # deletion of the previous file
        if remove_previous_version:
            previous_file_name = os.path.basename(previous_file_path)
            file_to_delete = PipelineFile(
                previous_file_name,
                is_deletion=True,
                dest_path=previous_file_path,
                file_update_callback=self._file_update_callback)

            if GSLA_REGEX_YEARLY.match(netcdf_file.name):
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_ONLY
            else:
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST

            self.file_collection.add(file_to_delete)
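
get_creation_date is assumed to parse a creation timestamp out of the GSLA file name. A hypothetical sketch, assuming an IMOS-style '_C-<timestamp>' token (the real filename convention may differ):

import re
from datetime import datetime

def get_creation_date_sketch(filename):
    # Hypothetical: extract a trailing creation timestamp such as '..._C-20210413T032210Z.nc.gz'
    match = re.search(r'_C-(\d{8}T\d{6}Z)', filename)
    if match is None:
        return None
    return datetime.strptime(match.group(1), '%Y%m%dT%H%M%SZ')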
Example #10
    def get_previous_version(self, previous_file_list, path, input_file_name):
        """
            Find previous version of each incoming file based on its type/extension and
            add them to the filecollection with the correct publish type
            extension can be: .inf', '.nc.png','.pitch.csv','.roll.csv',.gps.csv'
            inputs: previous_file_list : dictionary containing file listing(full path) and metadata from destination
                  input_file :  file basename
                   path : full destination path
        """

        if not previous_file_list:
            return

        files_to_delete = PipelineFileCollection()

        try:
            extension = ALLOWED_CONTENT_EXTENSIONS.match(
                input_file_name).groupdict()['extension']
        except (AttributeError, KeyError):  # no regex match, or no 'extension' group in the pattern
            raise ValueError(
                "unable to determine extension from file name {infile}".format(
                    infile=input_file_name))

        # get list of previous file basenames to search through
        basenames = {os.path.basename(f) for f in previous_file_list}

        this_extension_pattern = re.compile(
            r".*\.{ext}$".format(ext=extension))
        if input_file_name not in basenames:
            previous_file = [
                f for f in previous_file_list
                if this_extension_pattern.match(f)
            ]

            if extension == 'nc':
                if len(previous_file) != 1:
                    raise ValueError(
                        "Expected exactly 1 previous versions of the netcdf file, found {n}. Aborting "
                        .format(n=len(previous_file)))
            else:
                # if the uploaded file has the same name as the published file => no action, the file will be
                # overwritten; otherwise, sort files per extension and work out which one to delete.
                # Previous file extensions can be: '.inf', '.nc.png', '.pitch.csv', '.roll.csv', '.gps.csv'
                if len(previous_file) > 1:
                    raise ValueError(
                        "Found more than one previous versions of the extension '{ext}'. Aborting"
                        .format(ext=extension))
                elif len(previous_file) == 0:
                    return

            prev_file = previous_file[0]
            dest_path = os.path.join(path, os.path.basename(prev_file))
            self.logger.info(
                "adding deletion of previous file '{dest_path}'".format(
                    dest_path=dest_path))

            file_to_delete = PipelineFile(prev_file,
                                          is_deletion=True,
                                          dest_path=dest_path)

            if extension == 'nc':
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST
            else:
                file_to_delete.publish_type = PipelineFilePublishType.DELETE_ONLY

            files_to_delete.add(file_to_delete)

        return files_to_delete
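
ALLOWED_CONTENT_EXTENSIONS is assumed to be a compiled regex with a named 'extension' group. A hypothetical equivalent covering the extensions listed in the docstring (the real pattern may differ):

import re

ALLOWED_CONTENT_EXTENSIONS_SKETCH = re.compile(
    r'.*\.(?P<extension>nc|inf|nc\.png|pitch\.csv|roll\.csv|gps\.csv)$')

match = ALLOWED_CONTENT_EXTENSIONS_SKETCH.match('transect_0001.pitch.csv')  # hypothetical file name
extension = match.groupdict()['extension'] if match else None  # 'pitch.csv'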
Example #11
    def preprocess(self):
        if self.custom_params is not None and self.custom_params.get(
                'ship_callsign_ls'):
            self.ship_callsign_ls = self.custom_params['ship_callsign_ls']
        else:
            self.ship_callsign_ls = ship_callsign_list()

        if SHIP_CODE not in self.ship_callsign_ls:
            raise RuntimeError(
                "Missing vessel callsign {callsign} from vocabulary.".format(
                    callsign=SHIP_CODE))

        self.soop_tmv_dir = os.path.join(
            'IMOS', 'SOOP', 'SOOP-TMV', '{ship_code}_{ship_name}'.format(
                ship_code=SHIP_CODE,
                ship_name=self.ship_callsign_ls[SHIP_CODE]), 'realtime')

        txt_files = self.file_collection.filter_by_attribute_value(
            'extension', '.txt')
        log_files = self.file_collection.filter_by_attribute_value(
            'extension', '.log')
        nc_files = self.file_collection.filter_by_attribute_id(
            'file_type', FileType.NETCDF)
        """
        * 10secs zip files (*.log + *.txt [calibration]) -> *.zip is pushed to ARCHIVE_DIR
                                                            (netcdf still needs to be generated to deduce path).
                                                            *.log, *.txt and *.nc NOT added to the collection
        * 1sec zip files (*.log only) -> *.log & *.nc pushed to S3. *.zip not added to the collection
        """

        if len(nc_files):
            # case where we re-push an existing NetCDF file
            f_nc = nc_files[0]
            f_nc.publish_type = PipelineFilePublishType.HARVEST_UPLOAD

        elif len(log_files):
            f_log = log_files[0]
            log_filename = os.path.basename(f_log.src_path)

            if SOOP_NRT_LOG_PATTERN.match(log_filename) is None:
                raise InvalidFileNameError(
                    "SOOP TMV NRT input logfile has incorrect naming '{name}'."
                    .format(name=log_filename))

            # case to create NetCDF file from log file
            f_txt = None
            if len(txt_files):
                f_txt = txt_files[0]
                netcdf_filepath = netcdf_writer(
                    f_log.src_path,
                    self.temp_dir,
                    self.ship_callsign_ls[SHIP_CODE],
                    meta_path=f_txt.src_path)
            else:
                netcdf_filepath = netcdf_writer(
                    f_log.src_path, self.temp_dir,
                    self.ship_callsign_ls[SHIP_CODE])

            # the path of logs and zips has to be deduced within the pre-process, as it requires creating a NetCDF to
            # get the correct info
            with Dataset(netcdf_filepath) as nc_open:
                measurement_frequency = nc_open.measurement_frequency
                product_type = nc_open.product_type
                year = datetime.strptime(nc_open.time_coverage_start,
                                         '%Y-%m-%dT%H:%M:%SZ').strftime("%Y")

            pre_path = os.path.join(self.soop_tmv_dir, product_type,
                                    measurement_frequency, year)

            if measurement_frequency == "1sec":
                f_log.publish_type = PipelineFilePublishType.UPLOAD_ONLY
                f_log.dest_path = os.path.join(pre_path, 'logs', f_log.name)
                nc_file = PipelineFile(netcdf_filepath)
                nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
                self.file_collection.add(nc_file)

            elif measurement_frequency == "10secs":
                if self.input_file.endswith('zip'):
                    self.input_file_object.publish_type = PipelineFilePublishType.ARCHIVE_ONLY
                    self.input_file_object.archive_path = os.path.join(
                        pre_path, 'logs', self.input_file_object.name)
                    self.file_collection.add(self.input_file_object)
                    f_log.publish_type = PipelineFilePublishType.NO_ACTION
                    if f_txt:
                        f_txt.publish_type = PipelineFilePublishType.NO_ACTION
                else:
                    # case when a 10secs log file (and not a zip) is pushed to incoming
                    f_log.publish_type = PipelineFilePublishType.ARCHIVE_ONLY
                    f_log.archive_path = os.path.join(pre_path, 'logs',
                                                      f_log.name)
Example #12
    def delete_previous_version(self, mode, deployment_status):
        """
           In RT mode: 2 cases 1) update of deployment in progress :select previous version of a file that needs to be
                       deleted (.nc)  other files are automatically overwritten
                               2) cleaning RT folder :status "renamed","clear-files"

           In DM mode either - new DM (deployment_status = delayed_mode): delete RT deployment files
                            or
                            - update DM (deployment_status = update):: delete previous .nc,
                            other files are automatically overwritten)
         """

        if mode == 'DM' and deployment_status == 'delayed_mode':
            # RT and DM folder hierarchies are similar, except that RT has an additional /REALTIME/ level
            destination = AnfogFileClassifier.make_rt_path(
                self.upload_destination)
            delete_file_regex = '%s|%s|%s' % (
                AnfogFileClassifier.ANFOG_RT_REGEX,
                AnfogFileClassifier.RT_PNG_REGEX,
                AnfogFileClassifier.RT_POSITION_SUMMARY)
        elif mode == 'DM' and deployment_status == 'update':
            destination = self.upload_destination
            delete_file_regex = AnfogFileClassifier.ANFOG_DM_REGEX
        elif mode == 'RT' and deployment_status in ['renamed', 'clear-files']:
            destination = self.upload_destination
            delete_file_regex = '%s|%s|%s' % (
                AnfogFileClassifier.ANFOG_RT_REGEX,
                AnfogFileClassifier.RT_PNG_REGEX,
                AnfogFileClassifier.RT_POSITION_SUMMARY)
        elif mode == 'RT' and deployment_status == 'in_progress':
            destination = self.upload_destination
            delete_file_regex = AnfogFileClassifier.ANFOG_RT_REGEX

        else:
            raise ValueError(
                "Invalid combination of mode '{mode}' and status'{st}'".format(
                    mode=mode, st=deployment_status))

        # publish_type set according to file type: NetCDF => DELETE_UNHARVEST,
        # everything else (png, txt, kml, ...) => DELETE_ONLY
        previous_file_list = self.state_query.query_storage(destination).keys()
        self.logger.info("Mode '{mode}' and Status '{status}'".format(
            mode=mode, status=deployment_status))

        for filename in previous_file_list:
            previous_file = PipelineFile(filename,
                                         is_deletion=True,
                                         dest_path=destination)
            previous_file.dest_path = os.path.join(destination,
                                                   previous_file.name)
            # set default publish type to delete only
            previous_file.publish_type = PipelineFilePublishType.DELETE_ONLY
            if previous_file.file_type is FileType.NETCDF:
                previous_file.publish_type = PipelineFilePublishType.DELETE_UNHARVEST

            if re.match(delete_file_regex, previous_file.name):
                self.file_collection.add(previous_file)

                if deployment_status not in [
                        'renamed', 'clear-files'
                ] and re.match(previous_file.name,
                               os.path.basename(self.primary_nc.src_path)):
                    # remove the file from the collection as it will be overwritten. Note that this test is not
                    # applicable in 'renamed' status because primary_nc is not set (irrelevant there)
                    self.file_collection.discard(previous_file)
                else:
                    self.logger.info(
                        "adding deletion of previous file '{dest_path}'".
                        format(dest_path=previous_file.dest_path))
Example #13
 def _add_to_collection(self, product_url):
     """Add a new product file to the file_collection to be harvested and uploaded."""
     product_file = PipelineFile(
         product_url, file_update_callback=self._file_update_callback)
     product_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
     self.file_collection.add(product_file)
Example #14
 def preprocess(self):
     jpg_output_path, jpg_output_dest_path = create_plot(self.file_collection[0].src_path, self.temp_dir)
     jpg_file = PipelineFile(jpg_output_path, dest_path=jpg_output_dest_path)
     jpg_file.publish_type = PipelineFilePublishType.UPLOAD_ONLY
     self.file_collection.add(jpg_file)