def _get_product_level(cls, input_file):
    """Determine the product level of the file, i.e. either 'non-QC' (FV00), 'burst-averaged' or
    'gridded' (FV02 products), or empty for FV01 files.
    """
    name_field = cls._get_file_name_fields(input_file)

    if cls._get_data_category(input_file) == 'CO2':
        if 'realtime' in name_field[6]:
            return 'real-time'
        elif 'delayed' in name_field[6]:
            return 'delayed'
        else:
            raise InvalidFileNameError(
                "Unknown CO2 file type '{input_file}'".format(input_file=input_file))

    if name_field[5] == 'FV00':
        return 'non-QC'

    if name_field[5] == 'FV02':
        if len(name_field) < 7:
            raise InvalidFileNameError(
                "Can't determine product type from file name '{name}'".format(name=input_file)
            )
        if 'burst-averaged' in name_field[6]:
            return 'burst-averaged'
        if 'gridded' in name_field[6]:
            return 'gridded'

    return ''
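# The sketch below is a minimal, self-contained illustration (not the classifier itself) of how an
# IMOS-style file name splits into underscore-separated fields, with the FV code at index 5 and a
# product descriptor at index 6; the example file name is hypothetical.
import os

def _example_product_level(file_name):
    fields = os.path.splitext(os.path.basename(file_name))[0].split('_')
    if fields[5] == 'FV00':
        return 'non-QC'
    if fields[5] == 'FV02' and len(fields) > 6:
        if 'burst-averaged' in fields[6]:
            return 'burst-averaged'
        if 'gridded' in fields[6]:
            return 'gridded'
    return ''

# e.g. _example_product_level('IMOS_DWM-SOTS_T_20200101_SOFS_FV02_burst-averaged_END-20200201.nc')
# -> 'burst-averaged'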
def dest_path(self, src_file):
    dir_list = []

    project = def_project(src_file)
    if project not in VALID_PROJECT:
        raise InvalidFileNameError(
            "Invalid project name '{project}'. "
            "Project should be IMOS, SOOP-CO2_RT or Future_Reef_MAP".format(project=project))

    if project in ['IMOS', 'SOOP-CO2_RT']:
        fields = FileClassifier._get_file_name_fields(src_file)
        ship_code = fields[4]
        if ship_code not in self.ship_callsign_ls:
            raise InvalidFileNameError(
                "Missing vessel callsign in file name '{name}'.".format(name=src_file))

        project_base = 'IMOS'
        facility = fields[1][:4]
        sub_facility = fields[1]
        platform = "{ship_code}_{ship_name}".format(
            ship_code=ship_code, ship_name=self.ship_callsign_ls[ship_code])
        dir_list.extend([project_base, facility, sub_facility, platform])

    if project == 'FutureReefMap':
        fields = FileClassifier._get_file_name_fields(src_file, min_fields=5)
        ship_code = fields[3]
        if ship_code not in self.ship_callsign_ls:
            raise InvalidFileNameError(
                "Missing vessel callsign in file name '{name}'.".format(name=src_file))

        dir_list.append('Future_Reef_MAP')
        data_type = 'underway'
        dir_list.extend([data_type, self.ship_callsign_ls[ship_code]])

    if project in ['IMOS', 'FutureReefMap']:
        att_list = FileClassifier._get_nc_att(src_file, ['cruise_id', 'time_coverage_start'])
        year = att_list[1][:4]
        cruise_id = att_list[0]
        dir_list.extend([year, cruise_id])

    if project == 'SOOP-CO2_RT':
        data_type = 'REALTIME'
        time_start = FileClassifier._get_nc_att(src_file, 'time_coverage_start')
        year = time_start[:4]
        month = time_start[5:7]
        month = month.lstrip('0')
        dir_list.extend([data_type, year, month])

    dir_path = FileClassifier._make_path(dir_list)
    return os.path.join(dir_path, os.path.basename(src_file))
def process(self):
    """Handle a zip file containing images and no NetCDF files. In this case we just want to
    publish the zip file itself, not the individual images. If we encounter a "mixed" zip file
    with images and netCDF files, we're just going to give up, for now.
    """
    images = PipelineFileCollection(f for f in self.file_collection if f.file_type.is_image_type)
    netcdfs = self.file_collection.filter_by_attribute_id('file_type', FileType.NETCDF)

    is_zip = self.file_type is FileType.ZIP
    have_images = len(images) > 0
    have_netcdfs = len(netcdfs) > 0

    if is_zip and have_images:
        if have_netcdfs:
            raise InvalidFileContentError(
                "Zip file contains both images and netCDFs. Don't know what to do!"
                " They are handled differently, so please upload only one at a time."
            )
        if not DwmFileClassifier.SOTS_IMAGES_ZIP_PATTERN.match(self.file_basename):
            raise InvalidFileNameError(
                "Zip file contains images, but its name does not match pattern for images zip file "
                "(regular expression '{p}')".format(p=DwmFileClassifier.SOTS_IMAGES_ZIP_PATTERN.pattern))

        self.logger.info(
            "Zip file contains images and no netCDF files. "
            "Publishing original zip file instead of its contents.")
        self.file_collection.set_publish_types(PipelineFilePublishType.NO_ACTION)
        self.input_file_object.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
        self.file_collection.add(self.input_file_object)
def get_deployment_code(cls, src_path):
    """Get the deployment code, depending on the data mode:
    DM   : read the deployment code directly from the netCDF 'deployment_code' global attribute
    DSTG : no deployment_code attribute, extract the deployment code from the title instead
    RT   : extract the deployment code from the title
    """
    name = os.path.basename(src_path)
    if re.match(cls.DSTG_REGEX, name) or re.match(cls.ANFOG_RT_REGEX, name):
        title = cls._get_nc_att(src_path, 'title')
        deployment_code = title.split()[-1]
        if deployment_code == 'mission':
            raise InvalidFileContentError(
                "Missing deployment code in {file} ".format(file=name))
    elif re.match(cls.ANFOG_NC_REGEX, name) or re.match(cls.ADAPTER_REGEX, name):
        deployment_code = cls._get_nc_att(src_path, 'deployment_code')
    elif name.endswith('.txt'):
        # extract deployment code from a file name like SL-Yamba20180609_completed.txt
        field = name.split('_')
        deployment_code = field[0].split('-')[1]
    else:
        raise InvalidFileNameError("Invalid file name {file} ".format(file=name))

    return deployment_code
def dest_path_aatams_sattag_qc_ctd(filepath):
    with Dataset(filepath, mode='r') as nc_obj:
        try:
            deployment_code = nc_obj.deployment_code
        except AttributeError:
            raise InvalidFileNameError(
                'deployment_code attribute not found in NetCDF file to deduce path')

    # the deployment code should match the start of the NetCDF file name
    netcdf_filename = os.path.basename(filepath)
    if deployment_code != netcdf_filename[0:len(deployment_code)]:
        raise InvalidFileNameError(
            'deployment_code attribute does not match the start of the NetCDF file name to deduce path')

    return os.path.join(AATAMS_MEOP_DIR, deployment_code, netcdf_filename)
def dest_path(cls, input_file):
    """
    Destination object path for a DWM file. Of the form:
      'IMOS/DWM/DA/<platform_code>/<data_category>/<product_level>'
    or
      'IMOS/DWM/SOTS/<year_of_deployment>/<product_type>'
    or
      'IMOS/DWM/SOTS/images'
    where
    <platform_code> is the value of the platform_code global attribute
    <data_category> is a broad category like 'Temperature', 'CTD_profiles', etc...
    <product_level> is
     - 'non-QC' for FV00 files
     - empty for FV01 files
    <year_of_deployment> is the year in which the deployment started
    <product_type> is
     - 'real-time';
     - empty (for delayed mode data)

    The basename of the input file is appended.
    """
    dir_list = [cls.PROJECT, cls.FACILITY]
    input_file_basename = os.path.basename(input_file)

    # deal with image zip files first, as they're simpler
    if cls.SOTS_IMAGES_ZIP_PATTERN.match(input_file_basename):
        dir_list.extend(['SOTS', 'images', input_file_basename])
        return cls._make_path(dir_list)

    fac, subfac = cls._get_facility(input_file)
    is_asfs_and_rt = subfac == 'ASFS' and cls._is_realtime(input_file)

    if subfac == 'DA':
        dir_list.append(subfac)
        dir_list.append(cls._get_nc_att(input_file, 'platform_code'))
        dir_list.append(cls._get_data_category(input_file))
        dir_list.append(cls._get_product_level(input_file))
    elif is_asfs_and_rt:
        # real-time files with old names not migrated yet
        cat = cls._get_old_data_category(input_file)
        start_time = cls._get_nc_att(input_file, 'time_coverage_start', time_format=True)
        rt_folder_name = '{}_daily'.format(start_time.year)
        dir_list += ['ASFS', 'SOFS', cat, 'Real-time', rt_folder_name]
    elif subfac in ('SOTS', 'ASFS'):
        dir_list.append('SOTS')
        dir_list.append(cls._get_deployment_year(input_file))
        if cls._is_realtime(input_file):
            dir_list.append('real-time')
    else:
        raise InvalidFileNameError(
            "Unknown DWM sub-facility '{subfac}' for file '{input_file}'".format(
                subfac=subfac, input_file=input_file)
        )

    dir_list.append(input_file_basename)
    return cls._make_path(dir_list)
def dest_path(filepath):
    filepath = re.sub('_C-.*$', '.nc', filepath)  # strip creation date from filepath if it exists
    netcdf_filename = os.path.basename(filepath)

    m = re.search(
        r'^IMOS_SRS-OC_F_([0-9]{8}T[0-9]{6}Z)_(.*)_FV0([0-2]{1})_DALEC_.*\.nc$',
        netcdf_filename)
    if m is None:
        raise InvalidFileNameError("file name not matching regex to deduce dest_path")

    platform_code = m.group(2)
    file_version_code = 'FV0%s' % m.group(3)

    ships_dic = ship_callsign_list()
    if platform_code in ships_dic:
        vessel_name = ships_dic[platform_code]
    else:
        raise InvalidFileNameError(
            "Vessel name not known '{name}'".format(name=platform_code))

    if file_version_code not in ("FV00", "FV01", "FV02"):
        raise InvalidFileNameError(
            "File_version code is unknown for '{name}'".format(name=filepath))

    year = datetime.strptime(m.group(1), '%Y%m%dT%H%M%SZ').strftime("%Y")
    relative_netcdf_path = os.path.join('IMOS', 'SRS', 'OC', 'radiometer',
                                        '%s_%s' % (platform_code, vessel_name), year)

    if file_version_code == "FV02":
        relative_netcdf_path = os.path.join(relative_netcdf_path, 'fv02-products', netcdf_filename)
    else:
        relative_netcdf_path = os.path.join(relative_netcdf_path, netcdf_filename)

    return relative_netcdf_path
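# Minimal, self-contained illustration of the DALEC file-name regex used above; the example file
# name (and the 'VNAH' platform code in it) is hypothetical.
import re
from datetime import datetime

_example_name = 'IMOS_SRS-OC_F_20200102T034500Z_VNAH_FV01_DALEC_END-20200102T054500Z.nc'
_m = re.search(
    r'^IMOS_SRS-OC_F_([0-9]{8}T[0-9]{6}Z)_(.*)_FV0([0-2]{1})_DALEC_.*\.nc$', _example_name)
# _m.group(1) -> '20200102T034500Z' (time coverage start)
# _m.group(2) -> 'VNAH'             (platform code / vessel callsign)
# _m.group(3) -> '1'                (file version digit, i.e. FV01)
# datetime.strptime(_m.group(1), '%Y%m%dT%H%M%SZ').strftime('%Y') -> '2020'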
def dest_path_srs_oc_ljco_aeronet(filepath):
    file_basename = os.path.basename(filepath)
    if file_basename == VALID_FILENAME:
        return os.path.join(PREFIX_PATH, file_basename)
    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not equal to {valid} in order to deduce dest_path".format(
                filename=os.path.basename(filepath), valid=VALID_FILENAME))
def _get_data_category(cls, input_file):
    if 'aggregated-timeseries' in input_file:
        return 'aggregated_timeseries'
    elif 'hourly-timeseries' in input_file:
        return 'hourly_timeseries'
    elif 'gridded-timeseries' in input_file:
        return 'gridded_timeseries'
    else:
        raise InvalidFileNameError(
            "Could not determine data category from {name}".format(name=input_file))
def dest_path_aodn_wave_dm(filepath):
    file_basename = os.path.basename(filepath)

    with Dataset(filepath, mode='r') as nc_obj:
        site_name = nc_obj.site_name

    if BOM_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(BOM_DIR, WAVERIDER_DIR, DELAYED_DIR)
        product_dir = site_name.replace(' ', '_')
    elif DES_QLD_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(DES_QLD_DIR, WAVERIDER_DIR, DELAYED_DIR)
        fields = get_pattern_subgroups_from_string(file_basename, DES_QLD_WAVERIDER)
        product_dir = fields['site_code']
    elif DOT_WA_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(DOT_WA_DIR, WAVERIDER_DIR, DELAYED_DIR)
        fields = get_pattern_subgroups_from_string(file_basename, DOT_WA_WAVERIDER)
        product_dir = os.path.join(site_name.replace(' ', '_'), fields['site_code'])
    elif MHL_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(MHL_DIR_BASE, MHL_DIR, MHL_WAVERIDER_DIR)
        product_dir = site_name.replace(' ', '_')
    elif DOT_WA_AWAC.match(file_basename):
        data_base_dir = os.path.join(DOT_WA_DIR, AWAC_DIR, DELAYED_DIR)
        fields = get_pattern_subgroups_from_string(file_basename, DOT_WA_AWAC)
        product_dir = fields['site_code']
    elif DTA_NZ_WAVERIDER.match(file_basename):
        data_base_dir = os.path.join(DTA_NZ_DIR, WAVERIDER_DIR, DELAYED_DIR)
        if 'Wave Rider Buoy' not in site_name:
            raise InvalidFileContentError(
                "file name: \"{filename}\"; global attribute site_name does not contain "
                "'Wave Rider Buoy' string to deduce path".format(filename=file_basename))
        product_dir = site_name.replace('Wave Rider Buoy', '').strip().replace(' ', '_')
    elif NTP_WAVE.match(file_basename):
        data_base_dir = os.path.join(NTP_WAVE_DIR, WAVERIDER_DIR, DELAYED_DIR)
        if len(site_name) == 0:
            raise InvalidFileContentError(
                "file name: \"{filename}\"; global attribute site_name is empty".format(
                    filename=file_basename))
        product_dir = site_name
    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce path".format(
                filename=file_basename))

    return os.path.join(data_base_dir, product_dir, os.path.basename(filepath))
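# Minimal illustration of the site_name -> product_dir string transformations used above; the
# example site name is hypothetical.
_example_site = 'Banks Peninsula Wave Rider Buoy'
_example_product_dir = _example_site.replace('Wave Rider Buoy', '').strip().replace(' ', '_')
# _example_product_dir -> 'Banks_Peninsula'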
def dest_path_cars(filepath):
    pattern = r'CARS(\d+)_.*\.nc'
    try:
        year = re.search(pattern, filepath).group(1)
    except AttributeError:
        raise InvalidFileNameError(
            "invalid file name {filepath}. Not matching '{pattern}'".format(
                filepath=filepath, pattern=pattern))

    return os.path.join(
        "CSIRO/Climatology/CARS/{year}/AODN-product/{basename}".format(
            year=year, basename=os.path.basename(filepath)))
def archive_path(self, src_file):
    """
    Generate the archive path for a RT file based on its vessel code,
    e.g. IN_2017-165-0000dat.txt -> <vessel_code>_yyyy-ddd-hhmmdat.txt

    :return: relative archive path, including the file name, e.g.
        'IMOS/SOOP/SOOP-CO2/VLMJ_Investigator/REALTIME/2018/1/IN_2018-022-0000dat.txt'
    """
    dir_list = []
    project = 'IMOS'
    facility = 'SOOP'
    sub_facility = 'SOOP-CO2'
    data_type = 'REALTIME'
    dir_list.extend([project, facility, sub_facility])

    fields = FileClassifier._get_file_name_fields(os.path.basename(src_file), min_fields=2)
    if fields[0] in VESSEL_CODE:
        ship_code = VESSEL_CODE[fields[0]]
    else:
        raise InvalidFileNameError(
            "File {file} has an invalid vessel code or is not a valid SOOP-CO2 realtime file".format(
                file=os.path.basename(src_file)))

    platform = "{ship_code}_{ship_name}".format(
        ship_code=ship_code, ship_name=self.ship_callsign_ls[ship_code])
    dir_list.extend([platform, data_type])

    year = int(fields[1][:4])
    dir_list.append(year)

    jday = int(fields[1][5:8])
    if not (jday in range(0, 367)) or year < 2017:
        raise InvalidFileNameError(
            "Failed extracting valid [year, day] from file {file}".format(
                file=os.path.basename(src_file)))

    # determine month from julian day (1-366), taking leap years into account
    year_to_ordinal = datetime.date(year, 1, 1).toordinal() + jday - 1
    month = datetime.date.fromordinal(year_to_ordinal).month
    dir_list.append(month)

    dir_list.append(os.path.basename(src_file))
    archive_file_path = FileClassifier._make_path(dir_list)
    return archive_file_path
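# Minimal, self-contained illustration of the julian-day-to-month conversion used above, showing
# how leap years are handled by going through the proleptic ordinal; the values are examples only.
import datetime

def _example_month_from_jday(year, jday):
    ordinal = datetime.date(year, 1, 1).toordinal() + jday - 1
    return datetime.date.fromordinal(ordinal).month

# _example_month_from_jday(2019, 60) -> 3   (day 60 of 2019 is 1 March)
# _example_month_from_jday(2020, 60) -> 2   (day 60 of 2020 is 29 February, a leap year)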
def get_type(filepath):
    """Return acorn_file_type, the file type of an ACORN file based on its filename"""
    file_basename = os.path.basename(filepath)
    unknown_product = False

    if ACORN_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, ACORN_FILE_PATTERN)
        product_type = fields['product_type']
        file_version = fields['file_version']
        platform_code = fields['platform_code']

        if product_type == 'radial' and file_version == 'FV00':
            acorn_file_type = "radial"
        elif product_type == 'radial' and file_version == 'FV01':
            acorn_file_type = "radial_quality_controlled"
        elif product_type == 'sea-state' and file_version == 'FV00':
            acorn_file_type = "vector"
        elif product_type == 'wavespec' and file_version == 'FV01':
            acorn_file_type = "gridded_1h-avg-wave-spectra_QC"
        elif product_type == 'windp' and file_version == 'FV01':
            acorn_file_type = "gridded_1h-avg-wind-map_QC"
        elif product_type == 'wavep' and file_version == 'FV01':
            site_map_station = ['CBG', 'SAG', 'ROT', 'COF']
            if any(s == platform_code for s in site_map_station):
                acorn_file_type = "gridded_1h-avg-wave-site-map_QC"
            else:
                acorn_file_type = "gridded_1h-avg-wave-station-map_QC"
        elif product_type == '1-hour-avg' and file_version == 'FV00':
            acorn_file_type = "gridded_1h-avg-current-map_non-QC"
        elif product_type == '1-hour-avg' and file_version == 'FV01':
            acorn_file_type = "gridded_1h-avg-current-map_QC"
        else:
            unknown_product = True
    else:
        unknown_product = True

    if unknown_product:
        raise InvalidFileNameError(
            "file name: \"{filename}\" Unknown product type from filename".format(
                filename=file_basename))

    return acorn_file_type
def preprocess(self): """Check that every input file is valid according to the include/exclude regex patterns. Any non-matching file will be left with publish_type UNSET after the _resolve step. :return: None """ self.logger.info("Checking for invalid files.") invalid_files = self.file_collection.filter_by_attribute_id( 'publish_type', PipelineFilePublishType.UNSET) if invalid_files: raise InvalidFileNameError( "File name(s) don't match the pattern expected for this upload location: {names}" .format(names=invalid_files.get_attribute_list('name')))
def get_gsla_type(filepath):
    """
    :return: GSLA file type
    """
    file_basename = os.path.basename(filepath)

    if GSLA_REGEX.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, GSLA_REGEX)
        return fields['product_type']
    elif GSLA_REGEX_YEARLY.match(file_basename):
        return os.path.join(get_product_type(filepath), 'yearfiles')
    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce dest_path".format(
                filename=file_basename))
def get_product_type(file_path):
    """Return a product type label for the given file (extracted from the file name).
    For example "PSAL-aggregated-timeseries", or "hourly-timeseries".

    :param file_path: str path or name of file
    :returns: str product type label
    """
    file_name = os.path.basename(file_path)
    name_match = PRODUCT_TYPE_PATTERN.search(file_name)
    if not name_match:
        raise InvalidFileNameError(
            "Could not extract product type from '{file_name}'".format(file_name=file_name))
    return name_match.group(1)
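# Hypothetical sketch of how get_product_type is expected to behave; PRODUCT_TYPE_PATTERN is not
# shown in this module, so the regex and file name below are assumed stand-ins, not the real
# definitions.
import re

_EXAMPLE_PRODUCT_TYPE_PATTERN = re.compile(r'_([A-Za-z-]+-timeseries)_')  # assumption only

_example_match = _EXAMPLE_PRODUCT_TYPE_PATTERN.search(
    'IMOS_ANMN-NRS_PSAL-aggregated-timeseries_NRSMAI_FV01.nc')
# _example_match.group(1) -> 'PSAL-aggregated-timeseries'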
def get_creation_date(filepath):
    """
    :return: creation date
    """
    file_basename = os.path.basename(filepath)

    if GSLA_REGEX.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, GSLA_REGEX)
    elif GSLA_REGEX_YEARLY.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, GSLA_REGEX_YEARLY)
    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce creation_date".format(
                filename=file_basename))

    return datetime.strptime(fields['creation_date'], '%Y%m%dT%H%M%SZ')
def dest_path(filepath):
    sstaars_alt_dir = os.path.join('CSIRO', 'Climatology', 'SSTAARS', '2017')
    sstaars_aodn_dir = os.path.join(sstaars_alt_dir, 'AODN-product')
    netcdf_file_name = os.path.basename(filepath)

    regex_daily_files = re.compile(r'SSTAARS_daily_fit_[0-9]{3}\.nc')

    if netcdf_file_name == 'SSTAARS.nc':
        return os.path.join(sstaars_alt_dir, netcdf_file_name)
    elif (netcdf_file_name == 'SSTAARS_daily_fit.nc') or re.match(regex_daily_files, netcdf_file_name):
        return os.path.join(sstaars_aodn_dir, netcdf_file_name)
    else:
        raise InvalidFileNameError(
            r"invalid file name {filepath}. Not matching 'SSTAARS.*\.nc'".format(filepath=filepath))
def dest_path(filepath):
    file_basename = os.path.basename(filepath)

    # NON CONTRIBUTED DATA SET
    if IMOS_OC_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, IMOS_OC_FILE_PATTERN)
        nc_time_cov_start = datetime.strptime(fields['nc_time_cov_start'], '%Y%m%dT%H%M%SZ')

        data_parameter_code = fields['data_parameter_code']
        if data_parameter_code == 'A':
            product_name = 'aqua'
        elif data_parameter_code == 'S':
            product_name = 'seawifs'
        elif data_parameter_code == 'V':
            product_name = 'viirs'

        path = os.path.join(OC_GRIDDED_PREFIX_PATH, product_name,
                            fields['time_coverage_resolution'],
                            '%d' % nc_time_cov_start.year,
                            '%02d' % nc_time_cov_start.month,
                            file_basename)
        return path

    # CONTRIBUTED DATA SET
    elif RJOHNSON_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, RJOHNSON_FILE_PATTERN)
        data_parameter_code = fields['data_parameter_code']
        time_coverage_resolution = fields['time_coverage_resolution']

        if data_parameter_code == 'A':
            product_name = 'aqua'
        elif data_parameter_code == 'S':
            product_name = 'seawifs'

        if time_coverage_resolution == '8D':
            time_cov = '8d'
        elif time_coverage_resolution == 'MO':
            time_cov = '1m'

        return os.path.join(OC_GRIDDED_PREFIX_PATH, 'contributed', 'SO-Johnson', 'chl',
                            time_cov, product_name, file_basename)
    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce dest_path".format(
                filename=file_basename))
def preprocess(self): """Check that every input file is valid according to the include/exclude regex patterns. Any non-matching file will be left with publish_type UNSET after the _resolve step. If there are any netCDF files from burst-sampling instruments in the collection, create the burst-averaged version of each and add them to the collection. :return: None """ self.logger.info( "Checking for invalid files and adjusting check/publish properties." ) invalid_files = self.file_collection.filter_by_attribute_id( 'publish_type', PipelineFilePublishType.UNSET) if invalid_files: raise InvalidFileNameError( "File name(s) don't match the pattern expected for this upload location: {names}" .format(names=invalid_files.get_attribute_list('name'))) # Burst-processing for FV01 files with burst-sampling global attributes burst_files = (self.file_collection.filter_by_attribute_id( 'file_type', FileType.NETCDF).filter_by_attribute_regex('name', r'.*_FV01_')) for f in burst_files: with Dataset(f.src_path, mode='r') as D: has_interval = hasattr(D, 'instrument_burst_interval') has_duration = hasattr(D, 'instrument_burst_duration') is_adcp = ('DIST_ALONG_BEAMS' in D.dimensions or 'HEIGHT_ABOVE_SENSOR' in D.dimensions) if not (has_interval and has_duration) or is_adcp: continue self.logger.info("Burst-processing {f.name}".format(f=f)) product_path = create_burst_average_netcdf(f.src_path, self.products_dir) product_file = PipelineFile( product_path, file_update_callback=self._file_update_callback) product_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD self.file_collection.add(product_file)
def dest_path_soop_ba(src_file):
    dir_list = []

    fields = FileClassifier._get_file_name_fields(src_file.name)
    ship_code = fields[4]
    ship_callsign_ls = ship_callsign_list()
    if ship_code not in ship_callsign_ls:
        raise InvalidFileNameError(
            "Missing vessel callsign in file name '{name}'.".format(name=src_file.name))

    project = fields[0]
    facility = fields[1][:4]
    sub_facility = fields[1]
    platform = "{ship_code}_{ship_name}".format(
        ship_code=ship_code, ship_name=ship_callsign_ls[ship_code])
    dir_list.extend([project, facility, sub_facility, platform])

    deployment_id = get_deployment_id(src_file, ship_code)
    dir_list.append(deployment_id)

    return FileClassifier._make_path(dir_list)
def archive_path_soop_ba(src_file):
    """Define the archive path based on info from NetCDF"""
    dir_list = []

    fields = FileClassifier._get_file_name_fields(src_file.name)
    ship_code = fields[4]
    ship_callsign_ls = ship_callsign_list()
    if ship_code not in ship_callsign_ls:
        raise InvalidFileNameError(
            "Missing vessel callsign in file name '{name}'.".format(name=src_file.name))

    project = fields[0]
    facility = fields[1][:4]
    sub_facility = fields[1]
    raw_folder = 'raw'
    platform = "{ship_code}_{ship_name}".format(
        ship_code=ship_code, ship_name=ship_callsign_ls[ship_code])
    dir_list.extend([project, facility, sub_facility, raw_folder, platform])

    deployment_id = get_deployment_id(src_file, ship_code)
    dir_list.append(deployment_id)

    return FileClassifier._make_path(dir_list)
def get_info_nc(filepath):
    file_basename = os.path.basename(filepath)

    if L3S_L3C_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, L3S_L3C_FILE_PATTERN)
        day_time = fields['day_time']
        temporal_extent = fields['temporal_extent']
    elif L3U_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, L3U_FILE_PATTERN)
        day_time = None
        temporal_extent = None
    elif L3S_MULTISENSOR_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, L3S_MULTISENSOR_FILE_PATTERN)
        day_time = fields['day_time']
        temporal_extent = fields['temporal_extent']
        fields['product_type'] = '%sM' % fields['product_type']
    elif L3U_VIIRS_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, L3U_VIIRS_FILE_PATTERN)
        day_time = ''
        temporal_extent = None
        fields['sat_value'] = 'snpp'
    elif L3C_VIIRS_FILE_PATTERN.match(file_basename):
        fields = get_pattern_subgroups_from_string(file_basename, L3C_VIIRS_FILE_PATTERN)
        day_time = fields['day_time']
        temporal_extent = fields['temporal_extent']
        fields['sat_value'] = 'snpp'
    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" not matching regex to deduce dest_path".format(
                filename=os.path.basename(filepath)))

    prod_lev = fields['product_type']

    if day_time == 'night':
        day_time = 'ngt'

    date_nc = datetime.strptime(fields['nc_time_cov_start'], '%Y%m%d%H%M%S')

    sat_value = fields.get('sat_value', '')
    if sat_value.isdigit():
        sat_value = 'n%s' % sat_value

    if prod_lev != 'L3U':
        product_path = '%s-%s' % (prod_lev, temporal_extent)
    else:
        product_path = prod_lev

    if 'Southern' in filepath:
        if '-' in product_path:
            product_path = '%sS' % product_path
        else:
            product_path = '%s-%s' % (product_path, 'S')

    file_info = {
        'prod_level': prod_lev,
        'temporal_extent': temporal_extent,
        'day_time': day_time,
        'date_data': date_nc,
        'sat_value': sat_value,
        'product_path': product_path
    }
    return file_info
def dest_path(filepath):
    ljco_s3_base_dir = os.path.join('IMOS', 'SRS', 'OC', 'LJCO')
    netcdf_filename = os.path.basename(filepath)
    netcdf_filename = re.sub('_C-.*.nc$', '.nc', netcdf_filename)  # remove creation date

    # looking for product_name
    m = re.search(
        r'^IMOS_SRS-OC-LJCO_.*_([0-9]{8}T[0-9]{6}Z)_(SRC|LJCO)_FV0([0-2]{1}).*\.nc$',
        netcdf_filename)
    if m is None:
        raise InvalidFileNameError("file name not matching regex to deduce dest_path")

    # list of allowed product keywords
    products_type_ls = ['ACS', 'EcoTriplet', 'BB9', 'HyperOCR', 'WQM', 'DALEC']
    products_type = re.compile('|'.join(products_type_ls))
    nc_product_type = products_type.findall(netcdf_filename)

    # list of allowed time coverage keywords
    products_time_cov_ls = ['hourly', 'daily', 'monthly']
    products_time_cov = re.compile('|'.join(products_time_cov_ls))
    nc_product_time_cov = products_time_cov.findall(netcdf_filename)

    # netcdf qc value
    nc_product_qc = 'FV0%s' % m.group(3)

    nc_time_cov_start = datetime.strptime(m.group(1), '%Y%m%dT%H%M%SZ')
    nc_year = nc_time_cov_start.year
    nc_month = nc_time_cov_start.month
    nc_day = nc_time_cov_start.day

    if not nc_product_type:
        raise InvalidFileNameError(
            "can not find matching product type from allowed list: {product_type_ls}".format(
                product_type_ls=products_type_ls))

    if nc_product_type[0] == 'DALEC':
        product_dir = nc_product_type[0]
    else:
        # products other than DALEC need to have product type AND time coverage info
        if len(nc_product_time_cov) == 0:
            raise InvalidFileNameError(
                "can not find matching time coverage from allowed list: {products_time_cov_ls}".format(
                    products_time_cov_ls=products_time_cov_ls))
        else:
            product_dir = '%s-%s' % (nc_product_type[0], nc_product_time_cov[0])

    nc_common_dir_structure_prefix = os.path.join(ljco_s3_base_dir, product_dir, '%d' % nc_year)

    # DALEC doesn't have nc_product_time_cov keywords, so we run this section first
    if nc_product_type[0] == 'DALEC':
        if nc_product_qc == 'FV02':
            return os.path.join(nc_common_dir_structure_prefix, '%02d' % nc_month,
                                'fv02-products', netcdf_filename)
        else:
            return os.path.join(nc_common_dir_structure_prefix, '%02d' % nc_month, netcdf_filename)

    if nc_product_time_cov[0] == 'hourly':
        return os.path.join(nc_common_dir_structure_prefix, '%02d' % nc_month,
                            '%02d' % nc_day, netcdf_filename)

    if nc_product_time_cov[0] == 'daily':
        return os.path.join(nc_common_dir_structure_prefix, netcdf_filename)
def netcdf_writer(log_path, output_dir, ship_name, meta_path=[]):
    if meta_path != []:
        with open(meta_path, 'r') as f:
            # remove comments
            meta_data = json.loads('\n'.join(
                [row for row in f.readlines() if len(row.split('#')) == 1]))

        for ii in range(len(meta_data['calibration'])):
            if meta_data['calibration'][ii]['item'] == 'EFLO':
                calibration_flo_a0 = float(meta_data['calibration'][ii]['a0'])
                calibration_flo_a1 = float(meta_data['calibration'][ii]['a1'])
            if meta_data['calibration'][ii]['item'] == 'ESAL':
                calibration_sal_a0 = float(meta_data['calibration'][ii]['a0'])
                calibration_sal_a1 = float(meta_data['calibration'][ii]['a1'])
            if meta_data['calibration'][ii]['item'] == 'ETMP':
                calibration_tmp_a0 = float(meta_data['calibration'][ii]['a0'])
                calibration_tmp_a1 = float(meta_data['calibration'][ii]['a1'])
            if meta_data['calibration'][ii]['item'] == 'ETURB':
                calibration_turb_a0 = float(meta_data['calibration'][ii]['a0'])
                calibration_turb_a1 = float(meta_data['calibration'][ii]['a1'])

    df = parse_log_file(log_path)
    df = transform_count_to_real_val(df)

    log_filename = os.path.basename(log_path)
    fields = get_pattern_subgroups_from_string(log_filename, SOOP_NRT_LOG_PATTERN)
    product_code = fields['product_code']

    if product_code in ['D2M', 'M2D', 'S2M', 'M2S']:
        product_type = "transect"
        feature_type = "trajectory"
        template = DatasetTemplate.from_json(NC_JSON_TEMPLATE_TRAJECTORY)
    elif product_code in ['DEV', 'MEL', 'SYD']:
        product_type = "mooring"
        feature_type = "timeSeries"
        template = DatasetTemplate.from_json(NC_JSON_TEMPLATE_MOORING)
    else:
        raise InvalidFileNameError(
            "SOOP NRT input logfile has incorrect product_code '{product_code}'. Not belonging to any of "
            "('D2M', 'M2D', 'S2M', 'M2S', 'DEV', 'MEL', 'SYD').".format(product_code=product_code))

    template.global_attributes.update({'product_type': product_type})

    time_val_dateobj = date2num(df.index.to_pydatetime(),
                                template.variables['TIME']['units'],
                                template.variables['TIME']['calendar'])

    # replace all NaN with the _FillValue from the template
    df.replace(np.nan, template.variables['LATITUDE']['_FillValue'], inplace=True)

    template.variables['TIME']['_data'] = time_val_dateobj
    template.variables['LATITUDE']['_data'] = df.LATITUDE.values
    template.variables['LONGITUDE']['_data'] = df.LONGITUDE.values
    template.variables['TEMP']['_data'] = df.TEMP.values
    template.variables['PSAL']['_data'] = df.PSAL.values
    template.variables['TURB']['_data'] = df.TURB.values
    template.variables['CPHL']['_data'] = df.CPHL.values

    calibration_comment = 'Value=a0 + a1 x raw_value'
    if 'calibration_tmp_a0' in locals() and 'calibration_tmp_a1' in locals():
        template.variables['TEMP']['a0'] = calibration_tmp_a0
        template.variables['TEMP']['a1'] = calibration_tmp_a1
        template.variables['TEMP']['calibration_comment'] = calibration_comment

    if 'calibration_sal_a0' in locals() and 'calibration_sal_a1' in locals():
        template.variables['PSAL']['a0'] = calibration_sal_a0
        template.variables['PSAL']['a1'] = calibration_sal_a1
        template.variables['PSAL']['calibration_comment'] = calibration_comment

    if 'calibration_turb_a0' in locals() and 'calibration_turb_a1' in locals():
        template.variables['TURB']['a0'] = calibration_turb_a0
        template.variables['TURB']['a1'] = calibration_turb_a1
        template.variables['TURB']['calibration_comment'] = calibration_comment

    if 'calibration_flo_a0' in locals() and 'calibration_flo_a1' in locals():
        template.variables['CPHL']['a0'] = calibration_flo_a0
        template.variables['CPHL']['a1'] = calibration_flo_a1
        template.variables['CPHL']['calibration_comment'] = calibration_comment

    measurement_frequency = get_measurement_frequency(df)
    if measurement_frequency == 1:
        measurement_frequency_str = '1sec'
    elif measurement_frequency == 10:
        measurement_frequency_str = '10secs'

    template.global_attributes.update({
        'time_coverage_start': df.index.strftime('%Y-%m-%dT%H:%M:%SZ')[0],
        'time_coverage_end': df.index.strftime('%Y-%m-%dT%H:%M:%SZ')[-1],
        'featureType': feature_type,
        'date_created': datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
        'platform_code': SHIP_CODE,
        'vessel_name': ship_name,
        'geospatial_lat_min': df.LATITUDE.dropna().min(),
        'geospatial_lat_max': df.LATITUDE.dropna().max(),
        'geospatial_lon_min': df.LONGITUDE.dropna().min(),
        'geospatial_lon_max': df.LONGITUDE.dropna().max(),
        'measurement_frequency': measurement_frequency_str,
        'history': "File created {date_created}".format(
            date_created=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))
    })

    if measurement_frequency == 1:
        template.variables['CPHL'].update({
            'calibration_blank': CHLU_PARAMS['blank'],
            'calibration_scale': CHLU_PARAMS['scale']
        })
        template.variables['TURB'].update({
            'calibration_blank': TURB_PARAMS['blank'],
            'calibration_scale': TURB_PARAMS['scale']
        })

    nc_filename = 'IMOS_SOOP-TMV_TSUB_{time_start}_{vessel_code}_FV0{product_number}_{product_type}-{product_code}_END-{time_end}.nc'.format(
        time_start=df.index.strftime('%Y%m%dT%H%M%SZ')[0],
        time_end=df.index.strftime('%Y%m%dT%H%M%SZ')[-1],
        vessel_code=SHIP_CODE,
        product_number=0,
        product_type=product_type,
        product_code=product_code)

    netcdf_path = os.path.join(output_dir, nc_filename)
    template.to_netcdf(netcdf_path)
    return netcdf_path
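# Minimal illustration of the 'Value = a0 + a1 x raw_value' calibration comment written above;
# the coefficients and raw counts here are made-up example numbers.
import numpy as np

_example_a0, _example_a1 = 0.05, 0.002
_example_raw_counts = np.array([100, 250, 400])
_example_calibrated = _example_a0 + _example_a1 * _example_raw_counts
# _example_calibrated -> array([0.25, 0.55, 0.85])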
def preprocess(self):
    if self.custom_params is not None and self.custom_params.get('ship_callsign_ls'):
        self.ship_callsign_ls = self.custom_params['ship_callsign_ls']
    else:
        self.ship_callsign_ls = ship_callsign_list()

    if SHIP_CODE not in self.ship_callsign_ls:
        raise RuntimeError(
            "Missing vessel callsign {callsign} from vocabulary.".format(callsign=SHIP_CODE))

    self.soop_tmv_dir = os.path.join(
        'IMOS', 'SOOP', 'SOOP-TMV',
        '{ship_code}_{ship_name}'.format(ship_code=SHIP_CODE,
                                         ship_name=self.ship_callsign_ls[SHIP_CODE]),
        'realtime')

    txt_files = self.file_collection.filter_by_attribute_value('extension', '.txt')
    log_files = self.file_collection.filter_by_attribute_value('extension', '.log')
    nc_files = self.file_collection.filter_by_attribute_id('file_type', FileType.NETCDF)

    """
    * 10secs zip files (*.log + *.txt [calibration]) -> *.zip is pushed to ARCHIVE_DIR (the NetCDF
      still needs to be generated to deduce its path). *.log, *.txt and *.nc NOT added to the collection
    * 1sec zip files (*.log only) -> *.log & *.nc pushed to S3. *.zip not added to the collection
    """
    if len(nc_files):
        # case where we re-push an existing NetCDF file
        f_nc = nc_files[0]
        f_nc.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
    elif len(log_files):
        f_log = log_files[0]
        log_filename = os.path.basename(f_log.src_path)

        if SOOP_NRT_LOG_PATTERN.match(log_filename) is None:
            raise InvalidFileNameError(
                "SOOP TMV NRT input logfile has incorrect naming '{name}'.".format(name=log_filename))

        # case to create a NetCDF file from the log file
        f_txt = None
        if len(txt_files):
            f_txt = txt_files[0]
            netcdf_filepath = netcdf_writer(f_log.src_path, self.temp_dir,
                                            self.ship_callsign_ls[SHIP_CODE],
                                            meta_path=f_txt.src_path)
        else:
            netcdf_filepath = netcdf_writer(f_log.src_path, self.temp_dir,
                                            self.ship_callsign_ls[SHIP_CODE])

        # the path of logs and zips has to be deduced within the pre-process step, as it needs the
        # NetCDF to be created first to get the correct info
        with Dataset(netcdf_filepath) as nc_open:
            measurement_frequency = nc_open.measurement_frequency
            product_type = nc_open.product_type
            year = datetime.strptime(nc_open.time_coverage_start,
                                     '%Y-%m-%dT%H:%M:%SZ').strftime("%Y")

        pre_path = os.path.join(self.soop_tmv_dir, product_type, measurement_frequency, year)

        if measurement_frequency == "1sec":
            f_log.publish_type = PipelineFilePublishType.UPLOAD_ONLY
            f_log.dest_path = os.path.join(pre_path, 'logs', f_log.name)

            nc_file = PipelineFile(netcdf_filepath)
            nc_file.publish_type = PipelineFilePublishType.HARVEST_UPLOAD
            self.file_collection.add(nc_file)
        elif measurement_frequency == "10secs":
            if self.input_file.endswith('zip'):
                self.input_file_object.publish_type = PipelineFilePublishType.ARCHIVE_ONLY
                self.input_file_object.archive_path = os.path.join(pre_path, 'logs',
                                                                   self.input_file_object.name)
                self.file_collection.add(self.input_file_object)

                f_log.publish_type = PipelineFilePublishType.NO_ACTION
                if f_txt:
                    f_txt.publish_type = PipelineFilePublishType.NO_ACTION
            else:
                # case when a 10secs log file (and not a zip) is pushed to incoming
                f_log.publish_type = PipelineFilePublishType.ARCHIVE_ONLY
                f_log.archive_path = os.path.join(pre_path, 'logs', f_log.name)
def preprocess(self):
    # if the input file is a NetCDF, create a .nc.gz and harvest/upload it.
    # historically, files were always sent as *.nc.gz. But as of April 2021, files might be pushed
    # as *.nc. To be consistent, we transform this .nc into a .nc.gz
    if self.file_type is FileType.NETCDF:
        self.file_collection.set_publish_types(PipelineFilePublishType.NO_ACTION)

        gzip_path = os.path.join(self.temp_dir, self.file_basename + '.gz')
        with open(self.input_file, 'rb') as f_in, gzip.open(gzip_path, 'wb') as gz_out:
            gz_out.writelines(f_in)

        # publish
        self.add_to_collection(gzip_path, publish_type=PipelineFilePublishType.HARVEST_UPLOAD)

    if self.file_type is FileType.GZIP:
        # add nc_gz file to collection (not added by default)
        self.file_collection.add(self.input_file_object)

    netcdf_file_gz_collection = self.file_collection.filter_by_attribute_id('file_type', FileType.GZIP)
    netcdf_file_gz = netcdf_file_gz_collection[0]
    netcdf_file_gz.publish_type = PipelineFilePublishType.HARVEST_UPLOAD  # default

    # some GSLA files are gzipped, so gunzip them before checking them
    # if the uploaded file is a GZIP, check that the GZIP contains a NetCDF
    netcdf_collection = self.file_collection.filter_by_attribute_id('file_type', FileType.NETCDF)
    if len(netcdf_collection) != 1:
        raise InvalidInputFileError(
            "Expecting one netCDF file in GZIP archive '{gzip}'".format(
                gzip=os.path.basename(self.input_file)))

    netcdf_file_gz = self.file_collection.filter_by_attribute_id('file_type', FileType.GZIP)[0]
    netcdf_file = self.file_collection.filter_by_attribute_id('file_type', FileType.NETCDF)[0]

    # setting the path of the gz file with the gunzipped file
    netcdf_file_gz.dest_path = self.dest_path(netcdf_file.src_path)
    # Nothing to do with *.nc. Talend can harvest *.nc.gz. Set to NO_ACTION
    netcdf_file.publish_type = PipelineFilePublishType.NO_ACTION

    # we don't know the product type (DM00 or DM01) of the file already on s3 in order to deduce
    # its path. We need to get the product type from the file in incoming
    result_previous_version_creation_date = self.get_previous_version_creation_date(netcdf_file.src_path)

    """
    default values
    by default we push to the storage the file landed in the pipeline (ie *.nc.gz)
    """
    push_new_file = True
    remove_previous_version = False

    # compare creation dates with the file already on storage
    if result_previous_version_creation_date:
        new_file_creation_date = get_creation_date(netcdf_file.name)
        if result_previous_version_creation_date > new_file_creation_date:
            push_new_file = False
        elif result_previous_version_creation_date == new_file_creation_date:
            push_new_file = True
        else:
            remove_previous_version = True
            previous_file_path = self.get_previous_version_object(netcdf_file.src_path)

    if push_new_file:
        if GSLA_REGEX_YEARLY.match(netcdf_file.name):
            # yearly file should never be harvested
            netcdf_file_gz.publish_type = PipelineFilePublishType.UPLOAD_ONLY
    else:
        raise InvalidFileNameError(
            "file name: \"{filename}\" creation date is older than file already on "
            "storage".format(filename=netcdf_file_gz.name))

    # deletion of the previous file
    if remove_previous_version:
        previous_file_name = os.path.basename(previous_file_path)
        file_to_delete = PipelineFile(previous_file_name,
                                      is_deletion=True,
                                      dest_path=previous_file_path,
                                      file_update_callback=self._file_update_callback)

        if GSLA_REGEX_YEARLY.match(netcdf_file.name):
            file_to_delete.publish_type = PipelineFilePublishType.DELETE_ONLY
        else:
            file_to_delete.publish_type = PipelineFilePublishType.DELETE_UNHARVEST

        self.file_collection.add(file_to_delete)