def test_pb_info_no_subdir():
    # Verify that the prepbufr file information is correctly
    # curated into a list of named tuples.  Testing on data that
    # is not separated into ymd dated subdirs.  Test against the
    # 20170601 data file: prepbufr.gdas.2017060100
    pb = pb2nc_wrapper()

    # Make sure we are dealing with the GDAS data
    pb.pb_dict['PREPBUFR_FILE_REGEX'] = 'prepbufr.gdas.(2[0-9]{9})'
    pb.pb_dict['NC_FILE_TMPL'] = 'prepbufr.gdas.{valid?fmt=%Y%m%d%H}.nc'

    num_expected_files = 117
    data_dir = '/d1/METplus_Mallory/data/prepbufr/gdas'
    file_regex = 'prepbufr.gdas.2[0-9]{9}'
    logger = logging.getLogger("test_log")
    pb_files = util.get_files(data_dir, file_regex, logger)
    time_method = 'valid'
    test_file = os.path.join(data_dir, 'prepbufr.gdas.2017060100')

    pb_info_list = []
    for pb_file in pb_files:
        pb_info = pb.retrieve_pb_time_info(pb_file, time_method)
        pb_info_list.append(pb_info)

    actual_full_filepaths = []
    if len(pb_info_list) != num_expected_files:
        # Fail, number of files is not what was expected
        assert True is False

    for pb_info in pb_info_list:
        actual_full_filepaths.append(pb_info.full_filepath)

    if test_file not in actual_full_filepaths:
        # Fail, expected file not found
        assert True is False
def test_pb_info_with_subdir():
    # Verify that the prepbufr file information is correctly
    # curated into a list of named tuples.  Testing on data that
    # is separated into ymd dated subdirs.  Perform test on only
    # one subdirectory's worth of data.
    pytest.skip('Function no longer used')

    # Make sure we are dealing with the NAM data
    pb = pb2nc_wrapper()
    pb.c_dict['PREPBUFR_FILE_REGEX'] = \
        'nam.t([0-9]{2})z.prepbufr.tm([0-9]{2})'
    pb.c_dict['NC_FILE_TMPL'] = \
        'prepbufr.{valid?fmt=%Y%m%d%H}.t{cycle?fmt=%HH}z.nc'
    expected_file_subdir = \
        '/d1/METplus_Mallory/data/prepbufr/nam/nam.20170615'
    expected_files = ['nam.t00z.prepbufr.tm00', 'nam.t00z.prepbufr.tm03',
                      'nam.t06z.prepbufr.tm00', 'nam.t06z.prepbufr.tm03',
                      'nam.t12z.prepbufr.tm00', 'nam.t12z.prepbufr.tm03',
                      'nam.t18z.prepbufr.tm00', 'nam.t18z.prepbufr.tm03']
    # expected_ymd = '20170615'
    num_expected_files = 8
    expected_full_filepaths = []
    for expected_file in expected_files:
        expected_full_filepaths.append(
            os.path.join(expected_file_subdir, expected_file))

    # Get the ymd of the first subdirectory
    subdir = '/d1/METplus_Mallory/data/prepbufr/nam/nam.20170615'
    ymd_match = re.match(r'.*(2[0-9]{7}).*', subdir)
    ymd = ymd_match.group(1)

    file_regex = 'nam.t([0-9]{2})z.prepbufr.tm([0-9]{2})'
    logger = logging.getLogger("temp_log")
    pb_files = util.get_files(subdir, file_regex, logger)
    time_method = 'valid'

    all_pb_info = []
    for pb_file in pb_files:
        pb_info = pb.retrieve_pb_time_info(pb_file, time_method, ymd)
        all_pb_info.append(pb_info)

    if len(all_pb_info) != num_expected_files:
        # Fail, there should be one entry for each file
        assert True is False

    for expected_full_filepath in expected_full_filepaths:
        if expected_full_filepath not in pb_files:
            # Fail, expected file not found
            assert True is False
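
# Illustrative sketch only (not the wrapper's implementation): the tests above
# assume that retrieve_pb_time_info() can derive a valid time from a filename
# such as prepbufr.gdas.2017060100 using the PREPBUFR_FILE_REGEX capture group.
# The hypothetical helper below shows one way that extraction could work,
# assuming the captured group is a YYYYMMDDHH string.
def _example_valid_time_from_gdas_name(filename):
    """Return a datetime parsed from a prepbufr.gdas.YYYYMMDDHH filename,
       or None if the name does not match.  Hypothetical helper for clarity."""
    import re
    from datetime import datetime
    match = re.match(r'.*prepbufr\.gdas\.(2[0-9]{9})$', filename)
    if not match:
        return None
    return datetime.strptime(match.group(1), '%Y%m%d%H')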
def run_at_time(self, input_dict):
    """!Get TC-pairs data then regrid tiles centered on the storm.

       Get TC-pairs track data and GFS model data, do any necessary
       processing then regrid the forecast and analysis files to a
       30 x 30 degree tile centered on the storm.

       Args:
           input_dict:  Time dictionary

       Returns:
           None:  invokes regrid_data_plane to create a netCDF file
                  from two extratropical storm track files.
    """
    time_info = time_util.ti_calculate(input_dict)
    init_time = time_info['init_fmt']

    # Get the process id to be used to identify the output
    # amongst different users and runs.
    cur_pid = str(os.getpid())
    tmp_dir = os.path.join(self.config.getdir('TMP_DIR'), cur_pid)
    self.logger.info("Begin extract tiles")

    cur_init = init_time[0:8] + "_" + init_time[8:10]

    # Check that there are tc_pairs data which are used as input
    if util.is_dir_empty(self.tc_pairs_dir):
        self.logger.error("No tc pairs data found at {}"
                          .format(self.tc_pairs_dir))
        sys.exit(1)

    # Create the name of the filter file we need to find.  If
    # the file doesn't exist, then run TC_STAT.
    filter_filename = "filter_" + cur_init + ".tcst"
    filter_name = os.path.join(self.filtered_out_dir, cur_init,
                               filter_filename)

    if util.file_exists(filter_name) and not self.overwrite_flag:
        self.logger.debug("Filter file exists, using Track data file: {}"
                          .format(filter_name))
    else:
        # Create the storm track by applying the filter options
        # defined in the config/param file.  Use TcStatWrapper to
        # build up the tc_stat command and invoke the MET tool
        # tc_stat to perform the filtering.
        tiles_list = util.get_files(self.tc_pairs_dir, ".*tcst",
                                    self.logger)
        tiles_list_str = ' '.join(tiles_list)

        tcs = TcStatWrapper(self.config, self.logger)
        tcs.build_tc_stat(self.filtered_out_dir, cur_init,
                          tiles_list_str, self.addl_filter_opts)

        # Remove any empty files and directories that can occur
        # from filtering.
        util.prune_empty(filter_name, self.logger)

    # Now get unique storm ids from the filter file,
    # filter_yyyymmdd_hh.tcst
    sorted_storm_ids = util.get_storm_ids(filter_name, self.logger)

    # Check for empty sorted_storm_ids, if empty,
    # continue to the next time.
    if not sorted_storm_ids:
        # No storms found for init time, cur_init
        msg = "No storms were found for {} ...continue to next in list"\
              .format(cur_init)
        self.logger.debug(msg)
        return

    # Process each storm in the sorted_storm_ids list.
    # Iterate over each filter file in the output directory and
    # search for the presence of the storm id.  Store the
    # corresponding rows of data in a temporary file in the
    # /tmp/<pid> directory.
    for cur_storm in sorted_storm_ids:
        storm_output_dir = os.path.join(self.filtered_out_dir,
                                        cur_init, cur_storm)
        header = open(filter_name, "r").readline()
        util.mkdir_p(storm_output_dir)
        util.mkdir_p(tmp_dir)
        tmp_filename = "filter_" + cur_init + "_" + cur_storm
        full_tmp_filename = os.path.join(tmp_dir, tmp_filename)

        storm_match_list = util.grep(cur_storm, filter_name)
        with open(full_tmp_filename, "a+") as tmp_file:
            # Copy over header information
            tmp_file.write(header)
            for storm_match in storm_match_list:
                tmp_file.write(storm_match)

        # Perform regridding of the forecast and analysis files
        # to an n X n degree tile centered on the storm (dimensions
        # are indicated in the config/param file).
        feature_util.retrieve_and_regrid(full_tmp_filename, cur_init,
                                         cur_storm, self.filtered_out_dir,
                                         self.config)

    # end of for cur_storm

    # Remove any empty files and directories in the extract_tiles output
    # directory.
    util.prune_empty(self.filtered_out_dir, self.logger)

    # Clean up the tmp directory if it exists
    if os.path.isdir(tmp_dir):
        util.rmtree(tmp_dir)
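
# Minimal sketch (assumed behavior, not the util.grep implementation): the loop
# above copies the .tcst header line plus every row containing the current
# storm id into a per-storm temporary file.  An equivalent standalone step
# could look like the hypothetical helper below.
def _example_write_storm_subset(filter_name, storm_id, out_path):
    """Write the header plus all lines mentioning storm_id to out_path.
       Hypothetical helper shown only to clarify the filtering step."""
    with open(filter_name, "r") as src, open(out_path, "w") as dest:
        header = src.readline()
        dest.write(header)
        for line in src:
            if storm_id in line:
                dest.write(line)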
def get_pb_files_by_time(self):
    """! Identify the prepbufr files that are within the specified time
         window and the specified time intervals between initialization
         times.

         Args:

         Returns:
             files_of_interest:  A list of the full filepaths
                                 corresponding to the files of interest
                                 (i.e. those files that are within the
                                 specified start and end times, and the
                                 appropriate interval times).
    """
    # pylint:disable=protected-access
    # Need to call sys._getframe() to get the filename and method/func
    # for logging information.

    # Used for logging.
    cur_filename = sys._getframe().f_code.co_filename
    cur_function = sys._getframe().f_code.co_name
    self.logger.info("INFO|:" + cur_function + '|' + cur_filename + '| ' +
                     "Filtering prepbufr files based on time (init or "
                     "valid).")

    # Create a list of times that are expected, based on
    # the begin, end, and interval date/times.
    dates_needed = []
    initial_start_date = \
        self.convert_date_strings_to_unix_times(self.pb_dict['START_DATE'])
    current_start_date = initial_start_date
    interval_time_str = self.pb_dict['INTERVAL_TIME']
    interval_time = int(interval_time_str) * self.HOURS_TO_SECONDS
    start_date_unix = self.convert_date_strings_to_unix_times(
        self.pb_dict['START_DATE'])
    end_date_unix = self.convert_date_strings_to_unix_times(
        self.pb_dict['END_DATE'])
    while start_date_unix <= current_start_date <= end_date_unix:
        dates_needed.append(current_start_date)
        current_start_date = current_start_date + interval_time

    # Iterate through the input prepbufr directories to determine the
    # init or valid times.

    # Get a list of all the sub-directories and files under the
    # PREPBUFR_DATA_DIR/PREPBUFR_MODEL_DIR_NAME dir.
    dir_to_search = os.path.join(self.pb_dict['PREPBUFR_DATA_DIR'],
                                 self.pb_dict['PREPBUFR_MODEL_DIR_NAME'])
    pb_subdirs_list = util.get_dirs(dir_to_search)

    # Determine whether times are to be retrieved based on init times
    # (ymd + cycle hour) or valid times (ymd + (cycle hour - offset)).
    # Initialize the time flag.
    if self.pb_dict['TIME_METHOD'].lower() == 'by_init':
        time_flag = 'init'
    elif self.pb_dict['TIME_METHOD'].lower() == 'by_valid':
        time_flag = 'valid'
    else:
        # Unsupported time method
        self.logger.error('ERROR|:' + cur_function + '|' + cur_filename +
                          ' Unrecognized time method, only BY_INIT or ' +
                          'BY_VALID are supported. Check the ' +
                          'TIME_METHOD setting in your configuration ' +
                          'file.')
        sys.exit(1)

    # Some prepbufr files are organized into YMD subdirectories with
    # the cycle and offset (fhr) times incorporated into their filenames.
    #
    # There are also prepbufr files that are not separated by YMD and
    # instead have the YMDh incorporated into their filenames.  Provide
    # support for both cases.
    files_within_time_criteria = []
    if pb_subdirs_list:
        for pb_subdir in pb_subdirs_list:
            # Retrieve the YMD from the subdirectory name
            dir_regex = self.pb_dict['PREPBUFR_DIR_REGEX']
            regex_search = re.compile(dir_regex)
            match = re.match(regex_search, pb_subdir)
            if match:
                ymd = match.group(1)
                regex_file = self.pb_dict['PREPBUFR_FILE_REGEX']
                pb_files_list = util.get_files(pb_subdir, regex_file,
                                               self.logger)
                if pb_files_list:
                    # Calculate the init or valid time for each file, then
                    # determine if this is found in the list of times of
                    # interest, dates_needed[].
                    for pb_file in pb_files_list:
                        # Create the time information for this file, based
                        # on init time or valid time, as indicated by the
                        # time_flag.
                        pb_file_time_info = \
                            self.retrieve_pb_time_info(pb_file,
                                                       time_flag,
                                                       ymd)
                        if pb_file_time_info.pb_unix_time in dates_needed:
                            files_within_time_criteria.append(
                                pb_file_time_info.full_filepath)
                else:
                    # No files in subdirectory, continue to next
                    # subdirectory.
                    self.logger.info('INFO:|' + cur_function + '|' +
                                     cur_filename + ' No files found in '
                                     'current subdirectory: ' + pb_subdir +
                                     ' continue checking next available '
                                     'subdirectory for files.')
                    continue
    else:
        # No subdirectories, only files; get the list of files to process.
        # These files will have YMD incorporated in the filename.
        files_within_time_criteria = []
        pb_files_list = util.get_files(dir_to_search,
                                       self.pb_dict['PREPBUFR_FILE_REGEX'],
                                       self.logger)
        if not pb_files_list:
            self.logger.error('ERROR:|' + cur_function + '|' +
                              cur_filename + ' No files were found. '
                              'Check the path to the prepbufr data '
                              'directory in your configuration file.')
            sys.exit(1)
        else:
            for pb_file in pb_files_list:
                pb_file_time_info = \
                    self.retrieve_pb_time_info(pb_file, time_flag)
                if pb_file_time_info.pb_unix_time in dates_needed:
                    files_within_time_criteria.append(
                        pb_file_time_info.full_filepath)

    return files_within_time_criteria
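
# Illustrative sketch only (convert_date_strings_to_unix_times() is defined
# elsewhere in this class): the while-loop above needs START_DATE, END_DATE,
# and INTERVAL_TIME expressed as seconds since the epoch.  Assuming the dates
# are YYYYMMDDHH strings, an equivalent standalone windowing step could be:
def _example_dates_in_window(start_str, end_str, interval_hours):
    """Return epoch times from start_str to end_str, inclusive, stepping by
       interval_hours.  Hypothetical helper shown only for clarity."""
    import calendar
    from datetime import datetime
    start = calendar.timegm(datetime.strptime(start_str, '%Y%m%d%H').timetuple())
    end = calendar.timegm(datetime.strptime(end_str, '%Y%m%d%H').timetuple())
    step = int(interval_hours) * 3600
    return list(range(start, end + 1, step))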
def retrieve_data(self):
    """! Retrieve data from track files and return the min and max lon.

         Returns:
             None
    """
    self.logger.debug("Begin retrieving data...")
    all_tracks_list = []

    # Store the data in the track list.
    if os.path.isdir(self.input_data):
        self.logger.debug("Generate plot for all files in the directory " +
                          self.input_data)
        # Get the list of all files (full file path) in this directory
        all_init_files = util.get_files(self.input_data, ".*.tcst",
                                        self.logger)

        for init_file in all_init_files:
            # Ignore empty files
            if os.stat(init_file).st_size == 0:
                self.logger.info("Empty file")
                continue

            # logger.info("Consider all files under directory" +
            #             init_file + " with " + " init time (ymd): " +
            #             self.init_date + " and lead time (hh):" +
            #             self.lead_hr)
            with open(init_file, 'r') as infile:
                self.logger.debug("Parsing file " + init_file)

                # Extract information from the header, which is
                # the first line.
                header = infile.readline()
                # print("header: {}".format(header))
                column_indices = self.get_columns_and_indices(header)

                # For the remaining lines of this file, retrieve
                # information from each row: lon, lat, init time,
                # lead hour, valid time, model name, mslp, and basin.
                # NOTE: Some of these aren't used until we fully
                # emulate Guang Ping's plots.
                for line in infile:
                    track_dict = {}
                    col = line.split()
                    lat = col[column_indices['ALAT']]
                    lon = col[column_indices['ALON']]
                    init_time = col[column_indices['INIT']]
                    fcst_lead_hh = \
                        str(col[column_indices['LEAD']]).zfill(3)
                    model_name = col[column_indices['AMODEL']]
                    valid_time = col[column_indices['VALID']]
                    storm_id = col[column_indices['STORM_ID']]
                    # Not needed until support for regional plots
                    # is implemented.
                    # mslp = col[column_indices['AMSLP']]
                    # basin = col[column_indices['BASIN']]

                    # Check for NA values in lon and lat, skip to
                    # next line in file if 'NA' is encountered.
                    if lon == 'NA' or lat == 'NA':
                        continue
                    else:
                        track_dict['lon'] = float(lon)
                        track_dict['lat'] = float(lat)

                    # If the lead hour is 'NA', skip to the next line.
                    # The track data was very likely generated with
                    # TC-Stat's match-pairs option set to True.
                    lead_hr = self.extract_lead_hr(fcst_lead_hh)
                    if lead_hr == 'NA':
                        continue

                    # Check that the init date, init hour,
                    # and model name are what the user requested.
                    init_ymd, init_hh = \
                        self.extract_date_and_time_from_init(init_time)

                    if init_ymd == self.init_date and \
                            init_hh == self.init_hr:
                        if model_name == self.model:
                            # The model matches the requested model
                            # name, so we have all the necessary
                            # information to continue.
                            # Store all data in the dictionary.
                            track_dict['fcst_lead_hh'] = fcst_lead_hh
                            track_dict['init_time'] = init_time
                            track_dict['model_name'] = model_name
                            track_dict['valid_time'] = valid_time
                            track_dict['storm_id'] = storm_id

                            # Identify the 'first' point of the storm
                            # track.  If the storm id is novel, then
                            # retrieve the date and hh from the valid
                            # time.
                            if storm_id in self.unique_storm_id:
                                track_dict['first_point'] = False
                                track_dict['valid_dd'] = ''
                                track_dict['valid_hh'] = ''
                            else:
                                self.unique_storm_id.add(storm_id)
                                # Since this is the first storm_id with
                                # a valid value for lat and lon (ie not
                                # 'NA'), this is the first track point
                                # in the storm track and will be
                                # labelled with the corresponding
                                # date/hh z on the plot.
                                valid_match = \
                                    re.match(r'[0-9]{6}([0-9]{2})_' +
                                             '([0-9]{2})[0-9]{4}',
                                             track_dict['valid_time'])
                                if valid_match:
                                    valid_dd = valid_match.group(1)
                                    valid_hh = valid_match.group(2)
                                else:
                                    # Shouldn't get here if this is
                                    # the first point of the track.
                                    valid_dd = ''
                                    valid_hh = ''
                                track_dict['first_point'] = True
                                track_dict['valid_dd'] = valid_dd
                                track_dict['valid_hh'] = valid_hh

                            # Identify points based on valid time (hh).
                            # Useful for plotting later on.
                            valid_match = \
                                re.match(r'[0-9]{8}_([0-9]{2})[0-9]{4}',
                                         track_dict['valid_time'])
                            if valid_match:
                                # Since we are only interested in 00,
                                # 06, 12, and 18 hr times...
                                valid_hh = valid_match.group(1)
                                if valid_hh == '00' or valid_hh == '12':
                                    track_dict['lead_group'] = '0'
                                elif valid_hh == '06' or valid_hh == '18':
                                    track_dict['lead_group'] = '6'
                                else:
                                    # To gracefully handle any hours
                                    # other than 0, 6, 12, or 18
                                    track_dict['lead_group'] = ''

                            all_tracks_list.append(track_dict)

                            # For future work, support for MSLP when
                            # generating regional plots-
                            # implementation goes here...

                            # Finishing up, do any cleaning up,
                            # logging, etc.
                            self.logger.info("INFO: All criteria met, " +
                                             "saving track data init " +
                                             track_dict['init_time'] +
                                             " lead " +
                                             track_dict['fcst_lead_hh'] +
                                             " lon " +
                                             str(track_dict['lon']) +
                                             " lat " +
                                             str(track_dict['lat']))
                        else:
                            # Not the requested model, move to next
                            # row of data
                            continue
                    else:
                        # Not the requested init ymd, move to next
                        # row of data
                        continue

        # Now separate the data based on storm id.
        for cur_unique in self.unique_storm_id:
            cur_storm_list = []
            for cur_line in all_tracks_list:
                if cur_line['storm_id'] == cur_unique:
                    cur_storm_list.append(cur_line)
                else:
                    # Continue to next line in all_tracks_list
                    continue

            # Create the storm_id_dict, which is the data
            # structure used to separate the storm data based on
            # storm id.
            self.storm_id_dict[cur_unique] = cur_storm_list

    else:
        self.logger.error("Input directory expected, check "
                          "configuration file and try again.")
        sys.exit(1)
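
# Small illustrative sketch (assumes the .tcst VALID column uses the
# YYYYMMDD_HHMMSS form implied by the regexes above): the re.match() calls in
# retrieve_data() pull the day-of-month and hour out of a valid-time string.
def _example_parse_valid_time(valid_time):
    """Return (valid_dd, valid_hh) from a 'YYYYMMDD_HHMMSS' string, or
       ('', '') if it does not match.  Hypothetical helper for clarity."""
    import re
    match = re.match(r'[0-9]{6}([0-9]{2})_([0-9]{2})[0-9]{4}', valid_time)
    if match:
        return match.group(1), match.group(2)
    return '', ''

# e.g. _example_parse_valid_time('20170615_061500') -> ('15', '06')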
def run_all_times(self):
    """! Builds the command for invoking the tcmpr.R plot script.

         Args:

         Returns:
    """
    base_cmds_list = [' Rscript ', self.tcmpr_script, ' -lookin ']
    base_cmds = ''.join(base_cmds_list)
    self.logger.debug("base_cmds " + base_cmds)
    cmds_list = []

    self.logger.debug("DEBUG: TCMPR input " + self.input_data)
    self.logger.debug("DEBUG: TCMPR config file " + self.plot_config_file)
    self.logger.debug("DEBUG: output " + self.output_base_dir)

    # Create a list of all the "optional" options and flags.
    optionals_list = self.retrieve_optionals()

    # Create the output base directory
    util.mkdir_p(self.output_base_dir)

    # If input data is a file, create a single command and invoke R script.
    if os.path.isfile(self.input_data):
        self.logger.debug("Currently plotting " + self.input_data)
        cmds_list.append(base_cmds)
        cmds_list.append(self.input_data)

        # Special treatment of the "optional" output_base_dir option
        # because we are supporting the plotting of multiple tcst files
        # in a directory.
        if self.output_base_dir:
            # dated_output_dir = self.create_output_subdir(self.input_data)
            optionals_list.append(' -outdir ')
            # optionals_list.append(dated_output_dir)
            optionals_list.append(self.output_base_dir)

        optionals = ''.join(optionals_list)
        if optionals:
            cmds_list.append(optionals)

        # Due to the way cmds_list was created, join it all into
        # one string and then split that into a list, so element [0]
        # is 'Rscript', instead of 'Rscript self.tcmpr_script -lookin'.
        cmds_list = ''.join(cmds_list).split()
        # cmd = batchexe('sh')['-c', ''.join(cmds_list)] > '/dev/null'
        cmd = batchexe(cmds_list[0])[cmds_list[1:]] > '/dev/null'
        self.logger.debug("DEBUG: Command run " + cmd.to_shell())
        self.logger.info("INFO: Generating requested plots for " +
                         self.input_data)

        # pylint:disable=unnecessary-pass
        # If a tc file is empty, continue to the next, thus the pass
        # isn't unnecessary.
        try:
            checkrun(cmd)
        except produtil.run.ExitStatusException as ese:
            self.logger.warn("WARN: plot_tcmpr.R returned non-zero"
                             " exit status, tcst file may be missing "
                             "data, continuing: " + str(ese))

            # Remove the empty directory
            if not os.listdir(self.output_base_dir):
                os.rmdir(self.output_base_dir)
            pass

    # If the input data is a directory, create a list of all the
    # files in the directory and invoke the R script for this list
    # of files.
    if os.path.isdir(self.input_data):
        self.logger.debug("plot all files in directory " +
                          self.input_data)
        cmds_list = []
        all_tcst_files_list = util.get_files(self.input_data, ".*.tcst",
                                             self.logger)
        all_tcst_files = ' '.join(all_tcst_files_list)
        self.logger.debug("num of files " + str(len(all_tcst_files)))

        # Append the mandatory -lookin option to the base command.
        cmds_list.append(base_cmds)
        cmds_list.append(all_tcst_files)

        # dated_output_dir = self.create_output_subdir(self.output_plot)
        dated_output_dir = self.output_base_dir
        if self.output_base_dir:
            cmds_list.append(' -outdir ')
            util.mkdir_p(self.output_base_dir)
            cmds_list.append(self.output_base_dir)
            self.logger.debug("DEBUG: Creating dated output dir " +
                              dated_output_dir)

        if optionals_list:
            remaining_options = ''.join(optionals_list)
            cmds_list.append(remaining_options)

        # Due to the way cmds_list was created, join it all into
        # one string and then split that into a list, so element [0]
        # is 'Rscript', instead of 'Rscript self.tcmpr_script -lookin'.
        cmds_list = ''.join(cmds_list).split()
        cmd = batchexe(cmds_list[0])[cmds_list[1:]] > '/dev/null'
        self.logger.debug("DEBUG: Command run " + cmd.to_shell())

        # pylint:disable=unnecessary-pass
        # If a tc file is empty, continue to the next, thus the pass
        # isn't unnecessary.
        try:
            checkrun(cmd)
        except produtil.run.ExitStatusException as ese:
            # If the tcst file is empty (with the exception of the
            # header), or there is some other problem, then
            # plot_tcmpr.R will return with a non-zero exit status of 1.
            self.logger.warn("WARN: plot_tcmpr.R returned non-zero"
                             " exit status, tcst file may be missing"
                             " data... continuing: " + str(ese))

            # Remove the empty directory
            if not os.listdir(dated_output_dir):
                os.rmdir(dated_output_dir)
            pass

        # Reset empty cmds_list to prepare for next tcst file.
        cmds_list = []

    self.logger.info("INFO: Plotting complete")
def run_all_times(self):
    """! Builds the command for invoking the tcmpr.R plot script.

         Args:

         Returns:
    """
    self.logger.debug("TCMPR input " + self.input_data)
    self.logger.debug("TCMPR config file " + self.plot_config_file)
    self.logger.debug("output " + self.output_base_dir)

    # Create a dictionary of all the "optional" options and flags.
    cmds_dict = self.retrieve_optionals()

    # Create the TCMPR output base directory, where the final plots
    # will be saved.
    util.mkdir_p(self.output_base_dir)

    # If input data is a file, create a single command and invoke R script.
    if os.path.isfile(self.input_data):
        self.logger.debug("Currently plotting " + self.input_data)
        cmds_dict[' -lookin '] = self.input_data

        # Special treatment of the "optional" output_base_dir option
        # because we are supporting the plotting of multiple tcst files
        # in a directory.
        if self.output_base_dir:
            # dated_output_dir = self.create_output_subdir(self.input_data)
            cmds_dict[' -outdir '] = self.output_base_dir

        # Generate the list, where the -args are separated by their
        # values.
        full_cmd_list = ['Rscript', self.tcmpr_script]
        for key, value in cmds_dict.items():
            full_cmd_list.append(key)
            full_cmd_list.append(value)

        # Separate the 'Rscript' portion from the args, to conform to
        # produtil's exe syntax.
        cmd = exe(full_cmd_list[0])[full_cmd_list[1:]] > '/dev/null'
        self.logger.debug("Command run " + cmd.to_shell())
        self.logger.info("Generating requested plots for " +
                         self.input_data)

        # pylint:disable=unnecessary-pass
        # If a tc file is empty, continue to the next, thus the pass
        # isn't unnecessary.
        try:
            checkrun(cmd)
        except produtil.run.ExitStatusException as ese:
            self.logger.warn("plot_tcmpr.R returned non-zero"
                             " exit status, tcst file may be missing "
                             "data, continuing: " + repr(ese))

    # If the input data is a directory, create a list of all the
    # files in the directory and invoke the R script for this list
    # of files.
    elif os.path.isdir(self.input_data):
        self.logger.debug("plot all files in directory " +
                          self.input_data)
        cmds_dict = self.retrieve_optionals()
        all_tcst_files_list = util.get_files(self.input_data, ".*.tcst",
                                             self.logger)
        all_tcst_files = ' '.join(all_tcst_files_list)
        self.logger.debug("num of files " + str(len(all_tcst_files)))

        # Append the mandatory -lookin option to the base command.
        cmds_dict['-lookin'] = all_tcst_files
        if self.output_base_dir:
            cmds_dict['-outdir'] = self.output_base_dir
            self.logger.debug("Creating dated output dir " +
                              self.output_base_dir)

        # Create the full_cmd_list from the keys and values of the
        # cmds_dict and then form one command list.
        full_cmd_list = list()
        full_cmd_list.append("Rscript")
        full_cmd_list.append(self.tcmpr_script)
        for key, value in cmds_dict.items():
            full_cmd_list.append(key)
            if key == '-lookin':
                # Treat the list of dirs in -lookin differently:
                # append each individual directory to replicate the
                # original implementation's behavior of splitting the
                # commands by whitespace and assigning each command to
                # an item in a list.
                for tcst_file in all_tcst_files_list:
                    full_cmd_list.append(tcst_file)
            elif key == '-plot':
                # The plot types list is also appended as a single
                # string, delimited by ','.
                full_cmd_list.append(','.join(value))
            elif key == '-dep':
                # Dependent variables list items are appended as one
                # string.  Convert the list into a string delimited
                # by ','.
                full_cmd_list.append(','.join(value))
            else:
                full_cmd_list.append(value)

        # Separate the 'Rscript' portion from the args, to conform to
        # produtil's exe syntax.
        cmd = exe(full_cmd_list[0])[full_cmd_list[1:]] > '/dev/null'

        # This can be a very long command if the user has
        # indicated a directory.  Only log this if necessary.
        # self.logger.debug("DEBUG: Command run " + cmd.to_shell())
        # cmd_str = ' '.join(full_cmd_list)
        # cmd_list = 'Rscript ' + cmd_str
        # self.logger.debug('TCMPR Command run: ' + cmd_str)

        # Now run the command via produtil
        try:
            checkrun(cmd)
        except produtil.run.ExitStatusException as ese:
            # If the tcst file is empty (with the exception of the
            # header), or there is some other problem, then
            # plot_tcmpr.R will return with a non-zero exit status of 1.
            self.logger.error("plot_tcmpr.R returned non-zero"
                              " exit status, tcst file may be missing"
                              " data... continuing: " + str(ese))
            sys.exit(1)

    else:
        self.logger.error("Expected input is neither a file nor "
                          "directory, exiting...")
        sys.exit(1)

    self.logger.info("Plotting complete")
def create_input_file_info(self, file_type):
    """! Consolidate all the relevant information on the input files,
         such as full filepath, date (ymd or ymdh), cycle and offset
         times (if applicable/available from the filename), and the
         valid time.

         Args:
             file_type - either "fcst" (model) or "obs"

         Returns:
             consolidated_file_info - a list of named tuples containing
                                      information useful for determining
                                      the valid time of the file:
                                      full_filepath, date (ymd or ymdh),
                                      and offset/fhr if
                                      available/applicable.
    """
    # pylint:disable=protected-access
    # Need to call sys._getframe() to get the filename and method/func
    # for logging information.

    # Used for logging.
    cur_filename = sys._getframe().f_code.co_filename
    cur_function = sys._getframe().f_code.co_name
    self.logger.info("INFO|:" + cur_function + '|' + cur_filename + '| ' +
                     "Creating file information for model/fcst or obs...")

    # Get a list of all the model/fcst files
    dir_to_search = self.ps_dict['FCST_INPUT_DIR']
    fcst_file_regex = self.ps_dict['FCST_INPUT_FILE_REGEX']
    all_fcst_files = util.get_files(dir_to_search, fcst_file_regex,
                                    self.logger)

    # Get a list of all the obs files
    dir_to_search = self.ps_dict['OBS_INPUT_DIR']
    obs_file_regex = self.ps_dict['OBS_INPUT_FILE_REGEX']
    all_obs_files = util.get_files(dir_to_search, obs_file_regex,
                                   self.logger)

    # Initialize the output list
    consolidated_file_info = []

    # Determine which files are within the valid time window.
    # Whenever there is more than one fcst file with the same valid time,
    # keep it, because we want to perform verification for all fcst/model
    # forecast hours.
    time_method = self.ps_dict['TIME_METHOD']
    valid_start = self.ps_dict['START_DATE']
    valid_end = self.ps_dict['END_DATE']
    fhr_start = self.ps_dict['FCST_HR_START']
    fhr_end = self.ps_dict['FCST_HR_END']
    fhr_interval = self.ps_dict['FCST_HR_INTERVAL']
    fhr_start_secs = int(fhr_start) * self.HOURS_TO_SECONDS
    fhr_end_secs = int(fhr_end) * self.HOURS_TO_SECONDS
    last_fhr = fhr_end_secs + 1
    fhr_interval_secs = int(fhr_interval) * self.HOURS_TO_SECONDS
    date_start = self.convert_date_strings_to_unix_times(str(valid_start))
    date_end = self.convert_date_strings_to_unix_times(str(valid_end))
    all_valid_times = []
    all_dates = []
    all_fhrs = []
    for cur_fhr in range(fhr_start_secs, last_fhr, fhr_interval_secs):
        all_fhrs.append(cur_fhr)

    # Create a list of tuples: date (yyyymmdd) and forecast hour (both
    # in seconds) to represent all the valid times of interest.
    if time_method == 'BY_VALID':
        for cur_date in range(date_start, date_end, fhr_interval_secs):
            for cur_fhr in range(fhr_start_secs, last_fhr,
                                 fhr_interval_secs):
                cur_init_time = cur_date - cur_fhr
                if cur_init_time not in all_dates:
                    all_dates.append(cur_init_time)
                all_valid_times.append(cur_date)

    if time_method == 'BY_INIT':
        # original code from Minna
        for cur_date in range(date_start, date_end, fhr_interval_secs):
            for cur_fhr in range(fhr_start_secs, last_fhr,
                                 fhr_interval_secs):
                cur_valid_time = cur_date + cur_fhr
                if cur_valid_time not in all_valid_times:
                    all_valid_times.append(cur_valid_time)
                all_dates.append(cur_date)

    InputFileInfo = namedtuple('InputFileInfo',
                               'full_filepath, date, '
                               'valid_time, cycle_or_fcst')

    if file_type == "fcst":
        # Get the information for the fcst/model file
        if all_fcst_files:
            fcst_input_regex = self.ps_dict['FCST_INPUT_FILE_REGEX']
            regex_match = re.compile(fcst_input_regex)
            for fcst_file in all_fcst_files:
                match = re.match(regex_match, fcst_file)
                time_info_tuple = self.get_time_info_from_file(match)
                # Determine if this file's valid time is one of the
                # valid times of interest and corresponds to the
                # expected forecast hour (based on forecast hour start
                # and forecast hour interval).  If so, consolidate the
                # time info into the InputFileInfo tuple.
                if time_info_tuple.date in all_dates and \
                        time_info_tuple.cycle_or_fcst in all_fhrs:
                    input_file_info = \
                        InputFileInfo(fcst_file, time_info_tuple.date,
                                      time_info_tuple.valid,
                                      time_info_tuple.cycle_or_fcst)
                    consolidated_file_info.append(input_file_info)
        else:
            self.logger.error('ERROR:|' + cur_function + '|' +
                              cur_filename + '| No fcst files found in '
                              'specified input directory.  Please verify '
                              'that data files are present and the '
                              'input directory path in the config file '
                              'is correct.')
    else:
        # Get the relevant information for the obs file
        if all_obs_files:
            obs_input_regex = self.ps_dict['OBS_INPUT_FILE_REGEX']
            regex_match = re.compile(obs_input_regex)
            for obs_file in all_obs_files:
                match = re.match(regex_match, obs_file)
                time_info_tuple = self.get_time_info_from_file(match)
                # Determine if this file's valid time is one of the
                # valid times of interest.  If so, consolidate the time
                # info into the InputFileInfo tuple.  Obs files may or
                # may not have a cycle time (e.g. no cycle time:
                # prepbufr.gdas.2017061500.nc vs. cycle time:
                # prepbufr.nam.20170611.t00z.tm00.nc), so we need to
                # check if the cycle_or_fcst tuple value is None:
                if time_info_tuple.cycle_or_fcst is None:
                    if time_info_tuple.valid in all_valid_times:
                        input_file_info = \
                            InputFileInfo(obs_file, time_info_tuple.date,
                                          time_info_tuple.valid,
                                          time_info_tuple.cycle_or_fcst)
                        consolidated_file_info.append(input_file_info)
                else:
                    if time_info_tuple.valid in all_valid_times and \
                            time_info_tuple.cycle_or_fcst in all_fhrs:
                        input_file_info = \
                            InputFileInfo(obs_file, time_info_tuple.date,
                                          time_info_tuple.valid,
                                          time_info_tuple.cycle_or_fcst)
                        consolidated_file_info.append(input_file_info)
        else:
            self.logger.error('ERROR:|' + cur_function + '|' +
                              cur_filename + '| No obs files found in '
                              'specified input directory.  Please verify '
                              'that data files are present and the '
                              'input directory path in the config file '
                              'is correct.')

    return consolidated_file_info
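
# Illustrative usage sketch (the path and epoch values below are hypothetical,
# chosen only to show the shape of the data): each entry appended above is an
# InputFileInfo named tuple pairing a file with its date, valid time, and
# cycle/forecast hour so that later steps can match fcst and obs on valid time.
def _example_input_file_info():
    from collections import namedtuple
    InputFileInfo = namedtuple('InputFileInfo',
                               'full_filepath, date, valid_time, cycle_or_fcst')
    # 1497484800 is 2017-06-15 00Z expressed as seconds since the epoch.
    return InputFileInfo('/path/to/prepbufr.gdas.2017061500.nc',
                         1497484800, 1497484800, None)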
def run_all_times(self):
    """! Invoke the series analysis script based on the init time in
         the format YYYYMMDD_hh

         Args:

         Returns:
             None:  Creates graphical plots of storm tracks
    """
    # pylint:disable=protected-access
    # Need to call sys._getframe() to get the filename and method/func
    # for logging information.

    # Used for logging.
    cur_filename = sys._getframe().f_code.co_filename
    cur_function = sys._getframe().f_code.co_name
    self.logger.info("Starting series analysis by init time")

    # Set up the environment variable to be used in the Series Analysis
    # config file (SERIES_ANALYSIS_BY_INIT_CONFIG_FILE).
    # Used to set the cnt value in output_stats in
    # SERIES_ANALYSIS_BY_INIT_CONFIG_FILE.
    # Need to do some pre-processing so that Python will use " and not '
    # because currently MET doesn't support single quotes.
    tmp_stat_string = str(self.stat_list)
    tmp_stat_string = tmp_stat_string.replace("\'", "\"")

    # For example, we want tmp_stat_string to look like
    # '["TOTAL","FBAR"]', NOT "['TOTAL','FBAR']"
    os.environ['STAT_LIST'] = tmp_stat_string
    self.add_env_var('STAT_LIST', tmp_stat_string)

    series_filter_opts = \
        self.config.getstr('config', 'SERIES_ANALYSIS_FILTER_OPTS')

    if self.regrid_with_met_tool:
        # Regridding via MET tool regrid_data_plane.
        fcst_tile_regex = self.config.getstr('regex_pattern',
                                             'FCST_NC_TILE_REGEX')
        anly_tile_regex = self.config.getstr('regex_pattern',
                                             'ANLY_NC_TILE_REGEX')
    else:
        # Regridding via wgrib2 tool.
        fcst_tile_regex = self.config.getstr('regex_pattern',
                                             'FCST_TILE_REGEX')
        anly_tile_regex = self.config.getstr('regex_pattern',
                                             'ANLY_TILE_REGEX')

    # Initialize the tile_dir to point to the extract_tiles_dir
    # and retrieve a list of init times based on the data available in
    # the extract tiles directory.
    tile_dir = self.extract_tiles_dir
    init_times = util.get_updated_init_times(tile_dir, self.logger)

    # Check for input tile data.
    try:
        util.check_for_tiles(tile_dir, fcst_tile_regex,
                             anly_tile_regex, self.logger)
    except OSError:
        msg = ("Missing n x m tile files.  "
               "Extract tiles needs to be run")
        self.logger.error(msg)

    # If applicable, apply any filtering via tc_stat, as indicated in the
    # parameter/config file.
    tmp_dir = os.path.join(self.config.getdir('TMP_DIR'), str(os.getpid()))
    if series_filter_opts:
        self.apply_series_filters(tile_dir, init_times,
                                  self.series_filtered_out_dir,
                                  self.filter_opts, tmp_dir)

        # Clean up any empty files and directories that could arise as
        # a result of filtering.
        util.prune_empty(self.series_filtered_out_dir, self.logger)

        # Get the list of all the files that were created as a result
        # of applying the filter options.
        # First, make sure that the series_lead_filtered_out
        # directory isn't empty.  If it is empty, then no files fell
        # within the filter criteria.
        if os.listdir(self.series_filtered_out_dir):
            # The series filter directory has data, use this directory
            # as input for series analysis.
            tile_dir = self.series_filtered_out_dir

            # Generate the tmp_anly and tmp_fcst files used to validate
            # filtering and for troubleshooting.
            # The tmp_fcst and tmp_anly ASCII files contain the
            # list of files that meet the filter criteria.
            filtered_dirs_list = util.get_files(tile_dir, ".*.",
                                                self.logger)
            util.create_filter_tmp_files(filtered_dirs_list,
                                         self.series_filtered_out_dir,
                                         self.logger)
        else:
            # Applying the filter produced no results.  Rather than
            # stopping, continue by using the files from extract_
            # tiles as input.
            msg = ("Applied series filter options, no results..."
+ "using extract tiles data for series analysis input.") self.logger.debug(msg) tile_dir = self.extract_tiles_dir else: # No additional filtering was requested. # Use the data in the extract tiles directory # as input for series analysis. # source of input tile data. tile_dir = self.extract_tiles_dir # Create FCST and ANLY ASCII files based on init time and storm id. # These are arguments to the # -fcst and -obs arguments to the MET Tool series_analysis. # First, get an updated list of init times, # since filtering can reduce the amount of init times. sorted_filter_init = self.get_ascii_storm_files_list(tile_dir) # Clean up any remaining empty files and dirs util.prune_empty(self.series_out_dir, self.logger) self.logger.debug("Finished creating FCST and ANLY ASCII files, and " + "cleaning empty files and dirs") # Build up the arguments to and then run the MET tool series_analysis. self.build_and_run_series_request(sorted_filter_init, tile_dir) # Generate plots # Check for .nc files in output_dir first, if these are absent, the # there is a problem. if self.is_netcdf_created(): self.generate_plots(sorted_filter_init, tile_dir) else: self.logger.error("No NetCDF files were created by" " series_analysis, exiting...") sys.exit(errno.ENODATA) self.logger.info("Finished series analysis by init time")
def get_ascii_storm_files_list(self, tile_dir):
    """! Creates the list of ASCII files that contain the storm id and
         init times.  The list is used to create an ASCII file which
         will be used as the option to the -obs or -fcst flag to the
         MET series_analysis tool.

         Args:
             @param tile_dir:  The directory where input files reside.

         Returns:
             sorted_filter_init:  A list of the sorted directories
                                  corresponding to the init times after
                                  filtering has been applied.  If
                                  filtering produced no results, this
                                  is the list of files created from
                                  running extract_tiles.
    """
    # pylint:disable=protected-access
    # Need to call sys._getframe() to get the filename and method/func
    # for logging information.

    # For logging
    cur_filename = sys._getframe().f_code.co_filename
    cur_function = sys._getframe().f_code.co_name

    filter_init_times = util.get_updated_init_times(tile_dir, self.logger)
    sorted_filter_init = sorted(filter_init_times)

    for cur_init in sorted_filter_init:
        # Get all the storm ids for storm track pairs that
        # correspond to this init time.
        storm_list = self.get_storms_for_init(cur_init, tile_dir)
        if not storm_list:
            # No storms for this init time,
            # check next init time in list.
            continue
        else:
            for cur_storm in storm_list:
                # First get the filenames for the gridded forecast and
                # analysis (n deg x m deg tiles that were created by
                # extract_tiles).  These files are aggregated by
                # init time and storm id.
                anly_grid_regex = ".*ANLY_TILE_F.*grb2"
                fcst_grid_regex = ".*FCST_TILE_F.*grb2"

                if self.regrid_with_met_tool:
                    anly_grid_regex = ".*ANLY_TILE_F.*nc"
                    fcst_grid_regex = ".*FCST_TILE_F.*nc"

                anly_grid_files = util.get_files(tile_dir,
                                                 anly_grid_regex,
                                                 self.logger)
                fcst_grid_files = util.get_files(tile_dir,
                                                 fcst_grid_regex,
                                                 self.logger)

                # Now do some checking to make sure we aren't missing
                # either the forecast or analysis files; if so, log
                # the error and proceed to the next storm in the list.
                if not anly_grid_files or not fcst_grid_files:
                    # No gridded analysis or forecast files found,
                    # continue.
                    self.logger.info("no gridded analysis or forecast "
                                     "file found, continue to next storm")
                    continue

                # Now create the FCST and ANLY ASCII files based on
                # cur_init and cur_storm:
                self.create_fcst_anly_to_ascii_file(
                    fcst_grid_files, cur_init, cur_storm,
                    self.fcst_ascii_file_prefix)
                self.create_fcst_anly_to_ascii_file(
                    anly_grid_files, cur_init, cur_storm,
                    self.anly_ascii_file_prefix)
                util.prune_empty(self.series_out_dir, self.logger)

    return sorted_filter_init
def get_fcst_file_info(self, dir_to_search, cur_init, cur_storm):
    """! Get the number of gridded forecast n x m tile files for a
         given storm id and init time (that were created by
         extract_tiles), and determine the filenames of the first and
         last files.  This information is used to create the value for
         the -title option in plot_data_plane.

         Args:
             @param dir_to_search:  The directory of the gridded files
                                    of interest.
             @param cur_init:  The init time of interest.
             @param cur_storm:  The storm id of interest.

         Returns:
             num, beg, end:  A tuple representing the number of
                             forecast tile files, and the first and
                             last file.

                             sys.exit(1) otherwise
    """
    # pylint:disable=protected-access
    # Need to call sys._getframe() to get the filename and method/func
    # for logging information.

    # For logging
    cur_filename = sys._getframe().f_code.co_filename
    cur_function = sys._getframe().f_code.co_name

    # Get a sorted list of the forecast tile files for the init
    # time of interest for all the storm ids and return the
    # forecast hour corresponding to the first and last file.
    # base_dir_to_search = os.path.join(output_dir, cur_init)
    gridded_dir = os.path.join(dir_to_search, cur_init, cur_storm)
    search_regex = ".*FCST_TILE.*.grb2"

    if self.regrid_with_met_tool:
        search_regex = ".*FCST_TILE.*.nc"

    files_of_interest = util.get_files(gridded_dir, search_regex,
                                       self.logger)
    sorted_files = sorted(files_of_interest)
    if not files_of_interest:
        msg = ("exiting, no files found for init time of interest"
               " and directory: " + dir_to_search)
        self.logger.error(msg)
        sys.exit(1)

    first = sorted_files[0]
    last = sorted_files[-1]

    # Extract the forecast hour from the first and last filenames.
    match_beg = re.search(".*FCST_TILE_(F[0-9]{3}).*.grb2", first)
    match_end = re.search(".*FCST_TILE_(F[0-9]{3}).*.grb2", last)
    if self.regrid_with_met_tool:
        match_beg = re.search(".*FCST_TILE_(F[0-9]{3}).*.nc", first)
        match_end = re.search(".*FCST_TILE_(F[0-9]{3}).*.nc", last)

    if match_beg:
        beg = match_beg.group(1)
    else:
        msg = "Unexpected file format encountered, exiting..."
        self.logger.error(msg)
        sys.exit(1)

    if match_end:
        end = match_end.group(1)
    else:
        msg = "Unexpected file format encountered, exiting..."
        self.logger.error(msg)
        sys.exit(1)

    # Get the number of forecast tile files
    num = len(sorted_files)

    return num, beg, end
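
# Small illustrative sketch (filenames below are made up, matching the
# FCST_TILE_F### pattern used in the regexes above): given a sorted list of
# tile files, the first and last forecast hours can be pulled out like this.
def _example_first_last_fhr(sorted_files):
    """Return the (first, last) F### forecast-hour tags from a sorted list of
       FCST tile filenames, or None for a name that does not match.
       Hypothetical helper shown only for clarity."""
    import re
    pattern = r'.*FCST_TILE_(F[0-9]{3}).*'
    match_beg = re.search(pattern, sorted_files[0])
    match_end = re.search(pattern, sorted_files[-1])
    beg = match_beg.group(1) if match_beg else None
    end = match_end.group(1) if match_end else None
    return beg, end

# e.g. _example_first_last_fhr(['FCST_TILE_F000_gfs.grb2',
#                               'FCST_TILE_F048_gfs.grb2']) -> ('F000', 'F048')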