def HL_error_covs(list_of_files, outfilename="corrs.nc"):
    """ Top-level routine that calculates the H+L error covariances
        from a list of netcdf files containing the accumulated
        statistics for each grid box.

    *************** PARAMETERS *******************
    1. list_of_files: list of netcdf files containing the accumulated
                      statistics
    2. outfilename: name of file to contain the output statistics
    """
    # Check list of netcdf files
    list_of_files = Utils.check_files(list_of_files)
    if list_of_files is None:
        raise ValueError("[ERROR] NETCDF FILES NOT FOUND")

    # Grid and bin dimensions are taken from the first file in the list
    grid_lat, grid_lon, depths, bins = IO.ncread_dimension_variables(
                                                        list_of_files[0])
    nbin = len(bins)
    ndep = len(depths)
    nlat = len(grid_lat)
    nlon = len(grid_lon)

    # Create netcdf object and add dimensions
    outfile = IO.nc_define_dimensions(outfilename, nlat, nlon, nbin, ndep)

    # Add netcdf variables (final ErrorCovs stats)
    IO.nc_define_cov_variables(outfile)

    # Write dimension variables (depth, lat, lon and bins)
    IO.ncwrite_dimension_variables(outfile, grid_lat, grid_lon, depths, bins)

    for dep_lev in range(ndep):
        print("MESSAGE: Calculating error covariance for level: " +
              str(depths[dep_lev]) + " m")

        # Accumulate the per-file statistics for this depth level
        final_cov_stats = arrays.CovSumStats((nlat, nlon, nbin))
        final_grid_stats = arrays.GridSumStats((nlat, nlon))
        for f in list_of_files:
            print("MESSAGE: Reading file {}".format(f))
            cov_stats, grid_stats = IO.ncread_accum_stats(
                                           f, nlat, nlon, nbin, dep_lev)
            final_cov_stats += cov_stats
            final_grid_stats += grid_stats

        # Calculate correlation and covariance
        cov_xy, corr_xy, grid_mean, grid_var, numobsgrid, \
            numpairscov = HLerrorCovs.calc_err_covs(final_cov_stats,
                                                    final_grid_stats,
                                                    nbin, nlat, nlon)

        # Mask output data
        applyMask.mask_output_cov_data(grid_mean, grid_var, numobsgrid,
                                       numpairscov, cov_xy, corr_xy)

        # Write error covariances to output file
        IO.ncwrite_covariance(outfile, dep_lev, grid_mean, grid_var,
                              numobsgrid, numpairscov, cov_xy, corr_xy)
    outfile.close()
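# A minimal usage sketch for HL_error_covs (the file names below are
# hypothetical; any accumulated-statistics files written by
# HL_cov_accum_stats will do, and the grid/bin dimensions are read from
# the first file in the list):
#
#     HL_error_covs(["accum_stats_jan.nc", "accum_stats_feb.nc"],
#                   outfilename="corrs.nc")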
def mp_calc_cov_accum_stats(self, args):
    """ Routine to run on multiprocessors that reads in feedback files
        and calculates accumulated grid and covariance statistics

    ********* PARAMETERS ********
    1. args: dictionary with members: list of files, bins, grid_lat,
             grid_lon, depth_range, obs_type and source_types

    ********* RETURNS ***********
    1. sum_stats: CovSumStats object containing arrays of summed
                  covariance statistical quantities for each grid box
    2. grid_stats: GridSumStats object containing arrays of within-grid
                   summed statistical quantities for each grid box
    """
    # Initialise data array classes
    nlat = len(args["grid_lat"])
    nlon = len(args["grid_lon"])
    nbin = len(args["bins"])
    depth_range = args["depth_range"]
    source_types = args["source_types"]
    cov_stats = arrays.CovSumStats((nlat, nlon, nbin))
    grid_stats = arrays.GridSumStats((nlat, nlon))

    # Loop over files and calculate the accumulated stats
    for infile in args["list_of_files"]:
        if depth_range:
            print("Processing file: {} | Depths: {} to {}".format(
                  infile, str(depth_range[0]), str(depth_range[1])))
        else:
            print("Processing file: {} | Depths: Surface variable".format(
                  infile))

        # Read fdbk variables
        fdbk_var_array, depths = IO.ncread_fdbk_vars(
                infile, args['obs_type'], args['source_types'])

        # For profiles, pick a single observation at each depth
        # range/latitude/longitude. This is done to avoid profiles
        # being correlated with themselves.
        if depth_range:
            fdbk_var_array = ObsProfiles.random_subsample_profiles(
                                    depths, depth_range, fdbk_var_array)

        if len(fdbk_var_array.lats) > 0:
            # Shift negative longitudes onto [0, 360) when the grid
            # uses that convention
            if np.min(args["grid_lon"]) >= 0.:
                mask = fdbk_var_array.lons < 0.
                fdbk_var_array.lons[mask] = fdbk_var_array.lons[mask] + 360.

            # Need to squash obs and model arrays to be 1D arrays
            fdbk_var_array.mod_vals = fdbk_var_array.mod_vals.flatten()
            fdbk_var_array.obs_vals = fdbk_var_array.obs_vals.flatten()

            # Update stats with summed quantities for each call
            cov_stats, grid_stats = self.calc_cov_stats(
                    args["grid_lat"], args["grid_lon"], args["bins"],
                    fdbk_var_array, cov_stats, grid_stats)
    return cov_stats, grid_stats
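# mp_calc_cov_accum_stats is normally driven through multiprocessing.Pool
# from HL_cov_accum_stats below, but it can be exercised serially. A sketch
# of the expected args dictionary, mirroring the arg_list construction in
# HL_cov_accum_stats (the file name and values here are illustrative):
#
#     args = {"list_of_files": ["fdbk_001.nc"],            # hypothetical
#             "bins": np.arange(50., 1000., 50.) * 1000.,  # metres
#             "grid_lat": np.arange(-90., 90., 2.) + 1.,   # cell centres
#             "grid_lon": np.arange(-180., 180., 2.) + 1.,
#             "depth_range": [],         # empty for surface variables
#             "obs_type": "SST",
#             "source_types": []}        # empty list means all types
#     cov_stats, grid_stats = HLerrorCovs.mp_calc_cov_accum_stats(args)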
def HL_cov_accum_stats(list_of_fdbackfiles, obs_type="SST",
                       outfilename="accum_stats.nc",
                       grid_def=[[-90, 90, 2.], [-180, 180, 2.]],
                       bins=np.arange(50., 1000., 50.),
                       depth_boundaries=[], source_types=[], nproc=1):
    """ Top-level routine that calculates the H+L accumulated statistics
        from a list of feedback files using multiprocessing and produces
        an output file which is further used to calculate the error
        covariances.

    *************** PARAMETERS *******************
    1. list_of_fdbackfiles: list of feedback files
    2. obs_type: observation type to process
    3. outfilename: name of file to contain the output statistics
    4. grid_def: definition of the pre-sorting grid; a list of lists of
                 the form [[min lat, max lat, delta lat],
                 [min lon, max lon, delta lon]]
    5. bins: list defining the upper boundary of the bins of separation
             distance (in km) to be used for the correlation calculation
    6. depth_boundaries: depth level boundaries (only used for
                         processing profile observations)
    7. source_types: observation id types to process (default: [],
                     which means process all)
    8. nproc: number of processors (default is 1)
    """
    # Check list of feedback files
    list_of_fdbackfiles = Utils.check_files(list_of_fdbackfiles)
    if list_of_fdbackfiles is None:
        raise ValueError("[ERROR] FEEDBACK FILES NOT FOUND")

    # Define number of bins
    nbin = len(bins)
    bins = np.array(bins)

    # Define depth variables; levels are centred midway between
    # consecutive depth boundaries
    ndep = 1
    depths = 0
    if any(depth_boundaries):
        depth_boundaries = np.array(depth_boundaries)
        ndep = len(depth_boundaries) - 1
        depths = 0.5 * (depth_boundaries[:-1] + depth_boundaries[1:])

    # Create variables for pre-sorting grid (cell-centre coordinates)
    grid_lat = np.arange(grid_def[0][0], grid_def[0][1],
                         grid_def[0][2]) + grid_def[0][2] / 2.
    grid_lon = np.arange(grid_def[1][0], grid_def[1][1],
                         grid_def[1][2]) + grid_def[1][2] / 2.
    nlat = len(grid_lat)
    nlon = len(grid_lon)

    # Divide list of files between the processors
    list_per_proc = Utils.divide_files_per_proc(nproc, list_of_fdbackfiles)

    # Create netcdf object and add dimensions
    outfile = IO.nc_define_dimensions(outfilename, nlat, nlon, nbin, ndep)

    # Add netcdf variables (accumulated stats)
    IO.nc_define_accum_stats_variables(outfile)

    # Write dimension variables (depth, lat, lon and bins)
    IO.ncwrite_dimension_variables(outfile, grid_lat, grid_lon, depths, bins)

    print("MESSAGE: {} nprocs to process {} feedback files".format(
          nproc, len(list_of_fdbackfiles)))

    for dep_lev in range(ndep):
        print("MESSAGE: Calculating accumulated stats for level: ", dep_lev)

        # Set up workers
        workers = Pool(nproc)
        if ndep == 1:
            depth_range = []
        else:
            depth_range = [depth_boundaries[dep_lev],
                           depth_boundaries[dep_lev + 1]]

        arg_list = []
        for n in range(nproc):
            arg_list += [{"list_of_files": list_per_proc[n],
                          "bins": bins * 1000.,  # NOTE: km to metres
                          "grid_lon": grid_lon,
                          "grid_lat": grid_lat,
                          "depth_range": depth_range,
                          "obs_type": obs_type,
                          "source_types": source_types}]

        # Send tasks off to workers
        work_output = workers.map(HLerrorCovs.mp_calc_cov_accum_stats,
                                  arg_list)
        workers.close()

        # Accumulate stats over all the processors
        sum_stats = arrays.CovSumStats((nlat, nlon, nbin))
        grid_stats = arrays.GridSumStats((nlat, nlon))
        for p in work_output:
            sum_stats += p[0]
            grid_stats += p[1]

        # Write accumulated stats to output file
        IO.ncwrite_accum_stats(outfile, dep_lev, sum_stats, grid_stats)
    outfile.close()
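# Example call for profile observations (a sketch; the observation type
# and depth boundaries are illustrative). With five boundaries the
# statistics are accumulated on four levels, each centred midway between
# consecutive boundaries:
#
#     HL_cov_accum_stats(fdbk_files, obs_type="POTM",
#                        outfilename="accum_stats_potm.nc",
#                        depth_boundaries=[0., 10., 20., 50., 100.],
#                        nproc=4)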
import os

# Cap the thread counts of numpy's underlying maths libraries at one per
# process (set before numpy is imported so they take effect), so that the
# multiprocessing workers do not oversubscribe the available cores during
# the error covariance calculation.
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import numpy as np
from multiprocessing import Pool

################## Code modules ##############################
import arrays
from io_data import IO
from utils import Utils
from errorCovs import HLerrorCovs
from masks import applyMask

# Initialising the classes
IO = IO()
Utils = Utils()
HLerrorCovs = HLerrorCovs()
applyMask = applyMask()
def HL_fitting_function(infile, outfilename, func_name="MultiGauss",
                        num_funcs=2, lenscale=(400, 40), plot=None,
                        outfig='./figures', nproc=4, min_num_obs=2,
                        max_iter=100):
    """ Top-level routine that fits a specific function to the HL stats
        covariance file

    ***************** PARAMETERS *******************
    1. infile: name of file containing the HL error covariances
    2. outfilename: name of file to write the results to
    3. func_name: name of function to fit (options: MultiGauss and
                  MultiGauss_Fixed)
    4. num_funcs: number of functions to use (default: 2)
    5. lenscale: tuple of pre-defined lengthscales, used as fixed values
                 in MultiGauss_Fixed or as initial guesses in MultiGauss.
                 The number of tuple members must equal the number of
                 functions.
    6. plot: positions of (x,y) pairs to plot, or None (default: None)
    7. outfig: path to save the figures
    8. nproc: number of processors to use (default: 4)
    9. min_num_obs: minimum number of observations required to do the
                    calculations
    10. max_iter: maximum number of iterations
    """
    # Check consistency of input parameters
    if func_name != "MultiGauss" and func_name != "MultiGauss_Fixed":
        raise ValueError("[ERROR] FUNCTION NOT AVAILABLE")
    if len(lenscale) != num_funcs:
        raise ValueError("[ERROR] NUMBER OF LENGTHSCALES NOT COMPATIBLE "
                         "WITH NUMBER OF FUNCTIONS")
    if not os.path.exists(infile):
        raise ValueError("[ERROR] INPUT FILE NOT FOUND")

    # Read dimension variables from netcdf file
    lats, lons, depth, bins = IO.ncread_dimension_variables(infile)

    # Create netcdf object and add dimensions
    outfile = IO.nc_define_dimensions(outfilename, len(lats), len(lons),
                                      len(depth))

    # Write dimension variables (depth, lat and lon)
    IO.ncwrite_dimension_variables(outfile, lats, lons, depth)

    # Add attributes
    outfile.Function = ("Function fitting done using the " + func_name +
                        " function")

    # Add variables
    IO.nc_define_vars(outfile, "Chi_sq", 'f',
                      ("depth", "latitude", "longitude"))
    IO.nc_define_vars(outfile, "obs_err", 'f',
                      ("depth", "latitude", "longitude"))

    # Calculate x positions based on the separation distances
    x_val = Posproc.calc_x_positions(bins)

    for lev in range(len(depth)):
        print("MESSAGE: Fitting function " + func_name +
              " to ErrorCov data: " + str(depth[lev]) + " m")

        # Set up workers
        workers = Pool(nproc)

        # Read error covariance variables
        var, cors, numobsvar = IO.ncread_errorcovs(infile, lev)

        # Create list of arguments to run in parallel
        arg_lists = Posproc.create_arg_list(x_val, cors, var, numobsvar,
                                            min_num_obs, func_name,
                                            num_funcs, lenscale, max_iter)

        # Get workers to do parallel calculations
        results = workers.map(Posproc.fitter, arg_lists)
        workers.close()

        # Unravel results into output grids
        params, obs_err, chi_grid = Posproc.results_to_grid(
                                          results, len(lats), len(lons))

        # Plot some results if requested
        if plot is not None:
            print("MESSAGE: Plotting results - data versus fitting: " +
                  str(depth[lev]) + " m")
            Plots.plot_data_vs_fitting(outfig, plot, x_val, cors, var,
                                       obs_err, lats, lons, depth[lev],
                                       params, func_name, num_funcs,
                                       lenscale)

        print("MESSAGE: Writing data to netcdf file: " +
              str(depth[lev]) + " m")
        if lev == 0:
            # Define netcdf variables from the fitting function results
            for param in range(len(params)):
                IO.nc_define_vars(outfile,
                                  arg_lists[0]["func"].param_names()[param],
                                  'f', ("depth", "latitude", "longitude"))

        # Add variables to netcdf
        IO.ncwrite_output(outfile, arg_lists[0]["func"], chi_grid,
                          obs_err, params, lev)
    outfile.close()
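if __name__ == "__main__":
    # Minimal end-to-end sketch of the H+L workflow (file names and
    # parameter values are illustrative, not taken from the original code).
    feedback_files = ["fdbk_001.nc", "fdbk_002.nc"]  # hypothetical inputs

    # 1. Accumulate per-grid-box statistics from the feedback files
    HL_cov_accum_stats(feedback_files, obs_type="SST",
                       outfilename="accum_stats.nc", nproc=4)

    # 2. Combine the accumulated statistics into error covariances
    HL_error_covs(["accum_stats.nc"], outfilename="corrs.nc")

    # 3. Fit an analytic function to the binned correlations
    HL_fitting_function("corrs.nc", "fit.nc", func_name="MultiGauss",
                        num_funcs=2, lenscale=(400, 40), nproc=4)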