def HL_error_covs(list_of_files, outfilename="corrs.nc"):

    """ Top-level routine that calculates the H+L error covariances from a 
        list of netcdf files containing the accumulated statistics for each
        grid box.

    *************** PARAMETERS *******************
    1. list_of_files: list of netcdf files containing the accumulated statistics
    2. outfilename: name of file to contain the output statistics
    """ 
    # Check list of netcdf files
    list_of_files = Utils.check_files(list_of_files)
    if list_of_files is None:
        raise ValueError("[ERROR] NETCDF FILES NOT FOUND")

    grid_lat, grid_lon, depths, bins = IO.ncread_dimension_variables(list_of_files[0])
    nbin = len(bins)
    ndep = len(depths)
    nlat = len(grid_lat)
    nlon = len(grid_lon)

    # Create netcdf object and add dimensions
    outfile = IO.nc_define_dimensions(outfilename, nlat, nlon, nbin, ndep)
    
    # Add netcdf variables (final ErrorCovs stats)
    IO.nc_define_cov_variables(outfile)

    # Write dimension variables (depth, lat, lon and bins)
    IO.ncwrite_dimension_variables(outfile, grid_lat, grid_lon, 
                                   depths, bins)

    for dep_lev in range(0, ndep):
        print("MESSAGE: Calculating error covariance for level: " + str(depths[dep_lev]) + " m")
        final_cov_stats = arrays.CovSumStats((nlat, nlon, nbin))
        final_grid_stats = arrays.GridSumStats((nlat, nlon))
        for f in list_of_files:
            print("MESSAGE: Reading file {}".format(f))
            cov_stats, grid_stats = IO.ncread_accum_stats(f, nlat, nlon, nbin, dep_lev)
            final_cov_stats += cov_stats
            final_grid_stats += grid_stats

        # Calculate correlation and covariance
        cov_xy, corr_xy, grid_mean, grid_var, numobsgrid, numpairscov = \
            HLerrorCovs.calc_err_covs(final_cov_stats, final_grid_stats,
                                      nbin, nlat, nlon)

        # Mask output data
        applyMask.mask_output_cov_data(grid_mean, grid_var, numobsgrid,
                                       numpairscov, cov_xy, corr_xy)

        # Write error covariances to output file
        IO.ncwrite_covariance(outfile, dep_lev, grid_mean, grid_var,
                              numobsgrid, numpairscov, cov_xy, corr_xy)

    outfile.close()
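
# A minimal usage sketch for the routine above (hypothetical file names; it
# assumes the inputs were produced beforehand by HL_cov_accum_stats, shown in
# a later example):
if __name__ == "__main__":
    accum_files = ["accum_stats_jan.nc", "accum_stats_feb.nc"]  # hypothetical
    HL_error_covs(accum_files, outfilename="hl_error_covs.nc")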
Example #2
    def mp_calc_cov_accum_stats(self, args):
        """ Routine run on multiple processors that reads in feedback
            files and calculates the accumulated grid and covariance
            statistics

            ********* PARAMETERS *********
            1. args: dictionary with members: list_of_files, bins,
                     grid_lat, grid_lon, depth_range, obs_type and
                     source_types

            ********* RETURNS ***********
            1. sum_stats: CovSumStats object containing arrays of
                          summed covariance statistical quantities
                          for each grid box
            2. grid_stats: GridSumStats object containing arrays of
                           within-grid summed statistical quantities
                           for each grid box
        """
        # initialise the data array classes
        nlat = len(args["grid_lat"])
        nlon = len(args["grid_lon"])
        nbin = len(args["bins"])
        depth_range = args["depth_range"]
        source_types = args["source_types"]
        cov_stats = arrays.CovSumStats((nlat, nlon, nbin))
        grid_stats = arrays.GridSumStats((nlat, nlon))

        # loop over files and calculate the accumulated stats
        for infile in args["list_of_files"]:
            if depth_range:
                print("Processing file: {} | Depths: {} to {}".format(
                    infile, depth_range[0], depth_range[1]))
            else:
                print("Processing file: {} | Depths: Surface variable".format(
                    infile))

            # Read fdbk variables
            fdbk_var_array, depths = IO.ncread_fdbk_vars(
                infile, args["obs_type"], source_types)

            # For profiles pick a single observation at each depth range/latitude/longitude
            # This is done to avoid profiles being correlated with themselves
            if depth_range:
                fdbk_var_array = ObsProfiles.random_subsample_profiles(
                    depths, depth_range, fdbk_var_array)

            if len(fdbk_var_array.lats) > 0:
                # Shift negative longitudes onto a 0-360 range when the
                # pre-sorting grid uses one
                if np.min(args["grid_lon"]) >= 0.:
                    fdbk_var_array.lons[fdbk_var_array.lons < 0.] += 360.

                # Need to squash obs and model arrays to be 1D array
                fdbk_var_array.mod_vals = fdbk_var_array.mod_vals.flatten()
                fdbk_var_array.obs_vals = fdbk_var_array.obs_vals.flatten()

                # Update stats with summed quantities for each call
                cov_stats, grid_stats = self.calc_cov_stats(
                    args["grid_lat"], args["grid_lon"], args["bins"],
                    fdbk_var_array, cov_stats, grid_stats)

        return cov_stats, grid_stats
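
    # Sketch of the "args" dictionary this worker expects, with hypothetical
    # values (HL_cov_accum_stats in a later example builds one per processor):
    #
    #   args = {"list_of_files": ["fdbk_001.nc"],           # hypothetical name
    #           "bins": np.arange(50., 1000., 50.) * 1000.,  # metres
    #           "grid_lat": grid_lat, "grid_lon": grid_lon,
    #           "depth_range": [0., 100.],
    #           "obs_type": "SST", "source_types": []}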
def HL_cov_accum_stats(list_of_fdbackfiles, obs_type="SST",
                       outfilename="accum_stats.nc", grid_def=[[-90,90,2.],[-180,180,2.]],
                       bins=np.arange(50,1000.,50.), depth_boundaries=[],
                       source_types=[], nproc=1):

    """ Top-level routine that calculates the H+L accumulated statistics from a 
        list of feedback files using multiprocessing and produces an output file
        which is further used to calculate the error covariances.

    *************** PARAMETERS *******************
    1. list_of_fdbackfiles: list of feedback files
    2. obs_type: observation type to process
    3. outfilename: name of file to contain the output statistics
    4. grid_def: definition of the pre-sorting grid; a list-of-lists of the form
                 [[min lat, max lat, delta lat], [min lon, max lon, delta lon]]
    5. bins: list defining the upper boundary of the bins of separation distance
             (in km) to be used for the correlation calculation
    6. depth_boundaries: depth level boundaries (only used for processing
             profile observations)
    7. source_types: observation id types to process
                     (default: [], which means process all)
    8. nproc: number of processors (default is 1)
    """ 
    # Check list of feedback files
    list_of_fdbackfiles = Utils.check_files(list_of_fdbackfiles)
    if list_of_fdbackfiles is None:
        raise ValueError("[ERROR] FEEDBACK FILES NOT FOUND")

    # define number of bins
    nbin = len(bins)
    bins = np.array(bins)
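    # Worked example: the default bins = np.arange(50, 1000., 50.) define 19
    # separation-distance bins with upper bounds 50, 100, ..., 950 km.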

    # define depth variables
    ndep = 1
    depths = 0
    if len(depth_boundaries) > 0:
        depth_boundaries = np.array(depth_boundaries)
        ndep = len(depth_boundaries) - 1
        depths = 0.5 * (depth_boundaries[:-1] + depth_boundaries[1:])
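    # Worked example (hypothetical values): depth_boundaries = [0., 100., 300.]
    # gives ndep = 2 layers with mid-point depths of [50., 200.] metres.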
    
    # create variables for the pre-sorting grid (box centres)
    grid_lat = np.arange(grid_def[0][0], grid_def[0][1], grid_def[0][2]) + grid_def[0][2]/2.
    grid_lon = np.arange(grid_def[1][0], grid_def[1][1], grid_def[1][2]) + grid_def[1][2]/2.
    nlat = len(grid_lat)
    nlon = len(grid_lon)
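    # Worked example: the default grid_def [[-90, 90, 2.], [-180, 180, 2.]]
    # yields 2-degree boxes centred at latitudes -89., -87., ..., 89. and
    # longitudes -179., -177., ..., 179. (nlat = 90, nlon = 180).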

    # divide list of files for each processor
    list_per_proc = Utils.divide_files_per_proc(nproc, list_of_fdbackfiles)

    # Create netcdf object and add dimensions
    outfile = IO.nc_define_dimensions(outfilename, nlat, nlon, nbin, ndep)
    
    # Add netcdf variables (accumulated stats)
    IO.nc_define_accum_stats_variables(outfile)

    # Write dimension variables (depth, lat, lon and bins)
    IO.ncwrite_dimension_variables(outfile, grid_lat, grid_lon, depths, bins)

    print("MESSAGE: {} nprocs to process {} feedback files".format(nproc,
                                               len(list_of_fdbackfiles)))
    for dep_lev in range(0, ndep):
        print("MESSAGE: Calculating accumulated stats for level: {}".format(dep_lev))

        # set up workers
        workers = Pool(nproc)
        
        if ndep == 1:
            depth_range = []
        else:
            depth_range = [depth_boundaries[dep_lev],depth_boundaries[dep_lev+1]]
            
        arg_list = []
        for n in range(0, nproc):
            arg_list += [{"list_of_files":list_per_proc[n],
                          "bins": bins*1000,    # NOTE: conversion to meters
                          "grid_lon": grid_lon,
                          "grid_lat": grid_lat,
                          "depth_range": depth_range,
                          "obs_type": obs_type,
                          "source_types": source_types}]

        # send tasks off to workers
        work_output = workers.map(HLerrorCovs.mp_calc_cov_accum_stats, arg_list)
        workers.close()
 
        # Accumulate stats over all the processors
        sum_stats = arrays.CovSumStats((nlat, nlon, nbin))
        grid_stats = arrays.GridSumStats((nlat, nlon))
        for p in work_output:
            sum_stats += p[0]
            grid_stats += p[1]
  
        # Write accumulated stats to output file
        IO.ncwrite_accum_stats(outfile, dep_lev, sum_stats, grid_stats)

    outfile.close()
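
# Usage sketch: accumulate the statistics for SST feedback files on the
# default 2-degree grid using 4 processors (hypothetical file names):
if __name__ == "__main__":
    fdbk_files = ["fdbk_20210101.nc", "fdbk_20210102.nc"]  # hypothetical
    HL_cov_accum_stats(fdbk_files, obs_type="SST",
                       outfilename="accum_stats.nc", nproc=4)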
# Limit the numerical libraries to a single thread each; the multiprocessing
# Pool provides the parallelism for the error covariance calculation.
import os
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
import numpy as np
from multiprocessing import Pool
################## Code modules ##############################
import arrays
from io_data import IO
from utils import Utils
from errorCovs import HLerrorCovs
from masks import applyMask
from posproc import Posproc   # assumed module path; needed by HL_fitting_function
from plots import Plots       # assumed module path; needed when plotting is requested

# Initialising the classes
IO = IO()
Utils = Utils()
HLerrorCovs = HLerrorCovs()
applyMask = applyMask()
Posproc = Posproc()
Plots = Plots()

def HL_fitting_function(infile,
                        outfilename,
                        func_name="MultiGauss",
                        num_funcs=2,
                        lenscale=(400, 40),
                        plot=None,
                        outfig='./figures',
                        nproc=4,
                        min_num_obs=2,
                        max_iter=100):
    """ Top-level routine that fits a specific function to HL stats covariance file

    ***************** PARAMETERS *******************
    1. infile: name of file containing the HL error covariances
    2. outfilename: name of file to write the results to
    3. func_name: name of function to fit to (options: MultiGauss and MultiGauss_Fixed)
    4. num_funcs: number of functions to use (default: 2)
    5. lenscale: Tuple of pre-defined lengthscales used in MultiGauss_Fixed
                 or in MultiGauss as initial guesses for the lengthscales.
                 Number of tuple members must be equal to number of functions.
    6. plot: positions of (x,y) pairs to plot or None (default: None)
    7. outfig: path to save the figs
    8. nproc: number of processors to use (default: 4)
    9. min_num_obs: minimum number of observations to do calculations
    10. max_iter: max number of iterations
    """

    # Checking consistency of input parameters
    if (func_name != "MultiGauss" and func_name != "MultiGauss_Fixed"):
        raise ValueError("[ERROR] FUNCTION NOT AVAILABLE")

    if (len(lenscale) != num_funcs):
        raise ValueError("[ERROR] NUMBER OF LENGTHSCALES NOT COMPATIBLE " +
                         "WITH NUMBER OF FUNCTIONS")

    if (not os.path.exists(infile)):
        raise ValueError("[ERROR] INPUT FILE NOT FOUND")

    # Read dimension variables from netcdf file
    lats, lons, depth, bins = IO.ncread_dimension_variables(infile)

    # Create netcdf object and add dimensions
    outfile = IO.nc_define_dimensions(outfilename, len(lats), len(lons),
                                      len(bins), len(depth))

    # Write dimension variables (depth, lat, lon and bins)
    IO.ncwrite_dimension_variables(outfile, lats, lons, depth, bins)

    # Add attributes
    outfile.Function = "Function fitting done using the " + func_name + " function"

    # Add variables
    IO.nc_define_vars(outfile, "Chi_sq", 'f',
                      ("depth", "latitude", "longitude"))
    IO.nc_define_vars(outfile, "obs_err", 'f',
                      ("depth", "latitude", "longitude"))

    # Calculate x positions based on the separation distances
    x_val = Posproc.calc_x_positions(bins)

    for lev in range(0, len(depth)):
        print("MESSAGE: Fitting function " + func_name +
              " to ErrorCov data: " + str(depth[lev]) + " m")

        # set up workers
        workers = Pool(nproc)

        # Reading error covariance variables
        var, cors, numobsvar = IO.ncread_errorcovs(infile, lev)

        # Creating list with arguments to run in parallel
        arg_lists = Posproc.create_arg_list(x_val, cors, var, numobsvar,
                                            min_num_obs, func_name, num_funcs,
                                            lenscale, max_iter)

        # Get workers to do parallel calculations
        results = workers.map(Posproc.fitter, arg_lists)
        workers.close()

        # Unravel results into output grids
        params, obs_err, chi_grid = Posproc.results_to_grid(
            results, len(lats), len(lons))

        # Plot some results if requested
        if plot is not None:
            print("MESSAGE: Plotting results - data versus fitting: " +
                  str(depth[lev]) + " m")
            Plots.plot_data_vs_fitting(outfig, plot, x_val, cors, var, obs_err,
                                       lats, lons, depth[lev], params,
                                       func_name, num_funcs, lenscale)

        print("MESSAGE: Writing data to netcdf file: " + str(depth[lev]) +
              " m")
        if lev == 0:
            for param in range(0, len(params)):
                # Define netcdf variables from fitting function results
                IO.nc_define_vars(outfile,
                                  arg_lists[0]["func"].param_names()[param],
                                  'f', ("depth", "latitude", "longitude"))

        # Add variables to netcdf
        IO.ncwrite_output(outfile, arg_lists[0]["func"], chi_grid, obs_err,
                          params, lev)

    outfile.close()
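
# Usage sketch: fit a two-component MultiGauss function to the error
# covariances written by HL_error_covs, with initial lengthscale guesses of
# 400 and 40 for the two components (hypothetical file names):
if __name__ == "__main__":
    HL_fitting_function("hl_error_covs.nc", "fitted_covs.nc",
                        func_name="MultiGauss", num_funcs=2,
                        lenscale=(400, 40), nproc=4)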