def calculate_climatology(suffix = "relax", start_year = 1981, end_year = 2010, period = "both", do3hr = False, doQC = False, doBC = False):
    '''
    Make 1x1 pentad climatology

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/both?
    :param bool do3hr: run on 3hr --> pentad data
    :param bool doQC: incorporate the QC flags or not
    :param bool doBC: work on the bias corrected data

    :returns:
    '''
    settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)

    if suffix == "relax":
        N_YEARS_PRESENT = 10 # number of years present to calculate climatology
    elif suffix == "strict":
        N_YEARS_PRESENT = 15 # number of years present to calculate climatology


    print "Do 3hrly: {}".format(do3hr)

    N_YEARS = end_year - start_year + 1

    # read in each variable - memory issues

    all_clims = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    # KW - why set up as np.ones?
    all_clims.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])

    all_stds = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    all_stds.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    
    # KW no mask??? I've set one with fill_value as -1 - should the mask be .zeros or .ones though?
    all_n_obs = np.ma.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
    all_n_obs.mask = np.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
    all_n_obs.fill_value = -1
    
    for v, var in enumerate(OBS_ORDER):
	    
        print var.name

        # number of pentads = 365/5 = 73
        # set up empty data array
        all_pentads = np.ma.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
	# sets up a mask of 'False' = not masked!
        all_pentads.mask = np.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
        all_pentads.fill_value = settings.mdi

        # read in relevant years
        for y, year in enumerate(np.arange(start_year, end_year + 1)): 

            print year

            if do3hr:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)
 
            else:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)

            ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4')

            all_pentads[y, :, :, :] = ncdf_file.variables[var.name][:]

            if v == 0:
                all_n_obs[y, :, :, :] = ncdf_file.variables["n_obs"][:]

        # years x pentads x lats x lons
        n_grids = np.ma.count(all_pentads, axis = 0)

        # collapse down the years
        if settings.doMedian:
            all_clims[v, :, :, :] = utils.bn_median(all_pentads, axis = 0)
        else:
            all_clims[v, :, :, :] = np.ma.mean(all_pentads, axis = 0)

        all_stds[v, :, :, :] = np.ma.std(all_pentads, axis = 0)

        # mask where fewer than 50% of years have data
        locs = np.ma.where(n_grids < N_YEARS_PRESENT)
        all_clims[v, :, :, :].mask[locs] = True
        # KW should probably mask stdev too - although unmasked it does show the potential coverage
        all_stds[v, :, :, :].mask[locs] = True

        if settings.plots and v == 0:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.hist(n_grids.reshape(-1), bins = np.arange(-1,32), align = "left", log = True, rwidth=0.5)
            plt.axvline(x = N_YEARS_PRESENT-0.5, color = "r")       
            plt.title("Number of years present in each pentad")
            plt.xlabel("Number of years (max = 30)")
            plt.ylabel("Frequency (log scale)")
            plt.savefig(settings.PLOT_LOCATION + "pentad_clims_n_years_{}_{}_{}.png".format(year, period, suffix))

            
    # now process number of observations (KW all_n_obs wasn't a masked array - so have set it up as one - BUT not really convinced this 
    # is working as it should. No import numpy.ma?        
    all_obs = np.ma.sum(all_n_obs, axis = 0)

    # set up time array
    times = utils.TimeVar("time", "time since 1/1/{} in days".format(1), "days", "time")
    times.data = np.arange(0, 73) * 5

    # write files
    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")

    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
       out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")

    # test distribution of obs with grid boxes
    if do3hr:
        outfile = file(settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.txt".format(period, suffix), "w")
    else:
        outfile = file(settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.txt".format(period, suffix), "w")

    utils.boxes_with_n_obs(outfile, all_obs, all_clims[0], N_YEARS_PRESENT)

    return # calculate_climatology
def calculate_climatology(suffix = "relax", start_year = 1981, end_year = 2010, period = "both", daily = False, 
                          doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, 
			  doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False):
#def calculate_climatology(suffix = "relax", start_year = 1981, end_year = 2010, period = "both", daily = False, doQC = False, doBC = False):
# end
    '''
    Make 5x5 monthly climatology

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/both?
    :param bool daily: run in 1x1 daily --> 5x5 monthly data
    :param bool doQC: incorporate the QC flags or not
# KATE modified
    :param bool doQC1it: incorporate the 1st iteration QC flags or not
    :param bool doQC2it: incorporate the 2nd iteration QC flags or not
    :param bool doQC3it: incorporate the 3rd iteration QC flags or not
# end
    :param bool doBC: work on the bias corrected data
# KATE modified
    :param bool doBCtotal: work on the bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data
# end

    :returns:
    '''
# KATE modified
    settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it)
    #settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)
# end

    if suffix == "relax":
        N_YEARS_PRESENT = 10 # number of years present to calculate climatology
    elif suffix == "strict":
        N_YEARS_PRESENT = 15 # number of years present to calculate climatology


    print "Do daily: {}".format(daily)

    N_YEARS = end_year - start_year + 1

    # read in each variable - memory issues

    all_clims = np.ma.zeros([len(OBS_ORDER), 12, len(grid_lats), len(grid_lons)])
    # KW - why set up as np.ones?
    all_clims.mask = np.zeros([len(OBS_ORDER), 12, len(grid_lats), len(grid_lons)])

    all_stds = np.ma.zeros([len(OBS_ORDER), 12, len(grid_lats), len(grid_lons)])
    all_stds.mask = np.zeros([len(OBS_ORDER), 12, len(grid_lats), len(grid_lons)])
    
    # KW no mask??? I've set one with fill_value as -1 - should the mask be .zeros or .ones though?
    all_n_obs = np.ma.zeros([N_YEARS * 12, len(grid_lats), len(grid_lons)])
    all_n_obs.mask = np.zeros([N_YEARS * 12, len(grid_lats), len(grid_lons)])
    all_n_obs.fill_value = -1
    
    if daily:
        filename = settings.DATA_LOCATION + "{}_5x5_monthly_from_daily_{}_{}.nc".format(settings.OUTROOT, period, suffix)
            
    else:
        filename = settings.DATA_LOCATION + "{}_5x5_monthly_{}_{}.nc".format(settings.OUTROOT, period, suffix)

    ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4')

    times = ncdf_file.variables["time"]
    data_start = int(times.long_name.split(" ")[2].split("/")[-1])
    clim_offset = (start_year - data_start) * 12
        
    for v, var in enumerate(OBS_ORDER):
	    
        print var.name

        # number of pentads = 365/5 = 73
        # set up empty data array
        all_months = np.ma.zeros([N_YEARS * 12, len(grid_lats), len(grid_lons)])
	# sets up a mask of 'False' = not masked!
        all_months.mask = np.zeros([N_YEARS * 12, len(grid_lats), len(grid_lons)])
        all_months.fill_value = settings.mdi

        all_months[:, :, :] = ncdf_file.variables[var.name][clim_offset:clim_offset + (30*12)]

        # months x lats x lons
        shape = all_months.shape
        all_months = all_months.reshape(-1, 12, shape[-2], shape[-1])

        n_grids = np.ma.count(all_months, axis = 0)

        # collapse down the years
# KATE MEDIAN WATCH
# KATE modified - forced to use MEAN
        all_clims[v, :, :, :] = np.ma.mean(all_months, axis = 0)
        #if settings.doMedian:
        #    all_clims[v, :, :, :] = utils.bn_median(all_months, axis = 0)
        #else:
        #    all_clims[v, :, :, :] = np.ma.mean(all_months, axis = 0)
# end
        all_stds[v, :, :, :] = np.ma.std(all_months, axis = 0)

        # mask where fewer than 50% of years have data
        locs = np.ma.where(n_grids < N_YEARS_PRESENT)
        all_clims[v, :, :, :].mask[locs] = True
        # KW should probably mask stdev too - although unmasked it does show the potential coverage
        all_stds[v, :, :, :].mask[locs] = True

        if settings.plots and v == 0:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.hist(n_grids.reshape(-1), bins = np.arange(-1,32), align = "left", log = True, rwidth=0.5)
            plt.axvline(x = N_YEARS_PRESENT-0.5, color = "r")       
            plt.title("Number of years present in each pentad")
            plt.xlabel("Number of years (max = 30)")
            plt.ylabel("Frequency (log scale)")
            plt.savefig(settings.PLOT_LOCATION + "monthly_5x5_clims_n_years_{}_{}.png".format(period, suffix))

            
    # now process number of observations (KW all_n_obs wasn't a masked array - so have set it up as one - BUT not really convinced this 
    # is working as it should. No import numpy.ma?        
    all_n_obs[:, :, :] = ncdf_file.variables["n_obs"][clim_offset:clim_offset + (30*12)]
    all_n_obs = all_n_obs.reshape(-1, 12, shape[-2], shape[-1])
    all_obs = np.ma.sum(all_n_obs, axis = 0)

    # set up time array
    times = utils.TimeVar("time", "time since 1/1/{} in days".format(1), "days", "time")
    month_lengths = [calendar.monthrange(1, x + 1)[1] for x in range(12)]
    times.data = [sum(month_lengths[0:x]) for x in range(12)]

    # write files
    if daily:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_climatology_from_daily_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_climatology_{}_{}.nc".format(period, suffix)

# KATE modified - only outputting 90 to -90 now and have changed grid_lats above
    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
    #if period == "both":
    #    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats[::-1], grid_lons, times, frequency = "Y")
    #else:
    #    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
# end
    if daily:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_stdev_from_daily_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_stdev_{}_{}.nc".format(period, suffix)

# KATE modified - only outputting 90 to -90 now and have changed grid_lats above
    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
    #if period == "both":
    #    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats[::-1], grid_lons, times, frequency = "Y")
    #else:
    #    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")
# end
    # test distribution of obs with grid boxes
    if daily:
        outfile = file(settings.OUTROOT + "_5x5_monthly_climatology_from_daily_{}_{}.txt".format(period, suffix), "w")
    else:
        outfile = file(settings.OUTROOT + "_5x5_monthly_climatology_{}_{}.txt".format(period, suffix), "w")
        
    utils.boxes_with_n_obs(outfile, all_obs, all_clims[0], N_YEARS_PRESENT)

    return # calculate_climatology
def do_merge(fileroot, mdi, suffix = "relax", clims = False, doMedian = False):
    '''
    Merge the _day and _night files

    Do a np.ma.mean or median for the data and a sum for the n_obs and n_grids

    Output with a _both suffix

    :param str fileroot: root for filenames
    :param flt mdi: missing data indicator
    :param str suffix: "relax" or "strict" criteria
    :param bool clims: if climatologies then don't try and process anomalies.
    '''

    OBS_ORDER = utils.make_MetVars(mdi, multiplier = False)

    if clims:
        # KW make OBS_ORDER only the actual variables - remove anomalies
        NEWOBS_ORDER = []
        for v, var in enumerate(OBS_ORDER):
            if "anomalies" not in var.name:
                NEWOBS_ORDER.append(var)
        del OBS_ORDER
        OBS_ORDER = np.copy(NEWOBS_ORDER)
        del NEWOBS_ORDER     


    # spin through both periods
    for p, period in enumerate(["day", "night"]):
        print period
        
        # go through the variables
        for v, var in enumerate(OBS_ORDER):

            print "   {}".format(var.name)

            ncdf_file = ncdf.Dataset("{}_{}_{}.nc".format(fileroot, period, suffix),'r', format='NETCDF4')

            if v == 0 and p == 0:

                shape = list(ncdf_file.variables[var.name][:].shape)
                shape.insert(0, len(OBS_ORDER)+2) # add all the variables
                shape.insert(0, 2) # insert extra dimension to allow day + night

                all_data = np.ma.zeros(shape)

                all_data[p, v] = ncdf_file.variables[var.name][:]

                # get lats/lons of box centres
                lat_centres = ncdf_file.variables["latitude"]
# KATE modified - this results in lats that go from 92.5 to -82,5 so I've switched the + for a -
                latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2.
                #latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2.
# end
                lon_centres = ncdf_file.variables["longitude"]
                longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.

                # get times - make a dummy object and then populate attributes
                times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(1, 1973), "hours", "time")

                times.long_name = ncdf_file.variables["time"].long_name
                times.standard_name = ncdf_file.variables["time"].standard_name
                times.long_name = ncdf_file.variables["time"].long_name
                times.units = ncdf_file.variables["time"].units

                times.data = ncdf_file.variables["time"][:]

            else:
                all_data[p, v] = ncdf_file.variables[var.name][:]

        # and get n_obs and n_grids
        all_data[p, -2] = ncdf_file.variables["n_grids"][:]
        all_data[p, -1] = ncdf_file.variables["n_obs"][:]

    # invert latitudes
    latitudes = latitudes[::-1]
    all_data = all_data[:,:,:,::-1,:]

    # got all the info, now merge
    if doMedian:
        merged_data = utils.bn_median(all_data[:, :len(OBS_ORDER)], axis = 0)
    else:
        merged_data = np.ma.mean(all_data[:, :len(OBS_ORDER)], axis = 0)

    # and process the grids and observations (split off here so have incorporated latitude inversion)
    n_grids = np.ma.sum(all_data[:, -2], axis = 0)
    n_obs = np.ma.sum(all_data[:, -1], axis = 0)
    n_obs.fill_value = -1
    n_grids.fill_value = -1

    # write the output file
    utils.netcdf_write("{}_{}_{}.nc".format(fileroot, "both", suffix), merged_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "P")

    # test distribution of obs with grid boxes
    outfile = file("{}_{}_{}.txt".format(fileroot.split("/")[-1], "both", suffix), "w")
    utils.boxes_with_n_obs(outfile, n_obs, merged_data[0], "")


    return # do_merge