def do_conversion(start_year=defaults.START_YEAR, end_year=defaults.END_YEAR, period="all", doBC=False, doQC=True):
    """
    Convert 1x1 dailies to 1x1 pentads

    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to process - day, night or all
    :param bool doBC: work on the bias corrected data
    :param bool doQC: incorporate the QC flags or not

    :returns:
    """
    settings = set_paths_and_vars.set(doBC=doBC, doQC=doQC)

    OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier=False)

    for year in np.arange(start_year, end_year + 1):

        # set up empty data array: variables x days-in-year x lat x lon
        all_dailies = np.ma.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.mask = np.zeros([len(OBS_ORDER), utils.days_in_year(year), len(grid_lats), len(grid_lons)])
        all_dailies.fill_value = settings.mdi

        all_n_obs = np.zeros([utils.days_in_year(year), len(grid_lats), len(grid_lons)])

        year_start = dt.datetime(year, 1, 1, 0, 0)

        for month in np.arange(1, 13):
            print year, month

            month_start = utils.day_of_year(year, month)
            month_end = month_start + calendar.monthrange(year, month)[1]

            filename = "{}/{}_1x1_daily_{}{:02d}_{}.nc".format(
                settings.DATA_LOCATION, settings.OUTROOT, year, month, period
            )

            ncdf_file = ncdf.Dataset(filename, "r", format="NETCDF4")

            for v, var in enumerate(OBS_ORDER):

                if month == 12:
                    # run to end of year if december
                    all_dailies[v, month_start:, :, :] = ncdf_file.variables[var.name][:]
                else:
                    all_dailies[v, month_start:month_end, :, :] = ncdf_file.variables[var.name][:]

            # now get number of observations
            if month == 12:
                all_n_obs[month_start:, :, :] = ncdf_file.variables["n_obs"][:]
            else:
                all_n_obs[month_start:month_end, :, :] = ncdf_file.variables["n_obs"][:]

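        # leap-year handling: copy out the 6-day block spanning Feb 29th, then delete
        # Feb 29th so the year divides cleanly into 73 five-day pentads; the copied
        # block later overwrites pentad index 11 with a 6-day me(di)an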
        if calendar.isleap(year):
            assert all_dailies.shape[1] == 366

            # extract 6-day pentad
            incl_feb29th = all_dailies[:, 55:61, :, :]

            # remove the data of Feb 29th from array
            # np.ma.delete doesn't exist, so have to copy mask separately
            mask = all_dailies.mask
            all_dailies = np.delete(all_dailies, 59, 1)
            mask = np.delete(mask, 59, 1)
            all_dailies = np.ma.array(all_dailies, mask=mask)
            del mask

            # number of observations
            incl_feb29th_n_obs = all_n_obs[55:61, :, :]
            all_n_obs = np.delete(all_n_obs, 59, 0)

        else:
            assert all_dailies.shape[1] == 365

        shape = all_dailies.shape
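        # reshape to (variables, 73 pentads, 5 days, lat, lon) so the 5-day axis can be collapsed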
        all_dailies = all_dailies.reshape(shape[0], -1, 5, shape[-2], shape[-1])

        n_days_per_pentad = np.ma.count(all_dailies, axis=2)

        if settings.doMedian:
            pentad_grid = utils.bn_median(all_dailies, axis=2)
        else:
            pentad_grid = np.ma.mean(all_dailies, axis=2)

        # clear up memory
        del all_dailies
        gc.collect()

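        # reshape the daily obs counts the same way and total them over each 5-day block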
        all_n_obs = all_n_obs.reshape(-1, 5, shape[-2], shape[-1])
        all_n_obs = np.sum(all_n_obs, axis=1)

        # mask where fewer than N_OBS days have values
        # KW THIS IS ACTUALLY 2 - WHICH I THINK IS GOOD
        pentad_grid.mask[n_days_per_pentad < N_OBS] = True

        # the pentad containing Feb 29th is the 12th in the year, i.e. index 11
        if calendar.isleap(year):
            #  overwrite this with the me(di)an of a 6-day pentad
            if settings.doMedian:
                pentad_grid[:, 11, :, :] = utils.bn_median(incl_feb29th, axis=1)
            else:
                pentad_grid[:, 11, :, :] = np.ma.mean(incl_feb29th, axis=1)

            feb_n_days_per_pentad = np.ma.count(incl_feb29th, axis=1)
            pentad_grid.mask[:, 11, :, :][feb_n_days_per_pentad < N_OBS] = True
            n_days_per_pentad[:, 11, :, :] = feb_n_days_per_pentad

            all_n_obs[11, :, :] = np.sum(incl_feb29th_n_obs, axis=0)

            print "processed Feb 29th"

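        # one time value per pentad, in hours since 1 Jan of this year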
        times = utils.TimeVar("time", "time since 1/1/{} in hours".format(year), "hours", "time")
        times.data = np.arange(0, pentad_grid.shape[1]) * 5 * 24

        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_{}_{}.nc".format(year, period)

        utils.netcdf_write(
            out_filename,
            pentad_grid,
            n_days_per_pentad[0],
            all_n_obs,
            OBS_ORDER,
            grid_lats,
            grid_lons,
            times,
            frequency="P",
        )

        del pentad_grid
        del all_n_obs
        del n_days_per_pentad
        gc.collect()

    return  # do_conversion


def do_merge(fileroot, mdi, suffix = "relax", clims = False, doMedian = False):
    '''
    Merge the _day and _night files

    Do a np.ma.mean or median for the data and a sum for the n_obs and n_grids

    Output with a _both suffix

    :param str fileroot: root for filenames
    :param flt mdi: missing data indicator
    :param str suffix: "relax" or "strict" criteria
    :param bool clims: if processing climatologies, don't try to process anomalies
    :param bool doMedian: use the median rather than the mean when merging
    '''
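
    # Merge sketch (annotation, not original code): day and night fields are read into
    # all_data with shape (2 periods, n variables + 2, time, lat, lon); the first
    # len(OBS_ORDER) slices are averaged (or medianed) over the period axis and the
    # last two (n_grids, n_obs) are summed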

    OBS_ORDER = utils.make_MetVars(mdi, multiplier = False)

    if clims:
        # KW make OBS_ORDER only the actual variables - remove anomalies
        NEWOBS_ORDER = []
        for v, var in enumerate(OBS_ORDER):
            if "anomalies" not in var.name:
                NEWOBS_ORDER.append(var)
        del OBS_ORDER
        OBS_ORDER = list(NEWOBS_ORDER)
        del NEWOBS_ORDER     


    # spin through both periods
    for p, period in enumerate(["day", "night"]):
        print period
        
        # go through the variables
        for v, var in enumerate(OBS_ORDER):

            print "   {}".format(var.name)

            ncdf_file = ncdf.Dataset("{}_{}_{}.nc".format(fileroot, period, suffix),'r', format='NETCDF4')

            if v == 0 and p == 0:

                shape = list(ncdf_file.variables[var.name][:].shape)
                shape.insert(0, len(OBS_ORDER)+2) # add all the variables plus n_grids and n_obs
                shape.insert(0, 2) # insert extra dimension to allow day + night

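                # all_data: (period, variable, time, lat, lon), with the last two
                # variable slots reserved for n_grids and n_obs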
                all_data = np.ma.zeros(shape)

                all_data[p, v] = ncdf_file.variables[var.name][:]

                # get lats/lons of box centres
                lat_centres = ncdf_file.variables["latitude"][:]
# KATE modified - this results in lats that go from 92.5 to -82.5 so I've switched the + for a -
                latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2.
                #latitudes = lat_centres + (lat_centres[1] - lat_centres[0])/2.
# end
                lon_centres = ncdf_file.variables["longitude"][:]
                longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.

                # get times - make a dummy object and then populate attributes
                times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(1, 1973), "hours", "time")

                times.long_name = ncdf_file.variables["time"].long_name
                times.standard_name = ncdf_file.variables["time"].standard_name
                times.units = ncdf_file.variables["time"].units

                times.data = ncdf_file.variables["time"][:]

            else:
                all_data[p, v] = ncdf_file.variables[var.name][:]

        # and get n_obs and n_grids
        all_data[p, -2] = ncdf_file.variables["n_grids"][:]
        all_data[p, -1] = ncdf_file.variables["n_obs"][:]

    # invert the latitudes and the matching data axis so they stay consistent
    latitudes = latitudes[::-1]
    all_data = all_data[:,:,:,::-1,:]

    # got all the info, now merge
    if doMedian:
        merged_data = utils.bn_median(all_data[:, :len(OBS_ORDER)], axis = 0)
    else:
        merged_data = np.ma.mean(all_data[:, :len(OBS_ORDER)], axis = 0)

    # and process the grids and observations (done here so the latitude inversion above is included)
    n_grids = np.ma.sum(all_data[:, -2], axis = 0)
    n_obs = np.ma.sum(all_data[:, -1], axis = 0)
    n_obs.fill_value = -1
    n_grids.fill_value = -1

    # write the output file
    utils.netcdf_write("{}_{}_{}.nc".format(fileroot, "both", suffix), merged_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "P")

    # test distribution of obs with grid boxes
    outfile = open("{}_{}_{}.txt".format(fileroot.split("/")[-1], "both", suffix), "w")
    utils.boxes_with_n_obs(outfile, n_obs, merged_data[0], "")


    return # do_merge


def calculate_climatology(suffix = "relax", start_year = 1981, end_year = 2010, period = "both", do3hr = False, doQC = False, doBC = False):
    '''
    Make 1x1 pentad climatology

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to process - day, night or both
    :param bool do3hr: run on 3hr --> pentad data
    :param bool doQC: incorporate the QC flags or not
    :param bool doBC: work on the bias corrected data

    :returns:
    '''
    settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)

    # OBS_ORDER is used throughout; define it as in do_conversion
    # (an assumption - it may already exist as a module-level global)
    OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier = False)

    if suffix == "relax":
        N_YEARS_PRESENT = 10 # number of years present to calculate climatology
    elif suffix == "strict":
        N_YEARS_PRESENT = 15 # number of years present to calculate climatology
    else:
        raise ValueError("suffix must be 'relax' or 'strict'")

    print "Do 3hrly: {}".format(do3hr)

    N_YEARS = end_year - start_year + 1

    # read in each variable separately to limit memory use

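    # climatology and stdev arrays: variables x 73 pentads x lat x lon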
    all_clims = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    # KW - why set up as np.ones?
    all_clims.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])

    all_stds = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    all_stds.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
    
    # KW no mask??? I've set one with fill_value as -1 - should the mask be .zeros or .ones though?
    all_n_obs = np.ma.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
    all_n_obs.mask = np.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
    all_n_obs.fill_value = -1
    
    for v, var in enumerate(OBS_ORDER):

        print var.name

        # number of pentads = 365/5 = 73
        # set up empty data array
        all_pentads = np.ma.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
        # sets up a mask of 'False' = not masked!
        all_pentads.mask = np.zeros([N_YEARS, 73, len(grid_lats), len(grid_lons)])
        all_pentads.fill_value = settings.mdi

        # read in relevant years
        for y, year in enumerate(np.arange(start_year, end_year + 1)): 

            print year

            if do3hr:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)
 
            else:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)

            ncdf_file = ncdf.Dataset(filename,'r', format='NETCDF4')

            all_pentads[y, :, :, :] = ncdf_file.variables[var.name][:]

            if v == 0:
                all_n_obs[y, :, :, :] = ncdf_file.variables["n_obs"][:]

        # all_pentads is years x pentads x lats x lons; count how many years have data at each point
        n_grids = np.ma.count(all_pentads, axis = 0)

        # collapse down the years
        if settings.doMedian:
            all_clims[v, :, :, :] = utils.bn_median(all_pentads, axis = 0)
        else:
            all_clims[v, :, :, :] = np.ma.mean(all_pentads, axis = 0)

        all_stds[v, :, :, :] = np.ma.std(all_pentads, axis = 0)

        # mask where fewer than N_YEARS_PRESENT years have data
        locs = np.ma.where(n_grids < N_YEARS_PRESENT)
        all_clims[v, :, :, :].mask[locs] = True
        # KW should probably mask stdev too - although unmasked it does show the potential coverage
        all_stds[v, :, :, :].mask[locs] = True

        if settings.plots and v == 0:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.hist(n_grids.reshape(-1), bins = np.arange(-1,32), align = "left", log = True, rwidth=0.5)
            plt.axvline(x = N_YEARS_PRESENT-0.5, color = "r")       
            plt.title("Number of years present in each pentad")
            plt.xlabel("Number of years (max = 30)")
            plt.ylabel("Frequency (log scale)")
            plt.savefig(settings.PLOT_LOCATION + "pentad_clims_n_years_{}_{}_{}.png".format(year, period, suffix))

            
    # now process number of observations (KW: all_n_obs wasn't a masked array - so have set it up as one -
    # BUT not really convinced this is working as it should. No import numpy.ma?)
    all_obs = np.ma.sum(all_n_obs, axis = 0)

    # set up time array
    times = utils.TimeVar("time", "time since 1/1/{} in days".format(1), "days", "time")
    times.data = np.arange(0, 73) * 5

    # write files
    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")

    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")

    # test distribution of obs with grid boxes
    if do3hr:
        outfile = open(settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.txt".format(period, suffix), "w")
    else:
        outfile = open(settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.txt".format(period, suffix), "w")

    utils.boxes_with_n_obs(outfile, all_obs, all_clims[0], N_YEARS_PRESENT)

    return # calculate_climatology


def do_conversion(suffix = "relax", start_year = defaults.START_YEAR, end_year = defaults.END_YEAR, period = "all", doQC = False, doBC = False):
    '''
    Convert 3-hourlies to 1x1 pentads

    First get the pentad average of the 3-hrly values (values at 0, 3, 6, ... each averaged over the 5 days),
    then average those across the pentad.

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to process - day, night or all
    :param bool doQC: incorporate the QC flags or not
    :param bool doBC: work on the bias corrected data

    :returns:
    '''
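
    # Shape walk-through for one variable (annotation, not in the original code):
    #   (365*8, lat, lon) 3-hrly fields -> (365, 8, lat, lon) days x timestamps
    #   -> (73, 5, 8, lat, lon) pentads x days x timestamps
    #   -> mean/median over the 5 days -> (73, 8, lat, lon)
    #   -> mean/median over the 8 timestamps -> (73, lat, lon)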
    settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)

    # OBS_ORDER is used throughout; define it as in do_conversion
    # (an assumption - it may already exist as a module-level global)
    OBS_ORDER = utils.make_MetVars(settings.mdi, multiplier = False)

    # KW Added SUFFIX variable because all hourlies/dailies/monthlies now have suffix 'strict' (4/2 per daily/day-night) 
    # or 'relax' (2/1 per daily/day-night)
    if suffix == "relax":
        N_OBS_OVER_DAYS = 1 # at least 1 obs at this 3 hr timestamp from 5 days in pentad
        N_OBS_OVER_PENTAD = 2

    elif suffix == "strict":
        N_OBS_OVER_DAYS = 2
        N_OBS_OVER_PENTAD = 4  # at least 4 timestamps (of 8) in pentad, could be 2 for local 'relax' setting


    N_OBS_PER_DAY = 24/DELTA_HOUR
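    # with 3-hourly fields (DELTA_HOUR = 3, a module-level constant assumed elsewhere) this gives 8 timestamps per day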

    for year in np.arange(start_year, end_year + 1): 

        all_pentads = np.ma.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])
        all_pentads.mask = np.zeros([len(OBS_ORDER), 73, len(grid_lats), len(grid_lons)])

        # read in a year's worth of 3hrly data
        for v, var in enumerate(OBS_ORDER):
            # arrays too massive to process all variables at once.
            print var.name
       
            var_3hrlys = read_data(settings, suffix, var.name, year, grid_lats, grid_lons, period, N_OBS_PER_DAY)

            # reshape to days x 3hrly obs (365(366),8,180,360)
            var_3hrlys = var_3hrlys.reshape(-1, N_OBS_PER_DAY, var_3hrlys.shape[1], var_3hrlys.shape[2])

            # process the leap-year if appropriate
            if calendar.isleap(year):
                var_3hrlys, incl_feb29th = process_february(var_3hrlys, doMask = True)
            else:
                assert var_3hrlys.shape[0] == 365

            # get pentadly values for each timestep (73,5,8,180,360)
            shape = var_3hrlys.shape
            var_3hrlys = var_3hrlys.reshape(-1, 5, shape[-3], shape[-2], shape[-1]) # n_pentads x days x hrs x lat x lon

            n_days_per_timestamp = np.ma.count(var_3hrlys, axis = 1) # n_pentads x hrs x lat x lon

            # get average at each timestamp across the pentad - so have N_OBS_PER_DAY averaged values per pentad
            if settings.doMedian:
                pentad_3hrly_grid = utils.bn_median(var_3hrlys, axis = 1) # n_pentads x hrs x lat x lon
            else:
                pentad_3hrly_grid = np.ma.mean(var_3hrlys, axis = 1) # n_pentads x hrs x lat x lon

            pentad_3hrly_grid.mask[n_days_per_timestamp < N_OBS_OVER_DAYS] = True # mask where fewer than N_OBS_OVER_DAYS days have values
            
            # clear up memory
            del var_3hrlys
            gc.collect()

            # the pentad containing Feb 29th is the 12th in the year, i.e. index 11 in the array (as KW notes)
            if calendar.isleap(year):
                #  overwrite this with the me(di)an of a 6-day pentad
                if settings.doMedian:
                    pentad_3hrly_grid[11, :, :, :] = utils.bn_median(incl_feb29th, axis = 0)
                else:
                    pentad_3hrly_grid[11, :, :, :] = np.ma.mean(incl_feb29th, axis = 0)

                feb_n_days_per_timestamp = np.ma.count(incl_feb29th, axis = 0)
                pentad_3hrly_grid.mask[11, :, :, :][feb_n_days_per_timestamp < N_OBS_OVER_DAYS] = True
                n_days_per_timestamp[11, :, :, :] = feb_n_days_per_timestamp

                print "processed Feb 29th"

            if settings.plots and v == 0:
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(n_days_per_timestamp.reshape(-1), bins = np.arange(-1,7), align = "left", log = True, rwidth=0.5)
                plt.axvline(x = N_OBS_OVER_DAYS-0.5, color = "r")       
                plt.title("Number of days with obs at each 3hrly timestamp (over entire year)")
                plt.xlabel("Number of days (max = 5)")
                plt.ylabel("Frequency (log scale)")
                plt.savefig(settings.PLOT_LOCATION + "pentads_n_days_{}_{}_{}.png".format(year, period, suffix))

            # get single pentad values
            n_hrs_per_pentad = np.ma.count(pentad_3hrly_grid, axis = 1) # get the number of pentad-hours present in each pentad
            n_grids_per_pentad = np.sum(n_days_per_timestamp, axis = 1) # get the number of 3hrly 1x1 grids included per pentad 1x1

            # now average across the N_OBS_PER_DAY timestamps to give a single value per pentad
            if settings.doMedian:
                pentad_grid = utils.bn_median(pentad_3hrly_grid, axis = 1)
            else:
                pentad_grid = np.ma.mean(pentad_3hrly_grid, axis = 1)

            if period == "all":
# KW are you sure this should be n_hrs_per_pentad and not n_grids_per_pentad here? I think it should
                pentad_grid.mask[n_hrs_per_pentad < N_OBS_OVER_PENTAD] = True # mask where fewer than N_OBS_OVER_PENTAD hours have values
            else:
# KW are you sure this should be n_hrs_per_pentad and not n_grids_per_pentad here? I think it should
                pentad_grid.mask[n_hrs_per_pentad < (N_OBS_OVER_PENTAD/2.)] = True # mask where fewer than N_OBS_OVER_PENTAD hours have values
            
            all_pentads[v, :, :, :] = pentad_grid

            # diagnostics plots of obs/grids per pentad
            if settings.plots and v == 0:
                plt.clf()
                plt.hist(n_hrs_per_pentad.reshape(-1), bins = np.arange(-1,10), align = "left", log = True, rwidth=0.5)
                if period == "all":
                    plt.axvline(x = N_OBS_OVER_PENTAD-0.5, color = "r")       
                else:
                    plt.axvline(x = (N_OBS_OVER_PENTAD/2.)-0.5, color = "r")       
                plt.title("Number of hrs with obs in each pentad (over entire year)")
                plt.xlabel("Number of days (max = 8)")
                plt.ylabel("Frequency (log scale)")
                plt.savefig(settings.PLOT_LOCATION + "pentads_n_hrs_{}_{}_{}.png".format(year, period, suffix))

            # clear up memory
            del pentad_3hrly_grid
            del pentad_grid
            gc.collect()

        # done all main variables.  Now for number of observations
        print "n_obs"
        n_obs = read_data(settings, suffix, "n_obs", year, grid_lats, grid_lons, period, N_OBS_PER_DAY)
        # KW so we've gone from (8*365 hrs, lats, lons) to (365, 8, lats, lons)
        n_obs = n_obs.reshape(-1, N_OBS_PER_DAY, n_obs.shape[1], n_obs.shape[2])
        if calendar.isleap(year):
            n_obs, incl_feb29th = process_february(n_obs, doMask = True)
        else:
            assert n_obs.shape[0] == 365    

        shape = n_obs.shape
        # KW so we're now at pentads, 5 days, 8 hours, lats, lons
        n_obs = n_obs.reshape(-1, 5, shape[-3], shape[-2], shape[-1]) # pentads x days x hours x lat x lon

        # KW This should sum over the 5 days leaving pentads, 8 hrs, lats, lons
        # n_obs has -1 as missing data!!! So sum will not work properly
        # set up fill_value as -1
        n_obs.fill_value = -1
        n_obs_per_3hrly_pentad = np.ma.sum(n_obs, axis = 1)
        n_obs_per_3hrly_pentad.fill_value = -1

        if calendar.isleap(year):
            n_obs_per_3hrly_pentad[11, :, :, :] = np.ma.sum(incl_feb29th, axis = 0)

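        # finally collapse the 8 timestamps to a single n_obs total per pentad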
        n_obs_per_pentad = np.ma.sum(n_obs_per_3hrly_pentad, axis = 1)

        # and write out
        times = utils.TimeVar("time", "time since 1/1/{} in hours".format(year), "hours", "time")
        times.data = np.arange(0, all_pentads.shape[1]) * 5 * 24

        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(year, period, suffix)
        
        utils.netcdf_write(out_filename, all_pentads, n_grids_per_pentad, n_obs_per_pentad, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")


    return # do_conversion