def calculate_climatology(suffix = "relax", start_year = 1981, end_year = 2010, period = "both", do3hr = False, doQC = False, doBC = False):
    '''
    Make 1x1 pentad climatology

    For every variable in OBS_ORDER the yearly pentad files are read, the
    years are collapsed into a climatology (median or mean, depending on
    settings.doMedian) and a standard deviation, and grid boxes with fewer
    than the required number of years are masked.  The climatology and
    standard deviation are written to netCDF files and a text summary of the
    observation distribution is written alongside.

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/both?
    :param bool do3hr: run on 3hr --> pentad data
    :param bool doQC: incorporate the QC flags or not
    :param bool doBC: work on the bias corrected data

    :returns: None
    :raises ValueError: if suffix is neither "relax" nor "strict"
    '''
    # number of years which must be present to calculate a climatology.
    # Checked first so that a bad suffix fails before any file is read.
    if suffix == "relax":
        N_YEARS_PRESENT = 10
    elif suffix == "strict":
        N_YEARS_PRESENT = 15
    else:
        raise ValueError("suffix must be 'relax' or 'strict', got {!r}".format(suffix))

    settings = set_paths_and_vars.set(doBC = doBC, doQC = doQC)

    print("Do 3hrly: {}".format(do3hr))

    N_YEARS = end_year - start_year + 1
    N_PENTADS = 73 # number of pentads = 365/5 = 73

    clim_shape = [len(OBS_ORDER), N_PENTADS, len(grid_lats), len(grid_lons)]
    year_shape = [N_YEARS, N_PENTADS, len(grid_lats), len(grid_lons)]

    # explicit all-False masks (= nothing masked) so that individual
    # elements can be masked in place later on
    all_clims = np.ma.zeros(clim_shape)
    all_clims.mask = np.zeros(clim_shape)

    all_stds = np.ma.zeros(clim_shape)
    all_stds.mask = np.zeros(clim_shape)

    all_n_obs = np.ma.zeros(year_shape)
    all_n_obs.mask = np.zeros(year_shape)
    all_n_obs.fill_value = -1

    # read in each variable separately - memory issues
    for v, var in enumerate(OBS_ORDER):

        print(var.name)

        # set up empty data array with a mask of 'False' = not masked
        all_pentads = np.ma.zeros(year_shape)
        all_pentads.mask = np.zeros(year_shape)
        all_pentads.fill_value = settings.mdi

        # read in relevant years
        for y, year in enumerate(np.arange(start_year, end_year + 1)):

            print(year)

            if do3hr:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_from_3hrly_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)
            else:
                filename = settings.DATA_LOCATION + "{}_1x1_pentad_{}_{}_{}.nc".format(settings.OUTROOT, year, period, suffix)

            ncdf_file = ncdf.Dataset(filename, 'r', format='NETCDF4')
            try:
                all_pentads[y, :, :, :] = ncdf_file.variables[var.name][:]

                # n_obs is read once, on the first pass through the years
                if v == 0:
                    all_n_obs[y, :, :, :] = ncdf_file.variables["n_obs"][:]
            finally:
                # one file is opened per variable per year - release each
                # handle as soon as its data are in memory
                ncdf_file.close()

        # years x pentads x lats x lons
        n_grids = np.ma.count(all_pentads, axis = 0)

        # collapse down the years
        if settings.doMedian:
            all_clims[v, :, :, :] = utils.bn_median(all_pentads, axis = 0)
        else:
            all_clims[v, :, :, :] = np.ma.mean(all_pentads, axis = 0)

        all_stds[v, :, :, :] = np.ma.std(all_pentads, axis = 0)

        # mask where fewer than N_YEARS_PRESENT years have data
        # (the stdev is masked too, so both outputs share the same coverage)
        locs = np.ma.where(n_grids < N_YEARS_PRESENT)
        all_clims[v, :, :, :].mask[locs] = True
        all_stds[v, :, :, :].mask[locs] = True

        if settings.plots and v == 0:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.hist(n_grids.reshape(-1), bins = np.arange(-1, N_YEARS + 2), align = "left", log = True, rwidth=0.5)
            plt.axvline(x = N_YEARS_PRESENT-0.5, color = "r")
            plt.title("Number of years present in each pentad")
            plt.xlabel("Number of years (max = {})".format(N_YEARS))
            plt.ylabel("Frequency (log scale)")
            # the plot name carries the last year read, i.e. end_year
            plt.savefig(settings.PLOT_LOCATION + "pentad_clims_n_years_{}_{}_{}.png".format(end_year, period, suffix))

    # now process number of observations - total over all years
    all_obs = np.ma.sum(all_n_obs, axis = 0)

    # set up time array - start day of each pentad
    times = utils.TimeVar("time", "time since 1/1/{} in days".format(1), "days", "time")
    times.data = np.arange(0, N_PENTADS) * 5

    # write files
    # NOTE: n_grids written below is the count from the last variable in OBS_ORDER
    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")

    if do3hr:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_from_3hrly_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_1x1_pentad_stdev_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "P")

    # test distribution of obs with grid boxes
    if do3hr:
        txt_filename = settings.OUTROOT + "_1x1_pentad_climatology_from_3hrly_{}_{}.txt".format(period, suffix)
    else:
        txt_filename = settings.OUTROOT + "_1x1_pentad_climatology_{}_{}.txt".format(period, suffix)

    # 'with' guarantees the report is flushed and closed even on error
    with open(txt_filename, "w") as outfile:
        utils.boxes_with_n_obs(outfile, all_obs, all_clims[0], N_YEARS_PRESENT)

    return # calculate_climatology
def calculate_climatology(suffix = "relax", start_year = 1981, end_year = 2010, period = "both", daily = False, doQC = False, doQC1it = False, doQC2it = False, doQC3it = False, doBC = False, doBCtotal = False, doBChgt = False, doBCscn = False):
    '''
    Make 5x5 monthly climatology

    Reads the single 5x5 monthly file, extracts the months belonging to the
    climatology period, collapses the years into a mean climatology and a
    standard deviation per calendar month, and masks grid boxes with fewer
    than the required number of years.  The climatology and standard
    deviation are written to netCDF files and a text summary of the
    observation distribution is written alongside.

    :param str suffix: "relax" or "strict" criteria
    :param int start_year: start year to process
    :param int end_year: end year to process
    :param str period: which period to do day/night/both?
    :param bool daily: run in 1x1 daily --> 5x5 monthly data
    :param bool doQC: incorporate the QC flags or not
    :param bool doQC1it: incorporate the 1st iteration QC flags or not
    :param bool doQC2it: incorporate the 2nd iteration QC flags or not
    :param bool doQC3it: incorporate the 3rd iteration QC flags or not
    :param bool doBC: work on the bias corrected data
    :param bool doBCtotal: work on the bias corrected data
    :param bool doBChgt: work on the height only bias corrected data
    :param bool doBCscn: work on the screen only bias corrected data

    :returns: None
    :raises ValueError: if suffix is neither "relax" nor "strict"
    '''
    # number of years which must be present to calculate a climatology.
    # Checked first so that a bad suffix fails before any file is read.
    if suffix == "relax":
        N_YEARS_PRESENT = 10
    elif suffix == "strict":
        N_YEARS_PRESENT = 15
    else:
        raise ValueError("suffix must be 'relax' or 'strict', got {!r}".format(suffix))

    settings = set_paths_and_vars.set(doBC = doBC, doBCtotal = doBCtotal, doBChgt = doBChgt, doBCscn = doBCscn, doQC = doQC, doQC1it = doQC1it, doQC2it = doQC2it, doQC3it = doQC3it)

    print("Do daily: {}".format(daily))

    N_YEARS = end_year - start_year + 1
    N_MONTHS = N_YEARS * 12 # months in the climatology period

    clim_shape = [len(OBS_ORDER), 12, len(grid_lats), len(grid_lons)]
    month_shape = [N_MONTHS, len(grid_lats), len(grid_lons)]

    # explicit all-False masks (= nothing masked) so that individual
    # elements can be masked in place later on
    all_clims = np.ma.zeros(clim_shape)
    all_clims.mask = np.zeros(clim_shape)

    all_stds = np.ma.zeros(clim_shape)
    all_stds.mask = np.zeros(clim_shape)

    all_n_obs = np.ma.zeros(month_shape)
    all_n_obs.mask = np.zeros(month_shape)
    all_n_obs.fill_value = -1

    if daily:
        filename = settings.DATA_LOCATION + "{}_5x5_monthly_from_daily_{}_{}.nc".format(settings.OUTROOT, period, suffix)
    else:
        filename = settings.DATA_LOCATION + "{}_5x5_monthly_{}_{}.nc".format(settings.OUTROOT, period, suffix)

    ncdf_file = ncdf.Dataset(filename, 'r', format='NETCDF4')
    try:
        # the start year of the data is taken from the last "/"-separated
        # field of the third word of the time variable's long_name
        file_times = ncdf_file.variables["time"]
        data_start = int(file_times.long_name.split(" ")[2].split("/")[-1])
        clim_offset = (start_year - data_start) * 12

        # months of the input file which fall inside the climatology period
        # (sized from N_YEARS rather than a fixed 30 years)
        clim_months = slice(clim_offset, clim_offset + N_MONTHS)

        # read in each variable separately - memory issues
        for v, var in enumerate(OBS_ORDER):

            print(var.name)

            # set up empty data array with a mask of 'False' = not masked
            all_months = np.ma.zeros(month_shape)
            all_months.mask = np.zeros(month_shape)
            all_months.fill_value = settings.mdi

            all_months[:, :, :] = ncdf_file.variables[var.name][clim_months]

            # months x lats x lons --> years x 12 x lats x lons
            shape = all_months.shape
            all_months = all_months.reshape(-1, 12, shape[-2], shape[-1])

            n_grids = np.ma.count(all_months, axis = 0)

            # collapse down the years
            # NOTE: the mean is always used here; settings.doMedian is
            # deliberately not consulted for the monthly climatology
            all_clims[v, :, :, :] = np.ma.mean(all_months, axis = 0)

            all_stds[v, :, :, :] = np.ma.std(all_months, axis = 0)

            # mask where fewer than N_YEARS_PRESENT years have data
            # (the stdev is masked too, so both outputs share the same coverage)
            locs = np.ma.where(n_grids < N_YEARS_PRESENT)
            all_clims[v, :, :, :].mask[locs] = True
            all_stds[v, :, :, :].mask[locs] = True

            if settings.plots and v == 0:
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(n_grids.reshape(-1), bins = np.arange(-1, N_YEARS + 2), align = "left", log = True, rwidth=0.5)
                plt.axvline(x = N_YEARS_PRESENT-0.5, color = "r")
                plt.title("Number of years present in each month")
                plt.xlabel("Number of years (max = {})".format(N_YEARS))
                plt.ylabel("Frequency (log scale)")
                plt.savefig(settings.PLOT_LOCATION + "monthly_5x5_clims_n_years_{}_{}.png".format(period, suffix))

        # now process number of observations
        all_n_obs[:, :, :] = ncdf_file.variables["n_obs"][clim_months]
    finally:
        # everything needed is in memory now - release the file handle
        ncdf_file.close()

    # total number of observations per calendar month over all years
    all_n_obs = all_n_obs.reshape(-1, 12, len(grid_lats), len(grid_lons))
    all_obs = np.ma.sum(all_n_obs, axis = 0)

    # set up time array - day offset of the start of each month, using the
    # month lengths of year 1
    times = utils.TimeVar("time", "time since 1/1/{} in days".format(1), "days", "time")
    month_lengths = [calendar.monthrange(1, x + 1)[1] for x in range(12)]
    times.data = [sum(month_lengths[0:x]) for x in range(12)]

    # write files
    # latitudes are written in the order held in grid_lats for every period
    # NOTE: n_grids written below is the count from the last variable in OBS_ORDER
    if daily:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_climatology_from_daily_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_climatology_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_clims, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")

    if daily:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_stdev_from_daily_{}_{}.nc".format(period, suffix)
    else:
        out_filename = settings.DATA_LOCATION + settings.OUTROOT + "_5x5_monthly_stdev_{}_{}.nc".format(period, suffix)

    utils.netcdf_write(out_filename, all_stds, n_grids, all_obs, OBS_ORDER, grid_lats, grid_lons, times, frequency = "Y")

    # test distribution of obs with grid boxes
    if daily:
        txt_filename = settings.OUTROOT + "_5x5_monthly_climatology_from_daily_{}_{}.txt".format(period, suffix)
    else:
        txt_filename = settings.OUTROOT + "_5x5_monthly_climatology_{}_{}.txt".format(period, suffix)

    # 'with' guarantees the report is flushed and closed even on error
    with open(txt_filename, "w") as outfile:
        utils.boxes_with_n_obs(outfile, all_obs, all_clims[0], N_YEARS_PRESENT)

    return # calculate_climatology
def do_merge(fileroot, mdi, suffix = "relax", clims = False, doMedian = False):
    '''
    Merge the _day and _night files

    Do a np.ma.mean or median for the data and a sum for the n_obs and n_grids

    Output with a _both suffix

    :param str fileroot: root for filenames
    :param flt mdi: missing data indicator
    :param str suffix: "relax" or "strict" criteria
    :param bool clims: if climatologies then don't try and process anomalies.
    :param bool doMedian: merge day and night with utils.bn_median instead of np.ma.mean

    :returns: None
    '''
    OBS_ORDER = utils.make_MetVars(mdi, multiplier = False)

    if clims:
        # keep only the actual variables - remove anomalies
        OBS_ORDER = np.copy([var for var in OBS_ORDER if "anomalies" not in var.name])

    # spin through both periods
    for p, period in enumerate(["day", "night"]):
        print(period)

        # open each period's file once and make sure it gets closed again
        ncdf_file = ncdf.Dataset("{}_{}_{}.nc".format(fileroot, period, suffix), 'r', format='NETCDF4')
        try:
            # go through the variables
            for v, var in enumerate(OBS_ORDER):
                print(" {}".format(var.name))

                values = ncdf_file.variables[var.name][:]

                if v == 0 and p == 0:
                    # first read: size the container and pick up the
                    # coordinate and time information
                    shape = list(values.shape)
                    shape.insert(0, len(OBS_ORDER)+2) # all the variables plus n_grids and n_obs
                    shape.insert(0, 2) # insert extra dimension to allow day + night
                    all_data = np.ma.zeros(shape)

                    # get lats/lons of box centres and shift by half a box
                    # (latitude is shifted by minus half a box, longitude by plus half a box)
                    lat_centres = ncdf_file.variables["latitude"]
                    latitudes = lat_centres - (lat_centres[1] - lat_centres[0])/2.

                    lon_centres = ncdf_file.variables["longitude"]
                    longitudes = lon_centres + (lon_centres[1] - lon_centres[0])/2.

                    # get times - make a dummy object and then populate attributes
                    times = utils.TimeVar("time", "time since 1/{}/{} in hours".format(1, 1973), "hours", "time")

                    times.long_name = ncdf_file.variables["time"].long_name
                    times.standard_name = ncdf_file.variables["time"].standard_name
                    times.units = ncdf_file.variables["time"].units

                    times.data = ncdf_file.variables["time"][:]

                all_data[p, v] = values

            # and get n_obs and n_grids - stored in the last two slots
            all_data[p, -2] = ncdf_file.variables["n_grids"][:]
            all_data[p, -1] = ncdf_file.variables["n_obs"][:]
        finally:
            ncdf_file.close()

    # invert latitudes (coordinate and the matching data axis)
    latitudes = latitudes[::-1]
    all_data = all_data[:,:,:,::-1,:]

    # got all the info, now merge day and night
    if doMedian:
        merged_data = utils.bn_median(all_data[:, :len(OBS_ORDER)], axis = 0)
    else:
        merged_data = np.ma.mean(all_data[:, :len(OBS_ORDER)], axis = 0)

    # and process the grids and observations (split off here so have incorporated latitude inversion)
    n_grids = np.ma.sum(all_data[:, -2], axis = 0)
    n_obs = np.ma.sum(all_data[:, -1], axis = 0)
    n_obs.fill_value = -1
    n_grids.fill_value = -1

    # write the output file
    utils.netcdf_write("{}_{}_{}.nc".format(fileroot, "both", suffix), merged_data, n_grids, n_obs, OBS_ORDER, latitudes, longitudes, times, frequency = "P")

    # test distribution of obs with grid boxes
    # 'with' guarantees the report is flushed and closed even on error
    with open("{}_{}_{}.txt".format(fileroot.split("/")[-1], "both", suffix), "w") as outfile:
        utils.boxes_with_n_obs(outfile, n_obs, merged_data[0], "")

    return # do_merge