def process_dataset(dataset, diagnostics=False):
    """
    Read in the specified dataset inventory.  Check each station for suitability for HadEX3

    :param datasetObj dataset: dataset object holding metadata about the input dataset
    :param bool diagnostics: output diagnostic information
    """
    # read in the dataset inventory metadata
    dataset_stations = utils.read_inventory(dataset, subdir="formatted/indices")

    # spin through stations
    for station in dataset_stations:

        # spin through indices
        for index in PERCENTILE_INDICES:

            # select appropriate timescales
            if index in utils.MONTHLY_INDICES:
                timescales = ["ANN", "MON"]
            else:
                timescales = ["ANN"]

            # and spin through those
            for timescale in timescales:
                path = os.path.join(station.location, station.id,
                                    "{}_{}_{}.csv".format(station.id, index.lower(), timescale))
                if os.path.exists(path):
                    os.remove(path)
                    if diagnostics:
                        print("Removing {}".format(path))

    return  # process_dataset
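
# Hypothetical usage sketch (not part of the original pipeline): clear stale
# percentile-index files for every configured dataset before they are
# regenerated.  Assumes utils.get_input_datasets() returns the same dataset
# objects used throughout this module.
def _example_clear_all_datasets(diagnostics=True):
    for ds in utils.get_input_datasets():
        process_dataset(ds, diagnostics=diagnostics)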
def main(indata="ghcndex", index="R95pTOT", diagnostics=False): """ Read PRCPTOT and other indices and write out """ # check if need to do monthly ones if index in utils.MONTHLY_INDICES: timescales = ["ANN", "MON"] else: timescales = ["ANN"] # get all possible datasets all_datasets = utils.get_input_datasets() # and their names names = np.array([d.name for d in all_datasets]) # if dataset selected and in the list of available, then run if indata in names: dataset = all_datasets[names == indata][0] dataset_stations = utils.read_inventory(dataset, subdir="formatted/indices") # check each station for stn in dataset_stations: if diagnostics: print("{} - {}".format(dataset.name, stn.id)) # for appropriate number of timescales for ts in timescales: if os.path.exists( os.path.join( stn.location, stn.id, "{}_{}_{}.csv".format( stn.id, PARTNERS[index].lower(), ts))) and os.path.exists( os.path.join( stn.location, stn.id, "{}_{}_{}.csv".format( stn.id, "PRCPTOT".lower(), ts))): rtimes, rXXp = utils.read_station_index( stn, PARTNERS[index].lower(), ts) ptimes, prcptot = utils.read_station_index( stn, "PRCPTOT".lower(), ts) match = np.in1d(rtimes, ptimes) match_b = np.in1d(ptimes, rtimes) if len(match) != 0 and len(match_b) != 0: rXXptot = (100 * rXXp) / prcptot rXXptot_times = rtimes[match] if ts == "MON": myears = [] months = [] for y in rXXptot_times: for m in range(1, 13): myears += [y] months += [m] stn.monthly = rXXptot.filled().reshape(-1) stn.myears = myears stn.months = months path = os.path.join( dataset.location, "formatted", "indices", stn.id, "{}_{}_MON.csv".format(stn.id, index.lower())) if not os.path.exists(path): utils.write_station_index(path, stn, index, doMonthly=True) else: stn.years = rXXptot_times stn.annual = rXXptot.filled() path = os.path.join( dataset.location, "formatted", "indices", stn.id, "{}_{}_ANN.csv".format(stn.id, index.lower())) if not os.path.exists(path): utils.write_station_index(path, stn, index) return # main
def main(index="TX90p", diagnostics=False, qc_flags=""): """ The main DLS function :param str index: which index to run :param bool diagnostics: extra verbose output :param str qc_flags: which QC flags to process W, B, A, N, C, R, F, E, V, M """ if index in utils.MONTHLY_INDICES: nmonths = 13 timescale = "MON" else: nmonths = 1 timescale = "ANN" # move this up one level eventually? all_datasets = utils.get_input_datasets() # spin through all datasets stations = np.array([]) for dataset in all_datasets: try: ds_stations = utils.read_inventory(dataset, subdir="formatted/indices", final=True, timescale=timescale, index=index, qc_flags=qc_flags) good_stations = utils.select_qc_passes(ds_stations, qc_flags=qc_flags) stations = np.append(stations, good_stations) print("Adding {} ({} stations), nstations = {}".format( dataset.name, len(good_stations), len(stations))) except IOError: # file missing print("No stations with data for {}".format(dataset.name)) nstations = len(stations) # array of lats and lons for calculation of separations all_locations = np.array([[stn.latitude, stn.longitude] for stn in stations]) # get the separations (km, radians) stn_separation, stn_angle = get_separations(stations, all_locations) # assign stations to bands StationBands = assign_to_latitude_bands(stations) # read in all the station data all_data = get_all_data(stations, index, timescale, nyears, nmonths) # set up the DLS defaults bins = np.arange(0, MAX_SEPARATION + BIN_WIDTH, BIN_WIDTH) all_dls = np.zeros([len(utils.LAT_BANDS), nmonths]) all_dls[:] = utils.DEFAULT_DLS # now spin through all latitude bands and months. for lb, band in enumerate(utils.LAT_BANDS): stations_in_bands, = np.where(StationBands == lb) if len(stations_in_bands) <= 30: # insufficient stations within this latitude band, next band if diagnostics: print("Index {}, Band {} to {}".format(index, band[0], band[1])) print("Number of stations {}".format(len(stations_in_bands))) print("Ann, Jan -- Dec, DLS = {} km".format(utils.DEFAULT_DLS)) # spin through months to remove old plots if they exist for month in range(nmonths): if os.path.exists( os.path.join( utils.PLOTLOCS, "DLS", "DLS_{}_{}_{}to{}.png".format( index, month_names[month], band[0], band[1]))): os.remove( os.path.join( utils.PLOTLOCS, "DLS", "DLS_{}_{}_{}to{}.png".format( utils.PLOTLOCS, index, month_names[month], band[0], band[1]))) continue print("{}, # stations {}".format(band, len(stations_in_bands))) # process each month for month in range(nmonths): print(month_names[month]) month_data = all_data[stations_in_bands, :, month] names = [s.id for s in stations[stations_in_bands]] # get the separation and correlation for each cross pair # correlations only from 1951 (match HadEX2) cor_yr = 1951 - utils.STARTYEAR.year seps, cors = separations_and_correlations( month_data[:, cor_yr:], stn_separation[stations_in_bands, :][:, stations_in_bands], names, diagnostics=diagnostics) if len(seps) == 0 and len(cors) == 0: # then none of the available stations either had sufficient overlapping data # or values at that particular point (correlations of lots of zeros doesn't mean anything) # so escape and go on to next month if diagnostics: print("Index {}, Band {} to {}, month {}".format( index, band[0], band[1], month_names[month])) print("Number of stations {}".format( len(stations_in_bands))) print( "Likely that all values for this index, month and band are zero\n hence correlations don't mean anything" ) print("Using default DLS = {}km".format(utils.DEFAULT_DLS)) else: print("No stations, {} - 
{} DLS = {} km".format( band, month_names[month], utils.DEFAULT_DLS)) continue # get the bins bin_assignment = np.digitize( seps, bins, right=True) # "right" means left bin edge included bin_centers = bins - BIN_WIDTH / 2. # average value for each bin if sufficient correlations to do so. means = np.zeros(len(bins)) sigmas = np.zeros(len(bins)) for b, bin in enumerate(bins): locs, = np.where(bin_assignment == b) if len(locs) > MIN_PER_BIN: # means[b] = np.ma.mean(cors[locs]) means[b] = np.ma.median(cors[locs]) sigmas[b] = np.ma.std(cors[locs]) # print(bin, means[b], len(locs), cors[locs]) # raw_input("stop") filled_bins, = np.where(means != 0) # if sufficient bins are filled then fit the curve if len(filled_bins) / float(len(bins)) >= 0.5: if utils.FIX_ZERO: # fix zero bin to be 1.0, and use bin edges, not centres (HadEX2) means[0] = 1. sigmas[0] = sigmas[1] dls, plot_curve, chisq, R2 = exponential_fit(bins, means, sigmas, C=C) else: dls, plot_curve, chisq, R2 = exponential_fit( bin_centers[1:], means[1:], sigmas[1:], C=C) # only take fit if greater than minimum set overall all_dls[lb, month] = np.max([dls, utils.DEFAULT_DLS]) # test at 5% level and 2 or 3 dofs, as per HadEX2 if utils.FIX_ZERO and chisq >= chi2.isf( 0.05, len(bins[sigmas != 0]) - 2): print("inadequately good fit") all_dls[lb, month] = utils.DEFAULT_DLS elif chisq >= chi2.isf(0.05, len(bins[sigmas != 0]) - 3): print("inadequately good fit") all_dls[lb, month] = utils.DEFAULT_DLS # plot the fit if required plt.clf() plt.scatter(seps, cors, c='b', marker='.', alpha=0.1, edgecolor=None) # calculate the 2D density of the data given counts, xbins, ybins = np.histogram2d(seps, cors, bins=50) # make the contour plot (5 levels) plt.contour(counts.transpose(), 5, extent=[ xbins.min(), xbins.max(), ybins.min(), ybins.max() ], linewidths=1, colors='black', linestyles='solid') if utils.FIX_ZERO: plt.plot(bins[sigmas != 0], means[sigmas != 0], 'ro') plt.errorbar(bins[sigmas != 0], means[sigmas != 0], yerr=sigmas[sigmas != 0], fmt="none", ecolor="r") plt.plot(bins, plot_curve, c='cyan', ls='-', lw=2) else: plt.plot(bin_centers[1:][sigmas[1:] != 0], means[1:][sigmas[1:] != 0], 'ro') plt.errorbar(bin_centers[1:][sigmas[1:] != 0], means[1:][sigmas[1:] != 0], yerr=sigmas[1:][sigmas[1:] != 0], fmt="none", ecolor="r") plt.plot(bin_centers[1:], plot_curve, c='cyan', ls='-', lw=2) # plot curve will have been truncated plt.axvline(dls, c='magenta', ls="--", lw=2) plt.axvline(utils.DEFAULT_DLS, c='k', ls=":", lw=1) plt.axvline(utils.MAX_DLS, c='k', ls=":", lw=1) plt.text(dls + 10, 0.95, "dls = {:4.0f}km".format(dls)) plt.text(3010, -0.95, "r2 = {:6.4f}".format(R2)) plt.text(3010, -0.85, "chi2 = {:6.4f}".format(chisq)) plt.text(3010, -0.75, "Nstat = {}".format(len(stations_in_bands))) plt.xlim([-100, 5000]) plt.ylim([-1, None]) plt.xlabel("Separation (km)") plt.ylabel("Correlation") plt.title("{} - {}; {} to {}".format(index, month_names[month], band[0], band[1])) # add text to show what code created this and when if utils.WATERMARK: watermarkstring = "/".join( os.getcwd().split('/')[4:]) + '/' + os.path.basename( __file__) + " " + dt.datetime.strftime( dt.datetime.now(), "%d-%b-%Y %H:%M") plt.figtext(0.01, 0.01, watermarkstring, size=6) if utils.FIX_ZERO: plt.savefig(os.path.join(utils.PLOTLOCS, "DLS", \ "DLS_{}_{}_{}_{}to{}_fixzero.png".format(index, "{}-{}".format(str(utils.REF_START)[-2:], \ str(utils.REF_END)[-2:]), month_names[month], band[0], band[1])), dpi=300) else: plt.savefig(os.path.join(utils.PLOTLOCS, "DLS", \ 
"DLS_{}_{}_{}_{}to{}.png".format(index, "{}-{}".format(str(utils.REF_START)[-2:], \ str(utils.REF_END)[-2:]), month_names[month], band[0], band[1])), dpi=300) print("DLS = {:7.2f} km".format(dls)) else: print("insufficient bins for fit ({}/{})".format( len(filled_bins), float(len(bins)))) if os.path.exists(os.path.join(utils.PLOTLOCS, "DLS", \ "DLS_{}_{}_{}_{}to{}.png".format(index, "{}-{}".format(str(utils.REF_START)[-2:], \ str(utils.REF_END)[-2:]), month_names[month], band[0], band[1]))): # remove old plots! os.remove(os.path.join(utils.PLOTLOCS, "DLS", \ "DLS_{}_{}_{}_{}to{}.png".format(index, "{}-{}".format(str(utils.REF_START)[-2:], \ str(utils.REF_END)[-2:]), month_names[month], band[0], band[1]))) # replace those dls < default with default and > max with max all_dls[all_dls < utils.DEFAULT_DLS] = utils.DEFAULT_DLS # interpolate grid_dls = interpolate_dls_to_grid(all_dls, nmonths) # write output file write_dls_file(os.path.join(utils.DLSLOCS, "dls_{}.txt".format(index)), grid_dls, nmonths, month_names) return # main
def main(indata="ghcnd", diagnostics=False): """ Call the R package climpact2 with appropriate settings to calculate the indices :param str indata: name of dataset to process :param bool diagnostics: output diagnostic information """ # get all possible datasets all_datasets = utils.get_input_datasets() # and their names names = np.array([d.name for d in all_datasets]) # select the matching one if indata in names: dataset = all_datasets[names == indata][0] ''' Process call structure climpact2.batch.stations.r ./sample_data/ ./sample_data/climpact2.sample.batch.metadata.txt 1971 2000 4 ''' # check that there are stations to process for this dataset stations = utils.read_inventory(dataset) if len(stations) != 0: try: with utils.cd(utils.CLIMPACT_LOCS): # call the R process - which should automatically do everything and make suitable files etc # runs in subfolder with context manager, so returning to parent once done. # ACRE (and others?) have stations that do not overlap the reference period. # Means that the QC process throws them out if insufficient overlap between data and reference period if dataset.name == "acre": ref_start = 1901 ref_end = 1930 else: ref_start = utils.REF_START ref_end = utils.REF_END print(" ".join([ "Rscript", "climpact2.batch.stations.r", os.path.join(dataset.location, "formatted"), os.path.join(dataset.location, "{}.metadata.txt".format(dataset.name)), str(ref_start), str(ref_end), str(utils.NCORES) ])) subprocess.check_call([ "Rscript", "climpact2.batch.stations.r", os.path.join(dataset.location, "formatted"), os.path.join(dataset.location, "{}.metadata.txt".format(dataset.name)), str(ref_start), str(ref_end), str(utils.NCORES) ]) except subprocess.CalledProcessError: # handle errors in the called executable raise Exception except OSError: # executable not found print("Cannot find Rscript") raise OSError # fail gracefully else: print("No stations available in {}".format(indata)) print(" Climpact2 not run") # remove plots, qc, thres and trend folders (save space) if utils.REMOVE_EXTRA: for subdir in ["plots", "qc", "thres", "trend"]: try: shutil.rmtree( os.path.join(dataset.location, "formatted", subdir)) except FileNotFoundError: print("{} doesn't exist".format( os.path.join(dataset.location, "formatted", subdir))) # fail gracefully else: print("data name not available: {}\n".format(indata)) print("available data names: {}".format(" ".join(names))) return # main
def main(indata="acre", diagnostics=False): """ Call the R package climpact2 with appropriate settings to calculate the indices :param str indata: name of dataset to process :param bool diagnostics: output diagnostic information """ # get all possible datasets all_datasets = utils.get_input_datasets() # and their names names = np.array([d.name for d in all_datasets]) # select the matching one if indata in names: dataset = all_datasets[names == indata][0] # check that there are stations to process for this dataset stations = utils.read_inventory(dataset) if len(stations) != 0: for station in stations: # read the station data infile = os.path.join(dataset.location, "formatted", "{}.txt".format(station.id)) indata = np.genfromtxt(infile) # get the first year and last year ref_start = int(indata[0][0]) ref_end = int(indata[-1][0]) # write a temporary inventory file for just this station utils.write_climpact_inventory_header(os.path.join(dataset.location, "{}_temp.metadata.txt".format(dataset.name))) utils.write_climpact_inventory(os.path.join(dataset.location, "{}_temp.metadata.txt".format(dataset.name)), station) try: with utils.cd(utils.CLIMPACT_LOCS): # call the R process - which should automatically do everything and make suitable files etc # runs in subfolder with context manager, so returning to parent once done. # ACRE (and others?) have stations that do not overlap the reference period. # Means that the QC process throws them out if insufficient overlap between data and reference period print(" ".join(["Rscript", "climpact2.batch.stations.r", os.path.join(dataset.location, "formatted"), os.path.join(dataset.location, "{}_temp.metadata.txt".format(dataset.name)), str(ref_start), str(ref_end), str(utils.NCORES)])) subprocess.check_call(["Rscript", "climpact2.batch.stations.r", os.path.join(dataset.location, "formatted"), os.path.join(dataset.location, "{}_temp.metadata.txt".format(dataset.name)), str(ref_start), str(ref_end), str(utils.NCORES)]) except subprocess.CalledProcessError: # handle errors in the called executable raise Exception except OSError: # executable not found print("Cannot find Rscript") raise OSError # remove temporary metadata file os.remove(os.path.join(dataset.location, "{}_temp.metadata.txt".format(dataset.name))) # fail gracefully else: print("No stations available in {}".format(indata)) print(" Climpact2 not run") # remove plots, qc, thres and trend folders (save space) if utils.REMOVE_EXTRA: for subdir in ["plots", "qc", "thres", "trend"]: shutil.rmtree(os.path.join(dataset.location, "formatted", subdir)) # fail gracefully else: print("data name not available: {}\n".format(indata)) print("available data names: {}".format(" ".join(names))) return # main
def adw(all_datasets, index, timescale, nyears, qc_flags="", month_index=0,
        diagnostics=False, hadex2_adw=False, anomalies="None"):
    """
    Angular Distance Weighting

    :param array all_datasets: array of dataset objects
    :param str index: which index to run
    :param str timescale: which timescale (MON/ANN)
    :param int nyears: number of years - to define array
    :param str qc_flags: which QC flags to process W, B, A, N, C, R, F, E
    :param int month_index: which month to read
    :param bool diagnostics: output diagnostic information
    :param bool hadex2_adw: use the HadEX2 (erroneous) ADW method
    :param str anomalies: run code on anomalies or climatology rather than raw data
    """
    # http://journals.ametsoc.org/doi/pdf/10.1175/1520-0442%282000%29013%3C2217%3ARTCSTC%3E2.0.CO%3B2

    # change to do one month at a time to parallelise a little
    nmonths = 1
    loopwise = False
    print("Running Month {}".format(month_index))

    def calculate_cosine_term(stns_in_dls, box_angle, station_angle, hadex2_adw=False):
        """
        Helper routine to calculate the cosine term for the weighting function
        """
        if hadex2_adw:
            box_angle_k = station_angle[stns_in_dls][:, stns_in_dls]
            box_angle_i = np.tile(box_angle[stns_in_dls], (stns_in_dls.shape[0], 1))
        else:
            # do change as RJHD thinks - November 2017
            # So that it is the angles between the stations and the box centre
            # rather than the station and other stations.
            box_angle_i = np.tile(box_angle[stns_in_dls], (stns_in_dls.shape[0], 1))
            box_angle_k = box_angle_i.T

        cosines = np.cos(box_angle_k - box_angle_i)
        box_angle_k = 0
        box_angle_i = 0

        return cosines  # calculate_cosine_term

    def calculate_weighting_term(distance_weight, stns_in_dls):
        """
        Helper routine to calculate the weighting term
        """
        # tile the distance_weight array so replicated for all sid
        dist_weight_array = np.tile(distance_weight, (len(stns_in_dls), 1))
        dist_weight_array = np.ma.swapaxes(dist_weight_array, 0, 1)

        # set diagonal to zero (k != l)
        diag = np.arange(dist_weight_array.shape[-1])
        dist_weight_array[diag, diag] = 0.0

        return dist_weight_array  # calculate_weighting_term

    def calculate_top(dist_weight_array, cosines, nyears, mask):
        """
        Helper routine to calculate the top part of the weighting function
        """
        top_part = dist_weight_array * (1.0 - cosines)
        # now repeat this nyears times
        top_part = np.tile(top_part, (nyears, 1, 1))
        # as doing the sum, can just set masked elements to zero
        top_part[mask == True] = 0

        return np.sum(top_part, axis=-2)  # calculate_top

    def calculate_bottom(dist_weight_array, nyears, mask):
        """
        Helper routine to calculate the bottom part of the weighting function
        """
        bottom_part = np.tile(dist_weight_array, (nyears, 1, 1))
        # as doing the sum, can just set masked elements to zero
        bottom_part[mask == True] = 0

        return np.sum(bottom_part, axis=-2)  # calculate_bottom

    def calculate_adw(distance_weight, mask, top, bottom):
        """
        Helper routine to calculate the angular distance weights
        """
        distance_weight = np.tile(distance_weight, (nyears, 1))
        distance_weight[mask == True] = 0

        return distance_weight * (1 + (top / bottom))  # calculate_adw

    def calculate_separations_and_angles(stations, station_locs):
        """
        Calculate the station-station separation and bearing arrays
        """
        separation = np.zeros((stations.shape[0], stations.shape[0]))
        angle = np.copy(separation)

        for s, stn in enumerate(stations):
            this_stn = np.empty([len(stations), 2])
            this_stn[:, 0] = stn.latitude
            this_stn[:, 1] = stn.longitude

            separation[s, :], angle[s, :] = utils.map_2_points(this_stn, station_locs)

        return separation, angle  # calculate_separations_and_angles

    #*******************************************
    # set up the grids
    GridData, GridStations, GridDLSStations = set_up_grids(nyears, nmonths)

    # get the DLS
    raw_dls = np.genfromtxt(os.path.join(utils.DLSLOCS, "dls_{}.txt".format(index)),
                            dtype=(float), skip_header=4)
    dls_lat = raw_dls[:, 0]
    dls = raw_dls[:, 1:]

    # get the stations which actually have data
    # spin through all datasets
    stations = np.array([])
    for dataset in all_datasets:
        try:
            # choose appropriate subdirectory.
            if anomalies == "None":
                subdir = "formatted/indices"
            elif anomalies == "anomalies":
                subdir = "formatted/anomalies"
            elif anomalies == "climatology":
                subdir = "formatted/climatology"

            ds_stations = utils.read_inventory(dataset, subdir=subdir, final=True,
                                               timescale=timescale, index=index,
                                               anomalies=anomalies, qc_flags=qc_flags)
            good_stations = utils.select_qc_passes(ds_stations, qc_flags=qc_flags)

            stations = np.append(stations, good_stations)
            print("Adding {}, nstations = {}".format(dataset.name, len(stations)))

        except IOError:
            # file missing
            print("No stations with data for {}".format(dataset.name))

    # may have no stations for particular ETSCI combinations
    if len(stations) == 0:
        if diagnostics:
            print("No stations for {} - {}".format(index, timescale))
        return GridData, GridStations, GridDLSStations  # adw

    station_locs = np.array([[stn.latitude, stn.longitude] for stn in stations])

    # get the distance and bearing arrays
    station_separation, station_angle = calculate_separations_and_angles(stations, station_locs)

    #*********************
    # read in all the data in one step for all the stations
    all_station_data = np.ma.zeros([nyears, nmonths, len(stations)])
    all_station_data.mask = np.ones(np.shape(all_station_data))  # mask everything
    latitudes = np.zeros(len(stations))
    longitudes = np.zeros(len(stations))

    # big and slow read loop
    for s, stat in enumerate(stations):
        data = get_all_data(stat, index, timescale, nyears, month_index)
        all_station_data[:, :, s] = data  # store all the info
        all_station_data.mask[:, :, s] = data.mask  # store all the info
        latitudes[s] = stat.latitude
        longitudes[s] = stat.longitude

    #*********************
    # run through each grid box
    for tlats, latitude in enumerate(utils.box_centre_lats):
        print(str(tlats) + "/" + str(len(utils.box_centre_lats)), latitude)

        for tlons, longitude in enumerate(utils.box_centre_lons):

            # distance of this box centre to all stations
            this_box = np.empty([len(stations), 2])
            this_box[:, 0] = latitude
            this_box[:, 1] = longitude

            box_separation, box_angle = utils.map_2_points(this_box, station_locs)

            # find those stations close enough to contribute
            # need to adjust if doing all months so that can read in all, but restrict to relevant ones if necessary
            stns_in_dls, = np.where(box_separation <= np.max(dls[tlats]))
            stns_in_dls_separations = box_separation[stns_in_dls]

            if len(stns_in_dls) < utils.STATIONS_IN_DLS:
                # none of the months have DLS such that sufficient stations are included
                # skip to next box
                if diagnostics:
                    print("skipping lat {}, lon {} - no stations in range (max DLS = {})".format(
                        latitude, longitude, np.max(dls[tlats])))
                continue

            # REMOVED FOR SINGLE READ
            # set up blank array to store all station data that can contribute
            # stations_contrib_to_box_data = np.ma.zeros([nyears, nmonths, len(stns_in_dls)])
            # stations_contrib_to_box_data.mask = np.ones(np.shape(stations_contrib_to_box_data))  # mask everything
            # REMOVED FOR SINGLE READ

            # this is for the stations actually within the grid box!
            stations_in_box = np.zeros([nyears, nmonths])

            print(" nstats {}".format(len(stns_in_dls)))

            # get the stations contributing to the box (i.e. within a DLS)
            stations_contrib_to_box_data = np.ma.copy(all_station_data[:, :, stns_in_dls])
            stations_in_box = np.ma.count(all_station_data[:, :, stns_in_dls], axis=2)

            # or get the stations located in the box
            # lat_locs, = np.where(np.logical_and(utils.box_edge_lats[tlats] < latitudes, latitudes <= utils.box_edge_lats[tlats+1]))
            # lon_locs, = np.where(np.logical_and(utils.box_edge_lons[tlons] < longitudes, longitudes <= utils.box_edge_lons[tlons+1]))
            # # station matches both latitude and longitude constraints
            # both_lat_and_lon = np.in1d(lat_locs, lon_locs)
            # in_box_locs = lat_locs[both_lat_and_lon]
            # if len(in_box_locs) > 0:
            #     stations_in_box = np.ma.count(all_station_data[:, :, in_box_locs], axis=2)

            # REMOVED FOR SINGLE READ
            # # read in all the stations - and do this once
            # for s, li in enumerate(stns_in_dls):
            #     # if diagnostics:
            #     #     print(stations[li], stns_in_dls_separations[s])
            #     # read in the station - matching done in subroutine
            #     data = get_all_data(stations[li], index, timescale, nyears, month_index)
            #     stations_contrib_to_box_data[:, :, s] = data  # store all the info
            #     stations_contrib_to_box_data.mask[:, :, s] = data.mask  # store all the info
            #     # and number of stations in the box
            #     # need to subtract if not present at any year or with smaller DLS
            #     if (utils.box_edge_lats[tlats] < stations[li].latitude <= utils.box_edge_lats[tlats+1]) \
            #             and (utils.box_edge_lons[tlons] < stations[li].longitude <= utils.box_edge_lons[tlons+1]):
            #         stations_in_box[data.mask == False] += 1
            # REMOVED FOR SINGLE READ

            # go through each month label - not used as parallelised instead
            for month in range(nmonths):

                if diagnostics:
                    print("latitude {} ({}), longitude {} ({}), dls {}, nstations {}".format(
                        latitude, tlats, longitude, tlons, dls[tlats][month_index],
                        len(np.where(box_separation[stns_in_dls] < dls[tlats][month_index])[0])))

                # get weights
                distance_weight = np.exp(utils.M * -box_separation[stns_in_dls] / dls[tlats][month_index])

                # for each contributing station
                # filter out stations too far away for this month DLS
                sep_locs, = np.where(stns_in_dls_separations > dls[tlats][month_index])
                stations_contrib_to_box_data[:, month, sep_locs] = 0
                stations_contrib_to_box_data.mask[:, month, sep_locs] = True

                if loopwise:
                    pass
                    # # this is the original longhand version.
                    # for year in range(nyears):
                    #     if np.ma.count(stations_contrib_to_box_data[:, month], axis=1)[year] < utils.STATIONS_IN_DLS:
                    #         # insufficient stations - don't bother
                    #         continue
                    #     # which stations do contribute
                    #     this_year_mask = -stations_contrib_to_box_data.mask[year, month]
                    #     if diagnostics:
                    #         # testing long-hand looping
                    #         # using Caesar et al terminology - http://onlinelibrary.wiley.com/doi/10.1029/2005JD006280/pdf
                    #         w_is = []
                    #         for i in stns_in_dls[this_year_mask]:
                    #             w_i = np.exp(utils.M * -box_separation[i]/dls[tlats][month_index])
                    #             tops = []
                    #             bottoms = []
                    #             for k in stns_in_dls[this_year_mask]:  # all other stations
                    #                 if i != k:
                    #                     w_k = np.exp(utils.M * -box_separation[k]/dls[tlats][month_index])
                    #                     bottoms += [w_k]
                    #                     if hadex2_adw:
                    #                         tops += [w_k * (1.0 - np.cos(station_angle[k, i] - box_angle[i]))]
                    #                     else:
                    #                         tops += [w_k * (1.0 - np.cos(box_angle[k] - box_angle[i]))]
                    #             w_is += [w_i * (1 + np.sum(tops)/np.sum(bottoms))]
                    #         # print("weights", w_is/sum(w_is))
                    #     # tile the distance_weight array so replicated for all sid
                    #     dist_weight_array = np.tile(distance_weight[this_year_mask], (np.ma.count(stations_contrib_to_box_data[year, month]), 1))
                    #     dist_weight_array = np.swapaxes(dist_weight_array, 0, 1)
                    #     # set diagonal to zero (k != l)
                    #     diag = np.arange(dist_weight_array.shape[0])
                    #     dist_weight_array[diag, diag] = 0.0
                    #     # make a mesh of these so can subtract.
                    #     box_angle_i = np.tile(box_angle[stns_in_dls][this_year_mask], (stns_in_dls[this_year_mask].shape[0], 1))
                    #     if hadex2_adw:
                    #         box_angle_k = station_angle[stns_in_dls[this_year_mask]][:, stns_in_dls[this_year_mask]]
                    #     else:
                    #         box_angle_k = box_angle_i.T
                    #     # get array for top
                    #     top_sum = dist_weight_array * (1.0 - np.cos(box_angle_k - box_angle_i))
                    #     top = np.sum(top_sum, axis=0)
                    #     bottom = np.sum(dist_weight_array, axis=0)
                    #     # un-normalised weights
                    #     angular_distance_weight = distance_weight[this_year_mask] * (1 + top / bottom)
                    #     final_weights = angular_distance_weight/np.ma.sum(angular_distance_weight)
                    #     # if diagnostics: print("weights", final_weights)  # should match print line above
                    #     GridData[year, month, tlats, tlons] = np.ma.sum(final_weights * stations_contrib_to_box_data[year, month].compressed())
                    #     GridStations[year, month, tlats, tlons] = stations_in_box[year, month]
                    #     # if diagnostics:
                    #     #     print(GridData[year, month, tlats, tlons])
                    #     #     raw_input("stop {}".format(hadex2_adw))

                else:
                    # not loopwise
                    """
                    W_k = weight_k * (1 + a_k)

                    a_k = top/bottom

                    bottom = sum_1_nstations(w_k)
                    top = sum_1_nstations(w_k * (1 - cos(theta_k - theta_l))), k != l
                    """
                    # aim to remove this loop (longer than months loop, so saves more time?)

                    # if sufficient stations
                    insufficient_station_count = np.ma.count(stations_contrib_to_box_data[:, month, :], axis=1)
                    if max(insufficient_station_count) < utils.STATIONS_IN_DLS:
                        # no year has sufficient stations
                        if diagnostics:
                            print("skipping lat {}, lon {}, month {} - no stations in range (DLS = {})".format(
                                latitude, longitude, month + 1, dls[tlats]))
                        continue

                    this_month_mask = stations_contrib_to_box_data.mask[:, month, :]
                    mask = np.array([make_square(m) for m in this_month_mask])

                    # calculate angular part
                    cosines = calculate_cosine_term(stns_in_dls, box_angle, station_angle,
                                                    hadex2_adw=hadex2_adw)

                    # calculate weight part
                    dist_weight_array = calculate_weighting_term(distance_weight, stns_in_dls)

                    # cosine and dist_weight the same each year - doesn't change
                    # calculate the top without years, clear memory, then expand and mask
                    top = calculate_top(dist_weight_array, cosines, nyears, mask)
                    cosines = 0
                    bottom = calculate_bottom(dist_weight_array, nyears, mask)
                    dist_weight_array = 0

                    angular_distance_weight = calculate_adw(distance_weight, this_month_mask, top, bottom)
                    top = 0
                    bottom = 0

                    normalisation = np.ma.sum(angular_distance_weight, axis=1)
                    final_weights = angular_distance_weight / normalisation[:, None]
                    normalisation = 0
                    angular_distance_weight = 0

                    # if diagnostics: print(final_weights)

                    GridData[:, month, tlats, tlons] = np.ma.sum(
                        final_weights * stations_contrib_to_box_data[:, month, :], axis=1)
                    GridData.mask[insufficient_station_count < utils.STATIONS_IN_DLS,
                                  month, tlats, tlons] = True

                    GridStations[:, month, tlats, tlons] = stations_in_box[:, month]
                    GridDLSStations[:, month, tlats, tlons] = insufficient_station_count

                    # if diagnostics:
                    #     print(np.max(insufficient_station_count))
                    #     print(GridData[:, month, tlats, tlons])
                    #     raw_input("stop {}".format(hadex2_adw))

        gc.collect()
        sys.stdout.flush()

    return GridData, GridStations, GridDLSStations  # adw
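
# A toy, self-contained illustration of the angular distance weighting scheme
# used in adw() above (a sketch following Caesar et al. style weights, not the
# project's exact code): three stations, their distances to a box centre, and
# their bearings from that centre.  M and dls here stand in for utils.M (the
# decay constant) and the decorrelation length scale; all numbers are made up.
def _sketch_adw_weights():
    import numpy as np

    M, dls = 4.0, 1000.0                           # assumed decay constant and DLS (km)
    separation = np.array([100.0, 400.0, 800.0])   # station-to-box-centre distances (km)
    bearing = np.radians([10.0, 95.0, 200.0])      # bearings of stations from the box centre

    w = np.exp(-M * separation / dls)              # distance-only weights

    # angular term: stations clustered in one direction are down-weighted
    a = np.empty_like(w)
    for k in range(len(w)):
        others = np.arange(len(w)) != k
        top = np.sum(w[others] * (1.0 - np.cos(bearing[others] - bearing[k])))
        a[k] = top / np.sum(w[others])

    adw_weights = w * (1.0 + a)
    return adw_weights / adw_weights.sum()         # normalised weights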
def main(index="TX90p", diagnostics=False, qc_flags="", anomalies="None"): """ Read inventories and make scatter plot :param str index: which index to run :param bool diagnostics: extra verbose output :param str qc_flags: which QC flags to process W, B, A, N, C, R, F, E, V, M :param str anomalies: run code on anomalies or climatology rather than raw data """ if index in utils.MONTHLY_INDICES: timescale = ["ANN", "MON"] else: timescale = ["ANN"] # move this up one level eventually? all_datasets = utils.get_input_datasets() for ts in timescale: # set up the figure fig = plt.figure(figsize=(10, 6.5)) plt.clf() ax = plt.axes([0.025, 0.14, 0.95, 0.90], projection=cartopy.crs.Robinson()) ax.gridlines() #draw_labels=True) ax.add_feature(cartopy.feature.LAND, zorder=0, facecolor="0.9", edgecolor="k") ax.coastlines() # dummy scatters for full extent plt.scatter([-180, 180, 0, 0], [0, 0, -90, 90], c="w", s=1, transform=cartopy.crs.Geodetic(), \ edgecolor='w', linewidth='0.01') # run all datasets total = 0 for dataset in all_datasets: try: # choose appropriate subdirectory. if anomalies == "None": subdir = "formatted/indices" elif anomalies == "anomalies": subdir = "formatted/anomalies" elif anomalies == "climatology": subdir = "formatted/climatology" ds_stations = utils.read_inventory(dataset, subdir=subdir, final=True, \ timescale=ts, index=index, anomalies=anomalies, qc_flags=qc_flags) ds_stations = utils.select_qc_passes(ds_stations, qc_flags=qc_flags) except IOError: # file missing print("No stations with data for {}".format(dataset.name)) ds_stations = [] if len(ds_stations) > 0: lats = np.array([stn.latitude for stn in ds_stations]) lons = np.array([stn.longitude for stn in ds_stations]) # and plot scatter = plt.scatter(lons, lats, c=COLOURS[dataset.name], s=15, \ label="{} ({})".format(get_label(dataset.name), len(ds_stations)), \ transform=cartopy.crs.Geodetic(), edgecolor='0.5', linewidth='0.5') total += len(ds_stations) # make a legend leg = plt.legend(loc='lower center', ncol=5, bbox_to_anchor=(0.50, -0.3), \ frameon=False, title="", prop={'size':12}, labelspacing=0.15, columnspacing=0.5, numpoints=3) plt.setp(leg.get_title(), fontsize=12) plt.figtext(0.06, 0.91, "{} Stations".format(total)) plt.title("{} - {}".format(index, ts)) # extra information if utils.WATERMARK: watermarkstring = "{} {}".format(os.path.join("/".join(os.getcwd().split('/')[4:]), os.path.basename(__file__)), dt.datetime.strftime(dt.datetime.now(), "%d-%b-%Y %H:%M")) plt.figtext(0.01, 0.01, watermarkstring, size=6) # plt.figtext(0.03, 0.95, "(c)", size=14) # and save outname = putils.make_filenames("station_locations", index=index, grid="ADW", anomalies=anomalies, month=ts.capitalize()) plt.savefig("{}/{}/{}".format(utils.PLOTLOCS, index, outname)) plt.close() # write out total station number if ts == "ANN": with open(os.path.join(utils.INFILELOCS, "{}_stations.txt".format(index)), "w") as outfile: outfile.write("{}\n".format(index)) outfile.write("{}".format(total)) return # main
def cam(all_datasets, index, timescale, nyears, qc_flags="", month_index=0,
        diagnostics=False, anomalies="None"):
    """
    Climate anomaly method gridding

    :param array all_datasets: array of dataset objects
    :param str index: which index to run
    :param str timescale: which timescale (MON/ANN)
    :param int nyears: number of years - to define array
    :param str qc_flags: which QC flags to process W, B, A, N, C, R, F, E, V, M
    :param int month_index: which month to read
    :param bool diagnostics: output diagnostic information
    :param str anomalies: run code on anomalies or climatology rather than raw data
    """
    # change to do one month at a time to parallelise a little
    nmonths = 1
    print("Running Month {}".format(month_index))

    GridData, GridStations, dummy = set_up_grids(nyears, nmonths)

    # get the stations which actually have data
    # spin through all datasets
    stations = np.array([])
    for dataset in all_datasets:
        try:
            # choose appropriate subdirectory.
            if anomalies == "None":
                subdir = "formatted/indices"
            elif anomalies == "anomalies":
                subdir = "formatted/anomalies"
            elif anomalies == "climatology":
                subdir = "formatted/climatology"

            ds_stations = utils.read_inventory(dataset, subdir=subdir, final=True,
                                               timescale=timescale, index=index,
                                               anomalies=anomalies, qc_flags=qc_flags)
            good_stations = utils.select_qc_passes(ds_stations, qc_flags=qc_flags)
            stations = np.append(stations, good_stations)
            print("Adding {}, nstations = {}".format(dataset.name, len(stations)))

        except IOError:
            # file missing
            print("No stations with data for {}".format(dataset.name))

    # which lat and lon sequence does the station sit in.
    # As will be using box centres, need to send list with same length as box_centres
    assign_stations_to_grid_boxes(stations, utils.box_edge_lats[1:], utils.box_edge_lons[1:])

    lon_sequence = np.array([stn.box_lon_sequence for stn in stations])
    lat_sequence = np.array([stn.box_lat_sequence for stn in stations])

    #*********************
    # run through each grid box
    for tlats, latitude in enumerate(utils.box_centre_lats):
        print(str(tlats) + "/" + str(len(utils.box_centre_lats)), latitude)

        lat_index, = np.where(lat_sequence == tlats)
        if len(lat_index) == 0:
            # no stations at this latitude so don't bother going any further
            continue

        for tlons, longitude in enumerate(utils.box_centre_lons):

            lon_index, = np.where(lon_sequence == tlons)
            if len(lon_index) == 0:
                # no stations so don't bother
                continue

            # get the common stations to both selections (this lat and this lon sequence)
            lat_lon_match = np.intersect1d(lat_index, lon_index)
            if len(lat_lon_match) == 0:  # was "< 0", which a length can never be
                # no stations so don't bother
                continue

            # have at least one station in this grid box
            # go through each grid box
            box_data = np.ma.zeros([nyears, nmonths, len(lat_lon_match)])
            box_data.mask = np.ones(box_data.shape)

            for month in range(nmonths):
                print(len(lat_lon_match))
                for s, li in enumerate(lat_lon_match):
                    print(stations[li])
                    # read in the stations - use same routine as for ADW
                    data = get_all_data(stations[li], index, timescale, nyears, month_index)

                    if anomalies == "climatology":
                        # just read in to store
                        box_data[:, :, s] = data
                    else:
                        # calculate the anomalies
                        # back calculate times
                        good_times = utils.REFERENCEYEARS[data.mask[:, 0] == False]

                        # if no data then skip
                        if len(good_times) == 0:
                            continue

                        #*********************
                        # anomalise
                        clim_years = np.where((utils.REFERENCEYEARS >= utils.CLIM_START.year) &
                                              (utils.REFERENCEYEARS < utils.CLIM_END.year), True, False)
                        clim_data = data[clim_years]

                        #*********************
                        # check sufficient data points
                        completeness = np.ma.count(clim_data, axis=0)
                        locs, = np.where(completeness >= utils.CAM_COMPLETENESS)
                        if len(locs) == 0:
                            continue

                        # single month at a time
                        climatology = np.ma.mean(clim_data, axis=0)
                        stn_anomalies = data - climatology
                        box_data[:, :, s] = stn_anomalies

                # done all stations in the box, take the mean
                GridData[:, month, tlats, tlons] = np.ma.mean(box_data, axis=-1)[:, 0]
                GridStations[:, month, tlats, tlons] = np.ma.count(box_data, axis=-1)[:, 0]

                # need at least N stations (default=3), so mask
                insufficient_stations = np.ma.where(
                    GridStations[:, month, tlats, tlons] < utils.STATIONS_IN_BOX)
                GridData.mask[insufficient_stations, month, tlats, tlons] = True
                GridStations.mask[insufficient_stations, month, tlats, tlons] = True

    return GridData, GridStations  # cam
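
# Sketch of the anomalising step inside cam() above (illustrative only, not
# the project's code): a station series is anomalised against its own
# climatology over a reference slice, provided enough years are present; the
# completeness default stands in for utils.CAM_COMPLETENESS.
def _sketch_cam_anomaly(series, clim_slice, completeness=20):
    import numpy as np
    clim_data = series[clim_slice]
    if np.ma.count(clim_data) < completeness:
        return None  # insufficient data in the climatology period
    return series - np.ma.mean(clim_data)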
def main(indata="ghcnd", diagnostics=False): """ Read TXn and TNn and write out ETR as difference """ index = "ETR" # check if need to do monthly ones if index in utils.MONTHLY_INDICES: timescales = ["ANN", "MON"] else: timescales = ["ANN"] # get all possible datasets all_datasets = utils.get_input_datasets() # and their names names = np.array([d.name for d in all_datasets]) # if dataset selected and in the list of available, then run if indata in names: dataset = all_datasets[names == indata][0] dataset_stations = utils.read_inventory(dataset, subdir="formatted/indices") # check each station for stn in dataset_stations: if diagnostics: print("{} - {}".format(dataset.name, stn.id)) # for appropriate number of timescales for ts in timescales: if os.path.exists(os.path.join(stn.location, stn.id, "{}_{}_{}.csv".format(stn.id, "txx", ts))) and os.path.exists(os.path.join(stn.location, stn.id, "{}_{}_{}.csv".format(stn.id, "tnn", ts))): xtimes, txx = utils.read_station_index(stn, "TXx", ts) ntimes, tnn = utils.read_station_index(stn, "TNn", ts) match = np.in1d(xtimes, ntimes) match_b = np.in1d(ntimes, xtimes) if len(match) != 0 and len(match_b) != 0: etr = txx[match]-tnn[match_b] etr_times = xtimes[match] if ts == "MON": myears = [] months = [] for y in etr_times: for m in range(1, 13): myears += [y] months += [m] stn.monthly = etr.filled().reshape(-1) stn.myears = myears stn.months = months path = os.path.join(dataset.location, "formatted", "indices", stn.id, "{}_{}_MON.csv".format(stn.id, index.lower())) if not os.path.exists(path): utils.write_station_index(path, stn, "ETR", doMonthly=True) else: stn.years = etr_times stn.annual = etr.filled() path = os.path.join(dataset.location, "formatted", "indices", stn.id, "{}_{}_ANN.csv".format(stn.id, index.lower())) if not os.path.exists(path): utils.write_station_index(path, stn, "ETR") return # main
def main(diagnostics=False):
    """
    Read inventories and make scatter plot

    :param bool diagnostics: extra verbose output
    """
    # move this up one level eventually?
    all_datasets = utils.get_input_datasets()

    # set up the figure
    fig = plt.figure(figsize=(10, 6.7))
    plt.clf()
    ax = plt.axes([0.025, 0.14, 0.95, 0.90], projection=cartopy.crs.Robinson())
    ax.gridlines()  # draw_labels=True)
    ax.add_feature(cartopy.feature.LAND, zorder=0, facecolor="0.9", edgecolor="k")
    ax.coastlines()

    # dummy scatters for full extent (linewidths passed as floats, not strings)
    plt.scatter([-180, 180, 0, 0], [0, 0, -90, 90], c="w", s=1,
                transform=cartopy.crs.Geodetic(), edgecolor='w', linewidth=0.01)

    # run all datasets
    total = 0
    for dataset in all_datasets:
        try:
            # choose appropriate subdirectory.
            subdir = "formatted/indices"
            ds_stations = utils.read_inventory(dataset, subdir=subdir, final=False,
                                               timescale="", index="", anomalies="None", qc_flags="")

        except IOError:
            # file missing
            print("No stations with data for {}".format(dataset.name))
            ds_stations = []

        if len(ds_stations) > 0:
            lats = np.array([stn.latitude for stn in ds_stations])
            lons = np.array([stn.longitude for stn in ds_stations])

            # and plot
            scatter = plt.scatter(lons, lats, c=COLOURS[dataset.name], s=15,
                                  label="{} ({})".format(get_label(dataset.name), len(ds_stations)),
                                  transform=cartopy.crs.Geodetic(), edgecolor='0.5', linewidth=0.5)

            total += len(ds_stations)

    # make a legend
    leg = plt.legend(loc='lower center', ncol=5, bbox_to_anchor=(0.50, -0.34),
                     frameon=False, title="", prop={'size': 12},
                     labelspacing=0.15, columnspacing=0.5, numpoints=3)
    plt.setp(leg.get_title(), fontsize=12)

    plt.figtext(0.05, 0.92, "{} Stations".format(total))
    plt.title("HadEX3 stations")

    # and save
    outname = putils.make_filenames("station_locations", index="All", grid="ADW",
                                    anomalies="None", month="All")
    plt.savefig("{}/{}".format(utils.PLOTLOCS, outname), dpi=300)

    plt.close()

    return  # main
def main(index="TX90p", diagnostics=False): """ For all datasets, finds stations that exist for given index (and appropriate timescales) Checks for presence of data and write final station listing :param str index: which index to process :param bool diagnostics: output diagnostic information """ # check if need to do monthly ones if index in utils.MONTHLY_INDICES: timescales = ["ANN", "MON"] else: timescales = ["ANN"] # read in all datasets all_datasets = utils.get_input_datasets() # for appropriate number of timescales for ts in timescales: print("{}".format(ts)) # spin through each dataset for d, dataset in enumerate(all_datasets): dataset_stations = utils.read_inventory(dataset, subdir="formatted/indices") if diagnostics: print("{} - {}".format(dataset.name, index)) final_inventory = [] # check each station for stn in dataset_stations: if diagnostics: print("{} - {}".format(dataset.name, stn.id)) if assess_station(stn, index, ts, diagnostics=diagnostics): final_inventory += [stn] if diagnostics: print("{}\n".format(len(final_inventory))) else: if diagnostics: print("\n") # then write everything out. utils.write_climpact_inventory_header( os.path.join( dataset.location, "{}.metadata.{}.{}.txt".format(dataset.name, index, ts))) for stn in final_inventory: utils.write_climpact_inventory( os.path.join( dataset.location, "{}.metadata.{}.{}.txt".format(dataset.name, index, ts)), stn) print("{} - {} stations".format(dataset.name, len(final_inventory))) return # main
def main(index="TX90p", diagnostics=False, qc_flags="", anomalies="None"): """ Read inventories and make scatter plot :param str index: which index to run :param bool diagnostics: extra verbose output :param str qc_flags: which QC flags to process W, B, A, N, C, R, F, E, V, M :param str anomalies: run code on anomalies or climatology rather than raw data """ with open( os.path.join(utils.INFILELOCS, "{}_yearly_stations.txt".format(index)), "w") as outfile: outfile.write("{}\n".format(index)) if index in utils.MONTHLY_INDICES: timescale = ["ANN", "MON"] # allow for future! else: timescale = ["ANN"] # move this up one level eventually? all_datasets = utils.get_input_datasets() for ts in timescale: # run all datasets for d, dataset in enumerate(all_datasets): print(dataset) try: # choose appropriate subdirectory. subdir = "formatted/indices" ds_stations = utils.read_inventory(dataset, subdir=subdir, final=True, \ timescale=ts, index=index, anomalies=anomalies, qc_flags=qc_flags) ds_stations = utils.select_qc_passes(ds_stations, qc_flags=qc_flags) except IOError: # file missing print("No stations with data for {}".format(dataset.name)) ds_stations = [] # extract relevant info for this dataset if len(ds_stations) > 0: # extract values for this dataset for s, stn in enumerate(ds_stations): presence = time_presence(stn, index, ts) # year/month if s == 0: ds_presence = np.expand_dims(presence, axis=0)[:] else: ds_presence = np.append(ds_presence, np.expand_dims(presence, axis=0), axis=0) # station/year/month ds_lats = np.array([stn.latitude for stn in ds_stations]) ds_lons = np.array([stn.longitude for stn in ds_stations]) # store in overall arrays try: all_lats = np.append(all_lats, ds_lats[:], axis=0) all_lons = np.append(all_lons, ds_lons[:], axis=0) all_presence = np.append( all_presence, ds_presence[:], axis=0) # dataset*station/year/month all_dataset_names = np.append( all_dataset_names, np.array([dataset.name for i in ds_lats])) except NameError: # if not yet defined, then set up all_lats = ds_lats[:] all_lons = ds_lons[:] all_presence = ds_presence[:] all_dataset_names = np.array( [dataset.name for i in ds_lats]) for y, year in enumerate(utils.REFERENCEYEARS): # set up the figure fig = plt.figure(figsize=(10, 6.5)) plt.clf() ax = plt.axes([0.025, 0.10, 0.95, 0.90], projection=cartopy.crs.Robinson()) ax.gridlines() #draw_labels=True) ax.add_feature(cartopy.feature.LAND, zorder=0, facecolor="0.9", edgecolor="k") ax.coastlines() # dummy scatters for full extent plt.scatter([-180, 180, 0, 0], [0, 0, -90, 90], c="w", s=1, transform=cartopy.crs.Geodetic(), \ edgecolor='w', linewidth='0.01') total = 0 for dataset in all_datasets: ds, = np.where(all_dataset_names == dataset.name) locs, = np.where(all_presence[ds, y, 0] == 1) if len(locs) > 0: plt.scatter(all_lons[ds][locs], all_lats[ds][locs], c=ps.COLOURS[dataset.name], \ s=15, label="{} ({})".format(ps.get_label(dataset.name), len(locs)), \ transform=cartopy.crs.Geodetic(), edgecolor='0.5', linewidth='0.5') total += len(locs) else: # aiming to show all, even if zero plt.scatter([-180], [-90], c=ps.COLOURS[dataset.name], s=15, \ label="{} ({})".format(ps.get_label(dataset.name), len(locs)), \ transform=cartopy.crs.Geodetic(), edgecolor='0.5', linewidth='0.5') time.sleep(1) # make a legend leg = plt.legend(loc='lower center', ncol=6, bbox_to_anchor=(0.50, -0.25), frameon=False, \ title="", prop={'size':10}, labelspacing=0.15, columnspacing=0.5, numpoints=3) plt.setp(leg.get_title(), fontsize=12) plt.figtext(0.05, 0.92, "{} Stations".format(total)) 
plt.title("{} - {} - {}".format(index, ts, year)) # and save outname = putils.make_filenames("station_locations_{}_{}".format( ts.capitalize(), year), index=index, grid="ADW", anomalies=anomalies) plt.savefig("{}/{}/{}".format(utils.PLOTLOCS, index, outname)) plt.close() plt.clf() print("{} done".format(year)) # write out total station number with open( os.path.join(utils.INFILELOCS, "{}_yearly_stations.txt".format(index)), "a") as outfile: outfile.write("{} {}\n".format(year, total)) time.sleep(1) # reset namespace del all_lats del all_lons del all_presence del all_dataset_names return # main