def find_monthly_scaling(obs_var, station, config_file, diagnostics=False):
    """
    Find scaling parameters for monthly values and store in config file

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool diagnostics: turn on diagnostic output
    """
    all_years = np.unique(station.years)

    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var, station, month, diagnostics=diagnostics)

        if len(month_averages.compressed()) >= VALID_MONTHS:

            # have months, now to standardise
            climatology = utils.average(month_averages)  # mean
            spread = utils.spread(month_averages)  # IQR currently
            if spread < SPREAD_LIMIT:
                spread = SPREAD_LIMIT

            # write out the scaling...
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-clim".format(month), "{}".format(climatology), diagnostics=diagnostics)
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-spread".format(month), "{}".format(spread), diagnostics=diagnostics)

        else:
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-clim".format(month), "{}".format(utils.MDI), diagnostics=diagnostics)
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-spread".format(month), "{}".format(utils.MDI), diagnostics=diagnostics)

    return  # find_monthly_scaling
def find_thresholds(obs_var, station, config_file, plots=False, diagnostics=False, winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    # get hourly climatology for each month
    for month in range(1, 13):

        variances = prepare_data(obs_var, station, month, diagnostics=diagnostics, winsorize=winsorize)

        if len(variances.compressed()) >= MIN_VARIANCES:
            average_variance = utils.average(variances)
            variance_spread = utils.spread(variances)
        else:
            average_variance = utils.MDI
            variance_spread = utils.MDI

        utils.write_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                              "{}-average".format(month), "{}".format(average_variance), diagnostics=diagnostics)
        utils.write_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                              "{}-spread".format(month), "{}".format(variance_spread), diagnostics=diagnostics)

    return  # find_thresholds
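
# The prepare_data used by find_thresholds and variance_check is not shown in
# this excerpt; from its use it is expected to return one (possibly masked)
# variance-like value per year for the given calendar month.  It is NOT the
# anomaly-building prepare_data defined further down, which belongs to the
# climatological check.  A minimal sketch under that assumption, with an
# IQR-based stand-in for the variance measure (hypothetical helper, for
# illustration only):
def _demo_monthly_variances(obs_var, station, month):
    import numpy as np

    all_years = np.unique(station.years)
    variances = np.ma.masked_all(all_years.shape[0])

    for y, year in enumerate(all_years):
        ym_locs, = np.where(np.logical_and(station.months == month,
                                           station.years == year))
        year_data = obs_var.data[ym_locs]
        if len(year_data.compressed()) > 0:
            # robust "variance": interquartile range of this year's data
            q75, q25 = np.percentile(year_data.compressed(), [75, 25])
            variances[y] = q75 - q25

    return variances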
def identify_values(sealp, stnlp, times, config_file, plots=False, diagnostics=False):
    """
    Find average and spread of differences

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        average = utils.average(difference)
        spread = utils.spread(difference)
        if spread < MIN_SPREAD:  # less than XhPa
            spread = MIN_SPREAD
        elif spread > MAX_SPREAD:  # more than XhPa
            spread = MAX_SPREAD

        utils.write_qc_config(config_file, "PRESSURE", "average",
                              "{}".format(average), diagnostics=diagnostics)
        utils.write_qc_config(config_file, "PRESSURE", "spread",
                              "{}".format(spread), diagnostics=diagnostics)

    return  # identify_values
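
# A small usage sketch (hypothetical, for illustration) of the clamping logic
# in identify_values: the spread of the sealp-stnlp differences is kept within
# [MIN_SPREAD, MAX_SPREAD] so the downstream pressure_offset thresholds are
# neither vanishingly tight nor uselessly loose.  The median/IQR stand-ins for
# utils.average and utils.spread are assumptions about those helpers.
def _demo_spread_clamp():
    import numpy as np

    # synthetic sea-level minus station-level pressure differences (hPa)
    difference = np.ma.masked_invalid([10.1, 10.3, np.nan, 9.8, 10.0, 10.2])

    average = np.ma.median(difference)
    q75, q25 = np.percentile(difference.compressed(), [75, 25])
    spread = q75 - q25

    # same clamp as identify_values
    spread = max(spread, MIN_SPREAD)
    spread = min(spread, MAX_SPREAD)

    return average, spread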
def variance_check(obs_var, station, config_file, plots=False, diagnostics=False, winsorize=True):
    """
    Flag months where the variance of the observations is anomalous relative
    to the climatological distribution of monthly variances stored in the
    config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    # get hourly climatology for each month
    for month in range(1, 13):

        month_locs, = np.where(station.months == month)

        variances = prepare_data(obs_var, station, month, diagnostics=diagnostics, winsorize=winsorize)

        try:
            average_variance = float(utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name), "{}-average".format(month)))
            variance_spread = float(utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name), "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var, station, config_file, plots=plots, diagnostics=diagnostics)
            average_variance = float(utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name), "{}-average".format(month)))
            variance_spread = float(utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name), "{}-spread".format(month)))

        if average_variance == utils.MDI and variance_spread == utils.MDI:
            # couldn't be calculated, move on
            continue

        bad_years, = np.where(np.abs(variances - average_variance) / variance_spread > SPREAD_THRESHOLD)

        # prepare wind and pressure data in case needed to check for storms
        if obs_var.name in ["station_level_pressure", "sea_level_pressure", "wind_speed"]:

            wind_monthly_data = station.wind_speed.data[month_locs]
            if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                pressure_monthly_data = obs_var.data[month_locs]
            else:
                pressure_monthly_data = station.sea_level_pressure.data[month_locs]

            if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                    len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                # need sufficient data to work with for storm check to work, else can't tell
                # move on
                continue

            wind_average = utils.average(wind_monthly_data)
            wind_spread = utils.spread(wind_monthly_data)

            pressure_average = utils.average(pressure_monthly_data)
            pressure_spread = utils.spread(pressure_monthly_data)

        # go through each bad year for this month
        all_years = np.unique(station.years)
        for year in bad_years:

            # corresponding locations
            ym_locs, = np.where(np.logical_and(station.months == month, station.years == all_years[year]))

            # if pressure or wind speed, need to do some further checking before applying flags
            if obs_var.name in ["station_level_pressure", "sea_level_pressure", "wind_speed"]:

                # pull out the data
                wind_data = station.wind_speed.data[ym_locs]
                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    pressure_data = obs_var.data[ym_locs]
                else:
                    pressure_data = station.sea_level_pressure.data[ym_locs]

                # need sufficient data to work with for storm check to work, else can't tell
                if len(pressure_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                        len(wind_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                    # move on
                    continue

                # find locations of high wind speeds and low pressures, cross match
                high_winds, = np.ma.where((wind_data - wind_average) / wind_spread > STORM_THRESHOLD)
                low_pressures, = np.ma.where((pressure_average - pressure_data) / pressure_spread > STORM_THRESHOLD)

                match = np.in1d(high_winds, low_pressures)

                couldbe_storm = False
                if len(match) > 0:
                    # this could be a storm, either at tropical station (relatively constant pressure)
                    # or out of season in mid-latitudes.
                    couldbe_storm = True

                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    diffs = np.ma.diff(pressure_data)
                elif obs_var.name == "wind_speed":
                    diffs = np.ma.diff(wind_data)

                # count up the largest number of sequential negative and positive differences
                negs, poss = 0, 0
                biggest_neg, biggest_pos = 0, 0

                for diff in diffs:
                    if diff > 0:
                        if negs > biggest_neg:
                            biggest_neg = negs
                        negs = 0
                        poss += 1
                    else:
                        if poss > biggest_pos:
                            biggest_pos = poss
                        poss = 0
                        negs += 1

                if (biggest_neg < 10) and (biggest_pos < 10) and not couldbe_storm:
                    # insufficient to identify as a storm (HadISD values)
                    # leave flags set
                    pass
                else:
                    # could be a storm, so better to leave this month unflagged
                    # zero length array to flag
                    ym_locs = np.ma.array([])

            # copy over the flags, if any
            if len(ym_locs) != 0:
                # and set the flags
                flags[ym_locs] = "V"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            scaled_variances = (variances - average_variance) / variance_spread
            bins = utils.create_bins(scaled_variances, 0.25, obs_var.name)
            hist, bin_edges = np.histogram(scaled_variances, bins)

            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Months")
            plt.xlabel("Scaled {} Variances".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(SPREAD_THRESHOLD, c="r")
            plt.axvline(-SPREAD_THRESHOLD, c="r")

            bad_hist, dummy = np.histogram(scaled_variances[bad_years], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Variance {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return  # variance_check
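
# A standalone sketch of the run-counting loop inside variance_check: the
# longest runs of consecutive falls and rises in the pressure (or wind) trace
# indicate a storm-like ramp, since a deep drop and recovery produce long
# monotonic stretches while noise does not.  This mirrors the in-line loop
# above (a hypothetical helper, not part of the QC suite); unlike that loop
# it also closes the final run.
def _demo_longest_runs(series):
    import numpy as np

    diffs = np.ma.diff(np.ma.asarray(series, dtype=float))

    negs, poss = 0, 0
    biggest_neg, biggest_pos = 0, 0
    for diff in diffs:
        if diff > 0:
            # a rise ends any run of falls
            biggest_neg = max(biggest_neg, negs)
            negs = 0
            poss += 1
        else:
            # a fall (or tie) ends any run of rises
            biggest_pos = max(biggest_pos, poss)
            poss = 0
            negs += 1

    # close whichever run the series ended on
    return max(biggest_neg, negs), max(biggest_pos, poss)

# e.g. a V-shaped pressure trace, 12 falls then 11 rises:
#     _demo_longest_runs(list(range(1000, 940, -5)) + list(range(940, 1000, 5)))
# returns (12, 11), comfortably above the HadISD-derived run length of 10
# used in variance_check.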
def pressure_offset(sealp, stnlp, times, config_file, plots=False, diagnostics=False):
    """
    Flag locations where difference between station and sea-level pressure
    falls outside of bounds

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(sealp.data.shape[0])])

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        try:
            average = float(utils.read_qc_config(config_file, "PRESSURE", "average"))
            spread = float(utils.read_qc_config(config_file, "PRESSURE", "spread"))
        except KeyError:
            print("Information missing in config file")
            average = utils.average(difference)
            spread = utils.spread(difference)
            if spread < MIN_SPREAD:  # less than XhPa
                spread = MIN_SPREAD
            elif spread > MAX_SPREAD:  # more than XhPa
                spread = MAX_SPREAD

            utils.write_qc_config(config_file, "PRESSURE", "average",
                                  "{}".format(average), diagnostics=diagnostics)
            utils.write_qc_config(config_file, "PRESSURE", "spread",
                                  "{}".format(spread), diagnostics=diagnostics)

        if np.abs(np.ma.mean(difference) - np.ma.median(difference)) > THRESHOLD * spread:
            if diagnostics:
                print("Large difference between mean and median")
                print("Likely to have two populations of roughly equal size")
                print("Test won't work")
            pass
        else:
            high, = np.ma.where(difference > (average + (THRESHOLD * spread)))
            low, = np.ma.where(difference < (average - (THRESHOLD * spread)))

            # diagnostic plots
            if plots:
                bins = np.arange(np.round(difference.min()) - 1, np.round(difference.max()) + 1, 0.1)
                import matplotlib.pyplot as plt

                plt.clf()
                plt.hist(difference.compressed(), bins=bins)
                plt.axvline(x=(average + (THRESHOLD * spread)), ls="--", c="r")
                plt.axvline(x=(average - (THRESHOLD * spread)), ls="--", c="r")
                plt.xlim([bins[0] - 1, bins[-1] + 1])
                plt.ylabel("Observations")
                plt.xlabel("Difference (hPa)")
                plt.show()

            if len(high) != 0:
                flags[high] = "p"
                if diagnostics:
                    print("Pressure {}".format(stnlp.name))
                    print("   Number of high differences {}".format(len(high)))
                if plots:
                    for bad in high:
                        plot_pressure(sealp, stnlp, times, bad)

            if len(low) != 0:
                flags[low] = "p"
                if diagnostics:
                    print("   Number of low differences {}".format(len(low)))
                if plots:
                    for bad in low:
                        plot_pressure(sealp, stnlp, times, bad)

    # only flag the station level pressure
    stnlp.flags = utils.insert_flags(stnlp.flags, flags)

    if diagnostics:
        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return  # pressure_offset
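
# A numeric sketch (hypothetical, for illustration) of the mean-versus-median
# guard in pressure_offset: with two populations of roughly equal size the
# mean is pulled between the modes while the median stays inside the larger
# one, so their separation, measured in units of spread, is far larger than
# for unimodal data and the test is skipped.  The IQR stand-in for
# utils.spread is an assumption about that helper.
def _demo_bimodal_guard():
    import numpy as np

    rng = np.random.default_rng(0)

    def separation_ratio(sample):
        q75, q25 = np.percentile(sample, [75, 25])
        return abs(sample.mean() - np.median(sample)) / (q75 - q25)

    # unimodal differences: mean and median nearly coincide
    unimodal = rng.normal(10.0, 1.0, 1000)

    # bimodal differences, e.g. after a change in reporting practice
    bimodal = np.concatenate([rng.normal(10.0, 1.0, 600),
                              rng.normal(30.0, 1.0, 400)])

    # the second ratio is tens of times larger than the first
    return separation_ratio(unimodal), separation_ratio(bimodal)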
def prepare_data(obs_var, station, month, diagnostics=False, winsorize=True):
    """
    Prepare the data for the climatological check.  Makes anomalies and applies low-pass filter

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: which month to run on
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    anomalies = np.ma.zeros(obs_var.data.shape[0])
    anomalies.mask = np.ones(anomalies.shape[0])
    normed_anomalies = np.ma.copy(anomalies)

    mlocs, = np.where(station.months == month)
    nyears = len(np.unique(station.years[mlocs]))

    # need to have some data, spanning at least 5 years
    if len(mlocs) >= utils.DATA_COUNT_THRESHOLD and nyears >= 5:

        anomalies.mask[mlocs] = False
        normed_anomalies.mask[mlocs] = False

        hourly_clims = np.ma.zeros(24)
        hourly_clims.mask = np.ones(24)
        for hour in range(24):

            # calculate climatology
            hlocs, = np.where(np.logical_and(station.months == month, station.hours == hour))

            hour_data = obs_var.data[hlocs]

            if winsorize:
                if len(hour_data.compressed()) > 10:
                    hour_data = utils.winsorize(hour_data, 5)

            if len(hour_data) >= utils.DATA_COUNT_THRESHOLD:
                hourly_clims[hour] = np.ma.mean(hour_data)
                hourly_clims.mask[hour] = False

            # make anomalies - keeping the order
            anomalies[hlocs] = obs_var.data[hlocs] - hourly_clims[hour]

        # if insufficient data at each hour, then no anomalies calculated
        if len(anomalies[mlocs].compressed()) >= utils.DATA_COUNT_THRESHOLD:

            # for the month, normalise anomalies by spread
            spread = utils.spread(anomalies[mlocs])
            if spread < 1.5:
                spread = 1.5

            normed_anomalies[mlocs] = anomalies[mlocs] / spread

            # apply low pass filter derived from monthly values
            all_years = np.unique(station.years)
            monthly_anoms = np.ma.zeros(all_years.shape[0])
            for y, year in enumerate(all_years):
                ylocs, = np.where(station.years == year)
                year_data = obs_var.data[ylocs]
                monthly_anoms[y] = utils.average(year_data)

            lp_filtered_anomalies = low_pass_filter(normed_anomalies, station, monthly_anoms, month)

            return lp_filtered_anomalies

        else:
            # insufficient anomalies could be calculated
            return anomalies

    else:
        # insufficient data for this month
        return anomalies  # prepare_data
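
# utils.winsorize is not shown in this excerpt; a minimal sketch of 5%/95%
# winsorization as prepare_data is assumed to use it: values beyond the 5th
# and 95th percentiles are reset to those percentiles, damping the influence
# of extreme hours on the hourly climatology without discarding data (the
# real helper may differ in detail).
def _demo_winsorize(data, pct=5):
    import numpy as np

    lower = np.percentile(data.compressed(), pct)
    upper = np.percentile(data.compressed(), 100 - pct)
    return np.ma.clip(data, lower, upper)

# e.g. _demo_winsorize(np.ma.masked_invalid(np.arange(21.)))
# caps 0. at 1. and 20. at 19., leaving the bulk of the values untouched.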
def all_obs_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for month and find secondary populations in distribution.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var, station, month, config_file, full=False, diagnostics=diagnostics)

        if len(normalised_anomalies.compressed()) == 1 and normalised_anomalies[0] == utils.MDI:
            # no data to work with for this month, move on.
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies, bins)

        try:
            upper_threshold = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-uthresh".format(month)))
            lower_threshold = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-lthresh".format(month)))
        except KeyError:
            print("Information missing in config file")
            # NB: this call must repopulate the ADISTRIBUTION entries read back
            # below; the find_thresholds defined earlier in this excerpt writes
            # the VARIANCE entries, so the distribution-specific version is
            # assumed here.
            find_thresholds(obs_var, station, config_file, plots=plots, diagnostics=diagnostics)
            upper_threshold = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-uthresh".format(month)))
            lower_threshold = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-lthresh".format(month)))

        if upper_threshold == utils.MDI and lower_threshold == utils.MDI:
            # these weren't able to be calculated, move on
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            continue

        # now to find the gaps
        uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
        lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

        month_locs, = np.where(station.months == month)  # append should keep year order

        if uppercount > 0:
            gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies > gap_start)  # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"
                flags[month_locs] = month_flags

        if lowercount > 0:
            gap_start = utils.find_gap(hist, bins, lower_threshold, GAP_SIZE, upwards=False)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies < gap_start)  # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"

                # TODO - can this bit be refactored?
                # for pressure data, see if the flagged obs correspond with high winds
                # could be a storm signal
                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:

                    wind_monthly_data = prepare_monthly_data(station.wind_speed, station, month)
                    pressure_monthly_data = prepare_monthly_data(obs_var, station, month)

                    if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                            len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                        # need sufficient data to work with for storm check to work, else can't tell
                        pass
                    else:
                        wind_monthly_average = utils.average(wind_monthly_data)
                        wind_monthly_spread = utils.spread(wind_monthly_data)

                        pressure_monthly_average = utils.average(pressure_monthly_data)
                        pressure_monthly_spread = utils.spread(pressure_monthly_data)

                        # already a single calendar month, so go through each year
                        all_years = np.unique(station.years)
                        for year in all_years:

                            # what's best - extract only when necessary but repeatedly if so, or always, but once
                            this_year_locs = np.where(station.years[month_locs] == year)

                            if "d" not in month_flags[this_year_locs]:
                                # skip if you get the chance
                                continue

                            wind_data = station.wind_speed.data[month_locs][this_year_locs]
                            pressure_data = obs_var.data[month_locs][this_year_locs]

                            storms, = np.ma.where(np.logical_and(
                                ((wind_data - wind_monthly_average) / wind_monthly_spread) > STORM_THRESHOLD,
                                ((pressure_monthly_average - pressure_data) / pressure_monthly_spread) > STORM_THRESHOLD))

                            # more than one entry - check if separate events
                            if len(storms) >= 2:
                                # find where separation more than the usual obs separation
                                storm_1diffs = np.ma.diff(storms)
                                separations, = np.where(storm_1diffs > np.ma.median(np.ma.diff(wind_data)))

                                if len(separations) != 0:
                                    # multiple storm signals
                                    storm_start = 0
                                    storm_finish = separations[0] + 1
                                    first_storm = expand_around_storms(storms[storm_start:storm_finish], len(wind_data))
                                    final_storm_locs = copy.deepcopy(first_storm)

                                    for j in range(len(separations)):
                                        # then do the rest in a loop
                                        if j + 1 == len(separations):
                                            # final one
                                            this_storm = expand_around_storms(storms[separations[j] + 1:], len(wind_data))
                                        else:
                                            this_storm = expand_around_storms(storms[separations[j] + 1: separations[j + 1] + 1], len(wind_data))

                                        final_storm_locs = np.append(final_storm_locs, this_storm)

                                else:
                                    # locations separated at same interval as data
                                    final_storm_locs = expand_around_storms(storms, len(wind_data))

                            # single entry
                            elif len(storms) != 0:
                                # expand around the storm signal (rather than just
                                # unflagging what could be the peak and leaving the
                                # entry/exit flagged)
                                final_storm_locs = expand_around_storms(storms, len(wind_data))

                            # unset the flags (two-step update: chained fancy
                            # indexing would write to a copy and be lost)
                            if len(storms) > 0:
                                this_year_flags = month_flags[this_year_locs]
                                this_year_flags[final_storm_locs] = ""
                                month_flags[this_year_locs] = this_year_flags

                # having checked for storms now store final flags
                flags[month_locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")

            bad_locs, = np.where(flags[month_locs] == "d")
            bad_hist, dummy = np.histogram(normalised_anomalies[bad_locs], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Distribution (all) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return  # all_obs_gap
def prepare_all_data(obs_var, station, month, config_file, full=False, diagnostics=False):
    """
    Extract data for the month, make & store or read average and spread.
    Use to calculate normalised anomalies.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: month to process
    :param str config_file: configuration file to store critical values
    :param bool full: recalculate and store the scaling parameters
    :param bool diagnostics: turn on diagnostic output
    """
    month_locs, = np.where(station.months == month)

    all_month_data = obs_var.data[month_locs]

    if full:
        if len(all_month_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
            # have data, now to standardise
            climatology = utils.average(all_month_data)  # mean
            spread = utils.spread(all_month_data)  # IQR currently
        else:
            climatology = utils.MDI
            spread = utils.MDI

        # write out the scaling...
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-clim".format(month), "{}".format(climatology), diagnostics=diagnostics)
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-spread".format(month), "{}".format(spread), diagnostics=diagnostics)

    else:
        try:
            climatology = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-clim".format(month)))
            spread = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-spread".format(month)))
        except KeyError:
            if len(all_month_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
                # have data, now to standardise
                climatology = utils.average(all_month_data)  # mean
                spread = utils.spread(all_month_data)  # IQR currently
            else:
                climatology = utils.MDI
                spread = utils.MDI

            # write out the scaling...
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-clim".format(month), "{}".format(climatology), diagnostics=diagnostics)
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-spread".format(month), "{}".format(spread), diagnostics=diagnostics)

    if climatology == utils.MDI and spread == utils.MDI:
        # these weren't calculable, move on
        return np.ma.array([utils.MDI])
    elif spread == 0:
        # all the same value, avoid dividing by zero
        return all_month_data - climatology
    else:
        return (all_month_data - climatology) / spread  # prepare_all_data
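
# A usage sketch of the full=True / full=False round trip above (hypothetical
# config path; obs_var and station as elsewhere in this module): a first pass
# writes the monthly climatology and spread into the config file, after which
# later passes, such as all_obs_gap, read them back instead of recomputing.
def _demo_two_pass(obs_var, station, config_file="station_qc.config"):
    # pass 1: compute and store the scaling parameters for every month
    for month in range(1, 13):
        prepare_all_data(obs_var, station, month, config_file, full=True)

    # pass 2: reuse the stored parameters to get normalised anomalies
    return prepare_all_data(obs_var, station, 1, config_file, full=False)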