def get_repeating_string_threshold(obs_var, config_file, plots=False, diagnostics=False):
    """
    Use distribution to determine threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # mask calm periods (as these could be a reasonable string)
    this_var = copy.deepcopy(obs_var)
    if obs_var.name == "wind_speed":
        calms, = np.ma.where(this_var.data == 0)
        this_var.data[calms] = utils.MDI
        this_var.data.mask[calms] = True

    # only process further if there is enough data
    if len(this_var.data.compressed()) > 1:

        repeated_string_lengths, grouped_diffs, strings = prepare_data_repeating_string(this_var, plots=plots, diagnostics=diagnostics)

        # bin width is 1 as dealing in time index
        # minimum bin value is 2 as this is the shortest string possible
        threshold = utils.get_critical_values(repeated_string_lengths, binmin=2, binwidth=1.0,
                                              plots=plots, diagnostics=diagnostics,
                                              title=this_var.name.capitalize(),
                                              xlabel="Repeating string length")

        # write out the thresholds...
        utils.write_qc_config(config_file, "STREAK-{}".format(this_var.name),
                              "Straight", "{}".format(threshold), diagnostics=diagnostics)
    else:
        # store high value so threshold never reached
        utils.write_qc_config(config_file, "STREAK-{}".format(this_var.name),
                              "Straight", "{}".format(-utils.MDI), diagnostics=diagnostics)

    return  # get_repeating_string_threshold

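# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how calling code might use the threshold stored
# above, assuming the module-level `utils` import. The function and
# variable names here are hypothetical.
def _example_streak_threshold(wind_speed_var, config_file):
    # derive and store the streak threshold for this variable
    get_repeating_string_threshold(wind_speed_var, config_file)
    # read it back, as the streak check itself would
    threshold = float(utils.read_qc_config(config_file,
                                           "STREAK-{}".format(wind_speed_var.name),
                                           "Straight"))
    # runs of identical values longer than this are then treated as suspect
    return threshold
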
def find_thresholds(obs_var, station, config_file, plots=False, diagnostics=False, winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    # get hourly climatology for each month
    for month in range(1, 13):

        variances = prepare_data(obs_var, station, month, diagnostics=diagnostics, winsorize=winsorize)

        if len(variances.compressed()) >= MIN_VARIANCES:
            average_variance = utils.average(variances)
            variance_spread = utils.spread(variances)
        else:
            average_variance = utils.MDI
            variance_spread = utils.MDI

        utils.write_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                              "{}-average".format(month), "{}".format(average_variance),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                              "{}-spread".format(month), "{}".format(variance_spread),
                              diagnostics=diagnostics)

    return  # find_thresholds

def find_monthly_scaling(obs_var, station, config_file, diagnostics=False):
    """
    Find scaling parameters for monthly values and store in config file

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool diagnostics: turn on diagnostic output
    """

    all_years = np.unique(station.years)

    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var, station, month, diagnostics=diagnostics)

        if len(month_averages.compressed()) >= VALID_MONTHS:

            # have months, now to standardise
            climatology = utils.average(month_averages)  # mean
            spread = utils.spread(month_averages)  # IQR currently
            if spread < SPREAD_LIMIT:
                spread = SPREAD_LIMIT

            # write out the scaling...
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-clim".format(month), "{}".format(climatology),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-spread".format(month), "{}".format(spread),
                                  diagnostics=diagnostics)
        else:
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-clim".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "MDISTRIBUTION-{}".format(obs_var.name),
                                  "{}-spread".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)

    return  # find_monthly_scaling

def identify_values(sealp, stnlp, times, config_file, plots=False, diagnostics=False):
    """
    Find average and spread of differences

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        average = utils.average(difference)
        spread = utils.spread(difference)
        # clip the spread to the range [MIN_SPREAD, MAX_SPREAD] hPa
        if spread < MIN_SPREAD:
            spread = MIN_SPREAD
        elif spread > MAX_SPREAD:
            spread = MAX_SPREAD

        utils.write_qc_config(config_file, "PRESSURE", "average",
                              "{}".format(average), diagnostics=diagnostics)
        utils.write_qc_config(config_file, "PRESSURE", "spread",
                              "{}".format(spread), diagnostics=diagnostics)

    return  # identify_values

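# --- Illustrative usage sketch (hypothetical calling code) ---
# Shows the intended pairing of identify_values() with pressure_offset()
# further below: the first pass stores the average and clipped spread of
# the sea-level minus station-level differences, and the flagging pass
# reads them back. Assumes the module-level `utils` import.
def _example_identify_pressure_values(sealp, stnlp, times, config_file):
    identify_values(sealp, stnlp, times, config_file, diagnostics=True)
    average = float(utils.read_qc_config(config_file, "PRESSURE", "average"))
    spread = float(utils.read_qc_config(config_file, "PRESSURE", "spread"))
    return average, spread
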
def find_offset(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Find the best offset for a sine curve to represent the diurnal cycle

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    best_fit_diurnal, best_fit_uncertainty = prepare_data(station, obs_var)

    # done complete record, have best fit for each day
    #   now to find best overall fit
    # find median offset for each uncertainty range from 1 to 6 hours
    best_fits = MISSING * np.ones(6).astype(int)
    for h in range(6):
        locs, = np.where(best_fit_uncertainty == h + 1)

        if len(locs) >= utils.DATA_COUNT_THRESHOLD:
            best_fits[h] = np.median(best_fit_diurnal[locs])

    # now go through each of the 6hrs of uncertainty and see if the range
    #   of the best fit +/- uncertainty overlap across them.
    # if they do, it's a well defined cycle, if not, then there's a problem

    # build up range of cycles, incl. uncertainty, to find where best of best located
    hours = np.arange(24)
    hour_matches = np.zeros(24)
    diurnal_peak = MISSING
    number_estimates = 0
    for h in range(6):
        if best_fits[h] != MISSING:

            # store lowest uncertainty best fit as first guess
            if diurnal_peak == MISSING:
                diurnal_peak = best_fits[h]
                hours = np.roll(hours, 11 - int(diurnal_peak))
                hour_matches[11 - (h + 1): 11 + (h + 2)] = 1
                number_estimates += 1

            # get spread of uncertainty, and +1 to this range
            centre, = np.where(hours == best_fits[h])

            if (centre[0] - (h + 1)) >= 0:
                if (centre[0] + h + 1) <= 23:
                    hour_matches[centre[0] - (h + 1): centre[0] + (h + 2)] += 1
                else:
                    hour_matches[centre[0] - (h + 1):] += 1  # back part
                    hour_matches[: centre[0] + (h + 2) - 24] += 1  # front part
            else:
                hour_matches[: centre[0] + h + 2] += 1  # front part
                hour_matches[centre[0] - (h + 1):] += 1  # back part

            number_estimates += 1

    # if value at lowest uncertainty not found in all others,
    #   then see what value is found by all others
    if hour_matches[11] != number_estimates:  # central estimate at 12 o'clock
        all_match, = np.where(hour_matches == number_estimates)

        # if one is, then use it
        if len(all_match) > 0:
            diurnal_peak = all_match[0]
        else:
            diurnal_peak = MISSING
            if diagnostics:
                print("Good fit to diurnal cycle not found")

    # now have value for best fit diurnal offset
    utils.write_qc_config(config_file, "DIURNAL-{}".format(obs_var.name),
                          "peak", "{}".format(diurnal_peak), diagnostics=diagnostics)

    return best_fit_diurnal, best_fit_uncertainty  # find_offset

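# --- Illustrative sketch of the wrap-around vote counting above ---
# Standalone (plain numpy) demonstration of the indexing scheme: the hour
# axis is rolled so the first-guess peak sits at index 11, then each
# estimate votes for peak +/- uncertainty, wrapping across midnight where
# needed. The peak hours and uncertainty used here are made up.
def _example_hour_voting():
    import numpy as np
    hours = np.arange(24)
    hour_matches = np.zeros(24)
    first_guess = 22                            # hypothetical best-fit peak hour
    hours = np.roll(hours, 11 - first_guess)    # centre the first guess at index 11
    centre, = np.where(hours == 23)             # a second estimate at hour 23...
    h = 1                                       # ...with +/- 2 hour uncertainty (h + 1)
    hour_matches[centre[0] - (h + 1): centre[0] + (h + 2)] += 1
    return hours, hour_matches
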
def get_critical_values(obs_var, times, config_file, plots=False, diagnostics=False):
    """
    Use distribution to determine critical values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # use all first differences
    # TODO monthly?
    masked_times = np.ma.masked_array(times, mask=obs_var.data.mask)

    time_diffs = np.ma.diff(masked_times) / np.timedelta64(1, "m")  # presuming minutes
    value_diffs = np.ma.diff(obs_var.data)

    # get thresholds for each unique time difference
    unique_diffs = np.unique(time_diffs.compressed())

    for t_diff in unique_diffs:
        if t_diff == 0:
            # not a spike or jump, but 2 values at the same time.
            # should be a zero value difference, so fitting histogram not going to work
            # handled in separate test
            continue

        locs, = np.where(time_diffs == t_diff)

        first_differences = value_diffs[locs]

        # ensure sufficient non-masked observations
        if len(first_differences.compressed()) >= utils.DATA_COUNT_THRESHOLD:

            # fit decay curve to one-sided distribution
            c_value = utils.get_critical_values(first_differences.compressed(),
                                                binmin=0, binwidth=0.5,
                                                plots=plots, diagnostics=diagnostics,
                                                xlabel="First differences",
                                                title="Spike - {} - {}m".format(obs_var.name.capitalize(), t_diff))

            # write out the thresholds...
            utils.write_qc_config(config_file, "SPIKE-{}".format(obs_var.name),
                                  "{}".format(t_diff), "{}".format(c_value),
                                  diagnostics=diagnostics)

            if diagnostics:
                print("   Time Difference: {} minutes".format(t_diff))
                print("      Number of obs: {}, threshold: {}".format(len(first_differences.compressed()), c_value))
        else:
            if diagnostics:
                print("   Time Difference: {} minutes".format(t_diff))
                print("      Number of obs insufficient: {} < {}".format(len(first_differences.compressed()), utils.DATA_COUNT_THRESHOLD))

    return  # get_critical_values

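# --- Illustrative sketch of the grouping above (synthetic data) ---
# First differences are binned by the time gap between consecutive
# reports, so that (for example) hourly and 3-hourly reporting each get
# their own spike threshold. Plain numpy, runnable standalone.
def _example_time_difference_grouping():
    import numpy as np
    times = np.array([0, 60, 120, 300, 360], dtype="datetime64[m]")
    values = np.array([10.0, 10.5, 11.0, 12.0, 12.2])
    time_diffs = np.diff(times) / np.timedelta64(1, "m")   # [60., 60., 180., 60.]
    value_diffs = np.diff(values)
    for t_diff in np.unique(time_diffs):
        locs, = np.where(time_diffs == t_diff)
        print(t_diff, value_diffs[locs])   # one candidate distribution per gap
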
def identify_values(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use distribution to identify frequent values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # TODO - do we want to go down the road of allowing resolution (and hence test)
    #   to vary over the p-o-r?  I.e. 1C in early, to 0.5C to 0.1C in different decades?

    utils.write_qc_config(config_file, "FREQUENT-{}".format(obs_var.name),
                          "width", "{}".format(BIN_WIDTH), diagnostics=diagnostics)

    for month in range(1, 13):

        locs, = np.where(station.months == month)

        month_data = obs_var.data[locs]

        if len(month_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
            # insufficient data, so write out an empty list and move on
            utils.write_qc_config(config_file, "FREQUENT-{}".format(obs_var.name),
                                  "{}".format(month), "[]", diagnostics=diagnostics)
            continue

        # adjust bin widths according to reporting accuracy
        resolution = utils.reporting_accuracy(month_data)
        if resolution <= 0.5:
            bins = utils.create_bins(month_data, 0.5, obs_var.name)
        else:
            bins = utils.create_bins(month_data, 1.0, obs_var.name)

        hist, bin_edges = np.histogram(month_data, bins)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")
            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

        # scan through the histogram:
        #   check if a bin is the maximum of a local area ("ROLLING")
        suspect = []
        for b, bar in enumerate(hist):
            if (b > ROLLING // 2) and (b <= (len(hist) - ROLLING // 2)):
                target_bins = hist[b - (ROLLING // 2): b + (ROLLING // 2) + 1]

                # if sufficient obs, maximum, and contains > 50%, but not all, of the data
                if bar >= utils.DATA_COUNT_THRESHOLD:
                    if bar == target_bins.max():
                        if (bar / target_bins.sum()) > RATIO:
                            suspect += [bins[b]]

        # diagnostic plots
        if plots:
            bad_hist = np.copy(hist)
            for b, bar in enumerate(bad_hist):
                if bins[b] not in suspect:
                    bad_hist[b] = 0

            plt.step(bins[1:], bad_hist, color='r', where="pre")
            plt.show()

        # write out the thresholds...
        utils.write_qc_config(config_file, "FREQUENT-{}".format(obs_var.name),
                              "{}".format(month),
                              "[{}]".format(",".join(str(s) for s in suspect)),
                              diagnostics=diagnostics)

    return  # identify_values

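# --- Illustrative sketch of the ROLLING window test above ---
# Standalone toy example with hypothetical ROLLING=7 and RATIO=0.5: a bin
# is suspect if it is the maximum of its window and holds more than half
# of the window's observations (the data-count threshold is ignored here).
def _example_rolling_maximum():
    import numpy as np
    ROLLING, RATIO = 7, 0.5
    hist = np.array([5, 6, 4, 5, 6, 80, 5, 4, 6, 5])
    suspect = []
    for b, bar in enumerate(hist):
        if (b > ROLLING // 2) and (b <= (len(hist) - ROLLING // 2)):
            target_bins = hist[b - ROLLING // 2: b + ROLLING // 2 + 1]
            if bar == target_bins.max() and (bar / target_bins.sum()) > RATIO:
                suspect += [b]
    return suspect   # -> [5]
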
def pressure_offset(sealp, stnlp, times, config_file, plots=False, diagnostics=False):
    """
    Flag locations where difference between station and sea-level pressure
    falls outside of bounds

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(sealp.data.shape[0])])

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        try:
            average = float(utils.read_qc_config(config_file, "PRESSURE", "average"))
            spread = float(utils.read_qc_config(config_file, "PRESSURE", "spread"))
        except KeyError:
            print("Information missing in config file")
            average = utils.average(difference)
            spread = utils.spread(difference)
            # clip the spread to the range [MIN_SPREAD, MAX_SPREAD] hPa
            if spread < MIN_SPREAD:
                spread = MIN_SPREAD
            elif spread > MAX_SPREAD:
                spread = MAX_SPREAD

            utils.write_qc_config(config_file, "PRESSURE", "average",
                                  "{}".format(average), diagnostics=diagnostics)
            utils.write_qc_config(config_file, "PRESSURE", "spread",
                                  "{}".format(spread), diagnostics=diagnostics)

        if np.abs(np.ma.mean(difference) - np.ma.median(difference)) > THRESHOLD * spread:
            if diagnostics:
                print("Large difference between mean and median")
                print("Likely to have two populations of roughly equal size")
                print("Test won't work")
        else:
            high, = np.ma.where(difference > (average + (THRESHOLD * spread)))
            low, = np.ma.where(difference < (average - (THRESHOLD * spread)))

            # diagnostic plots
            if plots:
                bins = np.arange(np.round(difference.min()) - 1,
                                 np.round(difference.max()) + 1, 0.1)
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(difference.compressed(), bins=bins)
                plt.axvline(x=(average + (THRESHOLD * spread)), ls="--", c="r")
                plt.axvline(x=(average - (THRESHOLD * spread)), ls="--", c="r")
                plt.xlim([bins[0] - 1, bins[-1] + 1])
                plt.ylabel("Observations")
                plt.xlabel("Difference (hPa)")
                plt.show()

            if len(high) != 0:
                flags[high] = "p"
                if diagnostics:
                    print("Pressure {}".format(stnlp.name))
                    print("   Number of high differences {}".format(len(high)))
                if plots:
                    for bad in high:
                        plot_pressure(sealp, stnlp, times, bad)

            if len(low) != 0:
                flags[low] = "p"
                if diagnostics:
                    print("   Number of low differences {}".format(len(low)))
                if plots:
                    for bad in low:
                        plot_pressure(sealp, stnlp, times, bad)

    # only flag the station level pressure
    stnlp.flags = utils.insert_flags(stnlp.flags, flags)

    if diagnostics:
        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return  # pressure_offset

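# --- Illustrative sketch of the mean-vs-median guard above ---
# Standalone toy example with a hypothetical THRESHOLD of 4: with two
# populations of roughly equal size the mean falls between them while the
# median falls inside one, so a large gap between the two statistics
# signals that a single set of bounds cannot separate good from bad.
def _example_two_population_guard():
    import numpy as np
    THRESHOLD = 4
    difference = np.ma.concatenate([np.ma.zeros(50), 10.0 * np.ma.ones(51)])
    spread = 0.5   # assume a small, already-clipped spread
    return np.abs(np.ma.mean(difference) - np.ma.median(difference)) > THRESHOLD * spread   # -> True
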
def find_month_thresholds(obs_var, station, config_file, plots=False, diagnostics=False, winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    # get hourly climatology for each month
    for month in range(1, 13):

        normalised_anomalies = prepare_data(obs_var, station, month,
                                            diagnostics=diagnostics, winsorize=winsorize)

        if len(normalised_anomalies.compressed()) >= utils.DATA_COUNT_THRESHOLD:

            bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
            hist, bin_edges = np.histogram(normalised_anomalies.compressed(), bins)

            gaussian_fit = utils.fit_gaussian(bins[1:], hist, max(hist),
                                              mu=bins[np.argmax(hist)],
                                              sig=utils.spread(normalised_anomalies))

            fitted_curve = utils.gaussian(bins[1:], gaussian_fit)

            # diagnostic plots
            if plots:
                import matplotlib.pyplot as plt
                plt.clf()
                plt.step(bins[1:], hist, color='k', where="pre")
                plt.yscale("log")
                plt.ylabel("Number of Observations")
                plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
                plt.title("{} - month {}".format(station.id, month))

                plt.plot(bins[1:], fitted_curve)
                plt.ylim([0.1, max(hist) * 2])

            # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
            try:
                lower_threshold = bins[1:][np.where(np.logical_and(fitted_curve < FREQUENCY_THRESHOLD, bins[1:] < 0))[0]][-1]
            except IndexError:
                lower_threshold = bins[1]
            try:
                upper_threshold = bins[1:][np.where(np.logical_and(fitted_curve < FREQUENCY_THRESHOLD, bins[1:] > 0))[0]][0]
            except IndexError:
                upper_threshold = bins[-1]

            if plots:
                plt.axvline(upper_threshold, c="r")
                plt.axvline(lower_threshold, c="r")
                plt.show()

            utils.write_qc_config(config_file, "CLIMATOLOGICAL-{}".format(obs_var.name),
                                  "{}-uthresh".format(month), "{}".format(upper_threshold),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "CLIMATOLOGICAL-{}".format(obs_var.name),
                                  "{}-lthresh".format(month), "{}".format(lower_threshold),
                                  diagnostics=diagnostics)

    return  # find_month_thresholds

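# --- Illustrative sketch of the threshold extraction above ---
# Standalone toy example (hypothetical FREQUENCY_THRESHOLD of 0.1) of how
# the upper and lower thresholds are read off the fitted curve: the last
# bin below threshold to the left of centre and the first to the right.
# The same extraction, around a skew-Gaussian fit, is used in
# find_thresholds() below.
def _example_curve_thresholds():
    import numpy as np
    FREQUENCY_THRESHOLD = 0.1
    bins = np.arange(-5.0, 5.5, 0.5)
    fitted_curve = 100.0 * np.exp(-0.5 * bins[1:] ** 2)   # toy Gaussian counts
    try:
        lower = bins[1:][np.where(np.logical_and(fitted_curve < FREQUENCY_THRESHOLD, bins[1:] < 0))[0]][-1]
    except IndexError:
        lower = bins[1]
    try:
        upper = bins[1:][np.where(np.logical_and(fitted_curve < FREQUENCY_THRESHOLD, bins[1:] > 0))[0]][0]
    except IndexError:
        upper = bins[-1]
    return lower, upper   # -> (-4.0, 4.0)
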
def find_thresholds(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for each month, find thresholds in the distribution, and store them.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var, station, month, config_file,
                                                full=True, diagnostics=diagnostics)

        if len(normalised_anomalies.compressed()) == 1 and normalised_anomalies[0] == utils.MDI:
            # scaling not possible for this month
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-uthresh".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-lthresh".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-uthresh".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-lthresh".format(month), "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies, bins)

        gaussian_fit = utils.fit_gaussian(bins[1:], hist, max(hist),
                                          mu=bins[np.argmax(hist)],
                                          sig=utils.spread(normalised_anomalies),
                                          skew=skew(normalised_anomalies.compressed()))

        fitted_curve = utils.skew_gaussian(bins[1:], gaussian_fit)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")
            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.plot(bins[1:], fitted_curve)
            plt.ylim([0.1, max(hist) * 2])

        # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
        try:
            lower_threshold = bins[1:][np.where(np.logical_and(fitted_curve < FREQUENCY_THRESHOLD, bins[1:] < bins[np.argmax(fitted_curve)]))[0]][-1]
        except IndexError:
            lower_threshold = bins[1]
        try:
            if len(np.unique(fitted_curve)) == 1:
                # just a line of zeros perhaps (found on AFA00409906 station_level_pressure 20190913)
                upper_threshold = bins[-1]
            else:
                upper_threshold = bins[1:][np.where(np.logical_and(fitted_curve < FREQUENCY_THRESHOLD, bins[1:] > bins[np.argmax(fitted_curve)]))[0]][0]
        except IndexError:
            upper_threshold = bins[-1]

        if plots:
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")
            plt.show()

        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-uthresh".format(month), "{}".format(upper_threshold),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-lthresh".format(month), "{}".format(lower_threshold),
                              diagnostics=diagnostics)

    return  # find_thresholds

def prepare_all_data(obs_var, station, month, config_file, full=False, diagnostics=False):
    """
    Extract data for the month, make & store (or read back) the average and spread.
    Use these to calculate normalised anomalies.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: month to process
    :param str config_file: configuration file to store critical values
    :param bool full: recalculate and store the scaling parameters
    :param bool diagnostics: turn on diagnostic output
    """

    month_locs, = np.where(station.months == month)

    all_month_data = obs_var.data[month_locs]

    if full:
        if len(all_month_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
            # have data, now to standardise
            climatology = utils.average(all_month_data)  # mean
            spread = utils.spread(all_month_data)  # IQR currently
        else:
            climatology = utils.MDI
            spread = utils.MDI

        # write out the scaling...
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-clim".format(month), "{}".format(climatology),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-spread".format(month), "{}".format(spread),
                              diagnostics=diagnostics)
    else:
        try:
            climatology = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-clim".format(month)))
            spread = float(utils.read_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name), "{}-spread".format(month)))
        except KeyError:
            if len(all_month_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
                # have data, now to standardise
                climatology = utils.average(all_month_data)  # mean
                spread = utils.spread(all_month_data)  # IQR currently
            else:
                climatology = utils.MDI
                spread = utils.MDI

            # write out the scaling...
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-clim".format(month), "{}".format(climatology),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file, "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-spread".format(month), "{}".format(spread),
                                  diagnostics=diagnostics)

    if climatology == utils.MDI and spread == utils.MDI:
        # these weren't calculable, move on
        return np.ma.array([utils.MDI])
    elif spread == 0:
        # all the same value, so can't scale by the spread
        return (all_month_data - climatology)
    else:
        return (all_month_data - climatology) / spread  # prepare_all_data

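# --- Illustrative sketch of the standardisation above ---
# Standalone toy example: anomalies from a monthly climatology, scaled by
# the spread. A plain mean and IQR stand in here for utils.average and
# utils.spread (which the comments above describe as mean and IQR).
def _example_normalised_anomalies():
    import numpy as np
    month_data = np.ma.array([10.0, 12.0, 11.0, 13.0, 30.0])
    climatology = month_data.mean()
    q75, q25 = np.percentile(month_data.compressed(), [75, 25])
    spread = q75 - q25
    # the outlying 30.0 ends up several spreads from the climatology
    return (month_data - climatology) / spread
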