def hfr(station, var_list, full=False, plots=False, diagnostics=False):
    """
    Run through the variables and pass to the High Flag Rate Check

    :param Station station: Station Object for the station
    :param list var_list: list of variables to test
    :param bool full: run a full update (unused here)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output

    :returns: int : number of variables on which these flags have been set
    """
    vars_set = []  # Keep track of where these flags are set.
    for var in var_list:

        obs_var = getattr(station, var)

        flags, any_set = high_flag_rate(obs_var, plots=plots,
                                        diagnostics=diagnostics)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if any_set:
            vars_set += [var]

    # Now double check the list of variables where "H" flags have been set.
    # If one of a synergistic pair is, then do the other (wind speed/direction,
    # sea/station level pressure).
    # Using exclusive or: this only passes if one is True and the other is False.
    if ("sea_level_pressure" in vars_set) is not ("station_level_pressure" in vars_set):
        if "sea_level_pressure" in vars_set:
            set_synergistic_flags(station, "station_level_pressure")
        elif "station_level_pressure" in vars_set:
            set_synergistic_flags(station, "sea_level_pressure")

    if ("wind_speed" in vars_set) is not ("wind_direction" in vars_set):
        if "wind_speed" in vars_set:
            set_synergistic_flags(station, "wind_direction")
        elif "wind_direction" in vars_set:
            set_synergistic_flags(station, "wind_speed")

    # Synergistically flagged variables are only counted once, so this return is correct.
    return len(vars_set)  # hfr
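# Illustrative sketch (not part of the QC suite): the synergistic pairing above
# relies on "A is not B" acting as an exclusive-or over the two boolean
# membership tests. The stand-alone demo below prints the truth table; the
# helper name _demo_xor_pairing is hypothetical.
def _demo_xor_pairing():
    """Show that `a is not b` on bools only fires when exactly one flag is set."""
    for vars_set in ([], ["wind_speed"], ["wind_direction"],
                     ["wind_speed", "wind_direction"]):
        a = "wind_speed" in vars_set
        b = "wind_direction" in vars_set
        print(vars_set, "-> synergistic flagging needed:", a is not b)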
def identify_multiple_values(obs_var, times, config_file, plots=False, diagnostics=False):
    """
    Flag observations which share a timestamp but have differing values.

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values (unused here)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    # TODO check works with missing data (compressed?)
    # TODO monthly?

    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    time_diffs = np.ma.diff(times) / np.timedelta64(1, "m")  # presuming minutes
    value_diffs = np.ma.diff(obs_var.data)

    multiple_obs_at_time, = np.where(time_diffs == 0)
    # if diagnostics:
    #     print("number of identical timestamps {}".format(multiple_obs_at_time.shape[0]))

    suspect_locs, = np.ma.where(value_diffs[multiple_obs_at_time] != 0)

    # set the first of the obs, then the second, which together make the diff
    flags[multiple_obs_at_time[suspect_locs]] = "T"
    flags[multiple_obs_at_time[suspect_locs] + 1] = "T"

    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Timestamp {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # identify_multiple_values
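# Illustrative sketch (toy inputs, not part of the QC suite): shows how a zero
# time-difference isolates a repeated timestamp, and how a non-zero value
# difference there marks both members of the pair, mirroring the indexing used
# in identify_multiple_values above.  _demo_duplicate_timestamps is hypothetical.
def _demo_duplicate_timestamps():
    import numpy as np
    times = np.array([0, 60, 60, 120])             # minutes; one repeated stamp
    data = np.ma.array([10.0, 11.0, 12.5, 11.5])   # differing values at the repeat
    time_diffs = np.diff(times)
    value_diffs = np.ma.diff(data)
    multiple, = np.where(time_diffs == 0)          # index of first of each pair
    suspect, = np.ma.where(value_diffs[multiple] != 0)
    flags = np.array(["" for _ in range(data.shape[0])])
    flags[multiple[suspect]] = "T"                 # first observation of the pair
    flags[multiple[suspect] + 1] = "T"             # and the one that differs
    print(flags)  # ['' 'T' 'T' '']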
def mcu(station, var_list, full=False, plots=False, diagnostics=False):
    """
    Run through the variables and pass to the monthly clean up

    :param Station station: Station Object for the station
    :param list var_list: list of variables to test
    :param bool full: run a full update (unused here)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    for var in var_list:

        obs_var = getattr(station, var)

        flags = clean_up(obs_var, station, plots=plots, diagnostics=diagnostics)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    return  # mcu
def set_synergistic_flags(station, var):
    """
    Set the flags on a synergistic variable.

    :param Station station: Station Object for the station
    :param str var: name of variable
    """
    obs_var = getattr(station, var)

    new_flags = np.array(["" for i in range(obs_var.data.shape[0])])

    obs_locs, = np.where(obs_var.data.mask == False)

    if obs_locs.shape[0] > 10 * utils.DATA_COUNT_THRESHOLD:
        # require sufficient observations to make a flagged fraction useful.
        # As synergistically flagged, add to all flags.
        new_flags[obs_locs] = "H"

    obs_var.flags = utils.insert_flags(obs_var.flags, new_flags)

    return  # set_synergistic_flags
def wrc(station, var_list, full=False, plots=False, diagnostics=False):
    """
    Run through the variables and pass to the World Record Check.

    :param Station station: Station Object for the station
    :param list var_list: list of variables to test
    :param bool full: run a full update (unused here)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    for var in var_list:

        obs_var = getattr(station, var)

        flags = record_check(obs_var, station.continent, plots=plots,
                             diagnostics=diagnostics)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    return  # wrc
def lc(station, var_list, full=False, plots=False, diagnostics=False):
    """
    Run through the variables and pass to the Logic Checks

    :param Station station: Station Object for the station
    :param list var_list: list of variables to test
    :param bool full: run a full update (unused here)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    # https://github.com/glamod/glamod-dm/blob/master/glamod-parser/glamod/parser/filters/observations_table.py
    # database parser has these, for future reference

    # station level (from inventory listing, not for each timestamp)
    return_code = 0
    if station.lat < -90 or station.lat > 90:
        write_logic_error(station, "Bad latitude: {}".format(station.lat),
                          diagnostics=diagnostics)
        if diagnostics:
            print("Bad latitude: {}".format(station.lat))
        return_code = -1

    if station.lon < -180 or station.lon > 180:
        write_logic_error(station, "Bad longitude: {}".format(station.lon),
                          diagnostics=diagnostics)
        if diagnostics:
            print("Bad longitude: {}".format(station.lon))
        return_code = -1

    if station.lon == 0 and station.lat == 0:
        write_logic_error(
            station,
            "Bad longitude & latitude combination: lon={}, lat={}".format(
                station.lon, station.lat),
            diagnostics=diagnostics)
        if diagnostics:
            print("Bad longitude/latitude: {} & {}".format(station.lon, station.lat))
        return_code = -1

    # Missing elevation acceptable - removed this for the moment (7 November 2019, RJHD)
    # missing could be -999, -999.9, -999.999 or even 9999.0 etc., hence using string comparison
    if (station.elev < -432.65 or station.elev > 8850.):
        if str(station.elev)[:4] not in ["-999", "9999"]:
            write_logic_error(station, "Bad elevation: {}".format(station.elev),
                              diagnostics=diagnostics)
            if diagnostics:
                print("Bad elevation: {}".format(station.elev))
            return_code = -1
        else:
            if diagnostics:
                print("Missing elevation, but not flagged: {}".format(station.elev))

    if station.times.iloc[0] < dt.datetime(1650, 1, 1):
        write_logic_error(station, "Bad start time: {}".format(station.times.iloc[0]),
                          diagnostics=diagnostics)
        if diagnostics:
            print("Bad start time: {}".format(station.times.iloc[0]))
        return_code = -1
    elif station.times.iloc[-1] > dt.datetime.now():
        write_logic_error(station, "Bad end time: {}".format(station.times.iloc[-1]),
                          diagnostics=diagnostics)
        if diagnostics:
            print("Bad end time: {}".format(station.times.iloc[-1]))
        return_code = -1

    # observation level
    for var in var_list:
        obs_var = getattr(station, var)

        flags = logic_check(obs_var, plots=plots, diagnostics=diagnostics)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    return return_code  # lc
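# Illustrative sketch: the string comparison above treats the many encodings of
# a missing elevation (-999, -999.9, 9999.0, ...) uniformly, so out-of-range
# but missing-coded values are reported rather than flagged.  Stand-alone demo
# with hypothetical values; _demo_elevation_check is not part of the suite.
def _demo_elevation_check():
    for elev in (-999, -999.9, 9999.0, -500.0, 9000.0, 123.4):
        out_of_range = elev < -432.65 or elev > 8850.
        missing = str(elev)[:4] in ["-999", "9999"]
        print(elev, "->", "missing" if (out_of_range and missing)
              else "bad" if out_of_range else "ok")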
def identify_spikes(obs_var, times, config_file, plots=False, diagnostics=False):
    """
    Use config_file to read in critical values, and then assess to find spikes

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    # TODO check works with missing data (compressed?)
    # TODO monthly?

    masked_times = np.ma.masked_array(times, mask=obs_var.data.mask)

    time_diffs = np.ma.diff(masked_times) / np.timedelta64(1, "m")  # presuming minutes
    value_diffs = np.ma.diff(obs_var.data)

    if len(value_diffs.mask.shape) == 0:
        # single mask value; replace with an array of True/Falses
        if value_diffs.mask:
            value_diffs.mask = np.ones(value_diffs.shape)
        else:
            value_diffs.mask = np.zeros(value_diffs.shape)

    # get thresholds for each unique time difference
    unique_diffs = np.unique(time_diffs.compressed())

    # retrieve the critical values
    critical_values = {}
    for t_diff in unique_diffs:
        try:
            c_value = utils.read_qc_config(config_file,
                                           "SPIKE-{}".format(obs_var.name),
                                           "{}".format(t_diff))
            critical_values[t_diff] = float(c_value)
        except KeyError:
            # no critical value for this time difference
            pass

    # if none have been read, calculate them in case that was the reason none were stored
    if len(critical_values) == 0:
        get_critical_values(obs_var, times, config_file, plots=plots,
                            diagnostics=diagnostics)

        # and try again
        for t_diff in unique_diffs:
            try:
                c_value = utils.read_qc_config(config_file,
                                               "SPIKE-{}".format(obs_var.name),
                                               "{}".format(t_diff))
                critical_values[t_diff] = float(c_value)
            except KeyError:
                # no critical value for this time difference
                pass

    # pre-select for each time difference that can be tested
    for t_diff in unique_diffs:
        if t_diff == 0:
            # not a spike or jump, but two values at the same time.
            # The value difference should be zero, so fitting a histogram won't work;
            # handled in a separate test.
            continue

        # new blank flag array
        flags = np.array(["" for i in range(obs_var.data.shape[0])])

        t_locs, = np.where(time_diffs == t_diff)

        try:
            c_locs, = np.where(np.abs(value_diffs[t_locs]) > critical_values[t_diff])
        except KeyError:
            # no critical value for this time difference
            continue  # to next loop

        # TODO - sort spikes at very beginning or very end of sequence,
        #        when don't have a departure from/return to a normal level

        # potential spikes
        for ps, possible_in_spike in enumerate(t_locs[c_locs]):

            is_spike = False
            spike_len = 1
            while spike_len <= MAX_SPIKE_LENGTH:
                # test for each possible length to see if identified
                try:
                    out_spike_t_diff = time_diffs[possible_in_spike + spike_len]
                    possible_out_spike = value_diffs[possible_in_spike + spike_len]
                except IndexError:
                    # got to end of data run, can't test final value at the moment
                    break

                # need to test masked/unmasked using the arrays rather than the values
                # extracted above, as if the values are unmasked there is no mask
                # attribute to test!
                if time_diffs.mask[possible_in_spike + spike_len] == False and \
                        value_diffs.mask[possible_in_spike + spike_len] == False:
                    try:
                        # find critical value for time-difference of way out of spike
                        out_critical_value = critical_values[out_spike_t_diff]
                    except KeyError:
                        # don't have a value for this time difference, so use the maximum of all as a proxy
                        out_critical_value = max(critical_values.values())
                else:
                    # time or value difference masked
                    out_critical_value = max(critical_values.values())

                if np.abs(possible_out_spike) > out_critical_value:
                    # check that the signs are opposite
                    if np.sign(value_diffs[possible_in_spike]) != np.sign(
                            value_diffs[possible_in_spike + spike_len]):
                        is_spike = True
                        break  # caught the return to normal, so can stop

                spike_len += 1

            if is_spike and spike_len >= 1:
                # test within-spike differences (choosing the correct time difference)
                within = 1
                while within < spike_len:
                    within_t_diff = time_diffs[possible_in_spike + within]
                    if time_diffs.mask[possible_in_spike + within] == False:
                        try:
                            within_critical_value = critical_values[within_t_diff]
                        except KeyError:
                            # don't have a value for this time difference, so use the maximum of all as a proxy
                            within_critical_value = max(critical_values.values())
                    else:
                        # time difference masked
                        within_critical_value = max(critical_values.values())

                    if value_diffs.mask[possible_in_spike + within] == False:
                        # within-spike differences should be small (compare magnitudes)
                        if np.abs(value_diffs[possible_in_spike + within]) > within_critical_value / 2.:
                            is_spike = False
                    else:
                        # if masked then no data, so can't say whether it's not a spike
                        pass

                    within += 1

            if is_spike:
                # test either side (neither the difference before nor after may be too big)
                try:
                    before_t_diff = time_diffs[possible_in_spike - 1]
                    if time_diffs.mask[possible_in_spike - 1] == False:
                        before_critical_value = critical_values[before_t_diff]
                    else:
                        # time difference masked
                        before_critical_value = max(critical_values.values())
                except KeyError:
                    # don't have a value for this time difference, so use the maximum of all as a proxy
                    before_critical_value = max(critical_values.values())
                except IndexError:
                    # off the front of the data array
                    before_critical_value = max(critical_values.values())

                try:
                    after_t_diff = time_diffs[possible_in_spike + spike_len + 1]
                    if time_diffs.mask[possible_in_spike + spike_len + 1] == False:
                        after_critical_value = critical_values[after_t_diff]
                    else:
                        # time difference masked
                        after_critical_value = max(critical_values.values())
                except KeyError:
                    # don't have a value for this time difference, so use the maximum of all as a proxy
                    after_critical_value = max(critical_values.values())
                except IndexError:
                    # off the back of the data array
                    after_critical_value = max(critical_values.values())

                try:
                    if value_diffs.mask[possible_in_spike - 1] == False:
                        if np.abs(value_diffs[possible_in_spike - 1]) > before_critical_value / 2.:
                            # before-spike difference fails test
                            is_spike = False
                except IndexError:
                    # off the front of the data array
                    pass

                try:
                    if value_diffs.mask[possible_in_spike + spike_len + 1] == False:
                        if np.abs(value_diffs[possible_in_spike + spike_len + 1]) > after_critical_value / 2.:
                            # after-spike difference fails test
                            is_spike = False
                except IndexError:
                    # off the back of the data array
                    pass

            # if the spike is still set, set the flags
            if is_spike:
                # "+1" because of difference arrays
                flags[possible_in_spike + 1: possible_in_spike + 1 + spike_len] = "S"

                # diagnostic plots
                if plots:
                    plot_spike(times, obs_var, possible_in_spike + 1, spike_len)

        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:
            print("Spike {}".format(obs_var.name))
            print("   Time Difference: {} minutes".format(t_diff))
            print("   Cumulative number of flags set: {}".format(
                len(np.where(flags != "")[0])))

    return  # identify_spikes
def neighbour_outlier(target_station, initial_neighbours, variable,
                      diagnostics=False, plots=False, full=False):
    """
    Works on a single station and variable.  Reads in the neighbours' data and
    finds locations where sufficient numbers of them are sufficiently different.

    :param Station target_station: station to run on
    :param array initial_neighbours: input neighbours (ID, distance) pairs
    :param str variable: obs variable being run on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values.
    """
    station_list = utils.get_station_list()

    # check if sufficient neighbours
    n_neighbours = len(np.where(initial_neighbours[:, 0] != "-")[0]) - 1
    if n_neighbours < utils.MIN_NEIGHBOURS:
        print("{} has insufficient neighbours ({}<{})".format(
            target_station.id, n_neighbours, utils.MIN_NEIGHBOURS))

    else:
        #*************************
        # extract target observations
        obs_var = getattr(target_station, variable)
        flags = np.array(["" for i in range(obs_var.data.shape[0])]).astype("<U10")

        #*************************
        # read in the neighbour (buddy) data
        all_buddy_data = np.ma.zeros(
            [len(initial_neighbours[:, 0]), len(target_station.times)])
        all_buddy_data.mask = np.ones(all_buddy_data.shape)

        for bid, buddy_id in enumerate(initial_neighbours[:, 0]):
            if buddy_id == target_station.id:
                # first entry is self
                continue
            if buddy_id == "-":
                # end of the list of buddies
                break

            if diagnostics:
                print("{}/{} {}".format(bid, len(initial_neighbours[:, 0]), buddy_id))

            # set up station object to hold information
            buddy_idx, = np.where(station_list.id == buddy_id)
            buddy = utils.Station(buddy_id,
                                  station_list.iloc[buddy_idx].latitude.values[0],
                                  station_list.iloc[buddy_idx].longitude.values[0],
                                  station_list.iloc[buddy_idx].elevation.values[0])

            try:
                buddy, buddy_df = io.read_station(
                    os.path.join(setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(buddy_id)),
                    buddy, read_flags=True)

                buddy_var = getattr(buddy, variable)

                # apply flags
                flag_locs, = np.where(buddy_var.flags != "")
                buddy_var.data.mask[flag_locs] = True

            except OSError:
                # file missing, move on to next in sequence
                io.write_error(target_station,
                               "File Missing (Buddy, {}) - {}".format(variable, buddy_id))
                continue
            except ValueError as e:
                # some issue in the raw file
                io.write_error(target_station,
                               "Error in input file (Buddy, {}) - {}".format(variable, buddy_id),
                               error=str(e))
                continue

            # match the timestamps of target_station and copy over
            match = np.in1d(target_station.times, buddy.times)
            match_back = np.in1d(buddy.times, target_station.times)

            if True in match and True in match_back:
                # skip if no overlapping times at all!
                all_buddy_data[bid, match] = buddy_var.data[match_back]

        if diagnostics:
            print("All buddies read in")

        #*************************
        # find differences
        differences = all_buddy_data - obs_var.data

        #*************************
        # find spread of differences on monthly basis (with minimum value)
        spreads = np.ma.zeros(differences.shape)

        for month in range(1, 13):
            month_locs = np.where(target_station.months == month)

            for bid, buddy in enumerate(differences):
                if len(differences[bid, month_locs].compressed()) > utils.DATA_COUNT_THRESHOLD:
                    this_spread = utils.spread(differences[bid, month_locs])
                    if this_spread < MIN_SPREAD:
                        spreads[bid, month_locs] = MIN_SPREAD
                    else:
                        spreads[bid, month_locs] = this_spread
                else:
                    spreads[bid, month_locs] = MIN_SPREAD

        spreads.mask = np.copy(differences.mask)

        # store which entries may be sufficient to flag
        dubious = np.ma.zeros(differences.shape)
        dubious.mask = np.copy(differences.mask)

        #*************************
        # adjust for storms
        if variable in ["sea_level_pressure", "station_level_pressure"]:
            distant, = np.where(initial_neighbours[:, 1].astype(int) > 100)
            if len(distant) > 0:
                # find positive and negative differences across neighbours
                # (negative exceedances need the sign flipped on the limit)
                positive = np.ma.where(differences[distant] > spreads[distant] * SPREAD_LIMIT)
                negative = np.ma.where(differences[distant] < -spreads[distant] * SPREAD_LIMIT)

                # spin through each neighbour
                for dn, dist_neigh in enumerate(distant):
                    pos, = np.where(positive[0] == dn)
                    neg, = np.where(negative[0] == dn)

                    if len(neg) > 0:
                        ratio = len(neg) / (len(pos) + len(neg))
                        if ratio > 0.667:
                            # majority negative, only flag the positives [definitely not storms]
                            dubious[dist_neigh, positive[1][pos]] = 1

            else:
                # all stations close by, so storms shouldn't affect; include all.
                # note where differences exceed the spread
                dubious_locs = np.ma.where(np.ma.abs(differences) > spreads * SPREAD_LIMIT)
                dubious[dubious_locs] = 1

        else:
            #*************************
            # note where differences exceed the spread [all non-pressure variables]
            dubious_locs = np.ma.where(np.ma.abs(differences) > spreads * SPREAD_LIMIT)
            dubious[dubious_locs] = 1

        if diagnostics:
            print("cross checks complete - assessing all outcomes")

        #*************************
        # sum across neighbours
        neighbour_count = np.ma.count(differences, axis=0)
        dubious_count = np.ma.sum(dubious, axis=0)

        # flag if a large enough fraction (>0.66)
        sufficient, = np.ma.where(dubious_count > 0.66 * neighbour_count)
        flags[sufficient] = "N"

        if plots:
            for flag in sufficient:
                plot_neighbour_flags(target_station.times, flag, obs_var, all_buddy_data)

        # append flags to object
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:
            print("Neighbour Outlier {}".format(obs_var.name))
            print("   Cumulative number of flags set: {}".format(
                len(np.where(flags != "")[0])))

    return  # neighbour_outlier
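# Illustrative sketch of the final decision step above: an observation is
# flagged when more than two-thirds of the neighbours which report at that
# time disagree by more than SPREAD_LIMIT spreads.  Toy numbers throughout;
# _demo_neighbour_vote is not part of the suite.
def _demo_neighbour_vote():
    import numpy as np
    # rows = neighbours, columns = timestamps; 1 marks a dubious difference
    dubious = np.ma.array([[0, 1, 1],
                           [0, 1, 0],
                           [0, 1, 1]])
    neighbour_count = np.ma.count(dubious, axis=0)    # reporting neighbours
    dubious_count = np.ma.sum(dubious, axis=0)
    sufficient, = np.ma.where(dubious_count > 0.66 * neighbour_count)
    print(sufficient)  # [1 2] -> timestamps 1 and 2 would receive "N" flags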
def monthly_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use the monthly distribution to identify asymmetries.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    all_years = np.unique(station.years)

    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var, station, month,
                                              diagnostics=diagnostics)

        # read in the scaling
        try:
            climatology = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-clim".format(month)))
            spread = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_monthly_scaling(obs_var, station, config_file,
                                 diagnostics=diagnostics)
            climatology = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-clim".format(month)))
            spread = float(
                utils.read_qc_config(config_file,
                                     "MDISTRIBUTION-{}".format(obs_var.name),
                                     "{}-spread".format(month)))

        if climatology == utils.MDI and spread == utils.MDI:
            # these weren't calculable, move on
            continue

        standardised_months = (month_averages - climatology) / spread

        bins = utils.create_bins(standardised_months, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(standardised_months, bins)

        # flag months with very large offsets
        bad, = np.where(np.abs(standardised_months) >= LARGE_LIMIT)
        # now follow flag locations back up through the process
        for bad_month_id in bad:
            # year ID for this set of calendar months
            locs, = np.where(np.logical_and(station.months == month,
                                            station.years == all_years[bad_month_id]))
            flags[locs] = "D"

        # walk distribution from centre to find asymmetry
        sort_order = standardised_months.argsort()
        mid_point = len(standardised_months) // 2
        good = True
        step = 1
        bad = []
        while good:
            if standardised_months[sort_order][mid_point - step] != \
                    standardised_months[sort_order][mid_point + step]:
                suspect_months = [np.abs(standardised_months[sort_order][mid_point - step]),
                                  np.abs(standardised_months[sort_order][mid_point + step])]

                if min(suspect_months) != 0:
                    # not all clustered at origin
                    if max(suspect_months) / min(suspect_months) >= 2. and \
                            min(suspect_months) >= 1.5:
                        # at least 1.5x spread from centre and a factor-of-two difference
                        # in location (longer tail):
                        # flag everything further out than this bin for that tail
                        if suspect_months[0] == max(suspect_months):
                            # LHS has the issue (remember that the sign has been removed)
                            bad = sort_order[:mid_point - (step - 1)]  # need -1 given array indexing standards
                        elif suspect_months[1] == max(suspect_months):
                            # RHS has the issue
                            bad = sort_order[mid_point + step:]
                        good = False

            step += 1
            if (mid_point - step) < 0 or (mid_point + step) == standardised_months.shape[0]:
                # reached the end
                break

        # now follow flag locations back up through the process
        for bad_month_id in bad:
            # year ID for this set of calendar months
            locs, = np.where(np.logical_and(station.months == month,
                                            station.years == all_years[bad_month_id]))
            flags[locs] = "D"

        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            if len(bad) > 0:
                bad_hist, dummy = np.histogram(standardised_months[bad], bins)
                plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.ylabel("Number of Months")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))
            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Distribution (monthly) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # monthly_gap
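# Illustrative sketch of the tail walk above: stepping outwards from the
# median of the sorted, standardised values and comparing the distances of the
# two candidates from the centre exposes a one-sided tail.  Data hypothetical;
# _demo_tail_walk is not part of the suite.
def _demo_tail_walk():
    import numpy as np
    values = np.array([-1.7, -1.6, -0.1, 0.0, 0.2, 1.8, 5.0])  # long right tail
    order = values.argsort()
    mid = len(values) // 2
    step = 1
    while (mid - step) >= 0 and (mid + step) < len(values):
        left = np.abs(values[order][mid - step])
        right = np.abs(values[order][mid + step])
        pair = sorted([left, right])
        # factor of two in location, at least 1.5x spread from the centre
        if pair[0] != 0 and pair[1] / pair[0] >= 2. and pair[0] >= 1.5:
            print("asymmetric tail found at step", step)   # step 3 here
            break
        step += 1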
def repeating_value(obs_var, times, config_file, plots=False, diagnostics=False):
    """
    AKA straight string

    Use config file to read in threshold values.  Then find strings which
    exceed the threshold.

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    # remove calm periods for wind speeds when (a) calculating thresholds
    # and (b) identifying streaks
    this_var = copy.deepcopy(obs_var)
    if obs_var.name == "wind_speed":
        calms, = np.ma.where(this_var.data == 0)
        this_var.data[calms] = utils.MDI
        this_var.data.mask[calms] = True

    flags = np.array(["" for i in range(this_var.data.shape[0])])
    compressed_flags = np.array(["" for i in range(this_var.data.compressed().shape[0])])

    # retrieve the threshold and store in another dictionary
    threshold = {}
    try:
        th = utils.read_qc_config(config_file, "STREAK-{}".format(this_var.name),
                                  "Straight")
        threshold["Straight"] = float(th)
    except KeyError:
        # no threshold set
        print("Threshold missing in config file")
        get_repeating_string_threshold(this_var, config_file, plots=plots,
                                       diagnostics=diagnostics)
        th = utils.read_qc_config(config_file, "STREAK-{}".format(this_var.name),
                                  "Straight")
        threshold["Straight"] = float(th)

    # only process further if there is enough data
    if len(this_var.data.compressed()) > 1:

        repeated_string_lengths, grouped_diffs, strings = \
            prepare_data_repeating_string(this_var, plots=plots, diagnostics=diagnostics)

        # above threshold
        bad, = np.where(repeated_string_lengths >= threshold["Straight"])

        # flag identified strings
        for string in bad:
            start = int(np.sum(grouped_diffs[:strings[string], 1]))
            end = start + int(grouped_diffs[strings[string], 1]) + 1
            compressed_flags[start:end] = "K"

            if plots:
                plot_streak(times, this_var, start, end)

    # undo compression and write into the original object (the one with calm periods)
    flags[this_var.data.mask == False] = compressed_flags
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Repeated Strings {}".format(this_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # repeating_value
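# Illustrative sketch (assumed helper shape): prepare_data_repeating_string is
# defined elsewhere, but the underlying idea is run-length encoding of the
# first differences; a run of zero differences is a straight string.  A
# minimal stand-in using itertools.groupby, with hypothetical data:
def _demo_string_lengths():
    import numpy as np
    from itertools import groupby
    data = np.array([3.0, 3.0, 3.0, 3.0, 2.5, 2.5, 4.0])
    diffs = np.diff(data)
    # group consecutive identical differences into (value, run length) pairs
    grouped = [(val, len(list(grp))) for val, grp in groupby(diffs)]
    # a zero-difference run of length n corresponds to n+1 repeated observations
    strings = [length + 1 for val, length in grouped if val == 0]
    print(strings)  # [4, 2]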
def variance_check(obs_var, station, config_file, plots=False,
                   diagnostics=False, winsorize=True):
    """
    Use the distribution of monthly variances to identify outlying months;
    the thresholds are also stored in the config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    # get hourly climatology for each month
    for month in range(1, 13):

        month_locs, = np.where(station.months == month)

        variances = prepare_data(obs_var, station, month,
                                 diagnostics=diagnostics, winsorize=winsorize)

        try:
            average_variance = float(
                utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                                     "{}-average".format(month)))
            variance_spread = float(
                utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                                     "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var, station, config_file, plots=plots,
                            diagnostics=diagnostics)
            average_variance = float(
                utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                                     "{}-average".format(month)))
            variance_spread = float(
                utils.read_qc_config(config_file, "VARIANCE-{}".format(obs_var.name),
                                     "{}-spread".format(month)))

        if average_variance == utils.MDI and variance_spread == utils.MDI:
            # couldn't be calculated, move on
            continue

        bad_years, = np.where(
            np.abs(variances - average_variance) / variance_spread > SPREAD_THRESHOLD)

        # prepare wind and pressure data in case needed to check for storms
        if obs_var.name in ["station_level_pressure", "sea_level_pressure", "wind_speed"]:
            wind_monthly_data = station.wind_speed.data[month_locs]
            if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                pressure_monthly_data = obs_var.data[month_locs]
            else:
                pressure_monthly_data = station.sea_level_pressure.data[month_locs]

            if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                    len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                # need sufficient data for the storm check to work, else can't tell;
                # move on
                continue

            wind_average = utils.average(wind_monthly_data)
            wind_spread = utils.spread(wind_monthly_data)

            pressure_average = utils.average(pressure_monthly_data)
            pressure_spread = utils.spread(pressure_monthly_data)

        # go through each bad year for this month
        all_years = np.unique(station.years)
        for year in bad_years:

            # corresponding locations
            ym_locs, = np.where(np.logical_and(station.months == month,
                                               station.years == all_years[year]))

            # if pressure or wind speed, need to do some further checking before applying flags
            if obs_var.name in ["station_level_pressure", "sea_level_pressure", "wind_speed"]:

                # pull out the data
                wind_data = station.wind_speed.data[ym_locs]
                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    pressure_data = obs_var.data[ym_locs]
                else:
                    pressure_data = station.sea_level_pressure.data[ym_locs]

                # need sufficient data for the storm check to work, else can't tell
                if len(pressure_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                        len(wind_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                    # move on
                    continue

                # find locations of high wind speeds and low pressures, cross match
                high_winds, = np.ma.where(
                    (wind_data - wind_average) / wind_spread > STORM_THRESHOLD)
                low_pressures, = np.ma.where(
                    (pressure_average - pressure_data) / pressure_spread > STORM_THRESHOLD)

                match = np.in1d(high_winds, low_pressures)

                couldbe_storm = False
                if match.sum() > 0:
                    # high winds coincide with low pressures: this could be a storm,
                    # either at a tropical station (relatively constant pressure)
                    # or out of season in mid-latitudes.
                    couldbe_storm = True

                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:
                    diffs = np.ma.diff(pressure_data)
                elif obs_var.name == "wind_speed":
                    diffs = np.ma.diff(wind_data)

                # count up the largest number of sequential negative and positive differences
                negs, poss = 0, 0
                biggest_neg, biggest_pos = 0, 0

                for diff in diffs:
                    if diff > 0:
                        if negs > biggest_neg:
                            biggest_neg = negs
                        negs = 0
                        poss += 1
                    else:
                        if poss > biggest_pos:
                            biggest_pos = poss
                        poss = 0
                        negs += 1

                if (biggest_neg < 10) and (biggest_pos < 10) and not couldbe_storm:
                    # insufficient to identify as a storm (HadISD values);
                    # leave flags set
                    pass
                else:
                    # could be a storm, so better to leave this month unflagged;
                    # zero-length array so nothing is flagged
                    ym_locs = np.ma.array([])

            # copy over the flags, if any
            if len(ym_locs) != 0:
                # and set the flags
                flags[ym_locs] = "V"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            scaled_variances = ((variances - average_variance) / variance_spread)
            bins = utils.create_bins(scaled_variances, 0.25, obs_var.name)
            hist, bin_edges = np.histogram(scaled_variances, bins)

            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Months")
            plt.xlabel("Scaled {} Variances".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(SPREAD_THRESHOLD, c="r")
            plt.axvline(-SPREAD_THRESHOLD, c="r")

            bad_hist, dummy = np.histogram(scaled_variances[bad_years], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Variance {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # variance_check
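# Illustrative sketch of the run counting above: the storm screen looks for a
# long monotonic pressure fall followed by a recovery, approximated by the
# longest runs of same-signed first differences.  Toy data; the demo also
# closes off the final run explicitly.
def _demo_longest_runs():
    import numpy as np
    diffs = np.array([-1, -2, -1, -3, 2, 1, 3, 1])
    negs, poss = 0, 0
    biggest_neg, biggest_pos = 0, 0
    for diff in diffs:
        if diff > 0:
            biggest_neg = max(biggest_neg, negs)
            negs = 0
            poss += 1
        else:
            biggest_pos = max(biggest_pos, poss)
            poss = 0
            negs += 1
    biggest_neg = max(biggest_neg, negs)   # close off the final run
    biggest_pos = max(biggest_pos, poss)
    print(biggest_neg, biggest_pos)        # 4 4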
def frequent_values(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use config file to read in the frequent values.  Check each month to see if any appear.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    all_years = np.unique(station.years)

    # work through each month, and then year
    for month in range(1, 13):

        # read in bin-width and suspect bins for this month
        try:
            width = float(utils.read_qc_config(config_file,
                                               "FREQUENT-{}".format(obs_var.name),
                                               "width"))
            suspect_bins = utils.read_qc_config(config_file,
                                                "FREQUENT-{}".format(obs_var.name),
                                                "{}".format(month), islist=True)
        except KeyError:
            print("Information missing in config file")
            identify_values(obs_var, station, config_file, plots=plots,
                            diagnostics=diagnostics)
            width = float(utils.read_qc_config(config_file,
                                               "FREQUENT-{}".format(obs_var.name),
                                               "width"))
            suspect_bins = utils.read_qc_config(config_file,
                                                "FREQUENT-{}".format(obs_var.name),
                                                "{}".format(month), islist=True)

        # skip on if nothing to find
        if len(suspect_bins) == 0:
            continue

        # work through each year
        for year in all_years:
            locs, = np.where(np.logical_and(station.months == month,
                                            station.years == year))

            month_data = obs_var.data[locs]

            # skip if no data
            if np.ma.count(month_data) == 0:
                continue

            month_flags = np.array(["" for i in range(month_data.shape[0])])

            # adjust bin widths according to reporting accuracy
            resolution = utils.reporting_accuracy(month_data)
            if resolution <= 0.5:
                bins = utils.create_bins(month_data, 0.5, obs_var.name)
            else:
                bins = utils.create_bins(month_data, 1.0, obs_var.name)
            hist, bin_edges = np.histogram(month_data, bins)

            # scan through the histogram:
            # check if a bin is the maximum of a local area ("ROLLING")
            for b, bar in enumerate(hist):
                if (b > ROLLING // 2) and (b <= (len(hist) - ROLLING // 2)):
                    target_bins = hist[b - (ROLLING // 2): b + (ROLLING // 2) + 1]

                    # if sufficient obs, the local maximum, and containing > 50% of the data
                    if bar >= utils.DATA_COUNT_THRESHOLD:
                        if bar == target_bins.max():
                            if (bar / target_bins.sum()) > RATIO:
                                # this bin meets all the criteria
                                if bins[b] in suspect_bins:
                                    # find observations (month & year) to flag!
                                    flag_locs = np.where(
                                        np.logical_and(month_data >= bins[b],
                                                       month_data < bins[b + 1]))
                                    month_flags[flag_locs] = "F"

            # copy flags for this year into the main array
            flags[locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            bad_hist = np.copy(hist)
            for b, bar in enumerate(bad_hist):
                if bins[b] not in suspect_bins:
                    bad_hist[b] = 0

            plt.step(bins[1:], bad_hist, color='r', where="pre")
            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Frequent Values {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # frequent_values
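# Illustrative sketch of the histogram scan above: a bin is suspect when it
# holds the local maximum of a ROLLING-bin window and more than RATIO of the
# window's observations.  Hypothetical counts, with ROLLING=7 and RATIO=0.5
# assumed for the demo.
def _demo_local_maximum_scan():
    import numpy as np
    ROLLING, RATIO = 7, 0.5
    hist = np.array([2, 3, 2, 3, 50, 2, 3, 2, 4])
    for b, bar in enumerate(hist):
        if (b > ROLLING // 2) and (b <= (len(hist) - ROLLING // 2)):
            window = hist[b - ROLLING // 2: b + ROLLING // 2 + 1]
            if bar == window.max() and (bar / window.sum()) > RATIO:
                print("suspect bin index", b)   # 4: the spike at 50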
def pressure_theory(sealp, stnlp, temperature, times, elevation,
                    plots=False, diagnostics=False):
    """
    Flag locations where the difference between recorded and calculated
    sea-level pressure falls outside of bounds

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param MetVar temperature: temperature object
    :param array times: datetime array
    :param float elevation: station elevation (m)
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(sealp.data.shape[0])])

    theoretical_value = calc_slp(stnlp.data, elevation, temperature.data)

    difference = sealp.data - theoretical_value

    bad_locs, = np.ma.where(np.ma.abs(difference) > THEORY_THRESHOLD)

    # diagnostic plots
    if plots:
        bins = np.arange(np.round(np.ma.min(difference)) - 1,
                         np.round(np.ma.max(difference)) + 1,
                         0.1)
        import matplotlib.pyplot as plt
        plt.clf()
        plt.hist(difference.compressed(), bins=bins)
        plt.axvline(x=THEORY_THRESHOLD, ls="--", c="r")
        plt.axvline(x=-THEORY_THRESHOLD, ls="--", c="r")
        plt.xlim([bins[0] - 1, bins[-1] + 1])
        plt.ylabel("Observations")
        plt.xlabel("Difference (hPa)")
        plt.show()

    if len(bad_locs) != 0:
        flags[bad_locs] = "p"
        if diagnostics:
            print("Pressure {}".format(stnlp.name))
            print("   Number of mismatches between recorded and theoretical SLPs {}".format(
                len(bad_locs)))
        if plots:
            for bad in bad_locs:
                plot_pressure(sealp, stnlp, times, bad)

    def adjust_preexisting_locs(var, flags):
        # may have flags already set by a previous part of the test;
        # find those locations and blank the new flags there so they aren't added again
        pre_exist = [i for i, item in enumerate(var.flags) if "p" in item]
        new_flags = np.copy(flags)  # copy (a slice would be a view and mutate flags)
        new_flags[pre_exist] = ""
        return new_flags

    # flag both, as it's not immediately clear where the issue lies
    stnlp.flags = utils.insert_flags(stnlp.flags, adjust_preexisting_locs(stnlp, flags))
    sealp.flags = utils.insert_flags(sealp.flags, adjust_preexisting_locs(sealp, flags))

    if diagnostics:
        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # pressure_theory
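# Illustrative sketch: calc_slp is defined elsewhere in this package.  A common
# station-to-sea-level reduction uses the hypsometric equation; the stand-in
# below is an assumption, not the package's actual formula (constants:
# g = 9.80665 m/s2, Rd = 287.05 J/kg/K).
def _demo_slp_reduction(stnlp_hpa, elevation_m, temperature_c):
    import numpy as np
    g, r_d = 9.80665, 287.05
    t_kelvin = temperature_c + 273.15
    # scale station pressure up to sea level through an isothermal column
    return stnlp_hpa * np.exp(g * elevation_m / (r_d * t_kelvin))
# e.g. _demo_slp_reduction(900.0, 1000.0, 15.0) -> roughly 1013 hPa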
def pressure_offset(sealp, stnlp, times, config_file, plots=False, diagnostics=False):
    """
    Flag locations where the difference between station and sea-level pressure
    falls outside of bounds

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(sealp.data.shape[0])])

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        try:
            average = float(utils.read_qc_config(config_file, "PRESSURE", "average"))
            spread = float(utils.read_qc_config(config_file, "PRESSURE", "spread"))
        except KeyError:
            print("Information missing in config file")
            average = utils.average(difference)
            spread = utils.spread(difference)
            # clamp the spread to within [MIN_SPREAD, MAX_SPREAD] hPa
            if spread < MIN_SPREAD:
                spread = MIN_SPREAD
            elif spread > MAX_SPREAD:
                spread = MAX_SPREAD

            utils.write_qc_config(config_file, "PRESSURE", "average",
                                  "{}".format(average), diagnostics=diagnostics)
            utils.write_qc_config(config_file, "PRESSURE", "spread",
                                  "{}".format(spread), diagnostics=diagnostics)

        if np.abs(np.ma.mean(difference) - np.ma.median(difference)) > THRESHOLD * spread:
            if diagnostics:
                print("Large difference between mean and median")
                print("Likely to have two populations of roughly equal size")
                print("Test won't work")
            pass

        else:
            high, = np.ma.where(difference > (average + (THRESHOLD * spread)))
            low, = np.ma.where(difference < (average - (THRESHOLD * spread)))

            # diagnostic plots
            if plots:
                bins = np.arange(np.round(difference.min()) - 1,
                                 np.round(difference.max()) + 1,
                                 0.1)
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(difference.compressed(), bins=bins)
                plt.axvline(x=(average + (THRESHOLD * spread)), ls="--", c="r")
                plt.axvline(x=(average - (THRESHOLD * spread)), ls="--", c="r")
                plt.xlim([bins[0] - 1, bins[-1] + 1])
                plt.ylabel("Observations")
                plt.xlabel("Difference (hPa)")
                plt.show()

            if len(high) != 0:
                flags[high] = "p"
                if diagnostics:
                    print("Pressure {}".format(stnlp.name))
                    print("   Number of high differences {}".format(len(high)))
                if plots:
                    for bad in high:
                        plot_pressure(sealp, stnlp, times, bad)

            if len(low) != 0:
                flags[low] = "p"
                if diagnostics:
                    print("   Number of low differences {}".format(len(low)))
                if plots:
                    for bad in low:
                        plot_pressure(sealp, stnlp, times, bad)

    # only flag the station level pressure
    stnlp.flags = utils.insert_flags(stnlp.flags, flags)

    if diagnostics:
        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # pressure_offset
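# Illustrative sketch of the mean-versus-median guard above: with two roughly
# equal sub-populations in the sealp - stnlp differences, the mean sits between
# them while the median sits inside one, so a large gap between the two signals
# that the threshold test would misfire.  Toy numbers; THRESHOLD=5 and the
# spread value are assumed for the demo.
def _demo_two_population_guard():
    import numpy as np
    THRESHOLD = 5.
    difference = np.ma.concatenate([np.ma.zeros(100) + 11.0,
                                    np.ma.zeros(90) + 0.5])
    spread = 0.2   # hypothetical robust spread from the config file
    gap = np.abs(np.ma.mean(difference) - np.ma.median(difference))
    print(gap > THRESHOLD * spread)   # True -> test skipped for this station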
def monthly_clim(obs_var, station, config_file, logfile="", plots=False,
                 diagnostics=False, winsorize=True):
    """
    Use the distribution of normalised anomalies from the monthly climatology
    to find climatological outliers beyond a gap.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: string for configuration file
    :param str logfile: string for log file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        month_locs, = np.where(station.months == month)

        # note these are for the whole record; just this month is unmasked
        normalised_anomalies = prepare_data(obs_var, station, month,
                                            diagnostics=diagnostics,
                                            winsorize=winsorize)

        if len(normalised_anomalies.compressed()) >= utils.DATA_COUNT_THRESHOLD:

            bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
            hist, bin_edges = np.histogram(normalised_anomalies.compressed(), bins)

            try:
                upper_threshold = float(
                    utils.read_qc_config(config_file,
                                         "CLIMATOLOGICAL-{}".format(obs_var.name),
                                         "{}-uthresh".format(month)))
                lower_threshold = float(
                    utils.read_qc_config(config_file,
                                         "CLIMATOLOGICAL-{}".format(obs_var.name),
                                         "{}-lthresh".format(month)))
            except KeyError:
                print("Information missing in config file")
                find_month_thresholds(obs_var, station, config_file, plots=plots,
                                      diagnostics=diagnostics)
                upper_threshold = float(
                    utils.read_qc_config(config_file,
                                         "CLIMATOLOGICAL-{}".format(obs_var.name),
                                         "{}-uthresh".format(month)))
                lower_threshold = float(
                    utils.read_qc_config(config_file,
                                         "CLIMATOLOGICAL-{}".format(obs_var.name),
                                         "{}-lthresh".format(month)))

            # now to find the gaps
            uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
            lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

            if uppercount > 0:
                gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

                if gap_start != 0:
                    bad_locs, = np.ma.where(normalised_anomalies > gap_start)
                    # all years for one month.
                    # normalised_anomalies are for the whole record; just this month is unmasked
                    flags[bad_locs] = "C"

            if lowercount > 0:
                gap_start = utils.find_gap(hist, bins, lower_threshold, GAP_SIZE,
                                           upwards=False)

                if gap_start != 0:
                    bad_locs, = np.ma.where(normalised_anomalies < gap_start)
                    # all years for one month
                    flags[bad_locs] = "C"

            # diagnostic plots
            if plots:
                import matplotlib.pyplot as plt
                plt.clf()
                plt.step(bins[1:], hist, color='k', where="pre")
                plt.yscale("log")

                plt.ylabel("Number of Observations")
                plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
                plt.title("{} - month {}".format(station.id, month))

                plt.ylim([0.1, max(hist) * 2])
                plt.axvline(upper_threshold, c="r")
                plt.axvline(lower_threshold, c="r")

                bad_locs, = np.where(flags[month_locs] == "C")
                bad_hist, dummy = np.histogram(
                    normalised_anomalies[month_locs][bad_locs], bins)
                plt.step(bins[1:], bad_hist, color='r', where="pre")

                plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Climatological {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # monthly_clim
def all_obs_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for the month and find secondary populations in the distribution.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var, station, month,
                                                config_file, full=False,
                                                diagnostics=diagnostics)

        if (len(normalised_anomalies.compressed()) == 1 and
                normalised_anomalies[0] == utils.MDI):
            # no data to work with for this month, move on.
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies, bins)

        try:
            upper_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-uthresh".format(month)))
            lower_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-lthresh".format(month)))
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var, station, config_file, plots=plots,
                            diagnostics=diagnostics)
            upper_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-uthresh".format(month)))
            lower_threshold = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-lthresh".format(month)))

        if upper_threshold == utils.MDI and lower_threshold == utils.MDI:
            # these weren't able to be calculated, move on
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            continue

        # now to find the gaps
        uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
        lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

        month_locs, = np.where(station.months == month)  # append should keep year order

        if uppercount > 0:
            gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies > gap_start)
                # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"
                flags[month_locs] = month_flags

        if lowercount > 0:
            gap_start = utils.find_gap(hist, bins, lower_threshold, GAP_SIZE,
                                       upwards=False)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies < gap_start)
                # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"

                # TODO - can this bit be refactored?
                # for pressure data, see if the flagged obs correspond with high winds;
                # could be a storm signal
                if obs_var.name in ["station_level_pressure", "sea_level_pressure"]:

                    wind_monthly_data = prepare_monthly_data(station.wind_speed,
                                                             station, month)
                    pressure_monthly_data = prepare_monthly_data(obs_var, station, month)

                    if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                            len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                        # need sufficient data for the storm check to work, else can't tell
                        pass
                    else:
                        wind_monthly_average = utils.average(wind_monthly_data)
                        wind_monthly_spread = utils.spread(wind_monthly_data)
                        pressure_monthly_average = utils.average(pressure_monthly_data)
                        pressure_monthly_spread = utils.spread(pressure_monthly_data)

                        # already a single calendar month, so go through each year
                        all_years = np.unique(station.years)
                        for year in all_years:

                            # what's best - extract only when necessary but repeatedly
                            # if so, or always, but once
                            this_year_locs = np.where(station.years[month_locs] == year)

                            if "d" not in month_flags[this_year_locs]:
                                # skip if you get the chance
                                continue

                            wind_data = station.wind_speed.data[month_locs][this_year_locs]
                            pressure_data = obs_var.data[month_locs][this_year_locs]

                            storms, = np.ma.where(np.logical_and(
                                (((wind_data - wind_monthly_average) /
                                  wind_monthly_spread) > STORM_THRESHOLD),
                                (((pressure_monthly_average - pressure_data) /
                                  pressure_monthly_spread) > STORM_THRESHOLD)))

                            # more than one entry - check if separate events
                            if len(storms) >= 2:
                                # find where separation is more than the usual obs separation
                                storm_1diffs = np.ma.diff(storms)
                                separations, = np.where(
                                    storm_1diffs > np.ma.median(np.ma.diff(wind_data)))

                                if len(separations) != 0:
                                    # multiple storm signals
                                    storm_start = 0
                                    storm_finish = separations[0] + 1
                                    first_storm = expand_around_storms(
                                        storms[storm_start:storm_finish], len(wind_data))
                                    final_storm_locs = copy.deepcopy(first_storm)

                                    for j in range(len(separations)):
                                        # then do the rest in a loop
                                        if j + 1 == len(separations):
                                            # final one
                                            this_storm = expand_around_storms(
                                                storms[separations[j] + 1:], len(wind_data))
                                        else:
                                            this_storm = expand_around_storms(
                                                storms[separations[j] + 1: separations[j + 1] + 1],
                                                len(wind_data))

                                        final_storm_locs = np.append(final_storm_locs,
                                                                     this_storm)

                                else:
                                    # locations separated at same interval as data
                                    final_storm_locs = expand_around_storms(
                                        storms, len(wind_data))

                            # single entry
                            elif len(storms) != 0:
                                # expand around the storm signal (rather than just
                                # unflagging what could be the peak and leaving the
                                # entry/exit flagged)
                                final_storm_locs = expand_around_storms(
                                    storms, len(wind_data))

                            # unset the flags
                            if len(storms) > 0:
                                # use a temporary copy: chained fancy indexing
                                # would not write back into month_flags
                                this_year_flags = month_flags[this_year_locs]
                                this_year_flags[final_storm_locs] = ""
                                month_flags[this_year_locs] = this_year_flags

                # having checked for storms, now store the final flags
                flags[month_locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")

            bad_locs, = np.where(flags[month_locs] == "d")
            bad_hist, dummy = np.histogram(normalised_anomalies[bad_locs], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Distribution (all) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # all_obs_gap
def diurnal_cycle_check(obs_var, station, config_file, plots=False, diagnostics=False,
                        best_fit_diurnal=None, best_fit_uncertainty=None):
    """
    Use the fitted offset to find days where the cycle doesn't match

    :param MetVar obs_var: Meteorological Variable object
    :param Station station: Station Object for the station
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    diurnal_offset = int(utils.read_qc_config(config_file,
                                              "DIURNAL-{}".format(obs_var.name),
                                              "peak"))
    hours = np.arange(24)
    hours = np.roll(hours, 11 - int(diurnal_offset))

    if diurnal_offset != MISSING:

        if (best_fit_diurnal is None) and (best_fit_uncertainty is None):
            best_fit_diurnal, best_fit_uncertainty = prepare_data(station, obs_var)

        # find locations where the overall best fit does not match the daily fit
        potentially_spurious = np.ones(best_fit_diurnal.shape[0]) * MISSING
        for d, (fit, uncertainty) in enumerate(zip(best_fit_diurnal,
                                                   best_fit_uncertainty)):
            if fit != MISSING:
                min_range = 11 - uncertainty
                max_range = 11 + uncertainty
                offset_loc, = np.where(hours == fit)

                # find where the best fit falls outside the range for this particular day
                if offset_loc < min_range or offset_loc > max_range:
                    potentially_spurious[d] = 1
                else:
                    potentially_spurious[d] = 0

        # now check there are sufficient issues in running 30-day periods
        """Any periods > 30 days where the diurnal cycle deviates from the
        expected phase by more than this uncertainty, without three consecutive
        good or missing days or six consecutive days consisting of a mix of
        only good or missing values, are deemed dubious and the entire period
        of data (including all non-temperature elements) is flagged"""
        n_good = 0
        n_miss = 0
        n_not_bad = 0
        total_points = 0
        total_not_miss = 0
        bad_locs = np.zeros(best_fit_diurnal.shape[0])

        for d in range(best_fit_diurnal.shape[0]):
            if potentially_spurious[d] == 1:
                # if bad, just add one
                n_good = 0
                n_miss = 0
                n_not_bad = 0
                total_points += 1
                total_not_miss += 1
            else:
                # found a non-bad value, so check the previous run,
                # if have reached limits on good/missing
                if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):
                    # sufficient good, missing or not-bad data
                    if total_points >= 30:
                        # if have collected enough others, then set flag
                        if float(total_not_miss) / total_points >= 0.5:
                            bad_locs[d - total_points: d] = 1

                    # reset counters
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points = 0
                    total_not_miss = 0

                # and deal with this point
                total_points += 1
                if potentially_spurious[d] == 0:
                    # if good
                    n_good += 1
                    n_not_bad += 1
                    if n_miss != 0:
                        n_miss = 0
                    total_not_miss += 1
                elif potentially_spurious[d] == -999:
                    # if missing data
                    n_miss += 1
                    n_not_bad += 1
                    if n_good != 0:
                        n_good = 0

        # run through all days:
        # find zero point of day counter in data preparation part
        day_counter_start = dt.datetime(np.unique(station.years)[0],
                                        np.unique(station.months)[0],
                                        np.unique(station.days)[0])

        # find the bad days in the times array
        # (bad_locs holds 0/1 per day, so take the indices of the set entries)
        for day in np.where(bad_locs == 1)[0]:
            this_day = day_counter_start + dt.timedelta(days=int(day))
            locs, = np.where(np.logical_and.reduce((station.years == this_day.year,
                                                    station.months == this_day.month,
                                                    station.days == this_day.day)))
            flags[locs] = "U"

        # append flags to object
        obs_var.flags = utils.insert_flags(obs_var.flags, flags)

        if diagnostics:
            print("Diurnal Check {}".format(obs_var.name))
            print("   Cumulative number of flags set: {}".format(
                len(np.where(flags != "")[0])))

    else:
        if diagnostics:
            print("Diurnal fit not found")

    return  # diurnal_cycle_check
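# Illustrative sketch of the hour re-indexing above: np.roll places the
# climatological peak hour at position 11, so a day whose fitted peak lands
# outside 11 +/- uncertainty is out of phase.  Hypothetical offset of 15h and
# a single hypothetical daily fit.
def _demo_hour_roll():
    import numpy as np
    diurnal_offset = 15
    hours = np.roll(np.arange(24), 11 - diurnal_offset)
    print(np.where(hours == diurnal_offset)[0][0])    # 11: the peak is centred
    fit, uncertainty = 18, 2                          # one day's fitted peak
    offset_loc = np.where(hours == fit)[0][0]
    print(not (11 - uncertainty <= offset_loc <= 11 + uncertainty))  # True -> spurious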
def flag_clusters(obs_var, station, plots=False, diagnostics=False):
    """
    Go through the clusters of data and flag if they meet the requirements

    :param MetVar obs_var: meteorological variable object
    :param Station station: Station Object for the station
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    time_differences = np.diff(station.times) / np.timedelta64(1, "m")

    potential_cluster_ends, = np.where(time_differences >= MIN_SEPARATION * 60)

    # TODO - need explicit checks for start and end of timeseries
    for ce, cluster_end in enumerate(potential_cluster_ends):

        if ce == 0:
            # check if cluster at start of series (long gap after the first few points)
            cluster_length = station.times.iloc[cluster_end] - station.times.iloc[0]

            if cluster_length.asm8 / np.timedelta64(1, "h") < MAX_LENGTH_TIME:
                # could be a cluster
                if len(flags[:cluster_end + 1]) < MAX_LENGTH_OBS:
                    flags[:cluster_end + 1] = "o"
                    if plots:
                        plot_cluster(station, obs_var, 0, cluster_end + 1)

        elif ce == len(potential_cluster_ends) - 1:
            # check if cluster at end of series (long gap before the last few points)
            cluster_length = station.times.iloc[-1] - station.times.iloc[cluster_end + 1]
            # add one to find cluster start!

            if cluster_length.asm8 / np.timedelta64(1, "h") < MAX_LENGTH_TIME:
                # could be a cluster
                if len(flags[cluster_end + 1:]) < MAX_LENGTH_OBS:
                    flags[cluster_end + 1:] = "o"
                    if plots:
                        plot_cluster(station, obs_var, cluster_end + 1, -1)

        if ce > 0:
            # check for cluster within the series:
            # use previous gap > MIN_SEPARATION to define cluster and check length
            cluster_length = station.times.iloc[cluster_end] - \
                station.times.iloc[potential_cluster_ends[ce - 1] + 1]
            # add one to find cluster start!

            if cluster_length.asm8 / np.timedelta64(1, "h") < MAX_LENGTH_TIME:
                # could be a cluster
                if len(flags[potential_cluster_ends[ce - 1] + 1: cluster_end + 1]) < MAX_LENGTH_OBS:
                    flags[potential_cluster_ends[ce - 1] + 1: cluster_end + 1] = "o"
                    if plots:
                        plot_cluster(station, obs_var,
                                     potential_cluster_ends[ce - 1] + 1, cluster_end + 1)

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Odd Cluster {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # flag_clusters
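# Illustrative sketch of the gap test above: cluster boundaries are timestamps
# followed by at least MIN_SEPARATION hours without data.  Toy series with an
# assumed MIN_SEPARATION of 6 hours; the real constants live at module level.
def _demo_cluster_ends():
    import numpy as np
    times = np.array(["2000-01-01T00", "2000-01-01T01", "2000-01-01T02",
                      "2000-01-02T00", "2000-01-02T01"], dtype="datetime64[h]")
    MIN_SEPARATION = 6
    time_differences = np.diff(times) / np.timedelta64(1, "m")
    ends, = np.where(time_differences >= MIN_SEPARATION * 60)
    print(ends)   # [2] -> index 2 closes a potential cluster of 3 observations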