import copy
import math

import numpy as np

# NOTE: `utils`, `prepare_data_repeating_string` and `sc_diagnostics_and_plots`
# are assumed to be provided by the surrounding QC package; they are not
# defined in this excerpt.


def get_repeating_string_threshold(obs_var, config_file, plots=False, diagnostics=False):
    """
    Use distribution to determine threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # mask calm periods (as these could be a reasonable string)
    this_var = copy.deepcopy(obs_var)
    if obs_var.name == "wind_speed":
        calms, = np.ma.where(this_var.data == 0)
        this_var.data[calms] = utils.MDI
        this_var.data.mask[calms] = True

    # only process further if there is enough data
    if len(this_var.data.compressed()) > 1:

        repeated_string_lengths, grouped_diffs, strings = prepare_data_repeating_string(this_var, plots=plots, diagnostics=diagnostics)

        # bin width is 1 as dealing in time index
        # minimum bin value is 2 as this is the shortest string possible
        threshold = utils.get_critical_values(repeated_string_lengths, binmin=2, binwidth=1.0,
                                              plots=plots, diagnostics=diagnostics,
                                              title=this_var.name.capitalize(),
                                              xlabel="Repeating string length")

        # write out the thresholds...
        utils.write_qc_config(config_file, "STREAK-{}".format(this_var.name),
                              "Straight", "{}".format(threshold), diagnostics=diagnostics)

    else:
        # store high value so threshold never reached
        utils.write_qc_config(config_file, "STREAK-{}".format(this_var.name),
                              "Straight", "{}".format(-utils.MDI), diagnostics=diagnostics)

    return  # get_repeating_string_threshold
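
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the helper
# `prepare_data_repeating_string` used above is not shown in this excerpt,
# but the repeated-string lengths it supplies can be thought of as run
# lengths of identical consecutive values.  A minimal, self-contained
# version of that idea, assuming plain unmasked input values:
# ---------------------------------------------------------------------------
import itertools


def _example_repeated_run_lengths(values):
    """Lengths of runs of identical consecutive values (runs of 2 or more)."""
    # group consecutive equal values and keep the length of each group
    runs = [len(list(group)) for _, group in itertools.groupby(values)]
    # a "string" needs at least 2 identical values, matching binmin=2 above
    return [r for r in runs if r >= 2]

# _example_repeated_run_lengths([3.0, 3.0, 3.0, 2.5, 4.0, 4.0]) -> [3, 2]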
def rsc_get_straight_string_threshold(st_var, start, end, reporting=0., diagnostics=False, plots=False, doMonth=False, old_threshold=0):
    '''
    Derive threshold number for strings/streaks of repeating values

    :param object st_var: station variable object
    :param datetime start: start of data
    :param datetime end: end of data
    :param float reporting: reporting accuracy
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param bool doMonth: account for incomplete months
    :param float old_threshold: old threshold to use as comparison
    '''

    all_filtered = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)

    # find and count the length of all repeating strings
    prev_value = st_var.mdi
    this_string = []
    string_lengths = []

    # run through all obs, the inefficient (non-pythonic) way
    for o, obs in enumerate(all_filtered):
        if all_filtered.mask[o] == False:

            if obs != prev_value:
                # if different value to before
                string_lengths += [len(this_string)]
                this_string = [o]
            else:
                # if same value as before, note and continue
                this_string += [o]

            prev_value = obs

    if plots:
        title = "Straight String Distribution"
        line_label = st_var.name
        xlabel = "String length"
    else:
        title, line_label, xlabel = "", "", ""

    threshold = utils.get_critical_values(string_lengths, binmin=1, binwidth=1,
                                          plots=plots, diagnostics=diagnostics,
                                          title=title, line_label=line_label, xlabel=xlabel,
                                          old_threshold=old_threshold)

    if diagnostics:
        print("threshold {}".format(threshold))

    return threshold  # rsc_get_straight_string_threshold
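
# ---------------------------------------------------------------------------
# Hedged aside (not in the original code): the loop above describes itself as
# "the inefficient (non-pythonic) way".  A vectorised equivalent is sketched
# below, under the assumption that run lengths of consecutive equal values
# (over the unmasked observations only) are what is wanted:
# ---------------------------------------------------------------------------
def _example_run_lengths_vectorised(values):
    """Run lengths of consecutive equal values, without an explicit loop."""
    values = np.asarray(values)
    if values.size == 0:
        return np.array([], dtype=int)
    # indices where the value changes mark run boundaries
    change_points, = np.where(np.diff(values) != 0)
    # prepend the start and append the end, then lengths are the gaps between
    boundaries = np.concatenate(([0], change_points + 1, [values.size]))
    return np.diff(boundaries)

# _example_run_lengths_vectorised([1., 1., 2., 2., 2., 3.]) -> array([2, 3, 1])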
def rsc_get_straight_string_threshold(st_var, start, end, reporting=0., diagnostics=False, plots=False, old_threshold=0):
    '''
    Derive threshold number for strings/streaks of repeating values

    :param object st_var: station variable object
    :param datetime start: start of data
    :param datetime end: end of data
    :param float reporting: reporting accuracy
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param float old_threshold: old threshold to use as comparison
    '''

    all_filtered = utils.apply_filter_flags(st_var)

    # find and count the length of all repeating strings
    prev_value = st_var.mdi
    this_string = []
    string_lengths = []

    # run through all obs, the inefficient (non-pythonic) way
    for o, obs in enumerate(all_filtered):
        if all_filtered.mask[o] == False:

            if obs != prev_value:
                # if different value to before
                string_lengths += [len(this_string)]
                this_string = [o]
            else:
                # if same value as before, note and continue
                this_string += [o]

            prev_value = obs

    if plots:
        title = "Straight String Distribution"
        line_label = st_var.name
        xlabel = "String length"
    else:
        title, line_label, xlabel = "", "", ""

    threshold = utils.get_critical_values(string_lengths, binmin=1, binwidth=1,
                                          plots=plots, diagnostics=diagnostics,
                                          title=title, line_label=line_label, xlabel=xlabel,
                                          old_threshold=old_threshold)

    return threshold  # rsc_get_straight_string_threshold
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param bool doMonth: account for incomplete months

    :returns:
    '''
    # TODO: refactor

    for v, variable in enumerate(variable_list):

        flags = station.qc_flags[:, flag_col[v]]

        st_var = getattr(station, variable)

        # if incomplete year, mask all obs for the incomplete bit
        all_filtered = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1:
            reporting_resolution = 1

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        good, = np.where(all_filtered.mask == False)

        full_time_diffs = np.ma.zeros(len(all_filtered), dtype=int)
        full_time_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_time_diffs[good[:-1]] = station.time.data[good[1:]] - station.time.data[good[:-1]]

        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        # TODO: sort the differencing if values were flagged rather than missing

        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_filtered_diffs[good[:-1]] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1]

        # test all values
        good_to_uncompress, = np.where(st_var.data.mask == False)

        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_value_diffs[good_to_uncompress[:-1]] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        time_diffs = full_time_diffs.compressed()
        filtered_diffs = full_filtered_diffs.compressed()
        flags = flags[good_to_uncompress]

        critical_values = np.zeros([9, 12])
        critical_values.fill(st_var.mdi)

        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape, dtype=int)

        for month in range(12):
            for year in range(month_ranges.shape[0]):

                if year == 0:
                    this_month_time_diff = full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                    this_month_filtered_diff = full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                else:
                    this_month_time_diff = np.ma.concatenate([this_month_time_diff,
                                                              full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])
                    this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff,
                                                                  full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])

                month_locs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] = month

            for delta in range(1, 9):

                locs = np.ma.where(this_month_time_diff == delta)

                if len(locs[0]) >= 100:

                    iqr = utils.IQR(this_month_filtered_diff[locs])

                    if iqr == 0. and delta == 1:
                        critical_values[delta - 1, month] = 6.
                    elif iqr == 0:
                        critical_values[delta - 1, month] = st_var.mdi
                    else:
                        critical_values[delta - 1, month] = 6. * iqr

                    # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD
                    if plots:
                        import calendar
                        title = "{}, {}-hr differences".format(calendar.month_name[month + 1], delta)
                        line_label = st_var.name
                        xlabel = "First Difference Magnitudes"
                    else:
                        title, line_label, xlabel = "", "", ""

                    threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin=0, binwidth=0.5,
                                                          plots=plots, diagnostics=diagnostics,
                                                          title=title, line_label=line_label, xlabel=xlabel,
                                                          old_threshold=critical_values[delta - 1, month])

                    if threshold < critical_values[delta - 1, month]:
                        critical_values[delta - 1, month] = threshold

                    if plots or diagnostics:
                        print(critical_values[delta - 1, month], iqr, 6 * iqr)

        month_locs = month_locs[good_to_uncompress]
        if diagnostics:
            print(critical_values[0, :])

        # not less than 5x reporting accuracy
        good_critical_values = np.where(critical_values != st_var.mdi)
        low_critical_values = np.where(critical_values[good_critical_values] <= 5. * reporting_resolution)
        temporary = critical_values[good_critical_values]
        temporary[low_critical_values] = 5. * reporting_resolution
        critical_values[good_critical_values] = temporary

        if diagnostics:
            print(critical_values[0, :], 5. * reporting_resolution)

        # check hourly against 2 hourly, if < 2/3 then increase to avoid crazy rejection rate
        for month in range(12):
            if critical_values[0, month] != st_var.mdi and critical_values[1, month] != st_var.mdi:
                if critical_values[0, month] / critical_values[1, month] <= 0.66:
                    critical_values[0, month] = 0.66 * critical_values[1, month]

        if diagnostics:
            print("critical values")
            print(critical_values[0, :])

        # get time differences for unfiltered data
        full_time_diffs = np.ma.zeros(len(st_var.data), dtype=int)
        full_time_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_time_diffs[good_to_uncompress[:-1]] = station.time.data[good_to_uncompress[1:]] - station.time.data[good_to_uncompress[:-1]]
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds

        # spikes at the beginning or ends of sections
        for t in np.arange(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3):
                # 10 days before but short gap thereafter

                next_values = st_var.data[good_to_uncompress[t + 1:]]
                good, = np.where(next_values.mask == False)

                next_median = np.ma.median(next_values[good[:10]])

                next_diff = np.abs(value_diffs[t])  # out of spike
                median_diff = np.abs(next_median - st_var.data[good_to_uncompress[t]])  # to the remaining ones

                if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi):

                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and\
                       (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t],
                                                     good_to_uncompress[t + 1], start, variable, plots=plots)

            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240):
                # 10 days after but short gap before

                prev_values = st_var.data[good_to_uncompress[:t - 1]]
                good, = np.where(prev_values.mask == False)

                prev_median = np.ma.median(prev_values[good[-10:]])

                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[t]])

                if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi):

                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and\
                       (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t],
                                                     good_to_uncompress[t + 1], start, variable, plots=plots)

        '''
        this isn't the nicest way, but a direct copy from IDL
        masked arrays might help remove some of the lines

        Also, this is relatively slow
        '''
        for t in np.arange(len(time_diffs)):
            for spk_len in [1, 2, 3]:
                if t >= spk_len and t < len(time_diffs) - spk_len:

                    # check if time differences are appropriate, for multi-point spikes
                    if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and\
                       (np.abs(time_diffs[t]) <= spk_len * 3) and\
                       (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and\
                       (time_diffs[t + 1] - 1 < spk_len * 3) and\
                       ((spk_len == 1) or
                        ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or
                        ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and
                         (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))):

                        # check if differences are valid
                        if (value_diffs[t - spk_len] != st_var.mdi) and\
                           (value_diffs[t - spk_len] != st_var.fdi) and\
                           (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi):

                            # if exceed critical values
                            if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]):

                                # are signs of two differences different
                                if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])):

                                    # are within-spike differences small
                                    if (spk_len == 1) or\
                                       ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.)) or\
                                       ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.) and
                                        (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] - 1, month_locs[t]] / 2.)):

                                        # check if following value is valid
                                        if (value_diffs[t] != st_var.mdi) and\
                                           (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and\
                                           (value_diffs[t] != st_var.fdi):

                                            # and if at least critical value
                                            if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]):

                                                # test if surrounding differences below 1/2 critical value
                                                if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.):
                                                    if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.):

                                                        # set the flags
                                                        flags[t - spk_len + 1:t + 1] = 1

                                                        if plots or diagnostics:
                                                            sc_diagnostics_and_plots(station.time.data, st_var.data,
                                                                                     good_to_uncompress[t - spk_len + 1],
                                                                                     good_to_uncompress[t + 1],
                                                                                     start, variable, plots=plots)

        station.qc_flags[good_to_uncompress, flag_col[v]] = flags

        flag_locs, = np.where(station.qc_flags[:, flag_col[v]] != 0)

        utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs), noWrite=diagnostics)  # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        do_interactive = False
        if plots and do_interactive:
            import matplotlib.pyplot as plt

            plot_times = utils.times_hours_to_datetime(station.time.data, start)

            plt.clf()
            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            flg = np.where(station.qc_flags[:, flag_col[v]] == 1)  # use the full flag array; `flags` was compressed above
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)
            plt.show()

    station = utils.append_history(station, "Spike Check")

    return  # sc
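
# ---------------------------------------------------------------------------
# Illustrative distillation (a simplification with assumptions, not the
# function above): the per-month, per-gap critical value in sc() is in
# essence 6 * IQR of the first differences, floored at 5 * the reporting
# resolution.  A scalar sketch of that rule:
# ---------------------------------------------------------------------------
def _example_spike_threshold(first_diffs, reporting_resolution, mdi=-1.e30):
    """Simplified, scalar version of the critical-value rule in sc()."""
    q75, q25 = np.percentile(first_diffs, [75, 25])
    iqr = q75 - q25
    if iqr == 0.:
        return mdi  # flat distribution: no usable threshold
    # 6 * IQR first guess, never below 5x the reporting resolution
    return max(6. * iqr, 5. * reporting_resolution)

# _example_spike_threshold([-0.5, -0.2, 0.0, 0.1, 0.4, 0.6], 0.1) -> 2.85 (6 * IQR)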
def get_critical_values(obs_var, times, config_file, plots=False, diagnostics=False):
    """
    Use distribution to determine critical values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # use all first differences
    # TODO monthly?
    masked_times = np.ma.masked_array(times, mask=obs_var.data.mask)

    time_diffs = np.ma.diff(masked_times) / np.timedelta64(1, "m")  # presuming minutes
    value_diffs = np.ma.diff(obs_var.data)

    # get thresholds for each unique time difference
    unique_diffs = np.unique(time_diffs.compressed())

    for t_diff in unique_diffs:
        if t_diff == 0:
            # not a spike or jump, but 2 values at the same time.
            # should be zero value difference, so fitting a histogram is not going to work;
            # handled in a separate test
            continue

        locs, = np.where(time_diffs == t_diff)

        first_differences = value_diffs[locs]

        # ensure sufficient non-masked observations
        if len(first_differences.compressed()) >= utils.DATA_COUNT_THRESHOLD:

            # fit decay curve to one-sided distribution
            c_value = utils.get_critical_values(first_differences.compressed(), binmin=0, binwidth=0.5,
                                                plots=plots, diagnostics=diagnostics,
                                                xlabel="First differences",
                                                title="Spike - {} - {}m".format(obs_var.name.capitalize(), t_diff))

            # write out the thresholds...
            utils.write_qc_config(config_file, "SPIKE-{}".format(obs_var.name),
                                  "{}".format(t_diff), "{}".format(c_value), diagnostics=diagnostics)

            if diagnostics:
                print("   Time Difference: {} minutes".format(t_diff))
                print("   Number of obs: {}, threshold: {}".format(len(first_differences.compressed()), c_value))

        else:
            if diagnostics:
                print("   Time Difference: {} minutes".format(t_diff))
                print("   Number of obs insufficient: {} < {}".format(len(first_differences.compressed()), utils.DATA_COUNT_THRESHOLD))

    return  # get_critical_values
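
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical inputs, not from the original module): the loop
# above pools first differences separately for each distinct sampling gap,
# since a jump across 60 minutes and one across 180 minutes have different
# natural scales.  The grouping step in isolation:
# ---------------------------------------------------------------------------
def _example_group_diffs_by_gap():
    times = np.array([0, 60, 120, 300, 360], dtype="timedelta64[m]")
    values = np.ma.masked_array([10.0, 10.5, 9.8, 12.0, 11.5])

    time_diffs = np.diff(times) / np.timedelta64(1, "m")
    value_diffs = np.ma.diff(values)

    for t_diff in np.unique(time_diffs):
        locs, = np.where(time_diffs == t_diff)
        print(t_diff, value_diffs[locs])  # e.g. 60.0 -> [0.5, -0.7, -0.5]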
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, second=False):
    '''
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: do diagnostic output
    :param bool plots: do plots
    :param bool second: run for second time

    :returns:
    '''
    # TODO: refactor

    for v, variable in enumerate(variable_list):

        flags = station.qc_flags[:, flag_col[v]]

        prev_flag_number = 0
        if second:
            # count currently existing flags:
            prev_flag_number = len(flags[flags != 0])

        st_var = getattr(station, variable)

        all_filtered = utils.apply_filter_flags(st_var)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1:
            reporting_resolution = 1

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        good = np.where(all_filtered.mask == False)

        full_time_diffs = np.ma.zeros(len(all_filtered), dtype=int)
        full_time_diffs.mask = all_filtered.mask
        # diff between consecutive good obs, stored at the first of each pair
        full_time_diffs[good[0][:-1]] = station.time.data[good[0][1:]] - station.time.data[good[0][:-1]]

        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        # TODO: sort the differencing if values were flagged rather than missing

        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = all_filtered.mask
        full_filtered_diffs[good[0][:-1]] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1]

        # test all values
        good_to_uncompress = np.where(st_var.data.mask == False)

        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = st_var.data.mask
        full_value_diffs[good_to_uncompress[0][:-1]] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        time_diffs = full_time_diffs.compressed()
        filtered_diffs = full_filtered_diffs.compressed()
        flags = flags[good_to_uncompress]

        critical_values = np.zeros([9, 12])
        critical_values.fill(st_var.mdi)

        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape, dtype=int)

        for month in range(12):
            for year in range(month_ranges.shape[0]):

                if year == 0:
                    this_month_time_diff = full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                    this_month_filtered_diff = full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                else:
                    this_month_time_diff = np.ma.concatenate([this_month_time_diff,
                                                              full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])
                    this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff,
                                                                  full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])

                month_locs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] = month

            for delta in range(1, 9):

                locs = np.ma.where(this_month_time_diff == delta)

                if len(locs[0]) >= 100:

                    iqr = utils.IQR(this_month_filtered_diff[locs])

                    if iqr == 0. and delta == 1:
                        critical_values[delta - 1, month] = 6.
                    elif iqr == 0:
                        critical_values[delta - 1, month] = st_var.mdi
                    else:
                        critical_values[delta - 1, month] = 6. * iqr

                    # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD
                    if plots:
                        import calendar
                        title = "{}, {}-hr differences".format(calendar.month_name[month + 1], delta)
                        line_label = st_var.name
                        xlabel = "First Difference Magnitudes"
                    else:
                        title, line_label, xlabel = "", "", ""

                    threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin=0, binwidth=0.5,
                                                          plots=plots, diagnostics=diagnostics,
                                                          title=title, line_label=line_label, xlabel=xlabel,
                                                          old_threshold=critical_values[delta - 1, month])

                    if threshold < critical_values[delta - 1, month]:
                        critical_values[delta - 1, month] = threshold

                    if plots or diagnostics:
                        print(critical_values[delta - 1, month], iqr, 6 * iqr)

        month_locs = month_locs[good_to_uncompress]
        if diagnostics:
            print(critical_values[0, :])

        # not less than 5x reporting accuracy
        good_critical_values = np.where(critical_values != st_var.mdi)
        low_critical_values = np.where(critical_values[good_critical_values] <= 5. * reporting_resolution)
        temporary = critical_values[good_critical_values]
        temporary[low_critical_values] = 5. * reporting_resolution
        critical_values[good_critical_values] = temporary

        if diagnostics:
            print(critical_values[0, :], 5. * reporting_resolution)

        # check hourly against 2 hourly, if < 2/3 then increase to avoid crazy rejection rate
        for month in range(12):
            if critical_values[0, month] != st_var.mdi and critical_values[1, month] != st_var.mdi:
                if critical_values[0, month] / critical_values[1, month] <= 0.66:
                    critical_values[0, month] = 0.66 * critical_values[1, month]

        if diagnostics:
            print(critical_values[0, :])

        # get time differences for unfiltered data
        full_time_diffs = np.ma.zeros(len(st_var.data), dtype=int)
        full_time_diffs.mask = st_var.data.mask
        full_time_diffs[good_to_uncompress[0][:-1]] = station.time.data[good_to_uncompress[0][1:]] - station.time.data[good_to_uncompress[0][:-1]]
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds

        # spikes at the beginning or ends of sections
        for t in np.arange(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3):
                # 10 days before but short gap thereafter

                next_values = st_var.data[good_to_uncompress[0][t + 1:]]
                good, = np.where(next_values.mask == False)

                next_median = np.ma.median(next_values[good[:10]])

                next_diff = np.abs(value_diffs[t])  # out of spike
                median_diff = np.abs(next_median - st_var.data[good_to_uncompress[0][t]])  # to the remaining ones

                if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi):

                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and\
                       (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t],
                                                     good_to_uncompress[0][t + 1], start, variable, plots=plots)

            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240):
                # 10 days after but short gap before

                prev_values = st_var.data[good_to_uncompress[0][:t - 1]]
                good, = np.where(prev_values.mask == False)

                prev_median = np.ma.median(prev_values[good[-10:]])

                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[0][t]])

                if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi):

                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and\
                       (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t],
                                                     good_to_uncompress[0][t + 1], start, variable, plots=plots)

        '''
        this isn't the nicest way, but a direct copy from IDL
        masked arrays might help remove some of the lines

        Also, this is relatively slow
        '''
        for t in np.arange(len(time_diffs)):
            for spk_len in [1, 2, 3]:
                if t >= spk_len and t < len(time_diffs) - spk_len:

                    # check if time differences are appropriate, for multi-point spikes
                    if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and\
                       (np.abs(time_diffs[t]) <= spk_len * 3) and\
                       (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and\
                       (time_diffs[t + 1] - 1 < spk_len * 3) and\
                       ((spk_len == 1) or
                        ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or
                        ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and
                         (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))):

                        # check if differences are valid
                        if (value_diffs[t - spk_len] != st_var.mdi) and\
                           (value_diffs[t - spk_len] != st_var.fdi) and\
                           (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi):

                            # if exceed critical values
                            if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]):

                                # are signs of two differences different
                                if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])):

                                    # are within-spike differences small
                                    if (spk_len == 1) or\
                                       ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.)) or\
                                       ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.) and
                                        (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] - 1, month_locs[t]] / 2.)):

                                        # check if following value is valid
                                        if (value_diffs[t] != st_var.mdi) and\
                                           (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and\
                                           (value_diffs[t] != st_var.fdi):

                                            # and if at least critical value
                                            if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]):

                                                # test if surrounding differences below 1/2 critical value
                                                if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.):
                                                    if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.):

                                                        # set the flags
                                                        flags[t - spk_len + 1:t + 1] = 1

                                                        if plots or diagnostics:
                                                            sc_diagnostics_and_plots(station.time.data, st_var.data,
                                                                                     good_to_uncompress[0][t - spk_len + 1],
                                                                                     good_to_uncompress[0][t + 1],
                                                                                     start, variable, plots=plots)

        station.qc_flags[good_to_uncompress, flag_col[v]] = flags

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number, noWrite=True)  # additional flags
        else:
            utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number)  # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        do_interactive = False
        if plots and do_interactive:
            import matplotlib.pyplot as plt

            plot_times = utils.times_hours_to_datetime(station.time.data, start)

            plt.clf()
            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            flg = np.where(station.qc_flags[:, flag_col[v]] == 1)  # use the full flag array; `flags` was compressed above
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)
            plt.show()

    station = utils.append_history(station, "Spike Check")

    return  # sc
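
# ---------------------------------------------------------------------------
# Hedged sketch (a simplification, not either implementation above): the core
# single-point spike condition in both versions of sc() is a jump above the
# critical value, followed by a jump of opposite sign that also exceeds it,
# with quiet first differences on either side:
# ---------------------------------------------------------------------------
def _example_is_single_point_spike(diff_in, diff_out, diff_before, diff_after, crit):
    """True if the four neighbouring first differences describe a 1-ob spike."""
    jumps_large = abs(diff_in) >= crit and abs(diff_out) >= crit
    opposite_signs = math.copysign(1, diff_in) != math.copysign(1, diff_out)
    quiet_surroundings = abs(diff_before) <= crit / 2. and abs(diff_after) <= crit / 2.
    return jumps_large and opposite_signs and quiet_surroundings

# _example_is_single_point_spike(8.0, -7.5, 0.3, 0.2, 5.0) -> True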