def rsc(station, var_list, flag_col, start, end, logfile, diagnostics = False, plots = False, doMonth = False):
    ''' Wrapper for the four individual repeating streak check tests

    Runs the straight-string and hourly-repeat streak tests on each variable,
    writes the results into three QC-flag columns per variable, and mirrors
    any non-zero flag into the variable's own flag attribute.

    :param object station: station object holding data and qc_flags
    :param list var_list: observational variables to process
    :param list flag_col: per-variable triple of qc_flag array columns
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: extra diagnostic output (suppresses logfile write)
    :param bool plots: do plots
    :param bool doMonth: account for incomplete final months when filtering

    NOTE(review): a second ``def rsc`` (without ``doMonth``) appears later in
    this file and will shadow this definition at import time — confirm which
    version is intended to be live.
    '''
    times = station.time.data
    for v, variable in enumerate(var_list):
        st_var = getattr(station, variable)

        # only process variables that still have unflagged observations
        if len(utils.apply_filter_flags(st_var).compressed()) > 0:

            # wind speed and direction get special treatment downstream
            wind = False
            if variable == "windspeeds": wind = True
            winddir= False
            if variable == "winddirs": winddir = True

            # reporting resolution selects which streak-length limits apply
            reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var, doMonth = doMonth, start = start, end = end), winddir = winddir, plots = plots)
            limits = limits_dict[variable][reporting_resolution]

            # need to apply flags to st_var.flags each time for filtering
            station.qc_flags[:,flag_col[v][0]] = rsc_straight_strings(st_var, times, limits[0], limits[1], start, end, reporting = reporting_resolution, wind = wind, diagnostics = diagnostics, plots = plots, dynamic = True, doMonth = doMonth)

            # no effect of final incomplete year ("month" option) as limits[2] and limits[3] fixed
            station.qc_flags[:, flag_col[v][1]], station.qc_flags[:, flag_col[v][2]] = rsc_hourly_repeats(st_var, times, limits[2], limits[3], diagnostics = diagnostics, plots = plots)

            # report counts for each of the three streak flag columns
            for streak_type in range(3):
                flag_locs = np.where(station.qc_flags[:, flag_col[v][streak_type]] != 0)
                utils.print_flagged_obs_number(logfile, "Streak Check", variable, len(flag_locs[0]), noWrite = diagnostics)

                # copy flags into attribute
                st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Streak Check")

    return
def rsc(station, var_list, flag_col, start, end, logfile, diagnostics = False, plots = False):
    ''' Wrapper for the four individual repeating streak check tests

    For every variable with unflagged data: pick streak-length limits from the
    reporting resolution, run the straight-string and hourly-repeat tests into
    three QC-flag columns, log the counts and mirror flags onto the variable.
    '''
    times = station.time.data

    for ivar, var_name in enumerate(var_list):

        obs_var = getattr(station, var_name)

        # nothing usable for this variable -> skip straight to the next one
        if len(utils.apply_filter_flags(obs_var).compressed()) == 0:
            continue

        resolution = utils.reporting_accuracy(utils.apply_filter_flags(obs_var))
        these_limits = limits_dict[var_name][resolution]

        # wind speeds are handled specially by the straight-string test
        is_wind = (var_name == "windspeeds")

        # need to apply flags to obs_var.flags each time for filtering
        station.qc_flags[:, flag_col[ivar][0]] = rsc_straight_strings(obs_var, times, these_limits[0], these_limits[1], start, end, reporting = resolution, wind = is_wind, diagnostics = diagnostics, plots = plots, dynamic = True)

        station.qc_flags[:, flag_col[ivar][1]], station.qc_flags[:, flag_col[ivar][2]] = rsc_hourly_repeats(obs_var, times, these_limits[2], these_limits[3], diagnostics = diagnostics, plots = plots)

        # summarise each of the three streak flag columns in turn
        for streak_type in range(3):

            flag_locs = np.where(station.qc_flags[:, flag_col[ivar][streak_type]] != 0)

            if not (plots or diagnostics):
                utils.print_flagged_obs_number(logfile, "Streak Check", var_name, len(flag_locs[0]))
            else:
                utils.print_flagged_obs_number(logfile, "Streak Check", var_name, len(flag_locs[0]), noWrite = True)

            # copy flags into attribute
            obs_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Streak Check")

    return
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Spike Check, looks for spikes up to 3 observations long, using thresholds
    calculated from the data itself.

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    :param bool doMonth: account for incomplete months
    :returns:
    '''
    # NOTE(review): stray development print — looks like leftover debug output
    print "refactor"

    for v, variable in enumerate(variable_list):

        flags = station.qc_flags[:, flag_col[v]]

        st_var = getattr(station, variable)

        # if incomplete year, mask all obs for the incomplete bit
        all_filtered = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        # to match IDL system - should never be called as would mean no data
        if reporting_resolution == -1: reporting_resolution = 1

        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        good, = np.where(all_filtered.mask == False)

        # time differences between consecutive unmasked (filtered) observations
        full_time_diffs = np.ma.zeros(len(all_filtered), dtype=int)
        full_time_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_time_diffs[good[:-1]] = station.time.data[good[1:]] - station.time.data[good[:-1]]

        # develop critical values using clean values
        # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately
        # NOTE(review): another leftover development print
        print "sort the differencing if values were flagged rather than missing"

        # first differences of the filtered data, stored at unmasked positions
        full_filtered_diffs = np.ma.zeros(len(all_filtered))
        full_filtered_diffs.mask = copy.deepcopy(all_filtered.mask[:])
        full_filtered_diffs[good[:-1]] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1]

        # test all values
        good_to_uncompress, = np.where(st_var.data.mask == False)

        # first differences of the raw (unfiltered) data
        full_value_diffs = np.ma.zeros(len(st_var.data))
        full_value_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_value_diffs[good_to_uncompress[:-1]] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1]

        # convert to compressed time to match IDL
        value_diffs = full_value_diffs.compressed()
        time_diffs = full_time_diffs.compressed()
        # NOTE(review): filtered_diffs does not appear to be read again below — confirm
        filtered_diffs = full_filtered_diffs.compressed()
        flags = flags[good_to_uncompress]

        # critical values: one per time-gap delta (1..8h) per calendar month
        critical_values = np.zeros([9, 12])
        critical_values.fill(st_var.mdi)

        # link observation to calendar month
        month_locs = np.zeros(full_time_diffs.shape, dtype=int)

        for month in range(12):
            # gather this calendar month's data across all years
            for year in range(month_ranges.shape[0]):
                if year == 0:
                    this_month_time_diff = full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                    this_month_filtered_diff = full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]
                else:
                    this_month_time_diff = np.ma.concatenate([this_month_time_diff, full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])
                    this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff, full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]]])

                month_locs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] = month

            # derive a critical value for each time-gap length (1-8 hours)
            for delta in range(1, 9):

                locs = np.ma.where(this_month_time_diff == delta)

                if len(locs[0]) >= 100:

                    iqr = utils.IQR(this_month_filtered_diff[locs])

                    if iqr == 0. and delta == 1:
                        critical_values[delta - 1, month] = 6.
                    elif iqr == 0:
                        critical_values[delta - 1, month] = st_var.mdi
                    else:
                        critical_values[delta - 1, month] = 6. * iqr

                    # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD
                    if plots:
                        import calendar
                        title = "{}, {}-hr differences".format(calendar.month_name[month + 1], delta)
                        line_label = st_var.name
                        xlabel = "First Difference Magnitudes"
                    else:
                        title, line_label, xlabel = "", "", ""

                    threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin=0, binwidth=0.5, plots=plots, diagnostics=diagnostics, title=title, line_label=line_label, xlabel=xlabel, old_threshold=critical_values[delta - 1, month])

                    # take the dynamic threshold only if it tightens the IQR one
                    if threshold < critical_values[delta - 1, month]:
                        critical_values[delta - 1, month] = threshold

                    if plots or diagnostics:
                        print critical_values[delta - 1, month], iqr, 6 * iqr

        # restrict month lookup to the unmasked (compressed) observations
        month_locs = month_locs[good_to_uncompress]
        if diagnostics:
            print critical_values[0, :]

        # not less than 5x reporting accuracy
        good_critical_values = np.where(critical_values != st_var.mdi)
        low_critical_values = np.where(critical_values[good_critical_values] <= 5. * reporting_resolution)
        temporary = critical_values[good_critical_values]
        temporary[low_critical_values] = 5. * reporting_resolution
        critical_values[good_critical_values] = temporary

        if diagnostics:
            print critical_values[0, :], 5. * reporting_resolution

        # check hourly against 2 hourly, if <2/3 the increase to avoid crazy rejection rate
        for month in range(12):
            if critical_values[0, month] != st_var.mdi and critical_values[1, month] != st_var.mdi:
                if critical_values[0, month] / critical_values[1, month] <= 0.66:
                    critical_values[0, month] = 0.66 * critical_values[1, month]

        if diagnostics:
            print "critical values"
            print critical_values[0, :]

        # get time differences for unfiltered data
        full_time_diffs = np.ma.zeros(len(st_var.data), dtype=int)
        full_time_diffs.mask = copy.deepcopy(st_var.data.mask[:])
        full_time_diffs[good_to_uncompress[:-1]] = station.time.data[good_to_uncompress[1:]] - station.time.data[good_to_uncompress[:-1]]
        time_diffs = full_time_diffs.compressed()

        # go through each difference, identify which month it is in if passes spike thresholds

        # spikes at the beginning or ends of sections
        for t in np.arange(len(time_diffs)):
            if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3):
                # 10 days before but short gap thereafter

                next_values = st_var.data[good_to_uncompress[t + 1:]]
                good, = np.where(next_values.mask == False)

                # median of the next (up to) 10 good values after the candidate
                next_median = np.ma.median(next_values[good[:10]])

                next_diff = np.abs(value_diffs[t])  # out of spike
                median_diff = np.abs(next_median - st_var.data[good_to_uncompress[t]])  # are the remaining onees

                if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi):

                    # jump from spike > critical but average after < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and\
                        (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t], good_to_uncompress[t + 1], start, variable, plots=plots)

            elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240):
                # 10 days after but short gap before

                prev_values = st_var.data[good_to_uncompress[:t - 1]]
                good, = np.where(prev_values.mask == False)

                # median of the previous (up to) 10 good values before the candidate
                prev_median = np.ma.median(prev_values[good[-10:]])

                prev_diff = np.abs(value_diffs[t - 1])
                median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[t]])

                if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi):

                    # jump into spike > critical but average before < critical / 2
                    if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and\
                        (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]):

                        flags[t] = 1
                        if plots or diagnostics:
                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t], good_to_uncompress[t + 1], start, variable, plots=plots)

        ''' this isn't the nicest way, but a direct copy from IDL
            masked arrays might help remove some of the lines

            Also, this is relatively slow'''

        # interior spikes of length 1, 2 or 3 observations
        for t in np.arange(len(time_diffs)):
            for spk_len in [1, 2, 3]:
                if t >= spk_len and t < len(time_diffs) - spk_len:

                    # check if time differences are appropriate, for multi-point spikes
                    if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and\
                    (np.abs(time_diffs[t]) <= spk_len * 3) and\
                    (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and\
                    (time_diffs[t + 1] - 1 < spk_len * 3) and \
                    ((spk_len == 1) or \
                    ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or \
                    ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))):

                        # check if differences are valid
                        if (value_diffs[t - spk_len] != st_var.mdi) and \
                        (value_diffs[t - spk_len] != st_var.fdi) and \
                        (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi):

                            # if exceed critical values
                            if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]):

                                # are signs of two differences different
                                if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])):

                                    # are within spike differences small
                                    if (spk_len == 1) or\
                                    ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.)) or \
                                    ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] - 1, month_locs[t]] / 2.) and\
                                    (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] - 1, month_locs[t]] / 2.)):

                                        # check if following value is valid
                                        if (value_diffs[t] != st_var.mdi) and (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and\
                                            (value_diffs[t] != st_var.fdi):

                                            # and if at least critical value
                                            if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]):

                                                # test if surrounding differences below 1/2 critical value
                                                if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.):
                                                    if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.):

                                                        # set the flags
                                                        flags[t - spk_len + 1:t + 1] = 1

                                                        if plots or diagnostics:
                                                            sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[t - spk_len + 1], good_to_uncompress[t + 1], start, variable, plots=plots)

        # write compressed-space flags back into the full QC array
        station.qc_flags[good_to_uncompress, flag_col[v]] = flags

        flag_locs, = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs), noWrite=diagnostics)  # additional flags

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 - but with adapted IDL
        # matches 030220 OK, but finds more but all are reasonable 1/9/14

        do_interactive = False
        if plots and do_interactive == True:
            import matplotlib.pyplot as plt

            plot_times = utils.times_hours_to_datetime(station.time.data, start)

            plt.clf()
            plt.plot(plot_times, all_filtered, 'bo', ls='-')
            # NOTE(review): flags is 1-D here, so flags[:, flag_col[v]] looks like it
            # would raise IndexError if this (currently disabled) branch ran — confirm
            flg = np.where(flags[:, flag_col[v]] == 1)
            plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10)
            plt.show()

    station = utils.append_history(station, "Spike Check")

    return  # sc
def neighbour_checks(station_info, restart_id = "", end_id = "", distances=np.array([]), angles=np.array([]), second = False, masking = False, doZip=False, plots = False, diagnostics = False): """ Run through neighbour checks on list of stations passed :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings :param array distances: array of distances between station pairs :param array angles: array of angles between station pairs :param bool second: do the second run :param bool masking: apply the flags to the data to mask the observations. """ first = not second qc_code_version = subprocess.check_output(['svnversion']).strip() # if distances and angles not calculated, then do so if (len(distances) == 0) or (len(angles) == 0): print "calculating distances and bearings matrix" distances, angles = get_distances_angles(station_info) # extract before truncate the array neighbour_elevations = np.array(station_info[:,3], dtype=float) neighbour_ids = np.array(station_info[:,0]) neighbour_info = np.array(station_info[:,:]) # sort truncated run startindex = 0 if restart_id != "": startindex, = np.where(station_info[:,0] == restart_id) if end_id != "": endindex, = np.where(station_info[:,0] == end_id) if endindex != len(station_info) -1: station_info = station_info[startindex: endindex+1] distances = distances[startindex:endindex+1,:] angles = angles[startindex:endindex+1,:] else: station_info = station_info[startindex:] distances = distances[startindex:,:] angles = angles[startindex:,:] else: station_info = station_info[startindex:] distances = distances[startindex:,:] angles = angles[startindex:,:] # process each neighbour for st, stat in enumerate(station_info): print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S") print "Neighbour Check" print "{:35s} {}".format("Station Identifier :", stat[0]) if not plots and not diagnostics: logfile = file(LOG_OUTFILE_LOCS+stat[0]+'.log','a') # append to file if second iteration. 
logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")) logfile.write("Neighbour Check\n") logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0])) else: logfile = "" process_start_time = time.time() station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3])) # if running through the first time if first: if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")): # if gzip file, unzip here subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")]) time.sleep(5) # make sure it is unzipped before proceeding # read in the data ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics) if plots or diagnostics: print "{:35s} {}\n".format("Total station record size :",len(station.time.data)) else: logfile.write("{:35s} {}\n".format("Total station record size :",len(station.time.data))) match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars) # or if second pass through? 
elif second: if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "internal2.nc.gz")): # if gzip file, unzip here subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")]) time.sleep(5) # make sure it is unzipped before proceeding ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics) if plots or diagnostics: print "{:35s} {}\n".format("Total station record size :",len(station.time.data)) else: logfile.write("{:35s} {}\n".format("Total station record size :",len(station.time.data))) match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars) # select neighbours neighbour_distances = distances[st,:] neighbour_bearings = angles[st,:] # have to add in start index so that can use location in distance file. # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations) # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively) neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations) if plots or diagnostics: print "{:14s} {:10s} {:10s}".format("Neighbour","Distance","Elevation") for n in neighbours: print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n]) else: logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour","Distance","Elevation")) for n in neighbours: logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n])) # if sufficient neighbours if len(neighbours) >= 3: for variable, col in FLAG_OUTLIER_DICT.items(): # NOTE - this requires multiple reads of the same file # but does make it easier to understand and code st_var = getattr(station, variable) if plots or diagnostics: 
print "Length of {} record: {}".format(variable, len(st_var.data.compressed())) else: logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed()))) if len(st_var.data.compressed()) > 0: final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second = second, diagnostics = diagnostics, plots = plots) # now read in final set of neighbours and process neigh_flags = np.zeros(len(station.time.data)) # count up how many neighbours think this obs is bad neigh_count = np.zeros(len(station.time.data)) # number of neighbours at each time stamp dpd_flags = np.zeros(len(station.time.data)) # number of neighbours at each time stamp reporting_accuracies = np.zeros(len(neighbours)) # reporting accuracy of each neighbour all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)]) # store all the neighbour values for nn, nn_loc in enumerate(final_neighbours): neigh_details = neighbour_info[nn_loc] neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3])) if first: ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False) elif second: ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False) dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id = False) all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable)) if diagnostics: print neigh_details n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance = neighbour_distances[nn_loc], diagnostics = diagnostics, plots = plots) reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh,variable).data) dpd_flags += 
neigh.qc_flags[:,31] # gone through all neighbours # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours) some_flags, = np.where(neigh_flags > 0) outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3),(neigh_flags[some_flags].astype("float")/neigh_count[some_flags] > 2./3.))) # flag where < 3 neighbours locs = np.where(neigh_count[some_flags] < 3) station.qc_flags[some_flags[locs], col] = -1 if len(outlier_locs) >= 1: station.qc_flags[some_flags[outlier_locs], col] = 1 # print number flagged and copy into attribute if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True) else: utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs)) st_var = getattr(station, variable) st_var.flags[some_flags[outlier_locs]] = 1 else: if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True) else: utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs)) if plots: n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART) # unflagging using neighbours n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots = plots, diagnostics = diagnostics) else: if plots or diagnostics: print "No observations to assess for {}".format(variable) else: logfile.write("No observations to assess for {}\n".format(variable)) # variable loop else: if plots or diagnostics: print "Fewer than 3 neighbours" else: logfile.write("Fewer than 3 neighbours\n") print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S") print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time) # end of neighbour check utils.append_history(station, "Neighbour Outlier Check") # clean up months qc_tests.clean_up.clu(station, 
["temperatures","dewpoints","windspeeds","winddirs","slp"], [44,45,46,47,48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots = plots) if diagnostics or plots: raw_input("stop") # masking (at least call from here - optional call from internal?) # write to file if first: ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version) # gzip the raw file elif second: ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version) # gzip the raw file # masking - apply the flags and copy masked data to flagged_obs attribute if masking: station = utils.mask(station, process_vars, logfile) # write to file if first: ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version) elif second: ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version) if plots or diagnostics: print "Masking completed\n" print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n") print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time) else: logfile.write("Masking completed\n") logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")) logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time)) logfile.close() # gzip 
up all the raw files if doZip: for st, stat in enumerate(station_info): if first: subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal.nc")]) if masking: subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external.nc")]) subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask.nc")]) elif second: subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal2.nc")]) if masking: subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external2.nc")]) subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask2.nc")]) print "Neighbour Checks completed\n" return # neighbour_checks
def fvc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    Check for certain values occurring more frequently than would be expected

    Builds value histograms for the whole record (per season), identifies
    suspiciously over-populated bins, then re-checks each individual year
    against those bins and flags the offending observations.

    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots
    :param bool month: ignore months after last complete year/season for distribution
    '''
    MIN_DATA_REQUIRED = 500 # to create histogram for complete record
    MIN_DATA_REQUIRED_YEAR = 100 # to create histogram

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges_years = month_ranges.reshape(-1, 12, 2)

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)

        for season in range(5): # Year,MAM,JJA,SON,JF+D

            if season == 0:
                # all year
                season_data = np.ma.masked_values(filtered_data.compressed(), st_var.fdi)
                thresholds = [30, 20, 10]
            else:
                thresholds = [20, 15, 10]
                season_data = np.ma.array([])

                for y, year in enumerate(month_ranges_years):
                    # churn through months extracting data, accounting for fdi and concatenating together
                    if season == 1: #mam
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[2][0]:year[4][-1]], st_var.fdi)])
                    elif season == 2: #jja
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[5][0]:year[7][-1]], st_var.fdi)])
                    elif season == 3: #son
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[8][0]:year[10][-1]], st_var.fdi)])
                    elif season == 4: #d+jf
                        # January+February of this year, then December appended separately
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[0][0]:year[1][-1]], st_var.fdi)])
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[-1][0]:year[-1][-1]], st_var.fdi)])

            season_data = season_data.compressed()

            if len(season_data) > MIN_DATA_REQUIRED:

                # bin width follows the reporting accuracy of the record
                if 0 < reporting_accuracy <= 0.5: # -1 used as missing value
                    bins, bincenters = utils.create_bins(season_data, 0.5)
                else:
                    bins, bincenters = utils.create_bins(season_data, 1.0)

                hist, binEdges = np.histogram(season_data, bins=bins)

                if plots:
                    plot_hist, bincenters = fvc_plot_setup(season_data, hist, binEdges, st_var.name, title="%s" % (SEASONS[season]))

                bad_bin = np.zeros(len(hist))

                # scan through bin values and identify bad ones
                for e, element in enumerate(hist):
                    # NOTE(review): this pass uses ``e > 3`` while the per-year pass
                    # below uses ``e >= 3`` — confirm whether the asymmetry is intended
                    if e > 3 and e <= (len(hist) - 3):
                        # don't bother with first three or last three bins
                        seven_bins = hist[e - 3:e + 3 + 1]
                        if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                            # is local maximum and != zero
                            if (seven_bins[3] / float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= thresholds[0]):
                                # contains >50% of data and is greater than threshold
                                bad_bin[e] = 1
                            # for plotting remove good bins
                            else:
                                if plots: plot_hist[e] = 1e-1
                        else:
                            if plots: plot_hist[e] = 1e-1
                    else:
                        if plots: plot_hist[e] = 1e-1

                if plots:
                    import matplotlib.pyplot as plt
                    plt.step(bincenters, plot_hist, 'r-', where='mid')
                    plt.show()

                # having identified possible bad bins, check each year in turn, on unfiltered data
                for y, year in enumerate(month_ranges_years):

                    if season == 0:
                        # year
                        year_data = np.ma.masked_values(st_var.data[year[0][0]:year[-1][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]]
                    elif season == 1: #mam
                        year_data = np.ma.masked_values(st_var.data[year[2][0]:year[4][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[2][0]:year[4][-1], flag_col[v]]
                    elif season == 2: #jja
                        year_data = np.ma.masked_values(st_var.data[year[5][0]:year[7][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[5][0]:year[7][-1], flag_col[v]]
                    elif season == 3: #son
                        year_data = np.ma.masked_values(st_var.data[year[8][0]:year[10][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[8][0]:year[10][-1], flag_col[v]]
                    elif season == 4: #d+jf
                        # Jan+Feb followed by Dec; flags are concatenated in the same order
                        year_data = np.ma.concatenate([np.ma.masked_values(st_var.data[year[0][0]:year[1][-1]], st_var.fdi),\
                                                       np.ma.masked_values(st_var.data[year[-1][0]:year[-1][-1]], st_var.fdi)])
                        year_flags = np.append(station.qc_flags[year[0][0]:year[1][-1], flag_col[v]], station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]])

                    if len(year_data.compressed()) > MIN_DATA_REQUIRED_YEAR:

                        # re-histogram this single year using the whole-record bins
                        hist, binEdges = np.histogram(year_data.compressed(), bins=bins)

                        if plots:
                            plot_hist, bincenters = fvc_plot_setup(year_data.compressed(), hist, binEdges, st_var.name, title="%s - %s" % (y + start.year, SEASONS[season]))

                        for e, element in enumerate(hist):
                            if bad_bin[e] == 1:
                                # only look at pre-identified bins
                                if e >= 3 and e <= (len(hist) - 3):
                                    # don't bother with first three or last three bins
                                    seven_bins = hist[e - 3:e + 3 + 1].astype('float')
                                    if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                                        # is local maximum and != zero
                                        if (seven_bins[3]/seven_bins.sum() >= 0.5 and seven_bins[3] >= thresholds[1]) \
                                            or (seven_bins[3]/seven_bins.sum() >= 0.9 and seven_bins[3] >= thresholds[2]):
                                            # contains >50% or >90% of data and is greater than appropriate threshold
                                            # Flag these data
                                            bad_points = np.where((year_data >= binEdges[e]) & (year_data < binEdges[e + 1]))
                                            year_flags[bad_points] = 1
                                        # for plotting remove good bins
                                        else:
                                            if plots: plot_hist[e] = 1e-1
                                    else:
                                        if plots: plot_hist[e] = 1e-1
                                else:
                                    if plots: plot_hist[e] = 1e-1
                            else:
                                if plots: plot_hist[e] = 1e-1

                        if diagnostics or plots:
                            nflags = len(np.where(year_flags != 0)[0])
                            print "{} {}".format(y + start.year, nflags)
                        if plots:
                            if nflags > 0:
                                plt.step(bincenters, plot_hist, 'r-', where='mid')
                                plt.show()
                            else:
                                plt.clf()

                    # copy flags back
                    if season == 0:
                        station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]] = year_flags
                    elif season == 1:
                        station.qc_flags[year[2][0]:year[4][-1], flag_col[v]] = year_flags
                    elif season == 2:
                        station.qc_flags[year[5][0]:year[7][-1], flag_col[v]] = year_flags
                    elif season == 3:
                        station.qc_flags[year[8][0]:year[10][-1], flag_col[v]] = year_flags
                    elif season == 4:
                        # split the concatenated Jan+Feb / Dec flag array back apart
                        split = len(station.qc_flags[year[0][0]:year[1][-1], flag_col[v]])
                        station.qc_flags[year[0][0]:year[1][-1], flag_col[v]] = year_flags[:split]
                        station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]] = year_flags[split:]

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]), noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Frequent Values Check")

    return # fvc
def evc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    '''
    Excess Variance Check.

    For each variable and each calendar month, build winsorized hourly
    climatologies, normalise anomalies by their IQR, and compute a robust
    variance (median absolute deviation) of the normalised anomalies for
    every year.  Years whose variance departs from the all-year median by
    more than an IQR-scaled threshold are flagged - unless, for slp or
    windspeeds, the excess variance looks like a genuine storm/hurricane
    (high winds coincident with low pressure, or a long pressure fall and
    recovery), in which case the month is left unflagged.

    :param object station: station object carrying .qc_flags and per-variable data
    :param list variable_list: observational variables to process
    :param list flag_col: column of station.qc_flags to set, per variable
    :param datetime start: dataset start time
    :param datetime end: dataset end time
    :param file logfile: log file for output messages
    :param bool diagnostics: print extra diagnostic output
    :param bool plots: show diagnostic plots
    :param bool idl: mimic the original IDL code's winsorizing/averaging exactly
    '''

    # deferred imports - only needed for interactive runs
    if plots or diagnostics:
        import matplotlib.pyplot as plt
        import calendar

    # very similar to climatological check - ensure that not duplicating

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # station-wide reporting characteristics, used later to relax the
        # threshold for years whose reporting differs from the norm
        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        reporting_freq = utils.reporting_frequency(utils.apply_filter_flags(st_var))

        # (years, 12, 2) array of [start, end] time indices per calendar month
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1,12,2)

        month_data_count = np.zeros(month_ranges.shape[0:2])

        # for each month
        for month in range(12):

            # set up hourly climatologies
            hourly_clims = np.zeros(24)
            hourly_clims.fill(st_var.data.fill_value)

            # this_month: all years of this calendar month stacked as (days, 24);
            # year_ids maps each day-row back to its year index (per hours=True)
            this_month, year_ids, month_data_count[:,month] = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True)

            # winsorize and get hourly climatology
            for h in range(24):

                this_hour = this_month[:,h]

                # require a reasonable sample before fixing a climatology
                if len(this_hour.compressed()) > 100:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        # IDL appends a sentinel and divides by n-1 - replicate exactly
                        this_hour_winsorized = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl)
                        hourly_clims[h] = np.ma.sum(this_hour_winsorized)/(len(this_hour_winsorized) - 1)
                    else:
                        this_hour_winsorized = utils.winsorize(this_hour.compressed(), 0.05, idl = idl)
                        hourly_clims[h] = np.ma.mean(this_hour_winsorized)

            # hours with too little data keep the fill value - mask them out
            hourly_clims = np.ma.masked_where(hourly_clims == st_var.data.fill_value, hourly_clims)
            anomalies = this_month - np.tile(hourly_clims, (this_month.shape[0], 1))

            # extract IQR of anomalies (using 1/2 value to match IDL)
            if len(anomalies.compressed()) >= 10:
                iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2. # to match IDL
                # floor the scaling so near-constant data does not inflate anomalies
                if iqr < 1.5: iqr = 1.5
            else:
                iqr = st_var.mdi

            normed_anomalies = anomalies / iqr

            # per-year statistics, initialised to missing
            variances = np.ma.zeros(month_ranges.shape[0])
            variances.mask = [False for i in range(month_ranges.shape[0])]
            rep_accuracies = np.zeros(month_ranges.shape[0])
            rep_freqs = np.zeros(month_ranges.shape[0])

            variances.fill(st_var.mdi)
            rep_accuracies.fill(st_var.mdi)
            rep_freqs.fill(st_var.mdi)

            year_ids = np.array(year_ids)

            # extract variance of normalised anomalies for each year
            # NOTE: y == year always (enumerate over range), both spellings used below
            for y, year in enumerate(range(month_ranges.shape[0])):

                year_locs = np.where(year_ids == y)

                this_year = normed_anomalies[year_locs,:]
                this_year = this_year.reshape(-1)

                # end of similarity with Climatological check

                if len(this_year.compressed()) >= 30:

                    # robust "variance": median absolute deviation
                    variances[y] = utils.mean_absolute_deviation(this_year, median = True)

                    rep_accuracies[y] = utils.reporting_accuracy(this_year)
                    rep_freqs[y] = utils.reporting_frequency(this_year)

                else:
                    variances.mask[y] = True

            good = np.where(month_data_count[:,month] >= 100)

            # get median and IQR of variance for all years for this month
            if len(good[0]) >= 10:

                median_variance = np.median(variances[good])

                iqr_variance = utils.IQR(variances[good]) / 2. # to match IDL

                # floor to avoid division blow-up for very stable stations
                if iqr_variance < 0.01: iqr_variance = 0.01
            else:

                median_variance = st_var.mdi
                iqr_variance = st_var.mdi

            # if SLP, then get median and MAD of SLP and windspeed for month
            if variable in ["slp", "windspeeds"]:

                winds = getattr(station, "windspeeds")
                slp = getattr(station, "slp")

                # refactor this as similar in style to how target data extracted
                for y, year in enumerate(range(month_ranges.shape[0])):

                    if y == 0:
                        winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        winds_month = winds_year.reshape(-1,24)

                        slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        slp_month = slp_year.reshape(-1,24)
                    else:
                        winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        winds_year = winds_year.reshape(-1,24)
                        winds_month = np.ma.concatenate((winds_month, winds_year), axis = 0)

                        slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        slp_year = slp_year.reshape(-1,24)
                        slp_month = np.ma.concatenate((slp_month, slp_year), axis = 0)

                median_wind = np.ma.median(winds_month)
                median_slp = np.ma.median(slp_month)

                wind_MAD = utils.mean_absolute_deviation(winds_month.compressed())
                slp_MAD = utils.mean_absolute_deviation(slp_month.compressed())

                if diagnostics:
                    print "median windspeed {} m/s, MAD = {}".format(median_wind, wind_MAD)
                    print "median slp {} hPa, MAD = {}".format(median_slp, slp_MAD)

            # now test to see if variance exceeds expected range
            for y, year in enumerate(range(month_ranges.shape[0])):

                if (variances[y] != st_var.mdi) and (iqr_variance != st_var.mdi) and \
                    (median_variance != st_var.mdi) and (month_data_count[y,month] >= DATA_COUNT_THRESHOLD):

                    # if SLP, then need to test if deep low pressure ("hurricane/storm") present
                    #   as this will increase the variance for this month + year
                    if variable in ["slp", "windspeeds"]:

                        iqr_threshold = 6.

                        # increase threshold if reporting frequency and resolution of this
                        # year doesn't match average
                        if (rep_accuracies[y] != reporting_resolution) and \
                            (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 8.

                        if diagnostics:
                            print np.abs(variances[y] - median_variance) / iqr_variance, variances[y] , median_variance , iqr_variance , iqr_threshold, month+1, year+start.year

                        if np.abs((variances[y] - median_variance) / iqr_variance) > iqr_threshold:

                            # check for storms - re-extract this single year's raw month
                            winds_month = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                            slp_month = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]

                            storm = False
                            if (len(winds_month.compressed()) >= 1) and (len(slp_month.compressed()) >= 1):

                                # locations where winds greater than threshold
                                high_winds, = np.where((winds_month - median_wind)/wind_MAD > MAD_THRESHOLD)
                                # and where SLP less than threshold
                                low_slps, = np.where((median_slp - slp_month)/slp_MAD > MAD_THRESHOLD)

                                # if any locations match, then it's a storm
                                match_loc = high_winds[np.in1d(high_winds, low_slps)]

                                if len(match_loc) > 0:
                                    storm = True

                            else:
                                print "write spurious"

                            # check the SLP first difference series
                            #   to ensure a drop down and climb out of minimum SLP/or climb up and down from maximum wind speed
                            if variable == "slp":
                                diffs = np.diff(slp_month.compressed())
                            elif variable == "windspeeds":
                                diffs = np.diff(winds_month.compressed())

                            # find the longest runs of consecutive falls (biggest_neg)
                            # and consecutive rises (biggest_pos) - a real storm shows
                            # a sustained drop and recovery of >= 10 steps
                            negs, poss = 0,0
                            biggest_neg, biggest_pos = 0,0

                            for diff in diffs:

                                if diff > 0:
                                    if negs > biggest_neg: biggest_neg = negs
                                    negs = 0
                                    poss += 1
                                else:
                                    if poss > biggest_pos: biggest_pos = poss
                                    poss = 0
                                    negs += 1

                            if (biggest_neg < 10) and (biggest_pos < 10) and not storm:

                                # not a hurricane, so mask
                                station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1
                                if plots or diagnostics:
                                    print "No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year)
                                else:
                                    logfile.write("No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year))

                            else:
                                # hurricane
                                if plots or diagnostics:
                                    print "Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year)
                                else:
                                    logfile.write("Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year))

                            if plots:
                                # plot showing the pressure, pressure first differences and the wind speeds
                                plot_times = utils.times_hours_to_datetime(station.time.data[month_ranges[year,month][0]:month_ranges[year,month][1]], start)

                                evc_plot_slp_wind(plot_times, slp_month, diffs, median_slp, slp_MAD, winds_month, median_wind, wind_MAD)

                    else:
                        # non-storm-prone variables: plain threshold test
                        iqr_threshold = 8.

                        if (rep_accuracies[y] != reporting_resolution) and \
                            (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 10.

                        if np.abs(variances[y] - median_variance) / iqr_variance > iqr_threshold:

                            if diagnostics:
                                print "flagging {} {}".format(year+start.year,calendar.month_name[month+1])
                            # remove the data
                            station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1

            if plots:
                # NOTE(review): iqr_threshold here is whatever the last year's loop
                # iteration left it as - confirm intended for the summary plot
                plot_variances = (variances - median_variance) / iqr_variance
                plot_variances = np.ma.masked_where(month_data_count[:,month] < DATA_COUNT_THRESHOLD,plot_variances)

                evc_plot_hist(plot_variances, iqr_threshold, "Variance Check - %s - %s" % (variable, calendar.month_name[month+1]))

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]), noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    # matches 030660 for T, D and SLP 21/8/2014
    station = utils.append_history(station, "Excess Variance Check")

    return # evc
def neighbour_checks(station_info, restart_id = "", end_id = "", distances=np.array([]), angles=np.array([]), second = False, masking = False, doZip=False, plots = False, diagnostics = False): """ Run through neighbour checks on list of stations passed :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings :param array distances: array of distances between station pairs :param array angles: array of angles between station pairs :param bool second: do the second run :param bool masking: apply the flags to the data to mask the observations. """ first = not second qc_code_version = subprocess.check_output(['svnversion']).strip() # if distances and angles not calculated, then do so if (len(distances) == 0) or (len(angles) == 0): print "calculating distances and bearings matrix" distances, angles = get_distances_angles(station_info) # extract before truncate the array neighbour_elevations = np.array(station_info[:,3], dtype=float) neighbour_ids = np.array(station_info[:,0]) neighbour_info = np.array(station_info[:,:]) # sort truncated run startindex = 0 if restart_id != "": startindex, = np.where(station_info[:,0] == restart_id) if end_id != "": endindex, = np.where(station_info[:,0] == end_id) if endindex != len(station_info) -1: station_info = station_info[startindex: endindex+1] distances = distances[startindex:endindex+1,:] angles = angles[startindex:endindex+1,:] else: station_info = station_info[startindex:] distances = distances[startindex:,:] angles = angles[startindex:,:] else: station_info = station_info[startindex:] distances = distances[startindex:,:] angles = angles[startindex:,:] # process each neighbour for st, stat in enumerate(station_info): print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S") print "Neighbour Check" print "{:35s} {}".format("Station Identifier :", stat[0]) if not plots and not diagnostics: logfile = file(LOG_OUTFILE_LOCS+stat[0]+'.log','a') # append to file if second iteration. 
logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")) logfile.write("Neighbour Check\n") logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0])) else: logfile = "" process_start_time = time.time() station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3])) # if running through the first time if first: if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")): # if gzip file, unzip here subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")]) time.sleep(5) # make sure it is unzipped before proceeding # read in the data ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics) if plots or diagnostics: print "{:35s} {}\n".format("Total station record size :",len(station.time.data)) else: logfile.write("{:35s} {}\n".format("Total station record size :",len(station.time.data))) match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars) # or if second pass through? 
elif second: if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "internal2.nc.gz")): # if gzip file, unzip here subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")]) time.sleep(5) # make sure it is unzipped before proceeding ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics) if plots or diagnostics: print "{:35s} {}\n".format("Total station record size :",len(station.time.data)) else: logfile.write("{:35s} {}\n".format("Total station record size :",len(station.time.data))) match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars) # select neighbours neighbour_distances = distances[st,:] neighbour_bearings = angles[st,:] # have to add in start index so that can use location in distance file. # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations) # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively) neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations) if plots or diagnostics: print "{:14s} {:10s} {:10s}".format("Neighbour","Distance","Elevation") for n in neighbours: print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n]) else: logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour","Distance","Elevation")) for n in neighbours: logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n])) # if sufficient neighbours if len(neighbours) >= 3: for variable, col in FLAG_OUTLIER_DICT.items(): # NOTE - this requires multiple reads of the same file # but does make it easier to understand and code st_var = getattr(station, variable) if plots or diagnostics: 
print "Length of {} record: {}".format(variable, len(st_var.data.compressed())) else: logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed()))) if len(st_var.data.compressed()) > 0: final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second = second, diagnostics = diagnostics, plots = plots) # now read in final set of neighbours and process neigh_flags = np.zeros(len(station.time.data)) # count up how many neighbours think this obs is bad neigh_count = np.zeros(len(station.time.data)) # number of neighbours at each time stamp dpd_flags = np.zeros(len(station.time.data)) # number of neighbours at each time stamp reporting_accuracies = np.zeros(len(neighbours)) # reporting accuracy of each neighbour all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)]) # store all the neighbour values for nn, nn_loc in enumerate(final_neighbours): neigh_details = neighbour_info[nn_loc] neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3])) if first: ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False) elif second: ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False) dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id = False) all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable)) if diagnostics: print neigh_details n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance = neighbour_distances[nn_loc], diagnostics = diagnostics, plots = plots) reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh,variable).data) dpd_flags += 
neigh.qc_flags[:,31] # gone through all neighbours # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours) some_flags, = np.where(neigh_flags > 0) outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3),(neigh_flags[some_flags].astype("float")/neigh_count[some_flags] > 2./3.))) # flag where < 3 neighbours locs = np.where(neigh_count[some_flags] < 3) station.qc_flags[some_flags[locs], col] = -1 if len(outlier_locs) >= 1: station.qc_flags[some_flags[outlier_locs], col] = 1 # print number flagged and copy into attribute if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True) else: utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs)) st_var = getattr(station, variable) st_var.flags[some_flags[outlier_locs]] = 1 else: if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True) else: utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs)) if plots: n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART) # unflagging using neighbours n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots = plots, diagnostics = diagnostics) else: if plots or diagnostics: print "No observations to assess for {}".format(variable) else: logfile.write("No observations to assess for {}\n".format(variable)) # variable loop else: if plots or diagnostics: print "Fewer than 3 neighbours" else: logfile.write("Fewer than 3 neighbours\n") print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S") print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time) # end of neighbour check utils.append_history(station, "Neighbour Outlier Check") # clean up months qc_tests.clean_up.clu(station, 
["temperatures","dewpoints","slp","windspeeds","winddirs"], [44,45,46,47,48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots = plots, diagnostics = diagnostics) if diagnostics or plots: raw_input("stop") # masking (at least call from here - optional call from internal?) # write to file if first: ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version) # gzip the raw file elif second: ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version) # gzip the raw file # masking - apply the flags and copy masked data to flagged_obs attribute if masking: station = utils.mask(station, process_vars, logfile, FLAG_COL_DICT) # write to file if first: ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version) elif second: ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version) if plots or diagnostics: print "Masking completed\n" print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n") print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time) else: logfile.write("Masking completed\n") logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")) logfile.write("processing took {:4.0f}s\n\n".format(time.time() - 
process_start_time)) logfile.close() # looped through all stations # gzip up all the raw files if doZip: for st, stat in enumerate(station_info): if first: subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal.nc")]) if masking: subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external.nc")]) subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask.nc")]) elif second: subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal2.nc")]) if masking: subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external2.nc")]) subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask2.nc")]) print "Neighbour Checks completed\n" return # neighbour_checks
def identify_values(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use distribution to identify frequent values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    # TODO - do we want to go down the road of allowing resolution (and hence test)
    #        to vary over the p-o-r?  I.e. 1C in early, to 0.5C to 0.1C in different decades?

    # record the bin width used for this variable
    utils.write_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "width", "{}".format(BIN_WIDTH), diagnostics=diagnostics)

    half_window = ROLLING // 2

    for month in range(1, 13):

        this_month_locs, = np.where(station.months == month)
        this_month = obs_var.data[this_month_locs]

        # insufficient data - record an empty suspect list and move on
        if len(this_month.compressed()) < utils.DATA_COUNT_THRESHOLD:
            utils.write_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "{}".format(month), "[{}]".format(",".join(str(s) for s in [])), diagnostics=diagnostics)
            continue

        # bin width follows the reporting accuracy of the data
        resolution = utils.reporting_accuracy(this_month)
        bin_width = 0.5 if resolution <= 0.5 else 1.0
        bins = utils.create_bins(this_month, bin_width, obs_var.name)
        hist, bin_edges = np.histogram(this_month, bins)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")
            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

        # Scan through the histogram: a bin is suspect when it is a
        # well-populated local maximum holding more than RATIO of the
        # observations in its ROLLING-wide window
        suspect = []
        for b, bar in enumerate(hist):

            # only assess bins with a full local window around them
            if b <= half_window or b > len(hist) - half_window:
                continue

            # require sufficient observations in the candidate bin
            if bar < utils.DATA_COUNT_THRESHOLD:
                continue

            window = hist[b - half_window: b + half_window + 1]
            if bar == window.max() and (bar / window.sum()) > RATIO:
                suspect.append(bins[b])

        # diagnostic plots - highlight the suspect bins in red
        if plots:
            flagged_hist = np.array([count if bins[b] in suspect else 0 for b, count in enumerate(hist)])
            plt.step(bins[1:], flagged_hist, color='r', where="pre")
            plt.show()

        # write out the thresholds...
        utils.write_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "{}".format(month), "[{}]".format(",".join(str(s) for s in suspect)), diagnostics=diagnostics)

    return # identify_values
def frequent_values(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Use config file to read frequent values.  Check each month to see if appear.

    A value is only flagged when the suspect bin (identified across the whole
    period of record by identify_values) also stands out as a local-maximum
    bin within the individual month/year being tested.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """
    # one flag character per observation; "" means unflagged
    flags = np.array(["" for i in range(obs_var.data.shape[0])])

    all_years = np.unique(station.years)

    # work through each month, and then year
    for month in range(1, 13):

        # read in bin-width and suspect bins for this month
        try:
            width = float(utils.read_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "width"))
            suspect_bins = utils.read_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "{}".format(month), islist=True)
        except KeyError:
            # config not yet populated - derive the suspect bins now, then re-read
            print("Information missing in config file")
            identify_values(obs_var, station, config_file, plots=plots, diagnostics=diagnostics)
            width = float(utils.read_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "width"))
            suspect_bins = utils.read_qc_config(config_file, "FREQUENT-{}".format(obs_var.name), "{}".format(month), islist=True)

        # skip on if nothing to find
        if len(suspect_bins) == 0:
            continue

        # work through each year
        for year in all_years:

            locs, = np.where(np.logical_and(station.months == month, station.years == year))

            month_data = obs_var.data[locs]

            # skip if no data
            if np.ma.count(month_data) == 0:
                continue

            month_flags = np.array(["" for i in range(month_data.shape[0])])

            # adjust bin widths according to reporting accuracy
            resolution = utils.reporting_accuracy(month_data)

            if resolution <= 0.5:
                bins = utils.create_bins(month_data, 0.5, obs_var.name)
            else:
                bins = utils.create_bins(month_data, 1.0, obs_var.name)
            hist, bin_edges = np.histogram(month_data, bins)

            # Scan through the histogram
            #   check if a bin is the maximum of a local area ("ROLLING")
            for b, bar in enumerate(hist):
                if (b > ROLLING // 2) and (b <= (len(hist) - ROLLING // 2)):
                    target_bins = hist[b - (ROLLING // 2):b + (ROLLING // 2) + 1]

                    # if sufficient obs, maximum and contains > 50% of data
                    if bar >= utils.DATA_COUNT_THRESHOLD:
                        if bar == target_bins.max():
                            if (bar / target_bins.sum()) > RATIO:
                                # this bin meets all the criteria
                                if bins[b] in suspect_bins:
                                    # find observations (month & year) to flag!
                                    flag_locs = np.where(np.logical_and(month_data >= bins[b], month_data < bins[b + 1]))
                                    month_flags[flag_locs] = "F"

            # copy flags for all years into main array
            flags[locs] = month_flags

            # diagnostic plots
            if plots:
                import matplotlib.pyplot as plt
                plt.step(bins[1:], hist, color='k', where="pre")
                plt.yscale("log")
                plt.ylabel("Number of Observations")
                plt.xlabel(obs_var.name.capitalize())
                plt.title("{} - month {}".format(station.id, month))

                bad_hist = np.copy(hist)
                for b, bar in enumerate(bad_hist):
                    if bins[b] not in suspect_bins:
                        bad_hist[b] = 0

                plt.step(bins[1:], bad_hist, color='r', where="pre")
                plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:
        print("Frequent Values {}".format(obs_var.name))
        print(" Cumulative number of flags set: {}".format(len(np.where(flags != "")[0])))

    return # frequent_values
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, second = False): ''' Spike Check, looks for spikes up to 3 observations long, using thresholds calculated from the data itself. :param MetVar station: the station object :param list variable_list: list of observational variables to process :param list flag_col: the columns to set on the QC flag array :param datetime start: dataset start time :param datetime end: dataset end time :param file logfile: logfile to store outputs :param bool plots: do plots :param bool second: run for second time :returns: ''' print "refactor" for v, variable in enumerate(variable_list): flags = station.qc_flags[:, flag_col[v]] prev_flag_number = 0 if second: # count currently existing flags: prev_flag_number = len(flags[flags != 0]) st_var = getattr(station, variable) all_filtered = utils.apply_filter_flags(st_var) reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var)) # to match IDL system - should never be called as would mean no data if reporting_resolution == -1: reporting_resolution = 1 month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1,12,2) good = np.where(all_filtered.mask == False) full_time_diffs = np.ma.zeros(len(all_filtered)) full_time_diffs.mask = all_filtered.mask full_time_diffs[good] = station.time.data[good][1:] - station.time.data[good][:-1] # develop critical values using clean values # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately print "sort the differencing if values were flagged rather than missing" full_filtered_diffs = np.ma.zeros(len(all_filtered)) full_filtered_diffs.mask = all_filtered.mask full_filtered_diffs[good] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1] # test all values good_to_uncompress = np.where(st_var.data.mask == False) full_value_diffs = np.ma.zeros(len(st_var.data)) full_value_diffs.mask = st_var.data.mask 
full_value_diffs[good_to_uncompress] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1] # convert to compressed time to match IDL value_diffs = full_value_diffs.compressed() time_diffs = full_time_diffs.compressed() filtered_diffs = full_filtered_diffs.compressed() flags = flags[good_to_uncompress] critical_values = np.zeros([9,12]) critical_values.fill(st_var.mdi) # link observation to calendar month month_locs = np.zeros(full_time_diffs.shape) for month in range(12): for year in range(month_ranges.shape[0]): if year == 0: this_month_time_diff = full_time_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]] this_month_filtered_diff = full_filtered_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]] else: this_month_time_diff = np.ma.concatenate([this_month_time_diff, full_time_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]]]) this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff, full_filtered_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]]]) month_locs[month_ranges[year,month,0]:month_ranges[year,month,1]] = month for delta in range(1,9): locs = np.ma.where(this_month_time_diff == delta) if len(locs[0]) >= 100: iqr = utils.IQR(this_month_filtered_diff[locs]) if iqr == 0. and delta == 1: critical_values[delta-1,month] = 6. elif iqr == 0: critical_values[delta-1,month] = st_var.mdi else: critical_values[delta-1,month] = 6. 
* iqr # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD if plots: import calendar title = "{}, {}-hr differences".format(calendar.month_name[month+1], delta) line_label = st_var.name xlabel = "First Difference Magnitudes" else: title, line_label, xlabel = "","","" threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin = 0, binwidth = 0.5, plots = plots, diagnostics = diagnostics, title = title, line_label = line_label, xlabel = xlabel, old_threshold = critical_values[delta-1,month]) if threshold < critical_values[delta-1,month]: critical_values[delta-1,month] = threshold if plots or diagnostics: print critical_values[delta-1,month] , iqr, 6 * iqr month_locs = month_locs[good_to_uncompress] if diagnostics: print critical_values[0,:] # not less than 5x reporting accuracy good_critical_values = np.where(critical_values != st_var.mdi) low_critical_values = np.where(critical_values[good_critical_values] <= 5.*reporting_resolution) temporary = critical_values[good_critical_values] temporary[low_critical_values] = 5.*reporting_resolution critical_values[good_critical_values] = temporary if diagnostics: print critical_values[0,:], 5.*reporting_resolution # check hourly against 2 hourly, if <2/3 the increase to avoid crazy rejection rate for month in range(12): if critical_values[0,month] != st_var.mdi and critical_values[1,month] != st_var.mdi: if critical_values[0,month]/critical_values[1,month] <= 0.66: critical_values[0,month] = 0.66 * critical_values[1,month] if diagnostics: print critical_values[0,:] # get time differences for unfiltered data full_time_diffs = np.ma.zeros(len(st_var.data)) full_time_diffs.mask = st_var.data.mask full_time_diffs[good_to_uncompress] = station.time.data[good_to_uncompress][1:] - station.time.data[good_to_uncompress][:-1] time_diffs = full_time_diffs.compressed() # go through each difference, identify which month it is in if passes spike thresholds # spikes at the 
beginning or ends of sections for t in np.arange(len(time_diffs)): if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3): # 10 days before but short gap thereafter next_values = st_var.data[good_to_uncompress[0][t + 1:]] good, = np.where(next_values.mask == False) next_median = np.ma.median(next_values[good[:10]]) next_diff = np.abs(value_diffs[t]) # out of spike median_diff = np.abs(next_median - st_var.data[good_to_uncompress[0][t]]) # are the remaining onees if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi): # jump from spike > critical but average after < critical / 2 if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and\ (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]) : flags[t] = 1 if plots or diagnostics: sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t+1], start, variable, plots = plots) elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240): # 10 days after but short gap before prev_values = st_var.data[good_to_uncompress[0][:t - 1]] good, = np.where(prev_values.mask == False) prev_median = np.ma.median(prev_values[good[-10:]]) prev_diff = np.abs(value_diffs[t - 1]) median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[0][t]]) if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi): # jump into spike > critical but average before < critical / 2 if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) 
and\ (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]) : flags[t] = 1 if plots or diagnostics: sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t+1], start, variable, plots = plots) ''' this isn't the nicest way, but a direct copy from IDL masked arrays might help remove some of the lines Also, this is relatively slow''' for t in np.arange(len(time_diffs)): for spk_len in [1,2,3]: if t >= spk_len and t < len(time_diffs) - spk_len: # check if time differences are appropriate, for multi-point spikes if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and\ (np.abs(time_diffs[t]) <= spk_len * 3) and\ (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and\ (time_diffs[t + 1] - 1 < spk_len * 3) and \ ((spk_len == 1) or \ ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or \ ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))): # check if differences are valid if (value_diffs[t - spk_len] != st_var.mdi) and \ (value_diffs[t - spk_len] != st_var.fdi) and \ (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi): # if exceed critical values if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]): # are signs of two differences different if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])): # are within spike differences small if (spk_len == 1) or\ ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.)) or \ ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.) 
and\ (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] -1, month_locs[t]] / 2.)): # check if following value is valid if (value_diffs[t] != st_var.mdi) and (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and\ (value_diffs[t] != st_var.fdi): # and if at least critical value if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]): # test if surrounding differences below 1/2 critical value if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.): if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.): # set the flags flags[ t - spk_len + 1 : t +1] = 1 if plots or diagnostics: sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t-spk_len+1], good_to_uncompress[0][t+1], start, variable, plots = plots) station.qc_flags[good_to_uncompress, flag_col[v]] = flags flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number, noWrite = True) # additional flags else: utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number) # additional flags # copy flags into attribute st_var.flags[flag_locs] = 1 # matches 030660 - but with adapted IDL # matches 030220 OK, but finds more but all are reasonable 1/9/14 do_interactive = False if plots and do_interactive == True: import matplotlib.pyplot as plt plot_times = utils.times_hours_to_datetime(station.time.data, start) plt.clf() plt.plot(plot_times, all_filtered, 'bo', ls='-') flg = np.where(flags[:, flag_col[v]] == 1) plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10) plt.show() station = utils.append_history(station, "Spike Check") return # sc
def fvc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False):
    '''
    Check for certain values occurring more frequently than would be expected

    :param object station: station object to process
    :param list variable_list: list of variables to process
    :param list flag_col: columns to fill in flag array
    :param datetime start: datetime object of start of data
    :param datetime end: datetime object of end of data
    :param file logfile: logfile to store outputs
    :param bool diagnostics: produce extra diagnostic output
    :param bool plots: produce plots
    '''

    MIN_DATA_REQUIRED = 500 # to create histogram for complete record
    MIN_DATA_REQUIRED_YEAR = 100 # to create histogram

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges_years = month_ranges.reshape(-1,12,2)

    for v,variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # NOTE(review): local name shadows utils.reporting_accuracy within this
        #   function - a rename would aid clarity
        reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var)

        for season in range(5): # Year,MAM,JJA,SON,JF+D

            if season == 0:
                # all year
                season_data = np.ma.masked_values(filtered_data.compressed(), st_var.fdi)
                thresholds = [30,20,10]
            else:
                thresholds = [20,15,10]
                season_data = np.ma.array([])

                for y,year in enumerate(month_ranges_years):
                    # churn through months extracting data, accounting for fdi and concatenating together
                    if season == 1:
                        #mam
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[2][0]:year[4][-1]], st_var.fdi)])
                    elif season == 2:
                        #jja
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[5][0]:year[7][-1]], st_var.fdi)])
                    elif season == 3:
                        #son
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[8][0]:year[10][-1]], st_var.fdi)])
                    elif season == 4:
                        #d+jf
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[0][0]:year[1][-1]], st_var.fdi)])
                        season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[-1][0]:year[-1][-1]], st_var.fdi)])

            # drop the fdi-masked values before building the histogram
            season_data = season_data.compressed()

            if len(season_data) > MIN_DATA_REQUIRED:

                # bin width follows the reporting accuracy of the record
                if 0 < reporting_accuracy <= 0.5: # -1 used as missing value
                    bins, bincenters = utils.create_bins(season_data, 0.5)
                else:
                    bins, bincenters = utils.create_bins(season_data, 1.0)

                hist, binEdges = np.histogram(season_data, bins = bins)

                if plots:
                    plot_hist, bincenters = fvc_plot_setup(season_data, hist, binEdges, st_var.name, title = "%s" % (SEASONS[season]))

                # marker array for bins suspicious over the whole record
                bad_bin = np.zeros(len(hist))

                # scan through bin values and identify bad ones
                for e, element in enumerate(hist):
                    if e > 3 and e <= (len(hist) - 3):
                        # don't bother with first three or last three bins
                        seven_bins = hist[e-3:e+3+1]
                        if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                            # is local maximum and != zero
                            if (seven_bins[3]/float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= thresholds[0]):
                                # contains >50% of data and is greater than threshold
                                bad_bin[e] = 1

                            # for plotting remove good bins
                            else:
                                if plots: plot_hist[e]=1e-1
                        else:
                            if plots: plot_hist[e]=1e-1
                    else:
                        if plots: plot_hist[e]=1e-1

                if plots:
                    plt.step(bincenters, plot_hist, 'r-', where='mid')
                    plt.show()

                # having identified possible bad bins, check each year in turn
                for y,year in enumerate(month_ranges_years):

                    if season == 0:
                        # year
                        year_data = np.ma.masked_values(st_var.data[year[0][0]:year[-1][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[0][0]:year[-1][-1],flag_col[v]]
                    elif season == 1:
                        #mam
                        year_data = np.ma.masked_values(st_var.data[year[2][0]:year[4][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[2][0]:year[4][-1],flag_col[v]]
                    elif season == 2:
                        #jja
                        year_data = np.ma.masked_values(st_var.data[year[5][0]:year[7][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[5][0]:year[7][-1],flag_col[v]]
                    elif season == 3:
                        #son
                        year_data = np.ma.masked_values(st_var.data[year[8][0]:year[10][-1]], st_var.fdi)
                        year_flags = station.qc_flags[year[8][0]:year[10][-1],flag_col[v]]
                    elif season == 4:
                        #d+jf - December of this year joined with Jan/Feb
                        year_data = np.ma.concatenate([np.ma.masked_values(st_var.data[year[0][0]:year[1][-1]], st_var.fdi),\
                                                           np.ma.masked_values(st_var.data[year[-1][0]:year[-1][-1]], st_var.fdi)])
                        # np.append makes a copy, hence the explicit copy-back below
                        year_flags = np.append(station.qc_flags[year[0][0]:year[1][-1],flag_col[v]],station.qc_flags[year[-1][0]:year[-1][-1],flag_col[v]])

                    if len(year_data.compressed()) > MIN_DATA_REQUIRED_YEAR:

                        hist, binEdges = np.histogram(year_data.compressed(), bins = bins)

                        if plots:
                            # NOTE(review): unlike the record-wide call above, this call
                            #   does not pass the data as first argument - confirm against
                            #   the fvc_plot_setup signature
                            plot_hist, bincenters = fvc_plot_setup(hist, binEdges, st_var.name, title = "%s - %s" % (y+start.year, SEASONS[season]))

                        for e, element in enumerate(hist):

                            if bad_bin[e] == 1:
                                # only look at pre-identified bins

                                if e >= 3 and e <= (len(hist) - 3):
                                    # don't bother with first three or last three bins
                                    seven_bins = hist[e-3:e+3+1].astype('float')
                                    if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0):
                                        # is local maximum and != zero
                                        if (seven_bins[3]/seven_bins.sum() >= 0.5 and seven_bins[3] >= thresholds[1]) \
                                            or (seven_bins[3]/seven_bins.sum() >= 0.9 and seven_bins[3] >= thresholds[2]):
                                            # contains >50% or >90% of data and is greater than appropriate threshold

                                            # Flag these data
                                            bad_points = np.where((year_data >= binEdges[e]) & (year_data < binEdges[e+1]))
                                            year_flags[bad_points] = 1

                                        # for plotting remove good bins
                                        else:
                                            if plots: plot_hist[e]=1e-1
                                    else:
                                        if plots: plot_hist[e]=1e-1
                                else:
                                    if plots: plot_hist[e]=1e-1
                            else:
                                if plots: plot_hist[e]=1e-1

                        if diagnostics or plots:
                            nflags = len(np.where(year_flags != 0)[0])
                            print "{} {}".format(y + start.year, nflags)

                        if plots:
                            if nflags > 0:
                                plt.step(bincenters, plot_hist, 'r-', where='mid')
                                plt.show()
                            else:
                                plt.clf()

                    # copy flags back
                    if season == 0:
                        station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]] = year_flags
                    elif season == 1:
                        station.qc_flags[year[2][0]:year[4][-1], flag_col[v]] = year_flags
                    elif season == 2:
                        station.qc_flags[year[5][0]:year[7][-1], flag_col[v]] = year_flags
                    elif season == 3:
                        station.qc_flags[year[8][0]:year[10][-1], flag_col[v]] = year_flags
                    elif season == 4:
                        # split back into the Jan/Feb and December portions
                        split = len(station.qc_flags[year[0][0]:year[1][-1], flag_col[v]])
                        station.qc_flags[year[0][0]:year[1][-1], flag_col[v]] = year_flags[:split]
                        station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]] = year_flags[split:]

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]), noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Frequent Values Check")

    return # fvc