def dgc(station, variable_list, flag_col, start, end, logfile, plots=False, diagnostics=False, idl=False, GH=False):
    '''Controller for two individual tests'''

    if plots:
        import matplotlib.pyplot as plt

    for v, variable in enumerate(variable_list):

        station.qc_flags[:, flag_col[v]] = dgc_monthly(station, variable, station.qc_flags[:, flag_col[v]], start, end, logfile, plots=plots, diagnostics=diagnostics, idl=idl)

        if variable == "slp":
            # need to send in windspeeds too
            station.qc_flags[:, flag_col[v]] = dgc_all_obs(station, variable, station.qc_flags[:, flag_col[v]], start, end, plots=plots, diagnostics=diagnostics, idl=idl, windspeeds=True, GH=GH)
        else:
            station.qc_flags[:, flag_col[v]] = dgc_all_obs(station, variable, station.qc_flags[:, flag_col[v]], start, end, plots=plots, diagnostics=diagnostics, idl=idl, GH=GH)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Distributional Gap", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Distributional Gap", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var = getattr(station, variable)
        st_var.flags[flag_locs] = 1

    # MATCHES IDL for 030660-99999, 2 flags in T, 30-06-2014
    station = utils.append_history(station, "Distributional Gap Check")

    return # dgc
def mid_full(station, flag_col, logfile, diagnostics=False, plots=False):
    '''
    Mid cloud full, but values in high

    :param obj station: station object
    :param list flag_col: flag columns to use
    :param file logfile: logfile to store output
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output
    :returns:
    '''

    mid = getattr(station, "mid_cloud_cover")
    high = getattr(station, "high_cloud_cover")

    mid_full_locs = np.ma.where(mid.data == 8)

    bad_high = np.where(high.data.mask[mid_full_locs] != True)

    station.qc_flags[mid_full_locs[0][bad_high[0]], flag_col] = 1

    flag_locs = np.where(station.qc_flags[:, flag_col] != 0)
    if plots or diagnostics:
        utils.print_flagged_obs_number(logfile, "Mid full cloud", "cloud", len(flag_locs[0]), noWrite=True)
    else:
        utils.print_flagged_obs_number(logfile, "Mid full cloud", "cloud", len(flag_locs[0]))

    # copy flags into attribute
    high.flags[flag_locs] = 1

    return # mid_full
def negative_cloud(station, flag_col, logfile, diagnostics=False, plots=False):
    '''
    Non-sensical cloud value

    :param obj station: station object
    :param list flag_col: flag columns to use
    :param file logfile: logfile to store output
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output
    :returns:
    '''

    # go through each cloud variable and flag bad locations
    for c, cloud in enumerate(['total_cloud_cover', 'low_cloud_cover', 'mid_cloud_cover', 'high_cloud_cover']):

        cloud_obs = getattr(station, cloud)

        bad_locs = np.ma.where(cloud_obs.data < 0)

        station.qc_flags[bad_locs, flag_col] = 1

        # copy flags into attribute
        cloud_obs.flags[bad_locs] = 1

    flag_locs = np.where(station.qc_flags[:, flag_col] != 0)
    if plots or diagnostics:
        utils.print_flagged_obs_number(logfile, "Negative Cloud", "cloud", len(flag_locs[0]), noWrite=True)
    else:
        utils.print_flagged_obs_number(logfile, "Negative Cloud", "cloud", len(flag_locs[0]))

    return # negative_cloud
def unobservable(station, flag_col, logfile, diagnostics=False, plots=False):
    '''
    Cloud observation code given as unobservable (==9 or 10)

    :param obj station: station object
    :param list flag_col: flag columns to use
    :param file logfile: logfile to store output
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output
    :returns:
    '''

    # for each cloud variable, find bad locations and flag
    for c, cloud in enumerate(['total_cloud_cover', 'low_cloud_cover', 'mid_cloud_cover', 'high_cloud_cover']):

        cloud_obs = getattr(station, cloud)

        bad_locs = np.ma.where(np.logical_or(cloud_obs.data == 9, cloud_obs.data == 10))

        station.qc_flags[bad_locs, flag_col[c]] = 1

        flag_locs = np.where(station.qc_flags[:, flag_col[c]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Unobservable cloud", cloud, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Unobservable cloud", cloud, len(flag_locs[0]))

        # copy flags into attribute
        cloud_obs.flags[flag_locs] = 1

    return # unobservable
def clu(station, var_list, flag_cols, FLAG_COL_DICT, start, end, logfile, plots=False, diagnostics=False):
    """
    Run the clean up for each variable

    :param obj station: station object
    :param list var_list: list of variables to process
    :param list flag_cols: which column to use for each variable
    :param dict FLAG_COL_DICT: look-up dictionary from variable to flag columns
    :param datetime start: start of data
    :param datetime end: end of data
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    :param bool diagnostics: extra verbose output
    """

    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        clean_up(st_var, station.qc_flags, FLAG_COL_DICT[variable], flag_cols[v], start, end, station.time.data, plots=plots)

        flag_locs = np.where(station.qc_flags[:, flag_cols[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Clean Up Months", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Clean Up Months", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Clean Up Months")

    return # clu
def hcc_sss(T, D, month_ranges, start, logfile, plots=False, diagnostics=False):
    """
    Supersaturation check, on individual obs, and then if >20% of month affected

    :param array T: temperatures
    :param array D: dewpoint temperatures
    :param array month_ranges: array of month start and end times
    :param datetime start: DATASTART (for plotting)
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose output
    :returns: flags - locations where flags have been set
    """

    flags = np.zeros(len(T))

    # flag each location where D > T
    for m, month in enumerate(month_ranges):

        data_count = 0.0
        sss_count = 0.0
        for t in np.arange(month[0], month[1]):

            data_count += 1
            if D[t] > T[t]:
                sss_count += 1
                flags[t] = 1

                if plots:
                    hcc_time_plot(T, D, t - 1, t, start)

        # test whole month
        # if more than 20% flagged, flag whole month
        if sss_count / data_count >= SSS_MONTH_FRACTION:
            flags[month[0]:month[1]] = 1

            if plots:
                hcc_time_plot(T, D, month[0], month[1], start)

    nflags = len(np.where(flags != 0)[0])
    if plots or diagnostics:
        utils.print_flagged_obs_number(logfile, "Supersaturation", "temperature", nflags, noWrite=True)
    else:
        utils.print_flagged_obs_number(logfile, "Supersaturation", "temperature", nflags)

    # not yet tested.
    return flags # hcc_sss
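# A minimal, self-contained sketch (not part of the QC suite) of the
# supersaturation logic in hcc_sss() above, using synthetic data and an
# assumed SSS_MONTH_FRACTION of 0.2 - the ">20% of month" criterion in the
# docstring.
def _sss_sketch():
    T = np.array([10., 10., 10., 10., 10.])
    D = np.array([9., 11., 12., 11., 9.])  # three obs supersaturated (D > T)
    flags = np.zeros(len(T))
    flags[D > T] = 1  # flag the individual supersaturated obs
    if flags.sum() / float(len(T)) >= 0.2:  # whole "month" if >20% affected
        flags[:] = 1
    return flags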
def logical_checks(station, flag_col, logfile, plots=False, diagnostics=False):
    """
    Select occurrences of wind speed and direction which are
    logically inconsistent with measuring practices.

    From Table 2 - DeGaetano, JOAT, 14, 308-317, 1997

    :param Station station: station object
    :param array flag_col: which columns to use in QC flag array
    :param file logfile: logfile to output to
    :param bool plots: do plots?
    :param bool diagnostics: do diagnostics?
    """

    speed = getattr(station, "windspeeds")
    direction = getattr(station, "winddirs")

    # recover direction information where the speed is zero
    fix_zero_direction = np.ma.where(np.logical_and(speed.data == 0, direction.data.mask == True))
    direction.data[fix_zero_direction] = 0
    direction.data.mask[fix_zero_direction] = False
    station.qc_flags[fix_zero_direction, flag_col[1]] = -1 # to make a note of these

    # negative speeds
    negative_speed = np.ma.where(speed.data < 0)
    station.qc_flags[negative_speed, flag_col[0]] = 1

    # negative directions (don't try to adjust)
    negative_direction = np.ma.where(direction.data < 0)
    station.qc_flags[negative_direction, flag_col[1]] = 1

    # wrapped directions (don't try to adjust)
    wrapped_direction = np.ma.where(direction.data > 360)
    station.qc_flags[wrapped_direction, flag_col[1]] = 1

    # no direction possible if speed == 0
    bad_direction = np.ma.where(np.logical_and(speed.data == 0, direction.data != 0))
    station.qc_flags[bad_direction, flag_col[1]] = 1

    # northerlies given as 360, not 0 --> calm
    bad_speed = np.ma.where(np.logical_and(direction.data == 0, speed.data != 0))
    station.qc_flags[bad_speed, flag_col[0]] = 1

    # and output to file/screen
    flag_locs0, = np.where(station.qc_flags[:, flag_col[0]] > 0) # in case of direction fixes
    flag_locs1, = np.where(station.qc_flags[:, flag_col[1]] > 0) # in case of direction fixes

    if plots or diagnostics:
        utils.print_flagged_obs_number(logfile, "Wind Logical Checks", "windspeeds", len(flag_locs0), noWrite=True)
        utils.print_flagged_obs_number(logfile, "Wind Logical Checks", "winddirs", len(flag_locs1), noWrite=True)
    else:
        utils.print_flagged_obs_number(logfile, "Wind Logical Checks", "windspeeds", len(flag_locs0))
        utils.print_flagged_obs_number(logfile, "Wind Logical Checks", "winddirs", len(flag_locs1))

    # copy flags into attribute
    station.windspeeds.flags[flag_locs0] = 1
    station.winddirs.flags[flag_locs1] = 1

    return # logical_checks
def krc(station, var_list, flag_col, logfile, diagnostics=False, plots=False):
    '''
    Run the known records check for each variable in list

    :param object station: station to process
    :param list var_list: list of variables to process
    :param list flag_col: which columns to use for which variable
    :param file logfile: logfile to store output
    :param bool diagnostics: diagnostic output (unused)
    :param bool plots: do the plots (unused)
    '''

    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        st_region = krc_get_wmo_region(station.id)

        all_filtered = utils.apply_filter_flags(st_var)

        too_high = np.where(all_filtered > maxes[variable][st_region])

        krc_set_flags(too_high, station.qc_flags, flag_col[v])

        # make sure not to flag the missing values!
        too_low = np.where(np.logical_and(all_filtered < mins[variable][st_region], all_filtered.mask == False))

        krc_set_flags(too_low, station.qc_flags, flag_col[v])

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "World Record", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "World Record", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "World Record Check")

    return # krc
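# Hedged sketch of the world-record bounds test in krc() above: flag values
# outside [min, max] for the station's WMO region without flagging missing
# data.  The record values below are illustrative placeholders, not entries
# from the real maxes/mins tables.
def _krc_sketch():
    temps = np.ma.array([15., 60., -95., 20.], mask=[False, False, False, True])
    region_max, region_min = 57.8, -89.2  # placeholder record values
    flags = np.zeros(len(temps))
    flags[np.where(temps > region_max)] = 1
    flags[np.where(np.logical_and(temps < region_min, temps.mask == False))] = 1
    return flags  # -> [0, 1, 1, 0]; the masked value stays unflagged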
def rsc(station, var_list, flag_col, start, end, logfile, diagnostics=False, plots=False):
    '''
    Wrapper for the four individual repeating streak check tests
    '''

    times = station.time.data

    for v, variable in enumerate(var_list):

        st_var = getattr(station, variable)

        if len(utils.apply_filter_flags(st_var).compressed()) > 0:

            reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

            limits = limits_dict[variable][reporting_resolution]

            wind = False
            if variable == "windspeeds":
                wind = True

            # need to apply flags to st_var.flags each time for filtering
            station.qc_flags[:, flag_col[v][0]] = rsc_straight_strings(st_var, times, limits[0], limits[1], start, end, reporting=reporting_resolution, wind=wind, diagnostics=diagnostics, plots=plots, dynamic=True)

            station.qc_flags[:, flag_col[v][1]], station.qc_flags[:, flag_col[v][2]] = rsc_hourly_repeats(st_var, times, limits[2], limits[3], diagnostics=diagnostics, plots=plots)

            for streak_type in range(3):
                flag_locs = np.where(station.qc_flags[:, flag_col[v][streak_type]] != 0)
                if plots or diagnostics:
                    utils.print_flagged_obs_number(logfile, "Streak Check", variable, len(flag_locs[0]), noWrite=True)
                else:
                    utils.print_flagged_obs_number(logfile, "Streak Check", variable, len(flag_locs[0]))

                # copy flags into attribute
                st_var.flags[flag_locs] = 1

    station = utils.append_history(station, "Streak Check")

    return # rsc
def total_lt_max(station, flag_col, logfile, diagnostics=False, plots=False):
    '''
    Total cloud cover less than maximum of low, mid and high

    :param obj station: station object
    :param list flag_col: flag columns to use
    :param file logfile: logfile to store output
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output
    :returns:
    '''

    total = getattr(station, "total_cloud_cover")
    low = getattr(station, "low_cloud_cover")
    mid = getattr(station, "mid_cloud_cover")
    high = getattr(station, "high_cloud_cover")

    maximum = np.ma.max([low.data, mid.data, high.data], axis=0)

    bad_locs = np.ma.where(maximum > total.data)

    station.qc_flags[bad_locs, flag_col] = 1

    flag_locs = np.where(station.qc_flags[:, flag_col] != 0)
    if plots or diagnostics:
        utils.print_flagged_obs_number(logfile, "Total < Max cloud", "cloud", len(flag_locs[0]), noWrite=True)
    else:
        utils.print_flagged_obs_number(logfile, "Total < Max cloud", "cloud", len(flag_locs[0]))

    # copy flags into attribute
    total.flags[flag_locs] = 1
    low.flags[flag_locs] = 1
    mid.flags[flag_locs] = 1
    high.flags[flag_locs] = 1

    return # total_lt_max
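# Hedged sketch of the consistency test in total_lt_max() above: the
# layer-wise masked maximum ignores missing layers, so a flag is only raised
# where a reported layer exceeds the reported total.  Values are illustrative
# cloud amounts in oktas.
def _total_lt_max_sketch():
    total = np.ma.array([8., 4., 6.])
    low = np.ma.array([3., 6., 0.], mask=[False, False, True])
    mid = np.ma.array([5., 2., 0.], mask=[False, False, True])
    high = np.ma.array([0., 0., 0.], mask=[True, False, True])
    maximum = np.ma.max([low, mid, high], axis=0)  # -> [5.0, 6.0, --]
    return np.ma.where(maximum > total)  # -> index 1 only (6 > 4)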
def occ(station, variable_list, flag_col, datastart, logfile, diagnostics=False, plots=False, second=False):
    '''
    Check for odd clusters of data surrounded by missing
        up to 6hr/24hr surrounded by at least 48 on each side

    :param MetVar station: the station object
    :param list variable_list: list of observational variables to process
    :param list flag_col: the columns to set on the QC flag array
    :param datetime datastart: dataset start time
    :param file logfile: logfile to store outputs
    :param bool diagnostics: do extra verbose output
    :param bool plots: do plots
    :param bool second: run for second time
    :returns:
    '''

    # the four options of what to do with each observation
    # each key maps to a subroutine; all subroutines take the same set of inputs
    options = {0: occ_normal, 1: occ_start_cluster, 2: occ_in_cluster, 3: occ_after_cluster}

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        filtered_data = utils.apply_filter_flags(st_var)

        var_flags = station.qc_flags[:, flag_col[v]]

        prev_flag_number = 0
        if second:
            # count currently existing flags:
            prev_flag_number = len(var_flags[var_flags != 0])

        # using IDL copy as method to ensure reproducibility (initially)
        oc_details = OddCluster(st_var.mdi, st_var.mdi, 0, st_var.mdi, st_var.mdi, -1)

        obs_type = 1

        for time in station.time.data:

            if filtered_data.mask[time] == False:

                # process observation point using subroutines, called from the dictionary

                if plots and (obs_type == 3) and (time - oc_details.end >= 48):
                    # do plotting if matches flagging criteria
                    oc_plots(station, oc_details, time, datastart, filtered_data, variable)

                oc_details, obs_type = options[obs_type](oc_details, obs_type, time, var_flags)

            else:
                # have missing data
                if obs_type == 2:
                    obs_type = 3
                elif obs_type == 0:
                    obs_type = 1

        station.qc_flags[:, flag_col[v]] = var_flags

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Odd Cluster", variable, len(flag_locs[0]) - prev_flag_number, noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Odd Cluster", variable, len(flag_locs[0]) - prev_flag_number)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    # matches 032070 temperature 26/8/2014
    station = utils.append_history(station, "Isolated Odd Cluster Check")

    return # occ
def dgc_all_obs(station, variable, flags, start, end, plots=False, diagnostics=False, idl=False, windspeeds=False, GH=False):
    '''RJHD addition working on all observations'''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    all_filtered = utils.apply_filter_flags(st_var)

    for month in range(12):

        if windspeeds == True:
            st_var_wind = getattr(station, "windspeeds")

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):
                if y == 0:
                    windspeeds_month = np.ma.array(st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate([windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(windspeeds_month, median=True)

        this_month_data = np.array([])
        this_month_filtered = np.array([])

        this_month_data, dummy, dummy = utils.concatenate_months(month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:, month, :], all_filtered, hours=False)

        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(), percentile=0.05)
                print "Spurious_stations file not yet sorted"

            if iqr != 0.0:

                monthly_values = np.ma.array((this_month_data.compressed() - monthly_median) / iqr)

                bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(monthly_values, BIN_SIZE / 10.)

                hist, binEdges = np.histogram(monthly_values, bins=bins)

                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    initial_values = [np.max(hist), np.mean(monthly_values), np.std(monthly_values), stats.skew(monthly_values), stats.kurtosis(monthly_values)] # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[-1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    good_values = np.argwhere(plot_gaussian > FREQUENCY_THRESHOLD)

                    l_minimum_threshold = round(plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(plot_bincenters[good_values[-1]])

                else:
                    gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(monthly_values), sig=np.std(monthly_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                if diagnostics:
                    if GH:
                        print hist
                        print res
                        print iqr, l_minimum_threshold, u_minimum_threshold
                    else:
                        print hist
                        print gaussian
                        print iqr, u_minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)

                if plots:
                    dgc_set_up_plot(plot_gaussian, monthly_values, variable, threshold=(u_minimum_threshold, l_minimum_threshold), sub_par="observations", GH=GH)

                    if GH:
                        plt.figtext(0.15, 0.67, 'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' % (res['mean'], res['dispersion'], res['skewness'], res['kurtosis']), color='k', size='small')

                uppercount = len(np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics:
                    gap_plot_values = np.array([])

                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold)

                    if gap_start != 0:
                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(((this_year_data - monthly_median) / iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median) / iqr)

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold)

                    if gap_start != 0:
                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(np.logical_and(((this_year_data - monthly_median) / iqr) < gap_start, this_year_data.mask != True))

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median) / iqr)

                            if windspeeds:
                                this_year_flags[gap_cleaned_locations] = 2 # tentative flags

                                slp_average = dgc_get_monthly_averages(this_month_data, OBS_LIMIT, st_var.mdi, MEAN)
                                slp_mad = utils.mean_absolute_deviation(this_month_data, median=True)
                                storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) & \
                                                   (((this_month_data - slp_average) / slp_mad) > 4.5))

                                if len(storms[0]) >= 2:
                                    storm_1diffs = np.diff(storms)
                                    separations = np.where(storm_1diffs != 1)

                                    #for sep in separations:

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label='flagged', where='mid')
                    import calendar
                    plt.text(0.1, 0.9, calendar.month_name[month + 1], transform=plt.gca().transAxes)
                    plt.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.2), frameon=False, prop={'size': 13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    if diagnostics:
        utils.print_flagged_obs_number("", "Distributional Gap", variable, len(gap_plot_values), noWrite=True)

    return flags # dgc_all_obs
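# Hedged sketch of the threshold step in dgc_all_obs() for the GH=False
# branch: a Gaussian y = A * exp(-(x - mu)**2 / (2 * sig**2)) fitted to the
# histogram of standardised values is inverted analytically at a frequency
# threshold, x = mu + sig * sqrt(-2 * ln(y / A)), which is what
# utils.invert_gaussian is presumed to do.  All values are illustrative.
def _invert_gaussian_sketch():
    A, mu, sig = 100., 0., 1.  # fitted amplitude, mean and spread
    frequency_threshold = 0.1  # stand-in for the FREQUENCY_THRESHOLD constant
    offset = sig * np.sqrt(-2. * np.log(frequency_threshold / A))
    u_threshold = 1 + round(mu + offset)  # as in the GH=False branch above
    return -u_threshold, u_threshold  # -> (-5.0, 5.0)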
def dgc_monthly(station, variable, flags, start, end, logfile, plots=False, diagnostics=False, idl=False, doMonth=False):
    '''
    Original Distributional Gap Check

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array
    :param datetime start: data start
    :param datetime end: data end
    :param file logfile: output logfile
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics
    :param bool idl: run IDL equivalent routines for median
    :param bool doMonth: passed through to utils.apply_filter_flags
    :returns: flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)

    # get monthly averages
    month_average = np.empty(month_ranges.shape[0])
    month_average.fill(st_var.mdi)
    month_average_filtered = np.empty(month_ranges.shape[0])
    month_average_filtered.fill(st_var.mdi)

    all_filtered = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)
    for m, month in enumerate(month_ranges):

        data = st_var.data[month[0]:month[1]]
        filtered = all_filtered[month[0]:month[1]]

        month_average[m] = dgc_get_monthly_averages(data, OBS_LIMIT, st_var.mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(filtered, OBS_LIMIT, st_var.mdi, MEAN)

    # get overall monthly climatologies - use filtered data
    month_average = month_average.reshape(-1, 12)
    month_average_filtered = month_average_filtered.reshape(-1, 12)

    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(st_var.mdi)

    for m in range(12):

        valid_filtered = np.where(month_average_filtered[:, m] != st_var.mdi)

        if len(valid_filtered[0]) >= VALID_MONTHS:

            valid_data = month_average_filtered[valid_filtered, m][0]

            if MEAN:
                clim = np.mean(valid_data)
                spread = np.std(valid_data)
            else:
                if idl:
                    clim = utils.idl_median(valid_data.compressed().reshape(-1))
                else:
                    clim = np.median(valid_data)
                spread = utils.IQR(valid_data)
                if spread <= SPREAD_LIMIT:
                    spread = SPREAD_LIMIT

            standardised_months[valid_filtered, m] = (month_average[valid_filtered, m] - clim) / spread

    standardised_months = standardised_months.reshape(month_ranges.shape[0])

    good_months = np.where(standardised_months != st_var.mdi)

    # must be able to do this with masked arrays
    if plots:
        bins, bincenters = utils.create_bins(standardised_months[good_months], BIN_SIZE)
        dummy, plot_bincenters = utils.create_bins(standardised_months[good_months], BIN_SIZE / 10.)

        hist, binEdges = np.histogram(standardised_months[good_months], bins=bins)

        fit = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(standardised_months[good_months]), sig=np.std(standardised_months[good_months]))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        dgc_set_up_plot(plot_gaussian, standardised_months[good_months], variable, sub_par="Months")

    # remove all months with a large standardised offset
    if len(good_months[0]) >= MONTH_LIMIT:

        standardised_months = np.ma.masked_values(standardised_months, st_var.mdi)
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:

            for lo in large_offsets[0]:
                flags[month_ranges[lo, 0]:month_ranges[lo, 1]] = 1

            if plots:
                hist, binEdges = np.histogram(standardised_months[large_offsets], bins=bins)
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters, plot_hist, 'g-', label='> %i' % LARGE_LIMIT, where='mid', zorder=5)

                plt.axvline(5, c='g')
                plt.axvline(-5, c='g')

        # walk distribution from centre and see if any asymmetry
        sort_order = standardised_months[good_months].argsort()

        mid_point = len(good_months[0]) / 2

        good = True
        iter = 1
        while good:

            if standardised_months[good_months][sort_order][mid_point - iter] != standardised_months[good_months][sort_order][mid_point + iter]:
                # using IDL notation
                tempvals = [np.abs(standardised_months[good_months][sort_order][mid_point - iter]), np.abs(standardised_months[good_months][sort_order][mid_point + iter])]

                if min(tempvals) != 0:
                    if max(tempvals) / min(tempvals) >= 2. and min(tempvals) >= 1.5:
                        # substantial asymmetry in distribution - at least 1.5 from centre and difference of 2.

                        if tempvals[0] == max(tempvals):
                            # LHS
                            bad = good_months[0][sort_order][:mid_point - iter]
                            if plots:
                                badplot = standardised_months[good_months][sort_order][:mid_point - iter]
                        elif tempvals[1] == max(tempvals):
                            # RHS
                            bad = good_months[0][sort_order][mid_point + iter:]
                            if plots:
                                badplot = standardised_months[good_months][sort_order][mid_point + iter:]

                        for b in bad:
                            flags[month_ranges[b, 0]:month_ranges[b, 1]] = 1

                        if plots:
                            hist, binEdges = np.histogram(badplot, bins=bins)
                            plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                            plt.step(bincenters, plot_hist, 'r-', label='Gap', where='mid', zorder=4)

                        good = False

            iter += 1
            if iter == mid_point:
                break

    if plots:
        plt.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.2), frameon=False, prop={'size': 13})
        plt.show()
        #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap.png')

    nflags, = np.where(flags != 0)

    utils.print_flagged_obs_number(logfile, "Distributional Gap Month", variable, len(nflags), noWrite=diagnostics)

    return flags # dgc_monthly
def do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, start, logfile, plots=False, diagnostics=False):
    '''
    Set up and run the unflagging process for the specified tests

    :param MetVar station: station object
    :param string variable: variable to process
    :param array all_data: array containing all neighbour obs for full time period
    :param array reporting_accuracies: reporting accuracy for each neighbour
    :param array neigh_count: number of neighbours with data at each time stamp
    :param array dpd_flags: number of neighbours that have DPD set at each time stamp
    :param dict FLAG_COL_DICT: look-up dictionary from variable to flag columns
    :param datetime start: start of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    '''

    # unflagging using neighbours

    '''This is slow - np.ma.median is known to be slow
    https://github.com/astropy/ccdproc/issues/74
    https://github.com/astropy/ccdproc/blob/122cdbd5713140174f057eaa8fdb6f9ce03312df/docs/ccdproc/bottleneck_example.rst'''
    mean_of_neighbours = bn_median(all_data, axis=0)
    std_of_neighbours = median_absolute_deviation(all_data, axis=0)

    # find where the spread of neighbour observations is less than 1/2
    #    of maximum reporting accuracy
    std_of_neighbours[std_of_neighbours < 0.5 * max(reporting_accuracies)] = 0.5 * max(reporting_accuracies)

    # create series of normalised differences of obs from neighbour mean
    st_var = getattr(station, variable)
    normalised_differences = np.ma.abs(st_var.data - mean_of_neighbours) / std_of_neighbours

    for qc_test in ["climatological", "gap", "odd", "dpd"]:

        if qc_test == "dpd" and variable == "dewpoints":
            flags = station.qc_flags[:, UNFLAG_COL_DICT[qc_test][variable]]
            unset_locs = unflagging_locs(normalised_differences, flags, neigh_count, dpd_count=dpd_flags)
        elif qc_test == "dpd":
            # only unflag DPD on dewpoints
            continue
        elif qc_test == "gap" and variable != "slp":
            # only unflag gap check on slp observations
            continue
        else:
            flags = station.qc_flags[:, UNFLAG_COL_DICT[qc_test][variable]]
            if qc_test == "gap" or qc_test == "climatological":
                # only tentative flags
                unset_locs = unflagging_locs(normalised_differences, flags, neigh_count, flag_value=2)
            else:
                unset_locs = unflagging_locs(normalised_differences, flags, neigh_count)

        if len(unset_locs) > 0:
            station.qc_flags[unset_locs, UNFLAG_COL_DICT[qc_test][variable]] = 0

            # need to unflag attribute if and only if no other flags are set
            subset_flags = station.qc_flags[:, FLAG_COL_DICT[variable]]
            total_flags = np.sum(subset_flags[unset_locs, :], axis=1)
            clean_locs = np.where(total_flags == 0)
            st_var.flags[unset_locs[clean_locs]] = 0

        # and print result
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Unflagging " + qc_test, variable, len(unset_locs), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Unflagging " + qc_test, variable, len(unset_locs))

        if plots:
            if len(unset_locs) > 0:
                plot_outlier(station, variable, unset_locs, all_data, start)

    station = utils.append_history(station, "Unflagging - " + variable)

    return # do_unflagging
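# Hedged sketch of the neighbour comparison driving do_unflagging() above:
# each station value is scored as |obs - median(neighbours)| / MAD(neighbours),
# with the MAD floored at half the coarsest reporting accuracy so that
# near-identical neighbours cannot produce huge scores.  Plain numpy stands in
# for the bn_median / median_absolute_deviation helpers.
def _normalised_difference_sketch():
    all_data = np.ma.array([[10.0, 20.0, 5.0],
                            [10.5, 21.0, 5.0],
                            [9.5, 19.5, 5.0]])  # 3 neighbours x 3 time stamps
    obs = np.ma.array([10.2, 28.0, 5.4])  # the candidate station
    reporting_accuracies = np.array([0.1, 0.5, 1.0])

    neighbour_median = np.ma.median(all_data, axis=0)
    mad = np.ma.median(np.ma.abs(all_data - neighbour_median), axis=0)
    mad[mad < 0.5 * max(reporting_accuracies)] = 0.5 * max(reporting_accuracies)

    # -> [0.4, 16.0, 0.8]: only the gross outlier at time 1 scores highly
    return np.ma.abs(obs - neighbour_median) / mad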
def neighbour_checks(station_info, restart_id="", end_id="", distances=np.array([]), angles=np.array([]), second=False, masking=False, doZip=False, plots=False, diagnostics=False):
    """
    Run through neighbour checks on list of stations passed

    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations.
    """
    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncating the array
    neighbour_elevations = np.array(station_info[:, 3], dtype=float)
    neighbour_ids = np.array(station_info[:, 0])
    neighbour_info = np.array(station_info[:, :])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex: endindex + 1]
            distances = distances[startindex:endindex + 1, :]
            angles = angles[startindex:endindex + 1, :]
        else:
            station_info = station_info[startindex:]
            distances = distances[startindex:, :]
            angles = angles[startindex:, :]
    else:
        station_info = station_info[startindex:]
        distances = distances[startindex:, :]
        angles = angles[startindex:, :]

    # process each neighbour
    for st, stat in enumerate(station_info):

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a') # append to file if second iteration.
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip", os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s} {}\n".format("Total station record size :", len(station.time.data))
            else:
                logfile.write("{:35s} {}\n".format("Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # select neighbours
        neighbour_distances = distances[st, :]
        neighbour_bearings = angles[st, :]

        # have to add in start index so that can use location in distance file.
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st + startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour", "Distance", "Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n])
        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour", "Distance", "Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n], neighbour_distances[n], neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                #        but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second=second, diagnostics=diagnostics, plots=plots)

                    # now read in final set of neighbours and process

                    neigh_flags = np.zeros(len(station.time.data)) # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data)) # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data)) # number of neighbours with DPD set at each time stamp
                    reporting_accuracies = np.zeros(len(neighbours)) # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)]) # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics=diagnostics, read_input_station_id=False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id=False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance=neighbour_distances[nn_loc], diagnostics=diagnostics, plots=plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh, variable).data)

                        dpd_flags += neigh.qc_flags[:, 31]
                    # gone through all neighbours

                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3), (neigh_flags[some_flags].astype("float") / neigh_count[some_flags] > 2. / 3.)))

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1
                    else:
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite=True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))

                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots=plots, diagnostics=diagnostics)

                else:
                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))

            # variable loop
        else:
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")

        # clean up months
        qc_tests.clean_up.clu(station, ["temperatures", "dewpoints", "slp", "windspeeds", "winddirs"], [44, 45, 46, 47, 48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots=plots, diagnostics=diagnostics)

        if diagnostics or plots:
            raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            # gzip the raw file

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile, FLAG_COL_DICT)

            # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS, 'attributes.dat'), opt_var_list=carry_thru_vars, compressed=match_to_compress, processing_date='', qc_code_version=qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            logfile.close()

    # looped through all stations

    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):
            if first:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask.nc")])
            elif second:
                subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_internal2.nc")])
                if masking:
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_external2.nc")])
                    subprocess.call(["gzip", os.path.join(NETCDF_DATA_LOCS, stat[0] + "_mask2.nc")])

    print "Neighbour Checks completed\n"

    return # neighbour_checks
def hcc_dpd(times, T, D, P, C, SX, start, logfile, plots=False, diagnostics=False):
    """
    Dew point Depression check.  If long string of DPD = 0, then flag

    :param array times: timestamps
    :param array T: temperatures
    :param array D: dewpoint temperatures
    :param array P: precipitation depth
    :param array C: cloud base
    :param array SX: past significant weather
    :param datetime start: DATASTART (for plotting)
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose output
    :returns: flags - locations where flags have been set
    """

    flags = np.zeros(len(T))

    dpds = T - D

    last_dpds = -9999.0

    string_start_time = times[0]
    start_loc = 0

    for t, tt in enumerate(times):
        if (tt > 0) and (tt < times[-1]):

            if dpds.mask[t] == False:

                # if change in DPD, examine previous string
                if dpds[t] != last_dpds:

                    # if long enough
                    if times[t - 1] - string_start_time >= 24:

                        these_dpds = dpds[start_loc:t]
                        good = np.where(these_dpds.mask == False)

                        if T[t] >= 0:
                            abs_diff = 0.25
                        else:
                            abs_diff = 1.0

                        # has enough data and is small enough
                        if (len(good[0]) >= 4) and (abs(last_dpds) <= abs_diff):

                            # check if a weather event could explain it.
                            these_sigwx = SX[start_loc:t]
                            these_P = P[start_loc:t]
                            these_CB = C[start_loc:t]

                            # use past significant weather, precipitation or low cloud base
                            fog = np.where(np.logical_or.reduce((these_sigwx[good] >= 4, these_P[good] > 0.0, np.logical_and(these_CB[good] > 0.0, these_CB[good] < 1000.0))))

                            if len(fog[0]) >= 1:
                                if len(fog[0]) / float(len(good[0])) < 0.333:
                                    flags[start_loc:t][good] = 1
                                    if plots:
                                        hcc_time_plot(T, D, start_loc, t, start)
                            else:
                                flags[start_loc:t][good] = 1
                                if plots:
                                    hcc_time_plot(T, D, start_loc, t, start)

                    string_start_time = tt
                    start_loc = t
                    last_dpds = dpds[t]

    nflags = len(np.where(flags != 0)[0])

    if plots or diagnostics:
        utils.print_flagged_obs_number(logfile, "Dewpoint Depression", "temperature", nflags, noWrite=True)
    else:
        utils.print_flagged_obs_number(logfile, "Dewpoint Depression", "temperature", nflags)

    # checked on 032220 on 19/8/2014 and matches identically
    return flags # hcc_dpd
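# Hedged sketch of the core hcc_dpd() criterion above: a run of identical,
# near-zero dewpoint depressions lasting >= 24 hours is suspect unless
# supporting weather (significant weather code >= 4, precipitation, or a
# cloud base below 1000 m) covers at least a third of the run.  Thresholds
# mirror the code above; the arrays are synthetic.
def _dpd_string_sketch():
    dpd = np.zeros(30)  # 30 consecutive hours with DPD == 0
    sigwx = np.zeros(30)  # no significant weather reported
    precip = np.zeros(30)  # no precipitation
    cloud_base = 2000. * np.ones(30)  # high cloud base throughout

    supported = np.where(np.logical_or.reduce((sigwx >= 4, precip > 0.0,
                                               np.logical_and(cloud_base > 0.0, cloud_base < 1000.0))))
    # long enough (>= 24 h) and < 1/3 weather-supported --> flag the run
    return (len(dpd) >= 24) and (len(supported[0]) / float(len(dpd)) < 0.333)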
def wind_rose_check(station, flag_col, start, end, logfile, plots=False, diagnostics=False):
    """
    Checks for large differences in the year-to-year wind-rose shape.  Uses
    RMSE and fits Gaussian.  Finds gap in distribution to flag beyond.

    :param MetStation station: station object
    :param int flag_col: which column to store the flags in
    :param datetime start: start of data
    :param datetime end: end of data
    :param file logfile: logfile to store outputs
    :param bool plots: run the plots
    :param bool diagnostics: run the diagnostics
    """
    direction = station.winddirs.data
    speed = station.windspeeds.data
    flags = station.qc_flags[:, flag_col]

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges_years = month_ranges.reshape(-1, 12, 2)

    # histogram of wind directions ( ~ unravelled wind-rose)
    bw = 20
    bins = range(0, 360 + bw, bw)
    full_hist, binEdges = np.histogram(direction, bins=bins, normed=True)

    # use rmse as this is known (Chi-sq remains just in case)
    rmse, chisq = -np.ma.ones([month_ranges_years.shape[0]]), -np.ma.ones([month_ranges_years.shape[0]])

    # run through each year to extract RMSE's
    for y, year in enumerate(month_ranges_years):
        if len(direction[year[0][0]:year[-1][0]].compressed()) > 0:
            hist, binEdges = np.histogram(direction[year[0][0]:year[-1][0]], bins=bins, normed=True)

            chisq[y] = np.sum((full_hist - hist) ** 2 / (full_hist + hist)) / 2.
            rmse[y] = np.sqrt(np.mean((full_hist - hist) ** 2))
        else:
            rmse.mask[y] = True

    # now to bin up the differences and see what the fit is.
    # need to have values spread so can bin!
    if len(np.unique(rmse.compressed())) > 1:
        binEdges, bincenters = wind_create_bins(rmse)
        hist, binEdges = np.histogram(rmse, bins=binEdges)  # , density=True)
        norm = get_histogram_norm(rmse, binEdges)

        # inputs for fit
        mu = np.mean(rmse)
        std = np.std(rmse)

        # try to get decent fit to bulk of obs.
        # initial_values = [np.max(hist), np.mean(rmse), np.std(rmse), stats.skew(rmse), stats.kurtosis(rmse)]  # norm, mean, std, skew, kurtosis
        # fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))])
        # res = utils.hermite2gauss(fit[0])
        # plot_gaussian = utils.funcGH(fit[0], bincenters)

        fit = stats.rice.fit(rmse.compressed(), loc=0, scale=np.ma.std(rmse))
        dist_pdf = stats.rice.pdf(bincenters, fit[:-2], loc=fit[-2], scale=fit[-1]) * norm

        gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=mu, sig=std)

        # invert Gaussian to find initial threshold, then hunt for first gap beyond
        # threshold = utils.invert_gaussian(PROB_THRESHOLD, gaussian)

        # invert Rician to find initial threshold, then hunt for first gap beyond
        if dist_pdf[-1] < PROB_THRESHOLD:
            # curve has dropped below the threshold by the last bin, so find where it last exceeds it.
            # NOTE: map the reversed index back onto the bin centres so the threshold is an RMSE
            # value - previously a bare negative index was compared against RMSE values below,
            # so the walk outwards past the fitted bulk never happened.
            threshold = bincenters[::-1][np.where(dist_pdf[::-1] > PROB_THRESHOLD)[0][0]]
        else:
            threshold = bincenters[-1]

        n = 0
        center = np.argmax(hist)
        gap = bincenters[-1]  # nothing should be beyond this
        while True:
            if center + n + 1 == len(bincenters):
                # gone beyond edge - nothing to flag, so just break
                break
            if bincenters[center + n] < threshold:
                # continue moving outwards
                n += 1
                continue
            if hist[center + n] == 0:
                # found one
                if center + n + 1 == len(bincenters):
                    # gone beyond edge - nothing to flag - escape
                    break
                elif hist[center + n + 1] == 0:
                    # has to be two bins wide?
                    gap = bincenters[center + n]
                    break
            n += 1

        # run through each year again, flagging those beyond the gap
        for y, year in enumerate(month_ranges_years):
            if rmse[y] > gap:
                # only flag where there are observations
                good, = np.where(np.logical_or(direction.mask[year[0][0]:year[-1][0]] == False, speed.mask[year[0][0]:year[-1][0]] == False))
                flags[year[0][0]:year[-1][0]][good] = 1

                if diagnostics or plots:
                    print "Flagging {} RMSE {} > {}".format(y + start.year, rmse[y], gap)
            elif rmse.mask[y] == False:
                if diagnostics or plots:
                    print "{}".format(y + start.year)

        if plots:
            import matplotlib.pyplot as plt
            # plot underlying histogram
            plt.clf()
            plot_hist = np.array([float(x) if x != 0 else 1e-1 for x in hist])
            plt.step(binEdges[1:], plot_hist, color="k")
            # plot the Rician distribution on top
            plt.plot(bincenters, dist_pdf, "r-", label="Rician")
            # plot the gaussian on top
            plt.plot(binEdges[1:], utils.gaussian(bincenters, gaussian), color="b", ls=":", label="Gaussian")
            plt.yscale("log")
            plt.ylim([0.001, 2 * max(plot_hist)])
            # plot the thresholds
            plt.axvline(threshold, color="g")
            plt.axvline(gap, color="r")
            # plot flagged values in different colour
            if len(rmse[rmse > gap]) > 0:
                plt.step(binEdges[1:][bincenters >= gap], plot_hist[bincenters >= gap], color="r")
            # prettify
            plt.xlabel("RMSE between complete record and each year")
            plt.ylabel("Frequency")
            plt.title(station.id + " annual wind rose differences")
            plt.xlim([0, 1.1 * np.ma.max(rmse)])
            plt.legend(loc="lower right", frameon=False)
            plt.show()

            # plot all the annual wind roses, flattened out.
            plt.clf()
            hist, binEdges = np.histogram(direction, bins=np.arange(0.0, 360.0 + DEGREEBINS, DEGREEBINS), normed=True)
            bincenters = (binEdges[:-1] + binEdges[1:]) / 2.
            plt.plot(bincenters, hist, "k-", lw=2)
            for y, year in enumerate(month_ranges_years):
                if len(speed[year[0][0]:year[-1][0]].compressed()) > 0:  # test the length, not the boolean array
                    hist, binEdges = np.histogram(direction[year[0][0]:year[-1][0]], bins=binEdges, normed=True)
                    plt.plot(bincenters, hist)
            plt.xlabel("Direction (degrees)")
            plt.show()

            # plot wind roses as wind roses
            plot_wind_rose(speed, direction, "{} - {}".format(station.id, "all years"))
            for y, year in enumerate(month_ranges_years):
                if len(speed[year[0][0]:year[-1][0]].compressed()) > 0:
                    plot_wind_rose(speed[year[0][0]:year[-1][0]], direction[year[0][0]:year[-1][0]], "{} - {}".format(station.id, start.year + y), label="RMSE {:6.4f}\nThreshold {:6.4f}".format(rmse[y], gap))

    # and apply the flags and output text
    flag_locs, = np.where(flags != 0)
    if plots or diagnostics:
        utils.print_flagged_obs_number(logfile, "Wind Rose Check", "windspeeds/dirs", len(flag_locs), noWrite=True)
    else:
        utils.print_flagged_obs_number(logfile, "Wind Rose Check", "windspeeds/dirs", len(flag_locs))
    station.qc_flags[:, flag_col] = flags

    # and flag the variables
    station.windspeeds.flags[flag_locs] = 1
    station.winddirs.flags[flag_locs] = 1

    return  # wind_rose_check
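# --- illustrative sketch (not part of the original QC suite) -----------------
# The distance measure driving wind_rose_check is simply the RMSE between the
# normalised direction histogram of one year and that of the full record.
# This hypothetical helper shows that calculation on its own; the 20-degree
# bin width mirrors the hard-coded `bw = 20` above, and density=True is the
# modern spelling of the deprecated normed=True used in this module.
def _demo_wind_rose_rmse(all_dirs, year_dirs, bin_width=20):
    import numpy as np

    bins = np.arange(0, 360 + bin_width, bin_width)
    full_hist, _ = np.histogram(all_dirs, bins=bins, density=True)
    year_hist, _ = np.histogram(year_dirs, bins=bins, density=True)
    # root-mean-square difference between the two unravelled wind roses
    return np.sqrt(np.mean((full_hist - year_hist) ** 2))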
def wind_rose_check(station, flag_col, start, end, logfile, plots = False, diagnostics = False, doMonth = False): ''' Checks for large differences in the year-to-year wind-rose shape. Uses RMSE and fits Gaussian. Finds gap in distribution to flag beyond :param MetStation station: station object :param int flag_col: which column to store the flags in :param datetime start: start of data :param datetime end: end of data :param bool plots: run the plots :param bool diagnostics: run the diagnostics ''' st_var_spd = getattr(station, "windspeeds") st_var_dir = getattr(station, "winddirs") direction = st_var_dir.data speed = st_var_spd.data flags = station.qc_flags[:,flag_col] month_ranges = utils.month_starts_in_pairs(start, end) month_ranges_years = month_ranges.reshape(-1,12,2) filtered_direction = utils.apply_filter_flags(st_var_dir, doMonth = doMonth, start = start, end = end) filtered_speed = utils.apply_filter_flags(st_var_spd, doMonth = doMonth, start = start, end = end) # histogram of wind directions ( ~ unravelled wind-rose) dir_bins = range(0,360+DEGREEBINS,DEGREEBINS) full_hist, full_binEdges = np.histogram(filtered_direction.compressed(), bins = dir_bins, normed = True) if diagnostics: print full_hist # use rmse as this is known (Chi-sq remains just in case) rmse, chisq = -np.ma.ones([month_ranges_years.shape[0]]), -np.ma.ones([month_ranges_years.shape[0]]) # run through each year to extract RMSE's for y,year in enumerate(month_ranges_years): if len(direction[year[0][0]:year[-1][0]].compressed()) > 0: hist, dummy = np.histogram(direction[year[0][0]:year[-1][0]].compressed(), bins = dir_bins, normed = True) chisq[y] = np.sum((full_hist-hist)**2/(full_hist+hist))/2. rmse[y] = np.sqrt(np.mean((full_hist-hist)**2)) else: rmse.mask[y] = True # now to bin up the differences and see what the fit is. # need to have values spread so can bin! if len(np.unique(rmse.compressed())) > 1: rmse_binEdges, rmse_bincenters = wind_create_bins(rmse) hist, rmse_binEdges = np.histogram(rmse, bins = rmse_binEdges)#, density=True) norm = get_histogram_norm(rmse, rmse_binEdges) # inputs for fit mu = np.mean(rmse) std = np.std(rmse) # try to get decent fit to bulk of obs. # initial_values = [np.max(hist), np.mean(rmse), np.std(rmse), stats.skew(rmse), stats.kurtosis(rmse)] # norm, mean, std, sk#ew, kurtosis # fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))]) # res = utils.hermite2gauss(fit[0]) # plot_gaussian = utils.funcGH(fit[0], bincenters) fit = stats.rice.fit(rmse.compressed(), loc = 0, scale = np.ma.std(rmse)) dist_pdf = stats.rice.pdf(rmse_bincenters, fit[:-2], loc=fit[-2], scale=fit[-1]) * norm gaussian = utils.fit_gaussian(rmse_bincenters, hist, max(hist), mu = mu, sig = std) # invert Gaussian to find initial threshold, then hunt for first gap beyond # threshold = utils.invert_gaussian(PROB_THRESHOLD, gaussian) # invert Rician to find initial threshold, then hunt for first gap beyond if dist_pdf[-1] < PROB_THRESHOLD: # then curve has dropped below the threshold, so can find some updated ones. 
            # NOTE: map the reversed index back onto the bin centres so the threshold is an RMSE
            # value - previously a bare negative index was compared against RMSE values below,
            # so the walk outwards past the fitted bulk never happened.
            threshold = rmse_bincenters[::-1][np.where(dist_pdf[::-1] > PROB_THRESHOLD)[0][0]]
        else:
            threshold = rmse_bincenters[-1]

        n = 0
        center = np.argmax(hist)
        gap = rmse_bincenters[-1]  # nothing should be beyond this
        while True:
            if center + n + 1 == len(rmse_bincenters):
                # gone beyond edge - nothing to flag, so just break
                break
            if rmse_bincenters[center + n] < threshold:
                # continue moving outwards
                n += 1
                continue
            if hist[center + n] == 0:
                # found one
                if center + n + 1 == len(rmse_bincenters):
                    # gone beyond edge - nothing to flag - escape
                    break
                elif hist[center + n + 1] == 0:
                    # has to be two bins wide?
                    gap = rmse_bincenters[center + n]
                    break
            n += 1

        # run through each year again, flagging those beyond the gap
        for y, year in enumerate(month_ranges_years):
            if rmse[y] > gap:
                # only flag where there are observations
                good, = np.where(np.logical_or(direction.mask[year[0][0]:year[-1][0]] == False, speed.mask[year[0][0]:year[-1][0]] == False))

                if len(good) > 100:
                    flags[year[0][0]:year[-1][0]][good] = 1
                    if diagnostics or plots:
                        print "Flagging {} RMSE {} > {}".format(y + start.year, rmse[y], gap)
                else:
                    if diagnostics or plots:
                        print "{} beyond threshold (RMSE {} > {}) but retained as only {} observations\n".format(y + start.year, rmse[y], gap, len(good))
                    logfile.write("{} beyond threshold but retained as only {} observations\n".format(y + start.year, len(good)))
            elif rmse.mask[y] == False:
                if diagnostics or plots:
                    print "{}".format(y + start.year)

        if plots:
            import matplotlib.pyplot as plt
            # plot underlying histogram
            plt.clf()
            plot_hist = np.array([float(x) if x != 0 else 1e-1 for x in hist])
            plt.step(rmse_binEdges[1:], plot_hist, color = 'k')
            # plot the Rician distribution on top
            plt.plot(rmse_bincenters, dist_pdf, "r-", label = "Rician")
            # plot the gaussian on top
            plt.plot(rmse_binEdges[1:], utils.gaussian(rmse_bincenters, gaussian), color = 'b', ls = ":", label = "Gaussian")
            plt.yscale("log")
            plt.ylim([0.001, 2*max(plot_hist)])
            # plot the thresholds
            plt.axvline(threshold, color = 'g')
            plt.axvline(gap, color = 'r')
            # plot flagged values in different colour
            if len(rmse[rmse > gap]) > 0:
                plt.step(rmse_binEdges[1:][rmse_bincenters >= gap], plot_hist[rmse_bincenters >= gap], color = 'r')
            # prettify
            plt.xlabel("RMSE between complete record and each year")
            plt.ylabel("Frequency")
            plt.title(station.id + " annual wind rose differences")
            plt.xlim([0, 1.1*np.ma.max(rmse)])
            plt.legend(loc = "lower right", frameon = False)
            plt.show()

            # plot all the annual wind roses, flattened out.
            plt.clf()
            bincenters = (full_binEdges[:-1] + full_binEdges[1:])/2.
            plt.plot(bincenters, full_hist, "k-", lw = 2)
            for y, year in enumerate(month_ranges_years):
                if len(speed[year[0][0]:year[-1][0]].compressed()) > 0:  # test the length, not the boolean array
                    hist, binEdges = np.histogram(direction[year[0][0]:year[-1][0]].compressed(), bins = dir_bins, normed = True)
                    plt.plot(bincenters, hist)
            plt.xlabel("Direction (degrees)")
            plt.show()

            # plot wind roses as wind roses
            plot_wind_rose(speed, direction, "{} - {}".format(station.id, "all years"))
            for y, year in enumerate(month_ranges_years):
                if len(speed[year[0][0]:year[-1][0]].compressed()) > 0:
                    plot_wind_rose(speed[year[0][0]:year[-1][0]], direction[year[0][0]:year[-1][0]], "{} - {}".format(station.id, start.year + y), label = "RMSE {:6.4f}\nThreshold {:6.4f}".format(rmse[y], gap))
                else:
                    print "no data for {}".format(year)

    # and apply the flags and output text
    flag_locs, = np.where(flags != 0)
    utils.print_flagged_obs_number(logfile, "Wind Rose Check", "windspeeds/dirs", len(flag_locs), noWrite=diagnostics)
    station.qc_flags[:, flag_col] = flags

    # and flag the variables
    station.windspeeds.flags[flag_locs] = 1
    station.winddirs.flags[flag_locs] = 1

    return  # wind_rose_check
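# --- illustrative sketch (not part of the original QC suite) -----------------
# The while-loop in wind_rose_check walks outwards from the histogram mode,
# past the fitted-distribution threshold, until it meets two consecutive empty
# bins; the first of those becomes the "gap" beyond which years are flagged.
# A hypothetical stand-alone version of that walk, for clarity:
def _demo_find_gap(hist, bincenters, threshold):
    import numpy as np

    center = np.argmax(hist)     # start at the mode
    gap = bincenters[-1]         # default: nothing beyond the last bin
    n = 0
    while center + n + 1 < len(bincenters):
        if bincenters[center + n] < threshold:
            n += 1               # still inside the fitted bulk - keep walking
            continue
        if hist[center + n] == 0 and hist[center + n + 1] == 0:
            gap = bincenters[center + n]   # two empty bins = a real gap
            break
        n += 1
    return gap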
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False): for v, variable in enumerate(variable_list): st_var = getattr(station, variable) all_filtered = utils.apply_filter_flags(st_var) # is this needed 13th Nov 2014 RJHD #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var)) month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1,12,2) for month in range(12): hourly_climatologies = np.zeros(24) hourly_climatologies.fill(st_var.mdi) # append all e.g. Januaries together this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True) this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], all_filtered, hours = True) # if fixed climatology period, sort this here # get as array of 24 hrs. this_month = np.ma.array(this_month) this_month = this_month.reshape(-1,24) this_month_filtered = np.ma.array(this_month_filtered) this_month_filtered = this_month_filtered.reshape(-1,24) # get hourly climatology for each month for hour in range(24): this_hour = this_month[:,hour] # need to have data if this is going to work! if len(this_hour.compressed()) > 0: # winsorize & climatologies - done to match IDL if idl: this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl) hourly_climatologies[hour] = np.ma.sum(this_hour)/(len(this_hour) - 1) else: this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl = idl) hourly_climatologies[hour] = np.ma.mean(this_hour) if len(this_month.compressed()) > 0: # can get stations with few obs in a particular variable. # anomalise each hour over month appropriately anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0],1)) anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0],1)) if len(anomalies.compressed()) >= 10: iqr = utils.IQR(anomalies.compressed().reshape(-1))/2. # to match IDL if iqr < 1.5: iqr = 1.5 else: iqr = st_var.mdi normed_anomalies = anomalies / iqr normed_anomalies_filtered = anomalies_filtered / iqr # get average anomaly for year year_ids = np.array(year_ids) monthly_vqvs = np.ma.zeros(month_ranges.shape[0]) monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])] for year in range(month_ranges.shape[0]): year_locs = np.where(year_ids == year) this_year = normed_anomalies_filtered[year_locs,:] if len(this_year.compressed()) > 0: # need to have data for this to work! if idl: monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1)) else: monthly_vqvs[year] = np.ma.median(this_year) else: monthly_vqvs.mask[year] = True # low pass filter normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0]) # copy from distributional_gap.py - refactor! # get the threshold value bins, bincenters = utils.create_bins(normed_anomalies, 1.) hist, binEdges = np.histogram(normed_anomalies, bins = bins) gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(normed_anomalies), sig = np.std(normed_anomalies)) minimum_threshold = round(1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)) if diagnostics: print iqr, minimum_threshold, 1. 
+ utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian) print gaussian print hist if plots: coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = minimum_threshold, sub_par = "observations") uppercount = len(np.where(normed_anomalies > minimum_threshold)[0]) lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0]) these_flags = station.qc_flags[:, flag_col[v]] gap_plot_values, tentative_plot_values = [], [] # find the gaps and apply the flags gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size = 1) # in DGC it is 2. these_flags, gap_plot_values, tentative_plot_values =\ coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \ upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values) gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size = 1) # in DGC it is 2. these_flags, gap_plot_values, tentative_plot_values =\ coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \ upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values) station.qc_flags[:, flag_col[v]] = these_flags if uppercount + lowercount > 1000: #print "not sorted spurious stations yet" pass if plots: import matplotlib.pyplot as plt hist, binEdges = np.histogram(tentative_plot_values, bins = bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, c='orange', ls='-', label = 'tentative', where='mid') hist, binEdges = np.histogram(gap_plot_values, bins = bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid') import calendar plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes) leg=plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13},labelspacing=0.15,columnspacing=0.5) plt.setp(leg.get_title(), fontsize=14) plt.show() #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png') flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) # copy flags into attribute st_var.flags[flag_locs] = 1 if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite = True) print "where\n" nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0]) utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags, noWrite = True) nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0]) utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags, noWrite = True) else: utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0])) logfile.write("where\n") nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0]) utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags) nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0]) utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags) # firm flags match 030220 station = utils.append_history(station, "Climatological Check") return
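# --- illustrative sketch (not part of the original QC suite) -----------------
# The climatological check above standardises each observation against an
# hourly climatology for its calendar month: subtract the hourly mean, then
# divide by half the interquartile range (floored at 1.5, to match the IDL).
# A hypothetical, minimal version for one month of data shaped (days, 24);
# the winsorising step used by coc is omitted here for brevity.
def _demo_standardise_month(this_month):
    import numpy as np

    hourly_clim = np.ma.mean(this_month, axis=0)           # 24 hourly values
    anomalies = this_month - hourly_clim[np.newaxis, :]
    q75, q25 = np.percentile(anomalies.compressed(), [75, 25])
    iqr = (q75 - q25) / 2.       # halved, as in utils.IQR(...)/2. above
    if iqr < 1.5:
        iqr = 1.5                # floor to avoid over-flagging quiet months
    return anomalies / iqr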
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, second=False): ''' Spike Check, looks for spikes up to 3 observations long, using thresholds calculated from the data itself. :param MetVar station: the station object :param list variable_list: list of observational variables to process :param list flag_col: the columns to set on the QC flag array :param datetime start: dataset start time :param datetime end: dataset end time :param file logfile: logfile to store outputs :param bool plots: do plots :param bool second: run for second time :returns: ''' print "refactor" for v, variable in enumerate(variable_list): flags = station.qc_flags[:, flag_col[v]] prev_flag_number = 0 if second: # count currently existing flags: prev_flag_number = len(flags[flags != 0]) st_var = getattr(station, variable) all_filtered = utils.apply_filter_flags(st_var) reporting_resolution = utils.reporting_accuracy( utils.apply_filter_flags(st_var)) # to match IDL system - should never be called as would mean no data if reporting_resolution == -1: reporting_resolution = 1 month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1, 12, 2) good = np.where(all_filtered.mask == False) full_time_diffs = np.ma.zeros(len(all_filtered)) full_time_diffs.mask = all_filtered.mask full_time_diffs[ good] = station.time.data[good][1:] - station.time.data[good][:-1] # develop critical values using clean values # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately print "sort the differencing if values were flagged rather than missing" full_filtered_diffs = np.ma.zeros(len(all_filtered)) full_filtered_diffs.mask = all_filtered.mask full_filtered_diffs[good] = all_filtered.compressed( )[1:] - all_filtered.compressed()[:-1] # test all values good_to_uncompress = np.where(st_var.data.mask == False) full_value_diffs = np.ma.zeros(len(st_var.data)) full_value_diffs.mask = st_var.data.mask full_value_diffs[good_to_uncompress] = st_var.data.compressed( )[1:] - st_var.data.compressed()[:-1] # convert to compressed time to match IDL value_diffs = full_value_diffs.compressed() time_diffs = full_time_diffs.compressed() filtered_diffs = full_filtered_diffs.compressed() flags = flags[good_to_uncompress] critical_values = np.zeros([9, 12]) critical_values.fill(st_var.mdi) # link observation to calendar month month_locs = np.zeros(full_time_diffs.shape) for month in range(12): for year in range(month_ranges.shape[0]): if year == 0: this_month_time_diff = full_time_diffs[month_ranges[ year, month, 0]:month_ranges[year, month, 1]] this_month_filtered_diff = full_filtered_diffs[ month_ranges[year, month, 0]:month_ranges[year, month, 1]] else: this_month_time_diff = np.ma.concatenate([ this_month_time_diff, full_time_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] ]) this_month_filtered_diff = np.ma.concatenate([ this_month_filtered_diff, full_filtered_diffs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] ]) month_locs[month_ranges[year, month, 0]:month_ranges[year, month, 1]] = month for delta in range(1, 9): locs = np.ma.where(this_month_time_diff == delta) if len(locs[0]) >= 100: iqr = utils.IQR(this_month_filtered_diff[locs]) if iqr == 0. and delta == 1: critical_values[delta - 1, month] = 6. elif iqr == 0: critical_values[delta - 1, month] = st_var.mdi else: critical_values[delta - 1, month] = 6. 
* iqr # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD if plots: import calendar title = "{}, {}-hr differences".format( calendar.month_name[month + 1], delta) line_label = st_var.name xlabel = "First Difference Magnitudes" else: title, line_label, xlabel = "", "", "" threshold = utils.get_critical_values( this_month_filtered_diff[locs], binmin=0, binwidth=0.5, plots=plots, diagnostics=diagnostics, title=title, line_label=line_label, xlabel=xlabel, old_threshold=critical_values[delta - 1, month]) if threshold < critical_values[delta - 1, month]: critical_values[delta - 1, month] = threshold if plots or diagnostics: print critical_values[delta - 1, month], iqr, 6 * iqr month_locs = month_locs[good_to_uncompress] if diagnostics: print critical_values[0, :] # not less than 5x reporting accuracy good_critical_values = np.where(critical_values != st_var.mdi) low_critical_values = np.where( critical_values[good_critical_values] <= 5. * reporting_resolution) temporary = critical_values[good_critical_values] temporary[low_critical_values] = 5. * reporting_resolution critical_values[good_critical_values] = temporary if diagnostics: print critical_values[0, :], 5. * reporting_resolution # check hourly against 2 hourly, if <2/3 the increase to avoid crazy rejection rate for month in range(12): if critical_values[0, month] != st_var.mdi and critical_values[ 1, month] != st_var.mdi: if critical_values[0, month] / critical_values[1, month] <= 0.66: critical_values[0, month] = 0.66 * critical_values[1, month] if diagnostics: print critical_values[0, :] # get time differences for unfiltered data full_time_diffs = np.ma.zeros(len(st_var.data)) full_time_diffs.mask = st_var.data.mask full_time_diffs[ good_to_uncompress] = station.time.data[good_to_uncompress][ 1:] - station.time.data[good_to_uncompress][:-1] time_diffs = full_time_diffs.compressed() # go through each difference, identify which month it is in if passes spike thresholds # spikes at the beginning or ends of sections for t in np.arange(len(time_diffs)): if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3): # 10 days before but short gap thereafter next_values = st_var.data[good_to_uncompress[0][t + 1:]] good, = np.where(next_values.mask == False) next_median = np.ma.median(next_values[good[:10]]) next_diff = np.abs(value_diffs[t]) # out of spike median_diff = np.abs(next_median - st_var.data[good_to_uncompress[0][t]] ) # are the remaining onees if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi): # jump from spike > critical but average after < critical / 2 if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) 
and\ (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]) : flags[t] = 1 if plots or diagnostics: sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t + 1], start, variable, plots=plots) elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240): # 10 days after but short gap before prev_values = st_var.data[good_to_uncompress[0][:t - 1]] good, = np.where(prev_values.mask == False) prev_median = np.ma.median(prev_values[good[-10:]]) prev_diff = np.abs(value_diffs[t - 1]) median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[0][t]]) if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi): # jump into spike > critical but average before < critical / 2 if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.) and\ (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]) : flags[t] = 1 if plots or diagnostics: sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t + 1], start, variable, plots=plots) ''' this isn't the nicest way, but a direct copy from IDL masked arrays might help remove some of the lines Also, this is relatively slow''' for t in np.arange(len(time_diffs)): for spk_len in [1, 2, 3]: if t >= spk_len and t < len(time_diffs) - spk_len: # check if time differences are appropriate, for multi-point spikes if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and\ (np.abs(time_diffs[t]) <= spk_len * 3) and\ (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and\ (time_diffs[t + 1] - 1 < spk_len * 3) and \ ((spk_len == 1) or \ ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or \ ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))): # check if differences are valid if (value_diffs[t - spk_len] != st_var.mdi) and \ (value_diffs[t - spk_len] != st_var.fdi) and \ (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi): # if exceed critical values if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]): # are signs of two differences different if (math.copysign(1, value_diffs[t]) != math.copysign( 1, value_diffs[t - spk_len])): # are within spike differences small if (spk_len == 1) or\ ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.)) or \ ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.) and\ (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] -1, month_locs[t]] / 2.)): # check if following value is valid if (value_diffs[t] != st_var.mdi) and (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and\ (value_diffs[t] != st_var.fdi): # and if at least critical value if (np.abs(value_diffs[t]) >= critical_values[ time_diffs[t] - 1, month_locs[t]]): # test if surrounding differences below 1/2 critical value if (np.abs( value_diffs[t - spk_len - 1] ) <= critical_values[ time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.): if (np.abs( value_diffs[t + 1] ) <= critical_values[ time_diffs[t + 1] - 1, month_locs[t]] / 2.): # set the flags flags[t - spk_len + 1:t + 1] = 1 if plots or diagnostics: sc_diagnostics_and_plots( station.time. 
data, st_var.data, good_to_uncompress[ 0][t - spk_len + 1], good_to_uncompress[ 0][t + 1], start, variable, plots=plots) station.qc_flags[good_to_uncompress, flag_col[v]] = flags flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number, noWrite=True) # additional flags else: utils.print_flagged_obs_number( logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number) # additional flags # copy flags into attribute st_var.flags[flag_locs] = 1 # matches 030660 - but with adapted IDL # matches 030220 OK, but finds more but all are reasonable 1/9/14 do_interactive = False if plots and do_interactive == True: import matplotlib.pyplot as plt plot_times = utils.times_hours_to_datetime(station.time.data, start) plt.clf() plt.plot(plot_times, all_filtered, 'bo', ls='-') flg = np.where(flags[:, flag_col[v]] == 1) plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10) plt.show() station = utils.append_history(station, "Spike Check") return # sc
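# --- illustrative sketch (not part of the original QC suite) -----------------
# Stripped of the multi-point and data-gap handling, the core single-point
# spike condition in sc is: the jump into the point and the jump out of it
# both exceed the critical value, they have opposite signs, and the
# differences either side of the pair stay below half the critical value.
# Hypothetical helper; assumes 2 <= t <= len(diffs) - 2.
def _demo_is_single_spike(diffs, t, critical):
    import math

    jump_in, jump_out = diffs[t - 1], diffs[t]
    quiet_before, quiet_after = diffs[t - 2], diffs[t + 1]
    return (abs(jump_in) >= critical and
            abs(jump_out) >= critical and
            math.copysign(1, jump_in) != math.copysign(1, jump_out) and
            abs(quiet_before) <= critical / 2. and
            abs(quiet_after) <= critical / 2.)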
def dgc_all_obs(station, variable, flags, start, end, logfile, plots=False, diagnostics=False, idl=False, windspeeds=False, GH=False, doMonth=False): '''RJHD addition working on all observations''' if plots: import matplotlib.pyplot as plt month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1, 12, 2) # extract variable st_var = getattr(station, variable) # apply flags (and mask incomplete year if appropriate) all_filtered = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end) st_var_complete_year = copy.deepcopy(st_var) if doMonth: # restrict the incomplete year if appropriate - keep other flagged obs. full_year_end = utils.get_first_hour_this_year(start, end) st_var_complete_year.data.mask[full_year_end:] = True for month in range(12): # if requiring wind data, extract data and find monthly averages if windspeeds == True: st_var_wind = getattr(station, "windspeeds") if doMonth: # restrict the incomplete year if appropriate st_var_wind.data.mask[full_year_end:] = True # get monthly averages windspeeds_month = np.empty([]) for y, year in enumerate(month_ranges[:, month, :]): if y == 0: windspeeds_month = np.ma.array( st_var_wind.data[year[0]:year[1]]) else: windspeeds_month = np.ma.concatenate( [windspeeds_month, st_var_wind.data[year[0]:year[1]]]) windspeeds_month_average = dgc_get_monthly_averages( windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN) windspeeds_month_mad = utils.mean_absolute_deviation( windspeeds_month, median=True) # pull data from each calendar month together this_month_data, dummy, dummy = utils.concatenate_months( month_ranges[:, month, :], st_var.data, hours=False) this_month_filtered, dummy, dummy = utils.concatenate_months( month_ranges[:, month, :], all_filtered, hours=False) this_month_complete, dummy, dummy = utils.concatenate_months( month_ranges[:, month, :], st_var_complete_year.data, hours=False) # if enough clean and complete data for this calendar month find the median and IQR if len(this_month_filtered.compressed()) > OBS_LIMIT: if idl: monthly_median = utils.idl_median( this_month_filtered.compressed().reshape(-1)) else: monthly_median = np.ma.median(this_month_filtered) iqr = utils.IQR(this_month_filtered.compressed()) if iqr == 0.0: # to get some spread if IQR too small iqr = utils.IQR(this_month_filtered.compressed(), percentile=0.05) print "Spurious_stations file not yet sorted" # if have an IQR, anomalise using median and standardise using IQR if iqr != 0.0: monthly_values = np.ma.array( (this_month_data.compressed() - monthly_median) / iqr) complete_values = np.ma.array( (this_month_complete.compressed() - monthly_median) / iqr) # use complete years only for the histogram - aiming to find outliers. bins, bincenters = utils.create_bins(complete_values, BIN_SIZE) dummy, plot_bincenters = utils.create_bins( complete_values, BIN_SIZE / 10.) hist, binEdges = np.histogram(complete_values, bins=bins) """ Change to monthly updates Oct 2017 Thought about changing distribution to use filtered values But this changes the test beyond just dealing with additional months Commented out lines below would be alternative. """ # bins, bincenters = utils.create_bins(filtered_values, BIN_SIZE) # dummy, plot_bincenters = utils.create_bins(filtered_values, BIN_SIZE/10.) # hist, binEdges = np.histogram(filtered_values, bins = bins) # used filtered (incl. incomplete year mask) to determine the distribution. 
if GH: # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD # Feb 2019 - if large amounts off centre, can affect initial values # switched to median and MAD initial_values = [ np.max(hist), np.median(complete_values), utils.mean_absolute_deviation(complete_values, median=True), stats.skew(complete_values), stats.kurtosis(complete_values) ] # norm, mean, std, skew, kurtosis fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))]) res = utils.hermite2gauss(fit[0], diagnostics=diagnostics) plot_gaussian = utils.funcGH(fit[0], plot_bincenters) # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting? mid_point = np.argmax(plot_gaussian) bad, = np.where( plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.) if len(bad) > 0: plot_gaussian[mid_point:][ bad[0]:] = FREQUENCY_THRESHOLD / 10. bad, = np.where( plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.) if len(bad) > 0: plot_gaussian[:mid_point][:bad[ -1]] = FREQUENCY_THRESHOLD / 10. # extract threshold values good_values = np.argwhere( plot_gaussian > FREQUENCY_THRESHOLD) l_minimum_threshold = round( plot_bincenters[good_values[0]]) - 1 u_minimum_threshold = 1 + round( plot_bincenters[good_values[-1]]) if diagnostics: print hist print res print iqr, l_minimum_threshold, u_minimum_threshold # or just a standard Gaussian else: gaussian = utils.fit_gaussian( bincenters, hist, max(hist), mu=np.median(complete_values), sig=utils.mean_absolute_value(complete_values)) # assume the same threshold value u_minimum_threshold = 1 + round( utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)) l_minimum_threshold = -u_minimum_threshold plot_gaussian = utils.gaussian(plot_bincenters, gaussian) if diagnostics: print hist print gaussian print iqr, u_minimum_threshold, 1. + utils.invert_gaussian( FREQUENCY_THRESHOLD, gaussian) if plots: dgc_set_up_plot(plot_gaussian, complete_values, variable, threshold=(u_minimum_threshold, l_minimum_threshold), sub_par="observations", GH=GH) if GH: plt.figtext( 0.15, 0.67, 'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' % (res['mean'], res['dispersion'], res['skewness'], res['kurtosis']), color='k', size='small') # now trying to find gaps in the distribution uppercount = len( np.where(monthly_values > u_minimum_threshold)[0]) lowercount = len( np.where(monthly_values < l_minimum_threshold)[0]) # this needs refactoring - but lots of variables to pass in if plots or diagnostics: gap_plot_values = np.array([]) # do one side of distribution and then other if uppercount > 0: gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold) if gap_start != 0: # if found a gap, then go through each year for this calendar month # and flag observations further from middle for y, year in enumerate(month_ranges[:, month, :]): # not using filtered - checking all available data this_year_data = np.ma.array( st_var.data[year[0]:year[1]]) this_year_flags = np.array(flags[year[0]:year[1]]) gap_cleaned_locations = np.ma.where( ((this_year_data - monthly_median) / iqr) > gap_start) this_year_flags[gap_cleaned_locations] = 1 flags[year[0]:year[1]] = this_year_flags if plots or diagnostics: gap_plot_values = np.append( gap_plot_values, (this_year_data[gap_cleaned_locations]. 
compressed() - monthly_median) / iqr) if len(gap_cleaned_locations[0]) > 0: print "Upper {}-{} - {} obs flagged".format( y + start.year, month, len(gap_cleaned_locations[0])) print gap_cleaned_locations, this_year_data[ gap_cleaned_locations] if lowercount > 0: gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold) if gap_start != 0: # if found a gap, then go through each year for this calendar month # and flag observations further from middle for y, year in enumerate(month_ranges[:, month, :]): this_year_data = np.ma.array( st_var.data[year[0]:year[1]]) this_year_flags = np.array(flags[year[0]:year[1]]) gap_cleaned_locations = np.ma.where( np.logical_and( ((this_year_data - monthly_median) / iqr) < gap_start, this_year_data.mask != True)) # add flag requirement for low pressure bit if appropriate this_year_flags[gap_cleaned_locations] = 1 flags[year[0]:year[1]] = this_year_flags if plots or diagnostics: gap_plot_values = np.append( gap_plot_values, (this_year_data[gap_cleaned_locations]. compressed() - monthly_median) / iqr) if len(gap_cleaned_locations[0]) > 0: print "Lower {}-{} - {} obs flagged".format( y + start.year, month, len(gap_cleaned_locations[0])) print gap_cleaned_locations, this_year_data[ gap_cleaned_locations] # if doing SLP then do extra checks for storms if windspeeds: windspeeds_year = np.ma.array( st_var_wind.data[year[0]:year[1]]) this_year_flags[ gap_cleaned_locations] = 2 # tentative flags slp_average = dgc_get_monthly_averages( this_month_data, OBS_LIMIT, st_var.mdi, MEAN) slp_mad = utils.mean_absolute_deviation( this_month_data, median=True) # need to ensure that this_year_data is less than slp_average, hence order of test storms, = np.ma.where((((windspeeds_year - windspeeds_month_average) / windspeeds_month_mad) > MAD_THRESHOLD) &\ (((slp_average - this_year_data) / slp_mad) > MAD_THRESHOLD)) # using IDL terminology if len(storms) >= 2: # use the first difference series to find when there are gaps in # contiguous sequences of storm observations - want to split up into # separate storm events storm_1diffs = np.diff(storms) separations, = np.where(storm_1diffs != 1) # expand around storm signal so that all low SLP values covered, and unflagged if len(separations) >= 1: print " multiple storms in {} {}".format( y + start.year, month) # if more than one storm signal that month, then use intervals # in the first difference series to expand around the first interval alone storm_start = 0 storm_finish = separations[0] + 1 first_storm = dgc_expand_storms( storms[storm_start:storm_finish], len(this_year_data)) final_storms = copy.deepcopy( first_storm) for j in range(len(separations)): # then do the rest in a loop if j + 1 == len(separations): # final one this_storm = dgc_expand_storms( storms[separations[j] + 1:], len(this_year_data)) else: this_storm = dgc_expand_storms( storms[separations[j] + 1:separations[j + 1] + 1], len(this_year_data)) final_storms = np.append( final_storms, this_storm) else: # else just expand around the signal by 6 hours either way final_storms = dgc_expand_storms( storms, len(this_year_data)) else: final_storms = storms if len(storms) >= 1: print "Tropical Storm signal in {} {}".format( y + start.year, month) this_year_flags[final_storms] = 0 # and write flags back into array flags[year[0]:year[1]] = this_year_flags if plots: hist, binEdges = np.histogram(gap_plot_values, bins=bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, 'r-', label='flagged', where='mid') import calendar 
plt.text(0.1, 0.9, calendar.month_name[month + 1], transform=plt.gca().transAxes) plt.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.2), frameon=False, prop={'size': 13}) plt.show() #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png') nflags, = np.where(flags != 0) utils.print_flagged_obs_number(logfile, "Distributional Gap All", variable, len(nflags), noWrite=diagnostics) return flags # dgc_all_obs
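# --- illustrative sketch (not part of the original QC suite) -----------------
# For SLP the distributional-gap check above un-flags deep lows that look like
# real storms: hours where the wind is anomalously high *and* the pressure
# anomalously low, both measured in median-absolute-deviation units against
# the monthly averages.  A hypothetical cut-down of that test (MAD_THRESHOLD
# as used elsewhere in this module is passed in explicitly):
def _demo_storm_candidates(winds, wind_avg, wind_mad, slps, slp_avg, slp_mad, mad_threshold):
    import numpy as np

    windy = (winds - wind_avg) / wind_mad > mad_threshold
    deep_low = (slp_avg - slps) / slp_mad > mad_threshold  # average minus obs: low pressure
    return np.ma.where(windy & deep_low)[0]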
def dcc(station, variable_list, full_variable_list, flag_col, logfile, plots = False, diagnostics = False): ''' The diurnal cycle check. :param object station: the station object to be processed :param list variable_list: the variables to be processed :param list full_variable_list: the variables for flags to be applied to :param list flag_col: which column in the qc_flags array to work on :param file logfile: logfile to store outputs :param bool plots: to do any plots :param bool diagnostics: to do any extra diagnostic output :returns: ''' # list of flags for each variable diurnal_flags = [] for v,variable in enumerate(variable_list): st_var = getattr(station, variable) # is this needed 21/08/2014 # reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var)) # apply flags - for detection only filtered_data = utils.apply_filter_flags(st_var) filtered_data = filtered_data.reshape(-1,24) # working in fulltimes. number_of_days = filtered_data.shape[0] if plots: import matplotlib.pyplot as plt plt.clf() plot_data = np.ma.zeros(filtered_data.shape) plot_data.mask = True # best_estimate_counter = np.zeros(HOURS) diurnal_best_fits = np.zeros(filtered_data.shape[0], dtype = (int)) diurnal_best_fits.fill(INTMDI) diurnal_uncertainties = np.zeros(filtered_data.shape[0]) diurnal_uncertainties.fill(INTMDI) for d,day in enumerate(filtered_data): '''enough observations and have large enough diurnal range ''' if len(day.compressed()) >= OBS_PER_DAY: obs_daily_range = max(day.compressed()) - min(day.compressed()) if obs_daily_range >= DAILY_RANGE: if dcc_quartile_check(day): scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day.compressed()) diffs = np.zeros(HOURS) '''Find differences for each shifted sine --> cost function''' for h in range(HOURS): diffs[h] = np.sum(np.abs(day - scaled_sine).compressed()) scaled_sine = np.roll(scaled_sine, 1) # matched to IDL SHIFT() diurnal_best_fits[d] = np.argmin(diffs) # default uncertainty is the average time resolution of the data diurnal_uncertainties[d] = round(float(HOURS) / len(day.compressed())) if DYNAMIC_DIURNAL: critical_value = min(diffs) + ((max(diffs) - min(diffs)) * 0.33) # centre so minimum in middle diffs = np.roll(diffs, 11 - diurnal_best_fits[d]) uncertainty = 1 while uncertainty < 11: if (diffs[11 - uncertainty] > critical_value) and\ (diffs[11 + uncertainty] > critical_value): # break if both sides greater than critical difference # when counting outwards # see diurnal_example.py break uncertainty += 1 # check if uncertainty greater than time resolution for day if uncertainty > diurnal_uncertainties[d] : diurnal_uncertainties[d] = uncertainty if plots: # best_estimate_counter[np.argmin(diffs)] += 1 # scale daily data to range -1 -> 1, plot with random scatter for clarity plot_data[d] = ((2 * (day - min(day.compressed())) / obs_daily_range) - 1.) 
plt.plot(np.arange(24)+np.random.randn(24)*0.25, plot_data[d]+np.random.randn(24)*0.05, 'k,') if plots: plt.plot(np.arange(24),np.roll(dcc_make_sine(), np.argmax(np.bincount(diurnal_best_fits[np.where(diurnal_best_fits != INTMDI)]))),'r-') plt.xlim([-1,25]) plt.ylim([-1.2,1.2]) plt.show() # dumb copy of IDL '''For each uncertainty range (1-6h) find median of cycle offset''' best_fits = np.zeros(6) best_fits.fill(-9) for h in range(6): locs = np.where(diurnal_uncertainties == h+1) if len(locs[0]) > 300: # best_fits[h] = int(np.median(diurnal_best_fits[locs])) # Numpy median gives average of central two values which may not be integer # 25/11/2014 use IDL style which gives lower value best_fits[h] = utils.idl_median(diurnal_best_fits[locs]) '''Build up range of cycles incl, uncertainty to find where best of best located''' hours = np.arange(24) hour_matches=np.zeros(24) diurnal_peak = -9 number_estimates = 0 for h in range(6): if best_fits[h] != -9: '''Store lowest uncertainty best fit as first guess''' if diurnal_peak == -9: diurnal_peak = best_fits[h] hours = np.roll(hours,11-int(diurnal_peak)) hour_matches[11-(h+1):11+(h+2)] = 1 number_estimates += 1 centre = np.where(hours == best_fits[h]) if (centre[0] - h + 1) >= 0: if (centre[0] + h + 1 ) <=23: hour_matches[centre[0] - (h + 1) : centre[0] + h + 2] += 1 else: hour_matches[centre[0] - (h + 1) : ] += 1 hour_matches[ : centre[0] + h + 2- 24] += 1 else: hour_matches[: centre[0] + h + 2] += 1 hour_matches[centre[0] - (h + 1) :] += 1 number_estimates += 1 '''If value at lowest uncertainty not found in all others, then see what value is found by all others ''' if hour_matches[11] != number_estimates: # central estimate at 12 o'clock all_match = np.where(hour_matches == number_estimates) # if one is, then use it if len(all_match[0]) > 0: diurnal_peak = all_match[0][0] else: diurnal_peak = -9 '''Now have value for best fit diurnal offset''' potentially_spurious = np.zeros(number_of_days) potentially_spurious.fill(INTMDI) if diurnal_peak != -9: hours = np.arange(24) hours = np.roll(hours,11-int(diurnal_peak)) for d in range(number_of_days): if diurnal_best_fits[d] != INTMDI: '''Checks if global falls inside daily value+/-range rather than seeing if each day falls in global value+/-range''' min_range = 11 - diurnal_uncertainties[d] max_range = 11 + diurnal_uncertainties[d] maxloc = np.where(hours == diurnal_best_fits[d])[0][0] if maxloc < min_range or maxloc > max_range: potentially_spurious[d] = 1 else: potentially_spurious[d] = 0 # count number of good, missing and not-bad days n_good = 0 n_miss = 0 n_not_bad = 0 total_points = 0 total_not_miss = 0 to_flag = np.zeros(number_of_days) for d in range(number_of_days): if potentially_spurious[d] == 1: n_good = 0 n_miss = 0 n_not_bad = 0 total_points += 1 total_not_miss +=1 else: if potentially_spurious[d] == 0: n_good += 1 n_not_bad += 1 if n_miss != 0: n_miss = 0 total_not_miss += 1 if potentially_spurious[d] == -999: n_miss += 1 n_not_bad += 1 if n_good != 0: n_good = 0 total_points += 1 if (n_good == 3) or (n_miss == 3) or (n_not_bad >=6): if total_points >= 30: if float(total_not_miss)/total_points >= 0.5: to_flag[d - total_points : d ] = 1 n_good = 0 n_miss = 0 n_not_bad = 0 total_points = 0 total_not_miss = 0 dcc_flags = np.zeros(filtered_data.shape) for d in range(number_of_days): if to_flag[d] == 1: good = np.where(filtered_data.mask[d,:] == False) if len(good[0]) >= 1: dcc_flags[d,good]=1 if diagnostics: print len(np.where(dcc_flags == 1)[0]) print "currently matches IDL, but should all 
hours in days have flags set, not just the missing/flagged ones?" diurnal_flags += [dcc_flags] else: diurnal_flags += [np.zeros(filtered_data.shape)] station.qc_flags[:, flag_col[v]] = np.array(diurnal_flags).reshape(-1) flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]), noWrite = True) else: utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0])) # copy flags into attribute st_var.flags[flag_locs] = 1 # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Diurnal Cycle", plots = plots, diagnostics = diagnostics) station = utils.append_history(station, "Diurnal Cycle Check") return # dcc
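# --- illustrative sketch (not part of the original QC suite) -----------------
# The heart of the diurnal check is fitting a phase: scale a sine curve to the
# day's observed range, then try all 24 cyclic shifts and keep the one with
# the smallest summed absolute difference (as dcc does with np.roll).
# Hypothetical stand-alone version, assuming `day` is a masked array of 24
# hourly values and `sine` is one cycle in [-1, 1] (as dcc_make_sine() above).
def _demo_best_diurnal_phase(day, sine):
    import numpy as np

    obs_range = np.max(day.compressed()) - np.min(day.compressed())
    scaled = ((sine + 1.) / 2. * obs_range) + np.min(day.compressed())
    costs = np.zeros(24)
    for shift in range(24):
        costs[shift] = np.sum(np.abs(day - scaled).compressed())
        scaled = np.roll(scaled, 1)   # matched to IDL SHIFT(), as above
    return np.argmin(costs)          # best-fitting phase offset in hours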
def spc_diff(sfc, stn, flags, month_ranges, start, end, logfile, plots = False, diagnostics = False, doMonth = False):
    '''
    Pressure difference check, on individual obs.  Flag clearly erroneous
    station-level pressure values (those implausibly far from SLP).

    :param array sfc: SLP
    :param array stn: STNLP
    :param array flags: flags_array
    :param array month_ranges: array of month start and end times (recomputed below)
    :param datetime start: DATASTART
    :param datetime end: DATAEND
    :param file logfile: logfile to store outputs
    :param bool plots: do plots or not
    :param bool diagnostics: extra verbose output
    :param bool doMonth: account for spare month
    :returns: flags - locations where flags have been set
    '''

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1,12,2)

    # apply flags (and mask incomplete year if appropriate)
    sfc_filtered = utils.apply_filter_flags(sfc, doMonth = doMonth, start = start, end = end)
    stn_filtered = utils.apply_filter_flags(stn, doMonth = doMonth, start = start, end = end)

    # get the differences
    diffs = sfc.data - stn.data
    diffs_filtered = sfc_filtered - stn_filtered

    # robust statistics
    median_difference = np.ma.median(diffs)
    mad_difference = utils.mean_absolute_deviation(diffs, median = True)

    # where exceed
    high, = np.ma.where(diffs > (median_difference + MAD_THRESHOLD*mad_difference))
    low, = np.ma.where(diffs < (median_difference - MAD_THRESHOLD*mad_difference))

    # set flags
    if len(high) != 0:
        if diagnostics:
            print "Number of high differences {}".format(len(high))
        flags[high] = 1
    if len(low) != 0:
        if diagnostics:
            print "Number of low differences {}".format(len(low))
        flags[low] = 1

    if plots:
        import matplotlib.pyplot as plt
        plt.clf()
        plt.hist(diffs.compressed(), bins = np.arange(np.round(median_difference)-10, np.round(median_difference)+10, 0.1))
        # guide lines at +/- 4 MAD for reference (the flagging itself uses MAD_THRESHOLD)
        plt.axvline(x = (median_difference + 4*mad_difference), ls = "--", c = "r")
        plt.axvline(x = (median_difference - 4*mad_difference), ls = "--", c = "r")
        plt.xlim([median_difference - 11, median_difference + 11])
        plt.ylabel("Observations")
        plt.xlabel("Difference (hPa)")
        plt.show()
        # How to set the range of allowable values.

    nflags, = np.where(flags != 0)
    utils.print_flagged_obs_number(logfile, "Station Level Pressure", "stnlp", len(nflags), noWrite=diagnostics)

    return flags # spc_diff
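# --- illustrative sketch (not part of the original QC suite) -----------------
# spc_diff reduces to a robust outlier test on the SLP - STNLP differences:
# flag anything more than a threshold number of mean-absolute-deviations from
# the median difference.  Hypothetical minimal version; the MAD here mirrors
# utils.mean_absolute_deviation(..., median = True), assumed to be the mean
# absolute deviation about the median.
def _demo_pressure_difference_outliers(sfc, stn, mad_threshold):
    import numpy as np

    diffs = sfc - stn
    median = np.ma.median(diffs)
    mad = np.ma.mean(np.ma.abs(diffs - median))   # spread about the median
    return np.ma.where(np.ma.abs(diffs - median) > mad_threshold * mad)[0]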
def evc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False): if plots or diagnostics: import matplotlib.pyplot as plt import calendar # very similar to climatological check - ensure that not duplicating for v, variable in enumerate(variable_list): st_var = getattr(station, variable) reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var)) reporting_freq = utils.reporting_frequency(utils.apply_filter_flags(st_var)) month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1,12,2) month_data_count = np.zeros(month_ranges.shape[0:2]) # for each month for month in range(12): # set up hourly climatologies hourly_clims = np.zeros(24) hourly_clims.fill(st_var.data.fill_value) this_month, year_ids, month_data_count[:,month] = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True) # # extract each year and append together # year_ids = [] # counter to determine which year each day corresponds to # for year in range(month_ranges.shape[0]): # this_year = st_var.data[month_ranges[year,month][0]:month_ranges[year,month][1]] # if year == 0: # # store so can access each hour of day separately # this_month = this_year.reshape(-1,24) # year_ids = [year for x in range(this_month.shape[0])] # month_data_count[year,month] = len(this_year.compressed()) # else: # this_year = this_year.reshape(-1,24) # this_month = np.ma.concatenate((this_month, this_year), axis = 0) # year_ids.extend([year for x in range(this_year.shape[0])]) # month_data_count[year,month] = len(this_year.compressed()) # winsorize and get hourly climatology for h in range(24): this_hour = this_month[:,h] if len(this_hour.compressed()) > 100: # winsorize & climatologies - done to match IDL if idl: this_hour_winsorized = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl) hourly_clims[h] = np.ma.sum(this_hour_winsorized)/(len(this_hour_winsorized) - 1) else: this_hour_winsorized = utils.winsorize(this_hour.compressed(), 0.05, idl = idl) hourly_clims[h] = np.ma.mean(this_hour_winsorized) hourly_clims = np.ma.masked_where(hourly_clims == st_var.data.fill_value, hourly_clims) anomalies = this_month - np.tile(hourly_clims, (this_month.shape[0], 1)) # extract IQR of anomalies (using 1/2 value to match IDL) if len(anomalies.compressed()) >= 10: iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2. 
# to match IDL if iqr < 1.5: iqr = 1.5 else: iqr = st_var.mdi normed_anomalies = anomalies / iqr variances = np.ma.zeros(month_ranges.shape[0]) variances.mask = [False for i in range(month_ranges.shape[0])] rep_accuracies = np.zeros(month_ranges.shape[0]) rep_freqs = np.zeros(month_ranges.shape[0]) variances.fill(st_var.mdi) rep_accuracies.fill(st_var.mdi) rep_freqs.fill(st_var.mdi) year_ids = np.array(year_ids) # extract variance of normalised anomalies for each year for y, year in enumerate(range(month_ranges.shape[0])): year_locs = np.where(year_ids == y) this_year = normed_anomalies[year_locs,:] this_year = this_year.reshape(-1) # end of similarity with Climatological check if len(this_year.compressed()) >= 30: variances[y] = utils.mean_absolute_deviation(this_year, median = True) rep_accuracies[y] = utils.reporting_accuracy(this_year) rep_freqs[y] = utils.reporting_frequency(this_year) else: variances.mask[y] = True good = np.where(month_data_count[:,month] >= 100) # get median and IQR of variance for all years for this month if len(good[0]) >= 10: median_variance = np.median(variances[good]) iqr_variance = utils.IQR(variances[good]) / 2. # to match IDL if iqr_variance < 0.01: iqr_variance = 0.01 else: median_variance = st_var.mdi iqr_variance = st_var.mdi # if SLP, then get median and MAD of SLP and windspeed for month if variable in ["slp", "windspeeds"]: winds = getattr(station, "windspeeds") slp = getattr(station, "slp") # refactor this as similar in style to how target data extracted for y, year in enumerate(range(month_ranges.shape[0])): if y == 0: winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]] winds_month = winds_year.reshape(-1,24) slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]] slp_month = slp_year.reshape(-1,24) else: winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]] winds_year = winds_year.reshape(-1,24) winds_month = np.ma.concatenate((winds_month, winds_year), axis = 0) slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]] slp_year = slp_year.reshape(-1,24) slp_month = np.ma.concatenate((slp_month, slp_year), axis = 0) median_wind = np.ma.median(winds_month) median_slp = np.ma.median(slp_month) wind_MAD = utils.mean_absolute_deviation(winds_month.compressed()) slp_MAD = utils.mean_absolute_deviation(slp_month.compressed()) if diagnostics: print "median windspeed {} m/s, MAD = {}".format(median_wind, wind_MAD) print "median slp {} hPa, MAD = {}".format(median_slp, slp_MAD) # now test to see if variance exceeds expected range for y, year in enumerate(range(month_ranges.shape[0])): if (variances[y] != st_var.mdi) and (iqr_variance != st_var.mdi) and \ (median_variance != st_var.mdi) and (month_data_count[y,month] >= DATA_COUNT_THRESHOLD): # if SLP, then need to test if deep low pressure ("hurricane/storm") present # as this will increase the variance for this month + year if variable in ["slp", "windspeeds"]: iqr_threshold = 6. # increase threshold if reporting frequency and resolution of this # year doesn't match average if (rep_accuracies[y] != reporting_resolution) and \ (rep_freqs[y] != reporting_freq): iqr_threshold = 8. 
if diagnostics: print np.abs(variances[y] - median_variance) / iqr_variance, variances[y] , median_variance , iqr_variance , iqr_threshold, month+1, year+start.year if np.abs((variances[y] - median_variance) / iqr_variance) > iqr_threshold: # check for storms winds_month = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]] slp_month = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]] storm = False if (len(winds_month.compressed()) >= 1) and (len(slp_month.compressed()) >= 1): # find max wind & min SLP # max_wind_loc = np.where(winds_month == np.max(winds_month))[0][0] # min_slp_loc = np.where(slp_month == np.min(slp_month))[0][0] # if these are above thresholds and within one day of each other, # then it likely was a storm # print "fix this in case of multiple max/min locations" # if (np.abs(max_wind_loc - min_slp_loc) <= 24) and \ # (((np.max(winds_month) - median_wind) / wind_MAD) > MAD_THRESHOLD) and \ # (((median_slp - np.min(slp_month)) / slp_MAD) > MAD_THRESHOLD): # locations where winds greater than threshold high_winds, = np.where((winds_month - median_wind)/wind_MAD > MAD_THRESHOLD) # and where SLP less than threshold low_slps, = np.where((median_slp - slp_month)/slp_MAD > MAD_THRESHOLD) # if any locations match, then it's a storm match_loc = high_winds[np.in1d(high_winds, low_slps)] if len(match_loc) > 0: storm = True else: print "write spurious" # check the SLP first difference series # to ensure a drop down and climb out of minimum SLP/or climb up and down from maximum wind speed if variable == "slp": diffs = np.diff(slp_month.compressed()) elif variable == "windspeeds": diffs = np.diff(winds_month.compressed()) negs, poss = 0,0 biggest_neg, biggest_pos = 0,0 for diff in diffs: if diff > 0: if negs > biggest_neg: biggest_neg = negs negs = 0 poss += 1 else: if poss > biggest_pos: biggest_pos = poss poss = 0 negs += 1 if (biggest_neg < 10) and (biggest_pos < 10) and not storm: # not a hurricane, so mask station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1 if plots or diagnostics: print "No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year) else: logfile.write("No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year)) else: # hurricane if plots or diagnostics: print "Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year) else: logfile.write("Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year)) if plots: # plot showing the pressure, pressure first differences and the wind speeds plot_times = utils.times_hours_to_datetime(station.time.data[month_ranges[year,month][0]:month_ranges[year,month][1]], start) evc_plot_slp_wind(plot_times, slp_month, diffs, median_slp, slp_MAD, winds_month, median_wind, wind_MAD) else: iqr_threshold = 8. if (rep_accuracies[y] != reporting_resolution) and \ (rep_freqs[y] != reporting_freq): iqr_threshold = 10. 
if np.abs(variances[y] - median_variance) / iqr_variance > iqr_threshold: if diagnostics: print "flagging {} {}".format(year+start.year,calendar.month_name[month+1]) # remove the data station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1 if plots: plot_variances = (variances - median_variance) / iqr_variance plot_variances = np.ma.masked_where(month_data_count[:,month] < DATA_COUNT_THRESHOLD,plot_variances) evc_plot_hist(plot_variances, iqr_threshold, "Variance Check - %s - %s" % (variable, calendar.month_name[month+1])) flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]), noWrite = True) else: utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0])) # copy flags into attribute st_var.flags[flag_locs] = 1 # matches 030660 for T, D and SLP 21/8/2014 station = utils.append_history(station, "Excess Variance Check") return # evc
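# --- illustrative sketch (not part of the original QC suite) -----------------
# The excess-variance test in evc boils down to: compute a robust "variance"
# (the MAD of the normalised anomalies) for each year of a calendar month,
# then flag years whose value sits more than a threshold number of half-IQRs
# from the multi-year median.  A hypothetical reduction of that test:
def _demo_excess_variance_years(yearly_mads, iqr_threshold):
    import numpy as np

    median = np.ma.median(yearly_mads)
    q75, q25 = np.percentile(yearly_mads.compressed(), [75, 25])
    half_iqr = max((q75 - q25) / 2., 0.01)   # halved and floored, as above
    return np.ma.where(np.ma.abs(yearly_mads - median) / half_iqr > iqr_threshold)[0]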
def logical_checks(station, flag_col, logfile, plots=False, diagnostics=False): """ Select occurrences of wind speed and direction which are logically inconsistent with measuring practices. From Table 2 - DeGaetano, JOAT, 14, 308-317, 1997 :param Station station: station object :param array flag_col: which columns to use in QC flag array :param file logfile: logfile to output to :param bool plots: do plots? :param bool diagnostics: do diagnostics? """ speed = getattr(station, "windspeeds") direction = getattr(station, "winddirs") # recover direction information where the speed is Zero fix_zero_direction = np.ma.where( np.logical_and(speed.data == 0, direction.data.mask == True)) direction.data[fix_zero_direction] = 0 direction.data.mask[fix_zero_direction] = False station.qc_flags[fix_zero_direction, flag_col[1]] = -1 # to make a note of these # negative speeds negative_speed = np.ma.where(speed.data < 0) station.qc_flags[negative_speed, flag_col[0]] = 1 # negative directions (don't try to adjust) negative_direction = np.ma.where(direction.data < 0) station.qc_flags[negative_direction, flag_col[1]] = 1 # wrapped directions (don't try to adjust) wrapped_direction = np.ma.where(direction.data > 360) station.qc_flags[wrapped_direction, flag_col[1]] = 1 # no direction possible if speed == 0 bad_direction = np.ma.where( np.logical_and(speed.data == 0, direction.data != 0)) station.qc_flags[bad_direction, flag_col[1]] = 1 # northerlies given as 360, not 0 --> calm bad_speed = np.ma.where( np.logical_and(direction.data == 0, speed.data != 0)) station.qc_flags[bad_speed, flag_col[0]] = 1 # and output to file/screen flag_locs0, = np.where( station.qc_flags[:, flag_col[0]] > 0) # in case of direction fixes flag_locs1, = np.where( station.qc_flags[:, flag_col[1]] > 0) # in case of direction fixes if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Wind Logical Checks", "windspeeds", len(flag_locs0), noWrite=True) utils.print_flagged_obs_number(logfile, "Wind Logical Checks", "winddirs", len(flag_locs1), noWrite=True) else: utils.print_flagged_obs_number(logfile, "Wind Logical Checks", "windspeeds", len(flag_locs0)) utils.print_flagged_obs_number(logfile, "Wind Logical Checks", "winddirs", len(flag_locs1)) # copy flags into attribute station.windspeeds.flags[flag_locs0] = 1 station.winddirs.flags[flag_locs1] = 1 return # logical_checks
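# --- illustrative sketch (not part of the original QC suite) -----------------
# The DeGaetano (1997) wind consistency rules used in logical_checks, in
# brief: speeds must be non-negative, directions must lie in [0, 360], calm
# (speed 0) must be reported with direction 0, and direction 0 with a
# non-zero speed is a mis-coded northerly (which should be 360).  A
# hypothetical scalar version of the same rules:
def _demo_wind_obs_ok(speed, direction):
    if speed < 0 or direction < 0 or direction > 360:
        return False
    if speed == 0 and direction != 0:
        return False    # no direction possible when calm
    if direction == 0 and speed != 0:
        return False    # northerlies are reported as 360, not 0
    return True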
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False, doMonth = False): for v, variable in enumerate(variable_list): st_var = getattr(station, variable) all_filtered = utils.apply_filter_flags(st_var, doMonth = doMonth, start = start, end = end) st_var_complete_year = copy.deepcopy(st_var) if doMonth: # restrict the incomplete year if appropriate - keep other flagged obs. full_year_end = utils.get_first_hour_this_year(start, end) st_var_complete_year.data.mask[full_year_end :] = True # is this needed 13th Nov 2014 RJHD #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var)) month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1,12,2) for month in range(12): hourly_climatologies = np.zeros(24) hourly_climatologies.fill(st_var.mdi) # append all e.g. Januaries together this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True) this_month_complete, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var_complete_year.data, hours = True) this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], all_filtered, hours = True) # if fixed climatology period, sort this here # get as array of 24 hrs. this_month = np.ma.array(this_month) this_month = this_month.reshape(-1,24) this_month_complete = np.ma.array(this_month_complete) this_month_complete = this_month_complete.reshape(-1,24) this_month_filtered = np.ma.array(this_month_filtered) this_month_filtered = this_month_filtered.reshape(-1,24) # get hourly climatology for each month for hour in range(24): this_hour = this_month_complete[:,hour] # need to have data if this is going to work! if len(this_hour.compressed()) > 0: # winsorize & climatologies - done to match IDL if idl: this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl) hourly_climatologies[hour] = np.ma.sum(this_hour)/(len(this_hour) - 1) else: this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl = idl) hourly_climatologies[hour] = np.ma.mean(this_hour) if diagnostics: print "hourly clims", hourly_climatologies if len(this_month.compressed()) > 0 and len(this_month_complete.compressed()) > 0: # can get stations with few obs in a particular variable. # or, with monthly running, if this variable has just started reporting # only get data from the recent month, but not from previous year. # anomalise each hour over month appropriately anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0],1)) anomalies_complete = this_month_complete - np.tile(hourly_climatologies, (this_month_complete.shape[0],1)) anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0],1)) if len(anomalies_complete.compressed()) >= 10: iqr = utils.IQR(anomalies_complete.compressed().reshape(-1))/2. 
# to match IDL if iqr < 1.5: iqr = 1.5 else: iqr = st_var.mdi normed_anomalies = anomalies / iqr normed_anomalies_complete = anomalies_complete / iqr normed_anomalies_filtered = anomalies_filtered / iqr if diagnostics: print np.ma.mean(this_month), np.ma.mean(this_month_complete), np.ma.mean(this_month_filtered) print np.ma.mean(anomalies), np.ma.mean(anomalies_complete), np.ma.mean(anomalies_filtered) print np.ma.mean(normed_anomalies), np.ma.mean(normed_anomalies_complete), np.ma.mean(normed_anomalies_filtered) # get average anomaly for year year_ids = np.array(year_ids) monthly_vqvs = np.ma.zeros(month_ranges.shape[0]) monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])] for year in range(month_ranges.shape[0]): year_locs = np.where(year_ids == year) this_year = normed_anomalies_filtered[year_locs,:] if len(this_year.compressed()) > 0: # need to have data for this to work! if idl: monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1)) else: monthly_vqvs[year] = np.ma.median(this_year) else: monthly_vqvs.mask[year] = True if diagnostics: print "monthly vqvs", monthly_vqvs # low pass filter normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0]) if doMonth: # run low pass filter, ignoring the final incomplete year. not_final_year_locs, = np.where(year_ids != year_ids[-1]) normed_anomalies_complete[not_final_year_locs] = coc_low_pass_filter(normed_anomalies_complete[not_final_year_locs], year_ids[not_final_year_locs], monthly_vqvs[:-1], month_ranges.shape[0]-1) else: normed_anomalies_complete = coc_low_pass_filter(normed_anomalies_complete, year_ids, monthly_vqvs, month_ranges.shape[0]) # copy from distributional_gap.py - refactor! # get the threshold value using complete values bins, bincenters = utils.create_bins(normed_anomalies_complete, 1.) hist, binEdges = np.histogram(normed_anomalies_complete.compressed(), bins = bins) gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.ma.mean(normed_anomalies_complete), sig = np.ma.std(normed_anomalies_complete)) minimum_threshold = round(1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)) if diagnostics: print iqr, minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian) print gaussian print hist print bins if plots: coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = minimum_threshold, sub_par = "observations") # apply to uncomplete values uppercount = len(np.where(normed_anomalies > minimum_threshold)[0]) lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0]) these_flags = station.qc_flags[:, flag_col[v]] gap_plot_values, tentative_plot_values = [], [] # find the gaps and apply the flags gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size = 1) # in DGC it is 2. these_flags, gap_plot_values, tentative_plot_values =\ coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \ upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values) gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size = 1) # in DGC it is 2. 
these_flags, gap_plot_values, tentative_plot_values =\ coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \ upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values) station.qc_flags[:, flag_col[v]] = these_flags if uppercount + lowercount > 1000: #print "not sorted spurious stations yet" pass if plots: import matplotlib.pyplot as plt hist, binEdges = np.histogram(tentative_plot_values, bins = bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, c='orange', ls='-', label = 'tentative', where='mid') hist, binEdges = np.histogram(gap_plot_values, bins = bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid') import calendar plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes) leg=plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13},labelspacing=0.15,columnspacing=0.5) plt.setp(leg.get_title(), fontsize=14) plt.show() #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png') flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) # copy flags into attribute st_var.flags[flag_locs] = 1 utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite = diagnostics) if diagnostics: print "where\n" logfile.write("where\n") nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0]) utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags, noWrite = diagnostics) nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0]) utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags, noWrite = diagnostics) # firm flags match 030220 station = utils.append_history(station, "Climatological Check") return
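# Illustrative sketch (not called by the QC suite): the anomaly normalisation
# at the heart of coc() above - remove the hourly climatology for the month
# and scale by half the IQR, floored at 1.5 as in the code above. The diurnal
# cycle and noise here are invented.
def _demo_hourly_anomalies():
    import numpy as np

    np.random.seed(1)
    hours = np.arange(24)
    # 31 fake days x 24 hours with a diurnal cycle plus noise
    month = 10. + 5. * np.sin(2. * np.pi * hours / 24.) + \
        np.random.normal(0., 1., (31, 24))

    hourly_clim = month.mean(axis=0)   # climatology for each hour
    anomalies = month - hourly_clim    # diurnal cycle removed

    q75, q25 = np.percentile(anomalies, [75., 25.])
    iqr = (q75 - q25) / 2.
    if iqr < 1.5:
        iqr = 1.5                      # floor, to match coc()
    return anomalies / iqr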
def fvc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False): ''' Check for certain values occurring more frequently than would be expected :param object station: station object to process :param list variable_list: list of variables to process :param list flag_col: columns to fill in flag array :param datetime start: datetime object of start of data :param datetime end: datetime object of end of data :param file logfile: logfile to store outputs :param bool diagnostics: produce extra diagnostic output :param bool plots: produce plots ''' MIN_DATA_REQUIRED = 500 # to create histogram for complete record MIN_DATA_REQUIRED_YEAR = 100 # to create histogram month_ranges = utils.month_starts_in_pairs(start, end) month_ranges_years = month_ranges.reshape(-1,12,2) for v,variable in enumerate(variable_list): st_var = getattr(station, variable) reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var)) # apply flags - for detection only filtered_data = utils.apply_filter_flags(st_var) for season in range(5): # Year,MAM,JJA,SON,JF+D if season == 0: # all year season_data = np.ma.masked_values(filtered_data.compressed(), st_var.fdi) thresholds = [30,20,10] else: thresholds = [20,15,10] season_data = np.ma.array([]) for y,year in enumerate(month_ranges_years): # churn through months extracting data, accounting for fdi and concatenating together if season == 1: #mam season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[2][0]:year[4][-1]], st_var.fdi)]) elif season == 2: #jja season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[5][0]:year[7][-1]], st_var.fdi)]) elif season == 3: #son season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[8][0]:year[10][-1]], st_var.fdi)]) elif season == 4: #d+jf season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[0][0]:year[1][-1]], st_var.fdi)]) season_data = np.ma.concatenate([season_data, np.ma.masked_values(filtered_data[year[-1][0]:year[-1][-1]], st_var.fdi)]) season_data = season_data.compressed() if len(season_data) > MIN_DATA_REQUIRED: if 0 < reporting_accuracy <= 0.5: # -1 used as missing value bins, bincenters = utils.create_bins(season_data, 0.5) else: bins, bincenters = utils.create_bins(season_data, 1.0) hist, binEdges = np.histogram(season_data, bins = bins) if plots: plot_hist, bincenters = fvc_plot_setup(season_data, hist, binEdges, st_var.name, title = "%s" % (SEASONS[season])) bad_bin = np.zeros(len(hist)) # scan through bin values and identify bad ones for e, element in enumerate(hist): if e > 3 and e <= (len(hist) - 3): # don't bother with first three or last three bins seven_bins = hist[e-3:e+3+1] if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0): # is local maximum and != zero if (seven_bins[3]/float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= thresholds[0]): # contains >50% of data and is greater than threshold bad_bin[e] = 1 # for plotting remove good bins else: if plots: plot_hist[e]=1e-1 else: if plots: plot_hist[e]=1e-1 else: if plots: plot_hist[e]=1e-1 if plots: plt.step(bincenters, plot_hist, 'r-', where='mid') plt.show() # having identified possible bad bins, check each year in turn for y,year in enumerate(month_ranges_years): if season == 0: # year year_data = np.ma.masked_values(st_var.data[year[0][0]:year[-1][-1]], st_var.fdi) year_flags = station.qc_flags[year[0][0]:year[-1][-1],flag_col[v]] elif season == 1: #mam year_data = 
np.ma.masked_values(st_var.data[year[2][0]:year[4][-1]], st_var.fdi) year_flags = station.qc_flags[year[2][0]:year[4][-1],flag_col[v]] elif season == 2: #jja year_data = np.ma.masked_values(st_var.data[year[5][0]:year[7][-1]], st_var.fdi) year_flags = station.qc_flags[year[5][0]:year[7][-1],flag_col[v]] elif season == 3: #son year_data = np.ma.masked_values(st_var.data[year[8][0]:year[10][-1]], st_var.fdi) year_flags = station.qc_flags[year[8][0]:year[10][-1],flag_col[v]] elif season == 4: #d+jf year_data = np.ma.concatenate([np.ma.masked_values(st_var.data[year[0][0]:year[1][-1]], st_var.fdi),\ np.ma.masked_values(st_var.data[year[-1][0]:year[-1][-1]], st_var.fdi)]) year_flags = np.append(station.qc_flags[year[0][0]:year[1][-1],flag_col[v]],station.qc_flags[year[-1][0]:year[-1][-1],flag_col[v]]) if len(year_data.compressed()) > MIN_DATA_REQUIRED_YEAR: hist, binEdges = np.histogram(year_data.compressed(), bins = bins) if plots: plot_hist, bincenters = fvc_plot_setup(year_data.compressed(), hist, binEdges, st_var.name, title = "%s - %s" % (y+start.year, SEASONS[season])) for e, element in enumerate(hist): if bad_bin[e] == 1: # only look at pre-identified bins if e >= 3 and e <= (len(hist) - 3): # don't bother with first three or last three bins seven_bins = hist[e-3:e+3+1].astype('float') if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0): # is local maximum and != zero if (seven_bins[3]/seven_bins.sum() >= 0.5 and seven_bins[3] >= thresholds[1]) \ or (seven_bins[3]/seven_bins.sum() >= 0.9 and seven_bins[3] >= thresholds[2]): # contains >50% or >90% of data and is greater than appropriate threshold # Flag these data bad_points = np.where((year_data >= binEdges[e]) & (year_data < binEdges[e+1])) year_flags[bad_points] = 1 # for plotting remove good bins else: if plots: plot_hist[e]=1e-1 else: if plots: plot_hist[e]=1e-1 else: if plots: plot_hist[e]=1e-1 else: if plots: plot_hist[e]=1e-1 if diagnostics or plots: nflags = len(np.where(year_flags != 0)[0]) print "{} {}".format(y + start.year, nflags) if plots: if nflags > 0: plt.step(bincenters, plot_hist, 'r-', where='mid') plt.show() else: plt.clf() # copy flags back if season == 0: station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]] = year_flags elif season == 1: station.qc_flags[year[2][0]:year[4][-1], flag_col[v]] = year_flags elif season == 2: station.qc_flags[year[5][0]:year[7][-1], flag_col[v]] = year_flags elif season == 3: station.qc_flags[year[8][0]:year[10][-1], flag_col[v]] = year_flags elif season == 4: split = len(station.qc_flags[year[0][0]:year[1][-1], flag_col[v]]) station.qc_flags[year[0][0]:year[1][-1], flag_col[v]] = year_flags[:split] station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]] = year_flags[split:] flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]), noWrite = True) else: utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0])) # copy flags into attribute st_var.flags[flag_locs] = 1 station = utils.append_history(station, "Frequent Values Check") return # fvc
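# Illustrative sketch (not called by the QC suite): the histogram scan fvc()
# above uses to identify suspicious bins - a 7-bin window whose centre is a
# non-zero local maximum holding at least half the window's counts and at
# least ``threshold`` observations. The default threshold mirrors the
# all-year value above.
def _demo_frequent_value_scan(hist, threshold=30):
    import numpy as np

    hist = np.asarray(hist, dtype=float)
    bad_bins = []
    for e in range(3, len(hist) - 3):
        window = hist[e - 3: e + 4]
        centre = window[3]
        if centre == window.max() and centre != 0:
            if centre / window.sum() >= 0.5 and centre >= threshold:
                bad_bins.append(e)
    return bad_bins

# e.g. _demo_frequent_value_scan([0, 0, 0, 5, 3, 2, 100, 2, 1, 0, 0, 0])
# returns [6]: bin 6 dominates its 7-bin neighbourhood.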
def sc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, second = False): ''' Spike Check, looks for spikes up to 3 observations long, using thresholds calculated from the data itself. :param MetVar station: the station object :param list variable_list: list of observational variables to process :param list flag_col: the columns to set on the QC flag array :param datetime start: dataset start time :param datetime end: dataset end time :param file logfile: logfile to store outputs :param bool plots: do plots :param bool second: run for second time :returns: ''' print "refactor" for v, variable in enumerate(variable_list): flags = station.qc_flags[:, flag_col[v]] prev_flag_number = 0 if second: # count currently existing flags: prev_flag_number = len(flags[flags != 0]) st_var = getattr(station, variable) all_filtered = utils.apply_filter_flags(st_var) reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var)) # to match IDL system - should never be called as would mean no data if reporting_resolution == -1: reporting_resolution = 1 month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1,12,2) good = np.where(all_filtered.mask == False) full_time_diffs = np.ma.zeros(len(all_filtered)) full_time_diffs.mask = all_filtered.mask full_time_diffs[good] = station.time.data[good][1:] - station.time.data[good][:-1] # develop critical values using clean values # NOTE 4/7/14 - make sure that Missing and Flagged values treated appropriately print "sort the differencing if values were flagged rather than missing" full_filtered_diffs = np.ma.zeros(len(all_filtered)) full_filtered_diffs.mask = all_filtered.mask full_filtered_diffs[good] = all_filtered.compressed()[1:] - all_filtered.compressed()[:-1] # test all values good_to_uncompress = np.where(st_var.data.mask == False) full_value_diffs = np.ma.zeros(len(st_var.data)) full_value_diffs.mask = st_var.data.mask full_value_diffs[good_to_uncompress] = st_var.data.compressed()[1:] - st_var.data.compressed()[:-1] # convert to compressed time to match IDL value_diffs = full_value_diffs.compressed() time_diffs = full_time_diffs.compressed() filtered_diffs = full_filtered_diffs.compressed() flags = flags[good_to_uncompress] critical_values = np.zeros([9,12]) critical_values.fill(st_var.mdi) # link observation to calendar month month_locs = np.zeros(full_time_diffs.shape) for month in range(12): for year in range(month_ranges.shape[0]): if year == 0: this_month_time_diff = full_time_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]] this_month_filtered_diff = full_filtered_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]] else: this_month_time_diff = np.ma.concatenate([this_month_time_diff, full_time_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]]]) this_month_filtered_diff = np.ma.concatenate([this_month_filtered_diff, full_filtered_diffs[month_ranges[year,month,0]:month_ranges[year,month,1]]]) month_locs[month_ranges[year,month,0]:month_ranges[year,month,1]] = month for delta in range(1,9): locs = np.ma.where(this_month_time_diff == delta) if len(locs[0]) >= 100: iqr = utils.IQR(this_month_filtered_diff[locs]) if iqr == 0. and delta == 1: critical_values[delta-1,month] = 6. elif iqr == 0: critical_values[delta-1,month] = st_var.mdi else: critical_values[delta-1,month] = 6. 
* iqr # January 2015 - changed to dynamically calculating the thresholds if less than IQR method ^RJHD if plots: import calendar title = "{}, {}-hr differences".format(calendar.month_name[month+1], delta) line_label = st_var.name xlabel = "First Difference Magnitudes" else: title, line_label, xlabel = "","","" threshold = utils.get_critical_values(this_month_filtered_diff[locs], binmin = 0, binwidth = 0.5, plots = plots, diagnostics = diagnostics, title = title, line_label = line_label, xlabel = xlabel, old_threshold = critical_values[delta-1,month]) if threshold < critical_values[delta-1,month]: critical_values[delta-1,month] = threshold if plots or diagnostics: print critical_values[delta-1,month] , iqr, 6 * iqr month_locs = month_locs[good_to_uncompress] if diagnostics: print critical_values[0,:] # not less than 5x reporting accuracy good_critical_values = np.where(critical_values != st_var.mdi) low_critical_values = np.where(critical_values[good_critical_values] <= 5.*reporting_resolution) temporary = critical_values[good_critical_values] temporary[low_critical_values] = 5.*reporting_resolution critical_values[good_critical_values] = temporary if diagnostics: print critical_values[0,:], 5.*reporting_resolution # check hourly against 2 hourly; if <2/3, then increase to avoid crazy rejection rate for month in range(12): if critical_values[0,month] != st_var.mdi and critical_values[1,month] != st_var.mdi: if critical_values[0,month]/critical_values[1,month] <= 0.66: critical_values[0,month] = 0.66 * critical_values[1,month] if diagnostics: print critical_values[0,:] # get time differences for unfiltered data full_time_diffs = np.ma.zeros(len(st_var.data)) full_time_diffs.mask = st_var.data.mask full_time_diffs[good_to_uncompress] = station.time.data[good_to_uncompress][1:] - station.time.data[good_to_uncompress][:-1] time_diffs = full_time_diffs.compressed() # go through each difference, identify which month it is in, and flag if it passes spike thresholds # spikes at the beginning or ends of sections for t in np.arange(len(time_diffs)): if (np.abs(time_diffs[t - 1]) > 240) and (np.abs(time_diffs[t]) < 3): # 10 days before but short gap thereafter next_values = st_var.data[good_to_uncompress[0][t + 1:]] good, = np.where(next_values.mask == False) next_median = np.ma.median(next_values[good[:10]]) next_diff = np.abs(value_diffs[t]) # out of spike median_diff = np.abs(next_median - st_var.data[good_to_uncompress[0][t]]) # are the remaining ones if (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi): # jump from spike > critical but average after < critical / 2 if (np.abs(median_diff) < critical_values[time_diffs[t] - 1, month_locs[t]] / 2.) and\ (np.abs(next_diff) > critical_values[time_diffs[t] - 1, month_locs[t]]) : flags[t] = 1 if plots or diagnostics: sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t+1], start, variable, plots = plots) elif (np.abs(time_diffs[t - 1]) < 3) and (np.abs(time_diffs[t]) > 240): # 10 days after but short gap before prev_values = st_var.data[good_to_uncompress[0][:t - 1]] good, = np.where(prev_values.mask == False) prev_median = np.ma.median(prev_values[good[-10:]]) prev_diff = np.abs(value_diffs[t - 1]) median_diff = np.abs(prev_median - st_var.data[good_to_uncompress[0][t]]) if (critical_values[time_diffs[t - 1] - 1, month_locs[t]] != st_var.mdi): # jump into spike > critical but average before < critical / 2 if (np.abs(median_diff) < critical_values[time_diffs[t - 1] - 1, month_locs[t]] / 2.)
and\ (np.abs(prev_diff) > critical_values[time_diffs[t - 1] - 1, month_locs[t]]) : flags[t] = 1 if plots or diagnostics: sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t], good_to_uncompress[0][t+1], start, variable, plots = plots) ''' this isn't the nicest way, but a direct copy from IDL. Masked arrays might help remove some of the lines. Also, this is relatively slow''' for t in np.arange(len(time_diffs)): for spk_len in [1,2,3]: if t >= spk_len and t < len(time_diffs) - spk_len: # check if time differences are appropriate, for multi-point spikes if (np.abs(time_diffs[t - spk_len]) <= spk_len * 3) and\ (np.abs(time_diffs[t]) <= spk_len * 3) and\ (time_diffs[t - spk_len - 1] - 1 < spk_len * 3) and\ (time_diffs[t + 1] - 1 < spk_len * 3) and \ ((spk_len == 1) or \ ((spk_len == 2) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3)) or \ ((spk_len == 3) and (np.abs(time_diffs[t - spk_len + 1]) <= spk_len * 3) and (np.abs(time_diffs[t - spk_len + 2]) <= spk_len * 3))): # check if differences are valid if (value_diffs[t - spk_len] != st_var.mdi) and \ (value_diffs[t - spk_len] != st_var.fdi) and \ (critical_values[time_diffs[t - spk_len] - 1, month_locs[t]] != st_var.mdi): # if exceed critical values if (np.abs(value_diffs[t - spk_len]) >= critical_values[time_diffs[t - spk_len] - 1, month_locs[t]]): # are signs of two differences different if (math.copysign(1, value_diffs[t]) != math.copysign(1, value_diffs[t - spk_len])): # are within spike differences small if (spk_len == 1) or\ ((spk_len == 2) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.)) or \ ((spk_len == 3) and (np.abs(value_diffs[t - spk_len + 1]) < critical_values[time_diffs[t - spk_len + 1] -1, month_locs[t]] / 2.)
and\ (np.abs(value_diffs[t - spk_len + 2]) < critical_values[time_diffs[t - spk_len + 2] -1, month_locs[t]] / 2.)): # check if following value is valid if (value_diffs[t] != st_var.mdi) and (critical_values[time_diffs[t] - 1, month_locs[t]] != st_var.mdi) and\ (value_diffs[t] != st_var.fdi): # and if at least critical value if (np.abs(value_diffs[t]) >= critical_values[time_diffs[t] - 1, month_locs[t]]): # test if surrounding differences below 1/2 critical value if (np.abs(value_diffs[t - spk_len - 1]) <= critical_values[time_diffs[t - spk_len - 1] - 1, month_locs[t]] / 2.): if (np.abs(value_diffs[t + 1]) <= critical_values[time_diffs[t + 1] - 1, month_locs[t]] / 2.): # set the flags flags[ t - spk_len + 1 : t +1] = 1 if plots or diagnostics: sc_diagnostics_and_plots(station.time.data, st_var.data, good_to_uncompress[0][t-spk_len+1], good_to_uncompress[0][t+1], start, variable, plots = plots) station.qc_flags[good_to_uncompress, flag_col[v]] = flags flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number, noWrite = True) # additional flags else: utils.print_flagged_obs_number(logfile, "Spike", variable, len(flag_locs[0]) - prev_flag_number) # additional flags # copy flags into attribute st_var.flags[flag_locs] = 1 # matches 030660 - but with adapted IDL # matches 030220 OK, but finds more but all are reasonable 1/9/14 do_interactive = False if plots and do_interactive == True: import matplotlib.pyplot as plt plot_times = utils.times_hours_to_datetime(station.time.data, start) plt.clf() plt.plot(plot_times, all_filtered, 'bo', ls='-') flg = np.where(station.qc_flags[:, flag_col[v]] == 1) plt.plot(plot_times[flg], all_filtered[flg], 'ro', markersize=10) plt.show() station = utils.append_history(station, "Spike Check") return # sc
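# Illustrative sketch (not called by the QC suite): a one-point version of
# the spike test in sc() above. A point is flagged when the first differences
# either side both exceed the critical value and have opposite signs; the
# full check above also handles 2-3 point spikes, variable time steps and
# month-dependent critical values.
def _demo_simple_spike(values, critical=6.):
    import math
    import numpy as np

    values = np.asarray(values, dtype=float)
    diffs = np.diff(values)
    flags = np.zeros(len(values), dtype=int)
    for t in range(1, len(diffs)):
        if abs(diffs[t - 1]) >= critical and abs(diffs[t]) >= critical:
            if math.copysign(1, diffs[t]) != math.copysign(1, diffs[t - 1]):
                flags[t] = 1   # the point between two large, opposing jumps
    return flags  # e.g. [0, 0, 10, 0, 0] --> flags [0, 0, 1, 0, 0]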
def hcc_dpd(times, T, D, P, C, SX, start, logfile, plots=False, diagnostics=False): ''' Dew point Depression check. If long string of DPD = 0, then flag :param array times: timestamps :param array T: temperatures :param array D: dewpoint temperatures :param array P: precipitation depth :param array C: cloud base :param array SX: past significant weather :param datetime start: DATASTART (for plotting) :param file logfile: logfile to store outputs :param bool plots: do plots or not :param bool diagnostics: extra verbose output :returns: flags - locations where flags have been set ''' flags = np.zeros(len(T)) dpds = T - D last_dpds = -9999. string_start_time = times[0] start_loc = 0 for t, tt in enumerate(times): if (tt > 0) and (tt < times[-1]): if (dpds.mask[t] == False): # if change in DPD, examine previous string if (dpds[t] != last_dpds): # if long enough if (times[t - 1] - string_start_time >= 24): these_dpds = dpds[start_loc:t] good = np.where(these_dpds.mask == False) if T[t] >= 0: abs_diff = 0.25 else: abs_diff = 1. # has enough data and is small enough if (len(good[0]) >= 4) and (abs(last_dpds) <= abs_diff): # check if weather event could explain it. these_sigwx = SX[start_loc:t] these_P = P[start_loc:t] these_CB = C[start_loc:t] # use past significant weather, precipitation or low cloud base fog = np.where(np.logical_or.reduce((these_sigwx[good] >= 4, \ these_P[good] > 0., \ np.logical_and(these_CB[good] > 0., these_CB[good] < 1000.)))) if len(fog[0]) >= 1: if len(fog[0]) / float(len(good[0])) < 0.333: flags[start_loc:t][good] = 1 if plots: hcc_time_plot(T, D, start_loc, t, start) else: flags[start_loc:t][good] = 1 if plots: hcc_time_plot(T, D, start_loc, t, start) string_start_time = tt start_loc = t last_dpds = dpds[t] nflags = len(np.where(flags != 0)[0]) if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Dewpoint Depression", "temperature", nflags, noWrite=True) else: utils.print_flagged_obs_number(logfile, "Dewpoint Depression", "temperature", nflags) # checked on 032220 on 19/8/2014 and matches identically return flags # hcc_dpd
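# Illustrative sketch (not called by the QC suite): the run detection behind
# hcc_dpd() above, reduced to plain arrays - find strings where the dewpoint
# depression stays at exactly zero for 24 hours or more. The real check also
# applies a temperature-dependent tolerance and excuses runs explained by
# fog, precipitation or low cloud base.
def _demo_dpd_strings(times, t, d, min_hours=24.):
    import numpy as np

    dpd = np.asarray(t, dtype=float) - np.asarray(d, dtype=float)
    times = np.asarray(times, dtype=float)  # hours since start
    flags = np.zeros(len(dpd), dtype=int)

    start_loc = 0
    for i in range(1, len(dpd)):
        if dpd[i] != dpd[start_loc]:        # run of constant DPD has ended
            if dpd[start_loc] == 0. and times[i - 1] - times[start_loc] >= min_hours:
                flags[start_loc:i] = 1
            start_loc = i
    if dpd[start_loc] == 0. and times[-1] - times[start_loc] >= min_hours:
        flags[start_loc:] = 1               # close the final run
    return flags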
def hcc_cutoffs(T, D, month_ranges, logfile, start, plots=False, diagnostics=False): """ Check each month to see if most T obs have corresponding D obs done in bins of 10C - if one bin has <50% matching dewpoints, then remove the month :param array T: temperatures :param array D: dewpoint temperatures :param array month_ranges: array of month start and end times :param file logfile: logfile to store outputs :param datetime start: start of dataset for labelling of plots if required :param bool plots: do plots :param bool diagnostics: output extra verbose information :returns: flags - locations where flags have been set """ flags = np.zeros(len(T)) binwidth = 10 bins = np.arange(-90, 70, binwidth) for m, month in enumerate(month_ranges): this_month_T = T[month[0] : month[1]] this_month_D = D[month[0] : month[1]] goodT = np.where(this_month_T.mask == False) goodD = np.where(this_month_D.mask == False) # check if more than 112 obs (4/d * 28 d) if len(goodT[0]) > 112: # run through each bin for bin in bins: bin_locs = np.where((this_month_T >= bin) & (this_month_T < bin + binwidth)) # if data in this bin if len(bin_locs[0]) > 20: # get the data for this bin binT = this_month_T[bin_locs] binD = this_month_D[bin_locs] # find good temperatures good_binT = np.where(binT.mask == False) good_binD = np.where(binD.mask == False) # and the good dewpoints coincident with the good temperatures good_binD_T = np.where(binD.mask[good_binT] == False) if len(good_binD_T[0]) != 0: bad_D_fraction = 1.0 - len(good_binD_T[0]) / float(len(good_binT[0])) # if more than 50% missing if bad_D_fraction >= 0.5: # check the temporal resolution - if OK, then flag # This is a better temporal resolution calculation than IDL - will pick up more months T_resoln = np.median(np.diff(goodT)) D_resoln = np.median(np.diff(goodD)) # only flag if resolutions the same and number of observations in total are similar if (T_resoln != D_resoln) and (float(len(goodD[0])) / len(goodT[0]) < 0.666): continue else: flags[month[0] : month[1]][goodD] = 1 """ break this loop testing bins as whole month flagged no point doing any more testing return to month loop and move to next month""" dt_month = start + dt.timedelta(hours=month[0]) if diagnostics: print dt.datetime.strftime(dt_month, "%B %Y"), "-", bin, len(goodD[0]) if plots: # show the histogram import matplotlib.pyplot as plt plt.clf() plt.hist(this_month_T, bins=bins, color="r", label="Temperature", alpha=0.5) plt.hist(this_month_D, bins=bins, color="b", label="Dewpoint", alpha=0.5) plt.title(dt.datetime.strftime(dt_month, "%B %Y")) plt.legend() plt.show() break # checked on 032220-99999 on 20/8/2014 and matches mostly # one extra month found - issues with IDL month start/end times. nflags = len(np.where(flags != 0)[0]) if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Dewpoint Cut-off", "temperature", nflags, noWrite=True) else: utils.print_flagged_obs_number(logfile, "Dewpoint Cut-off", "temperature", nflags) return flags # hcc_cutoffs
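# Illustrative sketch (not called by the QC suite): the binned coverage test
# from hcc_cutoffs() above - for each 10-degree temperature bin, the fraction
# of good temperature observations without a matching dewpoint, which the
# check above compares against 0.5. ``t`` and ``d`` are masked arrays; the
# minimum bin population of 20 matches the code above.
def _demo_dewpoint_cutoff(t, d, binwidth=10.):
    import numpy as np

    t = np.ma.asarray(t)
    d = np.ma.asarray(d)
    good_t = ~np.ma.getmaskarray(t)
    missing_d = np.ma.getmaskarray(d)

    fractions = {}
    for b in np.arange(-90., 70., binwidth):
        tdata = t.filled(-9999.)  # masked values fall outside every bin
        in_bin = good_t & (tdata >= b) & (tdata < b + binwidth)
        n = in_bin.sum()
        if n > 20:
            fractions[b] = (in_bin & missing_d).sum() / float(n)
    return fractions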
def dmc(station, variable_list, full_variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False): ''' Method copied from check_duplicates.pro :param obj station: station object with suitable attributes (see netcdf_procs.py) :param list variable_list: list of netcdf variables to process :param list full_variable_list: the variables for flags to be applied to :param list flag_col: which column to set in flag array :param datetime start: data start :param datetime end: data end :param file logfile: logfile to store outputs :param bool diagnostics: extra verbosity :param bool plots: do plots ''' MIN_DATA_REQUIRED = 20 # obs per month # get array of Nx2 start/end pairs month_ranges = utils.month_starts_in_pairs(start, end) for v, variable in enumerate(variable_list): st_var = getattr(station, variable) # double loop structure - not ideal duplicated = np.zeros(len(month_ranges)) for sm, source_month in enumerate(month_ranges): if diagnostics: print "Month %i of %i" % (sm + 1, len(month_ranges)) source_data = st_var.data[source_month[0]:source_month[1]] if duplicated[sm] == 0: # don't repeat if already a duplicated for tm, target_month in enumerate(month_ranges[sm + 1:]): target_data = st_var.data[target_month[0]:target_month[1]] # match the data periods overlap = np.min([len(source_data), len(target_data)]) s_data, t_data = source_data[:overlap], target_data[:overlap] s_valid, t_valid = np.where(s_data.compressed() != st_var.fdi), \ np.where(t_data.compressed() != st_var.fdi) # if enough of an overlap if (len(s_valid[0]) >= MIN_DATA_REQUIRED) and \ (len(t_valid[0]) >= MIN_DATA_REQUIRED): if len(s_valid[0]) < len(t_valid[0]): duplicated = duplication_test(source_data, target_data, s_valid, sm, tm, source_month, target_month, duplicated, diagnostics, station.qc_flags, flag_col[v]) else: # swap the list of valid points duplicated = duplication_test(source_data, target_data, t_valid, sm, tm, source_month, target_month, duplicated, diagnostics, station.qc_flags, flag_col[v]) if plots: dmc_plot(s_data, t_data, start, source_month[0], target_month[0], st_var.name) # target month # source month # variable list flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) utils.print_flagged_obs_number(logfile, "Duplicate Month", variable, len(flag_locs[0]), noWrite = diagnostics) # copy flags into attribute st_var.flags[flag_locs] = 1 utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Duplicate Months", plots = plots, diagnostics = diagnostics) station = utils.append_history(station, "Duplicate Months Check") return # dmc
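# Illustrative sketch (not called by the QC suite): the month-against-month
# comparison behind dmc() above. duplication_test() itself is not shown in
# this file, so an exact-match criterion on the valid overlap is assumed
# here; plain arrays with ``fdi`` as the flagged-data indicator stand in for
# the masked station data.
def _demo_duplicate_months(source, target, fdi=-999., min_data=20):
    import numpy as np

    overlap = min(len(source), len(target))
    s = np.asarray(source[:overlap], dtype=float)
    t = np.asarray(target[:overlap], dtype=float)

    both_valid = (s != fdi) & (t != fdi)
    if both_valid.sum() < min_data:
        return False            # not enough overlap to judge
    return bool(np.all(s[both_valid] == t[both_valid]))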
def hcc_sss(T, D, month_ranges, start, logfile, plots=False, diagnostics=False): ''' Supersaturation check, on individual obs, and then if >20% of month affected :param array T: temperatures :param array D: dewpoint temperatures :param array month_ranges: array of month start and end times :param datetime start: DATASTART (for plotting) :param file logfile: logfile to store outputs :param bool plots: do plots or not :param bool diagnostics: extra verbose output :returns: flags - locations where flags have been set ''' flags = np.zeros(len(T)) # flag each location where D > T for m, month in enumerate(month_ranges): data_count = 0. sss_count = 0. for t in np.arange(month[0], month[1]): data_count += 1 if D[t] > T[t]: sss_count += 1 flags[t] = 1 if plots: hcc_time_plot(T, D, t - 1, t, start) # test whole month # if more than 20% flagged, flag whole month if sss_count / data_count >= SSS_MONTH_FRACTION: flags[month[0]:month[1]] = 1 if plots: hcc_time_plot(T, D, month[0], month[1], start) nflags = len(np.where(flags != 0)[0]) if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Supersaturation", "temperature", nflags, noWrite=True) else: utils.print_flagged_obs_number(logfile, "Supersaturation", "temperature", nflags) # not yet tested. return flags # hcc_sss
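# Illustrative sketch (not called by the QC suite): the supersaturation rule
# from hcc_sss() above for a single fake month - flag each observation where
# dewpoint exceeds temperature, then flag the whole month if the affected
# fraction reaches SSS_MONTH_FRACTION (defined elsewhere in this module;
# 0.2 is assumed here from the ">20%" in the docstring above).
def _demo_supersaturation(t, d, month_fraction=0.2):
    import numpy as np

    t = np.asarray(t, dtype=float)
    d = np.asarray(d, dtype=float)

    flags = np.zeros(len(t), dtype=int)
    flags[d > t] = 1                        # individual supersaturated obs
    if flags.sum() / float(len(flags)) >= month_fraction:
        flags[:] = 1                        # flag the whole month
    return flags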
def fvc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False): ''' Check for certain values occurring more frequently than would be expected :param object station: station object to process :param list variable_list: list of variables to process :param list flag_col: columns to fill in flag array :param datetime start: datetime object of start of data :param datetime end: datetime object of end of data :param file logfile: logfile to store outputs :param bool diagnostics: produce extra diagnostic output :param bool plots: produce plots :param bool doMonth: ignore months after last complete year/season for distribution ''' MIN_DATA_REQUIRED = 500 # to create histogram for complete record MIN_DATA_REQUIRED_YEAR = 100 # to create histogram month_ranges = utils.month_starts_in_pairs(start, end) month_ranges_years = month_ranges.reshape(-1, 12, 2) for v, variable in enumerate(variable_list): st_var = getattr(station, variable) reporting_accuracy = utils.reporting_accuracy( utils.apply_filter_flags(st_var)) # apply flags - for detection only filtered_data = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end) for season in range(5): # Year,MAM,JJA,SON,JF+D if season == 0: # all year season_data = np.ma.masked_values(filtered_data.compressed(), st_var.fdi) thresholds = [30, 20, 10] else: thresholds = [20, 15, 10] season_data = np.ma.array([]) for y, year in enumerate(month_ranges_years): # churn through months extracting data, accounting for fdi and concatenating together if season == 1: #mam season_data = np.ma.concatenate([ season_data, np.ma.masked_values( filtered_data[year[2][0]:year[4][-1]], st_var.fdi) ]) elif season == 2: #jja season_data = np.ma.concatenate([ season_data, np.ma.masked_values( filtered_data[year[5][0]:year[7][-1]], st_var.fdi) ]) elif season == 3: #son season_data = np.ma.concatenate([ season_data, np.ma.masked_values( filtered_data[year[8][0]:year[10][-1]], st_var.fdi) ]) elif season == 4: #d+jf season_data = np.ma.concatenate([ season_data, np.ma.masked_values( filtered_data[year[0][0]:year[1][-1]], st_var.fdi) ]) season_data = np.ma.concatenate([ season_data, np.ma.masked_values( filtered_data[year[-1][0]:year[-1][-1]], st_var.fdi) ]) season_data = season_data.compressed() if len(season_data) > MIN_DATA_REQUIRED: if 0 < reporting_accuracy <= 0.5: # -1 used as missing value bins, bincenters = utils.create_bins(season_data, 0.5) else: bins, bincenters = utils.create_bins(season_data, 1.0) hist, binEdges = np.histogram(season_data, bins=bins) if plots: plot_hist, bincenters = fvc_plot_setup(season_data, hist, binEdges, st_var.name, title="%s" % (SEASONS[season])) bad_bin = np.zeros(len(hist)) # scan through bin values and identify bad ones for e, element in enumerate(hist): if e > 3 and e <= (len(hist) - 3): # don't bother with first three or last three bins seven_bins = hist[e - 3:e + 3 + 1] if (seven_bins[3] == seven_bins.max()) and (seven_bins[3] != 0): # is local maximum and != zero if (seven_bins[3] / float(seven_bins.sum()) >= 0.5) and (seven_bins[3] >= thresholds[0]): # contains >50% of data and is greater than threshold bad_bin[e] = 1 # for plotting remove good bins else: if plots: plot_hist[e] = 1e-1 else: if plots: plot_hist[e] = 1e-1 else: if plots: plot_hist[e] = 1e-1 if plots: import matplotlib.pyplot as plt plt.step(bincenters, plot_hist, 'r-', where='mid') plt.show() # having identified possible bad bins, check each year in turn, on unfiltered data for y, year in enumerate(month_ranges_years):
if season == 0: # year year_data = np.ma.masked_values( st_var.data[year[0][0]:year[-1][-1]], st_var.fdi) year_flags = station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]] elif season == 1: #mam year_data = np.ma.masked_values( st_var.data[year[2][0]:year[4][-1]], st_var.fdi) year_flags = station.qc_flags[year[2][0]:year[4][-1], flag_col[v]] elif season == 2: #jja year_data = np.ma.masked_values( st_var.data[year[5][0]:year[7][-1]], st_var.fdi) year_flags = station.qc_flags[year[5][0]:year[7][-1], flag_col[v]] elif season == 3: #son year_data = np.ma.masked_values( st_var.data[year[8][0]:year[10][-1]], st_var.fdi) year_flags = station.qc_flags[year[8][0]:year[10][-1], flag_col[v]] elif season == 4: #d+jf year_data = np.ma.concatenate([np.ma.masked_values(st_var.data[year[0][0]:year[1][-1]], st_var.fdi),\ np.ma.masked_values(st_var.data[year[-1][0]:year[-1][-1]], st_var.fdi)]) year_flags = np.append( station.qc_flags[year[0][0]:year[1][-1], flag_col[v]], station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]]) if len(year_data.compressed()) > MIN_DATA_REQUIRED_YEAR: hist, binEdges = np.histogram(year_data.compressed(), bins=bins) if plots: plot_hist, bincenters = fvc_plot_setup( year_data.compressed(), hist, binEdges, st_var.name, title="%s - %s" % (y + start.year, SEASONS[season])) for e, element in enumerate(hist): if bad_bin[e] == 1: # only look at pre-identified bins if e >= 3 and e <= (len(hist) - 3): # don't bother with first three or last three bins seven_bins = hist[e - 3:e + 3 + 1].astype('float') if (seven_bins[3] == seven_bins.max() ) and (seven_bins[3] != 0): # is local maximum and != zero if (seven_bins[3]/seven_bins.sum() >= 0.5 and seven_bins[3] >= thresholds[1]) \ or (seven_bins[3]/seven_bins.sum() >= 0.9 and seven_bins[3] >= thresholds[2]): # contains >50% or >90% of data and is greater than appropriate threshold # Flag these data bad_points = np.where( (year_data >= binEdges[e]) & (year_data < binEdges[e + 1])) year_flags[bad_points] = 1 # for plotting remove good bins else: if plots: plot_hist[e] = 1e-1 else: if plots: plot_hist[e] = 1e-1 else: if plots: plot_hist[e] = 1e-1 else: if plots: plot_hist[e] = 1e-1 if diagnostics or plots: nflags = len(np.where(year_flags != 0)[0]) print "{} {}".format(y + start.year, nflags) if plots: if nflags > 0: plt.step(bincenters, plot_hist, 'r-', where='mid') plt.show() else: plt.clf() # copy flags back if season == 0: station.qc_flags[year[0][0]:year[-1][-1], flag_col[v]] = year_flags elif season == 1: station.qc_flags[year[2][0]:year[4][-1], flag_col[v]] = year_flags elif season == 2: station.qc_flags[year[5][0]:year[7][-1], flag_col[v]] = year_flags elif season == 3: station.qc_flags[year[8][0]:year[10][-1], flag_col[v]] = year_flags elif season == 4: split = len(station.qc_flags[year[0][0]:year[1][-1], flag_col[v]]) station.qc_flags[year[0][0]:year[1][-1], flag_col[v]] = year_flags[:split] station.qc_flags[year[-1][0]:year[-1][-1], flag_col[v]] = year_flags[split:] flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) utils.print_flagged_obs_number(logfile, "Frequent Value", variable, len(flag_locs[0]), noWrite=diagnostics) # copy flags into attribute st_var.flags[flag_locs] = 1 station = utils.append_history(station, "Frequent Values Check") return # fvc
def dgc_all_obs(station, variable, flags, start, end, plots=False, diagnostics=False, idl=False, windspeeds=False, GH=False): '''RJHD addition working on all observations''' if plots: import matplotlib.pyplot as plt st_var = getattr(station, variable) month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1, 12, 2) all_filtered = utils.apply_filter_flags(st_var) for month in range(12): if windspeeds == True: st_var_wind = getattr(station, "windspeeds") # get monthly averages windspeeds_month = np.empty([]) for y, year in enumerate(month_ranges[:, month, :]): if y == 0: windspeeds_month = np.ma.array( st_var_wind.data[year[0]:year[1]]) else: windspeeds_month = np.ma.concatenate( [windspeeds_month, st_var_wind.data[year[0]:year[1]]]) windspeeds_month_average = dgc_get_monthly_averages( windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN) windspeeds_month_mad = utils.mean_absolute_deviation( windspeeds_month, median=True) this_month_data = np.array([]) this_month_filtered = np.array([]) this_month_data, dummy, dummy = utils.concatenate_months( month_ranges[:, month, :], st_var.data, hours=False) this_month_filtered, dummy, dummy = utils.concatenate_months( month_ranges[:, month, :], all_filtered, hours=False) if len(this_month_filtered.compressed()) > OBS_LIMIT: if idl: monthly_median = utils.idl_median( this_month_filtered.compressed().reshape(-1)) else: monthly_median = np.ma.median(this_month_filtered) iqr = utils.IQR(this_month_filtered.compressed()) if iqr == 0.0: # to get some spread if IQR too small iqr = utils.IQR(this_month_filtered.compressed(), percentile=0.05) print "Spurious_stations file not yet sorted" if iqr != 0.0: monthly_values = np.ma.array( (this_month_data.compressed() - monthly_median) / iqr) bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE) dummy, plot_bincenters = utils.create_bins( monthly_values, BIN_SIZE / 10.) hist, binEdges = np.histogram(monthly_values, bins=bins) if GH: # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD initial_values = [ np.max(hist), np.mean(monthly_values), np.std(monthly_values), stats.skew(monthly_values), stats.kurtosis(monthly_values) ] # norm, mean, std, skew, kurtosis fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))]) res = utils.hermite2gauss(fit[0], diagnostics=diagnostics) plot_gaussian = utils.funcGH(fit[0], plot_bincenters) # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting? mid_point = np.argmax(plot_gaussian) bad, = np.where( plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.) if len(bad) > 0: plot_gaussian[mid_point:][ bad[0]:] = FREQUENCY_THRESHOLD / 10. bad, = np.where( plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.) if len(bad) > 0: plot_gaussian[:mid_point][:bad[ -1]] = FREQUENCY_THRESHOLD / 10. 
# extract threshold values good_values = np.argwhere( plot_gaussian > FREQUENCY_THRESHOLD) l_minimum_threshold = round( plot_bincenters[good_values[0]]) - 1 u_minimum_threshold = 1 + round( plot_bincenters[good_values[-1]]) else: gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(monthly_values), sig=np.std(monthly_values)) # assume the same threshold value u_minimum_threshold = 1 + round( utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)) l_minimum_threshold = -u_minimum_threshold plot_gaussian = utils.gaussian(plot_bincenters, gaussian) if diagnostics: if GH: print hist print res print iqr, l_minimum_threshold, u_minimum_threshold else: print hist print gaussian print iqr, u_minimum_threshold, 1. + utils.invert_gaussian( FREQUENCY_THRESHOLD, gaussian) if plots: dgc_set_up_plot(plot_gaussian, monthly_values, variable, threshold=(u_minimum_threshold, l_minimum_threshold), sub_par="observations", GH=GH) if GH: plt.figtext( 0.15, 0.67, 'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' % (res['mean'], res['dispersion'], res['skewness'], res['kurtosis']), color='k', size='small') uppercount = len( np.where(monthly_values > u_minimum_threshold)[0]) lowercount = len( np.where(monthly_values < l_minimum_threshold)[0]) # this needs refactoring - but lots of variables to pass in if plots or diagnostics: gap_plot_values = np.array([]) if uppercount > 0: gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold) if gap_start != 0: for y, year in enumerate(month_ranges[:, month, :]): this_year_data = np.ma.array( all_filtered[year[0]:year[1]]) this_year_flags = np.array(flags[year[0]:year[1]]) gap_cleaned_locations = np.where( ((this_year_data - monthly_median) / iqr) > gap_start) this_year_flags[gap_cleaned_locations] = 1 flags[year[0]:year[1]] = this_year_flags if plots or diagnostics: gap_plot_values = np.append( gap_plot_values, (this_year_data[gap_cleaned_locations]. compressed() - monthly_median) / iqr) if lowercount > 0: gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold) if gap_start != 0: for y, year in enumerate(month_ranges[:, month, :]): this_year_data = np.ma.array( all_filtered[year[0]:year[1]]) this_year_flags = np.array(flags[year[0]:year[1]]) gap_cleaned_locations = np.where( np.logical_and( ((this_year_data - monthly_median) / iqr) < gap_start, this_year_data.mask != True)) this_year_flags[gap_cleaned_locations] = 1 flags[year[0]:year[1]] = this_year_flags if plots or diagnostics: gap_plot_values = np.append( gap_plot_values, (this_year_data[gap_cleaned_locations]. 
compressed() - monthly_median) / iqr) if windspeeds: this_year_flags[ gap_cleaned_locations] = 2 # tentative flags slp_average = dgc_get_monthly_averages( this_month_data, OBS_LIMIT, st_var.mdi, MEAN) slp_mad = utils.mean_absolute_deviation( this_month_data, median=True) storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\ (((this_month_data - slp_average) / slp_mad) > 4.5)) if len(storms[0]) >= 2: storm_1diffs = np.diff(storms) separations = np.where(storm_1diffs != 1) #for sep in separations: if plots: hist, binEdges = np.histogram(gap_plot_values, bins=bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, 'r-', label='flagged', where='mid') import calendar plt.text(0.1, 0.9, calendar.month_name[month + 1], transform=plt.gca().transAxes) plt.legend(loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.2), frameon=False, prop={'size': 13}) plt.show() #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png') if diagnostics: utils.print_flagged_obs_number("", "Distributional Gap", variable, len(gap_plot_values), noWrite=True) return flags # dgc_all_obs
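# Illustrative sketch (not called by the QC suite): deriving the
# distributional-gap thresholds as in dgc_all_obs() above (plain Gaussian
# branch). utils.fit_gaussian/invert_gaussian are defined elsewhere, so
# scipy's curve_fit stands in; FREQUENCY_THRESHOLD's real value is also
# defined elsewhere and 0.1 is only a placeholder. ``values`` are assumed to
# be already IQR-scaled departures from the monthly median.
def _demo_gaussian_tail_threshold(values, frequency_threshold=0.1):
    import numpy as np
    from scipy.optimize import curve_fit

    def gauss(x, norm, mu, sig):
        return norm * np.exp(-0.5 * ((x - mu) / sig) ** 2)

    hist, edges = np.histogram(values, bins=np.arange(-10., 10.5, 0.5))
    centers = 0.5 * (edges[1:] + edges[:-1])
    popt, dummy = curve_fit(gauss, centers, hist,
                            p0=[hist.max(), np.mean(values), np.std(values)])
    norm, mu, sig = popt

    # invert the fitted curve: |x - mu| where it drops to the threshold,
    # then pad by 1 as the "1 + round(...)" above does
    offset = abs(sig) * np.sqrt(2. * np.log(norm / frequency_threshold))
    return mu - offset - 1., mu + offset + 1.   # lower, upper thresholds

# e.g. _demo_gaussian_tail_threshold(np.random.randn(10000)) gives
# thresholds a little beyond +/-5 for standard normal values.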
def dcc(station, variable_list, full_variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, doMonth=False):
    '''
    The diurnal cycle check.

    :param object station: the station object to be processed
    :param list variable_list: the variables to be processed
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column in the qc_flags array to work on
    :param datetime start: start of dataset
    :param datetime end: end of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output
    :param bool doMonth: if True, discount the final incomplete year when deriving the best-fit values
    :returns:
    '''

    # list of flags for each variable
    diurnal_flags = []

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # is this needed 21/08/2014
        # reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags, but retain the incomplete year - so that the test can be run against these values later
        all_data = utils.apply_filter_flags(st_var)
        all_data = all_data.reshape(-1, 24)  # working in fulltimes.

        # apply flags - also filter the final incomplete year so that the best-fit values use complete years only
        filtered_data = utils.apply_filter_flags(st_var, doMonth=doMonth, start=start, end=end)
        filtered_data = filtered_data.reshape(-1, 24)  # working in fulltimes.
        number_of_days = filtered_data.shape[0]

        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plot_data = np.ma.zeros(filtered_data.shape)
            plot_data.mask = True
            # best_estimate_counter = np.zeros(HOURS)

        diurnal_filtered_fits = np.zeros(filtered_data.shape[0], dtype=int)
        diurnal_filtered_fits.fill(INTMDI)
        diurnal_best_fits = np.zeros(st_var.data.shape[0], dtype=int)
        diurnal_best_fits.fill(INTMDI)
        diurnal_uncertainties = np.zeros(filtered_data.shape[0])
        diurnal_uncertainties.fill(INTMDI)

        for d, day in enumerate(all_data):

            '''enough observations and a large enough diurnal range'''
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)

                        '''Find differences for each shifted sine --> cost function'''
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

                        # and keep this for testing against the average value later
                        diurnal_best_fits[d] = np.argmin(diffs)

        for d, day in enumerate(filtered_data):

            '''enough observations and a large enough diurnal range'''
            if len(day.compressed()) >= OBS_PER_DAY:

                obs_daily_range = max(day.compressed()) - min(day.compressed())
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + min(day.compressed())
                        diffs = np.zeros(HOURS)

                        '''Find differences for each shifted sine --> cost function'''
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

                        diurnal_filtered_fits[d] = np.argmin(diffs)

                        # default uncertainty is the average time resolution of the data
                        diurnal_uncertainties[d] = round(float(HOURS) / len(day.compressed()))

                        if DYNAMIC_DIURNAL:
                            critical_value = min(diffs) + ((max(diffs) - min(diffs)) * 0.33)

                            # centre so minimum in middle
                            diffs = np.roll(diffs, 11 - diurnal_filtered_fits[d])

                            uncertainty = 1
                            while uncertainty < 11:
                                if (diffs[11 - uncertainty] > critical_value) and \
                                   (diffs[11 + uncertainty] > critical_value):
                                    # break if both sides greater than critical difference
                                    # when counting outwards
                                    # see diurnal_example.py
                                    break
                                uncertainty += 1

                            # check if uncertainty greater than time resolution for day
                            if uncertainty > diurnal_uncertainties[d]:
                                diurnal_uncertainties[d] = uncertainty

                        if plots:
                            # best_estimate_counter[np.argmin(diffs)] += 1
                            # scale daily data to range -1 -> 1, plot with random scatter for clarity
                            plot_data[d] = ((2 * (day - min(day.compressed())) / obs_daily_range) - 1.)
                            plt.plot(np.arange(24) + np.random.randn(24) * 0.25, plot_data[d] + np.random.randn(24) * 0.05, 'k,')

        if plots:
            plt.plot(np.arange(24), np.roll(dcc_make_sine(), np.argmax(np.bincount(diurnal_filtered_fits[np.where(diurnal_filtered_fits != INTMDI)]))), 'r-')
            plt.xlim([-1, 25])
            plt.ylim([-1.2, 1.2])
            plt.show()

        # dumb copy of IDL
        '''For each uncertainty range (1-6h) find median of cycle offset'''
        filtered_fits = np.zeros(6)
        filtered_fits.fill(-9)
        for h in range(6):
            locs = np.where(diurnal_uncertainties == h + 1)

            if len(locs[0]) > 300:
                # filtered_fits[h] = int(np.median(diurnal_filtered_fits[locs]))
                # Numpy median gives average of central two values which may not be integer
                # 25/11/2014 use IDL style which gives lower value
                filtered_fits[h] = utils.idl_median(diurnal_filtered_fits[locs])

        '''Build up range of cycles incl. uncertainty to find where best of best is located'''
        hours = np.arange(24)
        hour_matches = np.zeros(24)
        diurnal_peak = -9
        number_estimates = 0
        for h in range(6):
            if filtered_fits[h] != -9:

                '''Store lowest uncertainty best fit as first guess'''
                if diurnal_peak == -9:
                    diurnal_peak = filtered_fits[h]
                    hours = np.roll(hours, 11 - int(diurnal_peak))
                    hour_matches[11 - (h + 1):11 + (h + 2)] = 1
                    number_estimates += 1

                centre, = np.where(hours == filtered_fits[h])

                if (centre[0] - h + 1) >= 0:
                    if (centre[0] + h + 1) <= 23:
                        hour_matches[centre[0] - (h + 1):centre[0] + h + 2] += 1
                    else:
                        hour_matches[centre[0] - (h + 1):] += 1
                        hour_matches[:centre[0] + h + 2 - 24] += 1
                else:
                    hour_matches[:centre[0] + h + 2] += 1
                    hour_matches[centre[0] - (h + 1):] += 1

                number_estimates += 1

        '''If value at lowest uncertainty not found in all others, then see what value is found by all others'''
        if hour_matches[11] != number_estimates:  # central estimate at 12 o'clock
            all_match = np.where(hour_matches == number_estimates)

            # if one is, then use it
            if len(all_match[0]) > 0:
                diurnal_peak = all_match[0][0]
            else:
                diurnal_peak = -9

        '''Now have value for best fit diurnal offset'''
        potentially_spurious = np.zeros(number_of_days)
        potentially_spurious.fill(INTMDI)

        if diurnal_peak != -9:
            hours = np.arange(24)
            hours = np.roll(hours, 11 - int(diurnal_peak))
            for d in range(number_of_days):
                # and now going back to the unfiltered data
                if diurnal_best_fits[d] != INTMDI:

                    '''Checks if global falls inside daily value+/-range
                    rather than seeing if each day falls in global value+/-range'''
                    min_range = 11 - diurnal_uncertainties[d]
                    max_range = 11 + diurnal_uncertainties[d]
                    maxloc = np.where(hours == diurnal_best_fits[d])[0][0]

                    if maxloc < min_range or maxloc > max_range:
                        potentially_spurious[d] = 1
                    else:
                        potentially_spurious[d] = 0

            # count number of good, missing and not-bad days
            # days left at the INTMDI fill value (-999) had no best fit
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points = 0
            total_not_miss = 0
            to_flag = np.zeros(number_of_days)

            for d in range(number_of_days):

                if potentially_spurious[d] == 1:
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points += 1
                    total_not_miss += 1

                else:
                    if potentially_spurious[d] == 0:
                        n_good += 1
                        n_not_bad += 1
                        if n_miss != 0:
                            n_miss = 0
                        total_not_miss += 1

                    if potentially_spurious[d] == -999:
                        n_miss += 1
                        n_not_bad += 1
                        if n_good != 0:
                            n_good = 0

                    total_points += 1

                    if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):

                        if total_points >= 30:
                            if float(total_not_miss) / total_points >= 0.5:
                                to_flag[d - total_points:d] = 1

                        n_good = 0
                        n_miss = 0
                        n_not_bad = 0
                        total_points = 0
                        total_not_miss = 0

            dcc_flags = np.zeros(filtered_data.shape)

            for d in range(number_of_days):
                if to_flag[d] == 1:
                    good = np.where(filtered_data.mask[d, :] == False)
                    if len(good[0]) >= 1:
                        dcc_flags[d, good] = 1

            if diagnostics:
                print(len(np.where(dcc_flags == 1)[0]))
                print("currently matches IDL, but should all hours in days have flags set, not just the missing/flagged ones?")

            diurnal_flags += [dcc_flags]
        else:
            diurnal_flags += [np.zeros(filtered_data.shape)]

        station.qc_flags[:, flag_col[v]] = np.array(diurnal_flags).reshape(-1)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]), noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

    # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD

    utils.apply_flags_all_variables(station, full_variable_list, flag_col[variable_list == "temperatures"], logfile, "Diurnal Cycle", plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Diurnal Cycle Check")

    return  # dcc
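
# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the operational check: a minimal,
# self-contained demonstration of the shifted-sine cost function that dcc()
# uses to locate the diurnal phase and its uncertainty. It assumes a plain
# 24-element numpy array rather than the station's masked arrays, and the
# unit sine below is a stand-in for dcc_make_sine(), whose exact phase may
# differ in the real helper.
# ---------------------------------------------------------------------------
def _demo_diurnal_phase():
    import numpy as np

    hours = 24
    # synthetic "day" of temperatures peaking at 15:00 local time
    day = 10. + 5. * np.sin((np.arange(hours) - 9.) * 2. * np.pi / hours)

    # unit sine scaled to the observed daily range, as in dcc()
    base_sine = np.sin((np.arange(hours) - 6.) * 2. * np.pi / hours)
    obs_daily_range = day.max() - day.min()
    scaled_sine = ((base_sine + 1.) / 2. * obs_daily_range) + day.min()

    # cost of each of the 24 cyclic shifts: sum of absolute differences
    diffs = np.zeros(hours)
    for h in range(hours):
        diffs[h] = np.abs(day - scaled_sine).sum()
        scaled_sine = np.roll(scaled_sine, 1)

    # dynamic uncertainty, mirroring dcc(): centre the cost minimum at
    # index 11 and walk outwards until both sides exceed a critical value
    critical_value = diffs.min() + (diffs.max() - diffs.min()) * 0.33
    centred = np.roll(diffs, 11 - np.argmin(diffs))
    uncertainty = 1
    while uncertainty < 11:
        if (centred[11 - uncertainty] > critical_value) and \
           (centred[11 + uncertainty] > critical_value):
            break
        uncertainty += 1

    # for this synthetic day the best-fitting shift is 3 hours
    return np.argmin(diffs), uncertainty
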
def do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, start, logfile, plots=False, diagnostics=False):
    '''
    Set up and run the unflagging process for the specified tests

    :param MetVar station: station object
    :param string variable: variable to process
    :param array all_data: array containing all neighbour obs for full time period
    :param array reporting_accuracies: reporting accuracy for each neighbour
    :param array neigh_count: number of neighbours with data at each time stamp
    :param array dpd_flags: number of neighbours that have DPD set at each time stamp
    :param dict FLAG_COL_DICT: look-up dictionary from variable to its flag columns
    :param datetime start: start of dataset
    :param file logfile: logfile to store outputs
    :param bool plots: do plots
    :param bool diagnostics: do any extra diagnostic output
    '''

    # unflagging using neighbours

    '''This is slow - np.ma.median is known to be slow
    https://github.com/astropy/ccdproc/issues/74
    https://github.com/astropy/ccdproc/blob/122cdbd5713140174f057eaa8fdb6f9ce03312df/docs/ccdproc/bottleneck_example.rst'''
    mean_of_neighbours = bn_median(all_data, axis=0)
    std_of_neighbours = median_absolute_deviation(all_data, axis=0)

    # find where the spread of neighbour observations is less than 1/2
    # of maximum reporting accuracy
    std_of_neighbours[std_of_neighbours < 0.5 * max(reporting_accuracies)] = 0.5 * max(reporting_accuracies)

    # create series of normalised differences of obs from neighbour mean
    st_var = getattr(station, variable)
    normalised_differences = np.ma.abs(st_var.data - mean_of_neighbours) / std_of_neighbours

    for qc_test in ["climatological", "gap", "odd", "dpd"]:

        if qc_test == "dpd" and variable == "dewpoints":
            flags = station.qc_flags[:, UNFLAG_COL_DICT[qc_test][variable]]
            unset_locs = unflagging_locs(normalised_differences, flags, neigh_count, dpd_count=dpd_flags)
        elif qc_test == "dpd":
            # only unflag DPD on dewpoints
            continue
        elif qc_test == "gap" and variable != "slp":
            # only unflag gap check on slp observations
            continue
        else:
            flags = station.qc_flags[:, UNFLAG_COL_DICT[qc_test][variable]]
            if qc_test == "gap" or qc_test == "climatological":
                # only tentative flags
                unset_locs = unflagging_locs(normalised_differences, flags, neigh_count, flag_value=2)
            else:
                unset_locs = unflagging_locs(normalised_differences, flags, neigh_count)

        if len(unset_locs) > 0:
            station.qc_flags[unset_locs, UNFLAG_COL_DICT[qc_test][variable]] = 0

            # need to unflag attribute if and only if no other flags are set
            subset_flags = station.qc_flags[:, FLAG_COL_DICT[variable]]
            total_flags = np.sum(subset_flags[unset_locs, :], axis=1)
            clean_locs = np.where(total_flags == 0)
            st_var.flags[unset_locs[clean_locs]] = 0

        # and print result
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Unflagging " + qc_test, variable, len(unset_locs), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Unflagging " + qc_test, variable, len(unset_locs))

        if plots:
            if len(unset_locs) > 0:
                plot_outlier(station, variable, unset_locs, all_data, start)

    station = utils.append_history(station, "Unflagging - " + variable)

    return  # do_unflagging
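
# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the operational code: how the
# neighbour statistics behind do_unflagging() are built. Plain numpy
# stand-ins are used for bn_median and median_absolute_deviation (the real
# helpers are defined elsewhere in this module), and the input arrays are
# small hand-made examples rather than real station series.
# ---------------------------------------------------------------------------
def _demo_normalised_differences():
    import numpy as np

    # 3 neighbours x 5 time stamps of already-aligned observations
    neighbours = np.array([[10.0, 11.0, 12.0, 13.0, 14.0],
                           [10.5, 11.2, 11.8, 13.1, 13.9],
                           [ 9.8, 10.9, 12.1, 12.8, 14.2]])
    station_obs = np.array([10.1, 11.0, 15.0, 13.0, 14.0])  # spike at t=2
    reporting_accuracies = np.array([0.1, 0.5, 1.0])

    # median and median-absolute-deviation across the neighbours
    mean_of_neighbours = np.median(neighbours, axis=0)
    std_of_neighbours = np.median(np.abs(neighbours - mean_of_neighbours), axis=0)

    # floor the spread at half the coarsest reporting accuracy so that
    # quantised neighbours cannot make the spread collapse towards zero
    floor = 0.5 * reporting_accuracies.max()
    std_of_neighbours[std_of_neighbours < floor] = floor

    # large values mark observations far from the neighbour consensus;
    # small values are candidates for unflagging in do_unflagging()
    return np.abs(station_obs - mean_of_neighbours) / std_of_neighbours
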