def coc(station, variable_list, flag_col, start, end, logfile, diagnostics=False, plots=False, idl=False): for v, variable in enumerate(variable_list): st_var = getattr(station, variable) all_filtered = utils.apply_filter_flags(st_var) # is this needed 13th Nov 2014 RJHD #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var)) month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1, 12, 2) for month in range(12): hourly_climatologies = np.zeros(24) hourly_climatologies.fill(st_var.mdi) # append all e.g. Januaries together this_month, year_ids, dummy = utils.concatenate_months( month_ranges[:, month, :], st_var.data, hours=True) this_month_filtered, dummy, dummy = utils.concatenate_months( month_ranges[:, month, :], all_filtered, hours=True) # if fixed climatology period, sort this here # get as array of 24 hrs. this_month = np.ma.array(this_month) this_month = this_month.reshape(-1, 24) this_month_filtered = np.ma.array(this_month_filtered) this_month_filtered = this_month_filtered.reshape(-1, 24) # get hourly climatology for each month for hour in range(24): this_hour = this_month[:, hour] # need to have data if this is going to work! if len(this_hour.compressed()) > 0: # winsorize & climatologies - done to match IDL if idl: this_hour = utils.winsorize(np.append( this_hour.compressed(), -999999), 0.05, idl=idl) hourly_climatologies[hour] = np.ma.sum(this_hour) / ( len(this_hour) - 1) else: this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl=idl) hourly_climatologies[hour] = np.ma.mean(this_hour) if len(this_month.compressed()) > 0: # can get stations with few obs in a particular variable. 
# anomalise each hour over month appropriately anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0], 1)) anomalies_filtered = this_month_filtered - np.tile( hourly_climatologies, (this_month_filtered.shape[0], 1)) if len(anomalies.compressed()) >= 10: iqr = utils.IQR(anomalies.compressed().reshape( -1)) / 2. # to match IDL if iqr < 1.5: iqr = 1.5 else: iqr = st_var.mdi normed_anomalies = anomalies / iqr normed_anomalies_filtered = anomalies_filtered / iqr # get average anomaly for year year_ids = np.array(year_ids) monthly_vqvs = np.ma.zeros(month_ranges.shape[0]) monthly_vqvs.mask = [ False for x in range(month_ranges.shape[0]) ] for year in range(month_ranges.shape[0]): year_locs = np.where(year_ids == year) this_year = normed_anomalies_filtered[year_locs, :] if len(this_year.compressed()) > 0: # need to have data for this to work! if idl: monthly_vqvs[year] = utils.idl_median( this_year.compressed().reshape(-1)) else: monthly_vqvs[year] = np.ma.median(this_year) else: monthly_vqvs.mask[year] = True # low pass filter normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0]) # copy from distributional_gap.py - refactor! # get the threshold value bins, bincenters = utils.create_bins(normed_anomalies, 1.) hist, binEdges = np.histogram(normed_anomalies, bins=bins) gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(normed_anomalies), sig=np.std(normed_anomalies)) minimum_threshold = round( 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)) if diagnostics: print iqr, minimum_threshold, 1. 
+ utils.invert_gaussian( FREQUENCY_THRESHOLD, gaussian) print gaussian print hist if plots: coc_set_up_plot(bincenters, hist, gaussian, variable, threshold=minimum_threshold, sub_par="observations") uppercount = len( np.where(normed_anomalies > minimum_threshold)[0]) lowercount = len( np.where(normed_anomalies < -minimum_threshold)[0]) these_flags = station.qc_flags[:, flag_col[v]] gap_plot_values, tentative_plot_values = [], [] # find the gaps and apply the flags gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size=1) # in DGC it is 2. these_flags, gap_plot_values, tentative_plot_values =\ coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \ upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values) gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size=1) # in DGC it is 2. these_flags, gap_plot_values, tentative_plot_values =\ coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \ upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values) station.qc_flags[:, flag_col[v]] = these_flags if uppercount + lowercount > 1000: #print "not sorted spurious stations yet" pass if plots: import matplotlib.pyplot as plt hist, binEdges = np.histogram(tentative_plot_values, bins=bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, c='orange', ls='-', label='tentative', where='mid') hist, binEdges = np.histogram(gap_plot_values, bins=bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, 'r-', label='flagged', where='mid') import calendar plt.text(0.1, 0.9, calendar.month_name[month + 1], transform=plt.gca().transAxes) leg = plt.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.2), frameon=False, prop={'size': 13}, labelspacing=0.15, columnspacing=0.5) 
plt.setp(leg.get_title(), fontsize=14) plt.show() #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png') flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) # copy flags into attribute st_var.flags[flag_locs] = 1 if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite=True) print "where\n" nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0]) utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags, noWrite=True) nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0]) utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags, noWrite=True) else: utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0])) logfile.write("where\n") nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0]) utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags) nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0]) utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags) # firm flags match 030220 station = utils.append_history(station, "Climatological Check") return
def evc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    """
    Excess variance check.

    For every variable and calendar month, an hourly climatology is built
    from winsorized values, anomalies are normalised by half their IQR
    (floored at 1.5) and a per-year variance is obtained with
    utils.mean_absolute_deviation.  A year/month whose variance lies more than
    a threshold number of half-IQRs from the median over all years has its
    whole month set to 1 in the variable's column of station.qc_flags.  For
    "slp" and "windspeeds" a month is not flagged if a storm signature is found.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; the two places where the original nesting
    was ambiguous are marked with NOTE(review) comments.

    :param station: station object - variables are read with getattr and
        station.qc_flags is updated in place
    :param list variable_list: names of the variable attributes to process
    :param list flag_col: column of station.qc_flags used for each variable
    :param start: start of the data period (start.year is used for messages)
    :param end: end of the data period (passed to utils.month_starts_in_pairs)
    :param logfile: log file object, written to when not printing to screen
    :param bool diagnostics: print diagnostic output to screen
    :param bool plots: show plots
    :param bool idl: follow the IDL-matching calculation path
    """

    if plots or diagnostics:
        import matplotlib.pyplot as plt
        import calendar

    # very similar to climatological check - ensure that not duplicating
    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        reporting_freq = utils.reporting_frequency(utils.apply_filter_flags(st_var))

        # index pairs arranged as (year, month, 2)
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1,12,2)

        # one count per (year, month), filled in by utils.concatenate_months below
        month_data_count = np.zeros(month_ranges.shape[0:2])

        # for each month
        for month in range(12):

            # set up hourly climatologies - the fill value marks hours with no climatology
            hourly_clims = np.zeros(24)
            hourly_clims.fill(st_var.data.fill_value)

            this_month, year_ids, month_data_count[:,month] = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True)

            # (an earlier inline year-by-year concatenation that stood here as
            #  commented-out code has been replaced by utils.concatenate_months)

            # winsorize and get hourly climatology
            for h in range(24):

                this_hour = this_month[:,h]

                # only hours with more than 100 unmasked values get a climatology
                if len(this_hour.compressed()) > 100:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        # the IDL path appends one dummy value and divides the sum by (N - 1)
                        this_hour_winsorized = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl)
                        hourly_clims[h] = np.ma.sum(this_hour_winsorized)/(len(this_hour_winsorized) - 1)

                    else:
                        this_hour_winsorized = utils.winsorize(this_hour.compressed(), 0.05, idl = idl)
                        hourly_clims[h] = np.ma.mean(this_hour_winsorized)

            # hours still holding the fill value are masked, so their anomalies are masked too
            hourly_clims = np.ma.masked_where(hourly_clims == st_var.data.fill_value, hourly_clims)
            anomalies = this_month - np.tile(hourly_clims, (this_month.shape[0], 1))

            # extract IQR of anomalies (using 1/2 value to match IDL)
            if len(anomalies.compressed()) >= 10:

                iqr = utils.IQR(anomalies.compressed().reshape(-1)) / 2. # to match IDL

                if iqr < 1.5:
                    iqr = 1.5

            else:
                # too few anomalies - scale by the missing data indicator instead
                iqr = st_var.mdi

            normed_anomalies = anomalies / iqr

            # per-year results, initialised to the missing data indicator
            variances = np.ma.zeros(month_ranges.shape[0])
            variances.mask = [False for i in range(month_ranges.shape[0])]
            rep_accuracies = np.zeros(month_ranges.shape[0])
            rep_freqs = np.zeros(month_ranges.shape[0])

            variances.fill(st_var.mdi)
            rep_accuracies.fill(st_var.mdi)
            rep_freqs.fill(st_var.mdi)

            year_ids = np.array(year_ids)

            # extract variance of normalised anomalies for each year
            # (y and year take the same value in this loop)
            for y, year in enumerate(range(month_ranges.shape[0])):

                year_locs = np.where(year_ids == y)

                this_year = normed_anomalies[year_locs,:]
                this_year = this_year.reshape(-1)

                # end of similarity with Climatological check

                if len(this_year.compressed()) >= 30:

                    variances[y] = utils.mean_absolute_deviation(this_year, median = True)

                    rep_accuracies[y] = utils.reporting_accuracy(this_year)
                    rep_freqs[y] = utils.reporting_frequency(this_year)

                else:
                    variances.mask[y] = True

            # years with at least 100 counted values in this month
            good = np.where(month_data_count[:,month] >= 100)

            # get median and IQR of variance for all years for this month
            if len(good[0]) >= 10:
                # NOTE(review): np.median (not np.ma.median) is applied to a masked
                #   array here - confirm masked years are handled as intended
                median_variance = np.median(variances[good])

                iqr_variance = utils.IQR(variances[good]) / 2. # to match IDL

                if iqr_variance < 0.01:
                    iqr_variance = 0.01

            else:
                median_variance = st_var.mdi
                iqr_variance = st_var.mdi

            # if SLP, then get median and MAD of SLP and windspeed for month
            if variable in ["slp", "windspeeds"]:

                winds = getattr(station, "windspeeds")
                slp = getattr(station, "slp")

                # refactor this as similar in style to how target data extracted
                for y, year in enumerate(range(month_ranges.shape[0])):

                    if y == 0:

                        winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        winds_month = winds_year.reshape(-1,24)

                        slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        slp_month = slp_year.reshape(-1,24)

                    else:

                        winds_year = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        winds_year = winds_year.reshape(-1,24)
                        winds_month = np.ma.concatenate((winds_month, winds_year), axis = 0)

                        slp_year = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                        slp_year = slp_year.reshape(-1,24)
                        slp_month = np.ma.concatenate((slp_month, slp_year), axis = 0)

                median_wind = np.ma.median(winds_month)
                median_slp = np.ma.median(slp_month)

                wind_MAD = utils.mean_absolute_deviation(winds_month.compressed())
                slp_MAD = utils.mean_absolute_deviation(slp_month.compressed())

                if diagnostics:
                    print "median windspeed {} m/s, MAD = {}".format(median_wind, wind_MAD)
                    print "median slp {} hPa, MAD = {}".format(median_slp, slp_MAD)

            # now test to see if variance exceeds expected range
            for y, year in enumerate(range(month_ranges.shape[0])):

                if (variances[y] != st_var.mdi) and (iqr_variance != st_var.mdi) and \
                   (median_variance != st_var.mdi) and (month_data_count[y,month] >= DATA_COUNT_THRESHOLD):

                    # if SLP, then need to test if deep low pressure ("hurricane/storm") present
                    #   as this will increase the variance for this month + year
                    if variable in ["slp", "windspeeds"]:

                        iqr_threshold = 6.

                        # increase threshold if reporting frequency and resolution of this
                        #   year doesn't match average
                        if (rep_accuracies[y] != reporting_resolution) and \
                           (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 8.

                        if diagnostics:
                            print np.abs(variances[y] - median_variance) / iqr_variance, variances[y] , median_variance , iqr_variance , iqr_threshold, month+1, year+start.year

                        if np.abs((variances[y] - median_variance) / iqr_variance) > iqr_threshold:

                            # check for storms
                            # (winds_month/slp_month are rebound to this single year/month;
                            #  the medians and MADs above were taken from the all-year data)
                            winds_month = winds.data[month_ranges[year,month][0]:month_ranges[year,month][1]]
                            slp_month = slp.data[month_ranges[year,month][0]:month_ranges[year,month][1]]

                            storm = False
                            if (len(winds_month.compressed()) >= 1) and (len(slp_month.compressed()) >= 1):

                                # (an earlier test based on the single max-wind/min-SLP
                                #  locations stood here as commented-out code)

                                # locations where winds greater than threshold
                                high_winds, = np.where((winds_month - median_wind)/wind_MAD > MAD_THRESHOLD)
                                # and where SLP less than threshold
                                low_slps, = np.where((median_slp - slp_month)/slp_MAD > MAD_THRESHOLD)

                                # if any locations match, then it's a storm
                                match_loc = high_winds[np.in1d(high_winds, low_slps)]

                                if len(match_loc) > 0:
                                    storm = True
                            else:
                                # NOTE(review): this else is paired with the "has wind and SLP
                                #   data" test above; the collapsed source would equally allow
                                #   pairing it with the match_loc test - confirm
                                print "write spurious"

                            # check the SLP first difference series
                            #   to ensure a drop down and climb out of minimum SLP/or climb up and down from maximum wind speed
                            if variable == "slp":
                                diffs = np.diff(slp_month.compressed())
                            elif variable == "windspeeds":
                                diffs = np.diff(winds_month.compressed())

                            # lengths of the current and longest runs of falling (negs)
                            #   and rising (poss) first differences
                            negs, poss = 0,0
                            biggest_neg, biggest_pos = 0,0

                            # NOTE(review): a run is only compared against the longest
                            #   when the direction changes, so the final run is never
                            #   counted - confirm whether that is intended
                            for diff in diffs:

                                if diff > 0:
                                    if negs > biggest_neg:
                                        biggest_neg = negs
                                    negs = 0
                                    poss += 1
                                else:
                                    if poss > biggest_pos:
                                        biggest_pos = poss
                                    poss = 0
                                    negs += 1

                            if (biggest_neg < 10) and (biggest_pos < 10) and not storm:

                                # not a hurricane, so mask
                                station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1
                                if plots or diagnostics:
                                    print "No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year)
                                else:
                                    logfile.write("No Storm or Hurricane in %i %i - flagging\n" % (month+1, y+start.year))

                            else:
                                # hurricane
                                if plots or diagnostics:
                                    print "Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year)
                                else:
                                    logfile.write("Storm or Hurricane in %i %i - not flagging\n" % (month+1, y+start.year))

                            # NOTE(review): placed at this level because diffs is only
                            #   defined inside this branch - confirm against the original layout
                            if plots:
                                # plot showing the pressure, pressure first differences and the wind speeds
                                plot_times = utils.times_hours_to_datetime(station.time.data[month_ranges[year,month][0]:month_ranges[year,month][1]], start)

                                evc_plot_slp_wind(plot_times, slp_month, diffs, median_slp, slp_MAD, winds_month, median_wind, wind_MAD)

                    else:

                        iqr_threshold = 8.

                        # increase threshold if reporting frequency and resolution of this
                        #   year doesn't match average
                        if (rep_accuracies[y] != reporting_resolution) and \
                           (rep_freqs[y] != reporting_freq):
                            iqr_threshold = 10.

                        if np.abs(variances[y] - median_variance) / iqr_variance > iqr_threshold:

                            if diagnostics:
                                print "flagging {} {}".format(year+start.year,calendar.month_name[month+1])

                            # remove the data
                            station.qc_flags[month_ranges[year,month,0]:month_ranges[year,month,1], flag_col[v]] = 1

            if plots:
                # NOTE(review): iqr_threshold is only assigned inside the year loop
                #   above, so it holds the value from the last year that passed the
                #   test (and is unset if none did) - confirm
                plot_variances = (variances - median_variance) / iqr_variance

                plot_variances = np.ma.masked_where(month_data_count[:,month] < DATA_COUNT_THRESHOLD,plot_variances)

                evc_plot_hist(plot_variances, iqr_threshold, "Variance Check - %s - %s" % (variable, calendar.month_name[month+1]))

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]), noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Variance", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # matches 030660 for T, D and SLP 21/8/2014

    station = utils.append_history(station, "Excess Variance Check")

    return # evc
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False): for v, variable in enumerate(variable_list): st_var = getattr(station, variable) all_filtered = utils.apply_filter_flags(st_var) # is this needed 13th Nov 2014 RJHD #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var)) month_ranges = utils.month_starts_in_pairs(start, end) month_ranges = month_ranges.reshape(-1,12,2) for month in range(12): hourly_climatologies = np.zeros(24) hourly_climatologies.fill(st_var.mdi) # append all e.g. Januaries together this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True) this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], all_filtered, hours = True) # if fixed climatology period, sort this here # get as array of 24 hrs. this_month = np.ma.array(this_month) this_month = this_month.reshape(-1,24) this_month_filtered = np.ma.array(this_month_filtered) this_month_filtered = this_month_filtered.reshape(-1,24) # get hourly climatology for each month for hour in range(24): this_hour = this_month[:,hour] # need to have data if this is going to work! if len(this_hour.compressed()) > 0: # winsorize & climatologies - done to match IDL if idl: this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl) hourly_climatologies[hour] = np.ma.sum(this_hour)/(len(this_hour) - 1) else: this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl = idl) hourly_climatologies[hour] = np.ma.mean(this_hour) if len(this_month.compressed()) > 0: # can get stations with few obs in a particular variable. 
# anomalise each hour over month appropriately anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0],1)) anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0],1)) if len(anomalies.compressed()) >= 10: iqr = utils.IQR(anomalies.compressed().reshape(-1))/2. # to match IDL if iqr < 1.5: iqr = 1.5 else: iqr = st_var.mdi normed_anomalies = anomalies / iqr normed_anomalies_filtered = anomalies_filtered / iqr # get average anomaly for year year_ids = np.array(year_ids) monthly_vqvs = np.ma.zeros(month_ranges.shape[0]) monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])] for year in range(month_ranges.shape[0]): year_locs = np.where(year_ids == year) this_year = normed_anomalies_filtered[year_locs,:] if len(this_year.compressed()) > 0: # need to have data for this to work! if idl: monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1)) else: monthly_vqvs[year] = np.ma.median(this_year) else: monthly_vqvs.mask[year] = True # low pass filter normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0]) # copy from distributional_gap.py - refactor! # get the threshold value bins, bincenters = utils.create_bins(normed_anomalies, 1.) hist, binEdges = np.histogram(normed_anomalies, bins = bins) gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(normed_anomalies), sig = np.std(normed_anomalies)) minimum_threshold = round(1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)) if diagnostics: print iqr, minimum_threshold, 1. 
+ utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian) print gaussian print hist if plots: coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = minimum_threshold, sub_par = "observations") uppercount = len(np.where(normed_anomalies > minimum_threshold)[0]) lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0]) these_flags = station.qc_flags[:, flag_col[v]] gap_plot_values, tentative_plot_values = [], [] # find the gaps and apply the flags gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size = 1) # in DGC it is 2. these_flags, gap_plot_values, tentative_plot_values =\ coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \ upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values) gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size = 1) # in DGC it is 2. these_flags, gap_plot_values, tentative_plot_values =\ coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \ upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values) station.qc_flags[:, flag_col[v]] = these_flags if uppercount + lowercount > 1000: #print "not sorted spurious stations yet" pass if plots: import matplotlib.pyplot as plt hist, binEdges = np.histogram(tentative_plot_values, bins = bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, c='orange', ls='-', label = 'tentative', where='mid') hist, binEdges = np.histogram(gap_plot_values, bins = bins) plot_hist = np.array([0.01 if h == 0 else h for h in hist]) plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid') import calendar plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes) leg=plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13},labelspacing=0.15,columnspacing=0.5) 
plt.setp(leg.get_title(), fontsize=14) plt.show() #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png') flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0) # copy flags into attribute st_var.flags[flag_locs] = 1 if plots or diagnostics: utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite = True) print "where\n" nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0]) utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags, noWrite = True) nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0]) utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags, noWrite = True) else: utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0])) logfile.write("where\n") nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0]) utils.print_flagged_obs_number(logfile, " Firm Clim", variable, nflags) nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0]) utils.print_flagged_obs_number(logfile, " Tentative Clim", variable, nflags) # firm flags match 030220 station = utils.append_history(station, "Climatological Check") return
def prepare_data(obs_var, station, month, diagnostics=False, winsorize=True):
    """
    Calculate the monthly variances

    Builds an hourly climatology for the requested calendar month, turns the
    observations into anomalies, scales them by their spread (floored at 1.5)
    and returns the spread of the scaled anomalies for each year.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: which month to run on
    :param bool diagnostics: turn on diagnostic output (not used in this routine)
    :param bool winsorize: apply winsorization at 5%/95%

    :returns: masked array with one entry per unique value of station.years;
        entries are only assigned for years with more than MIN_VALUES
        unmasked normalised anomalies in this month
    """
    n_obs = obs_var.data.shape[0]

    # start fully masked, then expose only the requested month
    anomalies = np.ma.zeros(n_obs)
    anomalies.mask = np.ones(n_obs)
    normed_anomalies = np.ma.copy(anomalies)

    in_month = station.months == month
    mlocs, = np.where(in_month)
    anomalies.mask[mlocs] = False
    normed_anomalies.mask[mlocs] = False

    hourly_clims = np.ma.zeros(24)
    hourly_clims.mask = np.ones(24)
    for hour in range(24):

        # calculate climatology
        hlocs, = np.where(np.logical_and(in_month, station.hours == hour))
        hour_data = obs_var.data[hlocs]

        if winsorize and len(hour_data.compressed()) > 10:
            hour_data = utils.winsorize(hour_data, 5)

        # a climatology is only set for hours with enough unmasked values
        if len(hour_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
            hourly_clims[hour] = np.ma.mean(hour_data)
            hourly_clims.mask[hour] = False

        # make anomalies - keeping the order
        anomalies[hlocs] = obs_var.data[hlocs] - hourly_clims[hour]

    # for the month, normalise anomalies by spread; 1.5 is both the floor
    #   and the value used when there are too few anomalies
    spread = 1.5
    if len(anomalies[mlocs].compressed()) >= MIN_VARIANCES:
        spread = utils.spread(anomalies[mlocs])
        if spread < 1.5:
            spread = 1.5

    normed_anomalies[mlocs] = anomalies[mlocs] / spread

    # calculate the variance for each year in this single month.
    all_years = np.unique(station.years)
    n_years = all_years.shape[0]

    variances = np.ma.zeros(n_years)
    variances.mask = np.ones(n_years)
    for y, year in enumerate(all_years):

        ymlocs, = np.where(np.logical_and(in_month, station.years == year))
        year_anomalies = normed_anomalies[ymlocs]

        # HadISD used M.A.D.
        if year_anomalies.compressed().shape[0] > MIN_VALUES:
            variances[y] = utils.spread(year_anomalies)

    return variances # prepare_data
def prepare_data(obs_var, station, month, diagnostics=False, winsorize=True):
    """
    Prepare the data for the climatological check.  Makes anomalies and applies low-pass filter

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: which month to run on
    :param bool diagnostics: turn on diagnostic output (not used in this routine)
    :param bool winsorize: apply winsorization at 5%/95%

    :returns: the output of low_pass_filter applied to the normalised
        anomalies, or the un-normalised anomalies array when there is not
        enough data (fully masked if the month itself has too few
        timestamps or fewer than 5 years)
    """
    anomalies = np.ma.zeros(obs_var.data.shape[0])
    anomalies.mask = np.ones(anomalies.shape[0])
    normed_anomalies = np.ma.copy(anomalies)

    in_month = station.months == month
    mlocs, = np.where(in_month)
    nyears = len(np.unique(station.years[mlocs]))

    # need to have some data and in at least 5 years!
    if len(mlocs) < utils.DATA_COUNT_THRESHOLD or nyears < 5:
        return anomalies # prepare_data

    anomalies.mask[mlocs] = False
    normed_anomalies.mask[mlocs] = False

    hourly_clims = np.ma.zeros(24)
    hourly_clims.mask = np.ones(24)
    for hour in range(24):

        # calculate climatology
        hlocs, = np.where(np.logical_and(in_month, station.hours == hour))
        hour_data = obs_var.data[hlocs]

        if winsorize and len(hour_data.compressed()) > 10:
            hour_data = utils.winsorize(hour_data, 5)

        # count only unmasked observations: len() of a masked array includes
        #   masked entries, which let a climatology be built from too few values
        if len(hour_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
            hourly_clims[hour] = np.ma.mean(hour_data)
            hourly_clims.mask[hour] = False

        # make anomalies - keeping the order
        #   (hours without a climatology give masked anomalies)
        anomalies[hlocs] = obs_var.data[hlocs] - hourly_clims[hour]

    # if insufficient data at each hour, then no anomalies calculated
    if len(anomalies[mlocs].compressed()) < utils.DATA_COUNT_THRESHOLD:
        return anomalies # prepare_data

    # for the month, normalise anomalies by spread
    spread = utils.spread(anomalies[mlocs])
    if spread < 1.5:
        spread = 1.5

    normed_anomalies[mlocs] = anomalies[mlocs] / spread

    # apply low pass filter derived from monthly values
    # NOTE(review): the per-year average below is taken over all of the year's
    #   raw observations, not this month's anomalies - confirm this is what
    #   low_pass_filter expects
    all_years = np.unique(station.years)
    monthly_anoms = np.ma.zeros(all_years.shape[0])
    for y, year in enumerate(all_years):

        ylocs, = np.where(station.years == year)
        year_data = obs_var.data[ylocs]

        monthly_anoms[y] = utils.average(year_data)

    lp_filtered_anomalies = low_pass_filter(normed_anomalies, station, monthly_anoms, month)

    return lp_filtered_anomalies # prepare_data