Exemplo n.º 1
0
def _dcc_shifted_sine_costs(day):
    '''
    Cost of fitting a sine curve to one day of data at every hourly offset.

    The day is only fitted if it has at least OBS_PER_DAY unmasked values,
    a range of at least DAILY_RANGE and passes dcc_quartile_check().

    :param array day: masked array holding one day of observations
        (assumed to hold HOURS values - TODO confirm HOURS == 24)
    :returns: None if the day is not suitable, otherwise a tuple
        (diffs, obs_daily_range) where diffs[h] is the summed absolute
        difference between the data and the sine curve rolled by h steps
    '''
    observations = day.compressed()

    # enough observations and have large enough diurnal range
    if len(observations) < OBS_PER_DAY:
        return None

    obs_daily_range = max(observations) - min(observations)
    if not obs_daily_range >= DAILY_RANGE:
        return None

    if not dcc_quartile_check(day):
        return None

    # scale the unit sine curve to span the observed range of the day
    scaled_sine = ((dcc_make_sine() + 1.) / 2. *
                   obs_daily_range) + min(observations)

    # find differences for each shifted sine --> cost function
    diffs = np.zeros(HOURS)
    for h in range(HOURS):
        diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
        scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

    return diffs, obs_daily_range  # _dcc_shifted_sine_costs


def dcc(station,
        variable_list,
        full_variable_list,
        flag_col,
        start,
        end,
        logfile,
        diagnostics=False,
        plots=False,
        doMonth=False):
    '''
    The diurnal cycle check.

    For each variable the best-fitting sine offset is found for every day,
    a station-wide diurnal peak is derived from the filtered data, and
    runs of days whose own best fit disagrees with that peak are flagged
    in station.qc_flags.

    :param object station: the station object to be processed
    :param list variable_list: the variables to be processed
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column in the qc_flags array to work on
    :param start: passed on to utils.apply_filter_flags
    :param end: passed on to utils.apply_filter_flags
    :param file logfile: logfile to store outputs
    :param bool diagnostics: to do any extra diagnostic output
    :param bool plots: to do any plots
    :param bool doMonth: passed on to utils.apply_filter_flags
    :returns: None
    '''

    # list of flags for each variable
    diurnal_flags = []

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # is this needed 21/08/2014
        #        reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags, but discount incomplete year - so that test values against these later.
        all_data = utils.apply_filter_flags(st_var)
        all_data = all_data.reshape(-1, 24)  # working in fulltimes.
        # apply flags - also apply to final incomplete year so that best values only use complete years
        filtered_data = utils.apply_filter_flags(st_var,
                                                 doMonth=doMonth,
                                                 start=start,
                                                 end=end)
        filtered_data = filtered_data.reshape(-1, 24)  # working in fulltimes.
        number_of_days = filtered_data.shape[0]

        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plot_data = np.ma.zeros(filtered_data.shape)
            plot_data.mask = True

        diurnal_filtered_fits = np.zeros(filtered_data.shape[0], dtype=(int))
        diurnal_filtered_fits.fill(INTMDI)
        diurnal_best_fits = np.zeros(st_var.data.shape[0], dtype=(int))
        diurnal_best_fits.fill(INTMDI)
        diurnal_uncertainties = np.zeros(filtered_data.shape[0])
        diurnal_uncertainties.fill(INTMDI)

        # best fit for every day of the unfiltered-period data;
        # kept for testing against the average value later
        for d, day in enumerate(all_data):
            fit = _dcc_shifted_sine_costs(day)
            if fit is not None:
                diurnal_best_fits[d] = np.argmin(fit[0])

        # best fit and its uncertainty for every day of the filtered data
        for d, day in enumerate(filtered_data):
            fit = _dcc_shifted_sine_costs(day)
            if fit is None:
                continue
            diffs, obs_daily_range = fit

            diurnal_filtered_fits[d] = np.argmin(diffs)

            # default uncertainty is the average time resolution of the data
            diurnal_uncertainties[d] = round(
                float(HOURS) / len(day.compressed()))

            if DYNAMIC_DIURNAL:
                critical_value = min(diffs) + (
                    (max(diffs) - min(diffs)) * 0.33)

                # centre so minimum in middle
                diffs = np.roll(diffs, 11 - diurnal_filtered_fits[d])

                uncertainty = 1
                while uncertainty < 11:
                    if (diffs[11 - uncertainty] > critical_value) and\
                            (diffs[11 + uncertainty] > critical_value):
                        # break if both sides greater than critical difference
                        # when counting outwards
                        #    see diurnal_example.py
                        break

                    uncertainty += 1

                # check if uncertainty greater than time resolution for day
                if uncertainty > diurnal_uncertainties[d]:
                    diurnal_uncertainties[d] = uncertainty

            if plots:
                # scale daily data to range -1 -> 1, plot with random scatter for clarity
                plot_data[d] = ((2 * (day - min(day.compressed())) /
                                 obs_daily_range) - 1.)
                plt.plot(
                    np.arange(24) + np.random.randn(24) * 0.25,
                    plot_data[d] + np.random.randn(24) * 0.05,
                    'k,')

        if plots:
            # overlay the sine curve at the most common fitted offset
            plt.plot(
                np.arange(24),
                np.roll(
                    dcc_make_sine(),
                    np.argmax(
                        np.bincount(diurnal_filtered_fits[np.where(
                            diurnal_filtered_fits != INTMDI)]))), 'r-')
            plt.xlim([-1, 25])
            plt.ylim([-1.2, 1.2])
            plt.show()

        # dumb copy of IDL
        # For each uncertainty range (1-6h) find median of cycle offset
        filtered_fits = np.zeros(6)
        filtered_fits.fill(-9)
        for h in range(6):
            locs = np.where(diurnal_uncertainties == h + 1)

            if len(locs[0]) > 300:
                # filtered_fits[h] = int(np.median(diurnal_filtered_fits[locs]))
                # Numpy median gives average of central two values which may not be integer
                # 25/11/2014 use IDL style which gives lower value
                filtered_fits[h] = utils.idl_median(
                    diurnal_filtered_fits[locs])

        # Build up range of cycles incl, uncertainty to find where best of best located
        hours = np.arange(24)
        hour_matches = np.zeros(24)
        diurnal_peak = -9
        number_estimates = 0
        for h in range(6):
            if filtered_fits[h] != -9:
                # Store lowest uncertainty best fit as first guess
                if diurnal_peak == -9:
                    diurnal_peak = filtered_fits[h]
                    hours = np.roll(hours, 11 - int(diurnal_peak))
                    hour_matches[11 - (h + 1):11 + (h + 2)] = 1
                    number_estimates += 1

                centre, = np.where(hours == filtered_fits[h])

                # add this estimate's window, wrapping round the ends of the day
                if (centre[0] - h + 1) >= 0:
                    if (centre[0] + h + 1) <= 23:
                        hour_matches[centre[0] - (h + 1):centre[0] + h +
                                     2] += 1
                    else:
                        hour_matches[centre[0] - (h + 1):] += 1
                        hour_matches[:centre[0] + h + 2 - 24] += 1
                else:
                    hour_matches[:centre[0] + h + 2] += 1
                    hour_matches[centre[0] - (h + 1):] += 1

                number_estimates += 1

        # If value at lowest uncertainty not found in all others,
        # then see what value is found by all others
        if hour_matches[
                11] != number_estimates:  # central estimate at 12 o'clock
            all_match = np.where(hour_matches == number_estimates)

            # if one is, then use it
            if len(all_match[0]) > 0:
                diurnal_peak = all_match[0][0]
            else:
                diurnal_peak = -9

        # Now have value for best fit diurnal offset
        potentially_spurious = np.zeros(number_of_days)
        potentially_spurious.fill(INTMDI)

        if diurnal_peak != -9:
            hours = np.arange(24)
            hours = np.roll(hours, 11 - int(diurnal_peak))
            for d in range(number_of_days):
                # and now going back to the unfiltered data
                if diurnal_best_fits[d] != INTMDI:
                    # Checks if global falls inside daily value+/-range
                    # rather than seeing if each day falls in global value+/-range
                    min_range = 11 - diurnal_uncertainties[d]
                    max_range = 11 + diurnal_uncertainties[d]
                    maxloc = np.where(hours == diurnal_best_fits[d])[0][0]

                    if maxloc < min_range or maxloc > max_range:
                        potentially_spurious[d] = 1
                    else:
                        potentially_spurious[d] = 0

            # count number of good, missing and not-bad days
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points = 0
            total_not_miss = 0
            to_flag = np.zeros(number_of_days)

            for d in range(number_of_days):

                if potentially_spurious[d] == 1:
                    # a spurious day extends the current run and resets
                    # the counters that would end it
                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points += 1
                    total_not_miss += 1

                else:

                    if potentially_spurious[d] == 0:

                        n_good += 1
                        n_not_bad += 1
                        if n_miss != 0:
                            n_miss = 0
                        total_not_miss += 1

                    # NOTE(review): hard-coded -999 is presumably the value
                    # of INTMDI used to fill the array above - TODO confirm
                    if potentially_spurious[d] == -999:

                        n_miss += 1
                        n_not_bad += 1
                        if n_good != 0:
                            n_good = 0

                    total_points += 1

                    # the run ends after 3 good, 3 missing or 6 not-bad days
                    if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):

                        # only flag runs of at least 30 days of which
                        # at least half are not missing
                        if total_points >= 30:
                            if float(total_not_miss) / total_points >= 0.5:
                                to_flag[d - total_points:d] = 1

                        n_good = 0
                        n_miss = 0
                        n_not_bad = 0
                        total_points = 0
                        total_not_miss = 0

            dcc_flags = np.zeros(filtered_data.shape)

            for d in range(number_of_days):

                if to_flag[d] == 1:
                    good = np.where(filtered_data.mask[d, :] == False)
                    if len(good[0]) >= 1:
                        dcc_flags[d, good] = 1

            if diagnostics:
                print(len(np.where(dcc_flags == 1)[0]))
                print("currently matches IDL, but should all hours in days have flags set, not just the missing/flagged ones?")

            diurnal_flags += [dcc_flags]
        else:
            diurnal_flags += [np.zeros(filtered_data.shape)]

        # store only this variable's flags: reshaping the whole accumulated
        # list would give an array too long for one column once more than
        # one variable has been processed
        station.qc_flags[:, flag_col[v]] = diurnal_flags[v].reshape(-1)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        utils.print_flagged_obs_number(logfile,
                                       "Diurnal Cycle",
                                       variable,
                                       len(flag_locs[0]),
                                       noWrite=diagnostics)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD

    # NOTE(review): for a plain list, variable_list == "temperatures" is
    # False, so this indexes flag_col with False (i.e. element 0 of a list).
    # Confirm whether an explicit index lookup was intended.
    utils.apply_flags_all_variables(station,
                                    full_variable_list,
                                    flag_col[variable_list == "temperatures"],
                                    logfile,
                                    "Diurnal Cycle",
                                    plots=plots,
                                    diagnostics=diagnostics)

    station = utils.append_history(station, "Diurnal Cycle Check")

    return  # dcc
Exemplo n.º 2
0
def coc(station,
        variable_list,
        flag_col,
        start,
        end,
        logfile,
        diagnostics=False,
        plots=False,
        idl=False):
    '''
    Climatological outlier check.

    For each variable and each calendar month an hourly climatology is
    built from winsorized data, the anomalies are normalised, low-pass
    filtered and binned, a Gaussian is fitted to the histogram to obtain a
    threshold, and gaps in the distribution beyond that threshold are
    searched for on both sides.  The resulting flags are written to
    station.qc_flags and copied to the variable's flags attribute.

    :param object station: the station object to be processed
    :param list variable_list: names of the station attributes to process
    :param list flag_col: column in the qc_flags array to use for each variable
    :param start: passed on to utils.month_starts_in_pairs
    :param end: passed on to utils.month_starts_in_pairs
    :param file logfile: logfile to store outputs
    :param bool diagnostics: print extra diagnostic output (and do not write to logfile)
    :param bool plots: to do any plots
    :param bool idl: use the IDL-equivalent winsorizing and median routines
    :returns: None
    '''

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)
        # presumably the data with already-flagged values masked out - verify in utils
        all_filtered = utils.apply_filter_flags(st_var)

        # is this needed 13th Nov 2014 RJHD
        #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # regroup so that the second axis runs over the 12 calendar months
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1, 12, 2)

        for month in range(12):

            # one climatological value per hour, missing until calculated
            hourly_climatologies = np.zeros(24)
            hourly_climatologies.fill(st_var.mdi)

            # append all e.g. Januaries together

            this_month, year_ids, dummy = utils.concatenate_months(
                month_ranges[:, month, :], st_var.data, hours=True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(
                month_ranges[:, month, :], all_filtered, hours=True)

            # if fixed climatology period, sort this here

            # get as array of 24 hrs.
            this_month = np.ma.array(this_month)
            this_month = this_month.reshape(-1, 24)

            this_month_filtered = np.ma.array(this_month_filtered)
            this_month_filtered = this_month_filtered.reshape(-1, 24)

            # get hourly climatology for each month
            for hour in range(24):

                this_hour = this_month[:, hour]

                # need to have data if this is going to work!
                if len(this_hour.compressed()) > 0:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        # an extra -999999 value is appended before winsorizing
                        # and the sum is divided by (length - 1)
                        # NOTE(review): confirm how utils.winsorize treats the
                        # appended value when idl is set
                        this_hour = utils.winsorize(np.append(
                            this_hour.compressed(), -999999),
                                                    0.05,
                                                    idl=idl)
                        hourly_climatologies[hour] = np.ma.sum(this_hour) / (
                            len(this_hour) - 1)

                    else:
                        this_hour = utils.winsorize(this_hour.compressed(),
                                                    0.05,
                                                    idl=idl)
                        hourly_climatologies[hour] = np.ma.mean(this_hour)

            if len(this_month.compressed()) > 0:
                # can get stations with few obs in a particular variable.

                # anomalise each hour over month appropriately

                anomalies = this_month - np.tile(hourly_climatologies,
                                                 (this_month.shape[0], 1))
                anomalies_filtered = this_month_filtered - np.tile(
                    hourly_climatologies, (this_month_filtered.shape[0], 1))

                # normalise by half the IQR, with a floor of 1.5;
                # with fewer than 10 anomalies the missing data indicator is used
                if len(anomalies.compressed()) >= 10:
                    iqr = utils.IQR(anomalies.compressed().reshape(
                        -1)) / 2.  # to match IDL
                    if iqr < 1.5: iqr = 1.5
                else:
                    iqr = st_var.mdi

                normed_anomalies = anomalies / iqr
                normed_anomalies_filtered = anomalies_filtered / iqr

                # get average anomaly for year
                # (median of the filtered normalised anomalies; masked if no data)
                year_ids = np.array(year_ids)
                monthly_vqvs = np.ma.zeros(month_ranges.shape[0])
                monthly_vqvs.mask = [
                    False for x in range(month_ranges.shape[0])
                ]
                for year in range(month_ranges.shape[0]):
                    year_locs = np.where(year_ids == year)
                    this_year = normed_anomalies_filtered[year_locs, :]

                    if len(this_year.compressed()) > 0:
                        # need to have data for this to work!
                        if idl:
                            monthly_vqvs[year] = utils.idl_median(
                                this_year.compressed().reshape(-1))
                        else:
                            monthly_vqvs[year] = np.ma.median(this_year)
                    else:
                        monthly_vqvs.mask[year] = True

                # low pass filter
                normed_anomalies = coc_low_pass_filter(normed_anomalies,
                                                       year_ids, monthly_vqvs,
                                                       month_ranges.shape[0])

                # copy from distributional_gap.py - refactor!
                # get the threshold value
                bins, bincenters = utils.create_bins(normed_anomalies, 1.)

                hist, binEdges = np.histogram(normed_anomalies, bins=bins)

                gaussian = utils.fit_gaussian(bincenters,
                                              hist,
                                              max(hist),
                                              mu=np.mean(normed_anomalies),
                                              sig=np.std(normed_anomalies))
                # threshold is one more than the value at which the fitted
                # curve reaches FREQUENCY_THRESHOLD, rounded
                minimum_threshold = round(
                    1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

                if diagnostics:
                    print iqr, minimum_threshold, 1. + utils.invert_gaussian(
                        FREQUENCY_THRESHOLD, gaussian)
                    print gaussian
                    print hist

                if plots:
                    coc_set_up_plot(bincenters,
                                    hist,
                                    gaussian,
                                    variable,
                                    threshold=minimum_threshold,
                                    sub_par="observations")

                # number of values beyond the threshold on each side;
                # only used by the placeholder branch further down
                uppercount = len(
                    np.where(normed_anomalies > minimum_threshold)[0])
                lowercount = len(
                    np.where(normed_anomalies < -minimum_threshold)[0])

                these_flags = station.qc_flags[:, flag_col[v]]
                gap_plot_values, tentative_plot_values = [], []

                # find the gaps and apply the flags

                # upper side of the distribution
                gap_start = dgc.dgc_find_gap(hist,
                                             binEdges,
                                             minimum_threshold,
                                             gap_size=1)  # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                # lower side of the distribution
                gap_start = dgc.dgc_find_gap(hist,
                                             binEdges,
                                             -minimum_threshold,
                                             gap_size=1)  # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                station.qc_flags[:, flag_col[v]] = these_flags

                # placeholder: nothing is done yet for months with many exceedances
                if uppercount + lowercount > 1000:
                    #print "not sorted spurious stations yet"
                    pass
                if plots:
                    import matplotlib.pyplot as plt
                    # zero counts are replaced by 0.01 before plotting
                    # (presumably so they show on a log axis - TODO confirm)
                    hist, binEdges = np.histogram(tentative_plot_values,
                                                  bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             c='orange',
                             ls='-',
                             label='tentative',
                             where='mid')

                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             'r-',
                             label='flagged',
                             where='mid')
                    import calendar
                    plt.text(0.1,
                             0.9,
                             calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    leg = plt.legend(loc='lower center',
                                     ncol=4,
                                     bbox_to_anchor=(0.5, -0.2),
                                     frameon=False,
                                     prop={'size': 13},
                                     labelspacing=0.15,
                                     columnspacing=0.5)
                    plt.setp(leg.get_title(), fontsize=14)
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')

        # any non-zero value in the flag column counts as flagged
        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # report totals, then the split into flag value 1 ("Firm Clim")
        # and flag value 2 ("Tentative Clim")
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile,
                                           "Climatological",
                                           variable,
                                           len(flag_locs[0]),
                                           noWrite=True)
            print "where\n"
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile,
                                           "  Firm Clim",
                                           variable,
                                           nflags,
                                           noWrite=True)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile,
                                           "  Tentative Clim",
                                           variable,
                                           nflags,
                                           noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable,
                                           len(flag_locs[0]))
            logfile.write("where\n")
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable,
                                           nflags)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim",
                                           variable, nflags)

        # firm flags match 030220
    station = utils.append_history(station, "Climatological Check")

    return
Exemplo n.º 3
0
def dgc_monthly(station,
                variable,
                flags,
                start,
                end,
                plots=False,
                diagnostics=False,
                idl=False):
    '''
    Original Distributional Gap Check

    Monthly averages are standardised against the climatology of each
    calendar month.  Months with a large standardised offset are flagged,
    and the sorted distribution is walked outwards from its centre to flag
    one tail if it is substantially asymmetric.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (updated in place and returned)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (not used in this routine)
    :param bool idl: run IDL equivalent routines for median
    :returns: 
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)

    # get monthly averages
    month_average = np.empty(month_ranges.shape[0])
    month_average.fill(st_var.mdi)
    month_average_filtered = np.empty(month_ranges.shape[0])
    month_average_filtered.fill(st_var.mdi)

    all_filtered = utils.apply_filter_flags(st_var)
    for m, month in enumerate(month_ranges):

        data = st_var.data[month[0]:month[1]]

        filtered = all_filtered[month[0]:month[1]]

        month_average[m] = dgc_get_monthly_averages(data, OBS_LIMIT,
                                                    st_var.mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(
            filtered, OBS_LIMIT, st_var.mdi, MEAN)

    # get overall monthly climatologies - use filtered data

    month_average = month_average.reshape(-1, 12)
    month_average_filtered = month_average_filtered.reshape(-1, 12)

    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(st_var.mdi)

    for m in range(12):

        valid_filtered = np.where(month_average_filtered[:, m] != st_var.mdi)

        if len(valid_filtered[0]) >= VALID_MONTHS:

            valid_data = month_average_filtered[valid_filtered, m][0]

            if MEAN:
                clim = np.mean(valid_data)
                # np.stdev does not exist - np.std is the standard deviation
                spread = np.std(valid_data)

            else:
                if idl:
                    # valid_data is a plain ndarray with no .compressed()
                    # method; np.ma.compressed accepts both plain and
                    # masked arrays
                    clim = utils.idl_median(
                        np.ma.compressed(valid_data).reshape(-1))
                else:
                    clim = np.median(valid_data)
                spread = utils.IQR(valid_data)
                if spread <= SPREAD_LIMIT:
                    spread = SPREAD_LIMIT

            standardised_months[valid_filtered,
                                m] = (month_average[valid_filtered, m] -
                                      clim) / spread

    standardised_months = standardised_months.reshape(month_ranges.shape[0])

    good_months = np.where(standardised_months != st_var.mdi)

    # must be able to do this with masked arrays
    if plots:
        bins, bincenters = utils.create_bins(standardised_months[good_months],
                                             BIN_SIZE)
        dummy, plot_bincenters = utils.create_bins(
            standardised_months[good_months], BIN_SIZE / 10.)

        hist, binEdges = np.histogram(standardised_months[good_months],
                                      bins=bins)

        fit = utils.fit_gaussian(bincenters,
                                 hist,
                                 max(hist),
                                 mu=np.mean(standardised_months[good_months]),
                                 sig=np.std(standardised_months[good_months]))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        dgc_set_up_plot(plot_gaussian,
                        standardised_months[good_months],
                        variable,
                        sub_par="Months")

    # remove all months with a large standardised offset

    if len(good_months[0]) >= MONTH_LIMIT:

        standardised_months = np.ma.masked_values(standardised_months,
                                                  st_var.mdi)
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:

            for lo in large_offsets[0]:
                flags[month_ranges[lo, 0]:month_ranges[lo, 1]] = 1

            if plots:

                hist, binEdges = np.histogram(
                    standardised_months[large_offsets], bins=bins)
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters,
                         plot_hist,
                         'g-',
                         label='> %i' % LARGE_LIMIT,
                         where='mid',
                         zorder=5)

                plt.axvline(5, c='g')
                plt.axvline(-5, c='g')

        # walk distribution from centre and see if any asymmetry
        sort_order = standardised_months[good_months].argsort()

        # sorted values do not change during the walk, so select them once
        ordered_months = standardised_months[good_months][sort_order]

        # floor division: mid_point is used as an array index
        mid_point = len(good_months[0]) // 2

        good = True
        step = 1
        # stop before step reaches mid_point so indices stay in range
        # (also covers mid_point <= 1, where there is nothing to compare)
        while good and step < mid_point:

            lower = ordered_months[mid_point - step]
            upper = ordered_months[mid_point + step]

            if lower != upper:
                # using IDL notation
                tempvals = [np.abs(lower), np.abs(upper)]

                if min(tempvals) != 0:
                    if max(tempvals) / min(tempvals) >= 2. and min(
                            tempvals) >= 1.5:
                        # substantial asymmetry in distribution - at least 1.5 from centre and difference of 2.

                        if tempvals[0] == max(tempvals):
                            # LHS
                            bad = good_months[0][sort_order][:mid_point - step]
                            if plots:
                                badplot = ordered_months[:mid_point - step]
                        elif tempvals[1] == max(tempvals):
                            #RHS
                            bad = good_months[0][sort_order][mid_point + step:]
                            if plots:
                                badplot = ordered_months[mid_point + step:]

                        for b in bad:
                            flags[month_ranges[b, 0]:month_ranges[b, 1]] = 1

                        if plots:

                            hist, binEdges = np.histogram(badplot, bins=bins)
                            plot_hist = np.array(
                                [0.01 if h == 0 else h for h in hist])
                            plt.step(bincenters,
                                     plot_hist,
                                     'r-',
                                     label='Gap',
                                     where='mid',
                                     zorder=4)

                        good = False

            step += 1

        if plots:
            plt.legend(loc='lower center',
                       ncol=4,
                       bbox_to_anchor=(0.5, -0.2),
                       frameon=False,
                       prop={'size': 13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap.png')

    return flags  # dgc_monthly
Exemplo n.º 4
0
def dgc_all_obs(station,
                variable,
                flags,
                start,
                end,
                plots=False,
                diagnostics=False,
                idl=False,
                windspeeds=False,
                GH=False):
    '''
    Distributional gap check working on all observations (RJHD addition).

    For each calendar month the observations from all years are pooled.
    The median and inter-quartile range of the filtered observations are
    used to standardise the monthly values, and a Gaussian (or, if GH is
    set, a Gauss-Hermite series) is fitted to their histogram to obtain
    threshold values.  If any standardised value lies beyond a threshold,
    dgc_find_gap() is asked for the start of a gap in the histogram and the
    filtered observations beyond that gap are flagged with 1.

    :param object station: the station object to be processed
    :param str variable: name of the station attribute to process
    :param array flags: flag array; entries are set to 1 in place
    :param start: start of the period, passed to utils.month_starts_in_pairs
    :param end: end of the period, passed to utils.month_starts_in_pairs
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output
    :param bool idl: use utils.idl_median instead of np.ma.median
    :param bool windspeeds: also compute wind speed statistics (the storm
        handling that uses them is unfinished, see NOTE in the body)
    :param bool GH: fit a Gauss-Hermite series rather than a plain Gaussian
    :returns: flags - the updated flag array
    '''

    if plots:
        import calendar
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    all_filtered = utils.apply_filter_flags(st_var)

    # Running total, over all months, of the values found beyond a gap.
    # Used for the diagnostic report at the end; keeping a total (rather than
    # reading the last month's array) also means the report cannot fail when
    # no month had enough data to be processed.
    n_gap_values = 0

    for month in range(12):

        if windspeeds:
            st_var_wind = getattr(station, "windspeeds")

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):

                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        # append all e.g. Januaries together, raw and filtered
        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)

        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(
                    this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(),
                                percentile=0.05)

                print("Spurious_stations file not yet sorted")

            if iqr != 0.0:
                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)

                bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(
                    monthly_values, BIN_SIZE / 10.)

                hist, binEdges = np.histogram(monthly_values, bins=bins)

                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    initial_values = [
                        np.max(hist),
                        np.mean(monthly_values),
                        np.std(monthly_values),
                        stats.skew(monthly_values),
                        stats.kurtosis(monthly_values)
                    ]  # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist,
                                   np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    # plot_gaussian is handled as 1-D (as in the slicing
                    # above), so take plain integer positions; round() then
                    # works on a scalar rather than on a 1-element array.
                    good_values = np.where(
                        plot_gaussian > FREQUENCY_THRESHOLD)[0]

                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(
                        plot_bincenters[good_values[-1]])

                else:
                    gaussian = utils.fit_gaussian(bincenters,
                                                  hist,
                                                  max(hist),
                                                  mu=np.mean(monthly_values),
                                                  sig=np.std(monthly_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                if diagnostics:
                    # single-argument print calls give the same output under
                    # Python 2 and Python 3
                    print(hist)
                    if GH:
                        print(res)
                        print("%s %s %s" % (iqr, l_minimum_threshold,
                                            u_minimum_threshold))
                    else:
                        print(gaussian)
                        print("%s %s %s" %
                              (iqr, u_minimum_threshold,
                               1. + utils.invert_gaussian(
                                   FREQUENCY_THRESHOLD, gaussian)))

                if plots:
                    dgc_set_up_plot(plot_gaussian,
                                    monthly_values,
                                    variable,
                                    threshold=(u_minimum_threshold,
                                               l_minimum_threshold),
                                    sub_par="observations",
                                    GH=GH)

                    if GH:
                        plt.figtext(
                            0.15,
                            0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],
                             res['kurtosis']),
                            color='k',
                            size='small')

                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics:
                    gap_plot_values = np.array([])

                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             u_minimum_threshold)

                    # a gap start of 0 is treated as "no gap found"
                    if gap_start != 0:

                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(
                                ((this_year_data - monthly_median) /
                                 iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             l_minimum_threshold)

                    if gap_start != 0:

                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(
                                np.logical_and(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start, this_year_data.mask != True))

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                            if windspeeds:
                                # NOTE(review): this storm handling looks
                                # unfinished.  this_year_flags is a copy that
                                # was already written back to flags above, so
                                # the tentative value never reaches flags,
                                # and storms/separations are not used.
                                this_year_flags[
                                    gap_cleaned_locations] = 2  # tentative flags

                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                    MEAN)
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)
                                storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\
                                                   (((this_month_data - slp_average) / slp_mad) > 4.5))

                                if len(storms[0]) >= 2:

                                    storm_1diffs = np.diff(storms)

                                    separations = np.where(storm_1diffs != 1)

                                    #for sep in separations:

                if plots or diagnostics:
                    n_gap_values += len(gap_plot_values)

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    # zero counts are replaced by 0.01 for plotting
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             'r-',
                             label='flagged',
                             where='mid')
                    plt.text(0.1,
                             0.9,
                             calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    plt.legend(loc='lower center',
                               ncol=3,
                               bbox_to_anchor=(0.5, -0.2),
                               frameon=False,
                               prop={'size': 13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')
    if diagnostics:
        utils.print_flagged_obs_number("",
                                       "Distributional Gap",
                                       variable,
                                       n_gap_values,
                                       noWrite=True)

    return flags  # dgc_all_obs
Exemplo n.º 5
0
def coc(station, variable_list, flag_col, start, end, logfile, diagnostics = False, plots = False, idl = False):
    
    for v, variable in enumerate(variable_list):
        
        st_var = getattr(station, variable)
        all_filtered = utils.apply_filter_flags(st_var)
        
        # is this needed 13th Nov 2014 RJHD
        #reporting_resolution = utils.reporting_accuracy(utils.apply_filter_flags(st_var))
        
        month_ranges = utils.month_starts_in_pairs(start, end)
        month_ranges = month_ranges.reshape(-1,12,2)
    
        for month in range(12):
            
            hourly_climatologies = np.zeros(24)
            hourly_climatologies.fill(st_var.mdi)
            
            # append all e.g. Januaries together

            this_month, year_ids, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = True)
            this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], all_filtered, hours = True)

            # if fixed climatology period, sort this here
            
            # get as array of 24 hrs.  
            this_month = np.ma.array(this_month)
            this_month = this_month.reshape(-1,24)

            this_month_filtered = np.ma.array(this_month_filtered)
            this_month_filtered = this_month_filtered.reshape(-1,24)

            # get hourly climatology for each month
            for hour in range(24):
                
                this_hour = this_month[:,hour]

                # need to have data if this is going to work!
                if len(this_hour.compressed()) > 0:

                    # winsorize & climatologies - done to match IDL
                    if idl:
                        this_hour = utils.winsorize(np.append(this_hour.compressed(), -999999), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.sum(this_hour)/(len(this_hour) - 1)

                    else:
                        this_hour = utils.winsorize(this_hour.compressed(), 0.05, idl = idl)
                        hourly_climatologies[hour] = np.ma.mean(this_hour)



            if len(this_month.compressed()) > 0:
                # can get stations with few obs in a particular variable.

                # anomalise each hour over month appropriately

                anomalies = this_month - np.tile(hourly_climatologies, (this_month.shape[0],1))
                anomalies_filtered = this_month_filtered - np.tile(hourly_climatologies, (this_month_filtered.shape[0],1))

                if len(anomalies.compressed()) >= 10:
                    iqr = utils.IQR(anomalies.compressed().reshape(-1))/2.  # to match IDL
                    if iqr < 1.5: iqr = 1.5
                else:
                    iqr = st_var.mdi

                normed_anomalies = anomalies / iqr
                normed_anomalies_filtered = anomalies_filtered / iqr


                # get average anomaly for year
                year_ids = np.array(year_ids)
                monthly_vqvs = np.ma.zeros(month_ranges.shape[0])
                monthly_vqvs.mask = [False for x in range(month_ranges.shape[0])]
                for year in range(month_ranges.shape[0]):
                    year_locs = np.where(year_ids == year)
                    this_year = normed_anomalies_filtered[year_locs,:]

                    if len(this_year.compressed()) > 0:
                        # need to have data for this to work!
                        if idl:
                            monthly_vqvs[year] = utils.idl_median(this_year.compressed().reshape(-1))
                        else:
                            monthly_vqvs[year] = np.ma.median(this_year)
                    else:
                        monthly_vqvs.mask[year] = True


                # low pass filter
                normed_anomalies = coc_low_pass_filter(normed_anomalies, year_ids, monthly_vqvs, month_ranges.shape[0])

                # copy from distributional_gap.py - refactor!
                # get the threshold value
                bins, bincenters = utils.create_bins(normed_anomalies, 1.)

                hist, binEdges = np.histogram(normed_anomalies, bins = bins)

                gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=np.mean(normed_anomalies), sig = np.std(normed_anomalies))
                minimum_threshold = round(1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))

                if diagnostics:
                    print iqr, minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)
                    print gaussian
                    print hist

                if plots:
                    coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = minimum_threshold, sub_par = "observations")


                uppercount = len(np.where(normed_anomalies > minimum_threshold)[0])
                lowercount = len(np.where(normed_anomalies < -minimum_threshold)[0])

                these_flags = station.qc_flags[:, flag_col[v]]
                gap_plot_values, tentative_plot_values = [], []

                # find the gaps and apply the flags

                gap_start = dgc.dgc_find_gap(hist, binEdges, minimum_threshold, gap_size = 1) # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = True, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                gap_start = dgc.dgc_find_gap(hist, binEdges, -minimum_threshold, gap_size = 1) # in DGC it is 2.
                these_flags, gap_plot_values, tentative_plot_values =\
                    coc_find_and_apply_flags(month_ranges[:,month,:],normed_anomalies, these_flags, year_ids, minimum_threshold, gap_start, \
                                                           upper = False, plots = plots, gpv = gap_plot_values, tpv = tentative_plot_values)

                station.qc_flags[:, flag_col[v]] = these_flags

                if uppercount + lowercount > 1000:
                    #print "not sorted spurious stations yet"
                    pass
                if plots:
                    import matplotlib.pyplot as plt
                    hist, binEdges = np.histogram(tentative_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, c='orange', ls='-', label = 'tentative', where='mid')

                    hist, binEdges = np.histogram(gap_plot_values, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid')
                    import calendar
                    plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes)
                    leg=plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13},labelspacing=0.15,columnspacing=0.5)
                    plt.setp(leg.get_title(), fontsize=14)
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_ClimatologicalGap_'+str(month+1)+'.png')


        
        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)

        # copy flags into attribute
        st_var.flags[flag_locs] = 1


        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]), noWrite = True)
            print "where\n"
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags, noWrite = True)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags, noWrite = True)
        else:
            utils.print_flagged_obs_number(logfile, "Climatological", variable, len(flag_locs[0]))
            logfile.write("where\n")
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 1)[0])
            utils.print_flagged_obs_number(logfile, "  Firm Clim", variable, nflags)
            nflags = len(np.where(station.qc_flags[:, flag_col[v]] == 2)[0])
            utils.print_flagged_obs_number(logfile, "  Tentative Clim", variable, nflags)

        # firm flags match 030220
    station = utils.append_history(station, "Climatological Check")  
                     
    return
Exemplo n.º 6
0
def dcc(station, variable_list, full_variable_list, flag_col, logfile, plots=False, diagnostics=False):
    '''
    The diurnal cycle check.

    For each day with enough observations and a large enough range, a sine
    curve is shifted through all hours to find the best-fitting phase and
    an uncertainty on it.  The per-day phases are combined into a single
    station value; days whose phase disagrees with it are marked as
    potentially spurious and sufficiently long runs of such days are flagged.

    :param object station: the station object to be processed
    :param list variable_list: the variables to be processed
    :param list full_variable_list: the variables for flags to be applied to
    :param list flag_col: which column in the qc_flags array to work on
    :param file logfile: logfile to store outputs
    :param bool plots: to do any plots
    :param bool diagnostics: to do any extra diagnostic output
    :returns:
    '''

    if plots:
        import matplotlib.pyplot as plt

    # list of flags for each variable
    diurnal_flags = []

    for v, variable in enumerate(variable_list):

        st_var = getattr(station, variable)

        # is this needed 21/08/2014
        #        reporting_accuracy = utils.reporting_accuracy(utils.apply_filter_flags(st_var))

        # apply flags - for detection only
        filtered_data = utils.apply_filter_flags(st_var)

        filtered_data = filtered_data.reshape(-1, 24)  # working in fulltimes.
        number_of_days = filtered_data.shape[0]

        if plots:
            plt.clf()
            plot_data = np.ma.zeros(filtered_data.shape)
            plot_data.mask = True
#            best_estimate_counter = np.zeros(HOURS)

        diurnal_best_fits = np.zeros(filtered_data.shape[0], dtype=(int))
        diurnal_best_fits.fill(INTMDI)
        diurnal_uncertainties = np.zeros(filtered_data.shape[0])
        diurnal_uncertainties.fill(INTMDI)

        for d, day in enumerate(filtered_data):

            valid_obs = day.compressed()

            # enough observations and have large enough diurnal range
            if len(valid_obs) >= OBS_PER_DAY:

                day_minimum = min(valid_obs)
                obs_daily_range = max(valid_obs) - day_minimum
                if obs_daily_range >= DAILY_RANGE:

                    if dcc_quartile_check(day):
                        scaled_sine = ((dcc_make_sine() + 1.) / 2. * obs_daily_range) + day_minimum
                        diffs = np.zeros(HOURS)

                        # Find differences for each shifted sine --> cost function
                        for h in range(HOURS):
                            diffs[h] = np.sum(np.abs(day - scaled_sine).compressed())
                            scaled_sine = np.roll(scaled_sine, 1)  # matched to IDL SHIFT()

                        diurnal_best_fits[d] = np.argmin(diffs)

                        # default uncertainty is the average time resolution of the data
                        diurnal_uncertainties[d] = round(float(HOURS) / len(valid_obs))

                        if DYNAMIC_DIURNAL:
                            critical_value = min(diffs) + ((max(diffs) - min(diffs)) * 0.33)

                            # centre so minimum in middle
                            diffs = np.roll(diffs, 11 - diurnal_best_fits[d])

                            uncertainty = 1
                            while uncertainty < 11:
                                if (diffs[11 - uncertainty] > critical_value) and\
                                        (diffs[11 + uncertainty] > critical_value):
                                    # break if both sides greater than critical difference
                                    # when counting outwards
                                    #    see diurnal_example.py
                                    break

                                uncertainty += 1

                            # check if uncertainty greater than time resolution for day
                            if uncertainty > diurnal_uncertainties[d]:
                                diurnal_uncertainties[d] = uncertainty

                        if plots:
#                            best_estimate_counter[np.argmin(diffs)] += 1
                            # scale daily data to range -1 -> 1, plot with random scatter for clarity
                            plot_data[d] = ((2 * (day - day_minimum) / obs_daily_range) - 1.)
                            plt.plot(np.arange(24) + np.random.randn(24) * 0.25,
                                     plot_data[d] + np.random.randn(24) * 0.05, 'k,')

        if plots:
            fitted = diurnal_best_fits[np.where(diurnal_best_fits != INTMDI)]
            # with no fitted day there is no most common phase to draw
            if len(fitted) > 0:
                plt.plot(np.arange(24), np.roll(dcc_make_sine(), np.argmax(np.bincount(fitted))), 'r-')
            plt.xlim([-1, 25])
            plt.ylim([-1.2, 1.2])
            plt.show()

        # dumb copy of IDL

        # For each uncertainty range (1-6h) find median of cycle offset
        best_fits = np.zeros(6)
        best_fits.fill(-9)
        for h in range(6):
            locs = np.where(diurnal_uncertainties == h + 1)

            if len(locs[0]) > 300:
                # best_fits[h] = int(np.median(diurnal_best_fits[locs]))
                # Numpy median gives average of central two values which may not be integer
                # 25/11/2014 use IDL style which gives lower value
                best_fits[h] = utils.idl_median(diurnal_best_fits[locs])

        # Build up range of cycles incl, uncertainty to find where best of best located

        hours = np.arange(24)
        hour_matches = np.zeros(24)
        diurnal_peak = -9
        number_estimates = 0
        for h in range(6):
            if best_fits[h] != -9:

                # Store lowest uncertainty best fit as first guess
                if diurnal_peak == -9:
                    diurnal_peak = best_fits[h]
                    hours = np.roll(hours, 11 - int(diurnal_peak))
                    hour_matches[11 - (h + 1):11 + (h + 2)] = 1
                    number_estimates += 1

                # position of this estimate in the rolled hours, as a plain
                # integer so that it can be used as a slice bound
                centre = int(np.where(hours == best_fits[h])[0][0])

                # NOTE(review): the test below evaluates centre - (h - 1),
                # while the slices start at centre - (h + 1); left unchanged
                # because this section copies the IDL behaviour - confirm.
                if (centre - h + 1) >= 0:
                    if (centre + h + 1) <= 23:
                        hour_matches[centre - (h + 1):centre + h + 2] += 1
                    else:
                        hour_matches[centre - (h + 1):] += 1
                        hour_matches[:centre + h + 2 - 24] += 1
                else:
                    hour_matches[:centre + h + 2] += 1
                    hour_matches[centre - (h + 1):] += 1

                number_estimates += 1

        # If value at lowest uncertainty not found in all others, then see what value is found by all others
        if hour_matches[11] != number_estimates:  # central estimate at 12 o'clock
            all_match = np.where(hour_matches == number_estimates)

            # if one is, then use it
            if len(all_match[0]) > 0:
                diurnal_peak = all_match[0][0]
            else:
                diurnal_peak = -9

        # Now have value for best fit diurnal offset

        potentially_spurious = np.zeros(number_of_days)
        potentially_spurious.fill(INTMDI)

        if diurnal_peak != -9:
            hours = np.arange(24)
            hours = np.roll(hours, 11 - int(diurnal_peak))
            for d in range(number_of_days):
                if diurnal_best_fits[d] != INTMDI:

                    # Checks if global falls inside daily value+/-range
                    # rather than seeing if each day falls in global value+/-range

                    min_range = 11 - diurnal_uncertainties[d]
                    max_range = 11 + diurnal_uncertainties[d]
                    maxloc = np.where(hours == diurnal_best_fits[d])[0][0]

                    if maxloc < min_range or maxloc > max_range:
                        potentially_spurious[d] = 1
                    else:
                        potentially_spurious[d] = 0

            # count number of good, missing and not-bad days
            n_good = 0
            n_miss = 0
            n_not_bad = 0
            total_points = 0
            total_not_miss = 0
            to_flag = np.zeros(number_of_days)

            for d in range(number_of_days):

                if potentially_spurious[d] == 1:

                    n_good = 0
                    n_miss = 0
                    n_not_bad = 0
                    total_points += 1
                    total_not_miss += 1

                else:

                    if potentially_spurious[d] == 0:

                        n_good += 1
                        n_not_bad += 1
                        if n_miss != 0:
                            n_miss = 0
                        total_not_miss += 1

                    # days without a fit still hold the INTMDI fill value
                    if potentially_spurious[d] == INTMDI:

                        n_miss += 1
                        n_not_bad += 1
                        if n_good != 0:
                            n_good = 0

                    total_points += 1

                    if (n_good == 3) or (n_miss == 3) or (n_not_bad >= 6):

                        if total_points >= 30:
                            if float(total_not_miss) / total_points >= 0.5:
                                to_flag[d - total_points:d] = 1

                        n_good = 0
                        n_miss = 0
                        n_not_bad = 0
                        total_points = 0
                        total_not_miss = 0

            dcc_flags = np.zeros(filtered_data.shape)

            # full boolean mask, also when nothing in the data is masked
            data_mask = np.ma.getmaskarray(filtered_data)

            for d in range(number_of_days):

                if to_flag[d] == 1:
                    good = np.where(data_mask[d, :] == False)[0]
                    if len(good) >= 1:
                        dcc_flags[d, good] = 1

            if diagnostics:
                print(len(np.where(dcc_flags == 1)[0]))
                print("currently matches IDL, but should all hours in days have flags set, not just the missing/flagged ones?")

            diurnal_flags += [dcc_flags]
        else:
            diurnal_flags += [np.zeros(filtered_data.shape)]

        # only the flags of the variable just processed belong in this column
        station.qc_flags[:, flag_col[v]] = diurnal_flags[-1].reshape(-1)

        flag_locs = np.where(station.qc_flags[:, flag_col[v]] != 0)
        if plots or diagnostics:
            utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]), noWrite=True)
        else:
            utils.print_flagged_obs_number(logfile, "Diurnal Cycle", variable, len(flag_locs[0]))

        # copy flags into attribute
        st_var.flags[flag_locs] = 1

        # CHECKED 030660-99999, 30-06-2014, 855 flagged RJHD

    # Use the flag column belonging to "temperatures"; fall back to the first
    # column (what the earlier list == string comparison always selected) if
    # it is not among the processed variables.
    try:
        temperature_index = list(variable_list).index("temperatures")
    except ValueError:
        temperature_index = 0

    utils.apply_flags_all_variables(station, full_variable_list, flag_col[temperature_index], logfile,
                                    "Diurnal Cycle", plots=plots, diagnostics=diagnostics)

    station = utils.append_history(station, "Diurnal Cycle Check")

    return  # dcc
Exemplo n.º 7
0
def dgc_all_obs(station, variable, flags, start, end, plots = False, diagnostics = False, idl = False, windspeeds = False, GH = False):
    '''
    Distributional Gap Check working on all observations (RJHD addition).

    For each calendar month the observations from all years are pooled,
    anomalised with the median and scaled by the IQR of the filtered data.
    A Gaussian (or Gauss-Hermite) curve fitted to the histogram of these
    values sets an upper and a lower threshold; dgc_find_gap is then asked
    for a gap beyond each threshold and observations lying beyond a gap
    are flagged.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (updated in place)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics
    :param bool idl: run IDL equivalent routines for median
    :param bool windspeeds: also pull wind speed data for the (unfinished) storm check
    :param bool GH: fit Gauss-Hermite polynomials instead of a plain Gaussian
    :returns:
       flags - updated flag array
    '''

    if plots:
        import calendar
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1,12,2)

    all_filtered = utils.apply_filter_flags(st_var)

    # running total for the diagnostic summary - bound up front so the
    # summary works even when no calendar month could be processed
    total_flagged = 0

    for month in range(12):

        # start/end index pairs of this calendar month in every year
        year_ranges = month_ranges[:,month,:]

        if windspeeds:
            st_var_wind = getattr(station, "windspeeds")

            # pool this calendar month's wind speeds and get the averages
            for y, year in enumerate(year_ranges):

                if y == 0:
                    windspeeds_month = np.ma.array(st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate([windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(windspeeds_month, median=True)

        this_month_data, dummy, dummy = utils.concatenate_months(year_ranges, st_var.data, hours = False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(year_ranges, all_filtered, hours = False)

        filtered_obs = this_month_filtered.compressed()

        # need enough clean observations to characterise the distribution
        if len(filtered_obs) <= OBS_LIMIT:
            continue

        if idl:
            monthly_median = utils.idl_median(filtered_obs.reshape(-1))
        else:
            monthly_median = np.ma.median(this_month_filtered)

        iqr = utils.IQR(filtered_obs)

        if iqr == 0.0:
            # to get some spread if IQR too small
            iqr = utils.IQR(filtered_obs, percentile = 0.05)

            print("Spurious_stations file not yet sorted")

        # cannot standardise without a spread
        if iqr == 0.0:
            continue

        monthly_values = np.ma.array((this_month_data.compressed() - monthly_median) / iqr)

        bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
        dummy, plot_bincenters = utils.create_bins(monthly_values, BIN_SIZE/10.)

        hist, binEdges = np.histogram(monthly_values, bins = bins)

        if GH:
            # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

            initial_values = [np.max(hist), np.mean(monthly_values), np.std(monthly_values), stats.skew(monthly_values), stats.kurtosis(monthly_values)] # norm, mean, std, skew, kurtosis

            fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))])
            res = utils.hermite2gauss(fit[0], diagnostics = diagnostics)

            plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

            # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
            mid_point = np.argmax(plot_gaussian)
            bad, = np.where(plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD/10.)
            if len(bad) > 0: plot_gaussian[mid_point:][bad[0]:] = FREQUENCY_THRESHOLD/10.

            bad, = np.where(plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD/10.)
            if len(bad) > 0: plot_gaussian[:mid_point][:bad[-1]] = FREQUENCY_THRESHOLD/10.

            # extract threshold values
            good_values = np.argwhere(plot_gaussian > FREQUENCY_THRESHOLD)

            l_minimum_threshold = round(plot_bincenters[good_values[0]]) - 1
            u_minimum_threshold = 1 + round(plot_bincenters[good_values[-1]])

        else:
            gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(monthly_values), sig = np.std(monthly_values))

            # assume the same threshold value
            u_minimum_threshold = 1 + round(utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
            l_minimum_threshold = -u_minimum_threshold

            plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

        if diagnostics:
            # single-argument print calls give identical output whether
            # print is a statement or a function
            print(hist)
            if GH:
                print(res)
                print(" ".join(str(item) for item in (iqr, l_minimum_threshold, u_minimum_threshold)))
            else:
                print(gaussian)
                print(" ".join(str(item) for item in (iqr, u_minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))))

        if plots:
            dgc_set_up_plot(plot_gaussian, monthly_values, variable, threshold = (u_minimum_threshold, l_minimum_threshold), sub_par = "observations", GH = GH)

            if GH:
                plt.figtext(0.15, 0.67, 'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %(res['mean'], res['dispersion'], res['skewness'], res['kurtosis']), color='k', size='small')

        uppercount = len(np.where(monthly_values > u_minimum_threshold)[0])
        lowercount = len(np.where(monthly_values < l_minimum_threshold)[0])

        # this needs refactoring - but lots of variables to pass in
        if plots or diagnostics: gap_plot_values = np.array([])

        if uppercount > 0:
            gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold)

            if gap_start != 0:

                # go through each year for this calendar month and flag
                #  observations above the gap
                for y, year in enumerate(year_ranges):

                    this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                    this_year_flags = np.array(flags[year[0]:year[1]])
                    # NOTE(review): unlike the lower side there is no explicit
                    # mask test here - confirm masked entries cannot exceed gap_start
                    gap_cleaned_locations = np.where(((this_year_data - monthly_median) / iqr) > gap_start)

                    this_year_flags[gap_cleaned_locations] = 1
                    flags[year[0]:year[1]] = this_year_flags

                    if plots or diagnostics: gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)

        if lowercount > 0:
            gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold)

            if gap_start != 0:

                # go through each year for this calendar month and flag
                #  unmasked observations below the gap
                for y, year in enumerate(year_ranges):

                    this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                    this_year_flags = np.array(flags[year[0]:year[1]])
                    gap_cleaned_locations = np.where(np.logical_and(((this_year_data - monthly_median) / iqr) < gap_start, this_year_data.mask != True))

                    this_year_flags[gap_cleaned_locations] = 1
                    flags[year[0]:year[1]] = this_year_flags

                    if plots or diagnostics: gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)

                    if windspeeds:
                        # NOTE: unfinished storm check.  this_year_flags is a copy that
                        # has already been written back, so nothing below alters the
                        # flags that are returned.
                        this_year_flags[gap_cleaned_locations] = 2 # tentative flags

                        slp_average = dgc_get_monthly_averages(this_month_data, OBS_LIMIT, st_var.mdi, MEAN)
                        slp_mad = utils.mean_absolute_deviation(this_month_data, median=True)
                        storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\
                                           (((this_month_data - slp_average) / slp_mad) > 4.5))

                        if len(storms[0]) >= 2:

                            storm_1diffs = np.diff(storms)

                            separations = np.where(storm_1diffs != 1)

                            #for sep in separations:

        if plots or diagnostics:
            total_flagged += len(gap_plot_values)

        if plots:
            hist, binEdges = np.histogram(gap_plot_values, bins = bins)
            plot_hist = np.array([0.01 if h == 0 else h for h in hist])
            plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid')
            plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes)
            plt.legend(loc='lower center',ncol=3, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    if diagnostics:
        # report the number of values flagged over all calendar months
        utils.print_flagged_obs_number("", "Distributional Gap", variable, total_flagged, noWrite=True)

    return flags # dgc_all_obs
Exemplo n.º 8
0
def dgc_monthly(station, variable, flags, start, end, plots=False, diagnostics=False, idl = False):
    '''
    Original Distributional Gap Check

    Works on monthly averages.  Each month's average is standardised
    against the climatology and spread of the same calendar month in the
    filtered data.  Months whose standardised value is at or above
    LARGE_LIMIT are flagged, and the sorted distribution is then walked
    outwards from its centre to find and flag an asymmetric tail.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (updated in place)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (not used in this routine)
    :param bool idl: run IDL equivalent routines for median
    :returns: 
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)
    mdi = st_var.mdi

    month_ranges = utils.month_starts_in_pairs(start, end)
    n_months = month_ranges.shape[0]

    # get monthly averages - raw and with flagged observations filtered out
    month_average = np.empty(n_months)
    month_average.fill(mdi)
    month_average_filtered = np.empty(n_months)
    month_average_filtered.fill(mdi)

    all_filtered = utils.apply_filter_flags(st_var)
    for m, month in enumerate(month_ranges):

        data = st_var.data[month[0]:month[1]]
        filtered = all_filtered[month[0]:month[1]]

        month_average[m] = dgc_get_monthly_averages(data, OBS_LIMIT, mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(filtered, OBS_LIMIT, mdi, MEAN)

    # get overall monthly climatologies - use filtered data

    month_average = month_average.reshape(-1,12)
    month_average_filtered = month_average_filtered.reshape(-1,12)

    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(mdi)

    for m in range(12):

        valid_years, = np.where(month_average_filtered[:,m] != mdi)

        if len(valid_years) >= VALID_MONTHS:

            valid_data = month_average_filtered[valid_years,m]

            if MEAN:
                clim = np.mean(valid_data)
                # numpy has no stdev(); std() is the standard deviation
                spread = np.std(valid_data)

            else:
                if idl:
                    # valid_data is a plain ndarray, so use the function form
                    # of compressed() which accepts masked and unmasked input
                    clim = utils.idl_median(np.ma.compressed(valid_data))
                else:
                    clim = np.median(valid_data)
                spread = utils.IQR(valid_data)
                if spread <= SPREAD_LIMIT:
                    spread = SPREAD_LIMIT

            standardised_months[valid_years,m] = (month_average[valid_years,m] - clim) / spread

    standardised_months = standardised_months.reshape(n_months)

    good_months = np.where(standardised_months != mdi)

    # must be able to do this with masked arrays
    if plots:
        good_values = standardised_months[good_months]

        bins, bincenters = utils.create_bins(good_values, BIN_SIZE)
        dummy, plot_bincenters = utils.create_bins(good_values, BIN_SIZE/10.)

        hist, binEdges = np.histogram(good_values, bins = bins)

        fit = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(good_values), sig = np.std(good_values))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        dgc_set_up_plot(plot_gaussian, good_values, variable, sub_par = "Months")

    # remove all months with a large standardised offset

    if len(good_months[0]) >= MONTH_LIMIT:

        standardised_months = np.ma.masked_values(standardised_months, mdi)
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:

            for lo in large_offsets[0]:
                flags[month_ranges[lo,0]:month_ranges[lo,1]] = 1

            if plots:

                hist, binEdges = np.histogram(standardised_months[large_offsets], bins = bins)
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters, plot_hist, 'g-', label = '> %i' % LARGE_LIMIT, where = 'mid', zorder = 5)

                # mark the threshold named in the legend label
                plt.axvline(LARGE_LIMIT,c='g')
                plt.axvline(-LARGE_LIMIT,c='g')

        # walk distribution from centre and see if any asymmetry
        sort_order = standardised_months[good_months].argsort()
        sorted_values = standardised_months[good_months][sort_order]
        sorted_months = good_months[0][sort_order]

        # integer division - this is used as an index
        mid_point = len(good_months[0]) // 2

        # compare pairs of values at equal distance either side of the centre;
        #  the range keeps both indices inside the array
        for step in range(1, mid_point):

            lower_value = sorted_values[mid_point - step]
            upper_value = sorted_values[mid_point + step]

            if lower_value == upper_value:
                continue

            # using IDL notation
            tempvals = [np.abs(lower_value), np.abs(upper_value)]
            smallest = min(tempvals)
            largest = max(tempvals)

            if smallest == 0:
                continue

            if largest/smallest >= 2. and smallest >= 1.5:
                # substantial asymmetry in distribution - at least 1.5 from centre and difference of 2.

                if tempvals[0] == largest:
                    # LHS
                    # NOTE(review): this slice leaves out the element at
                    # mid_point - step whereas the RHS slice includes
                    # mid_point + step - confirm this is intended
                    bad = sorted_months[:mid_point - step]
                    if plots: badplot = sorted_values[:mid_point - step]
                else:
                    # RHS
                    bad = sorted_months[mid_point + step:]
                    if plots: badplot = sorted_values[mid_point + step:]

                for b in bad:
                    flags[month_ranges[b,0]:month_ranges[b,1]] = 1

                if plots:

                    hist, binEdges = np.histogram(badplot, bins = bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label = 'Gap', where = 'mid', zorder = 4)

                # only the first asymmetry found is acted on
                break

        if plots:
            plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap.png')

    return flags # dgc_monthly
Exemplo n.º 9
0
def dgc_all_obs(station,
                variable,
                flags,
                start,
                end,
                logfile,
                plots=False,
                diagnostics=False,
                idl=False,
                windspeeds=False,
                GH=False,
                doMonth=False):
    '''RJHD addition working on all observations'''

    if plots:
        import matplotlib.pyplot as plt

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    # extract variable
    st_var = getattr(station, variable)
    # apply flags (and mask incomplete year if appropriate)
    all_filtered = utils.apply_filter_flags(st_var,
                                            doMonth=doMonth,
                                            start=start,
                                            end=end)

    st_var_complete_year = copy.deepcopy(st_var)
    if doMonth:
        # restrict the incomplete year if appropriate - keep other flagged obs.
        full_year_end = utils.get_first_hour_this_year(start, end)
        st_var_complete_year.data.mask[full_year_end:] = True

    for month in range(12):

        # if requiring wind data, extract data and find monthly averages
        if windspeeds == True:
            st_var_wind = getattr(station, "windspeeds")

            if doMonth:
                # restrict the incomplete year if appropriate
                st_var_wind.data.mask[full_year_end:] = True

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):

                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        # pull data from each calendar month together
        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)
        this_month_complete, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var_complete_year.data, hours=False)

        # if enough clean and complete data for this calendar month find the median and IQR
        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(
                    this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(),
                                percentile=0.05)
                print "Spurious_stations file not yet sorted"

            # if have an IQR, anomalise using median and standardise using IQR
            if iqr != 0.0:

                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)
                complete_values = np.ma.array(
                    (this_month_complete.compressed() - monthly_median) / iqr)

                # use complete years only for the histogram - aiming to find outliers.
                bins, bincenters = utils.create_bins(complete_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(
                    complete_values, BIN_SIZE / 10.)
                hist, binEdges = np.histogram(complete_values, bins=bins)
                """
                Change to monthly updates Oct 2017
                Thought about changing distribution to use filtered values
                But this changes the test beyond just dealing with additional months
                Commented out lines below would be alternative.
                """
                # bins, bincenters = utils.create_bins(filtered_values, BIN_SIZE)
                # dummy, plot_bincenters = utils.create_bins(filtered_values, BIN_SIZE/10.)
                # hist, binEdges = np.histogram(filtered_values, bins = bins)

                # used filtered (incl. incomplete year mask) to determine the distribution.
                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    # Feb 2019 - if large amounts off centre, can affect initial values
                    # switched to median and MAD
                    initial_values = [
                        np.max(hist),
                        np.median(complete_values),
                        utils.mean_absolute_deviation(complete_values,
                                                      median=True),
                        stats.skew(complete_values),
                        stats.kurtosis(complete_values)
                    ]  # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist,
                                   np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    good_values = np.argwhere(
                        plot_gaussian > FREQUENCY_THRESHOLD)

                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(
                        plot_bincenters[good_values[-1]])

                    if diagnostics:
                        print hist
                        print res
                        print iqr, l_minimum_threshold, u_minimum_threshold

                # or just a standard Gaussian
                else:
                    gaussian = utils.fit_gaussian(
                        bincenters,
                        hist,
                        max(hist),
                        mu=np.median(complete_values),
                        sig=utils.mean_absolute_value(complete_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                    if diagnostics:
                        print hist
                        print gaussian
                        print iqr, u_minimum_threshold, 1. + utils.invert_gaussian(
                            FREQUENCY_THRESHOLD, gaussian)

                if plots:
                    dgc_set_up_plot(plot_gaussian,
                                    complete_values,
                                    variable,
                                    threshold=(u_minimum_threshold,
                                               l_minimum_threshold),
                                    sub_par="observations",
                                    GH=GH)

                    if GH:
                        plt.figtext(
                            0.15,
                            0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],
                             res['kurtosis']),
                            color='k',
                            size='small')

                # now trying to find gaps in the distribution
                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics: gap_plot_values = np.array([])

                # do one side of distribution and then other
                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             u_minimum_threshold)

                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            # not using filtered - checking all available data
                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.ma.where(
                                ((this_year_data - monthly_median) /
                                 iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Upper {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0]))
                                    print gap_cleaned_locations, this_year_data[
                                        gap_cleaned_locations]

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             l_minimum_threshold)

                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.ma.where(
                                np.logical_and(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start, this_year_data.mask != True))
                            # add flag requirement for low pressure bit if appropriate

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Lower {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0]))
                                    print gap_cleaned_locations, this_year_data[
                                        gap_cleaned_locations]

                            # if doing SLP then do extra checks for storms
                            if windspeeds:
                                windspeeds_year = np.ma.array(
                                    st_var_wind.data[year[0]:year[1]])

                                this_year_flags[
                                    gap_cleaned_locations] = 2  # tentative flags

                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                    MEAN)
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)

                                # need to ensure that this_year_data is less than slp_average, hence order of test
                                storms, = np.ma.where((((windspeeds_year - windspeeds_month_average) / windspeeds_month_mad) > MAD_THRESHOLD) &\
                                                   (((slp_average - this_year_data) / slp_mad) > MAD_THRESHOLD))

                                # using IDL terminology
                                if len(storms) >= 2:
                                    # use the first difference series to find when there are gaps in
                                    # contiguous sequences of storm observations - want to split up into
                                    # separate storm events
                                    storm_1diffs = np.diff(storms)
                                    separations, = np.where(storm_1diffs != 1)

                                    # expand around storm signal so that all low SLP values covered, and unflagged
                                    if len(separations) >= 1:
                                        print "  multiple storms in {} {}".format(
                                            y + start.year, month)

                                        # if more than one storm signal that month, then use intervals
                                        #    in the first difference series to expand around the first interval alone
                                        storm_start = 0
                                        storm_finish = separations[0] + 1
                                        first_storm = dgc_expand_storms(
                                            storms[storm_start:storm_finish],
                                            len(this_year_data))
                                        final_storms = copy.deepcopy(
                                            first_storm)

                                        for j in range(len(separations)):
                                            # then do the rest in a loop

                                            if j + 1 == len(separations):
                                                # final one
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                           1:],
                                                    len(this_year_data))
                                            else:
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                           1:separations[j +
                                                                         1] +
                                                           1],
                                                    len(this_year_data))

                                            final_storms = np.append(
                                                final_storms, this_storm)

                                    else:
                                        # else just expand around the signal by 6 hours either way
                                        final_storms = dgc_expand_storms(
                                            storms, len(this_year_data))

                                else:
                                    final_storms = storms

                                if len(storms) >= 1:
                                    print "Tropical Storm signal in {} {}".format(
                                        y + start.year, month)
                                    this_year_flags[final_storms] = 0

                            # and write flags back into array
                            flags[year[0]:year[1]] = this_year_flags

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             'r-',
                             label='flagged',
                             where='mid')
                    import calendar
                    plt.text(0.1,
                             0.9,
                             calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    plt.legend(loc='lower center',
                               ncol=3,
                               bbox_to_anchor=(0.5, -0.2),
                               frameon=False,
                               prop={'size': 13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    nflags, = np.where(flags != 0)
    utils.print_flagged_obs_number(logfile,
                                   "Distributional Gap All",
                                   variable,
                                   len(nflags),
                                   noWrite=diagnostics)

    return flags  # dgc_all_obs