Exemplo n.º 1
0
def coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = 0, sub_par = ""):
    '''
    Prepare the current figure for the Climatological Outlier Check.

    Clears the figure, draws the histogram as a step plot with the fitted
    Gaussian on top (logarithmic y-axis) and marks +/-threshold in orange
    and +/-(threshold + 1) in red with vertical lines.

    :param array bincenters: bin centres of histogram
    :param array hist: histogram values
    :param array gaussian: parameters of gaussian fit [m, s, n]
    :param str variable: name of variable for title
    :param int threshold: threshold to plot
    :param str sub_par: sub parameter for axis label
    '''
    import matplotlib.pyplot as plt

    plt.clf()
    plt.axes([0.1, 0.15, 0.85, 0.75])

    # empty bins are lifted to a small positive value so the log axis can draw them
    drawable_counts = []
    for count in hist:
        drawable_counts.append(0.01 if count == 0 else count)
    drawable_counts = np.array(drawable_counts)
    plt.step(bincenters, drawable_counts, 'k-', label='standardised months', where='mid')

    # overlay the fitted Gaussian
    fitted_curve = utils.gaussian(bincenters, gaussian)
    plt.plot(bincenters, fitted_curve, 'b-', label='Gaussian fit')

    # axis labels and scale
    plt.xlabel("%s offset (IQR)" % variable)
    plt.ylabel("Frequency (%s)" % sub_par)
    current_axes = plt.gca()
    current_axes.set_yscale('log')

    # threshold markers, drawn in the same order as before
    markers = (
        (-threshold - 1, 'r'),
        (threshold + 1, 'r'),
        (-threshold, 'orange'),
        (threshold, 'orange'),
    )
    for position, colour in markers:
        plt.axvline(position, c=colour)

    plt.ylim(ymin=0.1)
    plt.title("Climatological Gap Check - %s - %s" % (sub_par, variable))

    return  # coc_set_up_plot
Exemplo n.º 2
0
def fvc_plot_setup(hist_data, hist, binEdges, xlabel, title = ""):
    '''
    Draw a histogram and its Gaussian fit on a log-scaled frequency axis.

    :param array hist_data: raw values which have been binned to create hist
    :param array hist: values of histogram
    :param array binEdges: bin edges (one more entry than hist)
    :param str xlabel: label for x-axis
    :param str title: title of plot

    :returns:
        plot_hist - histogram values with zeros replaced by 0.1 for the log scale
        bincenters - locations of centres of bins
    '''
    import matplotlib.pyplot as plt

    # zero counts cannot be shown on a log axis, so substitute a small value
    plot_hist = np.array([1e-1 if count == 0 else float(count) for count in hist])

    plt.clf()

    upper_edges = binEdges[1:]
    lower_edges = binEdges[:-1]
    bincenters = 0.5 * (upper_edges + lower_edges)
    plt.step(bincenters, plot_hist, 'b-', label='observations', where='mid')

    # Gaussian fit seeded with the histogram peak and the sample mean / std
    peak = max(hist)
    centre = np.mean(hist_data)
    spread = np.std(hist_data)
    fit = utils.fit_gaussian(bincenters, hist, peak, mu=centre, sig=spread)
    fitted_curve = utils.gaussian(bincenters, fit)
    plt.plot(bincenters, fitted_curve, 'r-', label='Gaussian fit')

    # labels, scale and limits
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    current_axes = plt.gca()
    current_axes.set_yscale('log')
    plt.ylim([0.1, 10000])
    plt.title(title)

    return plot_hist, bincenters # fvc_plot_setup
Exemplo n.º 3
0
def plot_target_neigh_diffs_dist(differences, iqr):
    '''
    Show the distribution of target-neighbour differences.

    Histograms the unmasked differences in bins of width 1.0, overlays a
    Gaussian fit, marks +/- 5 * iqr with red vertical lines and displays
    the figure.

    :param array differences: masked difference array
    :param float iqr: inter quartile range of differences

    :returns: None
    '''
    import matplotlib.pyplot as plt

    plt.clf()

    bins, bincenters = utils.create_bins(differences.compressed(), 1.0)

    counts, _ = np.histogram(differences.compressed(), bins=bins)

    # zero counts cannot be shown on a log axis, so substitute a small value
    drawable_counts = np.array([1e-1 if c == 0 else float(c) for c in counts])
    plt.step(bincenters, drawable_counts, 'k-', label='observations', where='mid')

    # Gaussian fit seeded with the histogram peak and the sample mean / std
    fit = utils.fit_gaussian(
        bincenters,
        counts,
        max(counts),
        mu=np.mean(differences.compressed()),
        sig=np.std(differences.compressed()),
    )
    fitted_curve = utils.gaussian(bincenters, fit)
    plt.plot(bincenters, fitted_curve, 'b-', label='Gaussian fit')

    # limits at +/- 5 IQR
    for limit in (5. * iqr, -5. * iqr):
        plt.axvline(limit, c='r')

    print("only shows lowest of monthly IQRs")

    plt.ylabel("Frequency")
    current_axes = plt.gca()
    current_axes.set_yscale('log')
    plt.ylim([0.1, 2 * max(counts)])

    plt.show()

    return # plot_target_neigh_diffs_dist
Exemplo n.º 4
0
def coc_set_up_plot(bincenters, hist, gaussian, variable, threshold = 0, sub_par = ""):
    '''
    Prepare the current figure for the Climatological Outlier Check.

    The figure is cleared, the histogram is drawn as a step plot with the
    fitted Gaussian over it (logarithmic y-axis), and vertical lines mark
    +/-threshold (orange) and +/-(threshold + 1) (red).

    :param array bincenters: bin centres of histogram
    :param array hist: histogram values
    :param array gaussian: parameters of gaussian fit [m, s, n]
    :param str variable: name of variable for title
    :param int threshold: threshold to plot
    :param str sub_par: sub parameter for axis label
    '''
    import matplotlib.pyplot as plt

    plt.clf()
    plt.axes([0.1, 0.15, 0.85, 0.75])

    # a log axis cannot show zero, so empty bins get a small placeholder value
    floor_value = 0.01
    shown = np.array([floor_value if value == 0 else value for value in hist])
    plt.step(bincenters, shown, 'k-', label='standardised months', where='mid')

    # fitted Gaussian on top of the histogram
    curve = utils.gaussian(bincenters, gaussian)
    plt.plot(bincenters, curve, 'b-', label='Gaussian fit')

    # labels and scale
    plt.xlabel("%s offset (IQR)" % variable)
    plt.ylabel("Frequency (%s)" % sub_par)
    log_axes = plt.gca()
    log_axes.set_yscale('log')

    # outer (red) markers first, then the inner (orange) ones
    for colour, offset in (('r', threshold + 1), ('orange', threshold)):
        if colour == 'r':
            plt.axvline(-threshold - 1, c=colour)
        else:
            plt.axvline(-threshold, c=colour)
        plt.axvline(offset, c=colour)

    plt.ylim(ymin=0.1)
    plt.title("Climatological Gap Check - %s - %s" % (sub_par, variable))

    return  # coc_set_up_plot
Exemplo n.º 5
0
def plot_target_neigh_diffs_dist(differences, iqr):
    '''
    Show the distribution of target-neighbour differences.

    The unmasked differences are binned (bin width 1.0), drawn as a step
    histogram with a Gaussian fit on a log-scaled axis, and red vertical
    lines are placed at +/- 5 * iqr before the figure is displayed.

    :param array differences: masked difference array
    :param float iqr: inter quartile range of differences

    :returns: None
    '''
    import matplotlib.pyplot as plt

    plt.clf()

    bins, bincenters = utils.create_bins(differences.compressed(), 1.0)
    frequencies, _ = np.histogram(differences.compressed(), bins=bins)

    # replace empty bins by a small value so they survive the log scale
    log_safe = []
    for frequency in frequencies:
        if frequency != 0:
            log_safe.append(float(frequency))
        else:
            log_safe.append(1e-1)
    log_safe = np.array(log_safe)
    plt.step(bincenters, log_safe, 'k-', label='observations', where='mid')

    # Gaussian fit seeded with the histogram peak and the sample mean / std
    fit = utils.fit_gaussian(bincenters, frequencies, max(frequencies),
                             mu=np.mean(differences.compressed()),
                             sig=np.std(differences.compressed()))
    plt.plot(bincenters, utils.gaussian(bincenters, fit), 'b-', label='Gaussian fit')

    # +/- 5 IQR limits
    upper_limit = 5. * iqr
    lower_limit = -5. * iqr
    plt.axvline(upper_limit, c='r')
    plt.axvline(lower_limit, c='r')

    print("only shows lowest of monthly IQRs")

    plt.ylabel("Frequency")
    plt.gca().set_yscale('log')
    y_top = 2 * max(frequencies)
    plt.ylim([0.1, y_top])

    plt.show()

    return  # plot_target_neigh_diffs_dist
Exemplo n.º 6
0
def fvc_plot_setup(hist_data, hist, binEdges, xlabel, title=""):
    '''
    Draw a histogram and its Gaussian fit on a log-scaled frequency axis.

    :param array hist_data: raw values which have been binned to create hist
    :param array hist: values of histogram
    :param array binEdges: bin edges (one more entry than hist)
    :param str xlabel: label for x-axis
    :param str title: title of plot

    :returns:
        plot_hist - histogram values with zeros replaced by 0.1 for the log scale
        bincenters - locations of centres of bins
    '''
    import matplotlib.pyplot as plt

    # a log axis cannot show zero, so empty bins get a small placeholder value
    log_safe = []
    for value in hist:
        if value != 0:
            log_safe.append(float(value))
        else:
            log_safe.append(1e-1)
    plot_hist = np.array(log_safe)

    plt.clf()

    bincenters = 0.5 * (binEdges[1:] + binEdges[:-1])
    plt.step(bincenters, plot_hist, 'b-', label='observations', where='mid')

    # Gaussian fit seeded with the histogram peak and the sample mean / std
    fit = utils.fit_gaussian(bincenters, hist, max(hist),
                             mu=np.mean(hist_data), sig=np.std(hist_data))
    plt.plot(bincenters, utils.gaussian(bincenters, fit), 'r-', label='Gaussian fit')

    # labels, scale and limits
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    log_axes = plt.gca()
    log_axes.set_yscale('log')
    plt.ylim([0.1, 10000])
    plt.title(title)

    return plot_hist, bincenters  # fvc_plot_setup
Exemplo n.º 7
0
def wind_rose_check(station, flag_col, start, end, logfile, plots=False, diagnostics=False):
    """
    Checks for large differences in the year-to-year wind-rose shape.

    A normalised histogram of wind direction is built for the complete record
    and for each year, and the RMSE between the two is stored per year.  The
    annual RMSEs are binned, a Rician distribution (and a Gaussian, used only
    in the plots) is fitted, and the RMSE histogram is searched outwards from
    its peak for a gap two empty bins wide.  Years whose RMSE lies beyond that
    gap are flagged in station.qc_flags[:, flag_col] and in the flags of the
    wind speed and wind direction variables.

    :param MetStation station: station object
    :param int flag_col: which column to store the flags in
    :param datetime start: start of data
    :param datetime end: end of data
    :param file logfile: handed on to utils.print_flagged_obs_number
    :param bool plots: run the plots
    :param bool diagnostics: run the diagnostics

    """

    direction = station.winddirs.data
    speed = station.windspeeds.data
    flags = station.qc_flags[:, flag_col]

    # reshape the month index pairs to (years, 12 months, 2)
    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges_years = month_ranges.reshape(-1, 12, 2)

    # histogram of wind directions ( ~ unravelled wind-rose)
    bw = 20
    bins = range(0, 360 + bw, bw)
    # NOTE(review): direction is passed without .compressed(); presumably
    # np.histogram does not honour the mask - confirm masked values fall outside the bins
    full_hist, binEdges = np.histogram(direction, bins=bins, normed=True)

    # use rmse as this is known (Chi-sq remains just in case)
    # both start as masked arrays filled with -1, one entry per year
    rmse, chisq = -np.ma.ones([month_ranges_years.shape[0]]), -np.ma.ones([month_ranges_years.shape[0]])

    # run through each year to extract RMSE's
    # NOTE(review): the slice ends at year[-1][0]; if each pair is (start, end)
    # of a month this leaves out the final month of the year - confirm
    for y, year in enumerate(month_ranges_years):

        if len(direction[year[0][0] : year[-1][0]].compressed()) > 0:

            hist, binEdges = np.histogram(direction[year[0][0] : year[-1][0]], bins=bins, normed=True)

            chisq[y] = np.sum((full_hist - hist) ** 2 / (full_hist + hist)) / 2.0
            rmse[y] = np.sqrt(np.mean((full_hist - hist) ** 2))

        else:
            # no unmasked directions this year - exclude it from the fit
            rmse.mask[y] = True

    # now to bin up the differences and see what the fit is.
    # need to have values spread so can bin!
    if len(np.unique(rmse.compressed())) > 1:
        binEdges, bincenters = wind_create_bins(rmse)
        hist, binEdges = np.histogram(rmse, bins=binEdges)  # , density=True)

        norm = get_histogram_norm(rmse, binEdges)

        # inputs for fit
        mu = np.mean(rmse)
        std = np.std(rmse)

        # try to get decent fit to bulk of obs.
        #    initial_values = [np.max(hist), np.mean(rmse), np.std(rmse), stats.skew(rmse), stats.kurtosis(rmse)] # norm, mean, std, sk#ew, kurtosis
        #    fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))])
        #    res = utils.hermite2gauss(fit[0])
        #    plot_gaussian = utils.funcGH(fit[0], bincenters)

        # Rician fit to the unmasked RMSEs, scaled by norm to match the histogram
        fit = stats.rice.fit(rmse.compressed(), loc=0, scale=np.ma.std(rmse))
        dist_pdf = stats.rice.pdf(bincenters, fit[:-2], loc=fit[-2], scale=fit[-1]) * norm

        # Gaussian fit is only used by the plotting section below
        gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu=mu, sig=std)

        # invert Gaussian to find initial threshold, then hunt for first gap beyond
        # threshold = utils.invert_gaussian(PROB_THRESHOLD, gaussian)

        # invert Rician to find initial threshold, then hunt for first gap beyond
        if dist_pdf[-1] < PROB_THRESHOLD:
            # then curve has dropped below the threshold, so can find some updated ones.
            # NOTE(review): this stores a negated index into the reversed pdf, whereas
            # the else branch stores a bin-centre value and threshold is later compared
            # with bincenters - looks like bincenters[...] was intended; confirm
            threshold = -np.where(dist_pdf[::-1] > PROB_THRESHOLD)[0][0]
        else:
            threshold = bincenters[-1]

        # walk outwards (to larger RMSE) from the histogram peak
        n = 0
        center = np.argmax(hist)
        gap = bincenters[-1]  # nothing should be beyond this

        while True:
            if center + n + 1 == len(bincenters):
                # gone beyond edge - nothing to flag, so just break
                break

            if bincenters[center + n] < threshold:
                n += 1
                # continue moving outwards
                continue

            if hist[center + n] == 0:
                # found one
                if center + n + 1 == len(bincenters):
                    # gone beyond edge - nothing to flag - escape
                    break
                elif hist[center + n + 1] == 0:
                    # has to be two bins wide?
                    gap = bincenters[center + n]
                    break
            n += 1

        # run through each year and flag those whose RMSE lies beyond the gap
        for y, year in enumerate(month_ranges_years):

            if rmse[y] > gap:

                # only flag where there are observations
                # (either direction or speed unmasked)
                good, = np.where(
                    np.logical_or(
                        direction.mask[year[0][0] : year[-1][0]] == False, speed.mask[year[0][0] : year[-1][0]] == False
                    )
                )

                flags[year[0][0] : year[-1][0]][good] = 1

                if diagnostics or plots:
                    print "Flagging {}  RMSE {} > {}".format(y + start.year, rmse[y], gap)
            elif rmse.mask[y] == False:
                # year has data and was not flagged - just report the year
                if diagnostics or plots:
                    print "{}".format(y + start.year)

        if plots:
            import matplotlib.pyplot as plt

            # plot underlying histogram
            plt.clf()
            # zero counts replaced by 0.1 so they can be drawn on the log axis
            plot_hist = np.array([float(x) if x != 0 else 1e-1 for x in hist])
            plt.step(binEdges[1:], plot_hist, color="k")

            # plot the Rician distribution on top
            plt.plot(bincenters, dist_pdf, "r-", label="Rician")

            # plot the gaussian on top
            plt.plot(binEdges[1:], utils.gaussian(bincenters, gaussian), color="b", ls=":", label="Gaussian")
            plt.yscale("log")
            plt.ylim([0.001, 2 * max(plot_hist)])

            # plot the thresholds
            plt.axvline(threshold, color="g")
            plt.axvline(gap, color="r")

            # plot flagged values in different colour
            if len(rmse[rmse > gap]) > 0:
                plt.step(binEdges[1:][bincenters >= gap], plot_hist[bincenters >= gap], color="r")

            # prettify
            plt.xlabel("RMSE between complete record and each year")
            plt.ylabel("Frequency")
            plt.title(station.id + " annual wind rose differences")
            plt.xlim([0, 1.1 * np.ma.max(rmse)])
            plt.legend(loc="lower right", frameon=False)

            plt.show()

            # plot all the annual wind roses, flattened out.

            plt.clf()

            # hist, binEdges and bincenters are re-used here for the direction histogram
            hist, binEdges = np.histogram(direction, bins=np.arange(0.0, 360.0 + DEGREEBINS, DEGREEBINS), normed=True)
            bincenters = (binEdges[:-1] + binEdges[1:]) / 2.0
            plt.plot(bincenters, hist, "k-", lw=2)

            for y, year in enumerate(month_ranges_years):
                # NOTE(review): the "> 0" sits inside len(), so this tests the length of
                # a boolean array; it is still truthy exactly when there is unmasked data
                if len(speed[year[0][0] : year[-1][0]].compressed() > 0):
                    hist, binEdges = np.histogram(direction[year[0][0] : year[-1][0]], bins=binEdges, normed=True)
                    plt.plot(bincenters, hist)

            plt.xlabel("Direction (degrees)")
            plt.show()

            # plot wind roses as wind roses

            plot_wind_rose(speed, direction, "{} - {}".format(station.id, "all years"))

            for y, year in enumerate(month_ranges_years):
                if len(speed[year[0][0] : year[-1][0]].compressed() > 0):
                    plot_wind_rose(
                        speed[year[0][0] : year[-1][0]],
                        direction[year[0][0] : year[-1][0]],
                        "{} - {}".format(station.id, start.year + y),
                        label="RMSE {:6.4f}\nThreshold {:6.4f}".format(rmse[y], gap),
                    )

    # and apply the flags and output text

    flag_locs, = np.where(flags != 0)
    # when plotting or running diagnostics the count is reported with noWrite=True
    if plots or diagnostics:
        utils.print_flagged_obs_number(logfile, "Wind Rose Check", "windspeeds/dirs", len(flag_locs), noWrite=True)
    else:
        utils.print_flagged_obs_number(logfile, "Wind Rose Check", "windspeeds/dirs", len(flag_locs))

    station.qc_flags[:, flag_col] = flags

    # and flag the variables
    station.windspeeds.flags[flag_locs] = 1
    station.winddirs.flags[flag_locs] = 1

    return  # wind_rose_check
Exemplo n.º 8
0
def wind_rose_check(station, flag_col, start, end, logfile, plots = False, diagnostics = False, doMonth = False):
    '''
    Checks for large differences in the year-to-year wind-rose shape.

    A normalised histogram of the filtered wind directions is built for the
    complete record and compared with the histogram of each year via the RMSE.
    The annual RMSEs are binned, a Rician distribution (and a Gaussian, used
    only in the plots) is fitted, and the RMSE histogram is searched outwards
    from its peak for a gap two empty bins wide.  Years whose RMSE lies beyond
    that gap and which hold more than 100 observations are flagged.

    :param MetStation station: station object
    :param int flag_col: which column to store the flags in
    :param datetime start: start of data
    :param datetime end: end of data
    :param file logfile: written to directly and handed on to utils.print_flagged_obs_number
    :param bool plots: run the plots
    :param bool diagnostics: run the diagnostics
    :param bool doMonth: handed on to utils.apply_filter_flags

    '''
    st_var_spd = getattr(station, "windspeeds")
    st_var_dir = getattr(station, "winddirs")

    direction = st_var_dir.data
    speed = st_var_spd.data
    flags = station.qc_flags[:,flag_col]

    # reshape the month index pairs to (years, 12 months, 2)
    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges_years = month_ranges.reshape(-1,12,2)

    filtered_direction = utils.apply_filter_flags(st_var_dir, doMonth = doMonth, start = start, end = end)
    # NOTE(review): filtered_speed is not used anywhere else in this function
    filtered_speed = utils.apply_filter_flags(st_var_spd, doMonth = doMonth, start = start, end = end)

    # histogram of wind directions ( ~ unravelled wind-rose)
    dir_bins = range(0,360+DEGREEBINS,DEGREEBINS)
    full_hist, full_binEdges = np.histogram(filtered_direction.compressed(), bins = dir_bins, normed = True)
    
    if diagnostics:
        print full_hist

    # use rmse as this is known (Chi-sq remains just in case)
    # both start as masked arrays filled with -1, one entry per year
    rmse, chisq = -np.ma.ones([month_ranges_years.shape[0]]), -np.ma.ones([month_ranges_years.shape[0]])

    # run through each year to extract RMSE's
    # NOTE(review): the slice ends at year[-1][0]; if each pair is (start, end)
    # of a month this leaves out the final month of the year - confirm
    for y,year in enumerate(month_ranges_years):

        if len(direction[year[0][0]:year[-1][0]].compressed()) > 0:

            hist, dummy = np.histogram(direction[year[0][0]:year[-1][0]].compressed(),  bins = dir_bins, normed = True)

            chisq[y] = np.sum((full_hist-hist)**2/(full_hist+hist))/2.
            rmse[y] = np.sqrt(np.mean((full_hist-hist)**2))

        else:
            # no unmasked directions this year - exclude it from the fit
            rmse.mask[y] = True

    # now to bin up the differences and see what the fit is.
    # need to have values spread so can bin!
    if len(np.unique(rmse.compressed())) > 1:
        rmse_binEdges, rmse_bincenters = wind_create_bins(rmse)
        hist, rmse_binEdges = np.histogram(rmse,  bins = rmse_binEdges)#, density=True)

        norm = get_histogram_norm(rmse, rmse_binEdges)

        # inputs for fit
        mu = np.mean(rmse)
        std = np.std(rmse)

        # try to get decent fit to bulk of obs.
    #    initial_values = [np.max(hist), np.mean(rmse), np.std(rmse), stats.skew(rmse), stats.kurtosis(rmse)] # norm, mean, std, sk#ew, kurtosis
    #    fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))])
    #    res = utils.hermite2gauss(fit[0])
    #    plot_gaussian = utils.funcGH(fit[0], bincenters)

        # Rician fit to the unmasked RMSEs, scaled by norm to match the histogram
        fit = stats.rice.fit(rmse.compressed(), loc = 0, scale = np.ma.std(rmse))
        dist_pdf = stats.rice.pdf(rmse_bincenters, fit[:-2], loc=fit[-2], scale=fit[-1]) * norm

        # Gaussian fit is only used by the plotting section below
        gaussian = utils.fit_gaussian(rmse_bincenters, hist, max(hist), mu = mu, sig = std)

        # invert Gaussian to find initial threshold, then hunt for first gap beyond
        # threshold = utils.invert_gaussian(PROB_THRESHOLD, gaussian)

        # invert Rician to find initial threshold, then hunt for first gap beyond
        if dist_pdf[-1] < PROB_THRESHOLD:
            # then curve has dropped below the threshold, so can find some updated ones.
            # NOTE(review): this stores a negated index into the reversed pdf, whereas
            # the else branch stores a bin-centre value and threshold is later compared
            # with rmse_bincenters - looks like rmse_bincenters[...] was intended; confirm
            threshold = -np.where(dist_pdf[::-1] > PROB_THRESHOLD)[0][0]
        else:
            threshold = rmse_bincenters[-1]

        # walk outwards (to larger RMSE) from the histogram peak
        n = 0
        center = np.argmax(hist)
        gap = rmse_bincenters[-1] # nothing should be beyond this

        while True:
            if center + n + 1 == len(rmse_bincenters): 
                # gone beyond edge - nothing to flag, so just break
                break

            if rmse_bincenters[center + n] < threshold:
                n += 1
                # continue moving outwards
                continue

            if hist[center + n] == 0:
                # found one
                if center + n + 1 == len(rmse_bincenters):
                    # gone beyond edge - nothing to flag - escape
                    break
                elif hist[center + n + 1] == 0:
                    # has to be two bins wide?
                    gap = rmse_bincenters[center + n]
                    break
            n += 1

        # run through each year and flag those whose RMSE lies beyond the gap
        for y,year in enumerate(month_ranges_years):

                if rmse[y] > gap:

                    # only flag where there are observations
                    # (either direction or speed unmasked)
                    good, = np.where(np.logical_or(direction.mask[year[0][0]:year[-1][0]] == False, speed.mask[year[0][0]:year[-1][0]] == False))

                    # years with 100 or fewer observations are kept and only logged
                    if len(good) > 100:

                        flags[year[0][0]:year[-1][0]][good] = 1

                        if diagnostics or plots:
                            print "Flagging {}  RMSE {} > {}".format(y+start.year, rmse[y], gap)

                    else:
                        if diagnostics or plots:
                            print "{} beyond threshold (RMSE {} > {}) but retained as only {} observations\n".format(y+start.year, rmse[y], gap, len(good))
                        logfile.write("{} beyond threshold but retained as only {} observations\n".format(y+start.year, len(good)))
                            
                elif rmse.mask[y] == False: 
                    # year has data and was not flagged - just report the year
                    if diagnostics or plots:
                        print "{}".format(y+start.year)



        if plots:
            import matplotlib.pyplot as plt
            # plot underlying histogram
            plt.clf()
            # zero counts replaced by 0.1 so they can be drawn on the log axis
            plot_hist = np.array([float(x) if x != 0 else 1e-1 for x in hist])
            plt.step(rmse_binEdges[1:], plot_hist, color = 'k')

            # plot the Rician distribution on top
            plt.plot(rmse_bincenters, dist_pdf, "r-", label = "Rician") 

            # plot the gaussian on top
            plt.plot(rmse_binEdges[1:], utils.gaussian(rmse_bincenters, gaussian), color = 'b', ls = ":", label = "Gaussian")
            plt.yscale("log")
            plt.ylim([0.001, 2*max(plot_hist)])

            # plot the thresholds
            plt.axvline(threshold, color = 'g')
            plt.axvline(gap, color = 'r')

            # plot flagged values in different colour
            if len(rmse[rmse > gap]) > 0:
                plt.step(rmse_binEdges[1:][rmse_bincenters >= gap], plot_hist[rmse_bincenters >= gap], color = 'r')

            # prettify
            plt.xlabel("RMSE between complete record and each year")
            plt.ylabel("Frequency")
            plt.title(station.id + " annual wind rose differences")
            plt.xlim([0, 1.1*np.ma.max(rmse)])
            plt.legend(loc = "lower right", frameon = False)

            plt.show()


            # plot all the annual wind roses, flattened out.

            plt.clf()

            bincenters = (full_binEdges[:-1] + full_binEdges[1:])/2.
            plt.plot(bincenters, full_hist, "k-", lw = 2)

            for y,year in enumerate(month_ranges_years):
                # NOTE(review): the "> 0" sits inside len(), so this tests the length of
                # a boolean array; it is still truthy exactly when there is unmasked data
                if len(speed[year[0][0]:year[-1][0]].compressed() > 0):
                    hist, binEdges = np.histogram(direction[year[0][0]:year[-1][0]].compressed(),  bins = dir_bins, normed = True) 
                    plt.plot(bincenters, hist)

            plt.xlabel("Direction (degrees)")
            plt.show()

            # plot wind roses as wind roses

            plot_wind_rose(speed, direction, "{} - {}".format(station.id, "all years"))

            for y,year in enumerate(month_ranges_years):
                if len(speed[year[0][0]:year[-1][0]].compressed() > 0):
                    plot_wind_rose(speed[year[0][0]:year[-1][0]], direction[year[0][0]:year[-1][0]], "{} - {}".format(station.id, start.year + y), label = "RMSE {:6.4f}\nThreshold {:6.4f}".format(rmse[y], gap))
                else:
                    # prints the month index pairs of the year, not the calendar year
                    print "no data for {}".format(year)

    # and apply the flags and output text

    flag_locs, = np.where(flags != 0)

    # noWrite follows the diagnostics switch
    utils.print_flagged_obs_number(logfile, "Wind Rose Check", "windspeeds/dirs", len(flag_locs), noWrite=diagnostics)
    
    
    station.qc_flags[:,flag_col] = flags

    # and flag the variables
    station.windspeeds.flags[flag_locs] = 1
    station.winddirs.flags[flag_locs] = 1

    return # wind_rose_check
Exemplo n.º 9
0
def dgc_all_obs(station,
                variable,
                flags,
                start,
                end,
                plots=False,
                diagnostics=False,
                idl=False,
                windspeeds=False,
                GH=False):
    '''RJHD addition working on all observations'''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    all_filtered = utils.apply_filter_flags(st_var)

    for month in range(12):

        if windspeeds == True:
            st_var_wind = getattr(station, "windspeeds")

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):

                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        this_month_data = np.array([])
        this_month_filtered = np.array([])

        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)

        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(
                    this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(),
                                percentile=0.05)

                print "Spurious_stations file not yet sorted"

            if iqr != 0.0:
                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)

                bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(
                    monthly_values, BIN_SIZE / 10.)

                hist, binEdges = np.histogram(monthly_values, bins=bins)

                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    initial_values = [
                        np.max(hist),
                        np.mean(monthly_values),
                        np.std(monthly_values),
                        stats.skew(monthly_values),
                        stats.kurtosis(monthly_values)
                    ]  # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist,
                                   np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    good_values = np.argwhere(
                        plot_gaussian > FREQUENCY_THRESHOLD)

                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(
                        plot_bincenters[good_values[-1]])

                else:
                    gaussian = utils.fit_gaussian(bincenters,
                                                  hist,
                                                  max(hist),
                                                  mu=np.mean(monthly_values),
                                                  sig=np.std(monthly_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                if diagnostics:
                    if GH:
                        print hist
                        print res
                        print iqr, l_minimum_threshold, u_minimum_threshold

                    else:
                        print hist
                        print gaussian
                        print iqr, u_minimum_threshold, 1. + utils.invert_gaussian(
                            FREQUENCY_THRESHOLD, gaussian)

                if plots:
                    dgc_set_up_plot(plot_gaussian,
                                    monthly_values,
                                    variable,
                                    threshold=(u_minimum_threshold,
                                               l_minimum_threshold),
                                    sub_par="observations",
                                    GH=GH)

                    if GH:
                        plt.figtext(
                            0.15,
                            0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],
                             res['kurtosis']),
                            color='k',
                            size='small')

                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics: gap_plot_values = np.array([])

                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             u_minimum_threshold)

                    if gap_start != 0:

                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(
                                ((this_year_data - monthly_median) /
                                 iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             l_minimum_threshold)

                    if gap_start != 0:

                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(
                                np.logical_and(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start, this_year_data.mask != True))

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                            if windspeeds:
                                this_year_flags[
                                    gap_cleaned_locations] = 2  # tentative flags

                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                    MEAN)
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)
                                storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\
                                                   (((this_month_data - slp_average) / slp_mad) > 4.5))

                                if len(storms[0]) >= 2:

                                    storm_1diffs = np.diff(storms)

                                    separations = np.where(storm_1diffs != 1)

                                    #for sep in separations:

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             'r-',
                             label='flagged',
                             where='mid')
                    import calendar
                    plt.text(0.1,
                             0.9,
                             calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    plt.legend(loc='lower center',
                               ncol=3,
                               bbox_to_anchor=(0.5, -0.2),
                               frameon=False,
                               prop={'size': 13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')
    if diagnostics:
        utils.print_flagged_obs_number("",
                                       "Distributional Gap",
                                       variable,
                                       len(gap_plot_values),
                                       noWrite=True)

    return flags  # dgc_all_obs
Exemplo n.º 10
0
def dgc_monthly(station,
                variable,
                flags,
                start,
                end,
                plots=False,
                diagnostics=False,
                idl=False):
    '''
    Original Distributional Gap Check

    Works on monthly averages.  The averages of each calendar month are
    standardised against that month's climatology (taken from the filtered
    data) and whole months are flagged when, with at least MONTH_LIMIT
    standardised months available, either

    * the standardised offset is >= LARGE_LIMIT, or
    * walking the sorted offsets outwards from the middle in pairs finds a
      substantial asymmetry, in which case the months lying further out on
      the larger side are flagged.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (updated in place and returned)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (not used in this routine)
    :param bool idl: run IDL equivalent routines for median
    :returns: 
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)

    # one row per month; each row is used below as the two ends of a slice
    # into the data and flag arrays
    month_ranges = utils.month_starts_in_pairs(start, end)

    # get monthly averages - st_var.mdi is treated as "no value" below
    month_average = np.empty(month_ranges.shape[0])
    month_average.fill(st_var.mdi)
    month_average_filtered = np.empty(month_ranges.shape[0])
    month_average_filtered.fill(st_var.mdi)

    all_filtered = utils.apply_filter_flags(st_var)
    for m, month in enumerate(month_ranges):

        data = st_var.data[month[0]:month[1]]

        filtered = all_filtered[month[0]:month[1]]

        month_average[m] = dgc_get_monthly_averages(data, OBS_LIMIT,
                                                    st_var.mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(
            filtered, OBS_LIMIT, st_var.mdi, MEAN)

    # get overall monthly climatologies - use filtered data

    # twelve columns, i.e. one column per calendar month
    month_average = month_average.reshape(-1, 12)
    month_average_filtered = month_average_filtered.reshape(-1, 12)

    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(st_var.mdi)

    for m in range(12):

        valid_filtered = np.where(month_average_filtered[:, m] != st_var.mdi)

        if len(valid_filtered[0]) >= VALID_MONTHS:

            valid_data = month_average_filtered[valid_filtered, m][0]

            if MEAN:
                clim = np.mean(valid_data)
                # numpy's standard deviation is np.std; np.stdev does not
                # exist and raised AttributeError here
                spread = np.std(valid_data)

            else:
                if idl:
                    # valid_data is a plain ndarray, which has no
                    # .compressed() method; np.ma.compressed accepts both
                    # plain and masked arrays
                    clim = utils.idl_median(
                        np.ma.compressed(valid_data).reshape(-1))
                else:
                    clim = np.median(valid_data)
                spread = utils.IQR(valid_data)
                # floor on the spread so the division below stays bounded
                if spread <= SPREAD_LIMIT:
                    spread = SPREAD_LIMIT

            standardised_months[valid_filtered,
                                m] = (month_average[valid_filtered, m] -
                                      clim) / spread

    # back to one value per month, in the same order as month_ranges
    standardised_months = standardised_months.reshape(month_ranges.shape[0])

    good_months = np.where(standardised_months != st_var.mdi)

    # must be able to do this with masked arrays
    if plots:
        bins, bincenters = utils.create_bins(standardised_months[good_months],
                                             BIN_SIZE)
        dummy, plot_bincenters = utils.create_bins(
            standardised_months[good_months], BIN_SIZE / 10.)

        hist, binEdges = np.histogram(standardised_months[good_months],
                                      bins=bins)

        fit = utils.fit_gaussian(bincenters,
                                 hist,
                                 max(hist),
                                 mu=np.mean(standardised_months[good_months]),
                                 sig=np.std(standardised_months[good_months]))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        dgc_set_up_plot(plot_gaussian,
                        standardised_months[good_months],
                        variable,
                        sub_par="Months")

    # remove all months with a large standardised offset

    if len(good_months[0]) >= MONTH_LIMIT:

        standardised_months = np.ma.masked_values(standardised_months,
                                                  st_var.mdi)
        # NOTE(review): only positive offsets are tested here although the
        # plot marks both +5 and -5 - confirm this is intended
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:

            for lo in large_offsets[0]:
                flags[month_ranges[lo, 0]:month_ranges[lo, 1]] = 1

            if plots:

                hist, binEdges = np.histogram(
                    standardised_months[large_offsets], bins=bins)
                # zero counts replaced by 0.01 before plotting
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters,
                         plot_hist,
                         'g-',
                         label='> %i' % LARGE_LIMIT,
                         where='mid',
                         zorder=5)

                plt.axvline(5, c='g')
                plt.axvline(-5, c='g')

        # walk distribution from centre and see if any asymmetry
        sort_order = standardised_months[good_months].argsort()
        # sorted offsets, computed once rather than on every comparison
        sorted_values = standardised_months[good_months][sort_order]

        # floor division so the result is a valid index on Python 2 and 3
        mid_point = len(good_months[0]) // 2

        good = True
        offset = 1
        # offset stays below mid_point, so both indices remain inside the
        # array and the lower one can never wrap round to a negative index
        while good and offset < mid_point:

            lower = sorted_values[mid_point - offset]
            upper = sorted_values[mid_point + offset]

            if lower != upper:
                # using IDL notation
                tempvals = [np.abs(lower), np.abs(upper)]

                if min(tempvals) != 0:
                    if max(tempvals) / min(tempvals) >= 2. and min(
                            tempvals) >= 1.5:
                        # substantial asymmetry in distribution - at least 1.5 from centre and difference of 2.

                        # NOTE(review): the LHS slice stops before the
                        # element that triggered the test while the RHS
                        # slice includes it - confirm this is intended
                        if tempvals[0] == max(tempvals):
                            # LHS
                            bad = good_months[0][sort_order][:mid_point -
                                                             offset]
                            if plots:
                                badplot = sorted_values[:mid_point - offset]
                        elif tempvals[1] == max(tempvals):
                            #RHS
                            bad = good_months[0][sort_order][mid_point +
                                                             offset:]
                            if plots:
                                badplot = sorted_values[mid_point + offset:]

                        for b in bad:
                            flags[month_ranges[b, 0]:month_ranges[b, 1]] = 1

                        if plots:

                            hist, binEdges = np.histogram(badplot, bins=bins)
                            plot_hist = np.array(
                                [0.01 if h == 0 else h for h in hist])
                            plt.step(bincenters,
                                     plot_hist,
                                     'r-',
                                     label='Gap',
                                     where='mid',
                                     zorder=4)

                        # stop at the first asymmetric pair
                        good = False

            offset += 1

        if plots:
            plt.legend(loc='lower center',
                       ncol=4,
                       bbox_to_anchor=(0.5, -0.2),
                       frameon=False,
                       prop={'size': 13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap.png')

    return flags  # dgc_monthly
Exemplo n.º 11
0
def find_month_thresholds(obs_var,
                          station,
                          config_file,
                          plots=False,
                          diagnostics=False,
                          winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    For each month 1-12 the normalised anomalies returned by prepare_data
    are binned and a Gaussian is fitted to the histogram.  The thresholds
    are the bin edges closest to zero, on either side, at which the fitted
    curve drops below FREQUENCY_THRESHOLD.  Months with fewer than
    utils.DATA_COUNT_THRESHOLD unmasked values are skipped and nothing is
    written for them.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    for month in range(1, 13):

        normalised_anomalies = prepare_data(obs_var,
                                            station,
                                            month,
                                            diagnostics=diagnostics,
                                            winsorize=winsorize)

        valid_anomalies = normalised_anomalies.compressed()

        if len(valid_anomalies) < utils.DATA_COUNT_THRESHOLD:
            # not enough unmasked values to characterise this month
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH,
                                 obs_var.name)
        hist, _ = np.histogram(valid_anomalies, bins)

        # every bin is represented by its upper edge throughout
        upper_edges = bins[1:]

        gaussian_fit = utils.fit_gaussian(
            upper_edges,
            hist,
            max(hist),
            mu=bins[np.argmax(hist)],
            sig=utils.spread(normalised_anomalies))

        fitted_curve = utils.gaussian(upper_edges, gaussian_fit)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.step(upper_edges, hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.plot(upper_edges, fitted_curve)
            plt.ylim([0.1, max(hist) * 2])

        # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
        below_threshold = fitted_curve < FREQUENCY_THRESHOLD

        # Indexing an empty selection raises IndexError: the curve never
        # drops below the threshold on that side, so fall back to the
        # outermost edge.  Only IndexError is caught so that unrelated
        # failures (and KeyboardInterrupt) are no longer swallowed.
        try:
            lower_threshold = upper_edges[np.where(
                np.logical_and(below_threshold, upper_edges < 0))[0]][-1]
        except IndexError:
            lower_threshold = bins[1]
        try:
            upper_threshold = upper_edges[np.where(
                np.logical_and(below_threshold, upper_edges > 0))[0]][0]
        except IndexError:
            upper_threshold = bins[-1]

        if plots:
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")
            plt.show()

        utils.write_qc_config(config_file,
                              "CLIMATOLOGICAL-{}".format(obs_var.name),
                              "{}-uthresh".format(month),
                              "{}".format(upper_threshold),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file,
                              "CLIMATOLOGICAL-{}".format(obs_var.name),
                              "{}-lthresh".format(month),
                              "{}".format(lower_threshold),
                              diagnostics=diagnostics)

    return  # find_month_thresholds
Exemplo n.º 12
0
def dgc_all_obs(station, variable, flags, start, end, plots = False, diagnostics = False, idl = False, windspeeds = False, GH = False):
    '''
    RJHD addition working on all observations

    For each calendar month the observations of all years are pooled and
    standardised as (value - median) / IQR, with median and IQR taken from
    the filtered data.  A Gaussian (or, with GH, a Gauss-Hermite series) is
    fitted to the histogram of the standardised values to obtain an upper
    and a lower threshold.  If any values lie beyond a threshold,
    dgc_find_gap is asked where the gap starts and, when it returns a
    non-zero start, the filtered observations beyond it are flagged with 1.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (updated in place and returned)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: print diagnostic output
    :param bool idl: run IDL equivalent routines for median
    :param bool windspeeds: also read station.windspeeds for the (unfinished) storm test
    :param bool GH: use the Gauss-Hermite fit instead of the plain Gaussian
    :returns:
       flags - updated flag array
    '''
    
    if plots:
        import matplotlib.pyplot as plt

    st_var = getattr(station, variable)
    
    month_ranges = utils.month_starts_in_pairs(start, end)
    # indexed below as [year, calendar month, 2]; the last axis holds the
    # two ends of the slice for that month
    month_ranges = month_ranges.reshape(-1,12,2)
    
    all_filtered = utils.apply_filter_flags(st_var)

    # Bound before the loop so the diagnostics summary at the end cannot
    # raise NameError when no month gets as far as the gap search.
    # NOTE(review): it is reset for every processed month below, so the
    # summary only reports the last processed month - confirm intent.
    gap_plot_values = np.array([])
 
    for month in range(12):
    
        if windspeeds == True:
            st_var_wind = getattr(station, "windspeeds")
            
            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:,month,:]):
            
                if y == 0:
                    windspeeds_month = np.ma.array(st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate([windspeeds_month, st_var_wind.data[year[0]:year[1]]])
                  
            windspeeds_month_average = dgc_get_monthly_averages(windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(windspeeds_month, median=True)
    
        # pool this calendar month over all years
        this_month_data, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], st_var.data, hours = False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(month_ranges[:,month,:], all_filtered, hours = False)
                
        if len(this_month_filtered.compressed()) > OBS_LIMIT:
            
            if idl:
                monthly_median = utils.idl_median(this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)
                  
            iqr = utils.IQR(this_month_filtered.compressed())
            
            
            if iqr == 0.0:
                # to get some spread if IQR too small                   
                iqr = utils.IQR(this_month_filtered.compressed(), percentile = 0.05)
                
                # single-argument call form prints the same text on Python 2 and 3
                print("Spurious_stations file not yet sorted")
    

            if iqr != 0.0:               
                monthly_values = np.ma.array((this_month_data.compressed() - monthly_median) / iqr)

                bins, bincenters = utils.create_bins(monthly_values, BIN_SIZE)
                # finer bins, used for the fitted curve
                dummy, plot_bincenters = utils.create_bins(monthly_values, BIN_SIZE/10.)
        
                hist, binEdges = np.histogram(monthly_values, bins = bins)
                                               
                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    initial_values = [np.max(hist), np.mean(monthly_values), np.std(monthly_values), stats.skew(monthly_values), stats.kurtosis(monthly_values)] # norm, mean, std, skew, kurtosis
                    
                    fit = leastsq(utils.residualsGH, initial_values, [bincenters, hist, np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics = diagnostics)
                    
                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD/10.)
                    if len(bad) > 0: plot_gaussian[mid_point:][bad[0]:] = FREQUENCY_THRESHOLD/10.

                    bad, = np.where(plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD/10.)
                    if len(bad) > 0: plot_gaussian[:mid_point][:bad[-1]] = FREQUENCY_THRESHOLD/10.                   

                    # extract threshold values
                    good_values = np.argwhere(plot_gaussian > FREQUENCY_THRESHOLD)

                    l_minimum_threshold = round(plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(plot_bincenters[good_values[-1]])
                                      

                else:
                    gaussian = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(monthly_values), sig = np.std(monthly_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold


                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                if diagnostics:
                    # "%s" formatting reproduces the space-separated output of
                    # the former Python 2 print statements on both Python 2 and 3
                    if GH:
                        print(hist)
                        print(res)
                        print("%s %s %s" % (iqr, l_minimum_threshold, u_minimum_threshold))

                    else:
                        print(hist)
                        print(gaussian)
                        print("%s %s %s" % (iqr, u_minimum_threshold, 1. + utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian)))

                if plots:
                    dgc_set_up_plot(plot_gaussian, monthly_values, variable, threshold = (u_minimum_threshold, l_minimum_threshold), sub_par = "observations", GH = GH)
                     
                    if GH:
                        plt.figtext(0.15, 0.67, 'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %(res['mean'], res['dispersion'], res['skewness'], res['kurtosis']), color='k', size='small')

                    

                uppercount = len(np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(np.where(monthly_values < l_minimum_threshold)[0])
                
                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics: gap_plot_values = np.array([])

                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges, u_minimum_threshold)
                        
                    if gap_start != 0:
                        
                        for year in month_ranges[:,month,:]:
                
                            this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                            # np.array copies, so the slice is written back explicitly below
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            # NOTE(review): unlike the lower-tail test below there is no
                            # explicit mask check here - confirm masked values cannot be selected
                            gap_cleaned_locations = np.where(((this_year_data - monthly_median) / iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics: gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)


                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges, l_minimum_threshold)
                        
                    if gap_start != 0:

                        for year in month_ranges[:,month,:]:
                
                            this_year_data = np.ma.array(all_filtered[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.where(np.logical_and(((this_year_data - monthly_median) / iqr) < gap_start, this_year_data.mask != True))

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics: gap_plot_values = np.append(gap_plot_values, (this_year_data[gap_cleaned_locations].compressed() - monthly_median)/iqr)
                    

                            if windspeeds:
                                # NOTE(review): this_year_flags is a copy which has already been
                                # written back above, so the tentative value set here never
                                # reaches flags; the storm test below is unfinished
                                this_year_flags[gap_cleaned_locations] = 2 # tentative flags
                                
                                slp_average = dgc_get_monthly_averages(this_month_data, OBS_LIMIT, st_var.mdi, MEAN)
                                slp_mad = utils.mean_absolute_deviation(this_month_data, median=True)
                                storms = np.where((((windspeeds_month - windspeeds_month_average) / windspeeds_month_mad) > 4.5) &\
                                                   (((this_month_data - slp_average) / slp_mad) > 4.5))
                                
                                if len(storms[0]) >= 2:
                                    
                                    storm_1diffs = np.diff(storms)
                                    
                                    separations = np.where(storm_1diffs != 1)

                                    #for sep in separations:


                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins = bins)
                    # zero counts replaced by 0.01 before plotting
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters, plot_hist, 'r-', label = 'flagged', where='mid')
                    import calendar
                    plt.text(0.1,0.9,calendar.month_name[month+1], transform = plt.gca().transAxes)
                    plt.legend(loc='lower center',ncol=3, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')
    if diagnostics:
        utils.print_flagged_obs_number("", "Distributional Gap", variable, len(gap_plot_values), noWrite=True)

    return flags # dgc_all_obs
Exemplo n.º 13
0
def dgc_monthly(station, variable, flags, start, end, plots=False, diagnostics=False, idl = False):
    '''
    Original Distributional Gap Check

    Works on monthly averages.  The averages of each calendar month are
    standardised against that month's climatology (taken from the filtered
    data) and whole months are flagged when, with at least MONTH_LIMIT
    standardised months available, either

    * the standardised offset is >= LARGE_LIMIT, or
    * walking the sorted offsets outwards from the middle in pairs finds a
      substantial asymmetry, in which case the months lying further out on
      the larger side are flagged.

    :param obj station: station object
    :param str variable: variable to act on
    :param array flags: flags array (updated in place and returned)
    :param datetime start: data start
    :param datetime end: data end
    :param bool plots: run plots
    :param bool diagnostics: run diagnostics (not used in this routine)
    :param bool idl: run IDL equivalent routines for median
    :returns: 
       flags - updated flag array
    '''

    if plots:
        import matplotlib.pyplot as plt
    
    st_var = getattr(station, variable)
    
    # one row per month; each row is used below as the two ends of a slice
    # into the data and flag arrays
    month_ranges = utils.month_starts_in_pairs(start, end)
    
    # get monthly averages - st_var.mdi is treated as "no value" below
    month_average = np.empty(month_ranges.shape[0])
    month_average.fill(st_var.mdi)
    month_average_filtered = np.empty(month_ranges.shape[0])
    month_average_filtered.fill(st_var.mdi)
    
    all_filtered = utils.apply_filter_flags(st_var)
    for m, month in enumerate(month_ranges):
        
        data = st_var.data[month[0]:month[1]]
        
        filtered = all_filtered[month[0]:month[1]]
        
        month_average[m] = dgc_get_monthly_averages(data, OBS_LIMIT, st_var.mdi, MEAN)
        month_average_filtered[m] = dgc_get_monthly_averages(filtered, OBS_LIMIT, st_var.mdi, MEAN)
            
    # get overall monthly climatologies - use filtered data
    
    # twelve columns, i.e. one column per calendar month
    month_average = month_average.reshape(-1,12)
    month_average_filtered = month_average_filtered.reshape(-1,12)
    
    standardised_months = np.empty(month_average.shape)
    standardised_months.fill(st_var.mdi)
    
    for m in range(12):
        
        valid_filtered = np.where(month_average_filtered[:,m] != st_var.mdi)
        
        if len(valid_filtered[0]) >= VALID_MONTHS:
            
            valid_data = month_average_filtered[valid_filtered,m][0]
            
            if MEAN:
                clim = np.mean(valid_data)
                # numpy's standard deviation is np.std; np.stdev does not
                # exist and raised AttributeError here
                spread = np.std(valid_data)
                
            else:        
                if idl:
                    # valid_data is a plain ndarray, which has no
                    # .compressed() method; np.ma.compressed accepts both
                    # plain and masked arrays
                    clim = utils.idl_median(np.ma.compressed(valid_data).reshape(-1))
                else:
                    clim = np.median(valid_data)
                spread = utils.IQR(valid_data)
                # floor on the spread so the division below stays bounded
                if spread <= SPREAD_LIMIT:
                    spread = SPREAD_LIMIT
                    
            standardised_months[valid_filtered,m] = (month_average[valid_filtered,m] - clim) / spread 
                    
    # back to one value per month, in the same order as month_ranges
    standardised_months = standardised_months.reshape(month_ranges.shape[0]) 
    
    good_months = np.where(standardised_months != st_var.mdi)

    # must be able to do this with masked arrays
    if plots:
        bins, bincenters = utils.create_bins(standardised_months[good_months], BIN_SIZE)
        dummy, plot_bincenters = utils.create_bins(standardised_months[good_months], BIN_SIZE/10.)

        hist, binEdges = np.histogram(standardised_months[good_months], bins = bins)   

        fit = utils.fit_gaussian(bincenters, hist, max(hist), mu = np.mean(standardised_months[good_months]), sig = np.std(standardised_months[good_months]))
        plot_gaussian = utils.gaussian(plot_bincenters, fit)

        dgc_set_up_plot(plot_gaussian, standardised_months[good_months], variable, sub_par = "Months")
        
    # remove all months with a large standardised offset
        
    if len(good_months[0]) >= MONTH_LIMIT:
                
        standardised_months = np.ma.masked_values(standardised_months, st_var.mdi)
        # NOTE(review): only positive offsets are tested here although the
        # plot marks both +5 and -5 - confirm this is intended
        large_offsets = np.where(standardised_months >= LARGE_LIMIT)

        if len(large_offsets[0]) > 0:
            
            for lo in large_offsets[0]:
                flags[month_ranges[lo,0]:month_ranges[lo,1]] = 1
                
            if plots:
                
                hist, binEdges = np.histogram(standardised_months[large_offsets], bins = bins)
                # zero counts replaced by 0.01 before plotting
                plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                plt.step(bincenters, plot_hist, 'g-', label = '> %i' % LARGE_LIMIT, where = 'mid', zorder = 5)
                
                plt.axvline(5,c='g')
                plt.axvline(-5,c='g')

        # walk distribution from centre and see if any asymmetry
        sort_order = standardised_months[good_months].argsort()
        # sorted offsets, computed once rather than on every comparison
        sorted_values = standardised_months[good_months][sort_order]

        # floor division so the result is a valid index on Python 2 and 3
        mid_point = len(good_months[0]) // 2
        
        good = True
        offset = 1
        # offset stays below mid_point, so both indices remain inside the
        # array and the lower one can never wrap round to a negative index
        while good and offset < mid_point:

            lower = sorted_values[mid_point - offset]
            upper = sorted_values[mid_point + offset]
            
            if lower != upper:
                # using IDL notation
                tempvals = [np.abs(lower), np.abs(upper)]
                
                if min(tempvals) != 0:
                    if max(tempvals)/min(tempvals) >= 2. and min(tempvals) >= 1.5:
                        # substantial asymmetry in distribution - at least 1.5 from centre and difference of 2.
                        
                        # NOTE(review): the LHS slice stops before the
                        # element that triggered the test while the RHS
                        # slice includes it - confirm this is intended
                        if tempvals[0] == max(tempvals):
                            # LHS
                            bad = good_months[0][sort_order][:mid_point - offset]
                            if plots: badplot = sorted_values[:mid_point - offset]
                        elif tempvals[1] == max(tempvals):
                            #RHS
                            bad = good_months[0][sort_order][mid_point + offset:]
                            if plots: badplot = sorted_values[mid_point + offset:]
                            
                        for b in bad:
                            flags[month_ranges[b,0]:month_ranges[b,1]] = 1
                
                        if plots:
                            
                            hist, binEdges = np.histogram(badplot, bins = bins)
                            plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                            plt.step(bincenters, plot_hist, 'r-', label = 'Gap', where = 'mid', zorder = 4)
                
                        # stop at the first asymmetric pair
                        good = False        
                            
            offset += 1
                          
        if plots: 
            plt.legend(loc='lower center',ncol=4, bbox_to_anchor=(0.5,-0.2),frameon=False,prop={'size':13})
            plt.show()
            #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap.png')
                   
    return flags # dgc_monthly
Exemplo n.º 14
0
def dgc_all_obs(station,
                variable,
                flags,
                start,
                end,
                logfile,
                plots=False,
                diagnostics=False,
                idl=False,
                windspeeds=False,
                GH=False,
                doMonth=False):
    '''RJHD addition working on all observations'''

    if plots:
        import matplotlib.pyplot as plt

    month_ranges = utils.month_starts_in_pairs(start, end)
    month_ranges = month_ranges.reshape(-1, 12, 2)

    # extract variable
    st_var = getattr(station, variable)
    # apply flags (and mask incomplete year if appropriate)
    all_filtered = utils.apply_filter_flags(st_var,
                                            doMonth=doMonth,
                                            start=start,
                                            end=end)

    st_var_complete_year = copy.deepcopy(st_var)
    if doMonth:
        # restrict the incomplete year if appropriate - keep other flagged obs.
        full_year_end = utils.get_first_hour_this_year(start, end)
        st_var_complete_year.data.mask[full_year_end:] = True

    for month in range(12):

        # if requiring wind data, extract data and find monthly averages
        if windspeeds == True:
            st_var_wind = getattr(station, "windspeeds")

            if doMonth:
                # restrict the incomplete year if appropriate
                st_var_wind.data.mask[full_year_end:] = True

            # get monthly averages
            windspeeds_month = np.empty([])
            for y, year in enumerate(month_ranges[:, month, :]):

                if y == 0:
                    windspeeds_month = np.ma.array(
                        st_var_wind.data[year[0]:year[1]])
                else:
                    windspeeds_month = np.ma.concatenate(
                        [windspeeds_month, st_var_wind.data[year[0]:year[1]]])

            windspeeds_month_average = dgc_get_monthly_averages(
                windspeeds_month, OBS_LIMIT, st_var_wind.mdi, MEAN)
            windspeeds_month_mad = utils.mean_absolute_deviation(
                windspeeds_month, median=True)

        # pull data from each calendar month together
        this_month_data, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var.data, hours=False)
        this_month_filtered, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], all_filtered, hours=False)
        this_month_complete, dummy, dummy = utils.concatenate_months(
            month_ranges[:, month, :], st_var_complete_year.data, hours=False)

        # if enough clean and complete data for this calendar month find the median and IQR
        if len(this_month_filtered.compressed()) > OBS_LIMIT:

            if idl:
                monthly_median = utils.idl_median(
                    this_month_filtered.compressed().reshape(-1))
            else:
                monthly_median = np.ma.median(this_month_filtered)

            iqr = utils.IQR(this_month_filtered.compressed())

            if iqr == 0.0:
                # to get some spread if IQR too small
                iqr = utils.IQR(this_month_filtered.compressed(),
                                percentile=0.05)
                print "Spurious_stations file not yet sorted"

            # if have an IQR, anomalise using median and standardise using IQR
            if iqr != 0.0:

                monthly_values = np.ma.array(
                    (this_month_data.compressed() - monthly_median) / iqr)
                complete_values = np.ma.array(
                    (this_month_complete.compressed() - monthly_median) / iqr)

                # use complete years only for the histogram - aiming to find outliers.
                bins, bincenters = utils.create_bins(complete_values, BIN_SIZE)
                dummy, plot_bincenters = utils.create_bins(
                    complete_values, BIN_SIZE / 10.)
                hist, binEdges = np.histogram(complete_values, bins=bins)
                """
                Change to monthly updates Oct 2017
                Thought about changing distribution to use filtered values
                But this changes the test beyond just dealing with additional months
                Commented out lines below would be alternative.
                """
                # bins, bincenters = utils.create_bins(filtered_values, BIN_SIZE)
                # dummy, plot_bincenters = utils.create_bins(filtered_values, BIN_SIZE/10.)
                # hist, binEdges = np.histogram(filtered_values, bins = bins)

                # used filtered (incl. incomplete year mask) to determine the distribution.
                if GH:
                    # Use Gauss-Hermite polynomials to add skew and kurtosis to Gaussian fit - January 2015 ^RJHD

                    # Feb 2019 - if large amounts off centre, can affect initial values
                    # switched to median and MAD
                    initial_values = [
                        np.max(hist),
                        np.median(complete_values),
                        utils.mean_absolute_deviation(complete_values,
                                                      median=True),
                        stats.skew(complete_values),
                        stats.kurtosis(complete_values)
                    ]  # norm, mean, std, skew, kurtosis

                    fit = leastsq(utils.residualsGH, initial_values,
                                  [bincenters, hist,
                                   np.ones(len(hist))])
                    res = utils.hermite2gauss(fit[0], diagnostics=diagnostics)

                    plot_gaussian = utils.funcGH(fit[0], plot_bincenters)

                    # adjust to remove the rising bumps seen in some fits - artefacts of GH fitting?
                    mid_point = np.argmax(plot_gaussian)
                    bad, = np.where(
                        plot_gaussian[mid_point:] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[mid_point:][
                            bad[0]:] = FREQUENCY_THRESHOLD / 10.

                    bad, = np.where(
                        plot_gaussian[:mid_point] < FREQUENCY_THRESHOLD / 10.)
                    if len(bad) > 0:
                        plot_gaussian[:mid_point][:bad[
                            -1]] = FREQUENCY_THRESHOLD / 10.

                    # extract threshold values
                    good_values = np.argwhere(
                        plot_gaussian > FREQUENCY_THRESHOLD)

                    l_minimum_threshold = round(
                        plot_bincenters[good_values[0]]) - 1
                    u_minimum_threshold = 1 + round(
                        plot_bincenters[good_values[-1]])

                    if diagnostics:
                        print hist
                        print res
                        print iqr, l_minimum_threshold, u_minimum_threshold

                # or just a standard Gaussian
                else:
                    gaussian = utils.fit_gaussian(
                        bincenters,
                        hist,
                        max(hist),
                        mu=np.median(complete_values),
                        sig=utils.mean_absolute_value(complete_values))

                    # assume the same threshold value
                    u_minimum_threshold = 1 + round(
                        utils.invert_gaussian(FREQUENCY_THRESHOLD, gaussian))
                    l_minimum_threshold = -u_minimum_threshold

                    plot_gaussian = utils.gaussian(plot_bincenters, gaussian)

                    if diagnostics:
                        print hist
                        print gaussian
                        print iqr, u_minimum_threshold, 1. + utils.invert_gaussian(
                            FREQUENCY_THRESHOLD, gaussian)

                if plots:
                    dgc_set_up_plot(plot_gaussian,
                                    complete_values,
                                    variable,
                                    threshold=(u_minimum_threshold,
                                               l_minimum_threshold),
                                    sub_par="observations",
                                    GH=GH)

                    if GH:
                        plt.figtext(
                            0.15,
                            0.67,
                            'Mean %.2f, S.d. %.2f,\nSkew %.2f, Kurtosis %.2f' %
                            (res['mean'], res['dispersion'], res['skewness'],
                             res['kurtosis']),
                            color='k',
                            size='small')

                # now trying to find gaps in the distribution
                uppercount = len(
                    np.where(monthly_values > u_minimum_threshold)[0])
                lowercount = len(
                    np.where(monthly_values < l_minimum_threshold)[0])

                # this needs refactoring - but lots of variables to pass in
                if plots or diagnostics: gap_plot_values = np.array([])

                # do one side of distribution and then other
                if uppercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             u_minimum_threshold)

                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            # not using filtered - checking all available data
                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.ma.where(
                                ((this_year_data - monthly_median) /
                                 iqr) > gap_start)

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Upper {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0]))
                                    print gap_cleaned_locations, this_year_data[
                                        gap_cleaned_locations]

                if lowercount > 0:
                    gap_start = dgc_find_gap(hist, binEdges,
                                             l_minimum_threshold)

                    if gap_start != 0:

                        # if found a gap, then go through each year for this calendar month
                        #  and flag observations further from middle
                        for y, year in enumerate(month_ranges[:, month, :]):

                            this_year_data = np.ma.array(
                                st_var.data[year[0]:year[1]])
                            this_year_flags = np.array(flags[year[0]:year[1]])
                            gap_cleaned_locations = np.ma.where(
                                np.logical_and(
                                    ((this_year_data - monthly_median) / iqr) <
                                    gap_start, this_year_data.mask != True))
                            # add flag requirement for low pressure bit if appropriate

                            this_year_flags[gap_cleaned_locations] = 1
                            flags[year[0]:year[1]] = this_year_flags

                            if plots or diagnostics:
                                gap_plot_values = np.append(
                                    gap_plot_values,
                                    (this_year_data[gap_cleaned_locations].
                                     compressed() - monthly_median) / iqr)

                                if len(gap_cleaned_locations[0]) > 0:
                                    print "Lower {}-{} - {} obs flagged".format(
                                        y + start.year, month,
                                        len(gap_cleaned_locations[0]))
                                    print gap_cleaned_locations, this_year_data[
                                        gap_cleaned_locations]

                            # if doing SLP then do extra checks for storms
                            if windspeeds:
                                windspeeds_year = np.ma.array(
                                    st_var_wind.data[year[0]:year[1]])

                                this_year_flags[
                                    gap_cleaned_locations] = 2  # tentative flags

                                slp_average = dgc_get_monthly_averages(
                                    this_month_data, OBS_LIMIT, st_var.mdi,
                                    MEAN)
                                slp_mad = utils.mean_absolute_deviation(
                                    this_month_data, median=True)

                                # need to ensure that this_year_data is less than slp_average, hence order of test
                                storms, = np.ma.where((((windspeeds_year - windspeeds_month_average) / windspeeds_month_mad) > MAD_THRESHOLD) &\
                                                   (((slp_average - this_year_data) / slp_mad) > MAD_THRESHOLD))

                                # using IDL terminology
                                if len(storms) >= 2:
                                    # use the first difference series to find when there are gaps in
                                    # contiguous sequences of storm observations - want to split up into
                                    # separate storm events
                                    storm_1diffs = np.diff(storms)
                                    separations, = np.where(storm_1diffs != 1)

                                    # expand around storm signal so that all low SLP values covered, and unflagged
                                    if len(separations) >= 1:
                                        print "  multiple storms in {} {}".format(
                                            y + start.year, month)

                                        # if more than one storm signal that month, then use intervals
                                        #    in the first difference series to expand around the first interval alone
                                        storm_start = 0
                                        storm_finish = separations[0] + 1
                                        first_storm = dgc_expand_storms(
                                            storms[storm_start:storm_finish],
                                            len(this_year_data))
                                        final_storms = copy.deepcopy(
                                            first_storm)

                                        for j in range(len(separations)):
                                            # then do the rest in a loop

                                            if j + 1 == len(separations):
                                                # final one
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                           1:],
                                                    len(this_year_data))
                                            else:
                                                this_storm = dgc_expand_storms(
                                                    storms[separations[j] +
                                                           1:separations[j +
                                                                         1] +
                                                           1],
                                                    len(this_year_data))

                                            final_storms = np.append(
                                                final_storms, this_storm)

                                    else:
                                        # else just expand around the signal by 6 hours either way
                                        final_storms = dgc_expand_storms(
                                            storms, len(this_year_data))

                                else:
                                    final_storms = storms

                                if len(storms) >= 1:
                                    print "Tropical Storm signal in {} {}".format(
                                        y + start.year, month)
                                    this_year_flags[final_storms] = 0

                            # and write flags back into array
                            flags[year[0]:year[1]] = this_year_flags

                if plots:
                    hist, binEdges = np.histogram(gap_plot_values, bins=bins)
                    plot_hist = np.array([0.01 if h == 0 else h for h in hist])
                    plt.step(bincenters,
                             plot_hist,
                             'r-',
                             label='flagged',
                             where='mid')
                    import calendar
                    plt.text(0.1,
                             0.9,
                             calendar.month_name[month + 1],
                             transform=plt.gca().transAxes)
                    plt.legend(loc='lower center',
                               ncol=3,
                               bbox_to_anchor=(0.5, -0.2),
                               frameon=False,
                               prop={'size': 13})
                    plt.show()
                    #plt.savefig(IMAGELOCATION+'/'+station.id+'_DistributionalGap_'+str(month+1)+'.png')

    nflags, = np.where(flags != 0)
    utils.print_flagged_obs_number(logfile,
                                   "Distributional Gap All",
                                   variable,
                                   len(nflags),
                                   noWrite=diagnostics)

    return flags  # dgc_all_obs