示例#1
0
def get_repeating_string_threshold(obs_var,
                                   config_file,
                                   plots=False,
                                   diagnostics=False):
    """
    Use the distribution of repeating-string lengths to determine a
    threshold value, and store it in the configuration file.

    :param MetVar obs_var: meteorological variable object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # work on a copy so masking calm spells does not alter the caller's data
    this_var = copy.deepcopy(obs_var)

    # calm periods could be a legitimately repeating string, so mask them out
    if obs_var.name == "wind_speed":
        calms, = np.ma.where(this_var.data == 0)
        this_var.data[calms] = utils.MDI
        this_var.data.mask[calms] = True

    if len(this_var.data.compressed()) <= 1:
        # insufficient data: store a value so high the threshold is never reached
        threshold = -utils.MDI
    else:
        string_lengths, _, _ = prepare_data_repeating_string(
            this_var, plots=plots, diagnostics=diagnostics)

        # bin width is 1 as dealing in time index.
        # minimum bin value is 2 as this is the shortest string possible
        threshold = utils.get_critical_values(string_lengths,
                                              binmin=2,
                                              binwidth=1.0,
                                              plots=plots,
                                              diagnostics=diagnostics,
                                              title=this_var.name.capitalize(),
                                              xlabel="Repeating string length")

    # write out the threshold (same key in either case)
    utils.write_qc_config(config_file,
                          "STREAK-{}".format(this_var.name),
                          "Straight",
                          "{}".format(threshold),
                          diagnostics=diagnostics)

    return  # get_repeating_string_threshold
示例#2
0
def find_thresholds(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False,
                    winsorize=True):
    """
    Use distribution of monthly variances to identify threshold values,
    and store them in the configuration file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    # process each calendar month separately
    for month in range(1, 13):

        variances = prepare_data(obs_var,
                                 station,
                                 month,
                                 diagnostics=diagnostics,
                                 winsorize=winsorize)

        # only characterise the distribution with sufficient variances,
        # otherwise store missing-data indicators
        if len(variances.compressed()) < MIN_VARIANCES:
            average_variance = utils.MDI
            variance_spread = utils.MDI
        else:
            average_variance = utils.average(variances)
            variance_spread = utils.spread(variances)

        for key, value in (("{}-average".format(month), average_variance),
                           ("{}-spread".format(month), variance_spread)):
            utils.write_qc_config(config_file,
                                  "VARIANCE-{}".format(obs_var.name),
                                  key,
                                  "{}".format(value),
                                  diagnostics=diagnostics)

    return  # find_thresholds
示例#3
0
def find_monthly_scaling(obs_var, station, config_file, diagnostics=False):
    """
    Find scaling parameters (climatology and spread) for monthly values
    and store them in the configuration file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool diagnostics: turn on diagnostic output
    """
    # NOTE: removed dead code — `all_years = np.unique(station.years)` was
    # computed but never used.

    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var,
                                              station,
                                              month,
                                              diagnostics=diagnostics)

        if len(month_averages.compressed()) >= VALID_MONTHS:
            # have months, now to standardise
            climatology = utils.average(month_averages)  # mean
            spread = utils.spread(month_averages)  # IQR currently
            # guard against a degenerate (near-zero) spread
            if spread < SPREAD_LIMIT:
                spread = SPREAD_LIMIT
        else:
            # insufficient months: store missing-data indicators instead
            climatology = utils.MDI
            spread = utils.MDI

        # write out the scaling...
        utils.write_qc_config(config_file,
                              "MDISTRIBUTION-{}".format(obs_var.name),
                              "{}-clim".format(month),
                              "{}".format(climatology),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file,
                              "MDISTRIBUTION-{}".format(obs_var.name),
                              "{}-spread".format(month),
                              "{}".format(spread),
                              diagnostics=diagnostics)

    return  # find_monthly_scaling
示例#4
0
def identify_values(sealp,
                    stnlp,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Find average and spread of the sea-level minus station-level pressure
    differences and store them in the configuration file.

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    pressure_diffs = sealp.data - stnlp.data

    # need sufficient unmasked observations to characterise the distribution
    if len(pressure_diffs.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        average = utils.average(pressure_diffs)
        # clip the spread into the allowed [MIN_SPREAD, MAX_SPREAD] hPa range
        spread = min(max(utils.spread(pressure_diffs), MIN_SPREAD), MAX_SPREAD)

        for key, value in (("average", average), ("spread", spread)):
            utils.write_qc_config(config_file,
                                  "PRESSURE",
                                  key,
                                  "{}".format(value),
                                  diagnostics=diagnostics)

    return  # identify_values
示例#5
0
def find_offset(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Find the best offset for a sine curve to represent the diurnal cycle,
    and store it in the configuration file.

    :param MetVar obs_var: Meteorological Variable object
    :param Station station: Station Object for the station
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output

    :returns: (best_fit_diurnal, best_fit_uncertainty) per-day fit arrays
        as produced by prepare_data.
    """

    best_fit_diurnal, best_fit_uncertainty = prepare_data(station, obs_var)

    # done complete record, have best fit for each day
    # now to find best overall fit.
    #    find median offset for each uncertainty range from 1 to 6 hours
    best_fits = MISSING*np.ones(6).astype(int)
    for h in range(6):
        # days whose fit uncertainty is exactly h+1 hours
        locs, = np.where(best_fit_uncertainty == h+1)

        # only trust the median if enough days contribute
        if len(locs) >= utils.DATA_COUNT_THRESHOLD:
            best_fits[h] = np.median(best_fit_diurnal[locs])

    # now go through each of the 6hrs of uncertainty and see if the range
    # of the best fit +/- uncertainty overlap across them.
    # if they do, it's a well defined cycle, if not, then there's a problem

    '''Build up range of cycles incl, uncertainty to find where best of best located'''

    hours = np.arange(24)
    hour_matches = np.zeros(24)
    diurnal_peak = MISSING
    number_estimates = 0
    for h in range(6):
        if best_fits[h] != MISSING:
            '''Store lowest uncertainty best fit as first guess'''
            if diurnal_peak == MISSING: 
                diurnal_peak = best_fits[h]
                # rotate so the first-guess peak sits at index 11 of `hours`
                hours = np.roll(hours, 11-int(diurnal_peak))
                hour_matches[11-(h+1):11+(h+2)] = 1
                number_estimates += 1
                # NOTE(review): this first estimate is then ALSO processed by
                # the generic block below in the same iteration (hour_matches
                # incremented again over the same span and number_estimates
                # bumped a second time) — confirm the double-count is intended.

            # get spread of uncertainty, and +1 to this range 
            centre, = np.where(hours == best_fits[h])

            # increment the match counts over [centre - (h+1), centre + (h+1)],
            # wrapping around the 24-hour array where the window spills over
            if (centre[0] - (h + 1)) >= 0:
                if (centre[0] + h + 1) <= 23:
                    hour_matches[centre[0] - (h + 1) : centre[0] + (h + 2)] += 1
                else:
                    hour_matches[centre[0] - (h + 1) : ] += 1 # back part
                    hour_matches[ : centre[0] + (h + 2) - 24] += 1 # front part
            else:
                hour_matches[: centre[0] + h + 2] += 1 # front part
                hour_matches[centre[0] - (h + 1) :] += 1 # back part

            number_estimates += 1

    '''If value at lowest uncertainty not found in all others, then see what value is found by all others '''
    if hour_matches[11] != number_estimates:  # central estimate at 12 o'clock
        # find any hour supported by every contributing estimate
        all_match, = np.where(hour_matches == number_estimates)

        # if one is, then use it
        if len(all_match) > 0:
            diurnal_peak = all_match[0]
        else:
            # no consensus across the uncertainty ranges
            diurnal_peak = MISSING
            if diagnostics:
                print("Good fit to diurnal cycle not found")

    '''Now have value for best fit diurnal offset'''
    utils.write_qc_config(config_file, "DIURNAL-{}".format(obs_var.name), "peak", "{}".format(diurnal_peak), diagnostics=diagnostics)


    return best_fit_diurnal, best_fit_uncertainty # find_offset
示例#6
0
def get_critical_values(obs_var,
                        times,
                        config_file,
                        plots=False,
                        diagnostics=False):
    """
    Use distribution of first differences to determine critical (spike)
    values for each unique time separation.  Then also store in config file.

    Note: this intentionally calls ``utils.get_critical_values`` — a
    different function despite the shared name.

    :param MetVar obs_var: meteorological variable object
    :param array times: array of times (usually in minutes)
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # use all first differences
    # TODO monthly?

    # mask the times in step with the observations so the diffs line up
    masked_times = np.ma.masked_array(times, mask=obs_var.data.mask)

    time_diffs = np.ma.diff(masked_times) / np.timedelta64(
        1, "m")  # presuming minutes
    value_diffs = np.ma.diff(obs_var.data)

    # get thresholds for each unique time differences
    unique_diffs = np.unique(time_diffs.compressed())

    for t_diff in unique_diffs:

        if t_diff == 0:
            # not a spike or jump, but 2 values at the same time.
            #  should be zero value difference, so fitting histogram not going to work
            #  handled in separate test
            # (removed leftover debug print("test") here)
            continue

        locs, = np.where(time_diffs == t_diff)

        first_differences = value_diffs[locs]

        # ensure sufficient non-masked observations
        if len(first_differences.compressed()) >= utils.DATA_COUNT_THRESHOLD:

            # fit decay curve to one-sided distribution
            c_value = utils.get_critical_values(
                first_differences.compressed(),
                binmin=0,
                binwidth=0.5,
                plots=plots,
                diagnostics=diagnostics,
                xlabel="First differences",
                title="Spike - {} - {}m".format(obs_var.name.capitalize(),
                                                t_diff))

            # write out the thresholds...
            utils.write_qc_config(config_file,
                                  "SPIKE-{}".format(obs_var.name),
                                  "{}".format(t_diff),
                                  "{}".format(c_value),
                                  diagnostics=diagnostics)
            if diagnostics:
                print("   Time Difference: {} minutes".format(t_diff))
                print("      Number of obs: {}, threshold: {}".format(
                    len(first_differences.compressed()), c_value))
        else:
            if diagnostics:
                print("   Time Difference: {} minutes".format(t_diff))
                print("      Number of obs insufficient: {} < {}".format(
                    len(first_differences.compressed()),
                    utils.DATA_COUNT_THRESHOLD))

    return  # get_critical_values
示例#7
0
def identify_values(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Use distribution to identify frequent values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    # TODO - do we want to go down the road of allowing resolution (and hence test)
    #           to vary over the p-o-r?  I.e. 1C in early, to 0.5C to 0.1C in different decades?

    # record the bin width used, so the check can reconstruct the bins
    utils.write_qc_config(config_file,
                          "FREQUENT-{}".format(obs_var.name),
                          "width",
                          "{}".format(BIN_WIDTH),
                          diagnostics=diagnostics)

    for month in range(1, 13):

        locs, = np.where(station.months == month)

        month_data = obs_var.data[locs]

        if len(month_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
            # insufficient data, so write out empty config and move on
            utils.write_qc_config(config_file,
                                  "FREQUENT-{}".format(obs_var.name),
                                  "{}".format(month),
                                  "[{}]".format(",".join(str(s) for s in [])),
                                  diagnostics=diagnostics)
            continue

        # adjust bin widths according to reporting accuracy
        resolution = utils.reporting_accuracy(month_data)

        if resolution <= 0.5:
            bins = utils.create_bins(month_data, 0.5, obs_var.name)
        else:
            bins = utils.create_bins(month_data, 1.0, obs_var.name)

        hist, bin_edges = np.histogram(month_data, bins)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

        # Scan through the histogram
        #   check if a bin is the maximum of a local area ("ROLLING")
        suspect = []
        for b, bar in enumerate(hist):
            # only consider bins with a full window to the left; NOTE(review):
            # the `<=` upper bound lets the right edge have a truncated window
            # (numpy slicing clamps past the end) — confirm this asymmetry vs
            # the strict `>` on the left edge is intended.
            if (b > ROLLING // 2) and (b <= (len(hist) - ROLLING // 2)):

                target_bins = hist[b - (ROLLING // 2):b + (ROLLING // 2) + 1]

                # if sufficient obs, maximum and contains > 50%, but not all, of the data
                if bar >= utils.DATA_COUNT_THRESHOLD:
                    if bar == target_bins.max():
                        if (bar / target_bins.sum()) > RATIO:
                            suspect += [bins[b]]

        # diagnostic plots
        if plots:
            # zero out all non-suspect bins to highlight the flagged ones
            bad_hist = np.copy(hist)
            for b, bar in enumerate(bad_hist):
                if bins[b] not in suspect:
                    bad_hist[b] = 0

            plt.step(bins[1:], bad_hist, color='r', where="pre")
            plt.show()

        # write out the thresholds...
        utils.write_qc_config(config_file,
                              "FREQUENT-{}".format(obs_var.name),
                              "{}".format(month),
                              "[{}]".format(",".join(str(s) for s in suspect)),
                              diagnostics=diagnostics)

    return  # identify_values
示例#8
0
def pressure_offset(sealp,
                    stnlp,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Flag locations where difference between station and sea-level pressure
    falls outside of bounds

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: configuration file holding critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.array(["" for i in range(sealp.data.shape[0])])

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        try:
            average = float(
                utils.read_qc_config(config_file, "PRESSURE", "average"))
            spread = float(
                utils.read_qc_config(config_file, "PRESSURE", "spread"))
        except KeyError:
            # fall back to recalculating (and storing) the scaling in place
            print("Information missing in config file")
            average = utils.average(difference)
            spread = utils.spread(difference)
            if spread < MIN_SPREAD:  # less than XhPa
                spread = MIN_SPREAD
            elif spread > MAX_SPREAD:  # more than XhPa
                spread = MAX_SPREAD

            utils.write_qc_config(config_file,
                                  "PRESSURE",
                                  "average",
                                  "{}".format(average),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file,
                                  "PRESSURE",
                                  "spread",
                                  "{}".format(spread),
                                  diagnostics=diagnostics)

        # a large mean-median separation suggests a bimodal distribution,
        # in which case the simple threshold test is not applicable
        if np.abs(np.ma.mean(difference) -
                  np.ma.median(difference)) > THRESHOLD * spread:
            if diagnostics:
                print("Large difference between mean and median")
                print("Likely to have two populations of roughly equal size")
                print("Test won't work")
        else:
            high, = np.ma.where(difference > (average + (THRESHOLD * spread)))
            low, = np.ma.where(difference < (average - (THRESHOLD * spread)))

            # diagnostic plots
            if plots:
                bins = np.arange(
                    np.round(difference.min()) - 1,
                    np.round(difference.max()) + 1, 0.1)
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(difference.compressed(), bins=bins)
                plt.axvline(x=(average + (THRESHOLD * spread)), ls="--", c="r")
                plt.axvline(x=(average - (THRESHOLD * spread)), ls="--", c="r")
                plt.xlim([bins[0] - 1, bins[-1] + 1])
                plt.ylabel("Observations")
                plt.xlabel("Difference (hPa)")
                plt.show()

            if len(high) != 0:
                flags[high] = "p"
                if diagnostics:
                    # BUGFIX: format string previously had no placeholder,
                    # so the variable name was silently dropped
                    print("Pressure {}".format(stnlp.name))
                    print("   Number of high differences {}".format(len(high)))
                if plots:
                    for bad in high:
                        plot_pressure(sealp, stnlp, times, bad)

            if len(low) != 0:
                flags[low] = "p"
                if diagnostics:
                    print("   Number of low differences {}".format(len(low)))
                if plots:
                    for bad in low:
                        plot_pressure(sealp, stnlp, times, bad)

            # only flag the station level pressure
            stnlp.flags = utils.insert_flags(stnlp.flags, flags)

    if diagnostics:

        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # pressure_offset
示例#9
0
def find_month_thresholds(obs_var,
                          station,
                          config_file,
                          plots=False,
                          diagnostics=False,
                          winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    # get hourly climatology for each month
    for month in range(1, 13):

        normalised_anomalies = prepare_data(obs_var,
                                            station,
                                            month,
                                            diagnostics=diagnostics,
                                            winsorize=winsorize)

        if len(normalised_anomalies.compressed()
               ) >= utils.DATA_COUNT_THRESHOLD:

            bins = utils.create_bins(normalised_anomalies, BIN_WIDTH,
                                     obs_var.name)
            hist, bin_edges = np.histogram(normalised_anomalies.compressed(),
                                           bins)

            # first-guess parameters: peak height/location from the histogram,
            # width from the spread of the anomalies
            gaussian_fit = utils.fit_gaussian(
                bins[1:],
                hist,
                max(hist),
                mu=bins[np.argmax(hist)],
                sig=utils.spread(normalised_anomalies))

            fitted_curve = utils.gaussian(bins[1:], gaussian_fit)

            # diagnostic plots
            if plots:
                import matplotlib.pyplot as plt
                plt.clf()
                plt.step(bins[1:], hist, color='k', where="pre")
                plt.yscale("log")

                plt.ylabel("Number of Observations")
                plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
                plt.title("{} - month {}".format(station.id, month))

                plt.plot(bins[1:], fitted_curve)
                plt.ylim([0.1, max(hist) * 2])

            # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
            # BUGFIX: narrowed bare `except:` clauses — the indexing of a
            # possibly-empty selection raises IndexError; a bare except also
            # swallowed e.g. KeyboardInterrupt
            try:
                lower_threshold = bins[1:][np.where(
                    np.logical_and(fitted_curve < FREQUENCY_THRESHOLD,
                                   bins[1:] < 0))[0]][-1]
            except IndexError:
                # nothing below threshold on the negative side: use lowest bin
                lower_threshold = bins[1]
            try:
                upper_threshold = bins[1:][np.where(
                    np.logical_and(fitted_curve < FREQUENCY_THRESHOLD,
                                   bins[1:] > 0))[0]][0]
            except IndexError:
                # nothing below threshold on the positive side: use highest bin
                upper_threshold = bins[-1]

            if plots:
                plt.axvline(upper_threshold, c="r")
                plt.axvline(lower_threshold, c="r")
                plt.show()

            utils.write_qc_config(config_file,
                                  "CLIMATOLOGICAL-{}".format(obs_var.name),
                                  "{}-uthresh".format(month),
                                  "{}".format(upper_threshold),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file,
                                  "CLIMATOLOGICAL-{}".format(obs_var.name),
                                  "{}-lthresh".format(month),
                                  "{}".format(lower_threshold),
                                  diagnostics=diagnostics)

    return  # find_month_thresholds
示例#10
0
def find_thresholds(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Extract data for each month, find thresholds in the (skew-Gaussian)
    anomaly distribution and store them in the configuration file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var,
                                                station,
                                                month,
                                                config_file,
                                                full=True,
                                                diagnostics=diagnostics)

        if len(normalised_anomalies.compressed()
               ) == 1 and normalised_anomalies[0] == utils.MDI:
            # scaling not possible for this month
            utils.write_qc_config(config_file,
                                  "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-uthresh".format(month),
                                  "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file,
                                  "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-lthresh".format(month),
                                  "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            utils.write_qc_config(config_file,
                                  "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-uthresh".format(month),
                                  "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file,
                                  "ADISTRIBUTION-{}".format(obs_var.name),
                                  "{}-lthresh".format(month),
                                  "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        hist, bin_edges = np.histogram(normalised_anomalies, bins)

        gaussian_fit = utils.fit_gaussian(bins[1:], hist, max(hist), mu=bins[np.argmax(hist)], \
                                          sig=utils.spread(normalised_anomalies), skew=skew(normalised_anomalies.compressed()))

        fitted_curve = utils.skew_gaussian(bins[1:], gaussian_fit)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.plot(bins[1:], fitted_curve)
            plt.ylim([0.1, max(hist) * 2])

        # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
        # BUGFIX: narrowed bare `except:` clauses — the indexing of a
        # possibly-empty selection raises IndexError; a bare except also
        # swallowed e.g. KeyboardInterrupt
        try:
            lower_threshold = bins[1:][np.where(
                np.logical_and(
                    fitted_curve < FREQUENCY_THRESHOLD,
                    bins[1:] < bins[np.argmax(fitted_curve)]))[0]][-1]
        except IndexError:
            # nothing below threshold left of the peak: use lowest bin
            lower_threshold = bins[1]
        try:
            if len(np.unique(fitted_curve)) == 1:
                # just a line of zeros perhaps (found on AFA00409906 station_level_pressure 20190913)
                upper_threshold = bins[-1]
            else:
                upper_threshold = bins[1:][np.where(
                    np.logical_and(
                        fitted_curve < FREQUENCY_THRESHOLD,
                        bins[1:] > bins[np.argmax(fitted_curve)]))[0]][0]
        except IndexError:
            # nothing below threshold right of the peak: use highest bin
            upper_threshold = bins[-1]

        if plots:
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")
            plt.show()

        utils.write_qc_config(config_file,
                              "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-uthresh".format(month),
                              "{}".format(upper_threshold),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file,
                              "ADISTRIBUTION-{}".format(obs_var.name),
                              "{}-lthresh".format(month),
                              "{}".format(lower_threshold),
                              diagnostics=diagnostics)

    return  # find_thresholds
示例#11
0
def _compute_and_store_scaling(obs_var,
                               all_month_data,
                               month,
                               config_file,
                               diagnostics=False):
    """
    Calculate climatology and spread for one month's data (MDI if too few
    observations), write them to the config file and return them.
    """
    if len(all_month_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
        # have data, now to standardise
        climatology = utils.average(all_month_data)  # mean
        spread = utils.spread(all_month_data)  # IQR currently
    else:
        climatology = utils.MDI
        spread = utils.MDI

    # write out the scaling...
    utils.write_qc_config(config_file,
                          "ADISTRIBUTION-{}".format(obs_var.name),
                          "{}-clim".format(month),
                          "{}".format(climatology),
                          diagnostics=diagnostics)
    utils.write_qc_config(config_file,
                          "ADISTRIBUTION-{}".format(obs_var.name),
                          "{}-spread".format(month),
                          "{}".format(spread),
                          diagnostics=diagnostics)

    return climatology, spread  # _compute_and_store_scaling


def prepare_all_data(obs_var,
                     station,
                     month,
                     config_file,
                     full=False,
                     diagnostics=False):
    """
    Extract data for the month, make & store or read average and spread.
    Use to calculate normalised anomalies.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: month to process
    :param str config_file: configuration file to store critical values
    :param bool full: recalculate & store the scaling rather than reading it
    :param bool diagnostics: turn on diagnostic output
    """

    month_locs, = np.where(station.months == month)

    all_month_data = obs_var.data[month_locs]

    if full:
        # always recalculate and store the scaling
        climatology, spread = _compute_and_store_scaling(
            obs_var, all_month_data, month, config_file,
            diagnostics=diagnostics)
    else:
        try:
            climatology = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-clim".format(month)))
            spread = float(
                utils.read_qc_config(config_file,
                                     "ADISTRIBUTION-{}".format(obs_var.name),
                                     "{}-spread".format(month)))
        except KeyError:
            # values missing from the config: rebuild and store them
            climatology, spread = _compute_and_store_scaling(
                obs_var, all_month_data, month, config_file,
                diagnostics=diagnostics)

    if climatology == utils.MDI and spread == utils.MDI:
        # these weren't calculable, move on
        return np.ma.array([utils.MDI])
    elif spread == 0:
        # all the same value, so avoid dividing by zero
        return (all_month_data - climatology)  # prepare_all_data
    else:
        return (all_month_data - climatology) / spread  # prepare_all_data