예제 #1
0
def find_monthly_scaling(obs_var, station, config_file, diagnostics=False):
    """
    Find scaling parameters for monthly values and store in config file

    For every calendar month the values returned by ``prepare_monthly_data``
    are summarised with ``utils.average`` and ``utils.spread`` (the spread is
    floored at ``SPREAD_LIMIT``).  When fewer than ``VALID_MONTHS`` unmasked
    values are available, ``utils.MDI`` is stored for both entries instead.
    Entries are written to the ``MDISTRIBUTION-<variable name>`` section under
    the keys ``<month>-clim`` and ``<month>-spread``.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool diagnostics: turn on diagnostic output
    """

    section = "MDISTRIBUTION-{}".format(obs_var.name)

    for month in range(1, 13):

        month_averages = prepare_monthly_data(obs_var,
                                              station,
                                              month,
                                              diagnostics=diagnostics)

        if len(month_averages.compressed()) >= VALID_MONTHS:
            # have months, now to standardise
            climatology = utils.average(month_averages)
            spread = utils.spread(month_averages)
            # enforce a minimum spread
            if spread < SPREAD_LIMIT:
                spread = SPREAD_LIMIT
        else:
            # insufficient data, store the missing data indicator
            climatology = utils.MDI
            spread = utils.MDI

        # write out the scaling...
        utils.write_qc_config(config_file,
                              section,
                              "{}-clim".format(month),
                              "{}".format(climatology),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file,
                              section,
                              "{}-spread".format(month),
                              "{}".format(spread),
                              diagnostics=diagnostics)

    return  # find_monthly_scaling
예제 #2
0
def find_thresholds(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False,
                    winsorize=True):
    """
    Summarise the variances of each calendar month and store them in the
    config file.

    For each month the variances from ``prepare_data`` are reduced to an
    average and a spread, written to the ``VARIANCE-<variable name>`` section
    as ``<month>-average`` and ``<month>-spread``.  If fewer than
    ``MIN_VARIANCES`` unmasked variances exist, ``utils.MDI`` is written for
    both.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots (not referenced in this routine)
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    for month in range(1, 13):

        month_variances = prepare_data(obs_var,
                                       station,
                                       month,
                                       diagnostics=diagnostics,
                                       winsorize=winsorize)

        n_valid = len(month_variances.compressed())
        if n_valid < MIN_VARIANCES:
            # too few values to characterise this month
            centre, width = utils.MDI, utils.MDI
        else:
            centre = utils.average(month_variances)
            width = utils.spread(month_variances)

        # store both statistics, average first
        for key_template, value in (("{}-average", centre),
                                    ("{}-spread", width)):
            utils.write_qc_config(config_file,
                                  "VARIANCE-{}".format(obs_var.name),
                                  key_template.format(month),
                                  "{}".format(value),
                                  diagnostics=diagnostics)

    return  # find_thresholds
예제 #3
0
def identify_values(sealp,
                    stnlp,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Find the average and spread of the sea-level minus station-level pressure
    differences and store them in the ``PRESSURE`` section of the config file.

    Nothing is written when fewer than ``utils.DATA_COUNT_THRESHOLD`` unmasked
    differences are available.

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array (not referenced in this routine)
    :param str config_file: string for configuration file
    :param bool plots: turn on plots (not referenced in this routine)
    :param bool diagnostics: turn on diagnostic output
    """

    offsets = sealp.data - stnlp.data

    if len(offsets.compressed()) < utils.DATA_COUNT_THRESHOLD:
        # not enough paired observations to work with
        return  # identify_values

    centre = utils.average(offsets)
    width = utils.spread(offsets)

    # keep the spread within the permitted range
    if width < MIN_SPREAD:
        width = MIN_SPREAD
    elif width > MAX_SPREAD:
        width = MAX_SPREAD

    for key, value in (("average", centre), ("spread", width)):
        utils.write_qc_config(config_file,
                              "PRESSURE",
                              key,
                              "{}".format(value),
                              diagnostics=diagnostics)

    return  # identify_values
예제 #4
0
def neighbour_outlier(target_station,
                      initial_neighbours,
                      variable,
                      diagnostics=False,
                      plots=False,
                      full=False):
    """
    Works on a single station and variable.  Reads in the neighbours' data and
    flags ("N") target observations which differ by more than the scaled
    spread from a large enough fraction of the neighbours.

    Nothing is returned; the flags are merged into the variable's ``flags``
    attribute via ``utils.insert_flags``.  If there are fewer than
    ``utils.MIN_NEIGHBOURS`` neighbours a message is printed and no flags are
    set.

    :param Station target_station: station to run on
    :param array initial_neighbours: input neighbours (ID, distance) pairs
    :param str variable: obs variable being run on
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test
    :param bool full: run full reprocessing rather than using stored values
        (not referenced in this routine)
    """
    station_list = utils.get_station_list()

    # if sufficient (one entry is discounted from the number of real IDs)
    n_neighbours = len(np.where(initial_neighbours[:, 0] != "-")[0]) - 1
    if n_neighbours < utils.MIN_NEIGHBOURS:
        print("{} has insufficient neighbours ({}<{})".format(
            target_station.id, n_neighbours, utils.MIN_NEIGHBOURS))
        return  # neighbour_outlier

    #*************************
    # extract target observations
    obs_var = getattr(target_station, variable)
    flags = np.array(["" for i in range(obs_var.data.shape[0])
                      ]).astype("<U10")

    #*************************
    # read in in the neighbour (buddy) data, one row per neighbour,
    #    everything masked until filled
    all_buddy_data = np.ma.zeros(
        [len(initial_neighbours[:, 0]),
         len(target_station.times)])
    all_buddy_data.mask = np.ones(all_buddy_data.shape)

    for bid, buddy_id in enumerate(initial_neighbours[:, 0]):
        if buddy_id == target_station.id:
            # first entry is self
            continue
        if buddy_id == "-":
            # end of the list of buddies
            break

        if diagnostics:
            print("{}/{} {}".format(bid, len(initial_neighbours[:, 0]),
                                    buddy_id))

        # set up station object to hold information
        buddy_idx, = np.where(station_list.id == buddy_id)
        buddy_info = station_list.iloc[buddy_idx]
        buddy = utils.Station(buddy_id,
                              buddy_info.latitude.values[0],
                              buddy_info.longitude.values[0],
                              buddy_info.elevation.values[0])

        try:
            buddy, buddy_df = io.read_station(os.path.join(
                setup.SUBDAILY_PROC_DIR, "{:11s}.qff".format(buddy_id)),
                                              buddy,
                                              read_flags=True)

            buddy_var = getattr(buddy, variable)

            # apply flags
            flag_locs, = np.where(buddy_var.flags != "")
            buddy_var.data.mask[flag_locs] = True

        except OSError:
            # file missing, move on to next in sequence
            io.write_error(
                target_station,
                "File Missing (Buddy, {}) - {}".format(variable, buddy_id))
            continue
        except ValueError as e:
            # some issue in the raw file
            io.write_error(target_station,
                           "Error in input file (Buddy, {}) - {}".format(
                               variable, buddy_id),
                           error=str(e))
            continue

        # match the timestamps of target_station and copy over
        match = np.in1d(target_station.times, buddy.times)
        match_back = np.in1d(buddy.times, target_station.times)

        if True in match and True in match_back:
            # skip if no overlapping times at all!
            all_buddy_data[bid, match] = buddy_var.data[match_back]

    if diagnostics:
        print("All buddies read in")

    #*************************
    # find differences
    differences = all_buddy_data - obs_var.data

    #*************************
    # find spread of differences on monthly basis (with minimum value)
    spreads = np.ma.zeros(differences.shape)

    for month in range(1, 13):

        month_locs = np.where(target_station.months == month)

        for bid in range(differences.shape[0]):

            month_differences = differences[bid, month_locs]

            if len(month_differences.compressed()
                   ) > utils.DATA_COUNT_THRESHOLD:
                # calculate once, then enforce the minimum
                this_spread = utils.spread(month_differences)
                if this_spread < MIN_SPREAD:
                    this_spread = MIN_SPREAD
            else:
                # too few differences to estimate a spread
                this_spread = MIN_SPREAD

            spreads[bid, month_locs] = this_spread

    spreads.mask = np.copy(differences.mask)

    # store which entries may be sufficient to flag
    dubious = np.ma.zeros(differences.shape)
    dubious.mask = np.copy(differences.mask)

    #*************************
    # adjust for storms
    if variable in ["sea_level_pressure", "station_level_pressure"]:
        distant, = np.where(initial_neighbours[:, 1].astype(int) > 100)
        if len(distant) > 0:
            # find positive and negative differences across neighbours
            limits = spreads[distant] * SPREAD_LIMIT
            positive = np.ma.where(differences[distant] > limits)
            # negative excursions lie below *minus* the limit; comparing
            #    against +limit would select almost every difference
            negative = np.ma.where(differences[distant] < -limits)

            # spin through each neighbour
            # NOTE(review): in this branch only the distant neighbours can be
            #    marked dubious, nearby ones never are - confirm intended
            for dn, dist_neigh in enumerate(distant):

                pos, = np.where(positive[0] == dn)
                neg, = np.where(negative[0] == dn)

                if len(neg) > 0:
                    ratio = len(neg) / (len(pos) + len(neg))
                    if ratio > 0.667:
                        # majority negative, only flag the positives [definitely not storms]
                        dubious[dist_neigh, positive[1][pos]] = 1

        else:
            # all stations close by so storms shouldn't affect, include all
            # note where differences exceed the spread
            dubious_locs = np.ma.where(
                np.ma.abs(differences) > spreads * SPREAD_LIMIT)
            dubious[dubious_locs] = 1

    else:
        #*************************
        # note where differences exceed the spread [all non pressure variables]
        dubious_locs = np.ma.where(
            np.ma.abs(differences) > spreads * SPREAD_LIMIT)
        dubious[dubious_locs] = 1

    if diagnostics:
        print("cross checks complete - assessing all outcomes")
    #*************************
    # sum across neighbours
    neighbour_count = np.ma.count(differences, axis=0)
    dubious_count = np.ma.sum(dubious, axis=0)

    # flag if large enough fraction (>0.66)
    sufficient, = np.ma.where(dubious_count > 0.66 * neighbour_count)
    flags[sufficient] = "N"

    if plots:
        for flag in sufficient:
            plot_neighbour_flags(target_station.times, flag, obs_var,
                                 all_buddy_data)

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Neighbour Outlier {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # neighbour_outlier
예제 #5
0
def prepare_data(obs_var, station, month, diagnostics=False, winsorize=True):
    """
    Calculate the monthly variances

    Anomalies for the requested calendar month are formed against hourly
    climatologies, scaled by their spread (at least 1.5), and then the spread
    of the scaled anomalies is taken for each year separately.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: which month to run on
    :param bool diagnostics: turn on diagnostic output (not referenced here)
    :param bool winsorize: apply winsorization at 5%/95%

    :returns: masked array with one entry per unique value of station.years;
        entries stay masked for years with too few values in this month
    """

    n_obs = obs_var.data.shape[0]

    # everything starts masked, only this calendar month is opened up
    anomalies = np.ma.zeros(n_obs)
    anomalies.mask = np.ones(n_obs)
    normed_anomalies = np.ma.copy(anomalies)

    in_month = station.months == month
    mlocs, = np.where(in_month)
    for series in (anomalies, normed_anomalies):
        series.mask[mlocs] = False

    hourly_clims = np.ma.zeros(24)
    hourly_clims.mask = np.ones(24)
    for hour in range(24):

        # observations at this hour of the day within the month
        hlocs, = np.where(np.logical_and(in_month, station.hours == hour))
        hour_data = obs_var.data[hlocs]

        if winsorize and len(hour_data.compressed()) > 10:
            hour_data = utils.winsorize(hour_data, 5)

        # climatology only where there is enough data
        if len(hour_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
            hourly_clims[hour] = np.ma.mean(hour_data)
            hourly_clims.mask[hour] = False

        # anomalies go back into their original positions
        anomalies[hlocs] = obs_var.data[hlocs] - hourly_clims[hour]

    # normalise the month's anomalies by their spread (minimum of 1.5)
    month_anomalies = anomalies[mlocs]
    spread = 1.5
    if len(month_anomalies.compressed()) >= MIN_VARIANCES:
        spread = utils.spread(month_anomalies)
        if spread < 1.5:
            spread = 1.5

    normed_anomalies[mlocs] = month_anomalies / spread

    # one variance per year for this single month
    all_years = np.unique(station.years)
    n_years = all_years.shape[0]

    variances = np.ma.zeros(n_years)
    variances.mask = np.ones(n_years)
    for y, year in enumerate(all_years):

        ymlocs, = np.where(np.logical_and(in_month, station.years == year))
        this_year = normed_anomalies[ymlocs]

        # HadISD used M.A.D.
        if this_year.compressed().shape[0] > MIN_VALUES:
            variances[y] = utils.spread(this_year)

    return variances  # prepare_data
예제 #6
0
def _longest_sign_runs(diffs):
    """
    Find the longest runs of consecutive falling and rising steps.

    :param array diffs: successive differences of a series

    :returns: (longest run of steps that are not > 0, longest run of steps > 0)
    """
    negs, poss = 0, 0
    biggest_neg, biggest_pos = 0, 0

    for diff in diffs:

        if diff > 0:
            biggest_neg = max(biggest_neg, negs)
            negs = 0
            poss += 1
        else:
            # any step which does not test as > 0 lands here
            biggest_pos = max(biggest_pos, poss)
            poss = 0
            negs += 1

    # a run still in progress when the series ends has to be counted too
    return max(biggest_neg, negs), max(biggest_pos, poss)


def variance_check(obs_var,
                   station,
                   config_file,
                   plots=False,
                   diagnostics=False,
                   winsorize=True):
    """
    Flag ("V") all observations in year-months whose variance is unusual.

    For each calendar month the yearly variances from ``prepare_data`` are
    compared to the average and spread stored in the config file; if these
    are missing (KeyError) they are first created with ``find_thresholds``.
    Years more than ``SPREAD_THRESHOLD`` spreads from the average are
    candidates for flagging.  For pressure and wind speed the candidate month
    is left unflagged if it could be a storm (coincident high wind and low
    pressure, or a run of 10 or more steps in the same direction).

    Nothing is returned; the flags are merged into ``obs_var.flags`` via
    ``utils.insert_flags``.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file holding the critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    flags = np.full(obs_var.data.shape[0], "", dtype="<U1")

    section = "VARIANCE-{}".format(obs_var.name)
    is_pressure = obs_var.name in [
        "station_level_pressure", "sea_level_pressure"
    ]
    # pressure and wind speed need the storm check before flags are applied
    storm_check = is_pressure or obs_var.name == "wind_speed"

    all_years = np.unique(station.years)

    # get hourly climatology for each month
    for month in range(1, 13):
        month_locs, = np.where(station.months == month)

        variances = prepare_data(obs_var,
                                 station,
                                 month,
                                 diagnostics=diagnostics,
                                 winsorize=winsorize)

        try:
            average_variance = float(
                utils.read_qc_config(config_file, section,
                                     "{}-average".format(month)))
            variance_spread = float(
                utils.read_qc_config(config_file, section,
                                     "{}-spread".format(month)))
        except KeyError:
            print("Information missing in config file")
            # use the same winsorize setting as for the variances above so
            #    that thresholds and data are comparable
            find_thresholds(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics,
                            winsorize=winsorize)
            average_variance = float(
                utils.read_qc_config(config_file, section,
                                     "{}-average".format(month)))
            variance_spread = float(
                utils.read_qc_config(config_file, section,
                                     "{}-spread".format(month)))

        if average_variance == utils.MDI and variance_spread == utils.MDI:
            # couldn't be calculated, move on
            continue

        bad_years, = np.where(
            np.abs(variances - average_variance) /
            variance_spread > SPREAD_THRESHOLD)

        # prepare wind and pressure data in case needed to check for storms
        if storm_check:
            wind_monthly_data = station.wind_speed.data[month_locs]
            if is_pressure:
                pressure_monthly_data = obs_var.data[month_locs]
            else:
                pressure_monthly_data = station.sea_level_pressure.data[
                    month_locs]

            if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                    len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                # need sufficient data to work with for storm check to work, else can't tell
                #    move on
                continue

            wind_average = utils.average(wind_monthly_data)
            wind_spread = utils.spread(wind_monthly_data)

            pressure_average = utils.average(pressure_monthly_data)
            pressure_spread = utils.spread(pressure_monthly_data)

        # go through each bad year for this month
        for year in bad_years:

            # corresponding locations
            ym_locs, = np.where(
                np.logical_and(station.months == month,
                               station.years == all_years[year]))

            # if pressure or wind speed, need to do some further checking before applying flags
            if storm_check:

                # pull out the data
                wind_data = station.wind_speed.data[ym_locs]
                if is_pressure:
                    pressure_data = obs_var.data[ym_locs]
                else:
                    pressure_data = station.sea_level_pressure.data[ym_locs]

                # need sufficient data to work with for storm check to work, else can't tell
                if len(pressure_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
                        len(wind_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
                    # move on
                    continue

                # find locations of high wind speeds and low pressures, cross match
                high_winds, = np.ma.where(
                    (wind_data - wind_average) / wind_spread > STORM_THRESHOLD)
                low_pressures, = np.ma.where(
                    (pressure_average - pressure_data) /
                    pressure_spread > STORM_THRESHOLD)

                # this could be a storm, either at tropical station (relatively constant pressure)
                # or out of season in mid-latitudes.
                # np.in1d gives one boolean per high wind, so test whether any
                #    of them coincide rather than the length of the array
                couldbe_storm = bool(
                    np.any(np.in1d(high_winds, low_pressures)))

                if is_pressure:
                    diffs = np.ma.diff(pressure_data)
                else:
                    diffs = np.ma.diff(wind_data)

                # count up the largest number of sequential negative and positive differences
                biggest_neg, biggest_pos = _longest_sign_runs(diffs)

                if couldbe_storm or biggest_neg >= 10 or biggest_pos >= 10:
                    # could be a storm (HadISD values), so better to leave
                    #    this month unflagged
                    continue

            # and set the flags
            flags[ym_locs] = "V"

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt

            scaled_variances = ((variances - average_variance) /
                                variance_spread)
            bins = utils.create_bins(scaled_variances, 0.25, obs_var.name)
            hist, bin_edges = np.histogram(scaled_variances, bins)

            plt.clf()
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Months")
            plt.xlabel("Scaled {} Variances".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])
            plt.axvline(SPREAD_THRESHOLD, c="r")
            plt.axvline(-SPREAD_THRESHOLD, c="r")

            bad_hist, dummy = np.histogram(scaled_variances[bad_years], bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Variance {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # variance_check
예제 #7
0
def pressure_offset(sealp,
                    stnlp,
                    times,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Flag locations where difference between station and sea-level pressure
    falls outside of bounds

    The bounds are ``average +/- THRESHOLD * spread`` of the sea-level minus
    station-level differences.  Average and spread are read from the
    ``PRESSURE`` section of the config file; if missing (KeyError) they are
    calculated here and written to the file.  No flags are set when there are
    fewer than ``utils.DATA_COUNT_THRESHOLD`` unmasked differences, or when
    mean and median of the differences are further apart than
    ``THRESHOLD * spread``.

    Nothing is returned; flags ("p") are merged into ``stnlp.flags`` only.

    :param MetVar sealp: sea level pressure object
    :param MetVar stnlp: station level pressure object
    :param array times: datetime array
    :param str config_file: string for configuration file
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.full(sealp.data.shape[0], "", dtype="<U1")

    difference = sealp.data - stnlp.data

    if len(difference.compressed()) >= utils.DATA_COUNT_THRESHOLD:

        try:
            average = float(
                utils.read_qc_config(config_file, "PRESSURE", "average"))
            spread = float(
                utils.read_qc_config(config_file, "PRESSURE", "spread"))
        except KeyError:
            print("Information missing in config file")
            average = utils.average(difference)
            spread = utils.spread(difference)
            if spread < MIN_SPREAD:  # less than XhPa
                spread = MIN_SPREAD
            elif spread > MAX_SPREAD:  # more than XhPa
                spread = MAX_SPREAD

            utils.write_qc_config(config_file,
                                  "PRESSURE",
                                  "average",
                                  "{}".format(average),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file,
                                  "PRESSURE",
                                  "spread",
                                  "{}".format(spread),
                                  diagnostics=diagnostics)

        if np.abs(np.ma.mean(difference) -
                  np.ma.median(difference)) > THRESHOLD * spread:
            if diagnostics:
                print("Large difference between mean and median")
                print("Likely to have two populations of roughly equal size")
                print("Test won't work")
        else:
            upper_bound = average + (THRESHOLD * spread)
            lower_bound = average - (THRESHOLD * spread)

            high, = np.ma.where(difference > upper_bound)
            low, = np.ma.where(difference < lower_bound)

            # diagnostic plots
            if plots:
                bins = np.arange(
                    np.round(difference.min()) - 1,
                    np.round(difference.max()) + 1, 0.1)
                import matplotlib.pyplot as plt
                plt.clf()
                plt.hist(difference.compressed(), bins=bins)
                plt.axvline(x=upper_bound, ls="--", c="r")
                plt.axvline(x=lower_bound, ls="--", c="r")
                plt.xlim([bins[0] - 1, bins[-1] + 1])
                plt.ylabel("Observations")
                plt.xlabel("Difference (hPa)")
                plt.show()

            if len(high) != 0:
                flags[high] = "p"
                if diagnostics:
                    # placeholder added so the variable name is actually shown
                    print("Pressure {}".format(stnlp.name))
                    print("   Number of high differences {}".format(len(high)))
                if plots:
                    for bad in high:
                        plot_pressure(sealp, stnlp, times, bad)

            if len(low) != 0:
                flags[low] = "p"
                if diagnostics:
                    print("   Number of low differences {}".format(len(low)))
                if plots:
                    for bad in low:
                        plot_pressure(sealp, stnlp, times, bad)

            # only flag the station level pressure
            stnlp.flags = utils.insert_flags(stnlp.flags, flags)

    if diagnostics:

        print("Pressure {}".format(stnlp.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # pressure_offset
예제 #8
0
def prepare_data(obs_var, station, month, diagnostics=False, winsorize=True):
    """
    Prepare the data for the climatological check.  Makes anomalies and applies low-pass filter

    Anomalies for the requested calendar month are formed against hourly
    climatologies, scaled by their spread (at least 1.5) and passed through
    ``low_pass_filter``.  If the month has too few observations, spans fewer
    than 5 years, or yields too few anomalies, the unscaled and unfiltered
    ``anomalies`` array is returned instead (fully masked in the first two
    cases).

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: which month to run on
    :param bool diagnostics: turn on diagnostic output (not referenced here)
    :param bool winsorize: apply winsorization at 5%/95%

    :returns: masked array with one entry per observation
    """

    anomalies = np.ma.zeros(obs_var.data.shape[0])
    anomalies.mask = np.ones(anomalies.shape[0])
    normed_anomalies = np.ma.copy(anomalies)

    in_month = station.months == month
    mlocs, = np.where(in_month)

    nyears = len(np.unique(station.years[mlocs]))

    # need to have some data and in at least 5 years!
    if len(mlocs) < utils.DATA_COUNT_THRESHOLD or nyears < 5:
        return anomalies  # prepare_data

    anomalies.mask[mlocs] = False
    normed_anomalies.mask[mlocs] = False

    hourly_clims = np.ma.zeros(24)
    hourly_clims.mask = np.ones(24)
    for hour in range(24):

        # calculate climatology
        hlocs, = np.where(np.logical_and(in_month, station.hours == hour))

        hour_data = obs_var.data[hlocs]

        if winsorize and len(hour_data.compressed()) > 10:
            hour_data = utils.winsorize(hour_data, 5)

        # count only the unmasked observations, otherwise a climatology
        #    could be set from a handful of (or no) valid values
        if len(hour_data.compressed()) >= utils.DATA_COUNT_THRESHOLD:
            hourly_clims[hour] = np.ma.mean(hour_data)
            hourly_clims.mask[hour] = False

        # make anomalies - keeping the order
        anomalies[hlocs] = obs_var.data[hlocs] - hourly_clims[hour]

    # if insufficient data at each hour, then no anomalies calculated
    month_anomalies = anomalies[mlocs]
    if len(month_anomalies.compressed()) < utils.DATA_COUNT_THRESHOLD:
        return anomalies  # prepare_data

    # for the month, normalise anomalies by spread
    spread = utils.spread(month_anomalies)
    if spread < 1.5:
        spread = 1.5

    normed_anomalies[mlocs] = month_anomalies / spread

    # apply low pass filter derived from monthly values
    # NOTE(review): the values passed to the filter are averages of the raw
    #    observations over each whole year (all months), not anomalies for
    #    this month - confirm this is what low_pass_filter expects
    all_years = np.unique(station.years)
    monthly_anoms = np.ma.zeros(all_years.shape[0])
    for y, year in enumerate(all_years):

        ylocs, = np.where(station.years == year)
        year_data = obs_var.data[ylocs]

        monthly_anoms[y] = utils.average(year_data)

    lp_filtered_anomalies = low_pass_filter(normed_anomalies, station,
                                            monthly_anoms, month)

    return lp_filtered_anomalies  # prepare_data
예제 #9
0
def find_month_thresholds(obs_var,
                          station,
                          config_file,
                          plots=False,
                          diagnostics=False,
                          winsorize=True):
    """
    Use distribution to identify threshold values.  Then also store in config file.

    For each calendar month the anomalies returned by prepare_data are binned
    and a Gaussian is fitted to the histogram.  The thresholds are the bin
    edges nearest to zero, on either side, at which the fitted curve is below
    FREQUENCY_THRESHOLD.  They are written to the
    "CLIMATOLOGICAL-<variable name>" section of the config file.  Months with
    fewer than utils.DATA_COUNT_THRESHOLD unmasked anomalies are skipped and
    nothing is written for them.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    :param bool winsorize: apply winsorization at 5%/95%
    """

    section = "CLIMATOLOGICAL-{}".format(obs_var.name)

    # get hourly climatology for each month
    for month in range(1, 13):

        normalised_anomalies = prepare_data(obs_var,
                                            station,
                                            month,
                                            diagnostics=diagnostics,
                                            winsorize=winsorize)

        if len(normalised_anomalies.compressed()
               ) < utils.DATA_COUNT_THRESHOLD:
            # insufficient data for this month, nothing to fit
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH,
                                 obs_var.name)
        hist, _ = np.histogram(normalised_anomalies.compressed(), bins)

        # the upper edge of each bin is used as its location throughout
        bin_uppers = bins[1:]

        gaussian_fit = utils.fit_gaussian(
            bin_uppers,
            hist,
            max(hist),
            mu=bins[np.argmax(hist)],
            sig=utils.spread(normalised_anomalies))

        fitted_curve = utils.gaussian(bin_uppers, gaussian_fit)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.step(bin_uppers, hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel("Scaled {}".format(obs_var.name.capitalize()))
            plt.title("{} - month {}".format(station.id, month))

            plt.plot(bin_uppers, fitted_curve)
            plt.ylim([0.1, max(hist) * 2])

        # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
        below_threshold = fitted_curve < FREQUENCY_THRESHOLD

        # An IndexError here means no bin on that side satisfies the
        #  condition, so fall back to the outermost bin edge.  Only that
        #  case is caught so that genuine errors (and Ctrl-C) still surface.
        try:
            lower_threshold = bin_uppers[np.where(
                np.logical_and(below_threshold, bin_uppers < 0))[0]][-1]
        except IndexError:
            lower_threshold = bins[1]
        try:
            upper_threshold = bin_uppers[np.where(
                np.logical_and(below_threshold, bin_uppers > 0))[0]][0]
        except IndexError:
            upper_threshold = bins[-1]

        if plots:
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")
            plt.show()

        utils.write_qc_config(config_file,
                              section,
                              "{}-uthresh".format(month),
                              "{}".format(upper_threshold),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file,
                              section,
                              "{}-lthresh".format(month),
                              "{}".format(lower_threshold),
                              diagnostics=diagnostics)

    return  # find_month_thresholds
예제 #10
0
def _read_gap_thresholds(config_file, var_name, month):
    """
    Read the stored all-observation distribution thresholds for one month.

    :param str config_file: configuration file holding the critical values
    :param str var_name: name of the variable (used in the section title)
    :param int month: month to read

    :returns: (upper_threshold, lower_threshold) as floats.  A KeyError
        raised by utils.read_qc_config is left to propagate to the caller.
    """
    section = "ADISTRIBUTION-{}".format(var_name)

    upper_threshold = float(
        utils.read_qc_config(config_file, section,
                             "{}-uthresh".format(month)))
    lower_threshold = float(
        utils.read_qc_config(config_file, section,
                             "{}-lthresh".format(month)))

    return upper_threshold, lower_threshold  # _read_gap_thresholds


def _expand_storm_events(storms, wind_data):
    """
    Expand around each storm signal found in a single month of a single year.

    :param array storms: locations (within wind_data) of storm-like obs, at
        least one entry
    :param array wind_data: wind speeds for this month and year

    :returns: locations as returned by expand_around_storms, for every
        separate storm event appended together
    """
    n_obs = len(wind_data)

    if len(storms) < 2:
        # single entry - expand around the storm signal (rather than
        #  just unflagging what could be the peak and leaving the
        #  entry/exit flagged)
        return expand_around_storms(storms, n_obs)

    # more than one entry - check if separate events by finding where the
    #  separation is more than the usual obs separation
    # NOTE(review): the reference separation is the median difference of the
    #  wind speed *values*, not of the observation times - confirm intended.
    separations, = np.where(
        np.ma.diff(storms) > np.ma.median(np.ma.diff(wind_data)))

    if len(separations) == 0:
        # locations separated at same interval as data
        return expand_around_storms(storms, n_obs)

    # multiple storm signals: each event runs from just after one large
    #  separation up to and including the next
    edges = np.concatenate(([0], separations + 1, [len(storms)]))

    final_storm_locs = expand_around_storms(storms[edges[0]:edges[1]], n_obs)
    for start, finish in zip(edges[1:-1], edges[2:]):
        final_storm_locs = np.append(
            final_storm_locs,
            expand_around_storms(storms[start:finish], n_obs))

    return final_storm_locs  # _expand_storm_events


def _unflag_storms(obs_var, station, month, month_locs, month_flags):
    """
    Remove low-pressure flags which coincide with high winds (storm signal).

    month_flags is modified in place.

    :param MetVar obs_var: pressure variable object
    :param Station station: station object
    :param int month: month being processed
    :param array month_locs: locations of this calendar month in the full record
    :param array month_flags: flags for this calendar month (all years)
    """
    wind_monthly_data = prepare_monthly_data(station.wind_speed, station,
                                             month)
    pressure_monthly_data = prepare_monthly_data(obs_var, station, month)

    if len(pressure_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD or \
            len(wind_monthly_data.compressed()) < utils.DATA_COUNT_THRESHOLD:
        # need sufficient data to work with for storm check to work, else can't tell
        return

    wind_monthly_average = utils.average(wind_monthly_data)
    wind_monthly_spread = utils.spread(wind_monthly_data)

    pressure_monthly_average = utils.average(pressure_monthly_data)
    pressure_monthly_spread = utils.spread(pressure_monthly_data)

    # already a single calendar month, so go through each year
    for year in np.unique(station.years):

        # locations of this year *within* the month selection
        year_locs, = np.where(station.years[month_locs] == year)

        if "d" not in month_flags[year_locs]:
            # skip if you get the chance
            continue

        wind_data = station.wind_speed.data[month_locs][year_locs]
        pressure_data = obs_var.data[month_locs][year_locs]

        # storm-like: anomalously high wind together with anomalously low pressure
        storms, = np.ma.where(
            np.logical_and(
                ((wind_data - wind_monthly_average) / wind_monthly_spread) >
                STORM_THRESHOLD,
                ((pressure_monthly_average - pressure_data) /
                 pressure_monthly_spread) > STORM_THRESHOLD))

        if len(storms) == 0:
            continue

        final_storm_locs = _expand_storm_events(storms, wind_data)

        # unset the flags.  The indices are composed first so that the
        #  assignment hits month_flags itself: assigning through
        #  month_flags[year_locs][...] would only alter the temporary copy
        #  that advanced indexing returns.
        month_flags[year_locs[final_storm_locs]] = ""

    return  # _unflag_storms


def all_obs_gap(obs_var, station, config_file, plots=False, diagnostics=False):
    """
    Extract data for month and find secondary populations in distribution.

    For each calendar month the normalised anomalies are binned and, where
    values lie beyond the stored thresholds, utils.find_gap is used to locate
    a gap in the histogram.  Observations beyond a gap are flagged "d".  For
    pressure variables, flags on the low side are removed again where they
    coincide with high wind speeds (possible storm).  The flags are finally
    merged into obs_var.flags.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    flags = np.full(obs_var.data.shape[0], "", dtype="<U1")

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var,
                                                station,
                                                month,
                                                config_file,
                                                full=False,
                                                diagnostics=diagnostics)

        if (len(normalised_anomalies.compressed()) == 1
                and normalised_anomalies[0] == utils.MDI):
            # no data to work with for this month, move on.
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        # np.histogram does not honour the mask, so pass unmasked values only
        hist, _ = np.histogram(normalised_anomalies.compressed(), bins)

        try:
            upper_threshold, lower_threshold = _read_gap_thresholds(
                config_file, obs_var.name, month)
        except KeyError:
            print("Information missing in config file")
            find_thresholds(obs_var,
                            station,
                            config_file,
                            plots=plots,
                            diagnostics=diagnostics)
            upper_threshold, lower_threshold = _read_gap_thresholds(
                config_file, obs_var.name, month)

        if upper_threshold == utils.MDI and lower_threshold == utils.MDI:
            # these weren't able to be calculated, move on
            continue
        elif len(np.unique(normalised_anomalies)) == 1:
            # all the same value, so won't be able to fit a histogram
            continue

        # now to find the gaps
        uppercount = len(np.where(normalised_anomalies > upper_threshold)[0])
        lowercount = len(np.where(normalised_anomalies < lower_threshold)[0])

        month_locs, = np.where(
            station.months == month)  # append should keep year order
        if uppercount > 0:
            gap_start = utils.find_gap(hist, bins, upper_threshold, GAP_SIZE)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies >
                                        gap_start)  # all years for one month

                # flags[month_locs] is a copy, so edit it and write it back
                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"
                flags[month_locs] = month_flags

        if lowercount > 0:
            gap_start = utils.find_gap(hist,
                                       bins,
                                       lower_threshold,
                                       GAP_SIZE,
                                       upwards=False)

            if gap_start != 0:
                bad_locs, = np.ma.where(normalised_anomalies <
                                        gap_start)  # all years for one month

                month_flags = flags[month_locs]
                month_flags[bad_locs] = "d"

                # for pressure data, see if the flagged obs correspond with high winds
                # could be a storm signal
                if obs_var.name in [
                        "station_level_pressure", "sea_level_pressure"
                ]:
                    _unflag_storms(obs_var, station, month, month_locs,
                                   month_flags)

                # having checked for storms now store final flags
                flags[month_locs] = month_flags

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.step(bins[1:], hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.ylim([0.1, max(hist) * 2])

            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")

            bad_locs, = np.where(flags[month_locs] == "d")
            bad_hist, _ = np.histogram(
                normalised_anomalies[bad_locs].compressed(), bins)
            plt.step(bins[1:], bad_hist, color='r', where="pre")

            plt.show()

    # append flags to object
    obs_var.flags = utils.insert_flags(obs_var.flags, flags)

    if diagnostics:

        print("Distribution (all) {}".format(obs_var.name))
        print("   Cumulative number of flags set: {}".format(
            len(np.where(flags != "")[0])))

    return  # all_obs_gap
예제 #11
0
def find_thresholds(obs_var,
                    station,
                    config_file,
                    plots=False,
                    diagnostics=False):
    """
    Extract data for each month and find thresholds in distribution and store.

    For every calendar month the normalised anomalies from prepare_all_data
    are binned and a skewed Gaussian is fitted to the histogram.  The
    thresholds are the bin edges nearest to the peak of the fitted curve, on
    either side, at which the curve is below FREQUENCY_THRESHOLD.  They are
    written to the "ADISTRIBUTION-<variable name>" section of the config
    file; utils.MDI is written for both when the month cannot be processed.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param str config_file: configuration file to store critical values
    :param bool plots: turn on plots
    :param bool diagnostics: turn on diagnostic output
    """

    section = "ADISTRIBUTION-{}".format(obs_var.name)

    for month in range(1, 13):

        normalised_anomalies = prepare_all_data(obs_var,
                                                station,
                                                month,
                                                config_file,
                                                full=True,
                                                diagnostics=diagnostics)

        # scaling not possible for this month (single MDI returned) ...
        unusable = (len(normalised_anomalies.compressed()) == 1
                    and normalised_anomalies[0] == utils.MDI)
        # ... or all the same value, so won't be able to fit a histogram
        if unusable or len(np.unique(normalised_anomalies)) == 1:
            utils.write_qc_config(config_file,
                                  section,
                                  "{}-uthresh".format(month),
                                  "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            utils.write_qc_config(config_file,
                                  section,
                                  "{}-lthresh".format(month),
                                  "{}".format(utils.MDI),
                                  diagnostics=diagnostics)
            continue

        bins = utils.create_bins(normalised_anomalies, BIN_WIDTH, obs_var.name)
        # np.histogram does not honour the mask, so pass unmasked values only
        hist, _ = np.histogram(normalised_anomalies.compressed(), bins)

        # the upper edge of each bin is used as its location throughout
        bin_uppers = bins[1:]

        gaussian_fit = utils.fit_gaussian(
            bin_uppers,
            hist,
            max(hist),
            mu=bins[np.argmax(hist)],
            sig=utils.spread(normalised_anomalies),
            skew=skew(normalised_anomalies.compressed()))

        fitted_curve = utils.skew_gaussian(bin_uppers, gaussian_fit)

        # diagnostic plots
        if plots:
            import matplotlib.pyplot as plt
            plt.clf()
            plt.step(bin_uppers, hist, color='k', where="pre")
            plt.yscale("log")

            plt.ylabel("Number of Observations")
            plt.xlabel(obs_var.name.capitalize())
            plt.title("{} - month {}".format(station.id, month))

            plt.plot(bin_uppers, fitted_curve)
            plt.ylim([0.1, max(hist) * 2])

        # use bins and curve to find points where curve is < FREQUENCY_THRESHOLD
        below_threshold = fitted_curve < FREQUENCY_THRESHOLD
        peak_location = bins[np.argmax(fitted_curve)]

        # An IndexError below means no bin on that side satisfies the
        #  condition, so fall back to the outermost bin edge.  Only that
        #  case is caught so that genuine errors (and Ctrl-C) still surface.
        try:
            lower_threshold = bin_uppers[np.where(
                np.logical_and(below_threshold,
                               bin_uppers < peak_location))[0]][-1]
        except IndexError:
            lower_threshold = bins[1]

        if len(np.unique(fitted_curve)) == 1:
            # just a line of zeros perhaps (found on AFA00409906 station_level_pressure 20190913)
            upper_threshold = bins[-1]
        else:
            try:
                upper_threshold = bin_uppers[np.where(
                    np.logical_and(below_threshold,
                                   bin_uppers > peak_location))[0]][0]
            except IndexError:
                upper_threshold = bins[-1]

        if plots:
            plt.axvline(upper_threshold, c="r")
            plt.axvline(lower_threshold, c="r")
            plt.show()

        utils.write_qc_config(config_file,
                              section,
                              "{}-uthresh".format(month),
                              "{}".format(upper_threshold),
                              diagnostics=diagnostics)
        utils.write_qc_config(config_file,
                              section,
                              "{}-lthresh".format(month),
                              "{}".format(lower_threshold),
                              diagnostics=diagnostics)

    return  # find_thresholds
예제 #12
0
def prepare_all_data(obs_var,
                     station,
                     month,
                     config_file,
                     full=False,
                     diagnostics=False):
    """
    Return the normalised anomalies of all observations in a calendar month.

    The scaling (average and spread) is either calculated from the data and
    written to the config file, or read back from the config file.

    :param MetVar obs_var: meteorological variable object
    :param Station station: station object
    :param int month: month to process
    :param str config_file: configuration file to store critical values
    :param bool full: if set, always calculate and store the scaling; otherwise
        read it from the config file and only calculate when an entry is missing
    :param bool diagnostics: turn on diagnostic output

    :returns: masked array of (data - average) / spread for the month (not
        divided if the spread is zero), or a single-element masked array
        holding utils.MDI if the scaling could not be calculated
    """

    this_month, = np.where(station.months == month)
    month_data = obs_var.data[this_month]

    section = "ADISTRIBUTION-{}".format(obs_var.name)
    clim_key = "{}-clim".format(month)
    spread_key = "{}-spread".format(month)

    # decide whether the scaling has to be (re)calculated
    calculate = bool(full)
    if not calculate:
        try:
            climatology = float(
                utils.read_qc_config(config_file, section, clim_key))
            spread = float(
                utils.read_qc_config(config_file, section, spread_key))
        except KeyError:
            # at least one entry absent, so derive both from the data
            calculate = True

    if calculate:
        enough_data = (len(month_data.compressed()) >=
                       utils.DATA_COUNT_THRESHOLD)
        # have data, now to standardise
        climatology = utils.average(month_data) if enough_data else utils.MDI  # mean
        spread = utils.spread(month_data) if enough_data else utils.MDI  # IQR currently

        # write out the scaling...
        for key, value in ((clim_key, climatology), (spread_key, spread)):
            utils.write_qc_config(config_file,
                                  section,
                                  key,
                                  "{}".format(value),
                                  diagnostics=diagnostics)

    if climatology == utils.MDI and spread == utils.MDI:
        # these weren't calculable, move on
        return np.ma.array([utils.MDI])

    anomalies = month_data - climatology
    if spread == 0:
        # all the same value
        return anomalies  # prepare_all_data

    return anomalies / spread  # prepare_all_data