Example #1
def estamt(network, minlenshf=24, **hom_params):
    """
    COPIED FROM ucpmonthly.v24a.f:
    
    The major steps in determining the best adjustment value for each station
    and changepoint. Entire network undergoes each of the following processes.
    In order:
        1) Remove unusable data. Align moves with respect to non-missing data
        and compress out changes that are too close AND the data between them.
        
        2) ISTEP=2 processing begins the adjustment process by removing the 
        non-significant changepoints to lengthen segments.
        
        3) NPASS (:= ISTEP=3) finishes the adjustment process by testing for the
        minimum number of months in a segment and number of neighbors with which
        the difference series can be examined.
        
        4) Final adjusted output is written.
    """

    ## FILTER 4
    ## Since the amplitude estimate MUST rely upon a minimum of MINLEN months to
    ## get even close to a reliable estimate at this point, it is assumed that
    ## the changepoints are as good as the station history files. Therefore,
    ## align moves with respect to non-missing data and compress out changes
    ## that are too close AND the data between them (i.e., less than MINLEN
    ## apart)
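    ## A small worked illustration of that rule (hypothetical months): with
    ## minlenshf=24, first=20, last=400, and changepoints at [100, 110, 200],
    ## the pair (100, 110) spans only 10 months, so months 101..110 are
    ## deleted along with the changepoint at 110, leaving [100, 200].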

    # station_list = network.stations.keys()
    all_station_list = network.stations.keys()
    # station_list = ["215887", ]
    station_list = all_station_list

    # for each station...
    for id in station_list:
        station_index = station_list.index(id)
        station_series = network.raw_series[id]
        station_data = station_series.monthly_series[:]
        missing_val = station_series.MISSING_VAL

        # ... gen arrays for alignment
        move, amt, mday = [], [], []
        changepoints = station_series.changepoints
        cps = sorted(changepoints.keys())
        for cp in cps:
            print "  Hist move: ", len(move) + 1, station_index + 1, imo2iym(cp)
            move.append(cp)
            amt.append(changepoints[cp]["jsum"])
            mday.append(31)
        movnum = len(changepoints)

        if movnum > 0:
            ## At this point, the Fortran code executes alignmoves() in
            ## SHAPinp.v6c.f to reconcile the fact that station history files
            ## report dates of moves. It also removes segments that eare too short
            ## - less than minlenshf. Instead of implementing alignmoves(). Right
            ## now, I'll only implement this second functionality.
            # alignmoves()

            ####################################################################
            # Seek to find first and last month indices
            first_set = False
            for month in range(len(station_data)):
                # Skip first year
                if month < 12:
                    continue

                if station_data[month] != missing_val:
                    if not first_set:
                        first = month
                        first_set = True
                    last = month

            cps = sorted(changepoints.keys())
            cps.insert(0, first)
            cps.append(last)
            for (cp1, cp2) in zip(cps[:], cps[1:]):
                if (cp2 - cp1) < minlenshf:
                    months_to_delete = range(cp1 + 1, cp2 + 1)
                    network.raw_series[id].delete_months(months_to_delete)

                    if cp2 == last:
                        del_key = cp1
                    else:
                        del_key = cp2
                    if del_key in network.raw_series[id].changepoints:
                        # print len(network.raw_series[id].changepoints.keys()),
                        del network.raw_series[id].changepoints[del_key]
                        # print len(network.raw_series[id].changepoints.keys()),
                        # raw_input("pause")

                    del_str = "Del 1st segment: " if cp1 == first else "Delete segment: "

                    print id, station_index + 1, del_str, imo2iym(cp1), cp1, imo2iym(cp2), cp2

            new_changepoints = network.raw_series[id].changepoints
            new_cps = sorted(new_changepoints.keys())
            print "  First data value: ", imo2iym(first)
            for cp in new_cps:
                print "  End seg:", new_cps.index(cp), " ym: ", imo2iym(cp), cp, new_changepoints[cp]["jsum"]
            print "    End segment ym: ", imo2iym(last), last

            # Finally, add first and last value to the list of changepoints.
            first_stats = dict(ahigh=0.0, astd=0.0, jsum=0)
            last_stats = dict(ahigh=0.0, astd=0.0, jsum=0)
            network.raw_series[id].changepoints[first] = first_stats
            network.raw_series[id].changepoints[last] = last_stats
            ####################################################################

    ## Series of debug print statements summarizing the final list of
    ## changepoints. Not necessary at the moment

    ############################################################################
    # The subnetwork processing became a multi-step process plus a "post-process
    # pass" to manage:
    #    1) problems with documented changepoints that have NO undocumented support
    #    2) determining the best amplitude estimate for each confirmed changepoint

    for step in [2, 3]:
        ## Setup output strings based on the step used. Only cosmetic differences
        ## really.

        iminlen = hom_params["minlen"]
        numclim = 3
        ## STEP 1 - NEVER USED (technically, the history consideration was done previously)
        if step == 1:
            continue
        elif step == 2:
            ## STEP 2 - NOT SIG REMOVAL
            ## equivalent to ipass loopback for istep == 2 in Fortran PHA
            print " ---------------- NOT SIG REMOVAL --------------- "
            tstr = "Not sig: "
            outid = "NS"
            ipass = 1
        elif step == 3:
            ## STEP 3 - ADJUSTMENT OF DISCONTINUITIES
            # equivalent to ipass loopback for istep == 3 in FORTRAN PHA
            print " ---------------- ADJUST DISCONTINUITY STEP --------------- "
            print "Adjpass, iminlen, numclim", "--", iminlen, numclim
            print " ---------------- NPASS --------------- "
            tstr = "Dstep Dtrend: "
            outid = "WM"
            ipass = ipass + 1

        final_results = dict()
        print "  NET   STN    FILT TECH      ------ AFTER ------    ------ BEFORE ------"
        # Process each station and its network of neighbors
        for id in station_list:
            station_index = station_list.index(id)

            station_cp_dict = network.raw_series[id].changepoints
            sorted_cps = sorted(station_cp_dict.keys())
            ## If there are no breakpoints...
            if not sorted_cps:
                final_results[id] = dict()
                continue

            station_series = network.raw_series[id]
            # Re-bind this station's raw data; the segment-length checks below use it
            station_data = station_series.monthly_series[:]
            missing_val = station_series.MISSING_VAL

            # compute monthly anomalies for this station data
            station_anomalies = station_series.monthly_anomaly_series

            # What are the first and last valid months in this station's data set?
            # We've saved them as the first and last changepoint before...
            first = sorted_cps[0]
            last = sorted_cps[-1]

            # What are the pairs to this station that we need to consider?
            station_pairs = []
            for other_id in all_station_list:
                pair = tuple(sorted([id, other_id]))
                if pair in hom_params["pairs"]:
                    station_pairs.append(pair)
            print station_pairs

            # List the remaining changepoints after the "confirmfilt" process
            for cp in sorted_cps:
                cp_stats = station_cp_dict[cp]
                hit_count = cp_stats["jsum"]
                iy, im = imo2iym(cp)
                print (
                    "%3d %5d %6s Estamt chgin: -- %4d %2d %4d %3d" % (ipass, station_index, id, iy, im, cp, hit_count)
                )

            ## ACCUMULATE PAIRED CHANGEPOINTS AND AMPLITUDE ESTIMATES
            # Loop over "brackets" of changepoints - that is, for changepoints
            # [a, b, c, d], consider the two brackets [a,b,c] and [b,c,d] with
            # the center value of the changepoints. Note that in the Fortran PHA,
            # we go through these brackets in reverse order - right to left.
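            # For example (hypothetical months): sorted_cps = [10, 50, 90, 130]
            # yields the right-to-left brackets [(50, 90, 130), (10, 50, 90)].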
            brackets = zip(sorted_cps[-3::-1], sorted_cps[-2::-1], sorted_cps[::-1])
            final_results[id] = dict()
            for bracket in brackets:
                # for bracket in brackets[:1]:
                (left, cp, right) = bracket[:]

                ly, lm = imo2iym(left)
                cpy, cpm = imo2iym(cp)
                ry, rm = imo2iym(right)
                print "Oriented: ", "--", "--", "--", left, cp, cp + 1, right

                # setup the output string for this bracket's tests
                chgptstr = "  Win1: %5d %4d%2d %5d %4d%2d to Win2: %5d %4d%2d %5d %4d%2d" % (
                    left,
                    ly,
                    lm,
                    cp,
                    cpy,
                    cpm,
                    cp,
                    cpy,
                    cpm,
                    right,
                    ry,
                    rm,
                )

                ## THIS SECTION ACCUMULATES TARGET-NEIGHBOR COMPARISONS
                # See if there are enough homogeneous data in the target;
                # check each window
                valid_count_right = len(get_valid_data(station_data[cp + 1 : right + 1], missing_val))
                valid_count_left = len(get_valid_data(station_data[left : cp + 1], missing_val))
                # if the segment length (valid count) is too short, skip this
                # changepoint (for now)
                if valid_count_left < iminlen:
                    print "Adjpass seg2 short ", station_index, id, chgptstr, valid_count_left
                    continue
                if valid_count_right < iminlen:
                    print "Adjpass seg1 short ", station_index, id, chgptstr, valid_count_right
                    continue

                ## We've passed the too-little-data pitfall. Now we are actually going
                ## to go back through our paired neighbors and compute some final
                ## statistics about these changepoints. We'll store them in a
                ## dictionary for later, just like the pair_results dictionary
                ## from splitmerge
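                ## (A sketch of the structure built below, with hypothetical
                ## values: pair_results[neighb_id][cp] = {"adj": 0.6, "cor": 0.8,
                ## "bic": ..., "cmodel": "KTHTPR0", "trend": 0.001, "spanob": 30})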
                pair_results = dict()
                # for (id1, id2) in [("215887", "200779")]:
                for (id1, id2) in station_pairs:
                    # Reset the left, cp, and right indices to the original
                    # bracket we're considering. We are going to be changing them
                    # while we look at this pair
                    (left, cp, right) = bracket[:]

                    ## Figure out which station is the neighbor (not the target
                    ## we're currently considering). At the same time, note that if
                    ## the target is the 2nd changepoint, the adjustments will be
                    ## flipped in sign, so we need to have a correction factor ready
                    correction = 1.0
                    if id == id1:
                        neighb_id = id2
                    else:
                        neighb_id = id1
                        # correction = -1.0

                    # Add this pair to pair_results if it's not already there
                    (ida, idb) = sorted([id1, id2])
                    pair_str = "%s-%s" % (ida, idb)
                    if neighb_id not in pair_results:
                        pair_results[neighb_id] = dict()
                    print pair_str

                    neighb_index = all_station_list.index(neighb_id)
                    neighb_cp_dict = network.raw_series[neighb_id].changepoints

                    neighb_series = network.raw_series[neighb_id]
                    neighb_anomalies = neighb_series.monthly_anomaly_series

                    ## Generate a difference data set for this pair of stations
                    diff_data = diff(station_anomalies, neighb_anomalies)

                    ## It's possible that in the [left, right] bracket we're looking
                    ## at, there's a changepoint in the paired neighbor. We need
                    ## to adjust the endpoints of the bracket to exclude those
                    ## breakpoints
                    # Check right-hand side first and break out if ...
                    right_seg_len = len(get_valid_data(diff_data[cp + 1 : right + 1]))
                    # right_seg_len = len(diff_data[cp+1:right+1])
                    for month in range(cp + 1, right + 1):
                        if month == last:
                            continue

                        # ... we hit a changepoint in the neighbor ...
                        if month in neighb_cp_dict:
                            neighb_hits = neighb_cp_dict[month]["jsum"]
                            right_seg_len = len(get_valid_data(diff_data[cp + 1 : month + 1]))
                            # right_seg_len = len(diff_data[cp+1:month+1])
                            print (
                                "CHG2: ",
                                neighb_index,
                                neighb_id,
                                "num,edit,2b,2e,imo,nhits",
                                right_seg_len,
                                "--",
                                cp + 1,
                                right,
                                month,
                                neighb_hits,
                            )

                            right = month
                            break
                    # ... and the final right-segment is too short
                    print left, cp, right
                    if right_seg_len < iminlen:
                        print (
                            "Low2: ",
                            neighb_index,
                            neighb_id,
                            "num,edit,2b,2e,imo,nhits",
                            right_seg_len,
                            "--",
                            cp + 1,
                            right,
                            month,
                            "--",
                        )
                        continue

                    # Now, check the left-hand side and break out if ...
                    left_seg_len = len(get_valid_data(diff_data[left : cp + 1]))
                    for month in range(cp - 1, left, -1):
                        if month == first:
                            continue

                        # ... we hit a changepoint in the neighbor ...
                        if month in neighb_cp_dict:
                            neighb_hits = neighb_cp_dict[month]["jsum"]
                            left_seg_len = len(get_valid_data(diff_data[month:cp]))
                            # left_seg_len = len(diff_data[month:cp])
                            print (
                                "CHG1: ",
                                neighb_index,
                                neighb_id,
                                "num,edit,1b,1e,imo,nhits",
                                left_seg_len,
                                "--",
                                cp + 1,
                                left,
                                month,
                                neighb_hits,
                            )

                            left = month
                            break
                    # ... and the final left-segment is too short
                    if left_seg_len < iminlen:
                        print (
                            "Low1: ",
                            neighb_index,
                            neighb_id,
                            "num,edit,1b,1e,imo,nhits",
                            left_seg_len,
                            "--",
                            cp + 1,
                            left,
                            month,
                            "--",
                        )
                        continue

                    ## We can now estimate the raw changepoint amplitude using minbic.
                    ## However, we'll short-circuit a lot of the work by telling it to only
                    ## use the KTHTPR0 model (simple step-change model)
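                    ## (KTHTPR0 is assumed here to be the step-change-only
                    ## candidate - a single offset with flat slopes on either
                    ## side - so minbic only has to fit that offset at bp_index.)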
                    (seg_x, seg_data) = range(left + 1, right + 1), diff_data[left + 1 : right + 1]
                    bp_index = cp - (left + 1)
                    # print left, cp, right, "|", bp_index
                    # print left_seg_len, right_seg_len
                    bic_result = minbic(seg_x, seg_data, bp_index, missing_val, models=[("KTHTPR0", kthtpr0)])
                    ## Also check the first-difference correlation between the
                    ## monthly anomaly series
                    station_first_diff = compute_first_diff(station_anomalies, missing_val)
                    neighb_first_diff = compute_first_diff(neighb_anomalies, missing_val)
                    corr = compute_corr(station_first_diff, neighb_first_diff)

                    ## Write out the results of this testing process so far
                    cmodel = bic_result["cmodel"]
                    bic = bic_result["bic"]
                    test_stat = bic_result["test_stat"]
                    crit_val = bic_result["crit_val"]
                    offset = bic_result["offset"]
                    slopes = bic_result["slopes"]
                    left_slope, right_slope = slopes
                    print (
                        "%s %6s-%6s %s %7.2f %7.2f %7.2f %7.2f %7.3f %7.3f -- %d --"
                        % (
                            tstr,
                            id,
                            neighb_id,
                            chgptstr,
                            crit_val,
                            test_stat,
                            offset,
                            corr,
                            left_slope,
                            right_slope,
                            right_seg_len,
                        )
                    )

                    ## Analysis is done.
                    ## Keep the adjustment (offset) for each neighbor/segment,
                    ## set/reset trend for each neighbor/segment
                    ##     the first segment is the left-segment,
                    ##     the second segment is the right-segment
                    ##
                    ## Note that we reset left/right potentially to avoid conflicts
                    ## within the paired neighbor data. However, our estimates of
                    ## trends/offsets associated with the "right" adjacent changepoint
                    ## actually refers to that original right changepoint. We'll
                    ## reset left, cp, and right from the bracket before continuing
                    (left, cp, right) = bracket[:]
                    # Do the left segment first
                    left_dict = dict()
                    left_dict["adj"] = offset * correction
                    left_dict["cor"] = corr
                    left_dict["bic"] = bic
                    left_dict["cmodel"] = cmodel
                    left_dict["trend"] = left_slope
                    left_dict["spanob"] = left_seg_len
                    pair_results[neighb_id][cp] = left_dict

                    # Do the right segment now
                    right_dict = dict()
                    right_dict["adj"] = offset * correction
                    right_dict["cor"] = corr
                    right_dict["bic"] = bic
                    right_dict["cmodel"] = cmodel
                    right_dict["trend"] = right_slope
                    right_dict["spanob"] = right_seg_len

                    if right not in pair_results[neighb_id]:
                        pair_results[neighb_id][right] = right_dict
                    else:
                        # We've already recorded this segment before for the last
                        # changepoint. Update the slopes/spanob count (length of
                        # preceding segment) if the slopes are different and the
                        # length is different.
                        new_trend = slopes[1]
                        new_spanob = right_seg_len
                        old_trend = pair_results[neighb_id][right]["trend"]
                        old_spanob = pair_results[neighb_id][right]["spanob"]
                        if old_trend != new_trend:
                            print (
                                " Seg2 diff: %s %4d old: %7.2f %4d new: %7.2f %4d"
                                % (pair_str, right, old_trend, old_spanob, new_trend, new_spanob)
                            )
                            # if the new count is greater than the old one, the slope
                            # is probably more robust so update those entries.
                            if new_spanob > old_spanob:
                                pair_results[neighb_id][right]["trend"] = new_trend
                                pair_results[neighb_id][right]["spanob"] = new_spanob

                    ## We're done with this pair/changepoint. Summary output -
                    if step == 2:
                        print "itarg,ipair,ichg,numc,iqt,adj,trends: -- -- -- --", cmodel, offset, slopes
                # raw_input("pause")

                ####################################################################
                ## ADJUSTMENT DETERMINATION SECTION
                # Recall the paired-changepoint analyses we just performed, and
                # determine if the potential adjustment is statistically valid
                (left, cp, right) = bracket[:]

                pair_data = []
                for neighb_id in pair_results:
                    if not cp in pair_results[neighb_id]:
                        continue

                    cp_stats = pair_results[neighb_id][cp]
                    adjacent_stats = pair_results[neighb_id][right]

                    trends = (cp_stats["trend"], adjacent_stats["trend"])

                    pair_dict = dict(
                        neighb_id=neighb_id, adj=cp_stats["adj"], cor=cp_stats["cor"], trends=trends, used=True
                    )
                    pair_data.append(pair_dict)

                npairs = len(pair_data)
                if npairs < numclim:
                    print "Adjpass numc low --", station_index, id, left, cp, right, npairs
                    continue

                # Process -
                #    1) Remove both adjustment and trend outliers
                #    2) Calculate median adjustment
                #
                #    filter around inter-quartile range
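                #    A hedged numeric example: if the sorted pair adjustments
                #    give p25=0.2, median=0.5, p75=0.8 and qscale=1.46, then
                #    chg_low = 0.2 - (0.5 - 0.2)*1.46 = -0.238 and
                #    chg_high = 0.8 + (0.8 - 0.5)*1.46 = 1.238; any estimate
                #    outside that range is flagged "X" below.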
                qscale = hom_params["qscale"]
                pair_data = sorted(pair_data, key=operator.itemgetter("adj"))
                pair_chgs = [p["adj"] for p in pair_data]
                chg_25th, chg_median, chg_75th = tukey_med(pair_chgs)

                chg_iqr = chg_75th - chg_25th
                chg_low = chg_25th - (chg_median - chg_25th) * 1.0 * qscale
                chg_high = chg_75th + (chg_75th - chg_median) * 1.0 * qscale
                print (
                    " TRIM p25, p75, pct50, rng, lo, hi: %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
                    % (chg_25th, chg_75th, chg_median, chg_iqr, chg_low, chg_high)
                )
                # If any of the estimated changepoints are outside the statistically
                # robust range we just computed, then flag them as we print them and
                for data in pair_data:
                    neighb_id = data["neighb_id"]
                    neighb_index = all_station_list.index(neighb_id)
                    adj = data["adj"]
                    cor = data["cor"]
                    trends = data["trends"]

                    if not (chg_low < adj < chg_high):
                        data["used"] = False
                    flag = "U" if data["used"] else "X"
                    print ("%s %4d %7.2f %8.4f %8.4f %7.2f" % (flag, neighb_index, adj, trends[0], trends[1], cor))

                valid_adj_count = len([d for d in pair_data if d["used"]])
                if valid_adj_count < numclim:
                    if step == 2:
                        print (
                            "Insuff trimmed mean -- %4d %s %5d %5d %5d %5d"
                            % (station_index, id, left, cp, right, valid_adj_count)
                        )
                        continue

                ## BUG: The code here re-computes the same inter-quartile range as
                ##     above (the 1.0 factor there is a no-op). Curiously, it doesn't
                ##     reject any pairs based on this recomputed range.
                chg_iqr = chg_75th - chg_25th
                chg_low = chg_25th - (chg_median - chg_25th) * qscale
                chg_high = chg_75th + (chg_75th - chg_median) * qscale

                ## Check whether the computed adjustment is significant: if 0 falls
                ## within the inter-quartile range we just computed, then we can't
                ## reject the null hypothesis of no step change, and the adjustment
                ## is zeroed out.
                if chg_high * chg_low > 0.0:
                    # signs are the same, so 0 isn't included in the range.
                    procstr = "CONSHF"
                    sigadj = chg_median
                else:
                    procstr = "ZERSHF"
                    sigadj = 0.0

                final_results[id][cp] = dict(adj=sigadj, std=chg_iqr * 1.0 * qscale, num=npairs)

                print ("%2d %s-%s %s %7.2f" % (station_index, id, procstr, chgptstr, sigadj))

            ## Print some final output about what changepoints remain for this station
            final_station_results = final_results[id]
            final_cps = sorted(final_station_results.keys())
            for cp in final_cps:
                adj = final_station_results[cp]["adj"]
                std = final_station_results[cp]["std"]

                cp_stats = station_cp_dict[cp]
                hit_count = cp_stats["jsum"]
                iy, im = imo2iym(cp)

                print (
                    "-- %5d %s Estamt chgout: -- %4d%2d %5d %5d %7.2f %7.2f"
                    % (station_index + 1, id, iy, im, cp, hit_count, adj, std)
                )
            # raw_input("pause")

        ## Remove the accumulated non-significant changepoints (non-significant
        ## either because there was too much missing data, the target segment was
        ## too short, or the trimmed-mean test could not reject the null
        ## hypothesis of no change)
        for id in station_list:
            station_index = station_list.index(id)

            final_station_results = final_results[id]
            final_cps = sorted(final_station_results.keys())
            for cp in final_cps:
                iy, im = imo2iym(cp)
                cp_index = final_cps.index(cp)
                adj = final_station_results[cp]["adj"]
                std = final_station_results[cp]["std"]
                if adj == 0.0:
                    print ("%s %5d Remove chgpt %5d %4d %2d %4d" % (id, station_index, cp_index, iy, im, cp))
                    del network.raw_series[id].changepoints[cp]
                else:
                    # Update the network's record of changepoints with this new list
                    network.raw_series[id].changepoints[cp]["ahigh"] = adj
                    network.raw_series[id].changepoints[cp]["astd"] = std
            # the changepoint at the first month was removed above; add it back in.
            ## BUG?: `first` here is stale - it is left over from the step loop
            ##     above, so for all but the last station processed it may point
            ##     at the wrong month.
            network.raw_series[id].changepoints[first] = dict(ahigh=0.0, astd=0.0, jsum=0)
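
A minimal, self-contained sketch of the FILTER 4 compression rule used in
estamt() above (the helper and its sample months are illustrative assumptions,
not part of the original module):

def compress_short_segments(cps, first, last, minlenshf=24):
    """Drop changepoints whose bounding segment spans < minlenshf months."""
    bounds = [first] + sorted(cps) + [last]
    keep = set(cps)
    for (cp1, cp2) in zip(bounds[:-1], bounds[1:]):
        if (cp2 - cp1) < minlenshf:
            # Mirror estamt(): drop the right changepoint of the short pair,
            # unless it is the final month, in which case drop the left one.
            keep.discard(cp1 if cp2 == last else cp2)
    return sorted(keep)

print compress_short_segments([100, 110, 200], first=20, last=400)
# -> [100, 200]: the changepoint at 110 sits only 10 months past 100
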
Example #2
def splitmerge(network, pairs=None, beg_year=1, end_year=2, **kwargs):
    
    ## EXPERIMENTAL PLACEHOLDERS - will eventually be replaced with a master
    ## loop to do all the id pairs.
    id_list = network.stations.keys()
    pair_results = dict()
    
    def dict_to_tuples(d):
        keys = d.keys()
        return [(key, d[key]) for key in keys]
    ## Generate station pairs for use in splitmerge by iteratively going through
    ## the station list, adding neighbors in order of decreasing correlation.
    ## Skip a neighbor if the pair is already present; we keep going until we
    ## have 20 stations or all the correlated neighbors are used up.
#    pairs = []
#    for id1 in id_list:
#        neighbors = dict_to_tuples(network.correlations[id1])
#        sorted_neighbors = sorted(neighbors, key=operator.itemgetter(1))
#        added_pairs = 0
#        while sorted_neighbors and (added_pairs < 5):
#            id2, _ = sorted_neighbors.pop()
#            ordered_pair = tuple(sorted((id1, id2)))
#            if not ordered_pair in pairs:
#                pairs.append(ordered_pair)
#                added_pairs += 1
    
    for (id1, id2) in pairs:
        print "Pair %s with %s" % (id1, id2)
        pair_str = "%6s-%6s" % (id1, id2)
        #if pair_str != "051528-298107":
        #    continue
        
        raw_series = network.raw_series
        stations = network.stations
        series_copy = deepcopy(raw_series)
        
        min_ann = 5
        num_years = end_year - beg_year
        num_months = num_years*12
            
        for s in series_copy.itervalues():
            data = s.series
            scaled = scale_series(data, 0.1, s.MISSING_VAL)
            anomalies = compute_monthly_anomalies(scaled, s.MISSING_VAL)
            s.set_series(anomalies, s.years)
        
        ## Retrieve the data for each of the stations.
        station1 = stations[id1]
        series1 = series_copy[id1]
        data1 = series1.monthly_series
                
        station2 = stations[id2]
        series2 = series_copy[id2]
        data2 = series2.monthly_series
        
        
        #print data1[:50]
        #print data2[:50]
        #print "################################################################"
        ## Compute the difference series        
        diff_data = diff(data1, data2)
        MISS = series1.MISSING_VAL # Missing value placeholder
        
        ## Quickly pass through the data to find where it starts. We need to do this
        ## because it's possible that beg_year is earlier than the first year of 
        ## valid data in either data1 or data2. Furthermore, the original PHA code
        ## deliberately clipped off the first year of good data, so emulate that 
        ## effect here as well.
        ##
        ## Ultimately, we save the extreme early and extreme late month with valid
        ## data to use as our first guess at the undocumented changepoints.
        first = 0
        first_set = False
        last = 0
        for (i, d1, d2) in zip(xrange(num_months), data1, data2):
            if d1!=MISS and d2!=MISS:
                if first < 12:
                    first = i
                    #first_set = True
                #if not first_set:
                #    first = i
                #    first_set = True
                last = i
                
        ## Set the initial breakpoints and the list of already-found,
        ## homogeneous segments.
        breakpoints = [first, last, ]
        homog_segs = []
        
        #####################################################################
        ## BEGIN SPLITMERGE PROCESS TO GENERATE FIRST GUESS AT UNDOCUMENTED
        ## CHANGEPOINTS
        iter = 0 # counts how many times we've repeated the splitmerge process
        enter_BIC = False # break out of iterations into the BIC process?
        last_breakpoints = []
        while (iter < 10) and not enter_BIC:
            
            seg_bounds = zip(breakpoints[:-1], breakpoints[1:])
            last_breakpoints = deepcopy(breakpoints)
            new_breakpoints = deepcopy(breakpoints)
                
            new_homog_segs = []
        
            print "Parse segments (isplit = 1), ipass: "******"Too short: ", imo2iym(l), imo2iym(r)
                    continue
                
            ## If we've previously found that this segment is homogenous (has no
            ## potential changepoint), then we can skip it as well and proceed to
            ## the next one.
                # Set the within() method to check if this segment is within any
                # previously found homogenous ones. Use lambda, since we can't pass
                # keyword or positional arguments to map().
                within_this_seg = lambda seg: within((l, r), seg)
                within_stable_segs = map(within_this_seg, homog_segs)
                if any(within_stable_segs):
                    print "Stable segment: ", imo2iym(l), imo2iym(r)
                    if l == first: 
                        new_breakpoints.append(first)
                    continue
                
            ## The standard normal homogeneity test - which is the statistical test
            ## we'll use to see if there is a potential changepoint in this segment
            ## - requires us to normalize our paired difference series. We can do
            ## that in snht(), but we'll do it right now so we can inspect those
            ## standardized values later.
                z = standardize(segment, MISS)

            ## Apply standard normal homogeneity test. 
            ## For mechanics, see Alexandersson and Moberg 1997, Int'l Jrnl of
            ## Climatology (pp 25-34)
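            ## (For reference: with standardized series z and candidate month k,
            ## the SNHT statistic is T(k) = k*zbar1**2 + (n-k)*zbar2**2, where
            ## zbar1 and zbar2 are the means of z before and after k; snht() is
            ## assumed to return T(k) for every candidate k.)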
                likelihood_ratios = snht(z, MISS, standardized=True)
                z_count = len(get_valid_data(z))
                        
            ## We're left with the likelihood ratio for each value being a potential
            ## changepoint. Find the max ratio, and if that value is significant, let
            ## it be the newest potential changepoint.
                ind_max_ratio = 0
                max_ratio = 0.0
                clip_ratios = likelihood_ratios[2:-2] # clip the beginning and end,
                                                      # they can't be changepoints.
                for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios):
                    if ratio > max_ratio:
                        ind_max_ratio = ind
                        max_ratio = ratio
            ## Now we find the critical value for this data set, and check our max
            ## likelihood ratio against it
                crit_val = lrt_lookup(z_count)
                
                # The possible changepoint is the index of the max ratio we found. 
                # We have to shift it the following ways to align it to the original
                # data -
                #    1) shift by 2 re-aligns it from clip_ratios to likelihood_ratios
                #    2) shift by adjust re-aligns it to this segment in diff_data
                #    3) shift by l re-aligns it to the first index in diff_data
                possible_changepoint = l + ind_max_ratio + 2 + adjust
                
                y_new, m_new = imo2iym(possible_changepoint) # year, month
                
            ## If this is the first iteration, we indicate as such, and add the new
            ## changepoint
                if iter == 0: 
                    print "%6s-%6s MD        FIRST series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val)
                    breakpoints.append(possible_changepoint)
                    breakpoints = sorted(breakpoints)
            
                else:
            ## Else, if we found a new possible changepoint, add it to our list.
                    if max_ratio > crit_val:
                        print "%6s-%6s MD Inhomogenity for series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count)
                        new_breakpoints.append(possible_changepoint)
                        
            ## If not, record that we found a homogeneous segment.   
                    else:
                        print "%6s-%6s MD      Homogeneous series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count)
                        new_homog_segs.append((l, r))
            
            ## Now we need to update our account of which segments were homogeneous,
            ## because we need to know during the next iteration. We will do this,
            ## as well as condense stable segments that lie adjacent to each other
            ## i.e., if we have the segments [(1, 5), (5, 10), (12, 15)], then we
            ## really have [(1, 10), (12, 15)].
            homog_segs.extend(new_homog_segs)
            if homog_segs:
                homog_segs = sorted(homog_segs, key=operator.itemgetter(0))
                final_homog_segs = [homog_segs[0], ] # this will be like a stack
                for seg in homog_segs[1:]:
                    last_seg = final_homog_segs[-1]
                    if last_seg[1] == seg[0]:
                        new_seg = (last_seg[0], seg[1])
                        final_homog_segs.pop()
                        final_homog_segs.append(new_seg)
                    else:
                        final_homog_segs.append(seg)
                homog_segs = final_homog_segs
        
            ## So we have new segments that can be generated from these new
            ## breakpoints. Now, the PHA routine enters a "merge" process
            ## to see whether or not to keep these newly found changepoints or throw
            ## them out as false alarms. 
            ##
            ## We do this by "leapfrogging" every other breakpoint. This gives us
            ## a set of segments that all have another breakpoint in them. We want
            ## to see if these segments are homogeneous, because if they are, it
            ## means that the breakpoint we previously found in the segment has 
            ## been superseded.
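            ## e.g. (hypothetical): breakpoints [first, b1, b2, last] give the
            ## leapfrog segments (first, b2) and (b1, last); if (first, b2)
            ## tests homogeneous, the breakpoint b1 inside it is compressed out.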
            new_breakpoints = sorted(new_breakpoints)
            seg_bounds = zip(new_breakpoints[:-2], new_breakpoints[2:])
            
            remove_breakpoints = set()
            merged_breakpoints = set()
            if iter > 0:
                
                print "Merge segments (isplit = 0), ipass: "******"Stable segment: ", imo2iym(l), imo2iym(r)
    #                    if l == first: 
    #                        new_breakpoints.append(first)
    #                    seg_lookup.append(((l, r), 'stable'))
    #                    continue
                    # Set the within() method to check if this segment is within any
                    # previously found homogenous ones. Use lambda, since we can't pass
                    # keyword or positional arguments to map().
                    within_this_seg = lambda seg: within((l, r), seg)
                    within_stable_segs = map(within_this_seg, homog_segs)
                    if any(within_stable_segs):
                        print "Stable segment: ", imo2iym(l), imo2iym(r)
                        #if l == first: 
                        #    new_breakpoints.append(first)
                        merged_breakpoints.update([l, r])
                        continue
            
            ## Apply the same adjustments and the same standard normal homogeneity
            ## test that we did in the previous splitting process. There is no 
            ## difference here until we consider what to do if we find a new 
            ## homogeneous segment.
                    adjust = int(seg_bounds.index((l, r)) > 0)
                    segment = diff_data[l+adjust:r+1]
                    
                    z = standardize(segment, MISS)
                    likelihood_ratios = snht(z, MISS, standardized=True)
                    z_count = len(get_valid_data(z))
                        
                    ind_max_ratio = 0
                    max_ratio = 0.0
                    clip_ratios = likelihood_ratios[2:-2] # We clip the beginning and end
                    for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios):
                        if ratio > max_ratio:
                            ind_max_ratio = ind
                            max_ratio = ratio
                            
                    crit_val = lrt_lookup(z_count)
                    possible_changepoint = l + ind_max_ratio + 2 + adjust
                    
                    y_new, m_new = imo2iym(possible_changepoint)
                    
    
                    if z_count < 2:
                        y1, m1 = imo2iym(l)
                        y2, m2 = imo2iym(r)
                        print "%6s-%6s MD  No found peaks %4d %2d to %4d %2d" % (id1,id2,y1,m1,y2,m2)
                        print "%6s-%6s MD  Compress 1 out peak at %4d %2d" % (id1,id2,y_new,m_new)
                        #remove_breakpoints.add_
            ## If we found a new breakpoint that is statistically significant, then
            ## great! Let's keep it.
                    if max_ratio > crit_val:
                        print "%6s-%6s MD  Peak kept in merge at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val)
                        merged_breakpoints.add(l)
                        merged_breakpoints.add(new_bp)
                        merged_breakpoints.add(r)
            ## If not, then this segment was homogeneous, so the breakpoint which
            ## already exists in it is no good.
                    else:
                        print "%6s-%6s MD Compress 2 out peak at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val)
                        # Crap, if there are any potential breakpoints in this segment,
                        # we need to remove them because this segment is homogeneous. Let's
                        # remember this homogeneous segment for now and come back once
                        # we've found all of them.    
                        merged_breakpoints.update([l, r])
                        remove_breakpoints.add(new_bp)
            
            ## At this point, we have a set of all the breakpoints we've accumulated
            ## during this iteration of split/merge, as well as a set of breakpoints
            ## which we've found to be of no further use. We can difference update
            ## our set of breakpoints to remove these guys, and let those merged
            ## breakpoints be the set of newest breakpoints for the next splitmerge
            ## iteration.
                merged_breakpoints.difference_update(remove_breakpoints)
                breakpoints = list(merged_breakpoints)
            
            breakpoints = sorted(breakpoints)
            
            ## Did we actually find new breakpoints? If not, then we're done
            ## with splitmerge and can move on to the BIC process.
            enter_BIC = (breakpoints == last_breakpoints)
            iter = iter + 1
            
        ## Okay wow, we've potentially made it to the BIC stage now... !
        if first not in breakpoints:
            breakpoints.insert(0, first)
        ym_breakpoints = map(imo2iym, breakpoints)
        #print ym_breakpoints
        
        ## ENTERING MINBIC    
        bp_dictionary = dict()
####################################
##### MULTIPROCESS
        from multiprocessing import Pool

        global counter
        multi_bp_dict = {}
        counter = 0
        def cb(r):
            global counter
            #print counter, r
            counter += 1
        
        start = time.clock()         
        po = Pool(processes=4)
        for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]):
                    
            if left != first:
                left = left + 1
            # recall that we only consider data after the first full year. we will be 
            # computing regressions with the independent variable indexed from this 
            # starting point, so we need to shift these indices. we also need to shift them
            # by +1 if this is any segment beyond the first one, so that we don't include
            # changepoints in more than one analysis.
            # TOTAL_SHIFT = -12 + 1 = -11
            # 
            # However, this shift is only necessary while looking at the array indices that
            # we generate using range(). the data should already be aligned correctly.
            total_shift = -12 + 1
            left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
            y1, m1 = imo2iym(left)
            yb, mb = imo2iym(bp)
            y2, m2 = imo2iym(right)
            #print "Entering MINBIC - %4d %2d    %4d %2d    %4d %2d" % (y1, m1, yb,
            #                                                           mb, y2, m2)
            (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
            bp_index = bp-left
            #print len(seg_x), len(seg_data), bp_index
            #bp_analysis = minbic(seg_x, seg_data, bp_index, MISS)
            multi_bp_dict[bp] = po.apply_async(minbic,(seg_x,seg_data,bp_index,MISS,),callback=cb)
        po.close()
        po.join()
        for bp in multi_bp_dict:
            r = multi_bp_dict[bp]
            multi_bp_dict[bp] = r.get()
        #print "counter - %d" % counter
        elapsed = (time.clock() - start)
        print "ELAPSED TIME - %2.3e" % elapsed
        #print new_bp_dict
####################################
##### NORMAL        
#        start = time.clock()
#        for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]):
#                    
#            if left != first:
#                left = left + 1
#            # recall that we only consider data after the first full year. we will be 
#            # computing regressions with the independent variable indexed from this 
#            # starting point, so we need to shift these indices. we also need to shift them
#            # by +1 if this is any segment beyond the first one, so that we don't include
#            # changepoints in more than one analysis.
#            # TOTAL_SHIFT = -12 + 1 = -11
#            # 
#            # However, this shift is only necessary while looking at the array indices that
#            # we generate using range(). the data should already be aligned correctly.
#            total_shift = -12 + 1
#            left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
#            y1, m1 = imo2iym(left)
#            yb, mb = imo2iym(bp)
#            y2, m2 = imo2iym(right)
#            print "Entering MINBIC - %4d %2d    %4d %2d    %4d %2d" % (y1, m1, yb,
#                                                                       mb, y2, m2)
#            (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
#            bp_index = bp-left
#            #print len(seg_x), len(seg_data), bp_index
#            bp_analysis = minbic(seg_x, seg_data, bp_index, MISS)
#            
#            bp_dictionary[bp] = bp_analysis    
#        elapsed2 = (time.clock() - start)
#        print "ELAPSED TIME = %3.2e" % elapsed2
        
        ##################################
        ## Print the adjustment summaries
        bp_dictionary = multi_bp_dict
        sorted_bps = sorted(bp_dictionary.keys())
        ndelete = []
        valid_bps = {}
        for bp in sorted_bps:
            stats = bp_dictionary[bp]
            
            cmodel=stats['cmodel']
            iqtype=stats['iqtype']
            asigx=stats['offset']
            azscr=stats['offset_z']
            rslp=stats['slopes']
            
            end1 = bp
            y_end1, m_end1 = imo2iym(end1)
            beg2 = bp+1
            y_beg2, m_beg2 = imo2iym(beg2)
            
            # If cmodel is *SLR*, then there is no breakpoint
            if 'SLR' in cmodel:
                print ("%s-%s  --  -- MD TESTSEG SKIP: %7.2f %5d %5d %3d %5d %5d %3d" %
                       (id1, id2, asigx, end1, y_end1, m_end1, beg2, y_beg2, m_beg2))
                # Don't store it!
            else:
                print ("%6s-%6s  --  -- MD TESTSEG ADJ: %7.2f %7.2f %8.4f %8.4f %5d %5d %3d %5d %5d %3d %2d" % 
                       (id1,id2, asigx, azscr, rslp[0], rslp[1], end1, y_end1, m_end1, beg2, y_beg2, m_beg2, iqtype))
                # Store it!
                valid_bps[bp] = stats
        
        ###############################
        ## Go back and see if we can get rid of some of the change points.
        ## If 2 or more of the chgpts are within MINLEN,
        ##    a) if the chgpt estimates are the same sign, then test each
        ##        singly with same endpoints and keep lowest BIC 
        ##    b) if not the same sign,
        ##        retain earliest changepoint
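        ## e.g. (hypothetical): chgpts at months 250 and 258 (within MINLEN)
        ## with offsets +0.8 and +0.5 share a sign, so each is re-tested singly
        ## over the same endpoints and only the lower-BIC one survives; were the
        ## offsets +0.8 and -0.5, the earlier changepoint at 250 would be kept.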
        # add the first, last to valid_bps
        interior_bps = valid_bps.keys()
        # Add first, last if not already in interior_bps
        for bp in [first, last]:
            if bp not in interior_bps:
                interior_bps.append(bp)
        sorted_bps = sorted(interior_bps)
        for left in sorted_bps:
            print sorted_bps, left
            ## We're looking for the next interim breakpoint that satisfies two
            ## conditions:
            ##    1) at least MINLEN valid data (non-missing to the right)
            ##    2) has at least one breakpoint between 'left' and it
            right = 0
            close_bps = []
            for right in sorted_bps: 
                if right <= left: continue
                
                if not close_bps:
                    close_bps.append(right)
                else:
                    valid_between_bps = diff_data[close_bps[-1]:right]
                    valid_length = len(get_valid_data(valid_between_bps, MISS))
                    print imo2iym(close_bps[-1]),valid_length,imo2iym(right)
                    if valid_length > MINLEN:
                        break
                    close_bps.append(right)
            # We could actually run out of things in sorted_bps, and wind up with
            # no interim breakpoints at all or with right == close_bps[-1]. Detect
            # that and break out of this analysis if that happens.
            if not close_bps or close_bps[-1] == right: break
            
            if left != first:
                left = left + 1
            close_bp_results = {}
            for bp in close_bps:
                        
                # recall that we only consider data after the first full year. we will be
                # computing regressions with the independent variable indexed from this
                # starting point, so we need to shift these indices. we also need to shift them
                # by +1 if this is any segment beyond the first one, so that we don't include
                # changepoints in more than one analysis.
                # TOTAL_SHIFT = -12 + 1 = -11
                #
                # However, this shift is only necessary while looking at the array indices that
                # we generate using range(). the data should already be aligned correctly.
                total_shift = -12 + 1
                left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
                y1, m1 = imo2iym(left)
                yb, mb = imo2iym(bp)
                y2, m2 = imo2iym(right)
                
                print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
                print y1,m1,"-",yb,mb,"-",y2,m2
                print "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
                
                (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
                bp_index = bp-left
                bp_analysis = minbic(seg_x, seg_data, bp_index, MISS, kthslr0_on=True)
                
                cmodel=bp_analysis['cmodel']
                iqtype= bp_analysis['iqtype']
                offset= bp_analysis['offset']
                rslp= bp_analysis['slopes']
                crit_val = bp_analysis['crit_val']
                test_stat = bp_analysis['test_stat']
                bic = bp_analysis['bic']
                
                print ("Interim chgpt: %s %4d %2d %4d %2d %4d %2d %8.2f %8.2f %8.2f %8.2f %7.3f %7.3f %2d" %
                       (pair_str, y1, m1, yb, mb, y2, m2, bic, test_stat, crit_val, offset, rslp[0], rslp[1], iqtype))                    
                
                close_bp_results[bp] = bp_analysis

            # Now we have a small problem... we might have more than one breakpoint,
            # so we need to choose which one is best. We will check the sign of
            # the breakpoint amplitude changes:
            sign_of_amps = map(sign, [close_bp_results[bp]['offset'] for bp in close_bps])
            positive = lambda x: sign(x) >= 0
            negative = lambda x: sign(x) <= 0
            zero = lambda x: sign(x) == 0
            print "------------>",[close_bp_results[bp]['offset'] for bp in close_bps]
            if (all(map(positive, sign_of_amps)) or 
                all(map(negative, sign_of_amps))):    
                # Pick the best (minimum BIC)          
                bics = [(bp, close_bp_results[bp]['bic']) for bp in close_bps]
                sorted_bics = sorted(bics, key=operator.itemgetter(1))
                smallest_bp = sorted_bics[0][0]
                
                # Remove this smallest-bic bp from the in-interval bps 
                close_bps.remove(smallest_bp)
                valid_bps[smallest_bp] = close_bp_results[smallest_bp] 
                
                #print "leftovers",close_bps
                for bp in close_bps: # The remaining bps which we will reject
                    sorted_bps.remove(bp) # Remove them from this loop
                    del valid_bps[bp] # Remove them as valid 
                    
                yb, mb = imo2iym(smallest_bp)
                print ("Same domain - Lowest Interim: %s %4d %2d" % 
                       (pair_str, yb, mb))
            elif (all(map(zero, sign_of_amps))):
                # Choose the earliest changepoint; the rest of these have
                # amplitude changes which are 0.
                first_bp, last_bp = close_bps[0], close_bps[-1]
                
                # Remove the first interim bp and update valid_bps with this new
                # computation. 
                close_bps.remove(first_bp)
                valid_bps[first_bp] = close_bp_results[first_bp]
                
                # Reject remaining interim bps
                for bp in close_bps:
                    sorted_bps.remove(bp)
                    del valid_bps[bp]
                    
                yb, mb = imo2iym(first_bp)
                print ("Null domain - Earliest Interim : %s %4d %2d" %
                       (pair_str, yb, mb))
            else:
                # We'll use the earliest interim changepoint, but we need
                # to get rid of bad data. Replace all the data between the 
                # interim changepoints as missing and re-compute BIC.
                first_bp, last_bp = close_bps[0], close_bps[-1]
                first_bp_index = first_bp-left
                last_bp_index = last_bp-left
                
                print len(seg_x), len(seg_data)
                print first_bp_index+1, last_bp_index+1
                print left, bp, right
                for i in range(first_bp_index+1, last_bp_index+1):
                    print i, imo2iym(i), i+left, imo2iym(i+left)
                    seg_x[i] = MISS
                    seg_data[i] = MISS
                    # Recall that seg_data[0] == diff_data[left]. ndelete records
                    # the *true month where there is unviable data*, so it needs to
                    # point back to the original element in diff_data we are 
                    # worried about.
                    ndelete.append(i+left) 
                bp_analysis = minbic(seg_x, seg_data, first_bp_index, MISS, kthslr0_on=True)
                
                # Remove the first interim bp and update valid_bps with this new
                # computation. 
                close_bps.remove(first_bp)
                valid_bps[first_bp] = bp_analysis
                
                # Reject remaining interim bps
                for bp in close_bps:
                    sorted_bps.remove(bp)
                    del valid_bps[bp]
                
                yb, mb = imo2iym(first_bp)
                print ("Diff domain - Earliest Interim : %s %4d %2d" %
                       (pair_str, yb, mb))                
    
        ## Remove changepoints which are an SLR model.
        nspan = [0]*num_months
        bp_count = 1
        for bp in sorted(valid_bps.keys()):
            bp_analysis = valid_bps[bp]
            
            if "SLR" in bp_analysis['cmodel']:
                del valid_bps[bp]
                continue
            
            print "   IN: ",bp
            nspan[bp] = bp_count
            ## If adjacent months are missing next to this breakpoint, then
            ## assume that those could be a breakpoint as well and copy this
            ## breakpoint's analysis results for them.
            for month in range(bp+1, last):
                if (month in ndelete) or (diff_data[month] == MISS):
                    nspan[month] = bp_count
                    print "   IN: ",month
                    valid_bps[month] = bp_analysis
                else:
                    break
            bp_count += 1
            
        valid_bps['del'] = ndelete
        valid_bps['nspan'] = nspan
        pair_results[pair_str] = valid_bps
        
        #print "ELAPSED TIMES = %3.2e %3.2e" % (elapsed1, elapsed2)
    print "done"
    ## Serialize the accumulated pair results for later inspection
    import pickle
    f = open("pair_results", 'wb')
    pickle.dump(pair_results, f)
    f.close()
    return pair_results
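
A short usage sketch for the pickled output (hedged: the key layout follows the
code above, where each pair maps to its valid breakpoints plus the bookkeeping
entries 'del' and 'nspan'):

import pickle

with open("pair_results", "rb") as f:
    pair_results = pickle.load(f)

for pair_str in sorted(pair_results):
    valid_bps = pair_results[pair_str]
    bps = sorted(k for k in valid_bps if k not in ('del', 'nspan'))
    print pair_str, "->", len(bps), "breakpoints,", len(valid_bps['del']), "months deleted"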