def summary_table(station_ids, hits, all_data, print_missing=False, 
                  missing_val=-9999, **hom_params):
    """Print out a summary of a specified working array in matrix form. The 
    matrix will contain each entry in hits, and if print_missing is flagged
    as True, it will check all_data and replace an entry in hits with a mark
    that the value is missing in the original dataset.
    
    :Param station_ids:
        A list of the station_ids corresponding to the first axis in both
        hits and all_data.
    :Param hits:
        The working array to print in the matrix.
    :Param all_data:
        The array to check against for missing values.
    :Param print_missing:
        (optional) Enable printing of locations where "missing values" are
        detected in all_data, as denoted by the given missing_val
    :Param missing_val:
        (optional) Placeholder for missing values in all_data. Default is
        -9999.
    :Return:
        Nothing; prints the 'hits' array to console.
    
    """

    ## Print header - 
    head1 = "              |"+"|".join([i[:3] for i in station_ids])+"|"
    head2 = "              |"+"|".join([i[3:] for i in station_ids])+"|"
    print head1
    print head2
    
    ## Print monthly series basic
    def con2str(data, missing_val=missing_val):
        """Convert entry in the working array into a string to print"""
        val, hits = data
        
        if val == missing_val and print_missing:
            return "-X-"
        elif hits > 0:
            return "%3d" % hits
        else:
            return "---"
        
    num_stations, num_months = all_data.shape
    for imo in xrange(num_months):
        year, month = imo2iym(imo)
        base_str = " %4d %2d %4d |" % (year, month, imo-11)
        month_strs = "|".join(map(con2str, 
                                  zip(all_data[:,imo],hits[:,imo])))+"|"
        print_month_strs = False
        for i in range(10):
            if str(i) in month_strs: 
                print_month_strs = True
                break
        
        if print_month_strs:
            print base_str+month_strs
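
## A minimal usage sketch for summary_table (the station ids, hits, and data
## values below are made up for illustration, and imo2iym is assumed to be
## importable from util, as in the standalone example further down):
#
#   import numpy as np
#   ids = ["011084", "044444"]
#   all_data = np.array([[1.0, -9999.0,     3.0],
#                        [2.0,     2.5, -9999.0]])
#   hits = np.zeros(all_data.shape, dtype=int)
#   hits[:, 1] = 2    # both stations implicated by a pair in month 1
#   summary_table(ids, hits, all_data, print_missing=True)
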
def filter1(network, **hom_params):
    """Attempts to 'unconfound' the undocumented changepoints detected for each
    pair of stations and attribute them to a single station. This first filter
    also takes care of book-keeping by setting up arrays to hold the results of
    the filters employed in the Menne/Williams pairwise homogenization algorithm.
    
    :Param network:
        The network object containing the station/series data and metadata, as
        well as the paired results from the splitmerge process, stored in a 
        dictionary called pair_results:
        :Ivar pair_results:
            A dictionary containing the paired results. Should have keys of the
            form "%s-%s" % (id1, id2), and each key should be associated with a
            dictionary element. That dictionary has keys corresponding to the
            breakpoint and the associated analysis computed by minbic() for that
            breakpoint, as well as the keys "nspan" and "del", which are lists of
            months flagged as suspect during the splitmerge process.
    :Param hom_params:
        The parameters object containing important parameter settings for this
        analysis:
        :Ivar nmo:
            The number of months total in all of the raw series in the network.
    :Return:
        Updates the network object with the following fields:
        :Ivar hits_array:
            A numpy array of dimensions [number of stations, number of months]
            recording the filtered set of changepoints. The array will be 0 
            everywhere except where a station logs a changepoint, and in those
            instances, the array will reflect the number of times that 
            changepoint was implicated in a paired neighbor.
        :Ivar amps_array:
            A numpy array of dimensions [number of stations, number of months]
            recording the normalized changepoint amplitude estimated at a given
            month for a given station. If there is no breakpoint at a given
            entry, records 0.0; else, this is the average of the offsets
            for each paired hitpoint divided by their standard deviation.
        
    """

    ## Setup the data in NumPy arrays for easy inspection.
    station_list = network.stations.keys()
    ids = station_list
    all_data = [network.raw_series[id].monthly_series for id in ids]
    all_data = np.array(all_data)
        
    ################################################################################
    ## PRE FILTER 1
    ## Search through the record of pair results and initially attribute every
    ## detected changepoint to *both* stations in the pair. Also, delete spans
    ## of data which were flagged as suspect or troublesome during the changepoint
    ## detection process.    
    hits = np.zeros_like(all_data)
    
    hits_neighbors = [[list() for i in xrange(hom_params['nmo'])] for i in xrange(len(ids))]
    deleted = np.zeros_like(all_data)
    hits_models = np.zeros(all_data.shape, dtype=np.dtype( (str, 7) ))
    
    for pair in network.pair_results:
        id1, id2 = pair.split("-")
        id1_ind, id2_ind = ids.index(id1), ids.index(id2)
    
        result = network.pair_results[pair]
        for (bp, result) in result.iteritems():
            if bp == "nspan":
                continue
            elif bp == 'del':
                #continue
                ## Store in network
                network.raw_series[id1].delete_months(network.pair_results[pair]['del'])
                network.raw_series[id2].delete_months(network.pair_results[pair]['del'])            
                
                print "DEL", pair, network.pair_results[pair]['del']
                for bad_month in network.pair_results[pair]['del']:
                #    hits[id1_ind, bad_month] += 1
                #    hits[id2_ind, bad_month] += 1
                #    deleted[id1_ind, bad_month] += 1
                #    deleted[id2_ind, bad_month] += 1
                    pass
                
            else:
                hits[id1_ind, bp] += 1
                hits[id2_ind, bp] += 1
           
                hits_neighbors[id1_ind][bp].append(id2)
                hits_neighbors[id2_ind][bp].append(id1)
                
                hits_models[id1_ind, bp] = result['cmodel']
                hits_models[id2_ind, bp] = result['cmodel']

    ## Print a summary of the initial list of suspect changepoints
    summary_table(station_list, hits, all_data, False)    
    
    ################################################################################
    ## FILTER 1
    ## Unconfound the paired changepoints by repeatedly finding the most common
    ## changepoint, attributing it as the culprit for all of its pairs in the
    ## month it occurs, and deleting the hits the culprit has caused. Repeat
    ## until no changepoint in that month has been implicated more than once.
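    ##
    ## Toy example of that accounting (made-up counts): if hits[:, imo] is
    ## [1, 3, 1, 1, 0], station 1 was implicated by three different pairs, so
    ## it claims the break: new_hits[1, imo] = 3 and its own count is reset to
    ## 1, leaving no count above 1 for that month.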
    new_hits = np.zeros_like(hits)
    amps = np.zeros_like(hits)
    
    ## Loop forward through all of the months, analyzing one at a time
    for imo in range(hits.shape[1]):
        
        hits_month = hits[:,imo]
        while hits_month.max() > 1:
            
            max_in_hits = hits_month.max()
            station_index = hits_month.argmax()
        
            station_id = station_list[station_index]
            iy, im = imo2iym(imo)

            ## Find entries in pair_results with this station and a changepoint on this
            ## date
            pr_keys = [key for key in network.pair_results if (station_id in key and
                                                               imo in network.pair_results[key])]

            if not pr_keys: 
                break
            
            offset_sum, offset_z_sum, count = 0.0, 0.0, 0.0
            for key in pr_keys:
                bp_summary = network.pair_results[key][imo]
    
                id1, id2 = key.split("-")
                
                offset, offset_z = bp_summary['offset'], bp_summary['offset_z']
                if station_id == id2: 
                    offset = offset*-1.0            
                    
                offset_sum += offset
                offset_z_sum += abs(offset_z)
                count += 1
            avg_offset = offset_sum/count
            avg_offset_z = offset_z_sum/count
            
            print ("  -- %s-CONFRM MW1 at %5d %4d %2d AVG ADJ: %3.2f %2.2f %3d" % 
                   (station_id, imo, iy, im, avg_offset, avg_offset_z, max_in_hits))
            
            new_hits[station_index,imo] = max_in_hits
            amps[station_index,imo] = avg_offset_z
            hits[station_index,imo] = 1
            hits_month[station_index] = 1
                    
    ## Print a summary of the filtered changepoints.
    summary_table(station_list, new_hits, all_data, False)
    
    ## Update network with this new data
    network.hits_array = new_hits
    network.amps_array = amps
def filter3(network, **hom_params):
    """Reduce the number of detected changepoints by condensing closely-related
    changepoints into one another, by choosing the biggest amplitude change.
    
    :Param network:
        The network object containing the station/series data and metadata, and
        a record of suspect, undocumented changepoints and their approximate
        amplitude:
        :Ivar hits_array:
            A numpy array of dimensions [number of stations, number of months]
            containing the timing of the set of undocumented changepoints 
            filtered so far. 
        :Ivar amps_array:
            A numpy array of dimensions [number of stations, number of months]
            containing the approximate amplitudes of the undocumented 
            changepoints filtered so far.
    :Param hom_params:
        The parameters object containing important parameter settings for this
        analysis:
        <none so far>
    :Return:
        Record the final set of changepoints for each series in the given
        network object as the following field:
        :Ivar changepoints:
            A dictionary containing the changepoint locations and amplitudes. 
            Each key in the dictionary is the integer month index where a 
            breakpoint occurs, and each dictionary has the following fields:
            :Param jsum:
                The number of times this changepoint caused a break to be logged
                in a paired neighbor during splitmerge
            :Param ahigh:
                The amplitude change associated with the changepoint
            :Param astd:
                The error associated with the amplitude change
    """
    
    ################################################################################
    ## FILTER 3 TEST
    ## Try to reduce the number of changepoints by condensing closely-related
    ## changepoints into one another.
    ##
    ## Go back through the years/months
    ##    For each technique
    ##        find the highest remaining hits
    ##        accumulate all of the new_hits and ntests for each month +/- mrgyr 
    ##           (from amps) while skipping missing data
    ##    All new_hits and amps are zeroed out
    ##    For all accumulations within nmrgryr
    ##        when given month are greater or equal to ithresh then add to
    ##           the nhits and ntests arrays
    ##
    ## inconfirm := 2 | the min number of hits for a possible breakpoint to be valid
    ## nmrgyr := -2 | no idea what this does; it defaults to -2 
    station_list = network.stations.keys()
    ids = station_list
        
    inconfirm = 2
    nmrgyr = -2
    
    final_hits = np.zeros_like(network.hits_array)
    ifound = 0

    (num_stations, num_months) = network.amps_array.shape
    for station_index in range(num_stations):
        
        station_id = station_list[station_index]
        data = network.raw_series[station_id].series
        miss = network.raw_series[station_id].MISSING_VAL
        data_monthly = network.raw_series[station_id].monthly_series
        stdk = compute_monthly_avg_std(data)
        
        ## setup temp arrays 
        khits = np.zeros(hom_params['nmo'])
        ktests = np.zeros(hom_params['nmo'])
        akhigh = np.zeros(hom_params['nmo'])
        
        # iterate until there are no more high points
        istop = False
        while not istop:
            
            ihighit, ihighmo, ahigh = 0, 0, 0.0
            
            # find the highest count - the most number of hits at a possible breakpoint
            for month in range(num_months):
                
                isum, asum = 0.0, 0.0
                if network.hits_array[station_index, month] >= inconfirm:
                    jhit = network.hits_array[station_index, month]
                    isum += jhit
                    asum += network.amps_array[station_index, month]*jhit
                
                # find the highest chgpt hit station by hits
                if isum > ihighit: 
                    ihighit = isum
                    ihighmo = month
                    ahigh = asum / isum 
            ## now -
            ##    ihighmo := month of highest hit value
            ##    ihighit := sum of hits over all tests
            ##    ahigh := estimated adjustment
            print "----itarg,ihighit,ihighmo,ahigh,stdk",station_index,ihighit,ihighmo,ahigh,stdk
                    
            #Keep going until there are no more hits
            if ihighit > 0:
                # bracket the highest hit +/- nmrgyr
                if nmrgyr != -2:
                    ibracket = nmrgyr*2 + 1
                else:
                    # else bracket using amplitude of chgpt to define month range
                    ## This are PRE-DEFINED in inhomog.parm.system.mthly.incl. They 
                    ## will need to be re-factored later in to a more logical place
                    ## (Parameters maybe?)
                    arange = [0.4, 0.6, 0.8, 1.0, 1.5, 3.0, 5.0]
                    mrgyr = [36, 18, 12, 8, 6, 5, 5]
                    nrange = len(arange)
                    
                    # Create search bracket for looking at station history files
                    for irange in range(len(arange)):
                        astd = abs(ahigh)
                        if astd < arange[irange]: break
                    ibracket = mrgyr[irange]*2 + 1       
                
                # go through the bracket, look for the highest hits already found
                # start at the hit-point index and expand outward in the series
                # i.e, if our ihighmo = 10, look at [10, 11, 9, 12, 8, 13, 7, ...]
                # keep track of missing values in both directions.
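                # Toy sketch of that visiting order (not part of the filter):
                #   center, max_radius = 10, 3
                #   order = [center]
                #   for radius in range(1, max_radius + 1):
                #       order.extend([center + radius, center - radius])
                #   # order -> [10, 11, 9, 12, 8, 13, 7]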
                max_radius = ibracket/2
                miss_left, miss_right = 0, 0
                radius = 0
                absorbed = False
                while radius < max_radius:
                    
                    ## DEAL WITH THE RIGHT MONTH
                    right_month = ihighmo + radius + miss_right
                    if right_month == hom_params['nmo']:
                        break
                    if data_monthly[right_month] == miss:
                        # this month is missing, go to the next
                        while data_monthly[right_month] == miss:
                            right_month += 1
                    # Absorb lesser hit into the closest higher hit
                    #print right_month, right_month-5, right_month
                    #print np.where(khits > 0)
                    if khits[right_month] > 0:
                        khits[right_month] += ihighit
                        akhigh[right_month] += ahigh*ihighit
                        print "Absorb hit: ",station_index,ihighmo," to ",right_month,khits[right_month],ktests[right_month],akhigh[right_month]/khits[right_month]
                     
                        # zero test array block for next iter
                        network.hits_array[station_index,ihighmo] = 0
                        network.amps_array[station_index,ihighmo] = 0.0
                        absorbed = True
                        break
                     
                    ## DEAL WITH THE LEFT MONTH
                    left_month = ihighmo - radius - miss_left
                    if left_month == 0:
                        break
                    if data_monthly[left_month] == miss:
                        # this month is missing, go to the next
                        while data_monthly[left_month] == miss:
                            left_month -= 1
                    
                    # Absorb lesser hit into the closest higher hit
                    #print left_month, left_month-5, left_month+5
                    #print np.where(khits > 0)
                    if khits[left_month] > 0:
                        khits[left_month] += ihighit
                        akhigh[left_month] += ahigh*ihighit
                        print "Absorb hit: ",station_index,ihighmo," to ",left_month,khits[left_month],ktests[right_month],akhigh[right_month]/khits[right_month]
                        
                        # zero test array block for next iter
                        network.hits_array[station_index,ihighmo] = 0
                        network.amps_array[station_index,ihighmo] = 0.0
                        absorbed = True
                        break
                
                    radius += 1
                        
                # if no hits found, setup new hit
                if not absorbed:
                    khits[ihighmo] = ihighit
                    ktests[ihighmo] = 1
                    akhigh[ihighmo] = ahigh*ihighit
                    print "New CHG hit: ",station_index,ihighmo,khits[ihighmo],ktests[ihighmo],akhigh[ihighmo]/khits[ihighmo]
            
                    network.hits_array[station_index, ihighmo] = 0
                    network.amps_array[station_index, ihighmo] = 0.0
                
                #raw_input("pause")
            else:
                istop = True
                
        print "----------------------------------------------"
        # examine interim khits array for station's filtered changepoints
        uchgpt_dict = dict()
        for month in range(hom_params['nmo']):
            # ... if highest hits > ithres(npair) then save
            # fetch the number of pairs tested
            
            if khits[month] > 0:
                npair = ktests[month]
                jsum = khits[month]
                
                ihthres = 2
                iy,im = imo2iym(month)
                print "itarg,imo,iym,npair,jsum,ihthres,stdk",station_index,month,iy,im,npair,jsum,ihthres,stdk
                if jsum >= ihthres:
                    # passed threshold test- put interim into final
                    final_hits[station_index, month] += jsum
                    ifound += 1
                    
                    # debug stuff
                    ahigh = akhigh[month]/khits[month]
                    astd = ahigh
                    print ("%5d %6s-UCHGPT KW%1d at %4d %3d %4d %6.2f %6.2f %3d %3d %3d" % 
                           (station_index, station_id, 1, iy, im, jsum, ahigh, astd, ibracket, npair, ihthres) )
                    
                    uchgpt_dict[month] = { 'jsum': jsum,
                                           'ahigh': ahigh,
                                           'astd': astd }
                    
        network.raw_series[station_id].changepoints = uchgpt_dict
        
    print "-------------------------------------------------"
    print "Undoc filter: ",ifound
def filter2(network, **hom_params):
    """Reconciles the detected undocumented changepoints with documented ones,
    if available, by "absorbing" detected changepoints near these known breaks
    in the data. 
    
    TODO: implement this functionality
    
    :Param network:
        The network object containing the station/series data and metadata, and
        a record of suspect, undocumented changepoints and their approximate
        amplitude:
        :Ivar hits_array:
            A numpy array of dimensions [number of stations, number of months]
            containing the timing of the set of undocumented changepoints 
            filtered so far. 
        :Ivar amps_array:
            A numpy array of dimensions [number of stations, number of months]
            containing the approximate amplitudes of the undocumented 
            changepoints filtered so far.
    :Param hom_params:
        The parameters object containing important parameter settings for this
        analysis:
        <none so far>
    :Return:
        Nothing; currently just prints some summary information to the console.
        
    """
    
    ################################################################################
    ## FILTER 2
    ## won't actually do anything but print things to console for now
    ## Use station history and metadata to absorb undocumented changepoints
    ## to known ones where possible
    station_list = network.stations.keys()
    ids = station_list
    
    (num_stations, num_months) = network.amps_array.shape
    for station_index in range(num_stations):
        
        station_id = station_list[station_index]
        station_series = network.raw_series[station_id]
        data = station_series.series
        scale_series(data, 0.1, station_series.MISSING_VAL)
        stdk = compute_monthly_avg_std(data)
        
        for month in range(num_months):
            
            if network.hits_array[station_index, month] > 0:
                ahigh = network.amps_array[station_index, month]
                jsum = network.hits_array[station_index, month]
                
                astd = abs(ahigh)
                
                iy, im = imo2iym(month)
                
                ## These are PRE-DEFINED in inhomog.parm.system.mthly.incl. They 
                ## will need to be re-factored later in to a more logical place
                ## (Parameters maybe?)
                arange = [0.4, 0.6, 0.8, 1.0, 1.5, 3.0, 5.0]
                mrgyr = [36, 18, 12, 8, 6, 5, 5]
                nrange = len(arange)
                
                # Create search bracket for looking at station history files
                for irange in range(len(arange)):
                    if astd < arange[irange]: break
                ibracket = mrgyr[irange]*2 + 1
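                # Worked example (illustrative): with astd = 0.7 the loop stops
                # at arange[2] = 0.8, so mrgyr[2] = 12 and
                # ibracket = 12*2 + 1 = 25 months.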
                
                print "ASTD: ",station_index,station_id,"1",iy,im,astd,ahigh,stdk,jsum,ibracket
Example #5
## Print monthly series basic
from util import imo2iym
#con2str = lambda val, miss=-9999: "---" if val != miss else "-x-"
def con2str(data, missing_val=-9999):
    val, hits = data
    #if val == missing_val:
    #    return "-x-"
    if hits > 0:
        return "%3d" % hits
    else:
        return "---"
     
for imo in xrange(hom_params.nmo):
#for imo in xrange(10):
    year, month = imo2iym(imo)
    base_str = "%4d %2d %4d |" % (year, month, imo)
    month_strs = "|".join(map(con2str, zip(all_data[:,imo],
                                           new_hits[:,imo])))+"|"
    print_month_strs = False
    for i in range(10):
        if str(i) in month_strs: 
            print_month_strs = True
            break
    
    if print_month_strs:
        print base_str+month_strs


# Test plot
#pair_str = pair_results.keys()[-1]
Example #6
def estamt(network, minlenshf=24, **hom_params):
    """
    COPIED FROM ucpmonthly.v24a.f:
    
    The major steps in determining the best adjustment value for each station
    and changepoint. Entire network undergoes each of the following processes.
    In order:
        1) Remove unusable data. Align moves with respect to non-missing data
        and compress out changes that are too close AND the data between them.
        
        2) ISTEP=2 processing begins the adjustment process by removing the 
        non-significant changepoints to lengthen segments.
        
        3) NPASS (:= ISTEP=3) finishes the adjustment process by testing for the
        minimum number of months in a segment and number of neighbors with which
        the difference series can be examined.
        
        4) Final adjusted output is written.
    """

    ## FILTER 4
    ## Since the amplitude estimate MUST rely upon a minimum of MINLEN months to
    ## get even close to a reliable estimate at this point, it is assumed that
    ## the changepoints are as good as the station history files. Therefore,
    ## align moves with respect to non-missing data and compress out changes
    ## that are too close AND the data between them (i.e., less than MINLEN
    ## apart)
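    ## Illustrative example (made-up months, minlenshf=24): with valid data
    ## from month 12 through 400 and changepoints at months 60 and 70, the
    ## 60-70 segment is only 10 months long, so months 61-70 are deleted and
    ## the changepoint at 70 is dropped; the one at 60 is kept.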

    # station_list = network.stations.keys()
    all_station_list = network.stations.keys()
    # station_list = ["215887", ]
    station_list = all_station_list

    # for each station...
    for id in station_list:
        station_index = station_list.index(id)
        station_series = network.raw_series[id]
        station_data = station_series.monthly_series[:]
        missing_val = station_series.MISSING_VAL

        # ... gen arrays for alignment
        move, amt, mday = [], [], []
        changepoints = station_series.changepoints
        cps = sorted(changepoints.keys())
        for cp in cps:
            print "  Hist move: ", len(move) + 1, station_index + 1, imo2iym(cp)
            move.append(cp)
            amt.append(changepoints[cp]["jsum"])
            mday.append(31)
        movnum = len(changepoints)

        if movnum > 0:
            ## At this point, the Fortran code executes alignmoves() in
            ## SHAPinp.v6c.f to reconcile the fact that station history files
            ## report dates of moves. It also removes segments that are too short
            ## - less than minlenshf. Instead of implementing alignmoves(), right
            ## now I'll only implement this second piece of functionality.
            # alignmoves()

            ####################################################################
            # Seek to find first and last month indices
            first_set = False
            for month in range(len(station_data)):
                # Skip first year
                if month < 12:
                    continue

                if station_data[month] != missing_val:
                    if not first_set:
                        first = month
                        first_set = True
                    last = month

            cps = sorted(changepoints.keys())
            cps.insert(0, first)
            cps.append(last)
            for (cp1, cp2) in zip(cps[:], cps[1:]):
                if (cp2 - cp1) < minlenshf:
                    months_to_delete = range(cp1 + 1, cp2 + 1)
                    network.raw_series[id].delete_months(months_to_delete)

                    if cp2 == last:
                        del_key = cp1
                    else:
                        del_key = cp2
                    if del_key in network.raw_series[id].changepoints:
                        # print len(network.raw_series[id].changepoints.keys()),
                        del network.raw_series[id].changepoints[del_key]
                        # print len(network.raw_series[id].changepoints.keys()),
                        # raw_input("pause")

                    del_str = "Del 1st segment: " if cp1 == first else "Delete segment: "

                    print id, station_index + 1, del_str, imo2iym(cp1), cp1, imo2iym(cp2), cp2

            new_changepoints = network.raw_series[id].changepoints
            new_cps = sorted(new_changepoints.keys())
            print "  First data value: ", imo2iym(first)
            for cp in new_cps:
                print "  End seg:", new_cps.index(cp), " ym: ", imo2iym(cp), cp, new_changepoints[cp]["jsum"]
            print "    End segment ym: ", imo2iym(last), last

            # Finally, add first and last value to the list of changepoints.
            first_stats = dict(ahigh=0.0, astd=0.0, jsum=0)
            last_stats = dict(ahigh=0.0, astd=0.0, jsum=0)
            network.raw_series[id].changepoints[first] = first_stats
            network.raw_series[id].changepoints[last] = last_stats
            ####################################################################

    ## Series of debug print statements summarizing the final list of
    ## changepoints. Not necessary at the moment

    ############################################################################
    # The subnetwork processing became a multi-step process plus a "post-process
    # pass" to manage:
    #    1) problems with documented changepoints with NO undocumented support
    #    2) determine the best amplitude estimation for each confirmed changepoint

    for step in [2, 3]:
        ## Setup output strings based on the step used. Only cosmetic differences
        ## really.

        iminlen = hom_params["minlen"]
        numclim = 3
        ## STEP 1 - NEVER USED (technically, the history consideration was done previously)
        if step == 1:
            continue
        elif step == 2:
            ## STEP 2 - NOT SIG REMOVAL
            ## equivalent to ipass loopback for istep == 2 in Fortran PHA
            print " ---------------- NOT SIG REMOVAL --------------- "
            tstr = "Not sig: "
            outid = "NS"
            ipass = 1
        elif step == 3:
            ## STEP 3 - ADJUSTMENT OF DISCONTINUITIES
            # equivalent to ipass loopback for istep == 3 in FORTRAN PHA
            print " ---------------- ADJUST DISCONTINUITY STEP --------------- "
            print "Adjpass, iminlen, numclim", "--", iminlen, numclim
            print " ---------------- NPASS --------------- "
            tstr = "Dstep Dtrend: "
            outid = "WM"
            ipass = ipass + 1

        final_results = dict()
        print "  NET   STN    FILT TECH      ------ AFTER ------    ------ BEFORE ------"
        # Process each station and its network of neighbors
        for id in station_list:
            station_index = station_list.index(id)

            station_cp_dict = network.raw_series[id].changepoints
            sorted_cps = sorted(station_cp_dict.keys())
            ## If there are no breakpoints...
            if not sorted_cps:
                final_results[id] = dict()
                continue

            station_series = network.raw_series[id]
            station_data = station_series.monthly_series
            missing_val = station_series.MISSING_VAL

            # compute monthly anomalies for this station data
            station_anomalies = station_series.monthly_anomaly_series

            # What are the first and last valid months in this station's data set?
            # We've saved them as the first and last changepoint before...
            first = sorted_cps[0]
            last = sorted_cps[-1]

            # What are the pairs to this station that we need to consider?
            station_pairs = []
            for other_id in all_station_list:
                pair = tuple(sorted([id, other_id]))
                if pair in hom_params["pairs"]:
                    station_pairs.append(pair)
            print station_pairs

            # List the remaining changepoints after the "confirmfilt" process
            for cp in sorted_cps:
                cp_stats = station_cp_dict[cp]
                hit_count = cp_stats["jsum"]
                iy, im = imo2iym(cp)
                print (
                    "%3d %5d %6s Estamt chgin: -- %4d %2d %4d %3d" % (ipass, station_index, id, iy, im, cp, hit_count)
                )

            ## ACCUMULATE PAIRED CHANGEPOINTS AND AMPLITUDE ESTIMATES
            # Loop over "brackets" of changepoints - that is, for changepoints
            # [a, b, c, d], consider the two brackets [a,b,c] and [b,c,d] with
            # the center value of the changepoints. Note that in the Fortran PHA,
            # we go through these brackets in reverse order - right to left.
            brackets = zip(sorted_cps[-3::-1], sorted_cps[-2::-1], sorted_cps[::-1])
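            # e.g. (illustrative months) sorted_cps = [12, 60, 180, 400] yields
            # brackets = [(60, 180, 400), (12, 60, 180)], i.e. right-to-left.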
            final_results[id] = dict()
            for bracket in brackets:
                # for bracket in brackets[:1]:
                (left, cp, right) = bracket[:]

                ly, lm = imo2iym(left)
                cpy, cpm = imo2iym(cp)
                ry, rm = imo2iym(right)
                print "Oriented: ", "--", "--", "--", left, cp, cp + 1, right

                # setup the output string for this bracket's tests
                chgptstr = "  Win1: %5d %4d%2d %5d %4d%2dto Win2: %5d %4d%2d %5d %4d%2d" % (
                    left,
                    ly,
                    lm,
                    cp,
                    cpy,
                    cpm,
                    cp,
                    cpy,
                    cpm,
                    right,
                    ry,
                    rm,
                )

                ## THIS SECTION ACCUMULATES TARGET-NEIGHBOR COMPARISONS
                # See if there are enough homogeneous data in the target;
                # check each window
                valid_count_right = len(get_valid_data(station_data[cp + 1 : right + 1], missing_val))
                valid_count_left = len(get_valid_data(station_data[left : cp + 1], missing_val))
                # if the segment length (valid count) is too short, skip this
                # changepoint (for now)
                if valid_count_left < iminlen:
                    print "Adjpass seg2 short ", station_index, id, chgptstr, valid_count_left
                    continue
                if valid_count_right < iminlen:
                    print "Adjpass seg1 short ", station_index, id, chgptstr, valid_count_right
                    continue

                ## We've passed the too-little-data pitfall. Now, we are actually going
                ## to go back through our paired neighbors and compute some final
                ## statistics about these changepoints. We'll store them in a
                ## dictionary for later, just like the pair_results dictionary
                ## from splitmerge
                pair_results = dict()
                # for (id1, id2) in [("215887", "200779")]:
                for (id1, id2) in station_pairs:
                    # Reset the left, cp, and right indices to the original
                    # bracket we're considering. We are going to be changing them
                    # while we look at this pair
                    (left, cp, right) = bracket[:]

                    ## Figure out which station is the neighbor (not the target
                    ## we're currently considering). At the same time, note that if
                    ## the target is the 2nd changepoint, the adjustments will be
                    ## flipped in sign, so we need to have a correction factor ready
                    correction = 1.0
                    if id == id1:
                        neighb_id = id2
                    else:
                        neighb_id = id1
                        # correction = -1.0

                    # Add this pair to pair_results if it's not already there
                    (ida, idb) = sorted([id1, id2])
                    pair_str = "%s-%s" % (ida, idb)
                    if neighb_id not in pair_results:
                        pair_results[neighb_id] = dict()
                    print pair_str

                    neighb_index = all_station_list.index(neighb_id)
                    neighb_cp_dict = network.raw_series[neighb_id].changepoints

                    neighb_series = network.raw_series[neighb_id]
                    neighb_anomalies = neighb_series.monthly_anomaly_series

                    ## Generate a difference series for this pair of stations
                    diff_data = diff(station_anomalies, neighb_anomalies)

                    ## It's possible that in the [left, right] bracket we're looking
                    ## at, there's a changepoint in the paired neighbor. We need
                    ## to adjust the endpoints of the bracket to exclude those
                    ## breakpoints
                    # Check right-hand side first and break out if ...
                    right_seg_len = len(get_valid_data(diff_data[cp + 1 : right + 1]))
                    # right_seg_len = len(diff_data[cp+1:right+1])
                    for month in range(cp + 1, right + 1):
                        if month == last:
                            continue

                        # ... we hit a changepoint in the neighbor ...
                        if month in neighb_cp_dict:
                            neighb_hits = neighb_cp_dict[month]["jsum"]
                            right_seg_len = len(get_valid_data(diff_data[cp + 1 : month + 1]))
                            # right_seg_len = len(diff_data[cp+1:month+1])
                            print (
                                "CHG2: ",
                                neighb_index,
                                neighb_id,
                                "num,edit,2b,2e,imo,nhits",
                                right_seg_len,
                                "--",
                                cp + 1,
                                right,
                                month,
                                neighb_hits,
                            )

                            right = month
                            break
                    # ... and the final right-segment is too short
                    print left, cp, right
                    if right_seg_len < iminlen:
                        print (
                            "Low2: ",
                            neighb_index,
                            neighb_id,
                            "num,edit,2b,2e,imo,nhits",
                            right_seg_len,
                            "--",
                            cp + 1,
                            right,
                            month,
                            "--",
                        )
                        continue

                    # Now, check the left-hand side and break out if ...
                    left_seg_len = len(get_valid_data(diff_data[left : cp + 1]))
                    for month in range(cp - 1, left, -1):
                        if month == first:
                            continue

                        # ... we hit a changepoint in the neighbor ...
                        if month in neighb_cp_dict:
                            neighb_hits = neighb_cp_dict[month]["jsum"]
                            left_seg_len = len(get_valid_data(diff_data[month:cp]))
                            # left_seg_len = len(diff_data[month:cp])
                            print (
                                "CHG1: ",
                                neighb_index,
                                neighb_id,
                                "num,edit,1b,1e,imo,nhits",
                                left_seg_len,
                                "--",
                                cp + 1,
                                left,
                                month,
                                neighb_hits,
                            )

                            left = month
                            break
                    # ... and the final left-segment is too short
                    if left_seg_len < iminlen:
                        print (
                            "Low1: ",
                            neighb_index,
                            neighb_id,
                            "num,edit,1b,1e,imo,nhits",
                            left_seg_len,
                            "--",
                            cp + 1,
                            left,
                            month,
                            "--",
                        )
                        continue

                    ## We can now estimate the raw changepoint amplitude using minbic.
                    ## However, we'll short-circuit a lot of the work by telling it to only
                    ## use the KTHTPR0 model (simple step-change model)
                    (seg_x, seg_data) = range(left + 1, right + 1), diff_data[left + 1 : right + 1]
                    bp_index = cp - (left + 1)
                    # print left, cp, right, "|", bp_index
                    # print left_seg_len, right_seg_len
                    bic_result = minbic(seg_x, seg_data, bp_index, missing_val, models=[("KTHTPR0", kthtpr0)])
                    ## Also check the first difference correlations between the
                    ## monthly anomalies
                    station_first_diff = compute_first_diff(station_anomalies, missing_val)
                    neighb_first_diff = compute_first_diff(neighb_anomalies, missing_val)
                    corr = compute_corr(station_anomalies, neighb_anomalies)

                    ## Write out the results of this testing process so far
                    cmodel = bic_result["cmodel"]
                    bic = bic_result["bic"]
                    test_stat = bic_result["test_stat"]
                    crit_val = bic_result["crit_val"]
                    offset = bic_result["offset"]
                    slopes = bic_result["slopes"]
                    left_slope, right_slope = slopes
                    print (
                        "%s %6s-%6s %s %7.2f %7.2f %7.2f %7.2f %7.3f %7.3f -- %d --"
                        % (
                            tstr,
                            id,
                            neighb_id,
                            chgptstr,
                            crit_val,
                            test_stat,
                            offset,
                            corr,
                            left_slope,
                            right_slope,
                            right_seg_len,
                        )
                    )

                    ## Analysis is done.
                    ## Keep the adjustment (offset) for each neighbor/segment,
                    ## set/reset trend for each neighbor/segment
                    ##     the first segment is the left-segment,
                    ##     the second segment is the right-segment
                    ##
                    ## Note that we reset left/right potentially to avoid conflicts
                    ## within the paired neighbor data. However, our estimates of
                    ## trends/offsets associated with the "right" adjacent changepoint
                    ## actually refers to that original right changepoint. We'll
                    ## reset left, cp, and right from the bracket before continuing
                    (left, cp, right) = bracket[:]
                    # Do the left segment first
                    left_dict = dict()
                    left_dict["adj"] = offset * correction
                    left_dict["cor"] = corr
                    left_dict["bic"] = bic
                    left_dict["cmodel"] = cmodel
                    left_dict["trend"] = left_slope
                    left_dict["spanob"] = left_seg_len
                    pair_results[neighb_id][cp] = left_dict

                    # Do the right segment now
                    right_dict = dict()
                    right_dict["adj"] = offset * correction
                    right_dict["cor"] = corr
                    right_dict["bic"] = bic
                    right_dict["cmodel"] = cmodel
                    right_dict["trend"] = right_slope
                    right_dict["spanob"] = right_seg_len

                    if right not in pair_results[neighb_id]:
                        pair_results[neighb_id][right] = right_dict
                    else:
                        # We've already recorded this segment before for the last
                        # changepoint. Update the slopes/spanob count (length of
                        # preceding segment) if the slopes are different and the
                        # length is different.
                        new_trend = slopes[1]
                        new_spanob = right_seg_len
                        old_trend = pair_results[neighb_id][right]["trend"]
                        old_spanob = pair_results[neighb_id][right]["spanob"]
                        if old_trend != new_trend:
                            print (
                                " Seg2 diff: %s %4d old: %7.2f %4d new: %7.2f %4d"
                                % (pair_str, right, old_trend, old_spanob, new_trend, new_spanob)
                            )
                            # if the new count is greater than the old one, the slope
                            # is probably more robust so update those entries.
                            if new_spanob > old_spanob:
                                pair_results[neighb_id][right]["trend"] = new_trend
                                pair_results[neighb_id][right]["spanob"] = new_spanob

                    ## We're done with this pair/changepoint. Summary output -
                    if step == 2:
                        print "itarg,ipair,ichg,numc,iqt,adj,trends: -- -- -- --", cmodel, offset, slopes
                # raw_input("pause")

                ####################################################################
                ## ADJUSTMENT DETERMINATION SECTION
                # Recall the paired-changepoint analyses we just performed, and
                # determine if the potential adjustment is statistically valid
                (left, cp, right) = bracket[:]

                pair_data = []
                for neighb_id in pair_results:
                    if not cp in pair_results[neighb_id]:
                        continue

                    cp_stats = pair_results[neighb_id][cp]
                    adjacent_stats = pair_results[neighb_id][right]

                    trends = (cp_stats["trend"], adjacent_stats["trend"])

                    pair_dict = dict(
                        neighb_id=neighb_id, adj=cp_stats["adj"], cor=cp_stats["cor"], trends=trends, used=True
                    )
                    pair_data.append(pair_dict)

                npairs = len(pair_data)
                if npairs < numclim:
                    print "Adjpass numc low --", station_index, id, left, cp, right, npairs
                    continue

                # Process -
                #    1) Remove both adjustment and trend outliers
                #    2) Calculate median adjustment
                #
                #    filter around inter-quartile range
                qscale = hom_params["qscale"]
                pair_data = sorted(pair_data, key=operator.itemgetter("adj"))
                pair_chgs = [p["adj"] for p in pair_data]
                chg_25th, chg_median, chg_75th = tukey_med(pair_chgs)

                chg_iqr = chg_75th - chg_25th
                chg_low = chg_25th - (chg_median - chg_25th) * 1.0 * qscale
                chg_high = chg_75th + (chg_75th - chg_median) * 1.0 * qscale
                print (
                    " TRIM p25, p75, pct50, rng, lo, hi: %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
                    % (chg_25th, chg_75th, chg_median, chg_iqr, chg_low, chg_high)
                )
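                # Worked example (illustrative, with qscale = 2.0): suppose
                # tukey_med returns p25 = 0.1, median = 0.2, p75 = 0.3. Then
                # chg_low = 0.1 - (0.2 - 0.1)*2 = -0.1 and
                # chg_high = 0.3 + (0.3 - 0.2)*2 = 0.5, so pair estimates like
                # -0.5 or 1.5 fall outside and are flagged "X" below; and since
                # 0 lies inside (-0.1, 0.5), the shift would later be zeroed
                # out (ZERSHF).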
                # If any of the estimated changepoints are outside the statistically
                # robust range we just computed, then flag them as we print them and
                for data in pair_data:
                    neighb_id = data["neighb_id"]
                    neighb_index = all_station_list.index(neighb_id)
                    adj = data["adj"]
                    cor = data["cor"]
                    trends = data["trends"]

                    if not (chg_low < adj < chg_high):
                        data["used"] = False
                    flag = "U" if data["used"] else "X"
                    print ("%s %4d %7.2f %8.4f %8.4f %7.2f" % (flag, neighb_index, adj, trends[0], trends[1], cor))

                valid_adj_count = len([d for d in pair_data if d["used"]])
                if valid_adj_count < numclim:
                    if step == 2:
                        print (
                            "Insuff trimmed mean -- %4d %s %5d %5d %5d %5d"
                            % (station_index, id, left, cp, right, valid_adj_count)
                        )
                        continue

                ## BUG: The code here re-computes the inter-quartile range by
                ##     scaling qscale by 1.0. Curiously, it doesn't reject any
                ##     pairs based on this new range.
                chg_iqr = chg_75th - chg_25th
                chg_low = chg_25th - (chg_median - chg_25th) * qscale
                chg_high = chg_75th + (chg_75th - chg_median) * qscale

                ## Check whether the computed adjustment is significant: if 0 is
                ## included within the trimmed range we just computed, we can't
                ## claim the shift is significantly different from zero, so the
                ## adjustment is zeroed out.
                if chg_high * chg_low > 0.0:
                    # signs are the same, so 0 isn't included in the range.
                    procstr = "CONSHF"
                    sigadj = chg_median
                else:
                    procstr = "ZERSHF"
                    sigadj = 0.0

                final_results[id][cp] = dict(adj=sigadj, std=chg_iqr * 1.0 * qscale, num=npairs)

                print ("%2d %s-%s %s %7.2f" % (station_index, id, procstr, chgptstr, sigadj))

            ## Print some final output about what changepoints remain for this station
            final_station_results = final_results[id]
            final_cps = sorted(final_station_results.keys())
            for cp in final_cps:
                adj = final_station_results[cp]["adj"]
                std = final_station_results[cp]["std"]

                cp_stats = station_cp_dict[cp]
                hit_count = cp_stats["jsum"]
                iy, im = imo2iym(cp)

                print (
                    "-- %5d %s Estamt chgout: -- %4d%2d %5d %5d %7.2f %7.2f"
                    % (station_index + 1, id, iy, im, cp, hit_count, adj, std)
                )
            # raw_input("pause")

        ## Remove the accumulated non-significant changepoints (either non-sig because
        ## there was too much missing data, the target segment was too short, or the
        ## trimmed mean test could not reject the null hypothesis of no change
        for id in station_list:
            station_index = station_list.index(id)

            final_station_results = final_results[id]
            final_cps = sorted(final_station_results.keys())
            for cp in final_cps:
                iy, im = imo2iym(cp)
                cp_index = final_cps.index(cp)
                adj = final_station_results[cp]["adj"]
                std = final_station_results[cp]["std"]
                if adj == 0.0:
                    print ("%s %5d Remove chgpt %5d %4d %2d %4d" % (id, station_index, cp_index, iy, im, cp))
                    del network.raw_series[id].changepoints[cp]
                else:
                    # Update the network's record of changepoints with this new list
                    network.raw_series[id].changepoints[cp]["ahigh"] = adj
                    network.raw_series[id].changepoints[cp]["astd"] = std
            # the changepoint at first month has been removed; add it back in
            network.raw_series[id].changepoints[first] = dict(ahigh=0.0, astd=0.0, jsum=0)
Example #7
def splitmerge(network, pairs=None, beg_year=1, end_year=2, **kwargs):
    
    ## EXPERIMENTAL PLACEHOLDERS - will eventually be replaced with a master
    ## loop to do all the id pairs.
    id_list = network.stations.keys()
    pair_results = dict()
    
    def dict_to_tuples(d):
        keys = d.keys()
        return [(key, d[key]) for key in keys]
    ## Generate station pairs for use in splitmerge by iteratively going through the
    ## station_list and adding stations in order of decreasing correlation. Skip a 
    ## neighbor if the pair is already present; want 20 stations or until all the
    ## correlated neighbors are used up.
#    pairs = []
#    for id1 in id_list:
#        neighbors = dict_to_tuples(network.correlations[id1])
#        sorted_neighbors = sorted(neighbors, key=operator.itemgetter(1))
#        added_pairs = 0
#        while sorted_neighbors and (added_pairs < 5):
#            id2, _ = sorted_neighbors.pop()
#            ordered_pair = tuple(sorted((id1, id2)))
#            if not ordered_pair in pairs:
#                pairs.append(ordered_pair)
#                added_pairs += 1
    
    for (id1, id2) in pairs:
        print "Pair %s with %s" % (id1, id2)
        pair_str = "%6s-%6s" % (id1, id2)
        #if pair_str != "051528-298107":
        #    continue
        
        raw_series = network.raw_series
        stations = network.stations
        series_copy = deepcopy(raw_series)
        
        min_ann = 5
        num_years = end_year - beg_year
        num_months = num_years*12
            
        for s in series_copy.itervalues():
            data = s.series
            scaled = scale_series(data, 0.1, s.MISSING_VAL)
            anomalies = compute_monthly_anomalies(scaled, s.MISSING_VAL)
            s.set_series(anomalies, s.years)
        
        ## Retrieve the data for each of the stations.
        station1 = stations[id1]
        series1 = series_copy[id1]
        data1 = series1.monthly_series
                
        station2 = stations[id2]
        series2 = series_copy[id2]
        data2 = series2.monthly_series
        
        
        #print data1[:50]
        #print data2[:50]
        #print "################################################################"
        ## Compute the difference series        
        diff_data = diff(data1, data2)
        MISS = series1.MISSING_VAL # Missing value placeholder
        
        ## Quickly pass through the data to find where it starts. We need to do this
        ## because it's possible that beg_year is earlier than the first year of 
        ## valid data in either data1 or data2. Furthermore, the original PHA code
        ## deliberately clipped off the first year of good data, so emulate that 
        ## effect here as well.
        ##
        ## Ultimately, we save the extreme early and extreme late month with valid
        ## data to use as our first guess at the undocumented changepoints.
        first = 0
        first_set = False
        last = 0
        for (i, d1, d2) in zip(xrange(num_months), data1, data2):
            if d1!=MISS and d2!=MISS:
                if first < 12:
                    first = i
                    #first_set = True
                #if not first_set:
                #    first = i
                #    first_set = True
                last = i
                
        ## Set the initial breakpoints and the list of already-found, homogeneous
        ## segments.    
        breakpoints = [first, last, ]
        homog_segs = []
        
        #####################################################################
        ## BEGIN SPLITMERGE PROCESS TO GENERATE FIRST GUESS AT UNDOCUMENTED
        ## CHANGEPOINTS
        iter = 0 # counts how many times we've repeated the splitmerge process
        enter_BIC = False # break out of iterations into the BIC process?
        last_breakpoints = []
        while (iter < 10) and not enter_BIC:
            
            seg_bounds = zip(breakpoints[:-1], breakpoints[1:])
            last_breakpoints = deepcopy(breakpoints)
            new_breakpoints = deepcopy(breakpoints)
                
            new_homog_segs = []
        
            print "Parse segments (isplit = 1), ipass: "******"Too short: ", imo2iym(l), imo2iym(r)
                    continue
                
            ## If we've previously found that this segment is homogeneous (has no
            ## potential changepoint), then we can skip it as well and proceed to
            ## the next one.
                # Set the within() method to check if this segment is within any
                # previously found homogenous ones. Use lambda, since we can't pass
                # keyword or positional arguments to map().
                within_this_seg = lambda seg: within((l, r), seg)
                within_stable_segs = map(within_this_seg, homog_segs)
                if any(within_stable_segs):
                    print "Stable segment: ", imo2iym(l), imo2iym(r)
                    if l == first: 
                        new_breakpoints.append(first)
                    continue
                
            ## The standard normal homogeneity test - which is the statistical test
            ## we'll use to see if there is a potential changepoint in this segment
            ## - requires us to normalize our paired difference series. We can do
            ## that in snht(), but we'll do it right now so we can inspect those
            ## standardized values later.
                z = standardize(segment, MISS)

            ## Apply standard normal homogeneity test. 
            ## For mechanics, see Alexandersson and Moberg 1997, Int'l Jrnl of
            ## Climatology (pp 25-34)
                likelihood_ratios = snht(z, MISS, standardized=True)
                z_count = len(get_valid_data(z))
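            ## (Sketch of the test: for each candidate month k in a segment of n valid
            ## values, the SNHT statistic is roughly T(k) = k*zbar1**2 + (n-k)*zbar2**2,
            ## where zbar1 and zbar2 are the means of the standardized differences
            ## before and after k; a large T(k) suggests a shift in the mean at k.)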
                        
            ## We're left with the likelihood ratio for each value being a potential
            ## changepoint. Find the max ratio, and if that value is significant, let
            ## it be the newest potential changepoint.
                ind_max_ratio = 0
                max_ratio = 0.0
                clip_ratios = likelihood_ratios[2:-2] # clip the beginning and end,
                                                      # they can't be changepoints.
                for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios):
                    if ratio > max_ratio:
                        ind_max_ratio = ind
                        max_ratio = ratio
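                # (This is just an argmax over clip_ratios; something like
                #  max(enumerate(clip_ratios), key=operator.itemgetter(1)) would find
                #  the same peak, aside from how ties and all-missing segments fall out.)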
            ## Now we find the critical value for this data set, and check our max
            ## likelihood ratio against it
                crit_val = lrt_lookup(z_count)
                
                # The possible changepoint is the index of the max ratio we found. 
                # We have to shift it the following ways to align it to the original
                # data -
                #    1) shift by 2 re-aligns it from clip_ratios to likelihood_ratios
                #    2) shift by adjust re-aligns it to this segment in diff_data
                #    3) shift by l re-aligns it to the first index in diff_data
                possible_changepoint = l + ind_max_ratio + 2 + adjust
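                # Worked example with illustrative numbers: if l = 24, adjust = 1, and
                # the peak sits at clipped index 7, the candidate changepoint is month
                # 24 + 7 + 2 + 1 = 34 of diff_data.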
                
                y_new, m_new = imo2iym(possible_changepoint) # year, month
                
            ## If this is the first iteration, we indicate as such, and add the new
            ## changepoint
                if iter == 0: 
                    print "%6s-%6s MD        FIRST series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val)
                    breakpoints.append(possible_changepoint)
                    breakpoints = sorted(breakpoints)
            
                else:
            ## Else, if we found a new possible changepoint, add it to our list.
                    if max_ratio > crit_val:
                        print "%6s-%6s MD Inhomogenity for series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count)
                        new_breakpoints.append(possible_changepoint)
                        
            ## If not, record that we found a homogeneous segment.   
                    else:
                        print "%6s-%6s MD      Homogeneous series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count)
                        new_homog_segs.append((l, r))
            
            ## Now we need to update our record of which segments were homogeneous,
            ## because we'll need it during the next iteration. While doing so, we
            ## condense stable segments that lie adjacent to each other, i.e., if we
            ## have the segments [(1, 5), (5, 10), (12, 15)], then we really have
            ## [(1, 10), (12, 15)].
            homog_segs.extend(new_homog_segs)
            if homog_segs:
                homog_segs = sorted(homog_segs, key=operator.itemgetter(0))
                final_homog_segs = [homog_segs[0], ] # this will be like a stack
                for seg in homog_segs[1:]:
                    last_seg = final_homog_segs[-1]
                    if last_seg[1] == seg[0]:
                        new_seg = (last_seg[0], seg[1])
                        final_homog_segs.pop()
                        final_homog_segs.append(new_seg)
                    else:
                        final_homog_segs.append(seg)
                homog_segs = final_homog_segs
        
            ## So we have new segments that can be generated from these new
            ## breakpoints. Now, the PHA routine enters a "merge" process
            ## to see whether or not to keep these newly found changepoints or throw
            ## them out as false alarms. 
            ##
            ## We do this by "leapfrogging" every other breakpoint. This gives us
            ## a set of segments that all have another breakpoint in them. We want
            ## to see if these segments are homogeneous, because if they are, it
            ## means that the breakpoint we previously found in the segment has 
            ## been superseded.
            new_breakpoints = sorted(new_breakpoints)
            seg_bounds = zip(new_breakpoints[:-2], new_breakpoints[2:])
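            ## "Leapfrog" pairing: with new_breakpoints = [b0, b1, b2, b3], the merge
            ## pass examines (b0, b2) and (b1, b3), so each test segment straddles
            ## exactly one interior breakpoint.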
            
            remove_breakpoints = set()
            merged_breakpoints = set()
            if iter > 0:
                
                print "Merge segments (isplit = 0), ipass: "******"Stable segment: ", imo2iym(l), imo2iym(r)
    #                    if l == first: 
    #                        new_breakpoints.append(first)
    #                    seg_lookup.append(((l, r), 'stable'))
    #                    continue
                    # Use within() to check whether this segment lies inside any previously
                    # found homogeneous segment. Wrap it in a lambda, since we can't pass
                    # keyword or positional arguments through map().
                    within_this_seg = lambda seg: within((l, r), seg)
                    within_stable_segs = map(within_this_seg, homog_segs)
                    if any(within_stable_segs):
                        print "Stable segment: ", imo2iym(l), imo2iym(r)
                        #if l == first: 
                        #    new_breakpoints.append(first)
                        merged_breakpoints.update([l, r])
                        continue
            
            ## Apply the same adjustments and the same standard normal homogeneity
            ## test that we did in the previous splitting process. There is no 
            ## difference here until we consider what to do if we find a new 
            ## homogeneous segment.
                    adjust = int(seg_bounds.index((l, r)) > 0)
                    segment = diff_data[l+adjust:r+1]
                    
                    z = standardize(segment, MISS)
                    likelihood_ratios = snht(z, MISS, standardized=True)
                    z_count = len(get_valid_data(z))
                        
                    ind_max_ratio = 0
                    max_ratio = 0.0
                    clip_ratios = likelihood_ratios[2:-2] # We clip the beginning and end
                    for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios):
                        if ratio > max_ratio:
                            ind_max_ratio = ind
                            max_ratio = ratio
                            
                    crit_val = lrt_lookup(z_count)
                    possible_changepoint = l + ind_max_ratio + 2 + adjust
                    
                    y_new, m_new = imo2iym(possible_changepoint)
                    
    
                    if z_count < 2:
                        y1, m1 = imo2iym(l)
                        y2, m2 = imo2iym(r)
                        print "%6s-%6s MD  No found peaks %4d %2d to %4d %2d" % (id1,id2,y1,m1,y2,m2)
                        print "%6s-%6s MD  Compress 1 out peak at %4d %2d" % (id1,id2,y_new,m_new)
                        # max_ratio will remain 0.0 here (no valid ratios), so the
                        # "Compress 2" branch below ends up removing this peak.
            ## If we found a new breakpoint that is statistically significant, then
            ## great! Let's keep it.
                    if max_ratio > crit_val:
                        print "%6s-%6s MD  Peak kept in merge at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val)
                        merged_breakpoints.add(l)
                        merged_breakpoints.add(new_bp)
                        merged_breakpoints.add(r)
            ## If not, then this segment was homogeneous, so the breakpoint which
            ## already exists in it is no good.
                    else:
                        print "%6s-%6s MD Compress 2 out peak at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val)
                        # Crap, if there are any potential breakpoints in this segment,
                        # we need to remove them because this segment is homogeneous. Let's
                        # remember this homogeneous segment for now and come back once
                        # we've found all of them.    
                        merged_breakpoints.update([l, r])
                        remove_breakpoints.add(new_bp)
            
            ## At this point, we have a set of all the breakpoints we've accumulated
            ## during this iteration of split/merge, as well as a set of breakpoints
            ## which we've found to be of no further use. We can difference update
            ## our set of breakpoints to remove these guys, and let those merged
            ## breakpoints be the set of newest breakpoints for the next splitmerge
            ## iteration.
                merged_breakpoints.difference_update(remove_breakpoints)
                breakpoints = list(merged_breakpoints)
            
            breakpoints = sorted(breakpoints)
            
            ## Did we actually find new breakpoints? If not, then we're done
            ## with splitmerge and can move on to the BIC process.
            enter_BIC = (breakpoints == last_breakpoints)
            iter = iter + 1
            
        ## Okay wow, we've potentially made it to the BIC stage now... !
        if first not in breakpoints:
            breakpoints.insert(0, first)
        ym_breakpoints = map(imo2iym, breakpoints)
        #print ym_breakpoints
        
        ## ENTERING MINBIC    
        bp_dictionary = dict()
####################################
##### MULTIPROCESS
        from multiprocessing import Pool

        global counter
        multi_bp_dict = {}
        counter = 0
        def cb(r):
            global counter
            #print counter, r
            counter += 1
        
        start = time.clock()         
        po = Pool(processes=4)
        for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]):
                    
            if left != first:
                left = left + 1
            # recall that we only consider data after the first full year. we will be 
            # computing regressions with the independent variable indexed from this 
            # starting point, so we need to shift these indices. we also need to shift them
            # by +1 if this is any segment beyond the first one, so that we don't include
            # changepoints in more than one analysis.
            # TOTAL_SHIFT = -12 + 1 = -11
            # 
            # However, this shift is only necessary while looking at the array indices that
            # we generate using range(). the data should already be aligned correctly.
            total_shift = -12 + 1
            left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
            y1, m1 = imo2iym(left)
            yb, mb = imo2iym(bp)
            y2, m2 = imo2iym(right)
            #print "Entering MINBIC - %4d %2d    %4d %2d    %4d %2d" % (y1, m1, yb,
            #                                                           mb, y2, m2)
            (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
            bp_index = bp-left
            #print len(seg_x), len(seg_data), bp_index
            #bp_analysis = minbic(seg_x, seg_data, bp_index, MISS)
            multi_bp_dict[bp] = po.apply_async(minbic,(seg_x,seg_data,bp_index,MISS,),callback=cb)
        po.close()
        po.join()
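        # Each value in multi_bp_dict is currently an AsyncResult handle from
        # apply_async(); get() blocks until the corresponding minbic() call finishes,
        # so after this loop the dictionary maps each breakpoint to its BIC analysis.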
        for bp in multi_bp_dict:
            r = multi_bp_dict[bp]
            multi_bp_dict[bp] = r.get()
        #print "counter - %d" % counter
        elapsed = (time.clock() - start)
        print "ELAPSED TIME - %2.3e" % elapsed
        #print new_bp_dict
####################################
##### NORMAL        
#        start = time.clock()
#        for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]):
#                    
#            if left != first:
#                left = left + 1
#            # recall that we only consider data after the first full year. we will be 
#            # computing regressions with the independent variable indexed from this 
#            # starting point, so we need to shift these indices. we also need to shift them
#            # by +1 if this is any segment beyond the first one, so that we don't include
#            # changepoints in more than one analysis.
#            # TOTAL_SHIFT = -12 + 1 = -11
#            # 
#            # However, this shift is only necessary while looking at the array indices that
#            # we generate using range(). the data should already be aligned correctly.
#            total_shift = -12 + 1
#            left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
#            y1, m1 = imo2iym(left)
#            yb, mb = imo2iym(bp)
#            y2, m2 = imo2iym(right)
#            print "Entering MINBIC - %4d %2d    %4d %2d    %4d %2d" % (y1, m1, yb,
#                                                                       mb, y2, m2)
#            (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
#            bp_index = bp-left
#            #print len(seg_x), len(seg_data), bp_index
#            bp_analysis = minbic(seg_x, seg_data, bp_index, MISS)
#            
#            bp_dictionary[bp] = bp_analysis    
#        elapsed2 = (time.clock() - start)
#        print "ELAPSED TIME = %3.2e" % elapsed2
        
        ###################################
        ## Print the adjustment summaries
        bp_dictionary = multi_bp_dict
        sorted_bps = sorted(bp_dictionary.keys())
        ndelete = []
        valid_bps = {}
        for bp in sorted_bps:
            stats = bp_dictionary[bp]
            
            cmodel=stats['cmodel']
            iqtype=stats['iqtype']
            asigx=stats['offset']
            azscr=stats['offset_z']
            rslp=stats['slopes']
            
            end1 = bp
            y_end1, m_end1 = imo2iym(end1)
            beg2 = bp+1
            y_beg2, m_beg2 = imo2iym(beg2)
            
            # If cmodel is *SLR*, then there is no breakpoint
            if 'SLR' in cmodel:
                print ("%s-%s  --  -- MD TESTSEG SKIP: %7.2f %5d %5d %3d %5d %5d %3d" %
                       (id1, id2, asigx, end1, y_end1, m_end1, beg2, y_beg2, m_beg2))
                # Don't store it!
            else:
                print ("%6s-%6s  --  -- MD TESTSEG ADJ: %7.2f %7.2f %8.4f %8.4f %5d %5d %3d %5d %5d %3d %2d" % 
                       (id1,id2, asigx, azscr, rslp[0], rslp[1], end1, y_end1, m_end1, beg2, y_beg2, m_beg2, iqtype))
                # Store it!
                valid_bps[bp] = stats
        
        ###############################
        ## Go back and see if we can get rid of some of the change points.
        ## If 2 or more of the chgpts are within MINLEN,
        ##    a) if the chgpt estimates are the same sign, then test each
        ##        singly with same endpoints and keep lowest BIC 
        ##    b) if not the same sign,
        ##        retain earliest changepoint
        # Make sure the first and last valid months are included among the
        # breakpoints we iterate over below.
        interior_bps = valid_bps.keys()
        for bp in [first, last]:
            if bp not in interior_bps:
                interior_bps.append(bp)
        sorted_bps = sorted(interior_bps)
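        ## The loop below walks the breakpoints in order and gathers any that are
        ## separated by fewer than MINLEN months of valid data into a single
        ## 'close_bps' group, which is then re-tested against the shared (left, right)
        ## endpoints so that only one changepoint per group survives.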
        for left in sorted_bps:
            print sorted_bps, left
            ## We're looking for the next breakpoint ('right') that satisfies two
            ## conditions:
            ##    1) there are more than MINLEN months of valid (non-missing) data
            ##       between it and the breakpoint collected just before it
            ##    2) there is at least one breakpoint between 'left' and it
            right = 0
            close_bps = []
            for right in sorted_bps: 
                if right <= left: continue
                
                if not close_bps:
                    close_bps.append(right)
                else:
                    valid_between_bps = diff_data[close_bps[-1]:right]
                    valid_length = len(get_valid_data(valid_between_bps, MISS))
                    print imo2iym(close_bps[-1]),valid_length,imo2iym(right)
                    if valid_length > MINLEN:
                        break
                    close_bps.append(right)
            # We could run out of things in sorted_bps and wind up with
            # right == close_bps[-1] (or with no close_bps at all if 'left' is the
            # final breakpoint). Detect that and break out of this analysis if it
            # happens.
            if (not close_bps) or (close_bps[-1] == right): break
            
            if left != first:
                left = left + 1
            close_bp_results = {}
            for bp in close_bps:
                        
                # Recall that we only consider data after the first full year. We will
                # be computing regressions with the independent variable indexed from
                # this starting point, so we need to shift these indices, and shift by
                # +1 for any segment beyond the first so that we don't include
                # changepoints in more than one analysis.
                # TOTAL_SHIFT = -12 + 1 = -11
                #
                # This shift is only necessary for the array indices we generate with
                # range(); the data itself is already aligned correctly.
                total_shift = -12 + 1
                left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
                y1, m1 = imo2iym(left)
                yb, mb = imo2iym(bp)
                y2, m2 = imo2iym(right)
                
                print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
                print y1,m1,"-",yb,mb,"-",y2,m2
                print "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"
                
                (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
                bp_index = bp-left
                bp_analysis = minbic(seg_x, seg_data, bp_index, MISS, kthslr0_on=True)
                
                cmodel=bp_analysis['cmodel']
                iqtype= bp_analysis['iqtype']
                offset= bp_analysis['offset']
                rslp= bp_analysis['slopes']
                crit_val = bp_analysis['crit_val']
                test_stat = bp_analysis['test_stat']
                bic = bp_analysis['bic']
                
                print ("Interim chgpt: %s %4d %2d %4d %2d %4d %2d %8.2f %8.2f %8.2f %8.2f %7.3f %7.3f %2d" %
                       (pair_str, y1, m1, yb, mb, y2, m2, bic, test_stat, crit_val, offset, rslp[0], rslp[1], iqtype))                    
                
                close_bp_results[bp] = bp_analysis

            # Now we have a small problem... we might have more than one breakpoint,
            # so we need to choose which one is best. We will check the sign of
            # the breakpoint amplitude changes:
            sign_of_amps = map(sign, [close_bp_results[bp]['offset'] for bp in close_bps])
            positive = lambda s: s >= 0
            negative = lambda s: s <= 0
            zero = lambda s: s == 0
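            # Note: an offset of exactly 0 satisfies both 'positive' and 'negative', so
            # an all-zero group is handled by the first branch below rather than the
            # 'Null domain' branch.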
            print "------------>",[close_bp_results[bp]['offset'] for bp in close_bps]
            if (all(map(positive, sign_of_amps)) or 
                all(map(negative, sign_of_amps))):    
                # Pick the best (minimum BIC)          
                bics = [(bp, close_bp_results[bp]['bic']) for bp in close_bps]
                sorted_bics = sorted(bics, key=operator.itemgetter(1))
                smallest_bp = sorted_bics[0][0]
                
                # Remove this smallest-bic bp from the in-interval bps 
                close_bps.remove(smallest_bp)
                valid_bps[smallest_bp] = close_bp_results[smallest_bp] 
                
                #print "leftovers",close_bps
                for bp in close_bps: # The remaining bps which we will reject
                    sorted_bps.remove(bp) # Remove them from this loop
                    del valid_bps[bp] # Remove them as valid 
                    
                yb, mb = imo2iym(smallest_bp)
                print ("Same domain - Lowest Interim: %s %4d %2d" % 
                       (pair_str, yb, mb))
            elif (all(map(zero, sign_of_amps))):
                # Choose the earliest changepoint; the rest of these have
                # amplitude changes which are 0.
                first_bp, last_bp = close_bps[0], close_bps[-1]
                
                # Remove the first interim bp and update valid_bps with this new
                # computation. 
                close_bps.remove(first_bp)
                valid_bps[first_bp] = close_bp_results[first_bp]
                
                # Reject remaining interim bps
                for bp in close_bps:
                    sorted_bps.remove(bp)
                    del valid_bps[bp]
                    
                yb, mb = imo2iym(first_bp)
                print ("Null domain - Earliest Interim : %s %4d %2d" %
                       (pair_str, yb, mb))
            else:
                # We'll use the earliest interim changepoint, but we need
                # to get rid of bad data. Replace all the data between the 
                # interim changepoints as missing and re-compute BIC.
                first_bp, last_bp = close_bps[0], close_bps[-1]
                first_bp_index = first_bp-left
                last_bp_index = last_bp-left
                
                print len(seg_x), len(seg_data)
                print first_bp_index+1, last_bp_index+1
                print left, bp, right
                for i in range(first_bp_index+1, last_bp_index+1):
                    print i, imo2iym(i), i+left, imo2iym(i+left)
                    seg_x[i] = MISS
                    seg_data[i] = MISS
                    # Recall that seg_data[0] == diff_data[left]. ndelete records
                    # the *true month where there is unviable data*, so it needs to
                    # point back to the original element in diff_data we are 
                    # worried about.
                    ndelete.append(i+left) 
                bp_analysis = minbic(seg_x, seg_data, first_bp_index, MISS, kthslr0_on=True)
                
                # Remove the first interim bp and update valid_bps with this new
                # computation. 
                close_bps.remove(first_bp)
                valid_bps[first_bp] = bp_analysis
                
                # Reject remaining interim bps
                for bp in close_bps:
                    sorted_bps.remove(bp)
                    del valid_bps[bp]
                
                yb, mb = imo2iym(first_bp)
                print ("Diff domain - Earliest Interim : %s %4d %2d" %
                       (pair_str, yb, mb))                
    
        ## Remove changepoints which are an SLR model.
        nspan = [0]*num_months
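        ## nspan assigns a 1-based ordinal to every month associated with a kept
        ## changepoint (the breakpoint month plus any immediately following missing or
        ## deleted months); months not tied to any breakpoint stay 0.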
        bp_count = 1
        for bp in sorted(valid_bps.keys()):
            bp_analysis = valid_bps[bp]
            
            if "SLR" in bp_analysis['cmodel']:
                del valid_bps[bp]
                continue
            
            print "   IN: ",bp
            nspan[bp] = bp_count
            ## If adjacent months are missing next to this breakpoint, then
            ## assume that those could be a breakpoint as well and copy this
            ## breakpoint's analysis results for them.
            for month in range(bp+1, last):
                if (month in ndelete) or (diff_data[month] == MISS):
                    nspan[month] = bp_count
                    print "   IN: ",month
                    valid_bps[month] = bp_analysis
                else:
                    break
            bp_count += 1
            
        valid_bps['del'] = ndelete
        valid_bps['nspan'] = nspan
        pair_results[pair_str] = valid_bps
        
        #print "ELAPSED TIMES = %3.2e %3.2e" % (elapsed1, elapsed2)
    print "done"
    ## Save the paired results so they can be reloaded without re-running splitmerge.
    import pickle
    f = open("pair_results", 'wb')
    pickle.dump(pair_results, f)
    f.close()
    return pair_results