def standardize(data, missing_val=-9999):
    """Convert a series to normal scores, passing missing entries through.

    The normal scores are the ones required by the standard normal
    homogeneity test, Z_i = (Q_i - Qbar) / sigma_Q. See Alexandersson and
    Moberg, 1997, Int'l Journal of Climatology Vol. 17, pp 25-34. This is a
    direct port of splitmerge.v21f.f > subroutine 'standard'.

    The mean and the spread are computed from the non-missing values only.
    Note that the spread is the square root of the summed squared
    deviations divided by (number of valid values - 2).

    :Param data:
        The dataset to standardize.
    :Param missing_val:
        The placeholder for missing data.
    :Return:
        A list with one entry per entry of `data`: the normal score where
        the input was valid, and `missing_val` where the input was missing.

    """

    ## Only the non-missing observations contribute to the mean and spread.
    observed = get_valid_data(data, missing_val)
    n_observed = len(observed)
    center = compute_mean(observed, valid=True)

    ## Sum of the squared deviations from the mean, accumulated in order.
    squared_dev_total = sum(((obs - center) ** 2 for obs in observed), 0.0)

    ## The scale is the root of that sum over (n - 2).
    spread = sqrt(squared_dev_total / (n_observed - 2))

    ## Normalize every valid value; missing values keep their placeholder.
    return [
        (value - center) / spread if value != missing_val else missing_val
        for value in data
    ]
def estamt(network, minlenshf=24, **hom_params): """ COPIED FROM ucpmonthly.v24a.f: The major steps in determining the best adjustment value for each station and changepoint. Entire network undergoes each of the following processes. In order: 1) Remove unusable data. Align move swith respect to non-missing data and compress out changes that are too close AND the data between them. 2) ISTEP=2 processing begins the adjustment process by removing the non-significant changepoints to lengthen segments. 3) NPASS (:= ISTEP=3) finishes the adjustment process by testing for the minimum number of months in a segment and number of neighbors with which the difference series can be examined. 4) Final adjusted output is written. """ ## FILTER 4 ## Since the amplitude estimate MUST rely upon a minimum of MINLEN months to ## get even close to a reliable estimate at this point, it is assumed that ## the changepoints are as good as the station history files. Therefore, ## align moves with respect to non-missing data and compress out changes ## that are too close AND the data between them (i.e., less than MINLEN ## apart) # station_list = network.stations.keys() all_station_list = network.stations.keys() # station_list = ["215887", ] station_list = all_station_list # for each station... for id in station_list: station_index = station_list.index(id) station_series = network.raw_series[id] station_data = station_series.monthly_series[:] missing_val = station_series.MISSING_VAL # ... gen arrays for alignment move, amt, mday = [], [], [] changepoints = station_series.changepoints cps = sorted(changepoints.keys()) for cp in cps: print " Hist move: ", len(move) + 1, station_index + 1, imo2iym(cp) move.append(cp) amt.append(changepoints[cp]["jsum"]) mday.append(31) movnum = len(changepoints) if movnum > 0: ## At this point, the Fortran code executes alignmoves() in ## SHAPinp.v6c.f to reconcile the fact that station history files ## report dates of moves. 
It also removes segments that eare too short ## - less than minlenshf. Instead of implementing alignmoves(). Right ## now, I'll only implement this second functionality. # alignmoves() #################################################################### # Seek to find first and last month indices first_set = False for month in range(len(station_data)): # Skip first year if month < 12: continue if station_data[month] != missing_val: if not first_set: first = month first_set = True last = month cps = sorted(changepoints.keys()) cps.insert(0, first) cps.append(last) for (cp1, cp2) in zip(cps[:], cps[1:]): if (cp2 - cp1) < minlenshf: months_to_delete = range(cp1 + 1, cp2 + 1) network.raw_series[id].delete_months(months_to_delete) if cp2 == last: del_key = cp1 else: del_key = cp2 if del_key in network.raw_series[id].changepoints: # print len(network.raw_series[id].changepoints.keys()), del network.raw_series[id].changepoints[del_key] # print len(network.raw_series[id].changepoints.keys()), # raw_input("pause") del_str = "Del 1st segment: " if cp1 == first else "Delete segment: " print id, station_index + 1, del_str, imo2iym(cp1), cp1, imo2iym(cp2), cp2 new_changepoints = network.raw_series[id].changepoints new_cps = sorted(new_changepoints.keys()) print " First data value: ", imo2iym(first) for cp in new_cps: print " End seg:", new_cps.index(cp), " ym: ", imo2iym(cp), cp, new_changepoints[cp]["jsum"] print " End segment ym: ", imo2iym(last), last # Finally, add first and last value to the list of changepoints. first_stats = dict(ahigh=0.0, astd=0.0, jsum=0) last_stats = dict(ahigh=0.0, astd=0.0, jsum=0) network.raw_series[id].changepoints[first] = first_stats network.raw_series[id].changepoints[last] = last_stats #################################################################### ## Series of debug print statements summarizing the final list of ## changepoints. 
Not necessary at the moment ############################################################################ # The subnetwork processing became a multi-step process plus a "post-process # pass" to manage: # 1) problems with documented changepoints with NO undocumented support # 2) determine the best amplitude estimation for each confirmed changepoint for step in [2, 3]: ## Setup output strings based on the step used. Only cosmetic differences ## really. iminlen = hom_params["minlen"] numclim = 3 ## STEP 1 - NEVER USED (technically the history-consideration done previously if step == 1: continue elif step == 2: ## STEP 2 - NOT SIG REMOVAL ## equivalent to ipass loopback for istep == 2 in Fortran PHA print " ---------------- NOT SIG REMOVAL --------------- " tstr = "Not sig: " outid = "NS" ipass = 1 elif step == 3: ## STEP 3 - ADJUSTMENT OF DISCONTINUITIES # equivalent to ipass loopback for istep == in FORTRAN PHA print " ---------------- ADJUST DISCONTINUITY STEP --------------- " print "Adjpass, iminlen, numclim", "--", iminlen, numclim print " ---------------- NPASS --------------- " tstr = "Dstep Dtrend: " outid = "WM" ipass = ipass + 1 final_results = dict() print " NET STN FILT TECH ------ AFTER ------ ------ BEFORE ------" # Process each station and its network of neighbors for id in station_list: station_index = station_list.index(id) station_cp_dict = network.raw_series[id].changepoints sorted_cps = sorted(station_cp_dict.keys()) ## If there are no breakpoints... if not sorted_cps: final_results[id] = dict() continue station_series = network.raw_series[id] missing_val = station_series.MISSING_VAL # compute monthly anomalies for this station data station_anomalies = station_series.monthly_anomaly_series # What are the first and last valid months in this station's data set? # We've saved them as the first and last changepoint before... first = sorted_cps[0] last = sorted_cps[-1] # What are the pairs to this station that we need to consider? 
station_pairs = [] for other_id in all_station_list: pair = tuple(sorted([id, other_id])) if pair in hom_params["pairs"]: station_pairs.append(pair) print station_pairs # List the remaining changepoints after the "confirmfilt" process for cp in sorted_cps: cp_stats = station_cp_dict[cp] hit_count = cp_stats["jsum"] iy, im = imo2iym(cp) print ( "%3d %5d %6s Estamt chgin: -- %4d %2d %4d %3d" % (ipass, station_index, id, iy, im, cp, hit_count) ) ## ACCUMULATE PAIRED CHANGEPOINTS AND AMPLITUDE ESTIMATES # Loop over "brackets" of changepoints - that is, for changepoints # [a, b, c, d], consider the two brackets [a,b,c] and [b,c,d] with # the center value of the changepoints. Note that in the Fortran PHA, # we go through these brackets in reverse order - right to left. brackets = zip(sorted_cps[-3::-1], sorted_cps[-2::-1], sorted_cps[::-1]) final_results[id] = dict() for bracket in brackets: # for bracket in brackets[:1]: (left, cp, right) = bracket[:] ly, lm = imo2iym(left) cpy, cpm = imo2iym(cp) ry, rm = imo2iym(right) print "Oriented: ", "--", "--", "--", left, cp, cp + 1, right # setup the output string for this bracket's tests chgptstr = " Win1: %5d %4d%2d %5d %4d%2dto Win2: %5d %4d%2d %5d %4d%2d" % ( left, ly, lm, cp, cpy, cpm, cp, cpy, cpm, right, ry, rm, ) ## THIS SECTION ACCUMULATES TARGET-NEIGHBOR COMPARISONS # See if there are enough homogeneous data in the target; # check each window valid_count_right = len(get_valid_data(station_data[cp + 1 : right + 1], missing_val)) valid_count_left = len(get_valid_data(station_data[left : cp + 1], missing_val)) # if the segment length (valid count) is too short, skip this # changepoint (for now) if valid_count_left < iminlen: print "Adjpass seg2 short ", station_index, id, chgptstr, valid_count_left continue if valid_count_right < iminlen: print "Adjpass seg1 short ", station_index, id, chgptstr, valid_count_right continue ## We've pass the too-little-data pitfall. 
Now, we are actually going ## to go back through our paired neighbors and compute some final ## statistics about these changepoints. We'll store them in a ## dictionary for later, just like the pair_results dictionary ## from splitmerge pair_results = dict() # for (id1, id2) in [("215887", "200779")]: for (id1, id2) in station_pairs: # Reset the left, cp, and right indices to the original # bracket we're considering. We are going to be changing them # while we look at this pair (left, cp, right) = bracket[:] ## Figure out which station is the neighbor (not the target ## we're currently considering). At the same time, note that if ## the target is the 2nd changepoint, the adjustments will be ## flipped in sign, so we need to have a correction factor ready correction = 1.0 if id == id1: neighb_id = id2 else: neighb_id = id1 # correction = -1.0 # Add this pair to pair_results if it's not already there (ida, idb) = sorted([id1, id2]) pair_str = "%s-%s" % (ida, idb) if pair_str not in pair_results: pair_results[neighb_id] = dict() print pair_str neighb_index = all_station_list.index(neighb_id) neighb_cp_dict = network.raw_series[neighb_id].changepoints neighb_series = network.raw_series[neighb_id] neighb_anomalies = neighb_series.monthly_anomaly_series ## Generature a difference data set for this pair of stations diff_data = diff(station_anomalies, neighb_anomalies) ## It's possible that in the [left, right] bracket we're looking ## at, there's a changepoint in the paired neighbor. We need ## to adjust the endpoints of the bracket to exclude those ## breakpoints # Check right-hand side first and break out if ... right_seg_len = len(get_valid_data(diff_data[cp + 1 : right + 1])) # right_seg_len = len(diff_data[cp+1:right+1]) for month in range(cp + 1, right + 1): if month == last: continue # ... we hit a changepoint in the neighbor ... 
if month in neighb_cp_dict: neighb_hits = neighb_cp_dict[month]["jsum"] right_seg_len = len(get_valid_data(diff_data[cp + 1 : month + 1])) # right_seg_len = len(diff_data[cp+1:month+1]) print ( "CHG2: ", neighb_index, neighb_id, "num,edit,2b,2e,imo,nhits", right_seg_len, "--", cp + 1, right, month, neighb_hits, ) right = month break # ... and the final right-segment is too short print left, cp, right if right_seg_len < iminlen: print ( "Low2: ", neighb_index, neighb_id, "num,edit,2b,2e,imo,nhits", right_seg_len, "--", cp + 1, right, month, "--", ) continue # Now, check the left-hand side and break out if ... left_seg_len = len(get_valid_data(diff_data[left : cp + 1])) for month in range(cp - 1, left, -1): if month == first: continue # ... we hit a changepoint in the neighbor ... if month in neighb_cp_dict: neighb_hits = neighb_cp_dict[month]["jsum"] left_seg_len = len(get_valid_data(diff_data[month:cp])) # left_seg_len = len(diff_data[month:cp]) print ( "CHG1: ", neighb_index, neighb_id, "num,edit,1b,1e,imo,nhits", left_seg_len, "--", cp + 1, left, month, neighb_hits, ) left = month break # ... and the final left-segment is too short if left_seg_len < iminlen: print ( "Low1: ", neighb_index, neighb_id, "num,edit,1b,1e,imo,nhits", left_seg_len, "--", cp + 1, left, month, "--", ) continue ## We can now estimate the raw changepoint amplitude using minbic. 
## However, we'll short-circuit a lot of the work by telling it to only ## use the KTHTPR0 model (simple step-change model) (seg_x, seg_data) = range(left + 1, right + 1), diff_data[left + 1 : right + 1] bp_index = cp - (left + 1) # print left, cp, right, "|", bp_index # print left_seg_len, right_seg_len bic_result = minbic(seg_x, seg_data, bp_index, missing_val, models=[("KTHTPR0", kthtpr0)]) ## Also check the first difference correlations between the ## monthly anomalies station_first_diff = compute_first_diff(station_anomalies, missing_val) neighb_first_diff = compute_first_diff(neighb_anomalies, missing_val) corr = compute_corr(station_anomalies, neighb_anomalies) ## Write out the results of this testing process so far cmodel = bic_result["cmodel"] bic = bic_result["bic"] test_stat = bic_result["test_stat"] crit_val = bic_result["crit_val"] offset = bic_result["offset"] slopes = bic_result["slopes"] left_slope, right_slope = slopes print ( "%s %6s-%6s %s %7.2f %7.2f %7.2f %7.2f %7.3f %7.3f -- %d --" % ( tstr, id, neighb_id, chgptstr, crit_val, test_stat, offset, corr, left_slope, right_slope, right_seg_len, ) ) ## Analysis is done. ## Keep the adjustment (offset) for each neighbor/segment, ## set/reset trend for each neighbor/segment ## the first segment is the left-segment, ## the second segment is the right-segment ## ## Note that we reset left/right potentially to avoid conflicts ## within the paired neighbor data. However, our estimates of ## trends/offsets associated with the "right" adjacent changepoint ## actually refers to that original right changepoint. 
We'll ## reset left, cp, and right from the bracket before continuing (left, cp, right) = bracket[:] # Do the left segment first left_dict = dict() left_dict["adj"] = offset * correction left_dict["cor"] = corr left_dict["bic"] = bic left_dict["cmodel"] = cmodel left_dict["trend"] = left_slope left_dict["spanob"] = left_seg_len pair_results[neighb_id][cp] = left_dict # Do the right segment now right_dict = dict() right_dict["adj"] = offset * correction right_dict["cor"] = corr right_dict["bic"] = bic right_dict["cmodel"] = cmodel right_dict["trend"] = right_slope right_dict["spanob"] = right_seg_len if right not in pair_results[neighb_id]: pair_results[neighb_id][right] = right_dict else: # We've already recorded this segment before for the last # changepoint. Update the slopes/spanob count (length of # preceding segment) if the slopes are different and the # length is different. new_trend = slopes[1] new_spanob = right_seg_len old_trend = pair_results[neighb_id][right]["trend"] old_spanob = pair_results[neighb_id][right]["spanob"] if old_trend != new_trend: print ( " Seg2 diff: %s %4d old: %7.2f %4d new: %7.2f %4d" % (pair_str, right, old_trend, old_spanob, new_trend, new_spanob) ) # if the new count is greater than the old one, the slope # is probably more robust so update those entries. if new_spanob > old_spanob: pair_results[neighb_id][right]["trend"] = new_trend pair_results[neighb_id][right]["spanob"] = new_spanob ## We're done with this pair/changepoint. 
Summary output - if step == 2: print "itarg,ipair,ichg,numc,iqt,adj,trends: -- -- -- --", cmodel, offset, slopes # raw_input("pause") #################################################################### ## ADJUSTMENT DETERMINATION SECTION # Recall the paired-changepoint analyses we just performed, and # determine if the potential adjustment is statistically valid (left, cp, right) = bracket[:] pair_data = [] for neighb_id in pair_results: if not cp in pair_results[neighb_id]: continue cp_stats = pair_results[neighb_id][cp] adjacent_stats = pair_results[neighb_id][right] trends = (cp_stats["trend"], adjacent_stats["trend"]) pair_dict = dict( neighb_id=neighb_id, adj=cp_stats["adj"], cor=cp_stats["cor"], trends=trends, used=True ) pair_data.append(pair_dict) npairs = len(pair_data) if npairs < numclim: print "Adjpass numc low --", station_index, id, left, cp, right, npairs continue # Process - # 1) Remove both adjustment and trend outliers # 2) Calculate median adjustment # # filter around inter-quartile range qscale = hom_params["qscale"] pair_data = sorted(pair_data, key=operator.itemgetter("adj")) pair_chgs = [p["adj"] for p in pair_data] chg_25th, chg_median, chg_75th = tukey_med(pair_chgs) chg_iqr = chg_75th - chg_25th chg_low = chg_25th - (chg_median - chg_25th) * 1.0 * qscale chg_high = chg_75th + (chg_75th - chg_median) * 1.0 * qscale print ( " TRIM p25, p75, pct50, rng, lo, hi: %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" % (chg_25th, chg_75th, chg_median, chg_iqr, chg_low, chg_high) ) # If any of the estimated changepoints are outside the statistically # robust range we just computed, then flag them as we print them and for data in pair_data: neighb_id = data["neighb_id"] neighb_index = all_station_list.index(neighb_id) adj = data["adj"] cor = data["cor"] trends = data["trends"] if not (chg_low < adj < chg_high): data["used"] = False flag = "U" if data["used"] else "X" print ("%s %4d %7.2f %8.4f %8.4f %7.2f" % (flag, neighb_index, adj, trends[0], trends[1], cor)) 
valid_adj_count = len([d for d in pair_data if d["used"]]) if valid_adj_count < numclim: if step == 2: print ( "Insuff trimmed mean -- %4d %s %5d %5d %5d %5d" % (station_index, id, left, cp, right, valid_adj_count) ) continue ## BUG: The code here re-computes the inter-quartile range by ## scaling qscale by 1.0. Curiously, it doesn't reject any ## pairs based on this new range. chg_iqr = chg_75th - chg_25th chg_low = chg_25th - (chg_median - chg_25th) * qscale chg_high = chg_75th + (chg_75th - chg_median) * qscale ## Tweak the inter-quartile range to check if the adjustment is ## Check whether the computed adjustment is significant. That is, ## if 0 is included within the inter-quartile range we computed, then ## we can't reject the null hypothesis that the changepoint is significant if chg_high * chg_low > 0.0: # signs are the same, so 0 isn't included in the range. procstr = "CONSHF" sigadj = chg_median else: procstr = "ZERSHF" sigadj = 0.0 final_results[id][cp] = dict(adj=sigadj, std=chg_iqr * 1.0 * qscale, num=npairs) print ("%2d %s-%s %s %7.2f" % (station_index, id, procstr, chgptstr, sigadj)) ## Print some final output about what changepoints remain for this station final_station_results = final_results[id] final_cps = sorted(final_station_results.keys()) for cp in final_cps: adj = final_station_results[cp]["adj"] std = final_station_results[cp]["std"] cp_stats = station_cp_dict[cp] hit_count = cp_stats["jsum"] iy, im = imo2iym(cp) print ( "-- %5d %s Estamt chgout: -- %4d%2d %5d %5d %7.2f %7.2f" % (station_index + 1, id, iy, im, cp, hit_count, adj, std) ) # raw_input("pause") ## Remove the accumulated non-significant changepoints (either non-sig because ## there was too much missing data, the target segment was too short, or the ## trimmed mean test could not reject the null hypothesis of no change for id in station_list: station_index = station_list.index(id) final_station_results = final_results[id] final_cps = sorted(final_station_results.keys()) for cp in 
final_cps: iy, im = imo2iym(cp) cp_index = final_cps.index(cp) adj = final_station_results[cp]["adj"] std = final_station_results[cp]["std"] if adj == 0.0: print ("%s %5d Remove chgpt %5d %4d %2d %4d" % (id, station_index, cp_index, iy, im, cp)) del network.raw_series[id].changepoints[cp] else: # Update the network's record of changepoints with this new list network.raw_series[id].changepoints[cp]["ahigh"] = adj network.raw_series[id].changepoints[cp]["astd"] = std # the changepoint at first month has been removed; add it back in network.raw_series[id].changepoints[first] = dict(ahigh=0.0, astd=0.0, jsum=0)
def splitmerge(network, pairs=None, beg_year=1, end_year=2, **kwargs): ## EXPERIMENTAL PLACEHOLDERS - will eventually be replaced with a master ## loop to do all the id pairs. id_list = network.stations.keys() pair_results = dict() def dict_to_tuples(d): keys = d.keys() return [(key, d[key]) for key in keys] ## Generate station pairs for use in splitmerge by iteratively going through the ## station_list and adding stations in order of decreasing correlation. Skip a ## neighbor if the pair is already present; want 20 stations or until all the ## correlated neighbors are used up. # pairs = [] # for id1 in id_list: # neighbors = dict_to_tuples(network.correlations[id1]) # sorted_neighbors = sorted(neighbors, key=operator.itemgetter(1)) # added_pairs = 0 # while sorted_neighbors and (added_pairs < 5): # id2, _ = sorted_neighbors.pop() # ordered_pair = tuple(sorted((id1, id2))) # if not ordered_pair in pairs: # pairs.append(ordered_pair) # added_pairs += 1 for (id1, id2) in pairs: print "Pair %s with %s" % (id1, id2) pair_str = "%6s-%6s" % (id1, id2) #if pair_str != "051528-298107": # continue raw_series = network.raw_series stations = network.stations series_copy = deepcopy(raw_series) min_ann = 5 num_years = end_year - beg_year num_months = num_years*12 for s in series_copy.itervalues(): data = s.series scaled = scale_series(data, 0.1, s.MISSING_VAL) anomalies = compute_monthly_anomalies(scaled, s.MISSING_VAL) s.set_series(anomalies, s.years) ## Retrieve the data for each of the stations. station1 = stations[id1] series1 = series_copy[id1] data1 = series1.monthly_series station2 = stations[id2] series2 = series_copy[id2] data2 = series2.monthly_series #print data1[:50] #print data2[:50] #print "################################################################" ## Compute the difference series diff_data = diff(data1, data2) MISS = series1.MISSING_VAL # Missing value placeholder ## Quickly pass through the data to find where it starts. 
We need to do this ## because it's possible that beg_year is earlier than the first year of ## valid data in either data1 or data2. Furthermore, the original PHA code ## deliberately clipped off the first year of good data, so emulate that ## effect here as well. ## ## Ultimately, we save the extreme early and extreme late month with valid ## data to use as our first guess at the undocumented changepoints. first = 0 first_set = False last = 0 for (i, d1, d2) in zip(xrange(num_months), data1, data2): if d1!=MISS and d2!=MISS: if first < 12: first = i #first_set = True #if not first_set: # first = i # first_set = True last = i ## Set the initial breakpoints and the list of already-found, homogenous ## segments. breakpoints = [first, last, ] homog_segs = [] ##################################################################### ## BEGIN SPLITMERGE PROCESS TO GENERATE FIRST GUESS AT UNDOCUMENTED ## CHANGEPOINTS iter = 0 # counts how many times we've repeated the splitmerge process enter_BIC = False # break out of iterations into the BIC process? last_breakpoints = [] while (iter < 10) and not enter_BIC: seg_bounds = zip(breakpoints[:-1], breakpoints[1:]) last_breakpoints = deepcopy(breakpoints) new_breakpoints = deepcopy(breakpoints) new_homog_segs = [] print "Parse segments (isplit = 1), ipass: "******"Too short: ", imo2iym(l), imo2iym(r) continue ## If we've previously found that this segment is homogenous (has no ## potential changepoint), then we can skip it as well and proceed to ## the next one. # Set the within() method to check if this segment is within any # previously found homogenous ones. Use lambda, since we can't pass # keyword or positional arguments to map(). 
within_this_seg = lambda seg: within((l, r), seg) within_stable_segs = map(within_this_seg, homog_segs) if any(within_stable_segs): print "Stable segment: ", imo2iym(l), imo2iym(r) if l == first: new_breakpoints.append(first) continue ## The standard normal homogeneity test - which is the statistical test ## we'll use to see if there is a potential changepoint in this segment ## - requires us to normalize our paired difference series. We can do ## that in snht(), but we'll do it right now so we can inspect those ## standardized values later. z = standardize(segment, MISS) ## Apply standard normal homogeneity test. ## For mechanics, see Alexandersson and Moberg 1997, Int'l Jrnl of ## Climatology (pp 25-34) likelihood_ratios = snht(z, MISS, standardized=True) z_count = len(get_valid_data(z)) ## We're left with the likelihood ratio for each value being a potential ## changepoint. Find the max ratio, and if that value is significant, let ## it be the newest potential changepoint. ind_max_ratio = 0 max_ratio = 0.0 clip_ratios = likelihood_ratios[2:-2] # clip the beginning and end, # they can't be changepoints. for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios): if ratio > max_ratio: ind_max_ratio = ind max_ratio = ratio ## Now we find the critical value for this data set, and check our max ## likelihood ratio against it crit_val = lrt_lookup(z_count) # The possible changepoint is the index of the max ratio we found. 
# We have to shift it the following ways to align it to the original # data - # 1) shift by 2 re-aligns it from clip_ratios to likelihood_ratios # 2) shift by adjust re-aligns it to this segment in diff_data # 3) shift by l re-aligns it to the first index in diff_data possible_changepoint = l + ind_max_ratio + 2 + adjust y_new, m_new = imo2iym(possible_changepoint) # year, month ## If this is the first iteration, we indicate as such, and add the new ## changepoint if iter == 0: print "%6s-%6s MD FIRST series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val) breakpoints.append(possible_changepoint) breakpoints = sorted(breakpoints) else: ## Else, if we found a new possible changepoint, add it to our list. if max_ratio > crit_val: print "%6s-%6s MD Inhomogenity for series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count) new_breakpoints.append(possible_changepoint) ## If not, record that we found a homogeneous segment. else: print "%6s-%6s MD Homogeneous series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count) new_homog_segs.append((l, r)) ## Now we need to update our account of which segments were homogeneous, ## because we need to know during the next iteration. We will do this, ## as well as condense stable segments that lie adjacent to each other ## i.e, if we have the segments [(1,5), (5, 10,),, (12, 15)], then we ## really have [(1,10), (12, 15)]. 
homog_segs.extend(new_homog_segs) if homog_segs: homog_segs = sorted(homog_segs, key=operator.itemgetter(0)) final_homog_segs = [homog_segs[0], ] # this will be like a stack for seg in homog_segs[1:]: last_seg = final_homog_segs[-1] if last_seg[1] == seg[0]: new_seg = (last_seg[0], seg[1]) final_homog_segs.pop() final_homog_segs.append(new_seg) else: final_homog_segs.append(seg) homog_segs = final_homog_segs ## So we have new segments that can be generated from these new ## breakpoints. Now, the PHA routine enters a "merge" process ## to see whether or not to keep these newly found changepoints or throw ## them out as false alarms. ## ## We do this by "leapfrogging" every other breakpoint. This gives us ## a set of segments that all have another breakpoint in them. We want ## to see if these segments are homogeneous, because if they are, it ## means that the breakpoint we previously found in the segment has ## been superseded. new_breakpoints = sorted(new_breakpoints) seg_bounds = zip(new_breakpoints[:-2], new_breakpoints[2:]) remove_breakpoints = set() merged_breakpoints = set() if iter > 0: print "Merge segments (isplit = 0), ipass: "******"Stable segment: ", imo2iym(l), imo2iym(r) # if l == first: # new_breakpoints.append(first) # seg_lookup.append(((l, r), 'stable')) # continue # Set the within() method to check if this segment is within any # previously found homogenous ones. Use lambda, since we can't pass # keyword or positional arguments to map(). within_this_seg = lambda seg: within((l, r), seg) within_stable_segs = map(within_this_seg, homog_segs) if any(within_stable_segs): print "Stable segment: ", imo2iym(l), imo2iym(r) #if l == first: # new_breakpoints.append(first) merged_breakpoints.update([l, r]) continue ## Apply the same adjustments and the same standard normal homogeneity ## test that we did in the previous splitting process. There is no ## difference here until we consider what to do if we find a new ## homogeneous segment. 
adjust = int(seg_bounds.index((l, r)) > 0) segment = diff_data[l+adjust:r+1] z = standardize(segment, MISS) likelihood_ratios = snht(z, MISS, standardized=True) z_count = len(get_valid_data(z)) ind_max_ratio = 0 max_ratio = 0.0 clip_ratios = likelihood_ratios[2:-2] # We clip the beginning and end for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios): if ratio > max_ratio: ind_max_ratio = ind max_ratio = ratio crit_val = lrt_lookup(z_count) possible_changepoint = l + ind_max_ratio + 2 + adjust y_new, m_new = imo2iym(possible_changepoint) if z_count < 2: y1, m1 = imo2iym(l) y2, m2 = imo2iym(r) print "%6s-%6s MD No found peaks %4d %2d to %4d %2d" % (id1,id2,y1,m1,y2,m2) print "%6s-%6s MD Compress 1 out peak at %4d %2d" % (id1,id2,y_new,m_new) #remove_breakpoints.add_ ## If we found a new breakpoint that is statistically significant, then ## great! Let's keep it. if max_ratio > crit_val: print "%6s-%6s MD Peak kept in merge at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val) merged_breakpoints.add(l) merged_breakpoints.add(new_bp) merged_breakpoints.add(r) ## If not, then this segment was homogeneous, so the breakpoint which ## already exists in it is no good. else: print "%6s-%6s MD Compress 2 out peak at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val) # Crap, if there are any potential breakpoints in this segment, # we need to remove them because this segment is homogeneous. Let's # remember this homogeneous segment for now and come back once # we've found all of them. merged_breakpoints.update([l, r]) remove_breakpoints.add(new_bp) ## At this point, we have a set of all the breakpoints we've accumulated ## during this iteration of split/merge, as well as a set of breakpoints ## which we've found to be of no further use. We can difference update ## our set of breakpoints to remove these guys, and let those merged ## breakpoints be the set of newest breakpoints for the next splitmerge ## iteration. 
merged_breakpoints.difference_update(remove_breakpoints) breakpoints = list(merged_breakpoints) breakpoints = sorted(breakpoints) ## Did we actually find new breakpoints? If not, then we're done ## with splitmerge and can move on to the BIC process. enter_BIC = (breakpoints == last_breakpoints) iter = iter + 1 ## Okay wow, we've potentially made it to the BIC stage now... ! if first not in breakpoints: breakpoints.insert(0, first) ym_breakpoints = map(imo2iym, breakpoints) #print ym_breakpoints ## ENTERING MINBIC bp_dictionary = dict() #################################### ##### MULTIPROCESS from multiprocessing import Pool global counter multi_bp_dict = {} counter = 0 def cb(r): global counter #print counter, r counter += 1 start = time.clock() po = Pool(processes=4) for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]): if left != first: left = left + 1 # recall that we only consider data after the first full year. we will be # computing regressions with the independent variable indexed from this # starting point, so we need to shift these indices. we also need to shift them # by +1 if this is any segment beyond the first one, so that we don't include # changepoints in more than one analysis. # TOTAL_SHIFT = -12 + 1 = -11 # # However, this shift is only necessary while looking at the array indices that # we generate using range(). the data should already be aligned correctly. 
total_shift = -12 + 1 left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift y1, m1 = imo2iym(left) yb, mb = imo2iym(bp) y2, m2 = imo2iym(right) #print "Entering MINBIC - %4d %2d %4d %2d %4d %2d" % (y1, m1, yb, # mb, y2, m2) (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1] bp_index = bp-left #print len(seg_x), len(seg_data), bp_index #bp_analysis = minbic(seg_x, seg_data, bp_index, MISS) multi_bp_dict[bp] = po.apply_async(minbic,(seg_x,seg_data,bp_index,MISS,),callback=cb) po.close() po.join() for bp in multi_bp_dict: r = multi_bp_dict[bp] multi_bp_dict[bp] = r.get() #print "counter - %d" % counter elapsed = (time.clock() - start) print "ELAPSED TIME - %2.3e" % elapsed #print new_bp_dict #################################### ##### NORMAL # start = time.clock() # for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]): # # if left != first: # left = left + 1 # # recall that we only consider data after the first full year. we will be # # computing regressions with the independent variable indexed from this # # starting point, so we need to shift these indices. we also need to shift them # # by +1 if this is any segment beyond the first one, so that we don't include # # changepoints in more than one analysis. # # TOTAL_SHIFT = -12 + 1 = -11 # # # # However, this shift is only necessary while looking at the array indices that # # we generate using range(). the data should already be aligned correctly. 
# total_shift = -12 + 1 # left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift # y1, m1 = imo2iym(left) # yb, mb = imo2iym(bp) # y2, m2 = imo2iym(right) # print "Entering MINBIC - %4d %2d %4d %2d %4d %2d" % (y1, m1, yb, # mb, y2, m2) # (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1] # bp_index = bp-left # #print len(seg_x), len(seg_data), bp_index # bp_analysis = minbic(seg_x, seg_data, bp_index, MISS) # # bp_dictionary[bp] = bp_analysis # elapsed2 = (time.clock() - start) # print "ELAPSED TIME = %3.2e" % elapsed2 ##################################3 ## Print the adjustment summaries bp_dictionary = multi_bp_dict sorted_bps = sorted(bp_dictionary.keys()) ndelete = [] valid_bps = {} for bp in sorted_bps: stats = bp_dictionary[bp] cmodel=stats['cmodel'] iqtype=stats['iqtype'] asigx=stats['offset'] azscr=stats['offset_z'] rslp=stats['slopes'] end1 = bp y_end1, m_end1 = imo2iym(end1) beg2 = bp+1 y_beg2, m_beg2 = imo2iym(beg2) # If cmodel is *SLR*, then there is no breakpoint if 'SLR' in cmodel: print ("%s-%s -- -- MD TESTSEG SKIP: %7.2f %5d %5d %3d %5d %5d %3d" % (id1, id2, asigx, end1, y_end1, m_end1, beg2, y_beg2, m_beg2)) # Don't store it! else: print ("%6s-%6s -- -- MD TESTSEG ADJ: %7.2f %7.2f %8.4f %8.4f %5d %5d %3d %5d %5d %3d %2d" % (id1,id2, asigx, azscr, rslp[0], rslp[1], end1, y_end1, m_end1, beg2, y_beg2, m_beg2, iqtype)) # Store it! valid_bps[bp] = stats ############################### ## Go back and see if we can get rid of some of the change points. 
## If 2 or more of the chgpts are within MINLEN, ## a) if the chgpt estimates are the same sign, then test each ## singly with same endpoints and keep lowest BIC ## b) if not the same sign, ## retain earliest changepoint # add the first, last to valid_bps interior_bps = valid_bps.keys() # Add first, last if not already in interior_bps for bp in [first, last]: if bp not in interior_bps: interior_bps.append(bp) sorted_bps = sorted(interior_bps) for left in sorted_bps: print sorted_bps, left ## We're looking for the next interim breakpoint that satisfies two ## conditions: ## 1) at least MINLEN valid data (non-missing to the right) ## 2) has at least one breakpoint between 'left' and it right = 0 close_bps = [] for right in sorted_bps: if right <= left: continue if not close_bps: close_bps.append(right) else: valid_between_bps = diff_data[close_bps[-1]:right] valid_length = len(get_valid_data(valid_between_bps, MISS)) print imo2iym(close_bps[-1]),valid_length,imo2iym(right) if valid_length > MINLEN: break close_bps.append(right) # We could actually run out of things in sorted_bps, and wind up with # right == close_bps[-1]. Detect that and break out of this analysis # if that happens. if close_bps[-1]==right: break if left != first: left = left + 1 close_bp_results = {} for bp in close_bps: # # recall that we only consider data after the first full year. we will be # # computing regressions with the independent variable indexed from this # # starting point, so we need to shift these indices. we also need to shift them # # by +1 if this is any segment beyond the first one, so that we don't include # # changepoints in more than one analysis. # # TOTAL_SHIFT = -12 + 1 = -11 # # # # However, this shift is only necessary while looking at the array indices that # # we generate using range(). the data should already be aligned correctly. 
total_shift = -12 + 1 left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift y1, m1 = imo2iym(left) yb, mb = imo2iym(bp) y2, m2 = imo2iym(right) print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" print y1,m1,"-",yb,mb,"-",y2,m2 print "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1] bp_index = bp-left bp_analysis = minbic(seg_x, seg_data, bp_index, MISS, kthslr0_on=True) cmodel=bp_analysis['cmodel'] iqtype= bp_analysis['iqtype'] offset= bp_analysis['offset'] rslp= bp_analysis['slopes'] crit_val = bp_analysis['crit_val'] test_stat = bp_analysis['test_stat'] bic = bp_analysis['bic'] print ("Interim chgpt: %s %4d %2d %4d %2d %4d %2d %8.2f %8.2f %8.2f %8.2f %7.3f %7.3f %2d" % (pair_str, y1, m1, yb, mb, y2, m2, bic, test_stat, crit_val, offset, rslp[0], rslp[1], iqtype)) close_bp_results[bp] = bp_analysis # Now we have a small problem... we might have more than one breakpoint, # so we need to choose which one is best. 
We will check the sign of # the breakpoint amplitude changes: sign_of_amps = map(sign, [close_bp_results[bp]['offset'] for bp in close_bps]) positive = lambda x: sign(x) >= 0 negative = lambda x: sign(x) <= 0 zero = lambda x: sign(x) == 0 print "------------>",[close_bp_results[bp]['offset'] for bp in close_bps] if (all(map(positive, sign_of_amps)) or all(map(negative, sign_of_amps))): # Pick the best (minimum BIC) bics = [(bp, close_bp_results[bp]['bic']) for bp in close_bps] sorted_bics = sorted(bics, key=operator.itemgetter(1)) smallest_bp = sorted_bics[0][0] # Remove this smallest-bic bp from the in-interval bps close_bps.remove(smallest_bp) valid_bps[smallest_bp] = close_bp_results[smallest_bp] #print "leftovers",close_bps for bp in close_bps: # The remaining bps which we will reject sorted_bps.remove(bp) # Remove them from this loop del valid_bps[bp] # Remove them as valid yb, mb = imo2iym(smallest_bp) print ("Same domain - Lowest Interim: %s %4d %2d" % (pair_str, yb, mb)) elif (all(map(zero, sign_of_amps))): # Choose the earliest changepoint; the rest of these have # amplitude changes which are 0. first_bp, last_bp = close_bps[0], close_bps[-1] # Remove the first interim bp and update valid_bps with this new # computation. close_bps.remove(first_bp) valid_bps[first_bp] = close_bp_results[first_bp] # Reject remaining interim bps for bp in close_bps: sorted_bps.remove(bp) del valid_bps[bp] yb, mb = imo2iym(first_bp) print ("Null domain - Earliest Interim : %s %4d %2d" % (pair_str, yb, mb)) else: # We'll use the earliest interim changepoint, but we need # to get rid of bad data. Replace all the data between the # interim changepoints as missing and re-compute BIC. 
first_bp, last_bp = close_bps[0], close_bps[-1] first_bp_index = first_bp-left last_bp_index = last_bp-left print len(seg_x), len(seg_data) print first_bp_index+1, last_bp_index+1 print left, bp, right for i in range(first_bp_index+1, last_bp_index+1): print i, imo2iym(i), i+left, imo2iym(i+left) seg_x[i] = MISS seg_data[i] = MISS # Recall that seg_data[0] == diff_data[left]. ndelete records # the *true month where there is unviable data*, so it needs to # point back to the original element in diff_data we are # worried about. ndelete.append(i+left) bp_analysis = minbic(seg_x, seg_data, first_bp_index, MISS, kthslr0_on=True) # Remove the first interim bp and update valid_bps with this new # computation. close_bps.remove(first_bp) valid_bps[first_bp] = bp_analysis # Reject remaining interim bps for bp in close_bps: sorted_bps.remove(bp) del valid_bps[bp] yb, mb = imo2iym(first_bp) print ("Diff domain - Earliest Interim : %s %4d %2d" % (pair_str, yb, mb)) ## Remove changepoints which are an SLR model. nspan = [0]*num_months bp_count = 1 for bp in sorted(valid_bps.keys()): bp_analysis = valid_bps[bp] if "SLR" in bp_analysis['cmodel']: del valid_bps[bp] continue print " IN: ",bp nspan[bp] = bp_count ## If adjacent months are missing next to this breakpoint, then ## assume that those could be a breakpoint as well and copy this ## breakpoint's analysis results for them. for month in range(bp+1, last): if (month in ndelete) or (diff_data[month] == MISS): nspan[month] = bp_count print " IN: ",month valid_bps[month] = bp_analysis else: break bp_count += 1 valid_bps['del'] = ndelete valid_bps['nspan'] = nspan pair_results[pair_str] = valid_bps #print "ELAPSED TIMES = %3.2e %3.2e" % (elapsed1, elapsed2) print "done" ## import pickle f = open("pair_results", 'w') pickle.dump(pair_results, f) return pair_results
def snht(data, missing_val=-9999, valid_count=None, standardized=False):
    """Standard normal homogeneity test.

    Loops over the given data and computes the likelihood ratio statistic
    for every value, using that value as the pivot to divide the data in two
    segments. For a pivot with `nleft` valid values at or before it (mean
    `mean_left`) and `nright` valid values after it (mean `mean_right`), the
    statistic stored is

        nleft*mean_left**2 + nright*mean_right**2

    :Param data:
        The data, as a list of floats, on which to conduct the test.
    :Param missing_val:
        (optional) The placeholder for missing values which should be
        excluded from computations.
    :Param valid_count:
        (optional) The number of valid values in the dataset. When omitted
        (or falsy) it is counted from the data.
    :Param standardized:
        (optional) Boolean flag indicating whether or not the data has
        already been standardized into a reference series. If not, will
        invoke the standardization procedure on the data.
    :Return:
        A list with the same length as the data, containing the likelihood
        ratio test statistic for each pivot that was evaluated. Every other
        position (missing input values and pivots that were not evaluated)
        holds missing_val.

    """
    ## Standardize the data if this hasn't already been done
    if not standardized:
        data = standardize(data, missing_val)

    ## Return array of computed test statistics, initialized to missing_val's
    ts = [missing_val] * len(data)

    if not valid_count:
        valid_count = len(get_valid_data(data, missing_val))

    # KNOWN BUG (kept on purpose): this is *really* counter-intuitive, and
    # matches the original PHA code. valid_count is the number of 'valid'
    # data points - not *all* the data points - yet it is used both as the
    # upper bound on pivot positions and as the right edge of the right-hand
    # segment. When the data contains missing values, some data on the far
    # right hand side of the array is therefore never examined.
    #
    # -- CONFIRMED that this is a bug. The fix is to build the right-hand
    #    segment from data[pivot+1:] instead of data[pivot+1:valid_count];
    #    it is to be deployed when a new copy of the Fortran PHA is available.
    for pivot in range(valid_count - 1):
        ## Only a valid data point can act as a pivot.
        if data[pivot] == missing_val:
            continue

        # Pass missing_val explicitly so that a non-default placeholder is
        # filtered out of both segments, exactly as it is when counting the
        # valid values above.
        left_series = get_valid_data(data[:pivot + 1], missing_val)
        right_series = get_valid_data(data[pivot + 1:valid_count],
                                      missing_val)
        nleft = len(left_series)
        nright = len(right_series)

        # The left segment always contains data[pivot] itself. The right
        # segment only shrinks as the pivot advances, so once it holds no
        # valid data no later pivot can be evaluated either: stop here.
        if nleft == 0 or nright == 0:
            break

        # float() forces true division even when the caller hands in
        # integer-valued data (plain '/' floors ints under Python 2).
        mean_left = sum(left_series) / float(nleft)
        mean_right = sum(right_series) / float(nright)

        ## Compute and store test statistics
        ts[pivot] = nleft * (mean_left ** 2) + nright * (mean_right ** 2)

    return ts