def monthly_anomaly_series(self):
    """Return the monthly anomalies of this series as one flat list.

    The anomalies are computed from ``self.series`` by
    compute_monthly_anomalies(), with ``self.MISSING_VAL`` passed as the
    missing-data flag, and the result is then passed through
    ``self._flatten_months()``.

    :Return: The flattened anomalies; the list is meant to have length
        (len(self.years)*12).

    """
    return self._flatten_months(
        compute_monthly_anomalies(self.series, self.MISSING_VAL)
    )
def preprocess(network, **params): """Performs the pre-processing necessary to run the pairwise homogenization algorithm on a network of USHCN coop station data. This involves computing neighborhoods of stations which are close to each other, as well as finding stations within these neighborhoods that are highly correlate. :Param network: The network to pre-process, which by this point should have the instance variables :Ivar stations: A dictionary mapping of station coop ids as strings to the Station object holding metadata for that station. :Ivar raw_series: A dictionary mapping of station coop ids as strings to the Series object containing the data read in for that station. :Return: Modifies network by adding the instance variables :Ivar neighborhoods: A dictionary mapping of station coop ids as strings to a list of other station coop id strings which are considered the "neighbors" of the key station. :Ivar correlations: A dictionary mapping of station coop ids as strings to a dictionary which maps the highest correlated neighbors of this key station to their correlation coefficient. For instance, if station "111111" correlates to "222222" with r = 0.45 and to "333333" with r = 0.98, then network.correlations['111111'] == dict("222222":0.45, "333333":0.98 ) """ print "Analyzing geographic network neighborhoods" all_neighbors = dict() stations_list = network.stations.values() for station in stations_list: print station.coop_id print "...computing neighbor distances" neighbors = find_neighborhood(station, stations_list, **params) all_neighbors[station.coop_id] = neighbors network.neighborhoods = all_neighbors # Write a neighborhood output file. Since my algorithm for computing distance # is slightly different than ushcn_dist_2004.v3, it won't produce exactly the # same distance output file. However, all the distances are within 10km, which # is perfectly fine. 
A bigger problem is that the ushcn_dist_2004.v3 outputs a # list of pointers for referencing the various stations. Since my code doesn't # emulate the Fortran code in terms of having arrays with lengths hard-coded, # I don't pass around pointers in the same way, so the ptr_str produced and # written here is garbage. It should not be a problem for coding MW2009, though, # because I can find information about stations dynamically and easily. print "...Assembling neighborhood output file" dist_out = open(params["dist_file"], "wb") for station in stations_list: print " ", station neighbors = all_neighbors[station.coop_id] out_strings = neighborhood_strings(station, neighbors, stations_list) dist_out.writelines(out_strings) dist_out.close() ########################################################################## # Go through all the data we have, and replace the read-in values with # monthly anomalies. Then, flatten the data into a list with all the data # and length (endyr-begyr)*12 for s in network.raw_series.itervalues(): data = s.series anomalies = compute_monthly_anomalies(data, -9999) s.set_series(anomalies, s.years) print "Determining correlated neighbors" if os.path.exists(params["corr_file"]): print "...great, I'm gonna read it from disk...", corr_file = open(params["corr_file"]) all_lines = corr_file.readlines() station_lines = all_lines[::2] corr_lines = all_lines[1::2] all_corrs = dict() for (sta_line, corr_line) in zip(station_lines, corr_lines): stations = sta_line.strip().split() this, others = stations[0], stations[1:] corrs = map(float, corr_line.strip().split()[1:]) corr_dict = dict() for (id, corr) in zip(others, corrs): if not id == "000000": corr_dict[id] = corr all_corrs[this] = corr_dict print " that was fast!" 
else: print "...need to do all the compuations" all_corrs = dict() for cand_series in network.raw_series.itervalues(): coop_id1 = cand_series.coop_id corr_dict = find_correlations(cand_series, network.raw_series, network.neighborhoods[coop_id1], **params) all_corrs[coop_id1] = dict(corr=corr_dict) # Write a correlation output file. The actual correlations between stations # matches *perfectly* those computed with the ushcn_corr_2004.v3 code. However, # there are still issues with Fortran array pointers since I don't use any here # and I don't bother to pad the output file with bogus stations and correlations # if there are less than we hoped to find. print "...Assembling neighborhood correlation file\n" corr_out = open(params["corr_file"], "wb") station_list = network.stations.keys() for sta_id in station_list: correlations = all_corrs[sta_id]["corr"] sorted_neighbors = sorted(correlations.iteritems(), key=itemgetter(1), reverse=True)[ : params["numcorr"] - 1 ] # PAD PAD PAD ## Just pad the output with '000000' stations, r = 0.0 to make it look ## like the normal output. while len(sorted_neighbors) < params["numcorr"] - 1: sorted_neighbors.append(("000000", 0.0)) ids, corrs = zip(*sorted_neighbors) id_str = ("%6s " % sta_id) + "".join(("%6s " % id for id in ids)) + "\n" # ptr_str = ("{0: >6d} ".format(test_stations.index(sta_id)+1))+"".join(("{0: >6d} ".format(test_stations.index(id)+1) for id in ids))+"\n" corr_str = (" 1.00 ") + "".join(("{0: >6.2f} ".format(c) for c in corrs)) + "\n" # corr_out.writelines((id_str, ptr_str, corr_str)) corr_out.writelines((id_str, corr_str)) corr_out.close() network.correlations = all_corrs
def splitmerge(network, pairs=None, beg_year=1, end_year=2, **kwargs):
    """Searches each station pair's difference series for undocumented
    changepoints (split/merge passes followed by minbic() model selection).

    For every (id1, id2) in pairs, the series in network.raw_series are deep
    copied, scaled by 0.1 and converted to monthly anomalies; the two monthly
    series are differenced; and candidate breakpoints are proposed with snht(),
    tested against lrt_lookup() and then evaluated with minbic().

    NOTE(review): two spans of this function are damaged in the source text
    (marked with "******" below), so it cannot run as written. The nesting
    around those spans has been inferred and must be checked against an intact
    copy.

    :Param network:
        Object providing the instance variables read here
        :Ivar stations:
            Mapping indexed by station id.
        :Ivar raw_series:
            Mapping of station id to a series object exposing series, years,
            MISSING_VAL, monthly_series and set_series().
    :Param pairs:
        Iterable of (id1, id2) tuples to process. NOTE(review): the default of
        None is iterated directly below, which would raise a TypeError -
        callers apparently must always pass it.
    :Param beg_year, end_year:
        Only their difference is used: (end_year - beg_year)*12 months are
        scanned.
    :Param kwargs:
        Not referenced anywhere in the visible body.
    :Return:
        A dictionary mapping "%6s-%6s" % (id1, id2) to that pair's valid_bps
        dictionary: month index -> minbic() result, plus the keys 'del' (list
        of month indices recorded in ndelete) and 'nspan' (list of length
        num_months holding a breakpoint counter per month). The same dictionary
        is also pickled to a file named "pair_results" in the working
        directory.

    Relies on names that are not defined in this function (deepcopy, diff,
    scale_series, compute_monthly_anomalies, imo2iym, within, standardize,
    snht, get_valid_data, lrt_lookup, minbic, sign, MINLEN, time, operator).

    """
    ## EXPERIMENTAL PLACEHOLDERS - will eventually be replaced with a master
    ## loop to do all the id pairs.
    id_list = network.stations.keys()
    pair_results = dict()

    def dict_to_tuples(d):
        # Returns the (key, value) pairs of d as a list of tuples. Only used
        # by the commented-out pair generation below.
        keys = d.keys()
        return [(key, d[key]) for key in keys]

    ## Generate station pairs for use in splitmerge by iteratively going through the
    ## station_list and adding stations in order of decreasing correlation. Skip a
    ## neighbor if the pair is already present; want 20 stations or until all the
    ## correlated neighbors are used up.
    # pairs = []
    # for id1 in id_list:
    #     neighbors = dict_to_tuples(network.correlations[id1])
    #     sorted_neighbors = sorted(neighbors, key=operator.itemgetter(1))
    #     added_pairs = 0
    #     while sorted_neighbors and (added_pairs < 5):
    #         id2, _ = sorted_neighbors.pop()
    #         ordered_pair = tuple(sorted((id1, id2)))
    #         if not ordered_pair in pairs:
    #             pairs.append(ordered_pair)
    #             added_pairs += 1

    for (id1, id2) in pairs:
        print "Pair %s with %s" % (id1, id2)
        pair_str = "%6s-%6s" % (id1, id2)
        #if pair_str != "051528-298107":
        #    continue

        raw_series = network.raw_series
        stations = network.stations
        # Work on a copy so that network.raw_series itself is not modified by
        # the scaling / anomaly conversion below.
        series_copy = deepcopy(raw_series)

        min_ann = 5
        num_years = end_year - beg_year
        num_months = num_years*12

        for s in series_copy.itervalues():
            data = s.series
            scaled = scale_series(data, 0.1, s.MISSING_VAL)
            anomalies = compute_monthly_anomalies(scaled, s.MISSING_VAL)
            s.set_series(anomalies, s.years)

        ## Retrieve the data for each of the stations.
        station1 = stations[id1]
        series1 = series_copy[id1]
        data1 = series1.monthly_series

        station2 = stations[id2]
        series2 = series_copy[id2]
        data2 = series2.monthly_series

        #print data1[:50]
        #print data2[:50]
        #print "################################################################"

        ## Compute the difference series
        diff_data = diff(data1, data2)
        MISS = series1.MISSING_VAL # Missing value placeholder

        ## Quickly pass through the data to find where it starts. We need to do this
        ## because it's possible that beg_year is earlier than the first year of
        ## valid data in either data1 or data2. Furthermore, the original PHA code
        ## deliberately clipped off the first year of good data, so emulate that
        ## effect here as well.
        ##
        ## Ultimately, we save the extreme early and extreme late month with valid
        ## data to use as our first guess at the undocumented changepoints.
        first = 0
        first_set = False
        last = 0
        for (i, d1, d2) in zip(xrange(num_months), data1, data2):
            if d1!=MISS and d2!=MISS:
                # 'first' keeps advancing until it reaches an index >= 12.
                if first < 12:
                    first = i
                #first_set = True
                #if not first_set:
                #    first = i
                #    first_set = True
                last = i

        ## Set the initial breakpoints and the list of already-found, homogenous
        ## segments.
        breakpoints = [first, last, ]
        homog_segs = []

        #####################################################################
        ## BEGIN SPLITMERGE PROCESS TO GENERATE FIRST GUESS AT UNDOCUMENTED
        ## CHANGEPOINTS
        iter = 0 # counts how many times we've repeated the splitmerge process
        enter_BIC = False # break out of iterations into the BIC process?
        last_breakpoints = []
        while (iter < 10) and not enter_BIC:

            seg_bounds = zip(breakpoints[:-1], breakpoints[1:])
            last_breakpoints = deepcopy(breakpoints)
            new_breakpoints = deepcopy(breakpoints)

            new_homog_segs = []

            # NOTE(review): the next statement is damaged in the source text;
            # the "******" run replaced an unknown amount of code. The lost
            # span presumably held the rest of this print, the loop over
            # seg_bounds that binds l and r, and the assignments of y1, m1,
            # y2, m2, adjust and segment that the code below reads - TODO
            # restore from version control. The nesting of the statements that
            # follow is inferred, not recovered.
            print "Parse segments (isplit = 1), ipass: "******"Too short: ", imo2iym(l), imo2iym(r)
                    continue

                ## If we've previously found that this segment is homogenous (has no
                ## potential changepoint), then we can skip it as well and proceed to
                ## the next one.
                # Set the within() method to check if this segment is within any
                # previously found homogenous ones. Use lambda, since we can't pass
                # keyword or positional arguments to map().
                within_this_seg = lambda seg: within((l, r), seg)
                within_stable_segs = map(within_this_seg, homog_segs)
                if any(within_stable_segs):
                    print "Stable segment: ", imo2iym(l), imo2iym(r)
                    if l == first:
                        new_breakpoints.append(first)
                    continue

                ## The standard normal homogeneity test - which is the statistical test
                ## we'll use to see if there is a potential changepoint in this segment
                ## - requires us to normalize our paired difference series. We can do
                ## that in snht(), but we'll do it right now so we can inspect those
                ## standardized values later.
                z = standardize(segment, MISS)

                ## Apply standard normal homogeneity test.
                ## For mechanics, see Alexandersson and Moberg 1997, Int'l Jrnl of
                ## Climatology (pp 25-34)
                likelihood_ratios = snht(z, MISS, standardized=True)
                z_count = len(get_valid_data(z))

                ## We're left with the likelihood ratio for each value being a potential
                ## changepoint. Find the max ratio, and if that value is significant, let
                ## it be the newest potential changepoint.
                ind_max_ratio = 0
                max_ratio = 0.0
                clip_ratios = likelihood_ratios[2:-2] # clip the beginning and end,
                                                      # they can't be changepoints.
                for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios):
                    if ratio > max_ratio:
                        ind_max_ratio = ind
                        max_ratio = ratio

                ## Now we find the critical value for this data set, and check our max
                ## likelihood ratio against it
                crit_val = lrt_lookup(z_count)

                # The possible changepoint is the index of the max ratio we found.
                # We have to shift it the following ways to align it to the original
                # data -
                #    1) shift by 2 re-aligns it from clip_ratios to likelihood_ratios
                #    2) shift by adjust re-aligns it to this segment in diff_data
                #    3) shift by l re-aligns it to the first index in diff_data
                possible_changepoint = l + ind_max_ratio + 2 + adjust

                y_new, m_new = imo2iym(possible_changepoint) # year, month

                ## If this is the first iteration, we indicate as such, and add the new
                ## changepoint
                if iter == 0:
                    print "%6s-%6s MD FIRST series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val)
                    # Note that on the first pass the changepoint is added to
                    # 'breakpoints' unconditionally (no significance check) and
                    # not to 'new_breakpoints'.
                    breakpoints.append(possible_changepoint)
                    breakpoints = sorted(breakpoints)

                else:
                    ## Else, if we found a new possible changepoint, add it to our list.
                    if max_ratio > crit_val:
                        print "%6s-%6s MD Inhomogenity for series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count)
                        new_breakpoints.append(possible_changepoint)

                    ## If not, record that we found a homogeneous segment.
                    else:
                        print "%6s-%6s MD Homogeneous series %4d %2d to %4d %2d | at %4d %2d ts: %4.2f limit >: %3.2f %4d" % (id1,id2,y1,m1,y2,m2,y_new,m_new,max_ratio,crit_val,z_count)
                        new_homog_segs.append((l, r))

            ## Now we need to update our account of which segments were homogeneous,
            ## because we need to know during the next iteration. We will do this,
            ## as well as condense stable segments that lie adjacent to each other
            ## i.e, if we have the segments [(1,5), (5, 10), (12, 15)], then we
            ## really have [(1,10), (12, 15)].
            homog_segs.extend(new_homog_segs)
            if homog_segs:
                homog_segs = sorted(homog_segs, key=operator.itemgetter(0))
                final_homog_segs = [homog_segs[0], ] # this will be like a stack
                for seg in homog_segs[1:]:
                    last_seg = final_homog_segs[-1]
                    # Segments that share an endpoint are fused into one.
                    if last_seg[1] == seg[0]:
                        new_seg = (last_seg[0], seg[1])
                        final_homog_segs.pop()
                        final_homog_segs.append(new_seg)
                    else:
                        final_homog_segs.append(seg)
                homog_segs = final_homog_segs

            ## So we have new segments that can be generated from these new
            ## breakpoints. Now, the PHA routine enters a "merge" process
            ## to see whether or not to keep these newly found changepoints or throw
            ## them out as false alarms.
            ##
            ## We do this by "leapfrogging" every other breakpoint. This gives us
            ## a set of segments that all have another breakpoint in them. We want
            ## to see if these segments are homogeneous, because if they are, it
            ## means that the breakpoint we previously found in the segment has
            ## been superseded.
            new_breakpoints = sorted(new_breakpoints)
            seg_bounds = zip(new_breakpoints[:-2], new_breakpoints[2:])

            remove_breakpoints = set()
            merged_breakpoints = set()
            if iter > 0:
                # NOTE(review): this statement is damaged in the source text as
                # well ("******"). The lost span presumably held the rest of
                # this print and the loop over the leapfrogged seg_bounds that
                # binds l, r and new_bp (new_bp is read below but never
                # assigned in the visible code) - TODO restore from version
                # control. Whether that loop sits inside this 'if' cannot be
                # told from here; the nesting below is inferred.
                print "Merge segments (isplit = 0), ipass: "******"Stable segment: ", imo2iym(l), imo2iym(r)
                #     if l == first:
                #         new_breakpoints.append(first)
                #     seg_lookup.append(((l, r), 'stable'))
                #     continue

                # Set the within() method to check if this segment is within any
                # previously found homogenous ones. Use lambda, since we can't pass
                # keyword or positional arguments to map().
                within_this_seg = lambda seg: within((l, r), seg)
                within_stable_segs = map(within_this_seg, homog_segs)
                if any(within_stable_segs):
                    print "Stable segment: ", imo2iym(l), imo2iym(r)
                    #if l == first:
                    #    new_breakpoints.append(first)
                    merged_breakpoints.update([l, r])
                    continue

                ## Apply the same adjustments and the same standard normal homogeneity
                ## test that we did in the previous splitting process. There is no
                ## difference here until we consider what to do if we find a new
                ## homogeneous segment.
                # adjust is 1 for every segment except the first one in seg_bounds.
                adjust = int(seg_bounds.index((l, r)) > 0)
                segment = diff_data[l+adjust:r+1]

                z = standardize(segment, MISS)
                likelihood_ratios = snht(z, MISS, standardized=True)
                z_count = len(get_valid_data(z))

                ind_max_ratio = 0
                max_ratio = 0.0
                clip_ratios = likelihood_ratios[2:-2] # We clip the beginning and end
                for (ind, ratio) in zip(xrange(len(clip_ratios)), clip_ratios):
                    if ratio > max_ratio:
                        ind_max_ratio = ind
                        max_ratio = ratio
                crit_val = lrt_lookup(z_count)
                possible_changepoint = l + ind_max_ratio + 2 + adjust

                y_new, m_new = imo2iym(possible_changepoint)

                if z_count < 2:
                    y1, m1 = imo2iym(l)
                    y2, m2 = imo2iym(r)
                    print "%6s-%6s MD No found peaks %4d %2d to %4d %2d" % (id1,id2,y1,m1,y2,m2)
                    print "%6s-%6s MD Compress 1 out peak at %4d %2d" % (id1,id2,y_new,m_new)
                    #remove_breakpoints.add_
                ## If we found a new breakpoint that is statistically significant, then
                ## great! Let's keep it.
                if max_ratio > crit_val:
                    print "%6s-%6s MD Peak kept in merge at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val)
                    merged_breakpoints.add(l)
                    merged_breakpoints.add(new_bp)
                    merged_breakpoints.add(r)
                ## If not, then this segment was homogeneous, so the breakpoint which
                ## already exists in it is no good.
                else:
                    print "%6s-%6s MD Compress 2 out peak at %4d %2d | ts: %4.2f limit >: %3.2f" % (id1,id2,y_new,m_new,max_ratio,crit_val)
                    # Crap, if there are any potential breakpoints in this segment,
                    # we need to remove them because this segment is homogeneous. Let's
                    # remember this homogeneous segment for now and come back once
                    # we've found all of them.
                    merged_breakpoints.update([l, r])
                    remove_breakpoints.add(new_bp)

            ## At this point, we have a set of all the breakpoints we've accumulated
            ## during this iteration of split/merge, as well as a set of breakpoints
            ## which we've found to be of no further use. We can difference update
            ## our set of breakpoints to remove these guys, and let those merged
            ## breakpoints be the set of newest breakpoints for the next splitmerge
            ## iteration.
            merged_breakpoints.difference_update(remove_breakpoints)
            breakpoints = list(merged_breakpoints)

            breakpoints = sorted(breakpoints)

            ## Did we actually find new breakpoints? If not, then we're done
            ## with splitmerge and can move on to the BIC process.
            enter_BIC = (breakpoints == last_breakpoints)
            iter = iter + 1

        ## Okay wow, we've potentially made it to the BIC stage now... !
        if first not in breakpoints:
            breakpoints.insert(0, first)
        ym_breakpoints = map(imo2iym, breakpoints)
        #print ym_breakpoints

        ## ENTERING MINBIC
        bp_dictionary = dict()

        ####################################
        ##### MULTIPROCESS
        from multiprocessing import Pool

        # 'counter' is a module-level global that the callback below increments
        # once per completed minbic() task.
        global counter
        multi_bp_dict = {}
        counter = 0
        def cb(r):
            global counter
            #print counter, r
            counter += 1

        start = time.clock()
        po = Pool(processes=4)
        # Each triple is (left neighbor, breakpoint under test, right neighbor).
        for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]):

            if left != first:
                left = left + 1
            # recall that we only consider data after the first full year. we will be
            # computing regressions with the independent variable indexed from this
            # starting point, so we need to shift these indices. we also need to shift them
            # by +1 if this is any segment beyond the first one, so that we don't include
            # changepoints in more than one analysis.
            # TOTAL_SHIFT = -12 + 1 = -11
            #
            # However, this shift is only necessary while looking at the array indices that
            # we generate using range(). the data should already be aligned correctly.
            total_shift = -12 + 1
            left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
            y1, m1 = imo2iym(left)
            yb, mb = imo2iym(bp)
            y2, m2 = imo2iym(right)
            #print "Entering MINBIC - %4d %2d %4d %2d %4d %2d" % (y1, m1, yb,
            #                                                      mb, y2, m2)
            (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
            bp_index = bp-left
            #print len(seg_x), len(seg_data), bp_index
            #bp_analysis = minbic(seg_x, seg_data, bp_index, MISS)

            # Holds the AsyncResult for now; replaced by the actual minbic()
            # result once the pool has finished.
            multi_bp_dict[bp] = po.apply_async(minbic,(seg_x,seg_data,bp_index,MISS,),callback=cb)
        po.close()
        po.join()
        for bp in multi_bp_dict:
            r = multi_bp_dict[bp]
            multi_bp_dict[bp] = r.get()
        #print "counter - %d" % counter
        elapsed = (time.clock() - start)
        print "ELAPSED TIME - %2.3e" % elapsed
        #print new_bp_dict

        ####################################
        ##### NORMAL
        # start = time.clock()
        # for left,bp,right in zip(breakpoints[0:], breakpoints[1:], breakpoints[2:]):
        #
        #     if left != first:
        #         left = left + 1
        #     # recall that we only consider data after the first full year. we will be
        #     # computing regressions with the independent variable indexed from this
        #     # starting point, so we need to shift these indices. we also need to shift them
        #     # by +1 if this is any segment beyond the first one, so that we don't include
        #     # changepoints in more than one analysis.
        #     # TOTAL_SHIFT = -12 + 1 = -11
        #     #
        #     # However, this shift is only necessary while looking at the array indices that
        #     # we generate using range(). the data should already be aligned correctly.
        #     total_shift = -12 + 1
        #     left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
        #     y1, m1 = imo2iym(left)
        #     yb, mb = imo2iym(bp)
        #     y2, m2 = imo2iym(right)
        #     print "Entering MINBIC - %4d %2d %4d %2d %4d %2d" % (y1, m1, yb,
        #                                                          mb, y2, m2)
        #     (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
        #     bp_index = bp-left
        #     #print len(seg_x), len(seg_data), bp_index
        #     bp_analysis = minbic(seg_x, seg_data, bp_index, MISS)
        #
        #     bp_dictionary[bp] = bp_analysis
        # elapsed2 = (time.clock() - start)
        # print "ELAPSED TIME = %3.2e" % elapsed2

        ##################################3
        ## Print the adjustment summaries
        bp_dictionary = multi_bp_dict
        sorted_bps = sorted(bp_dictionary.keys())
        ndelete = []
        valid_bps = {}
        for bp in sorted_bps:
            stats = bp_dictionary[bp]

            cmodel=stats['cmodel']
            iqtype=stats['iqtype']
            asigx=stats['offset']
            azscr=stats['offset_z']
            rslp=stats['slopes']

            end1 = bp
            y_end1, m_end1 = imo2iym(end1)
            beg2 = bp+1
            y_beg2, m_beg2 = imo2iym(beg2)

            # If cmodel is *SLR*, then there is no breakpoint
            if 'SLR' in cmodel:
                print ("%s-%s -- -- MD TESTSEG SKIP: %7.2f %5d %5d %3d %5d %5d %3d" %
                       (id1, id2, asigx, end1, y_end1, m_end1, beg2, y_beg2, m_beg2))
                # Don't store it!
            else:
                print ("%6s-%6s -- -- MD TESTSEG ADJ: %7.2f %7.2f %8.4f %8.4f %5d %5d %3d %5d %5d %3d %2d" %
                       (id1,id2, asigx, azscr, rslp[0], rslp[1], end1, y_end1, m_end1, beg2, y_beg2, m_beg2, iqtype))
                # Store it!
                valid_bps[bp] = stats

        ###############################
        ## Go back and see if we can get rid of some of the change points.
        ## If 2 or more of the chgpts are within MINLEN,
        ##    a) if the chgpt estimates are the same sign, then test each
        ##       singly with same endpoints and keep lowest BIC
        ##    b) if not the same sign,
        ##       retain earliest changepoint
        # add the first, last to valid_bps
        interior_bps = valid_bps.keys()
        # Add first, last if not already in interior_bps
        # NOTE(review): this only extends the key list; first/last are not
        # inserted into valid_bps itself, so a later 'del valid_bps[bp]' on one
        # of them looks like it could raise KeyError - confirm.
        for bp in [first, last]:
            if bp not in interior_bps:
                interior_bps.append(bp)
        sorted_bps = sorted(interior_bps)

        for left in sorted_bps:
            print sorted_bps, left
            ## We're looking for the next interim breakpoint that satisfies two
            ## conditions:
            ##    1) at least MINLEN valid data (non-missing to the right)
            ##    2) has at least one breakpoint between 'left' and it
            right = 0
            close_bps = []
            for right in sorted_bps:
                if right <= left: continue

                if not close_bps:
                    close_bps.append(right)
                else:
                    valid_between_bps = diff_data[close_bps[-1]:right]
                    valid_length = len(get_valid_data(valid_between_bps, MISS))
                    print imo2iym(close_bps[-1]),valid_length,imo2iym(right)

                    if valid_length > MINLEN:
                        break
                    close_bps.append(right)

            # We could actually run out of things in sorted_bps, and wind up with
            # right == close_bps[-1]. Detect that and break out of this analysis
            # if that happens.
            if close_bps[-1]==right:
                break

            if left != first:
                left = left + 1
            close_bp_results = {}
            for bp in close_bps:

                # recall that we only consider data after the first full year. we will be
                # computing regressions with the independent variable indexed from this
                # starting point, so we need to shift these indices. we also need to shift them
                # by +1 if this is any segment beyond the first one, so that we don't include
                # changepoints in more than one analysis.
                # TOTAL_SHIFT = -12 + 1 = -11
                #
                # However, this shift is only necessary while looking at the array indices that
                # we generate using range(). the data should already be aligned correctly.
                total_shift = -12 + 1
                left_shift, bp_shift, right_shift = left+total_shift, bp+total_shift, right+total_shift
                y1, m1 = imo2iym(left)
                yb, mb = imo2iym(bp)
                y2, m2 = imo2iym(right)

                print ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
                print y1,m1,"-",yb,mb,"-",y2,m2
                print "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<"

                (seg_x, seg_data) = range(left_shift, right_shift+1), diff_data[left:right+1]
                bp_index = bp-left
                bp_analysis = minbic(seg_x, seg_data, bp_index, MISS, kthslr0_on=True)

                cmodel=bp_analysis['cmodel']
                iqtype= bp_analysis['iqtype']
                offset= bp_analysis['offset']
                rslp= bp_analysis['slopes']
                crit_val = bp_analysis['crit_val']
                test_stat = bp_analysis['test_stat']
                bic = bp_analysis['bic']

                print ("Interim chgpt: %s %4d %2d %4d %2d %4d %2d %8.2f %8.2f %8.2f %8.2f %7.3f %7.3f %2d" %
                       (pair_str, y1, m1, yb, mb, y2, m2, bic, test_stat, crit_val, offset, rslp[0], rslp[1], iqtype))

                close_bp_results[bp] = bp_analysis

            # Now we have a small problem... we might have more than one breakpoint,
            # so we need to choose which one is best. We will check the sign of
            # the breakpoint amplitude changes:
            sign_of_amps = map(sign, [close_bp_results[bp]['offset'] for bp in close_bps])
            positive = lambda x: sign(x) >= 0
            negative = lambda x: sign(x) <= 0
            zero = lambda x: sign(x) == 0
            print "------------>",[close_bp_results[bp]['offset'] for bp in close_bps]

            # NOTE(review): 'positive' and 'negative' both accept a sign of 0,
            # so an all-zero list already satisfies this first test and the
            # 'elif' below looks unreachable - confirm the intended ordering.
            if (all(map(positive, sign_of_amps)) or
                all(map(negative, sign_of_amps))):
                # Pick the best (minimum BIC)
                bics = [(bp, close_bp_results[bp]['bic']) for bp in close_bps]
                sorted_bics = sorted(bics, key=operator.itemgetter(1))
                smallest_bp = sorted_bics[0][0]

                # Remove this smallest-bic bp from the in-interval bps
                close_bps.remove(smallest_bp)
                valid_bps[smallest_bp] = close_bp_results[smallest_bp]

                #print "leftovers",close_bps
                for bp in close_bps: # The remaining bps which we will reject
                    sorted_bps.remove(bp) # Remove them from this loop
                    del valid_bps[bp] # Remove them as valid

                yb, mb = imo2iym(smallest_bp)
                print ("Same domain - Lowest Interim: %s %4d %2d" %
                       (pair_str, yb, mb))
            elif (all(map(zero, sign_of_amps))):
                # Choose the earliest changepoint; the rest of these have
                # amplitude changes which are 0.
                first_bp, last_bp = close_bps[0], close_bps[-1]

                # Remove the first interim bp and update valid_bps with this new
                # computation.
                close_bps.remove(first_bp)
                valid_bps[first_bp] = close_bp_results[first_bp]

                # Reject remaining interim bps
                for bp in close_bps:
                    sorted_bps.remove(bp)
                    del valid_bps[bp]

                yb, mb = imo2iym(first_bp)
                print ("Null domain - Earliest Interim : %s %4d %2d" %
                       (pair_str, yb, mb))
            else:
                # We'll use the earliest interim changepoint, but we need
                # to get rid of bad data. Replace all the data between the
                # interim changepoints as missing and re-compute BIC.
                first_bp, last_bp = close_bps[0], close_bps[-1]
                first_bp_index = first_bp-left
                last_bp_index = last_bp-left

                # seg_x / seg_data / bp here are whatever the last pass of the
                # 'for bp in close_bps' loop above left behind.
                print len(seg_x), len(seg_data)
                print first_bp_index+1, last_bp_index+1
                print left, bp, right
                for i in range(first_bp_index+1, last_bp_index+1):
                    print i, imo2iym(i), i+left, imo2iym(i+left)
                    seg_x[i] = MISS
                    seg_data[i] = MISS
                    # Recall that seg_data[0] == diff_data[left]. ndelete records
                    # the *true month where there is unviable data*, so it needs to
                    # point back to the original element in diff_data we are
                    # worried about.
                    ndelete.append(i+left)
                bp_analysis = minbic(seg_x, seg_data, first_bp_index, MISS, kthslr0_on=True)

                # Remove the first interim bp and update valid_bps with this new
                # computation.
                close_bps.remove(first_bp)
                valid_bps[first_bp] = bp_analysis

                # Reject remaining interim bps
                for bp in close_bps:
                    sorted_bps.remove(bp)
                    del valid_bps[bp]

                yb, mb = imo2iym(first_bp)
                print ("Diff domain - Earliest Interim : %s %4d %2d" %
                       (pair_str, yb, mb))

        ## Remove changepoints which are an SLR model.
        nspan = [0]*num_months
        bp_count = 1
        for bp in sorted(valid_bps.keys()):
            bp_analysis = valid_bps[bp]
            if "SLR" in bp_analysis['cmodel']:
                del valid_bps[bp]
                continue

            print " IN: ",bp
            nspan[bp] = bp_count
            ## If adjacent months are missing next to this breakpoint, then
            ## assume that those could be a breakpoint as well and copy this
            ## breakpoint's analysis results for them.
            for month in range(bp+1, last):
                if (month in ndelete) or (diff_data[month] == MISS):
                    nspan[month] = bp_count
                    print " IN: ",month
                    valid_bps[month] = bp_analysis
                else:
                    break
            bp_count += 1

        # These two bookkeeping entries share the dictionary with the
        # integer-keyed breakpoint results.
        valid_bps['del'] = ndelete
        valid_bps['nspan'] = nspan

        pair_results[pair_str] = valid_bps

    #print "ELAPSED TIMES = %3.2e %3.2e" % (elapsed1, elapsed2)
    print "done"
    ##
    import pickle
    # NOTE(review): the file handle is never closed explicitly, and it is
    # opened in text mode ('w'); consider a 'with' block.
    f = open("pair_results", 'w')
    pickle.dump(pair_results, f)

    return pair_results