def prepare_series(from_year, combined, urban_series, counts):
    """Build the data points used by the linearity fit.

    Each returned point is a pair *(x, d)*: *x* is a calendar year and
    *d* is ``combined[i] - urban_series[i]`` for that year, where
    *combined* is the combined rural anomaly series and *urban_series*
    is the urban station series (both annual, one datum per year).

    Only valid years are considered, that is years in which both series
    hold valid data.  A valid year is quorate when *counts* (the number
    of stations contributing to each datum of *combined*) is at least
    *parameters.urban_adjustment_min_rural_stations*.  The result runs
    from the first quorate year to the last quorate year.

    Scanning begins at calendar year *from_year* and continues to the
    end of *urban_series*.

    Returns a tuple (*points*, *count*): the list of points and the
    number of quorate years found.
    """
    # Calendar year corresponding to index 0 of the series.
    base_year = giss_data.BASE_YEAR
    # Number of quorate years seen so far.
    n_quorate = 0
    # Length of *points* as of the most recent quorate year; used to
    # drop any trailing non-quorate years just before returning.
    keep = 0
    points = []

    assert len(combined) >= len(urban_series)

    for index in range(from_year - base_year, len(urban_series)):
        rural = combined[index]
        urban = urban_series[index]
        if not (valid(rural) and valid(urban)):
            continue
        is_quorate = (counts[index] >=
                      parameters.urban_adjustment_min_rural_stations)
        if is_quorate:
            n_quorate += 1
        if n_quorate == 0:
            # Still before the first quorate year: nothing is recorded.
            continue
        points.append((index + base_year, rural - urban))
        if is_quorate:
            keep = len(points)

    return points[:keep], n_quorate
def get_longest_overlap(target, begin, records):
    """Pick the record that overlaps *target* for the most years.

    *target* is a sequence of monthly values; it is converted to an
    annual mean and annual anomalies with `series.monthly_annual`.  Each
    record's *ann_anoms* is paired with those anomalies, and a year
    counts towards the overlap when both anomalies are valid.

    Returns a triple (*record*, *diff*, *overlap*).  *diff* is the
    average difference in annual values between *record* and *target*
    (positive when *record* is higher); *overlap* is the number of
    years in the overlap.  When nothing overlaps, a record is still
    returned, with *diff* None and *overlap* 0.

    *begin* is described by callers as the first year of *target* and
    of every record; it is not used in the body of this function.
    """
    target_mean, target_anoms = series.monthly_annual(target)
    overlap = 0
    diff = None

    # :todo: the records are visited in the iteration order of a
    # temporary dict keyed by uid, and that order can affect the
    # result: ties go to the last record visited.  The dict is kept
    # for exact compatibility with previous versions.
    by_uid = {record.uid: record for record in records}
    for candidate in by_uid.values():
        pairs = [(theirs, ours)
                 for theirs, ours in zip(candidate.ann_anoms, target_anoms)
                 if valid(theirs) and valid(ours)]
        n_common = len(pairs)
        if n_common < overlap:
            continue
        overlap = n_common
        best_record = candidate
        if pairs:
            total = sum((candidate.ann_mean + theirs) - (target_mean + ours)
                        for theirs, ours in pairs)
            diff = total / n_common
    return best_record, diff, overlap
def fresh_arrays(record, years):
    """Create and return a new pair of lists (*sums*, *wgts*).

    Both lists start out with ``12 * years`` entries; *record* must not
    be longer than that (this is asserted).  The leading ``len(record)``
    slots are filled from ``record.series``: *sums* receives the datum
    where it is valid and a zero otherwise, *wgts* receives 1 for a
    valid datum and 0 otherwise.  The remaining slots stay at zero.

    The result begins at the same point in time as *record*.
    """
    nmonths = 12 * years
    # Number of months in record.
    rec_months = len(record)
    assert rec_months <= nmonths

    data = record.series
    is_valid = [valid(x) for x in data]

    sums = [0.0] * nmonths
    # Multiplying by the validity flag zeroes out invalid data.
    sums[:rec_months] = [ok * x for ok, x in zip(is_valid, data)]

    wgts = [0] * nmonths
    wgts[:rec_months] = [int(ok) for ok in is_valid]
    return sums, wgts
def valid_mean(seq, min=1):
    """Return the mean of the valid items of *seq*.

    Validity is decided by the valid() function.  If there are fewer
    than *min* valid items the result is MISSING.  The result is also
    MISSING when there are no valid items at all, whatever the value of
    *min*, because no mean can be computed from nothing.

    (The parameter name *min* shadows the builtin; it is kept because
    callers may pass it by keyword.)
    """
    total = 0.0
    count = 0
    for x in seq:
        if valid(x):
            total += x
            count += 1
    # The explicit "count" test avoids a ZeroDivisionError when
    # *min* is zero or negative and nothing in *seq* is valid.
    if count and count >= min:
        return total / float(count)
    return MISSING
def monthly_anomalies(data, reference_period=None, base_year=-9999):
    """Calculate monthly anomalies, by subtracting from every datum
    the mean for its month.

    A pair of (*monthly_mean*, *monthly_anom*) is returned.
    *monthly_mean* is a 12-long sequence giving the mean for each of
    the 12 months; *monthly_anom* is a 12-long sequence giving the
    anomalized series for each of the 12 months (each one as long as
    the number of data available for that month).

    If *reference_period* is supplied then it should be a pair
    (*first*, *last*) and the mean for a month is taken over that
    period (an example would be reference_period=(1951,1980)).
    *base_year* specifies the first year of the data.  Whenever the
    reference period yields no valid mean for a month (including when
    it lies wholly outside the data), the mean over the entire series
    for that month is used instead.

    The input data is a flat sequence, one datum per month.
    Effectively the data changes shape as it passes through this
    function.
    """
    if reference_period:
        # Clamp at zero: a reference period that begins before
        # *base_year* would otherwise give negative slice bounds, which
        # Python interprets as counting back from the end of the row.
        base = max(0, reference_period[0] - base_year)
        limit = max(0, reference_period[1] - base_year + 1)
    else:
        # An empty slice [0:0] has no valid mean, which triggers the
        # whole-period fallback below.
        base = 0
        limit = 0

    monthly_mean = []
    monthly_anom = []
    for m in range(12):
        row = data[m::12]
        mean = valid_mean(row[base:limit])
        if invalid(mean):
            # Fall back to using entire period
            mean = valid_mean(row)
        monthly_mean.append(mean)
        if valid(mean):
            monthly_anom.append(
                [x - mean if valid(x) else MISSING for x in row])
        else:
            # Same length as the rows produced in the other branch,
            # even when len(data) is not a multiple of 12.
            monthly_anom.append([MISSING] * len(row))
    return monthly_mean, monthly_anom
def extend_range(series, count, first, last):
    """Widen the adjustment range where the urban record permits.

    *first* and *last* are calendar years bounding the range of quorate
    years and *count* is the number of quorate years inside that range
    (both come from `prepare_series`).  *series* holds the urban
    station's annual anomalies, with index 0 at giss_data.BASE_YEAR.

    Returns a pair of calendar years giving the extended range; when no
    extension is possible the pair (*first*, *last*) is returned as is.
    """
    quorate_span = last - first + 1
    spare = int(
        round(count / parameters.urban_adjustment_proportion_good)
        - quorate_span)
    if spare == 0:
        # No extension possible.
        return first, last
    assert spare > 0

    # Indexes at which the urban station has a valid annual anomaly.
    valid_indexes = [i for i, x in enumerate(series) if valid(x)]
    # As calendar years, widened by one year at each end to include
    # possible partial years.
    urban_first = min(valid_indexes) + giss_data.BASE_YEAR - 1
    urban_last = max(valid_indexes) + giss_data.BASE_YEAR + 1

    # The extension always takes in all of the recent part of the
    # urban record ...
    recent_years = urban_last - last
    if spare > recent_years:
        # ... and whatever is left over is spent on the earlier part,
        # never going back beyond the start of the urban record.
        first = max(first - (spare - recent_years), urban_first)
    return first, urban_last
def combine_neighbours(iyrm, neighbours):
    """Merge the neighbour stations into one annual anomaly series.

    Each element of *neighbours* supplies an *anomalies* series and a
    *weight* (computed earlier from its distance to the urban station
    under consideration).  *iyrm* is the length of the result.  All the
    neighbour series are assumed to begin in the same year, and the
    result begins in that year too.

    Returns a tuple (*counts*, *combined*): *counts* is a per-year list
    of the number of stations combined and *combined* is the combined
    neighbour series.
    """
    weights = [0.0] * iyrm
    counts = [0] * iyrm
    combined = [MISSING] * iyrm

    # *neighbours* has generally been sorted so that its first element
    # is the one with the longest record (most valid years); that
    # station seeds the result ...
    seed = neighbours[0]
    assert len(seed.anomalies) <= iyrm
    combined[:len(seed.anomalies)] = seed.anomalies
    for year_index, anom in enumerate(seed.anomalies):
        if not valid(anom):
            continue
        weights[year_index] = seed.weight
        counts[year_index] = 1

    # ... and every other station is then folded in.
    for station in neighbours[1:]:
        cmbine(combined, weights, counts, station.anomalies, station.weight)
    return counts, combined
def adjust_record(record, fit, adjust_first, adjust_last):
    """Apply the previously fitted adjustment to a station record.

    *record* is a (monthly) station record.  Its data series is
    replaced by a new one of the same length; data outside the
    adjustment range become MISSING.

    *adjust_first* and *adjust_last* are calendar years, the first and
    last years subject to adjustment.  An adjustment year runs from the
    December before the year in question through to November (because
    the anomaly years do too).

    *fit* carries the parameters of a two-slope fit: *fit.slope1*
    between *fit.first* and *fit.knee*, and *fit.slope2* between
    *fit.knee* and *fit.last*.  When `good_two_part_fit(fit)` is false
    the single slope *fit.slope* is used for both parts.  The
    adjustment is biased so that it is zero in year *fit.last*; outside
    [*fit.first*, *fit.last*] it is held constant (zero for the recent
    part, the *fit.first* value for the earlier part).
    """
    # We assume the series starts in January.
    assert record.first_month % 12 == 1

    original = record.series
    nmonths = len(original)
    # The adjusted series is built in a fresh list.
    adjusted = [MISSING] * nmonths

    slope_early = fit.slope1
    slope_late = fit.slope2
    if not good_two_part_fit(fit):
        # Use linear approximation.
        slope_early = slope_late = fit.slope

    # Because the adjustment range is extended by 1 at either end, it
    # can reach outside the data; such month indexes are skipped below.
    for year in range(adjust_first, adjust_last + 1):
        slope = slope_late if year > fit.knee else slope_early
        # The adjustment itself is computed for the year clamped to
        # the range [fit.first, fit.last].
        clamped = max(fit.first, min(year, fit.last))
        adj = ((clamped - fit.knee) * slope
               - (fit.last - fit.knee) * slope_late)
        # Index of the December immediately before the start of *year*.
        december = 12 * (year - record.first_year) - 1
        for m in range(december, december + 12):
            if m >= nmonths:
                # Ran off the end of the series.
                break
            if m >= 0 and valid(original[m]):
                adjusted[m] = original[m] + adj

    record.set_series(record.first_month, adjusted)
def annual_anomaly(record):
    """Computes annual anomalies for the station record *record*.

    Returns a list of annual anomalies, one datum for each year (12
    months) of the input record.  Years for which an annual anomaly
    cannot be computed are recorded as MISSING.  The returned series is
    padded with MISSING so that it begins in giss_data.BASE_YEAR.  If
    no anomalies can be computed, then None is returned.

    The algorithm is as follows: compute monthly averages, then monthly
    anomalies, then seasonal anomalies (means of monthly anomalies for
    at least two months) then annual anomalies (means of seasonal
    anomalies for at least three seasons).

    This function assumes that the series starts in January.
    """
    data = record.series

    # Mean for each calendar month, or None when the month has no
    # usable data at all (in which case no anomaly is formed for it).
    monthly_means = []
    for m in range(12):
        month_data = data[m::12]
        # Neglect December of final year, as we do not use its season.
        if m == 11:
            month_data = month_data[:-1]
        # NOTE(review): the literal 9999 test is kept from the original;
        # presumably it matches MISSING, confirm against valid().
        month_data = [x for x in month_data if int(x) != 9999]
        if month_data:
            monthly_means.append(float(sum(month_data)) / len(month_data))
        else:
            # Previously this divided by zero (e.g. December of a
            # one-year record, or a month that is never reported).
            monthly_means.append(None)

    # Set to True if we have an annual anomaly for at least one year.
    good = False
    annual_anoms = []
    for y in range(len(data) // 12):
        # Seasons are Dec-Feb, Mar-May, Jun-Aug, Sep-Nov.
        # (Dec from previous year).
        total = [0.0] * 4   # total monthly anomaly for each season
        count = [0] * 4     # number of valid months in each season
        for m in range(-1, 11):
            index = y * 12 + m
            if index < 0:
                # no Dec value in year -1
                continue
            datum = data[index]
            mean = monthly_means[m % 12]
            if mean is not None and valid(datum):
                # season number 0-3
                season = (m + 1) // 3
                total[season] += datum - mean
                count[season] += 1

        # valid seasonal anomaly requires at least 2 valid months
        season_anomalies = [total[s] / count[s]
                            for s in range(4) if count[s] >= 2]

        # valid annual anomaly requires at least 3 valid seasons
        if len(season_anomalies) > 2:
            good = True
            annual_anoms.append(
                sum(season_anomalies) / len(season_anomalies))
        else:
            annual_anoms.append(MISSING)

    if not good:
        return None

    assert record.first_year >= giss_data.BASE_YEAR
    # Pad beginning of series so that it starts in giss_data.BASE_YEAR
    pad = [MISSING] * (record.first_year - giss_data.BASE_YEAR)
    return pad + annual_anoms
def reclen(s):
    """Return how many entries of *s.anomalies* are valid."""
    return sum(1 for v in s.anomalies if valid(v))
def asanom(datum):
    """Turn one datum into an anomaly: a valid *datum* has *mean*
    subtracted from it, anything else becomes MISSING.

    *mean* is not a parameter; it is looked up in the surrounding
    scope.  (NOTE(review): presumably the per-month mean computed by
    the caller; confirm where *mean* is bound.)
    """
    return datum - mean if valid(datum) else MISSING
def iter_subbox_grid(station_records, max_months, first_year, radius):
    """Convert the input *station_records*, into a gridded anomaly
    dataset which is returned as an iterator.

    *max_months* is the maximum number of months in any station
    record.  *first_year* is the first year in the dataset.  *radius*
    is the combining radius in kilometres.

    One `giss_data.Series` is yielded per subbox of `eqarea.gridsub()`.
    A subbox with no contributing stations yields an all-MISSING
    series.  Progress is written to stdout and appended to
    ``PROGRESS_DIR + 'progress.txt'``; that file is closed when the
    iterator is exhausted or closed.
    """
    # Clear Climate Code
    from steps import earth  # required for radius.
    import sys

    # Descending sort by number of good records.  sorted() returns a
    # fresh list, which is needed because the records are re-used for
    # each box (region).
    station_records = sorted(station_records, key=lambda x: x.good_count,
                             reverse=True)

    # A dribble of progress messages.
    dribble = sys.stdout

    # The context manager makes sure the progress file is closed even
    # if the consumer abandons the iterator early.
    with open(PROGRESS_DIR + 'progress.txt', 'a') as progress:
        progress.write("COMPUTING 80 REGIONS from 8000 SUBBOXES:")

        # Critical radius as an angle of arc
        arc = radius / earth.radius

        regions = list(eqarea.gridsub())
        for region in regions:
            box, subboxes = region[0], list(region[1])

            # Count how many cells are empty
            n_empty_cells = 0
            for subbox in subboxes:
                # Select and weight stations.
                # Treat all boxes that touch the poles as a single box.
                centre = eqarea.centre(subbox)
                if round(centre[0]) >= 84:
                    centre = (90, 0)
                if round(centre[0]) <= -84:
                    centre = (-90, 0)
                dribble.write("\rsubbox at %+05.1f%+06.1f (%d empty)" % (
                    centre + (n_empty_cells, )))
                dribble.flush()

                # Determine the contributing stations to this grid cell.
                contributors = list(incircle(station_records, arc, *centre))

                # Combine data.
                subbox_series = [MISSING] * max_months

                if not contributors:
                    box_obj = giss_data.Series(series=subbox_series,
                                               box=list(subbox),
                                               stations=0,
                                               station_months=0,
                                               d=MISSING)
                    n_empty_cells += 1
                    yield box_obj
                    continue

                # Initialise series and weight arrays with first station.
                record, wt = contributors[0]
                total_good_months = record.good_count
                total_stations = 1

                offset = record.rel_first_month - 1
                first_series = record.series
                subbox_series[offset:offset + len(first_series)] = (
                    first_series)
                max_weight = wt
                weight = [wt * valid(v) for v in subbox_series]

                # For logging, keep a list of stations that contributed.
                # Each item in this list is a triple (in list form, so
                # that it can be converted to JSON easily) of [id12,
                # weight, months].  *id12* is the 12 character station
                # identifier; *weight* (a float) is the weight (computed
                # based on distance) of the station's series; *months*
                # is a 12 digit string that records whether each of the
                # 12 months is used.  '0' in position *i* indicates that
                # the month was not used, a '1' indicates that it was
                # used.  January is position 0.
                months_used = [any(valid(v) for v in subbox_series[i::12])
                               for i in range(12)]
                s = ''.join('01'[x] for x in months_used)
                contributed = [[record.uid, wt, s]]

                # Add in the remaining stations
                for record, wt in contributors[1:]:
                    new = [MISSING] * max_months
                    aa, bb = record.rel_first_month, record.rel_last_month
                    new[aa - 1:bb] = record.series
                    station_months = series.combine(
                        subbox_series, weight, new, wt,
                        parameters.gridding_min_overlap)
                    n_good_months = sum(station_months)
                    total_good_months += n_good_months
                    if n_good_months == 0:
                        contributed.append([record.uid, 0.0, '0' * 12])
                        continue
                    total_stations += 1
                    s = ''.join('01'[bool(x)] for x in station_months)
                    contributed.append([record.uid, wt, s])

                    max_weight = max(max_weight, wt)

                series.anomalize(subbox_series,
                                 parameters.gridding_reference_period,
                                 first_year)
                box_obj = giss_data.Series(
                    series=subbox_series,
                    n=max_months,
                    box=list(subbox),
                    stations=total_stations,
                    station_months=total_good_months,
                    d=radius * (1 - max_weight))
                log.write("%s stations %s\n" % (box_obj.uid,
                                                asjson(contributed)))
                yield box_obj

            plural_suffix = 's'
            if n_empty_cells == 1:
                plural_suffix = ''
            dribble.write(
                '\rRegion (%+03.0f/%+03.0f S/N %+04.0f/%+04.0f W/E): %d empty cell%s.\n' %
                (tuple(box) + (n_empty_cells, plural_suffix)))
            progress.write(
                '\rRegion (%+03.0f/%+03.0f S/N %+04.0f/%+04.0f W/E): %d empty cell%s.' %
                (tuple(box) + (n_empty_cells, plural_suffix)))
            progress.flush()
        dribble.write("\n")
def zonav(meta, boxed_data):
    r"""Zonal Averaging.

    The input *boxed_data* is an iterator of boxed time series.  The
    data in the boxes are combined to produce averages over various
    latitudinal zones.  Returns an iterator of (averages, weights)
    pairs, one per zone: first one per basic band, then one per
    compound zone.

    16 zones are produced.  The first 8 are the basic belts that are
    used for the equal area grid, the remaining 8 are combinations:

      0 64N - 90N               \
      1 44N - 64N (asin 0.9)     -  8 24N - 90 N  (0 + 1 + 2)
      2 24N - 44N (asin 0.7)    /
      3 Equ - 24N (asin 0.4)    \_  9 24S - 24 N  (3 + 4)
      4 24S - Equ               /
      5 44S - 24S               \
      6 64S - 44S                - 10 90S - 24 S  (5 + 6 + 7)
      7 90S - 64S               /

     11 northern mid-latitudes (1 + 2)
     12 southern mid-latitudes (5 + 6)
     13 northern hemisphere (0 + 1 + 2 + 3)
     14 southern hemisphere (4 + 5 + 6 + 7)
     15 global (all belts 0 to 7)

    *meta.yrbeg* is the first year of the series and *meta.monm* their
    length in months.
    """
    # The docstring above is a raw string because its table contains
    # backslashes that are not meant as escape sequences.

    iyrbeg = meta.yrbeg
    monm = meta.monm

    boxes_in_band, band_in_zone = zones()

    bands = len(boxes_in_band)

    lenz = [None] * bands
    wt = [None] * bands
    avg = [None] * bands
    # For each band, combine all the boxes in that band to create a band
    # record.
    for band in range(bands):
        # The temperature (anomaly) series for each of the boxes in this
        # band.
        box_series = [None] * boxes_in_band[band]
        # The weight series for each of the boxes in this band.
        box_weights = [None] * boxes_in_band[band]
        # "length" is the number of months (with valid data) in the box
        # series.  For each box in this band.
        box_length = [None] * boxes_in_band[band]
        for box in range(boxes_in_band[band]):
            # The last element in the tuple is the boundaries of the
            # box.  We ignore it.
            box_series[box], box_weights[box], box_length[box], _ = (
                next(boxed_data))
        # total number of valid data in band's boxes
        total_length = sum(box_length)
        if total_length == 0:
            wt[band] = [0.0] * monm
            avg[band] = [MISSING] * monm
        else:
            box_length, IORD = sort_perm(box_length)
            nr = IORD[0]
            # Copy the longest box record into *wt* and *avg*.
            # Using list both performs a copy and converts into a mutable
            # list.
            wt[band] = list(box_weights[nr])
            avg[band] = list(box_series[nr])
            # And combine the remaining series.
            for n in range(1, boxes_in_band[band]):
                nr = IORD[n]
                if box_length[n] == 0:
                    # Nothing in this box, and since we sorted by length,
                    # all the remaining boxes will also be empty.  We can
                    # stop combining boxes.
                    break
                series.combine(avg[band], wt[band],
                               box_series[nr], box_weights[nr],
                               parameters.box_min_overlap)
        series.anomalize(avg[band], parameters.box_reference_period, iyrbeg)
        lenz[band] = sum(valid(a) for a in avg[band])
        yield (avg[band], wt[band])

    # We expect to have consumed all the boxes (the first 8 bands form a
    # partition of the boxes).  We check that the boxed_data stream is
    # exhausted and contains no more boxes.
    try:
        next(boxed_data)
        assert 0, "Too many boxes found"
    except StopIteration:
        # We fully expect to get here.
        pass

    # *lenz* contains the lengths of each band 0 to 7 (the number of
    # valid months in each band), indexed by band number.  *iord* lists
    # the band numbers in length order.  The sorted lengths are not
    # needed; *lenz* is deliberately left indexed by band so that the
    # "no data" check below looks up the right band.
    # NOTE(review): assumes sort_perm does not mutate its argument --
    # confirm.
    _, iord = sort_perm(lenz)
    for zone in range(len(band_in_zone)):
        # Find the longest band that is in the compound zone.
        for j1 in range(bands):
            if iord[j1] in band_in_zone[zone]:
                break
        else:
            # Should be an assertion really.
            raise Exception('No band in compound zone %d.' % zone)
        band = iord[j1]
        if lenz[band] == 0:
            print('**** NO DATA FOR ZONE %d' % band)
        wtg = list(wt[band])
        avgg = list(avg[band])
        # Add in the remaining bands, in length order.
        for j in range(j1 + 1, bands):
            band = iord[j]
            if band not in band_in_zone[zone]:
                continue
            series.combine(avgg, wtg, avg[band], wt[band],
                           parameters.box_min_overlap)
        series.anomalize(avgg, parameters.box_reference_period, iyrbeg)
        yield (avgg, wtg)
def subbox_to_box(meta, cells, celltype='BOX'):
    """Aggregate subboxes (cells) into boxes, one time series per box.

    The boxes are those of `eqarea.grid()`; each cell in *cells* is
    assigned to a box with `whichbox`, and the cells of a box are then
    combined into a single series.

    *celltype* is used for logging: a distinct (3 character) code lets
    the log output of the land, ocean, and land--ocean analyses be told
    apart.

    *meta* supplies the first year (meta.yrbeg) and the length in
    months (meta.monm) of every resulting series.

    Returns an iterator that yields, for each box, a quadruple
    (*anom*, *weight*, *ngood*, *box*).  *anom* is the temperature
    anomaly series, *weight* is the weights for the series (number of
    cells contributing for each month), *ngood* is the total number of
    valid data in the series, and *box* is a 4-tuple describing the
    region's bounds: (southern, northern, western, eastern).
    """
    # The large boxes.
    boxes = list(eqarea.grid())

    # Partition the cells: every box starts with no contributing cells
    # and each cell is appended to the box it falls in.
    cells_by_box = {box: [] for box in boxes}
    for cell in cells:
        cells_by_box[whichbox(boxes, cell.box)].append(cell)

    def padded_series(s):
        """Return the data of *s* (a giss_data.Series instance) as a
        list of meta.monm months that starts in meta.yrbeg.
        """
        padded = [MISSING] * meta.monm
        start = 12 * (s.first_year - meta.yrbeg)
        padded[start:start + len(s)] = s.series
        return padded

    # Handle the boxes one by one: rank the contributing cells, combine
    # them, and yield the outcome.
    for box in boxes:
        ranked = sorted(cells_by_box[box], key=lambda x: x.good_count,
                        reverse=True)

        best = ranked[0]
        box_series = padded_series(best)
        box_weight = [float(valid(a)) for a in box_series]

        # The *contributed* log list begins with the best cell.
        months_used = [any(valid(v) for v in box_series[i::12])
                       for i in range(12)]
        s = ''.join('01'[x] for x in months_used)
        contributed = [[best.uid, 1.0, s]]

        # Fold in every other contributor.  Cells with too few valid
        # data are logged with zero weight and are not combined.
        for cell in ranked[1:]:
            if cell.good_count < parameters.subbox_min_valid:
                weight = 0.0
                s = '0' * 12
            else:
                addend_series = padded_series(cell)
                weight = 1.0
                station_months = series.combine(
                    box_series, box_weight, addend_series, weight,
                    parameters.box_min_overlap)
                s = ''.join('01'[bool(x)] for x in station_months)
            contributed.append([cell.uid, weight, s])

        box_first_year = meta.yrbeg
        series.anomalize(box_series, parameters.subbox_reference_period,
                         box_first_year)
        uid = giss_data.boxuid(box, celltype=celltype)
        log.write("%s cells %s\n" % (uid, asjson(contributed)))
        ngood = sum(valid(a) for a in box_series)
        yield (box_series, box_weight, ngood, box)
def find_quintuples(sums, wgts, record, new_id, log):
    """Decide whether *record* agrees with the series built so far.

    The *sums* and *wgts* arrays are assumed to begin in the same year
    as *record*.  A middle year is chosen between the later of the two
    start years and the earlier of the two end years.  Windows of
    growing radius (1 up to
    *parameters.station_combine_bucket_radius*) around that year are
    then examined until one holds at least
    *parameters.station_combine_min_mid_years* valid annual anomalies
    in both series.  For that window the mean annual values of the two
    series are compared.

    Returns True when such a window is found and the absolute
    difference of the two means is below the standard deviation
    (`sigma`) of the combined series' annual anomalies; False
    otherwise.  Diagnostics are written to *log*.
    """
    # An identifier common to all the log output.
    logid = "%s %s" % (new_id, record.uid)

    rec_begin = record.first_valid_year()
    rec_end = record.last_valid_year()
    actual_begin, actual_end = get_actual_endpoints(wgts, record.first_year)

    max_begin = max(actual_begin, rec_begin)
    min_end = min(actual_end, rec_end)
    # Both bounds are integers, so a fractional midpoint is rounded up.
    middle_year = int(.5 * (max_begin + min_end) + 0.5)
    offset = middle_year - record.first_year
    log.write("max begin: %s\tmin end: %s\n" % (max_begin, min_end))

    new_data = average(sums, wgts)
    new_ann_mean, new_ann_anoms = series.monthly_annual(new_data)
    ann_std_dev = sigma(new_ann_anoms)
    log.write("ann_std_dev = %s\n" % ann_std_dev)
    rec_ann_anoms = record.ann_anoms
    rec_ann_mean = record.ann_mean

    # The overlap is "okay" when the difference in annual temperature is
    # below a certain threshold.
    okay_flag = False
    for rad in range(1, parameters.station_combine_bucket_radius + 1):
        # Window from -rad to +rad (inclusive) around the middle year,
        # not reaching before the start of the series.
        base = max(0, offset - rad)
        limit = offset + rad + 1
        new_middle = [x for x in new_ann_anoms[base:limit] if valid(x)]
        rec_middle = [x for x in rec_ann_anoms[base:limit] if valid(x)]
        needed = parameters.station_combine_min_mid_years
        if len(new_middle) < needed or len(rec_middle) < needed:
            # Not enough data in this window yet; try a wider one.
            continue

        # We have an "overlap": both series hold enough valid annual
        # anomalies within *rad* years either side of *middle_year*.
        log.write("overlap success: %s\n" % logid)
        avg1 = (sum(anom + new_ann_mean for anom in new_middle)
                / float(len(new_middle)))
        avg2 = (sum(anom + rec_ann_mean for anom in rec_middle)
                / float(len(rec_middle)))
        diff = abs(avg1 - avg2)
        log.write("diff = %s\n" % diff)
        if diff < ann_std_dev:
            okay_flag = True
            log.write("combination success: %s\n" % logid)
        else:
            log.write("combination failure: %s\n" % logid)
        break
    else:
        # No window produced an overlap.
        log.write("overlap failure: %s\n" % logid)

    log.write("counts: %d %d\n" % (len(new_middle), len(rec_middle)))
    return okay_flag