def combine(average, weight, new, new_weight,
            first_year, last_year, min_overlap):
    """Fold the *new* series into *average* (GISTEMP combining step).

    *average* and *new* are flat monthly sequences, indexed here as
    ``year * 12 + month``; *weight* holds the weight accumulated so far
    for each datum of *average*.  *new_weight* is passed through
    ``container()`` and then indexed per datum (per the original
    documentation it may be a constant or an array of weights).  Only
    years in *range(first_year, last_year)* are examined.

    Each of the 12 months of the year is handled independently:

    * Over the years where both series hold a valid datum, the bias is
      computed as mean(*average*) - mean(*new*).
    * If fewer than *min_overlap* such years exist, nothing is changed
      for that month of the year.
    * Otherwise each valid datum of *new* in the year range has the
      bias added and is merged into *average* as a weighted mean, and
      the matching entry of *weight* grows by the new datum's weight.

    *average* and *weight* are modified in place.  The number of months
    of the year (0 to 12) for which data were combined is returned.
    """
    new_weight = container(new_weight)
    stop = last_year * 12
    combined_months = 0
    for month in range(12):
        start = first_year * 12 + month
        avg_total = 0.0   # Sum of overlapping data in average
        new_total = 0.0   # Sum of overlapping data in new
        overlap = 0       # Years where both average and new are valid
        for avg_datum, new_datum in zip(average[start:stop:12],
                                        new[start:stop:12]):
            if invalid(avg_datum) or invalid(new_datum):
                continue
            overlap += 1
            avg_total += avg_datum
            new_total += new_datum
        if overlap < min_overlap:
            # Not enough common years: leave this month untouched.
            continue
        bias = (avg_total - new_total) / overlap
        # Merge every valid new datum and update the running weights.
        for idx in range(start, stop, 12):
            datum = new[idx]
            if invalid(datum):
                continue
            old_weight = weight[idx]
            added_weight = new_weight[idx]
            total_weight = old_weight + added_weight
            average[idx] = (old_weight * average[idx]
                            + added_weight * (datum + bias)) / total_weight
            weight[idx] = total_weight
        combined_months += 1
    return combined_months
def combine(average, weight, new, new_weight,
            first_year, last_year, min_overlap):
    """Fold the *new* series into *average* (GISTEMP combining step).

    *average* and *new* are flat monthly sequences, indexed here as
    ``year * 12 + month``; *weight* holds the weight accumulated so far
    for each datum of *average*.  *new_weight* is passed through
    ``container()`` and then indexed per datum (per the original
    documentation it may be a constant or an array of weights).  Only
    years in *range(first_year, last_year)* are examined.

    Each of the 12 months of the year is handled independently:

    * Over the years where both series hold a valid datum, the bias is
      computed as mean(*average*) - mean(*new*).
    * If fewer than *min_overlap* such years exist, nothing is changed
      for that month of the year.
    * Otherwise each valid datum of *new* in the year range has the
      bias added and is merged into *average* as a weighted mean, and
      the matching entry of *weight* grows by the new datum's weight.

    *average* and *weight* are modified in place.  The number of months
    of the year (0 to 12) for which data were combined is returned.
    """
    new_weight = container(new_weight)
    stop = last_year * 12
    combined_months = 0
    for month in range(12):
        start = first_year * 12 + month
        avg_total = 0.0   # Sum of overlapping data in average
        new_total = 0.0   # Sum of overlapping data in new
        overlap = 0       # Years where both average and new are valid
        for avg_datum, new_datum in zip(average[start:stop:12],
                                        new[start:stop:12]):
            if invalid(avg_datum) or invalid(new_datum):
                continue
            overlap += 1
            avg_total += avg_datum
            new_total += new_datum
        if overlap < min_overlap:
            # Not enough common years: leave this month untouched.
            continue
        bias = (avg_total - new_total) / overlap
        # Merge every valid new datum and update the running weights.
        for idx in range(start, stop, 12):
            datum = new[idx]
            if invalid(datum):
                continue
            old_weight = weight[idx]
            added_weight = new_weight[idx]
            total_weight = old_weight + added_weight
            average[idx] = (old_weight * average[idx]
                            + added_weight * (datum + bias)) / total_weight
            weight[idx] = total_weight
        combined_months += 1
    return combined_months
def combine(composite, weight, new, new_weight, min_overlap):
    """Fold the *new* series into *composite* (GISTEMP combining step).

    *composite* and *new* are flat monthly sequences (one datum per
    month, so month *m* of every year is ``seq[m::12]``); *weight*
    holds the weight accumulated so far for each datum of *composite*.
    *new_weight* is passed through ``ensure_array(weight, new_weight)``
    and then indexed per datum (per the original documentation it may
    be a constant or an array of weights).

    Each of the 12 months of the year is handled independently:

    * Over the years where both series hold a valid datum, the bias is
      computed as mean(*composite*) - mean(*new*).
    * If fewer than *min_overlap* such years exist, nothing is changed
      for that month of the year.
    * Otherwise each valid datum of *new* has the bias added and is
      merged into *composite* as a weighted mean, and the matching
      entry of *weight* grows by the new datum's weight.

    *composite* and *weight* are modified in place.  A list of 12
    counts is returned: for each month of the year, how many data from
    *new* were combined.
    """
    new_weight = ensure_array(weight, new_weight)
    # One tally of combined data per month of the year.
    tallies = [0] * 12
    for month in range(12):
        composite_total = 0.0   # Sum of overlapping data in composite
        new_total = 0.0         # Sum of overlapping data in new
        overlap = 0             # Years where both series are valid
        for old_datum, new_datum in zip(composite[month::12],
                                        new[month::12]):
            if invalid(old_datum) or invalid(new_datum):
                continue
            overlap += 1
            composite_total += old_datum
            new_total += new_datum
        if overlap < min_overlap:
            # Not enough common years: leave this month untouched.
            continue
        bias = (composite_total - new_total) / overlap
        # Merge every valid new datum and update the running weights.
        for idx in range(month, len(new), 12):
            datum = new[idx]
            if invalid(datum):
                continue
            old_weight = weight[idx]
            added_weight = new_weight[idx]
            total_weight = old_weight + added_weight
            composite[idx] = (old_weight * composite[idx]
                              + added_weight * (datum + bias)) / total_weight
            weight[idx] = total_weight
            tallies[month] += 1
    return tallies
def trend2(points, xmid, min):
    """Fit *points* with a two-segment line whose slope changes at
    *xmid*, using least-squares regression.

    *points* is an iterable of (x, value) pairs; pairs whose value is
    invalid are ignored.  Points with ``x - xmid > 0`` belong to the
    right-hand segment, all others to the left-hand segment.  If either
    segment has fewer than *min* valid points, a 4-tuple of MISSING is
    returned.

    Otherwise returns the 4-tuple (*sl1*, *sl2*, *rms*, *sl*): the
    left-hand slope, the right-hand slope, the fit error, and the slope
    of a single straight-line fit through all valid points.  Note that
    the *rms* value is computed here as the sum of squared residuals of
    the two-segment fit; no division by the count and no square root is
    applied in this function.
    """
    # Todo: incorporate into getfit.
    n_left = n_right = 0
    x_left = x_right = 0        # sums of (x - xmid)
    xx_left = xx_right = 0      # sums of (x - xmid) squared
    xv_left = xv_right = 0      # sums of (x - xmid) * value
    v_total = 0.0               # sum of values
    vv_total = 0.0              # sum of squared values
    for (x, v) in points:
        if invalid(v):
            continue
        x -= xmid
        v_total += v
        vv_total += v**2
        if x > 0.0:
            n_right += 1
            x_right += x
            xx_right += x**2
            xv_right += x * v
        else:
            n_left += 1
            x_left += x
            xx_left += x**2
            xv_left += x * v

    if n_left < min or n_right < min:
        return (MISSING,) * 4

    n_all = n_left + n_right
    denom = (n_all * xx_left * xx_right
             - xx_left * x_right**2
             - xx_right * x_left**2)
    sl1 = (x_left * (x_right * xv_right - xx_right * v_total)
           + xv_left * (n_all * xx_right - x_right**2)) / denom
    sl2 = (x_right * (x_left * xv_left - xx_left * v_total)
           + xv_right * (n_all * xx_left - x_left**2)) / denom

    # Value total with both fitted slopes removed; it gives the fitted
    # value at xmid and reappears in the error expression.
    detrended = v_total - sl1 * x_left - sl2 * x_right
    ymid = detrended / n_all
    rms = (n_all * ymid**2 + vv_total
           - 2 * ymid * detrended
           + sl1 * sl1 * xx_left + sl2 * sl2 * xx_right
           - 2 * sl1 * xv_left - 2 * sl2 * xv_right)

    # linear regression
    x_all = x_left + x_right
    xx_all = xx_left + xx_right
    xv_all = xv_left + xv_right
    sl = (n_all * xv_all - v_total * x_all) / (n_all * xx_all - x_all**2)

    return sl1, sl2, rms, sl
def merge_ocean(ocean, sst, dates):
    """Generator that merges new sea-surface temperature data into the
    boxed stream *ocean*.

    *sst* holds the new monthly readings and *dates* is the matching
    sequence of (year, month) pairs, in order.  The first item taken
    from *ocean* is the metadata object: its ``monm``, ``monm4`` and
    ``title`` are updated to cover the new data and it is yielded
    first.  Each remaining box is padded out to ``meta.monm`` months,
    has its value for every date in *dates* replaced, is trimmed, and
    is then yielded.

    For each date the new value is the mean of (reading - climatology)
    over the grid cells covered by the box, ignoring cells whose
    reading is below ``parameters.sea_surface_cutoff_temp`` or whose
    climatology is invalid; when no cell qualifies the value is
    MISSING.
    """
    clim = giss_io.step4_load_clim()

    first_new_year = dates[0][0]
    last_new_year, last_new_month = dates[-1][0], dates[-1][1]

    boxes = iter(ocean)
    meta = next(boxes)
    meta.monm = 12 * (last_new_year - IYRBEG + 1)
    meta.monm4 = meta.monm + 8
    suffix = (" Had: 1880-11/1981, oi2: 12/1981-%2d/%04d"
              % (last_new_month, last_new_year))
    meta.title = meta.title[:40] + suffix
    yield meta

    # Average into Sergej's subbox grid
    for box in boxes:
        box.pad_with_missing(meta.monm)
        # Identify all the degree boxes which are included in this
        # subbox.  NOTE(review): presumably sst/clim are indexed
        # [longitude][latitude] on a one-degree grid -- confirm.
        lat_lo = int(box.lat_S + 90.01)
        lat_hi = int(box.lat_N + 89.99)
        lon_lo = int(box.lon_W + 360.01)
        lon_hi = int(box.lon_E + 359.99)
        if lon_hi >= 360:
            lon_lo -= 360
            lon_hi -= 360

        for year, month_no in dates:
            sst_index = (year - first_new_year) * 12 + month_no - 1
            clim_month = (month_no - 1) % 12
            n_cells = 0
            anomaly_total = 0.0
            for j in range(lat_lo, lat_hi + 1):
                for i in range(lon_lo, lon_hi + 1):
                    reading = sst[i][j][sst_index]
                    if reading < parameters.sea_surface_cutoff_temp:
                        continue
                    baseline = clim[i][j][clim_month]
                    if invalid(baseline):
                        continue
                    n_cells += 1
                    anomaly_total += reading - baseline

            index = (year - IYRBEG) * 12 + month_no - 1
            # Always clear the slot first, then fill it if any cell
            # contributed.
            box.set_value(index, MISSING)
            if n_cells > 0:
                box.set_value(index, anomaly_total / n_cells)
        box.trim()
        yield box
def merge_ocean(ocean, sst, dates):
    """Generator that merges new sea-surface temperature data into the
    boxed stream *ocean*.

    *sst* holds the new monthly readings and *dates* is the matching
    sequence of (year, month) pairs, in order.  The first item taken
    from *ocean* is the metadata object: its ``monm``, ``monm4`` and
    ``title`` are updated to cover the new data and it is yielded
    first.  Each remaining box is padded out to ``meta.monm`` months,
    has its value for every date in *dates* replaced, is trimmed, and
    is then yielded.

    For each date the new value is the mean of (reading - climatology)
    over the grid cells covered by the box, ignoring cells whose
    reading is below ``parameters.sea_surface_cutoff_temp`` or whose
    climatology is invalid; when no cell qualifies the value is
    MISSING.
    """
    clim = giss_io.step4_load_clim()

    first_new_year = dates[0][0]
    last_new_year, last_new_month = dates[-1][0], dates[-1][1]

    boxes = iter(ocean)
    meta = next(boxes)
    meta.monm = 12 * (last_new_year - IYRBEG + 1)
    meta.monm4 = meta.monm + 8
    suffix = (" Had: 1880-11/1981, oi2: 12/1981-%2d/%04d"
              % (last_new_month, last_new_year))
    meta.title = meta.title[:40] + suffix
    yield meta

    # Average into Sergej's subbox grid
    for box in boxes:
        box.pad_with_missing(meta.monm)
        # Identify all the degree boxes which are included in this
        # subbox.  NOTE(review): presumably sst/clim are indexed
        # [longitude][latitude] on a one-degree grid -- confirm.
        lat_lo = int(box.lat_S + 90.01)
        lat_hi = int(box.lat_N + 89.99)
        lon_lo = int(box.lon_W + 360.01)
        lon_hi = int(box.lon_E + 359.99)
        if lon_hi >= 360:
            lon_lo -= 360
            lon_hi -= 360

        for year, month_no in dates:
            sst_index = (year - first_new_year) * 12 + month_no - 1
            clim_month = (month_no - 1) % 12
            n_cells = 0
            anomaly_total = 0.0
            for j in range(lat_lo, lat_hi + 1):
                for i in range(lon_lo, lon_hi + 1):
                    reading = sst[i][j][sst_index]
                    if reading < parameters.sea_surface_cutoff_temp:
                        continue
                    baseline = clim[i][j][clim_month]
                    if invalid(baseline):
                        continue
                    n_cells += 1
                    anomaly_total += reading - baseline

            index = (year - IYRBEG) * 12 + month_no - 1
            # Always clear the slot first, then fill it if any cell
            # contributed.
            box.set_value(index, MISSING)
            if n_cells > 0:
                box.set_value(index, anomaly_total / n_cells)
        box.trim()
        yield box
def combine(sums, wgts, begin, years, records, log, new_id=None):
    """Repeatedly merge the best-overlapping record into *sums*/*wgts*.

    While *records* is non-empty, the current average of *sums* and
    *wgts* is compared against the remaining records with
    ``get_longest_overlap``.  The chosen record is removed from
    *records*, added to *sums*/*wgts* shifted by the returned
    difference, and a line describing it is written to *log*.

    The loop ends when *records* is exhausted or when no remaining
    record is acceptable (the returned difference is invalid); in the
    latter case a note is written to *log* and the leftover records
    stay in *records*.  *new_id* is accepted but not used here.
    Nothing is returned; *sums*, *wgts* and *records* are modified in
    place.
    """
    while records:
        current = average(sums, wgts, years)
        record, rec_id, diff = get_longest_overlap(current, begin, records)
        if invalid(diff):
            log.write("\tno other records okay\n")
            return
        del records[rec_id]
        add(sums, wgts, diff, begin, record)
        summary = "\t %s %d %d %f\n" % (
            rec_id, record.first_year, record.last_year - 1, diff)
        log.write(summary)
def cmbine(combined, weights, counts, data, first, last, weight):
    """Merge *data*, with weight *weight*, into the weighted averages
    in *combined*.

    *weights[i]* is the total weight already behind *combined[i]* and
    *counts[i]* is how many values have been combined into it.  Only
    indexes ``first - 1`` up to (but excluding) *last* are considered.

    The bias is mean(*combined*) - mean(*data*) over the entries in
    that range that are valid in both arrays.  If fewer than
    *parameters.rural_station_min_overlap* such entries exist, nothing
    is changed.  Otherwise every valid entry of *data* in the range has
    the bias added and is averaged into *combined*; the matching
    entries of *weights* and *counts* are increased by *weight* and 1.

    Note (from the original documentation): when *data[i]* is valid and
    *combined[i]* is not, the weighted average still gives the right
    answer because *weights[i]* will be zero.

    The arrays are modified in place and nothing is returned.
    """
    lo = first - 1
    overlap = 0
    new_total = 0
    combined_total = 0.0
    for old_value, new_value in zip(combined[lo:last], data[lo:last]):
        if invalid(old_value) or invalid(new_value):
            continue
        overlap += 1
        combined_total += old_value
        new_total += new_value

    if overlap < parameters.rural_station_min_overlap:
        return
    bias = (combined_total - new_total) / float(overlap)

    # Update period of valid data, averages and weights.
    for idx in range(lo, last):
        new_value = data[idx]
        if invalid(new_value):
            continue
        previous_weight = weights[idx]
        total_weight = previous_weight + weight
        weights[idx] = total_weight
        combined[idx] = (previous_weight * combined[idx]
                         + weight * (new_value + bias)) / total_weight
        counts[idx] += 1
def get_longest_overlap(new_data, begin, records):
    """Pick the record in the *records* dict whose annual anomalies
    overlap *new_data* for the greatest number of years.

    *new_data* is converted with ``monthly_annual`` into an annual mean
    and a series of annual anomalies; *begin* is the year of the first
    entry of that series.  A year counts towards a record's overlap
    when both that record's annual anomaly and the *new_data* annual
    anomaly are valid.  Records with fewer than
    *parameters.station_combine_min_overlap* such years are ignored.

    Returns (*record*, *rec_id*, *diff*) for the winner, where *diff*
    is the mean over the overlapping years of the record's annual
    value minus the *new_data* annual value.  If no record qualifies,
    (0, 0, MISSING) is returned.
    """
    ann_mean, ann_anoms = monthly_annual(new_data)
    best_overlap = 0
    # :todo: the records are consulted in an essentially arbitrary
    # order (chosen by the implementation of items()), but the order
    # may affect the result.
    # Tie breaks go to the last record consulted.
    for rec_id, record in records.items():
        candidate_anoms = record.ann_anoms
        candidate_mean = record.ann_mean
        span = record.last_year - record.first_year + 1
        first = record.first_year
        diff_total = shared_years = 0
        for n in range(span):
            candidate_anom = candidate_anoms[n]
            if invalid(candidate_anom):
                continue
            anom = ann_anoms[n + first - begin]
            if invalid(anom):
                continue
            shared_years += 1
            diff_total += (candidate_mean + candidate_anom) - (ann_mean + anom)
        if (shared_years < parameters.station_combine_min_overlap
                or shared_years < best_overlap):
            continue
        best_overlap = shared_years
        diff = diff_total / shared_years
        best_id = rec_id
        best_record = record
    if best_overlap < parameters.station_combine_min_overlap:
        return 0, 0, MISSING
    return best_record, best_id, diff
def pieces_get_longest_overlap(new_data, begin, records):
    """Pick the record in the *records* dict whose annual anomalies
    overlap *new_data* for the greatest number of years.

    *new_data* is converted with ``monthly_annual`` into annual
    anomalies; *begin* is the year of the first entry of that series.
    A year counts towards a record's overlap when both that record's
    annual anomaly and the *new_data* annual anomaly are valid.  No
    minimum overlap is imposed, and ties go to the last record
    consulted.

    Returns the pair (*record*, *rec_id*) of the winner.  *records*
    must not be empty: with no records the result names are never
    bound and the return statement fails.
    """
    ann_mean, ann_anoms = monthly_annual(new_data)
    best_overlap = 0
    for rec_id, record in records.items():
        candidate_anoms = record.ann_anoms
        span = record.last_year - record.first_year + 1
        first = record.first_year
        shared_years = 0
        for n in range(span):
            if invalid(candidate_anoms[n]):
                continue
            if invalid(ann_anoms[n + first - begin]):
                continue
            shared_years += 1
        if shared_years < best_overlap:
            continue
        best_overlap = shared_years
        best_id = rec_id
        best_record = record
    return best_record, best_id
def monthly_anomalies(data, reference_period=None, base_year=-9999):
    """Convert a flat monthly series into per-month means and
    anomalies.

    *data* is a flat sequence with one datum per month.  It is split
    into 12 rows (one per month of the year); for each row a mean is
    taken and subtracted from every valid datum in the row.  Invalid
    data stay MISSING.

    If *reference_period* is given it must be a pair (*first*, *last*)
    of years (for example ``reference_period=(1951, 1980)``) and the
    mean is taken over that inclusive period, with *base_year* being
    the year of the first datum in *data*.  When no reference period
    is given, or the mean over it is invalid, the mean of the whole
    row is used instead.

    Returns the pair (*monthly_mean*, *monthly_anom*): *monthly_mean*
    is a list of the 12 means, and *monthly_anom* is a list of 12
    lists, each the anomaly series for one month of the year.  A month
    with no valid mean gets a row of ``len(data) // 12`` MISSING
    values.
    """
    years = len(data) // 12
    # With no reference period, (0, 0) selects an empty slice, so the
    # whole-row fallback below is always taken.  A bit of a hack, but
    # it does work.
    base, limit = 0, 0
    if reference_period:
        base = reference_period[0] - base_year
        limit = reference_period[1] - base_year + 1

    monthly_mean = []
    monthly_anom = []
    for m in range(12):
        row = data[m::12]
        mean = valid_mean(row[base:limit])
        if invalid(mean):
            # Fall back to using entire period
            mean = valid_mean(row)
        monthly_mean.append(mean)
        if valid(mean):
            anomalies = [datum - mean if valid(datum) else MISSING
                         for datum in row]
        else:
            anomalies = [MISSING] * years
        monthly_anom.append(anomalies)
    return monthly_mean, monthly_anom
def add(sums, wgts, diff, begin, record):
    """Accumulate the data of *record* into *sums* and *wgts*.

    Every valid datum of ``record.series`` has *diff* subtracted and
    is added to the matching entry of *sums*; the matching entry of
    *wgts* is incremented by 1.  *begin* is the year corresponding to
    index 0 of *sums* and *wgts*, so the record's data are offset by
    ``(record.first_year - begin) * 12`` months.

    The series must hold exactly 12 data for each year from
    ``record.first_year`` to ``record.last_year`` inclusive (checked
    with an assertion).  *sums* and *wgts* are modified in place.
    """
    rec_begin = record.first_year
    rec_years = record.last_year - record.first_year + 1
    rec_data = record.series
    assert len(rec_data) == 12*rec_years
    offset = (rec_begin - begin) * 12
    for i, datum in enumerate(rec_data):
        if invalid(datum):
            continue
        target = i + offset
        sums[target] += datum - diff
        wgts[target] += 1
def monthly_anomalies(data, reference_period=None, base_year=-9999):
    """Convert a flat monthly series into per-month means and
    anomalies.

    *data* is a flat sequence with one datum per month.  It is split
    into 12 rows (one per month of the year); for each row a mean is
    taken and subtracted from every valid datum in the row.  Invalid
    data stay MISSING.

    If *reference_period* is given it must be a pair (*first*, *last*)
    of years (for example ``reference_period=(1951, 1980)``) and the
    mean is taken over that inclusive period, with *base_year* being
    the year of the first datum in *data*.  When no reference period
    is given, or the mean over it is invalid, the mean of the whole
    row is used instead.

    Returns the pair (*monthly_mean*, *monthly_anom*): *monthly_mean*
    is a list of the 12 means, and *monthly_anom* is a list of 12
    lists, each the anomaly series for one month of the year.  A month
    with no valid mean gets a row of ``len(data) // 12`` MISSING
    values.
    """
    years = len(data) // 12
    # With no reference period, (0, 0) selects an empty slice, so the
    # whole-row fallback below is always taken.  A bit of a hack, but
    # it does work.
    base, limit = 0, 0
    if reference_period:
        base = reference_period[0] - base_year
        limit = reference_period[1] - base_year + 1

    monthly_mean = []
    monthly_anom = []
    for m in range(12):
        row = data[m::12]
        mean = valid_mean(row[base:limit])
        if invalid(mean):
            # Fall back to using entire period
            mean = valid_mean(row)
        monthly_mean.append(mean)
        if valid(mean):
            anomalies = [datum - mean if valid(datum) else MISSING
                         for datum in row]
        else:
            anomalies = [MISSING] * years
        monthly_anom.append(anomalies)
    return monthly_mean, monthly_anom
def adjust_helena(stream):
    """Generator that applies the Helena adjustments to a stream of
    records.

    The adjustments come from ``read_config.get_helena_dict()`` (per
    the original documentation, config/combine_pieces_helena.in),
    keyed by record uid; each entry is a (year, month, summand)
    triple.  For a record with an entry, the summand is added to
    every valid datum at series index 0 up to and including
    ``(year - record.first_year) * 12 + month``, the series is stored
    back with ``record.set_series`` and the entry is removed so it is
    applied at most once.

    Every record from *stream* is yielded, adjusted or not.
    """
    adjustments = read_config.get_helena_dict()
    for record in stream:
        uid = record.uid
        if uid in adjustments:
            series = record.series
            this_year, month, summand = adjustments[uid]
            # Index of month specified by the adjustment entry
            last_index = (this_year - record.first_year) * 12 + month
            # All valid data up to and including last_index get adjusted
            for i in range(last_index + 1):
                if not invalid(series[i]):
                    series[i] += summand
            record.set_series(record.first_month, series)
            del adjustments[uid]
        yield record
def trend2(points, xmid, min):
    """Fit *points* with a two-segment line whose slope changes at
    *xmid*, using least-squares regression.

    *points* is an iterable of (x, value) pairs; pairs whose value is
    invalid are ignored.  Points with ``x - xmid > 0`` belong to the
    right-hand segment, all others to the left-hand segment.  If either
    segment has fewer than *min* valid points, a 4-tuple of MISSING is
    returned.

    Otherwise returns the 4-tuple (*sl1*, *sl2*, *rms*, *sl*): the
    left-hand slope, the right-hand slope, the fit error, and the slope
    of a single straight-line fit through all valid points.  Note that
    the *rms* value is computed here as the sum of squared residuals of
    the two-segment fit; no division by the count and no square root is
    applied in this function.
    """
    # Todo: incorporate into getfit.
    n_left = n_right = 0
    x_left = x_right = 0        # sums of (x - xmid)
    xx_left = xx_right = 0      # sums of (x - xmid) squared
    xv_left = xv_right = 0      # sums of (x - xmid) * value
    v_total = 0.0               # sum of values
    vv_total = 0.0              # sum of squared values
    for (x, v) in points:
        if invalid(v):
            continue
        x -= xmid
        v_total += v
        vv_total += v ** 2
        if x > 0.0:
            n_right += 1
            x_right += x
            xx_right += x ** 2
            xv_right += x * v
        else:
            n_left += 1
            x_left += x
            xx_left += x ** 2
            xv_left += x * v

    if n_left < min or n_right < min:
        return (MISSING,) * 4

    n_all = n_left + n_right
    denom = (n_all * xx_left * xx_right
             - xx_left * x_right ** 2
             - xx_right * x_left ** 2)
    sl1 = (x_left * (x_right * xv_right - xx_right * v_total)
           + xv_left * (n_all * xx_right - x_right ** 2)) / denom
    sl2 = (x_right * (x_left * xv_left - xx_left * v_total)
           + xv_right * (n_all * xx_left - x_left ** 2)) / denom

    # Value total with both fitted slopes removed; it gives the fitted
    # value at xmid and reappears in the error expression.
    detrended = v_total - sl1 * x_left - sl2 * x_right
    ymid = detrended / n_all
    rms = (n_all * ymid ** 2 + vv_total
           - 2 * ymid * detrended
           + sl1 * sl1 * xx_left + sl2 * sl2 * xx_right
           - 2 * sl1 * xv_left - 2 * sl2 * xv_right)

    # linear regression
    x_all = x_left + x_right
    xx_all = xx_left + xx_right
    xv_all = xv_left + xv_right
    sl = (n_all * xv_all - v_total * x_all) / (n_all * xx_all - x_all ** 2)

    return sl1, sl2, rms, sl