def SBBXtoBX(data):
    """Simultaneously combine the land series and the ocean series and
    combine subboxes into boxes.

    *data* should be an iterator.  Its first item is the metadata,
    normally a (land_meta, ocean_meta) pair; the remaining items are
    (land, ocean) subbox series pairs.  If the first item is not a
    pair (step3 piped straight into step5) the single metadata object
    is used for both land and ocean, and the rest of the stream is
    passed through blank_ocean_data().

    Returns an iterator of box data.  The first item yielded is a
    header, ``(info, title)``.  Each following item is a quadruple
    ``(avgr, wtr, ngood, box)``: the combined and anomalized box
    series, its weights, the count of valid values in the series, and
    the box as produced by eqarea.grid().

    Raises AssertionError if land_meta.mavg is not 6, or if input
    records remain after every box has been produced.
    """

    # First item from iterator is normally a pair of metadata objects,
    # one for land, one for ocean.  If we are piping step3 straight
    # into step5 then it is not a pair.  In that case we synthesize
    # missing ocean data.
    meta = next(data)
    try:
        land_meta, ocean_meta = meta
    except (TypeError, ValueError):
        # Use the land meta object for both land and ocean data.
        land_meta, ocean_meta = meta, meta
        print("No ocean data; using land data only")
        data = blank_ocean_data(data)

    # Number of subboxes within each box.
    nsubbox = 100

    # TODO: Formalise use of only monthlies, see step 3.
    assert land_meta.mavg == 6

    combined_year_beg = min(land_meta.yrbeg, ocean_meta.yrbeg)
    # Index into the combined array of the first year of the land data.
    land_offset = 12 * (land_meta.yrbeg - combined_year_beg)
    # As land_offset but for ocean data.
    ocean_offset = 12 * (ocean_meta.yrbeg - combined_year_beg)
    # The combined array must be long enough for whichever record ends
    # last, so each offset is paired with the length of its *own*
    # record (the ocean term previously used the land length).
    combined_n_months = max(land_meta.monm + land_offset,
                            ocean_meta.monm + ocean_offset)

    info = [
        land_meta.mo1,
        land_meta.kq,
        land_meta.mavg,
        land_meta.monm,
        land_meta.monm4,
        combined_year_beg,
        land_meta.missing_flag,
        land_meta.precipitation_flag,
    ]
    # Overrides the monm4 value copied from the land metadata.
    info[4] = 2 * land_meta.monm + 5
    yield (info, land_meta.title)

    for box in eqarea.grid():
        # Averages for the land and ocean (one series per subbox)...
        avg = []
        wgtc = []
        # Eat the records from land and ocean 100 (nsubbox) at a time.
        # In other words, all 100 subboxes for the box.
        landsub, oceansub = zip(*itertools.islice(data, nsubbox))
        for land, ocean in zip(landsub, oceansub):
            a = [MISSING] * combined_n_months
            if (ocean.good_count < parameters.subbox_min_valid
                    or land.d < parameters.subbox_land_range):
                # Use land series for this subbox.
                a[land_offset:land_offset + len(land.series)] = land.series
                wgtc.append(land.good_count)
            else:
                # Use ocean series for this subbox.
                a[ocean_offset:ocean_offset + len(ocean.series)] = (
                    ocean.series)
                wgtc.append(ocean.good_count)
            avg.append(a)

        # GISTEMP sort.
        # We want to end up with IORDR, the permutation array that
        # represents the sorted order.  IORDR[0] is the index (into the
        # *wgtc* array) of the longest record, IORDR[1] the index of
        # the next longest record, and so on.  We sort the list of
        # indexes 0 to 99 with a comparison that looks up *wgtc*.
        # :todo: should probably import from a purpose built module.
        from step3 import sort
        # A real list is needed because sort() works on its argument.
        IORDR = list(range(nsubbox))
        sort(IORDR, lambda x, y: wgtc[y] - wgtc[x])

        # From here to the "for" loop over the cells (below) we are
        # initialising data for the loop.  Primarily the AVGR and WTR
        # arrays.
        nc = IORDR[0]

        # Weights for the box's record.
        wtr = [a != MISSING for a in avg[nc]]
        # Box record (a copy, so the subbox series is left untouched).
        avgr = avg[nc][:]

        # Loop over the remaining cells.
        for nc in IORDR[1:]:
            if wgtc[nc] >= parameters.subbox_min_valid:
                series.combine(avgr, wtr, avg[nc], 1, 0,
                               combined_n_months // 12,
                               parameters.box_min_overlap)

        series.anomalize(avgr, parameters.subbox_reference_period,
                         combined_year_beg)
        ngood = sum(valid(a) for a in avgr)
        yield (avgr, wtr, ngood, box)

    # We've now consumed all 8000 input boxes and yielded 80 boxes.  We
    # need to tickle the input to check that it is exhausted and to
    # cause it to run the final tail of its generator.
    try:
        next(data)
    except StopIteration:
        # The expected case: the input is exhausted.  Finish cleanly
        # rather than letting StopIteration escape from this generator.
        return
    # Ordinarily we never reach here.
    raise AssertionError("Too many input records")
def SBBXtoBX(data):
    """Simultaneously combine the land series and the ocean series and
    combine subboxes into boxes.

    *data* should be an iterator.  Its first item is the metadata,
    normally a (land_meta, ocean_meta) pair; the remaining items are
    (land, ocean) subbox series pairs.  If the first item is not a
    pair (step3 piped straight into step5) the single metadata object
    is used for both land and ocean, and the rest of the stream is
    passed through blank_ocean_data().

    Returns an iterator of box data.  The first item yielded is a
    header, ``(info, title)``.  Each following item is a quadruple
    ``(avgr, wtr, ngood, box)``: the combined and anomalized box
    series, its weights, the count of valid values in the series, and
    the box as produced by eqarea.grid().

    Raises AssertionError if land_meta.mavg is not 6, or if input
    records remain after every box has been produced.
    """

    # First item from iterator is normally a pair of metadata objects,
    # one for land, one for ocean.  If we are piping step3 straight
    # into step5 then it is not a pair.  In that case we synthesize
    # missing ocean data.
    meta = next(data)
    try:
        land_meta, ocean_meta = meta
    except (TypeError, ValueError):
        # Use the land meta object for both land and ocean data.
        land_meta, ocean_meta = meta, meta
        print("No ocean data; using land data only")
        data = blank_ocean_data(data)

    # Number of subboxes within each box.
    nsubbox = 100

    # TODO: Formalise use of only monthlies, see step 3.
    assert land_meta.mavg == 6

    combined_year_beg = min(land_meta.yrbeg, ocean_meta.yrbeg)
    # Index into the combined array of the first year of the land data.
    land_offset = 12 * (land_meta.yrbeg - combined_year_beg)
    # As land_offset but for ocean data.
    ocean_offset = 12 * (ocean_meta.yrbeg - combined_year_beg)
    # The combined array must be long enough for whichever record ends
    # last, so each offset is paired with the length of its *own*
    # record (the ocean term previously used the land length).
    combined_n_months = max(land_meta.monm + land_offset,
                            ocean_meta.monm + ocean_offset)

    info = [
        land_meta.mo1,
        land_meta.kq,
        land_meta.mavg,
        land_meta.monm,
        land_meta.monm4,
        combined_year_beg,
        land_meta.missing_flag,
        land_meta.precipitation_flag,
    ]
    # Overrides the monm4 value copied from the land metadata.
    info[4] = 2 * land_meta.monm + 5
    yield (info, land_meta.title)

    for box in eqarea.grid():
        # Averages for the land and ocean (one series per subbox)...
        avg = []
        wgtc = []
        # Eat the records from land and ocean 100 (nsubbox) at a time.
        # In other words, all 100 subboxes for the box.
        landsub, oceansub = zip(*itertools.islice(data, nsubbox))
        for land, ocean in zip(landsub, oceansub):
            a = [MISSING] * combined_n_months
            if (ocean.good_count < parameters.subbox_min_valid
                    or land.d < parameters.subbox_land_range):
                # Use land series for this subbox.
                a[land_offset:land_offset + len(land.series)] = land.series
                wgtc.append(land.good_count)
            else:
                # Use ocean series for this subbox.
                a[ocean_offset:ocean_offset + len(ocean.series)] = (
                    ocean.series)
                wgtc.append(ocean.good_count)
            avg.append(a)

        # GISTEMP sort.
        # We want to end up with IORDR, the permutation array that
        # represents the sorted order.  IORDR[0] is the index (into the
        # *wgtc* array) of the longest record, IORDR[1] the index of
        # the next longest record, and so on.  We sort the list of
        # indexes 0 to 99 with a comparison that looks up *wgtc*.
        # :todo: should probably import from a purpose built module.
        from step3 import sort
        # A real list is needed because sort() works on its argument.
        IORDR = list(range(nsubbox))
        sort(IORDR, lambda x, y: wgtc[y] - wgtc[x])

        # From here to the "for" loop over the cells (below) we are
        # initialising data for the loop.  Primarily the AVGR and WTR
        # arrays.
        nc = IORDR[0]

        # Weights for the box's record.
        wtr = [a != MISSING for a in avg[nc]]
        # Box record (a copy, so the subbox series is left untouched).
        avgr = avg[nc][:]

        # Loop over the remaining cells.
        for nc in IORDR[1:]:
            if wgtc[nc] >= parameters.subbox_min_valid:
                series.combine(avgr, wtr, avg[nc], 1, 0,
                               combined_n_months // 12,
                               parameters.box_min_overlap)

        series.anomalize(avgr, parameters.subbox_reference_period,
                         combined_year_beg)
        ngood = sum(valid(a) for a in avgr)
        yield (avgr, wtr, ngood, box)

    # We've now consumed all 8000 input boxes and yielded 80 boxes.  We
    # need to tickle the input to check that it is exhausted and to
    # cause it to run the final tail of its generator.
    try:
        next(data)
    except StopIteration:
        # The expected case: the input is exhausted.  Finish cleanly
        # rather than letting StopIteration escape from this generator.
        return
    # Ordinarily we never reach here.
    raise AssertionError("Too many input records")
def subbox_to_box(meta, cells, celltype='BOX'):
    """Group subboxes (cells) into the boxes of eqarea.grid() and merge
    each group into a single time series.

    *meta* supplies the first year (meta.yrbeg) and the length in
    months (meta.monm) shared by every resulting series.

    *cells* is an iterable of cell records; each is placed in a box by
    calling whichbox() on its ``box`` attribute.

    *celltype* is passed to giss_data.boxuid() to build the identifier
    written to the log, so that log lines from different analyses can
    be told apart.

    Yields, for each box, the quadruple (*anom*, *weight*, *ngood*,
    *box*): the anomalized series, the weights of that series, the
    count of valid values in it, and the box itself.
    """

    grid_boxes = list(eqarea.grid())

    # Bucket every cell under the box that whichbox() assigns it to.
    cells_by_box = dict((box, []) for box in grid_boxes)
    for cell in cells:
        cells_by_box[whichbox(grid_boxes, cell.box)].append(cell)

    def padded_series(s):
        """Return the data of *s* laid into a list of meta.monm months
        that begins in meta.yrbeg; positions not covered by *s* are
        MISSING.
        """
        padded = [MISSING] * meta.monm
        start = 12 * (s.first_year - meta.yrbeg)
        padded[start:start + len(s)] = s.series
        return padded

    for box in grid_boxes:
        members = cells_by_box[box]
        # :todo: should probably import from a purpose built module.
        from step3 import sort
        # Order the cells by good_count, largest first.
        sort(members, lambda x, y: y.good_count - x.good_count)

        # The first cell after sorting seeds the box series.
        leader = members[0]
        box_series = padded_series(leader)
        box_weight = [float(valid(value)) for value in box_series]

        # One '0'/'1' flag per calendar month: does the seed series
        # hold any valid value for that month?
        month_has_data = [any(valid(v) for v in box_series[month::12])
                          for month in range(12)]
        flags = ''.join('01'[present] for present in month_has_data)
        contributed = [[leader.uid, 1.0, flags]]

        # Fold in the remaining cells; those with too few good values
        # are recorded with zero weight and are not combined.
        for cell in members[1:]:
            if cell.good_count >= parameters.subbox_min_valid:
                cell_weight = 1.0
                station_months = series.combine(
                    box_series, box_weight, padded_series(cell),
                    cell_weight, parameters.box_min_overlap)
                flags = ''.join('01'[bool(x)] for x in station_months)
            else:
                cell_weight = 0.0
                flags = '0' * 12
            contributed.append([cell.uid, cell_weight, flags])

        series.anomalize(box_series, parameters.subbox_reference_period,
                         meta.yrbeg)
        uid = giss_data.boxuid(box, celltype=celltype)
        log.write("%s cells %s\n" % (uid, asjson(contributed)))
        ngood = sum(valid(value) for value in box_series)
        yield (box_series, box_weight, ngood, box)
def subbox_to_box(meta, cells, celltype='BOX'):
    """Group subboxes (cells) into the boxes of eqarea.grid() and merge
    each group into a single time series.

    *meta* supplies the first year (meta.yrbeg) and the length in
    months (meta.monm) shared by every resulting series.

    *cells* is an iterable of cell records; each is placed in a box by
    calling whichbox() on its ``box`` attribute.

    *celltype* is passed to giss_data.boxuid() to build the identifier
    written to the log, so that log lines from different analyses can
    be told apart.

    Yields, for each box, the quadruple (*anom*, *weight*, *ngood*,
    *box*): the anomalized series, the weights of that series, the
    count of valid values in it, and the box itself.
    """

    grid_boxes = list(eqarea.grid())

    # Bucket every cell under the box that whichbox() assigns it to.
    cells_by_box = dict((box, []) for box in grid_boxes)
    for cell in cells:
        cells_by_box[whichbox(grid_boxes, cell.box)].append(cell)

    def padded_series(s):
        """Return the data of *s* laid into a list of meta.monm months
        that begins in meta.yrbeg; positions not covered by *s* are
        MISSING.
        """
        padded = [MISSING] * meta.monm
        start = 12 * (s.first_year - meta.yrbeg)
        padded[start:start + len(s)] = s.series
        return padded

    for box in grid_boxes:
        members = cells_by_box[box]
        # :todo: should probably import from a purpose built module.
        from step3 import sort
        # Order the cells by good_count, largest first.
        sort(members, lambda x, y: y.good_count - x.good_count)

        # The first cell after sorting seeds the box series.
        leader = members[0]
        box_series = padded_series(leader)
        box_weight = [float(valid(value)) for value in box_series]

        # One '0'/'1' flag per calendar month: does the seed series
        # hold any valid value for that month?
        month_has_data = [any(valid(v) for v in box_series[month::12])
                          for month in range(12)]
        flags = ''.join('01'[present] for present in month_has_data)
        contributed = [[leader.uid, 1.0, flags]]

        # Fold in the remaining cells; those with too few good values
        # are recorded with zero weight and are not combined.
        for cell in members[1:]:
            if cell.good_count >= parameters.subbox_min_valid:
                cell_weight = 1.0
                station_months = series.combine(
                    box_series, box_weight, padded_series(cell),
                    cell_weight, parameters.box_min_overlap)
                flags = ''.join('01'[bool(x)] for x in station_months)
            else:
                cell_weight = 0.0
                flags = '0' * 12
            contributed.append([cell.uid, cell_weight, flags])

        series.anomalize(box_series, parameters.subbox_reference_period,
                         meta.yrbeg)
        uid = giss_data.boxuid(box, celltype=celltype)
        log.write("%s cells %s\n" % (uid, asjson(contributed)))
        ngood = sum(valid(value) for value in box_series)
        yield (box_series, box_weight, ngood, box)