def test_stack():
    """Check rnp.stack: row counts, field selection, and non-identical dtypes."""
    rec = rnp.root2rec(load('test.root'))

    # stacking a record array with itself doubles the rows, keeps all fields
    stacked = rnp.stack([rec, rec])
    assert_equal(stacked.shape[0], 2 * rec.shape[0])
    assert_equal(stacked.dtype.names, rec.dtype.names)

    # an explicit field list restricts the output dtype
    stacked = rnp.stack([rec, rec], fields=['x', 'y'])
    assert_equal(stacked.shape[0], 2 * rec.shape[0])
    assert_equal(stacked.dtype.names, ('x', 'y'))

    # recs don't have identical fields
    rec2 = recfunctions.drop_fields(rec, ['i', 'x'])
    stacked = rnp.stack([rec, rec2])
    assert_equal(set(stacked.dtype.names), set(['y', 'z']))
def merged_records(self, category=None, region=None, fields=None, cuts=None,
                   clf=None, clf_name='classifier', include_weight=True,
                   systematic='NOMINAL'):
    """Return this sample's records stacked into one record array.

    Selection arguments are forwarded to ``self.records``. When ``clf``
    is given, its scores are appended as a new float32 column named
    ``clf_name``.
    """
    recs = self.records(
        category=category,
        region=region,
        fields=fields,
        include_weight=include_weight,
        cuts=cuts,
        systematic=systematic)
    # keep the weight column when stacking restricts to an explicit field list
    if include_weight and fields is not None and 'weight' not in fields:
        fields = list(fields) + ['weight']
    rec = stack(recs, fields=fields)
    if clf is None:
        return rec
    scores, _ = clf.classify(
        self, category, region, cuts=cuts, systematic=systematic)
    return recfunctions.rec_append_fields(
        rec, names=clf_name, data=scores, dtypes='f4')
def concat_ttrees_to_array(ttrees, branches=None):
    """Concatenate multiple TTrees of different classes into one ndarray.

    Each tree is converted to a record array with ``rnp.tree2rec``; the
    record arrays are stacked on their common (or requested) branches and
    flattened to a plain 2D ndarray.

    Parameters
    ----------
    ttrees : sequence of TTree
        The trees to concatenate.
    branches : list of str, optional
        Branches to read; when None, rnp selects the branches itself.

    Returns
    -------
    numpy.ndarray
        One 2D array with one row per entry across all input trees.
    """
    # idiomatic iteration instead of indexing via range(len(...))
    recs = [rnp.tree2rec(tree, branches) for tree in ttrees]
    return rnp.rec2array(rnp.stack(recs, fields=branches), fields=branches)
def draw_array_helper(self, field_hist, category, region, cuts=None,
                      weighted=True, field_scale=None, weight_hist=None,
                      scores=None, clf=None, min_score=None, max_score=None,
                      systematic='NOMINAL', bootstrap_data=False):
    """Fill the histograms in ``field_hist`` from this sample's records.

    ``field_hist`` maps a field name, a tuple/list of field names, or a
    Classifier instance to the histogram to fill (None means the variable
    is blinded and is skipped). Event weights come from the record's
    'weight' column, optionally reweighted by ``weight_hist`` as a
    function of ``scores``. ``min_score``/``max_score`` cut on the scores
    before filling.

    NOTE(review): ``weighted`` is accepted but never read in this body —
    confirm whether callers rely on it elsewhere.
    NOTE(review): the trailing ``isinstance(self, Data)`` branch tags
    ``hist`` from the *last* loop iteration — presumably all hists share
    datainfo; verify.
    """
    from .data import Data, DataInfo
    # Partition the field_hist keys: plain strings are single fields,
    # Classifier keys are collected separately, anything else is treated
    # as an iterable group of field names.
    all_fields = []
    classifiers = []
    for f in field_hist.iterkeys():
        if isinstance(f, basestring):
            all_fields.append(f)
        elif isinstance(f, Classifier):
            classifiers.append(f)
        else:
            all_fields.extend(list(f))
    if len(classifiers) > 1:
        raise RuntimeError(
            "more than one classifier in fields is not supported")
    elif len(classifiers) == 1:
        classifier = classifiers[0]
    else:
        classifier = None
    if isinstance(self, Data) and bootstrap_data:
        # Replace the real data with a bootstrap sample drawn from the
        # background model plus the 125 GeV signal of ``bootstrap_data``
        # (which doubles as the analysis object here).
        log.info("using bootstrapped data")
        analysis = bootstrap_data
        recs = []
        scores = []  # NOTE(review): overwritten below from rec['classifier']
        for s in analysis.backgrounds:
            rec = s.merged_records(category, region,
                fields=all_fields, cuts=cuts,
                include_weight=True,
                clf=clf,
                systematic=systematic)
            recs.append(rec)
        # stack all backgrounds into one record array
        b_rec = stack(recs, fields=all_fields + ['classifier', 'weight'])
        s_rec = analysis.higgs_125.merged_records(category, region,
            fields=all_fields, cuts=cuts,
            include_weight=True,
            clf=clf,
            systematic=systematic)

        # handle negative weights separately
        b_neg = b_rec[b_rec['weight'] < 0]
        b_pos = b_rec[b_rec['weight'] >= 0]

        def bootstrap(rec):
            # sample |weight|-proportionally, with the sample size set by
            # the (rounded) total absolute weight of the record array
            prob = np.abs(rec['weight'])
            prob = prob / prob.sum()
            # random sample without replacement
            log.warning(str(int(round(abs(rec['weight'].sum())))))
            sample_idx = np.random.choice(
                rec.shape[0],
                size=int(round(abs(rec['weight'].sum()))),
                replace=False,
                p=prob)
            return rec[sample_idx]

        rec = stack([
            bootstrap(b_neg),
            bootstrap(b_pos),
            bootstrap(s_rec)],
            fields=all_fields + ['classifier', 'weight'])
        # weights are now encoded in the sampling itself: fill with unit weight
        rec['weight'][:] = 1.
        scores = rec['classifier']
    else:
        # TODO: only get unblinded vars
        rec = self.merged_records(category, region,
            fields=all_fields, cuts=cuts,
            include_weight=True,
            clf=classifier,
            systematic=systematic)
    if isinstance(scores, tuple):
        # sanity
        #assert (scores[1] == rec['weight']).all()
        # ignore the score weights since they should be the same as the rec
        # weights
        scores = scores[0]
    if weight_hist is not None and scores is not None:
        # reweight events by looking up weight_hist at each event's score
        log.warning("applying a weight histogram")
        edges = np.array(list(weight_hist.xedges()))
        # handle strange cases: widen the outer edges so no score falls
        # outside the lookup range
        edges[0] -= 1E10
        edges[-1] += 1E10
        weights = np.array(list(weight_hist.y())).take(
            edges.searchsorted(scores) - 1)
        weights = rec['weight'] * weights
    else:
        weights = rec['weight']
    if scores is not None:
        # apply optional score window; rec, weights and scores are kept in sync
        if min_score is not None:
            idx = scores > min_score
            rec = rec[idx]
            weights = weights[idx]
            scores = scores[idx]
        if max_score is not None:
            idx = scores < max_score
            rec = rec[idx]
            weights = weights[idx]
            scores = scores[idx]
    for fields, hist in field_hist.items():
        if isinstance(fields, Classifier):
            fields = ['classifier']
        # fields can be a single field or list of fields
        elif not isinstance(fields, (list, tuple)):
            fields = [fields]
        if hist is None:
            # this var might be blinded
            continue
        # defensive copy
        if isinstance(fields, tuple):
            # select columns in numpy recarray with a list
            fields = list(fields)
        arr = np.copy(rec[fields])
        if field_scale is not None:
            # apply per-field multiplicative scale factors in place
            for field in fields:
                if field in field_scale:
                    arr[field] *= field_scale[field]
        # convert to array
        arr = rec2array(arr, fields=fields)
        # HACK HACK HACK
        _weights = weights
        if fields == ['dEta_jets']:
            # special-case: drop zero entries for this one variable
            log.warning("HACK HACK")
            nonzero = arr > 0
            arr = arr[nonzero]
            _weights = weights[nonzero]
        # include the scores if the histogram dimensionality allows
        if scores is not None and hist.GetDimension() == len(fields) + 1:
            arr = np.c_[arr, scores]
        elif hist.GetDimension() != len(fields):
            raise TypeError(
                'histogram dimensionality does not match '
                'number of fields: %s' % (', '.join(fields)))
        hist.fill_array(arr, weights=_weights)
    if isinstance(self, Data):
        # record luminosity/energy info on the (last filled) histogram
        if hasattr(hist, 'datainfo'):
            hist.datainfo += self.info
        else:
            hist.datainfo = DataInfo(self.info.lumi, self.info.energies)