def test_transpose():
    """Check that ``transpose()`` swaps the two key levels of a nested bundle.

    Three category bundles (``cat_a``, ``cat_b``, ``cat_c``) are nested in an
    outer bundle. After transposing, ``x``/``y``/``z`` are the outer keys and
    the category names the inner keys. ``other`` is present only in ``cat_c``
    and is asserted to be absent from the transposed bundle. The leaves of
    the transposed bundle must be the very same objects as the originals.
    """
    cat_a = d.bundle(x=n.array([1, 2, 3]), y=n.array([4, 5, 6]), z=n.array([7, 8, 9]))
    cat_b = d.bundle(x=n.array([1, 2, 3]), y=n.array([4, 5, 6]), z=n.array([7, 8, 9]))
    cat_c = d.bundle(
        x=n.array([1, 2, 3]),
        y=n.array([4, 5, 6]),
        z=n.array([7, 8, 9]),
        other=n.array([10, 11, 12]),
    )
    b = d.bundle(cat_a=cat_a, cat_b=cat_b, cat_c=cat_c)

    # sanity checks on the nested bundle before transposing
    assert all(b.cat_a.x == [1, 2, 3])
    assert all(b.cat_b.y == [4, 5, 6])
    assert all(b.cat_c.y == [4, 5, 6])
    assert list(b.cat_a.keys()) == ["x", "y", "z"]
    assert list(b.cat_b.keys()) == ["x", "y", "z"]
    assert list(b.cat_c.keys()) == ["other", "x", "y", "z"]

    bt = b.transpose()
    categories = ["cat_a", "cat_b", "cat_c"]
    assert list(bt.x.keys()) == categories
    assert list(bt.y.keys()) == categories
    assert list(bt.z.keys()) == categories
    assert "other" not in list(bt.keys())

    # ``is`` keeps both operands alive while comparing; comparing id() values
    # of temporaries can succeed by accident when an id is reused.
    assert bt.x.cat_a is b.cat_a.x
    assert bt.y.cat_a is b.cat_a.y
    assert bt.z.cat_a is b.cat_a.z
def test():
    """Interactive demo of ``VisualCutter`` on two toy samples.

    Builds a signal and a background category with two normally distributed
    variables each, transposes them to variable -> category layout, and hands
    them to a ``VisualCutter`` together with unit weights and two drawing
    callbacks. The cut template uses the placeholders ``var1`` and ``var2``.
    """
    # Sample sizes must be real integers: current NumPy rejects floats such
    # as 1e5 for ``size`` / ``shape`` arguments.
    n_sig = 10 ** 5
    n_bg = 10 ** 4

    sig = d.bundle(var1=n.random.normal(2, 2, n_sig), var2=n.random.normal(1, 1, n_sig))
    bg = d.bundle(var1=n.random.normal(0, 1, n_bg), var2=n.random.normal(-1, 2, n_bg))
    variables = d.bundle(sig=sig, bg=bg).transpose()
    weights = d.bundle(sig=n.ones(n_sig), bg=n.ones(n_bg))

    hist1d = d.bundleize(d.factory.hist1d)
    d.visual()

    def initfunc(vc, vars, weights, mask):
        # create the figure once and remember it on the cutter object
        vc.myfig = p.figure()
        p.figure(vc.myfig.number)
        h1 = hist1d(vars.var1[mask], n.linspace(-20, 20, 101), weights[mask])
        c = d.bundle(sig="r", bg="k")
        h1.line(c=c)

    def updatefunc(vc, vars, weights, mask):
        # wipe the stored figure and redraw the histogram with the new mask
        vc.myfig.clear()
        p.figure(vc.myfig.number)
        h1 = hist1d(vars.var1[mask], n.linspace(-20, 20, 101), weights[mask])
        c = d.bundle(sig="r", bg="k")
        h1.line(c=c)
        vc.myfig.canvas.draw()

    def anyfunc(*args):
        # not wired up below; kept as a debugging callback
        # single-argument print() gives the same output on Python 2 and 3
        print(args)

    c = VisualCutter(
        variables,
        weights,
        "(vars.var1 > %(var1)s) & (vars.var2 < %(var2)s)",
        initfunc,
        updatefunc,
    )
    c.run()
def test_transpose():
    """Check that ``transpose()`` swaps the two key levels of a nested bundle.

    Three category bundles (``cat_a``, ``cat_b``, ``cat_c``) are nested in an
    outer bundle. After transposing, ``x``/``y``/``z`` are the outer keys and
    the category names the inner keys. ``other`` is present only in ``cat_c``
    and is asserted to be absent from the transposed bundle. The leaves of
    the transposed bundle must be the very same objects as the originals.
    """
    cat_a = d.bundle(x=n.array([1, 2, 3]), y=n.array([4, 5, 6]), z=n.array([7, 8, 9]))
    cat_b = d.bundle(x=n.array([1, 2, 3]), y=n.array([4, 5, 6]), z=n.array([7, 8, 9]))
    cat_c = d.bundle(
        x=n.array([1, 2, 3]),
        y=n.array([4, 5, 6]),
        z=n.array([7, 8, 9]),
        other=n.array([10, 11, 12]),
    )
    b = d.bundle(cat_a=cat_a, cat_b=cat_b, cat_c=cat_c)

    # sanity checks on the nested bundle before transposing
    assert all(b.cat_a.x == [1, 2, 3])
    assert all(b.cat_b.y == [4, 5, 6])
    assert all(b.cat_c.y == [4, 5, 6])
    assert list(b.cat_a.keys()) == ["x", "y", "z"]
    assert list(b.cat_b.keys()) == ["x", "y", "z"]
    assert list(b.cat_c.keys()) == ["other", "x", "y", "z"]

    bt = b.transpose()
    categories = ["cat_a", "cat_b", "cat_c"]
    assert list(bt.x.keys()) == categories
    assert list(bt.y.keys()) == categories
    assert list(bt.z.keys()) == categories
    assert "other" not in list(bt.keys())

    # ``is`` keeps both operands alive while comparing; comparing id() values
    # of temporaries can succeed by accident when an id is reused.
    assert bt.x.cat_a is b.cat_a.x
    assert bt.y.cat_a is b.cat_a.y
    assert bt.z.cat_a is b.cat_a.z
def updatefunc(vc, vars, weights, mask):
    """Redraw callback: clear the stored figure and plot ``var1`` again.

    ``vc.myfig`` is cleared and made current, the masked ``var1`` values are
    histogrammed with the masked weights on 101 points spanning [-20, 20],
    drawn as lines ("r" for ``sig``, "k" for ``bg``), and the canvas is
    refreshed.
    """
    figure = vc.myfig
    figure.clear()
    p.figure(figure.number)

    selected = vars.var1[mask]
    bins = n.linspace(-20, 20, 101)
    selected_weights = weights[mask]
    histogram = hist1d(selected, bins, selected_weights)

    histogram.line(c=d.bundle(sig="r", bg="k"))
    figure.canvas.draw()
def __init__(self, varbundle, weights, cutstring, initfunc, updatefunc, basemask=None):
    """Store the inputs and derive a value range for every cut placeholder.

    varbundle:  varnames -> categories -> numpy arrays
    weights:    bundle whose keys define the default mask (one all-True
                boolean array per key, with the length of that entry)
    cutstring:  template containing ``%(name)s``-style placeholders
    initfunc, updatefunc: callbacks, stored unchanged
    basemask:   optional mask used in place of the all-True default

    For every placeholder that names a variable in ``varbundle``,
    ``self.ranges[name]`` is set to ``(min, max)`` over all categories,
    ignoring NaNs. Unknown placeholders get ``None`` and a printed hint.
    """
    self.vars = varbundle
    self.weights = weights
    self.varnames = self.vars.keys()
    self.catnames = self.vars.transpose().keys()
    self.initfunc = initfunc
    self.updatefunc = updatefunc
    self.cutstring = cutstring

    if basemask is None:
        all_true = dict(
            (k, n.ones(len(weights.get(k)), dtype=bool)) for k in weights.keys()
        )
        self.basemask = d.bundle(**all_true)
    else:
        self.basemask = basemask

    self.ranges = dict()
    # Raw string: same pattern text as before, but without relying on
    # Python passing unknown escapes such as "\%" through unchanged.
    placeholders = re.findall(r"\%\((\w+)\)\w", cutstring)
    for ph in placeholders:
        if ph not in self.varnames:
            # single-argument print() gives the same output on Python 2 and 3
            print("Couldn't identify key %s. Set range in self.ranges manually!" % ph)
            self.ranges[ph] = None
        else:
            # list() so that a view-like return value of values() is also
            # accepted by numpy -- presumably a list already on Python 2
            mi = n.nanmin(list(self.vars.get(ph).map(n.nanmin).values()))
            ma = n.nanmax(list(self.vars.get(ph).map(n.nanmax).values()))
            self.ranges[ph] = (mi, ma)
def test_bundle_creation():
    """A bundle built from keyword arguments exposes them as attributes."""
    contents = {"y": 1, "z": 2, "x": 3}
    b = d.bundle(**contents)

    # keys are expected in sorted order, independent of the argument order
    assert list(b.keys()) == ["x", "y", "z"]

    for key in ("x", "y", "z"):
        assert getattr(b, key) == contents[key]

    # three ints are expected to yield an int-typed object bundle
    assert b._b_type == int
    assert isinstance(b, d.objbundle.object_bundle)
def get_one_variable(self, varname, current, total, unpack_recarrays=False):
    """Read a single variable from every dataset and bundle the results.

    varname:  name of the variable. If ``self.vars`` has an entry for it
              whose ``vardef`` is not None, that definition is read and the
              entry's ``transform`` (if not None) is applied to the result;
              otherwise ``varname`` itself is passed to ``_ds_get``.
    current, total:  only used for the progress message.
    unpack_recarrays:  if True, arrays with named columns are split into one
              1d array per column, stored as ``<dataset>_<column>``.

    Returns a ``d.bundle`` of the collected arrays, or None if no dataset
    delivered anything. Datasets whose read raised ValueError are filled
    with empty arrays.
    """
    # The progress line is assembled with write() so the output is the same
    # under Python 2 and Python 3 (the old code used print statements).
    sys.stdout.write(" %3d/%d reading variable %s " % (current, total, varname))
    start2 = time.time()

    arrays = {}
    missing_datasets = []
    # assumes self.datasets is a dict-like mapping name -> dataset -- TODO confirm
    for name, dataset in self.datasets.items():
        tmp = None
        try:
            if varname in self.vars and (self.vars[varname].vardef is not None):
                v = self.vars[varname]
                tmp = dataset._ds_get(v.vardef)
                if v.transform is not None:
                    tmp = v.transform(tmp)
            else:
                tmp = dataset._ds_get(varname)
        except ValueError:
            missing_datasets.append(name)

        # tmp is now either None, a 1d array or a recarray with named columns
        if tmp is None:
            continue
        if unpack_recarrays and tmp.dtype.names is not None:
            # unpack the columns of the recarray into separate 1d arrays
            for column in tmp.dtype.names:
                arrays[name + "_" + column] = tmp[column]
        else:
            arrays[name] = tmp

    if not arrays:
        sys.stdout.write("| done after %d seconds\n" % (time.time() - start2))
        sys.stdout.flush()
        return None

    # add empty arrays where necessary
    # rationale: empty arrays are easier to handle than bundles with missing keys
    # TODO: maybe make this configurable
    # NOTE(review): the fill uses the plain dataset name even when
    # unpack_recarrays produced "<dataset>_<column>" keys for the others.
    if missing_datasets:
        dtype = next(iter(arrays.values())).dtype
        for name in missing_datasets:
            arrays[name] = n.zeros(0, dtype=dtype)
        sys.stdout.write("| filling empty keys ")

    sys.stdout.write("| done after %d seconds\n" % (time.time() - start2))
    sys.stdout.flush()
    return d.bundle(**arrays)
def get_one_variable(self, varname, current, total, unpack_recarrays=False):
    """Read a single variable from every dataset and bundle the results.

    varname:  name of the variable. If ``self.vars`` has an entry for it
              whose ``vardef`` is not None, that definition is read and the
              entry's ``transform`` (if not None) is applied to the result;
              otherwise ``varname`` itself is passed to ``_ds_get``.
    current, total:  only used for the progress message.
    unpack_recarrays:  if True, arrays with named columns are split into one
              1d array per column, stored as ``<dataset>_<column>``.

    Returns a ``d.bundle`` of the collected arrays, or None if no dataset
    delivered anything. Datasets whose read raised ValueError are filled
    with empty arrays.
    """
    # The progress line is assembled with write() so the output is the same
    # under Python 2 and Python 3 (the old code used print statements).
    sys.stdout.write(" %3d/%d reading variable %s " % (current, total, varname))
    start2 = time.time()

    arrays = {}
    missing_datasets = []
    # assumes self.datasets is a dict-like mapping name -> dataset -- TODO confirm
    for name, dataset in self.datasets.items():
        tmp = None
        try:
            if varname in self.vars and (self.vars[varname].vardef is not None):
                v = self.vars[varname]
                tmp = dataset._ds_get(v.vardef)
                if v.transform is not None:
                    tmp = v.transform(tmp)
            else:
                tmp = dataset._ds_get(varname)
        except ValueError:
            missing_datasets.append(name)

        # tmp is now either None, a 1d array or a recarray with named columns
        if tmp is None:
            continue
        if unpack_recarrays and tmp.dtype.names is not None:
            # unpack the columns of the recarray into separate 1d arrays
            for column in tmp.dtype.names:
                arrays[name + "_" + column] = tmp[column]
        else:
            arrays[name] = tmp

    if not arrays:
        sys.stdout.write("| done after %d seconds\n" % (time.time() - start2))
        sys.stdout.flush()
        return None

    # add empty arrays where necessary
    # rationale: empty arrays are easier to handle than bundles with missing keys
    # TODO: maybe make this configurable
    # NOTE(review): the fill uses the plain dataset name even when
    # unpack_recarrays produced "<dataset>_<column>" keys for the others.
    if missing_datasets:
        dtype = next(iter(arrays.values())).dtype
        for name in missing_datasets:
            arrays[name] = n.zeros(0, dtype=dtype)
        sys.stdout.write("| filling empty keys ")

    sys.stdout.write("| done after %d seconds\n" % (time.time() - start2))
    sys.stdout.flush()
    return d.bundle(**arrays)
def test_int_bundle():
    """Comparison and addition on a bundle of ints act on every item."""
    values = {"x": 5, "y": -2}
    b = d.bundle(x=values["x"], y=values["y"])

    # ``b > 0`` must agree with the plain comparison of each item
    positive = b > 0
    assert positive.x == (values["x"] > 0)
    assert positive.y == (values["y"] > 0)

    # ``b + 2`` must agree with the plain addition on each item
    shifted = b + 2
    assert shifted.x == (values["x"] + 2)
    assert shifted.y == (values["y"] + 2)
def test_int_bundle():
    """Comparison and addition on a bundle of ints act on every item."""
    values = {"x": 5, "y": -2}
    b = d.bundle(x=values["x"], y=values["y"])

    # ``b > 0`` must agree with the plain comparison of each item
    positive = b > 0
    assert positive.x == (values["x"] > 0)
    assert positive.y == (values["y"] > 0)

    # ``b + 2`` must agree with the plain addition on each item
    shifted = b + 2
    assert shifted.x == (values["x"] + 2)
    assert shifted.y == (values["y"] + 2)
def test_ndarray_bundle():
    """Operations on a bundle of arrays must match the per-array results.

    Covers boolean-mask indexing (``b[b > 0]``), arithmetic (``b + 2``),
    attribute access (``shape``) and method calls (``sum()``). The two
    arrays have different lengths (1000 and 2000).
    """
    # sizes are ints: current NumPy raises TypeError for float sizes like 1e3
    x = n.random.normal(2, 2, 1000)
    y = n.random.normal(-2, 2, 2000)
    x_cut = x[x > 0]
    y_cut = y[y > 0]

    b = d.bundle(x=x, y=y)
    b_cut = b[b > 0]
    assert (b_cut.x == x_cut).all()
    assert (b_cut.y == y_cut).all()

    b_add = b + 2
    assert (b_add.x == (x + 2)).all()
    assert (b_add.y == (y + 2)).all()

    shapes = b_cut.shape
    assert shapes.x == x_cut.shape
    assert shapes.y == y_cut.shape

    sums = b_cut.sum()
    assert sums.x == x_cut.sum()
    assert sums.y == y_cut.sum()
def test_diversify():
    """``diversify`` maps each key to several new keys.

    Without ``copy`` the new items must be the original array objects; with
    ``copy=True`` they must be distinct objects holding equal values.
    """
    # sizes are ints: current NumPy raises TypeError for float sizes like 1e3
    x = n.random.normal(2, 2, 1000)
    y = n.random.normal(-2, 2, 2000)
    b = d.bundle(x=x, y=y)

    b2 = b.diversify({"x": ["x1", "x2"], "y": ["y1", "y2"]})
    assert list(b2.keys()) == ["x1", "x2", "y1", "y2"]
    # identity is checked with ``is`` rather than by comparing id() values
    assert b2.x1 is x
    assert b2.x2 is x
    assert b2.y1 is y
    assert b2.y2 is y

    b3 = b.diversify({"x": ["x1", "x2"], "y": ["y1", "y2"]}, copy=True)
    assert b3.x1 is not x
    assert b3.x2 is not x
    assert b3.y1 is not y
    assert b3.y2 is not y
    assert (b3.x1 == x).all()
    assert (b3.x2 == x).all()
    assert (b3.y1 == y).all()
    assert (b3.y2 == y).all()
def test_ndarray_bundle():
    """Operations on a bundle of arrays must match the per-array results.

    Covers boolean-mask indexing (``b[b > 0]``), arithmetic (``b + 2``),
    attribute access (``shape``) and method calls (``sum()``). The two
    arrays have different lengths (1000 and 2000).
    """
    # sizes are ints: current NumPy raises TypeError for float sizes like 1e3
    x = n.random.normal(2, 2, 1000)
    y = n.random.normal(-2, 2, 2000)
    x_cut = x[x > 0]
    y_cut = y[y > 0]

    b = d.bundle(x=x, y=y)
    b_cut = b[b > 0]
    assert (b_cut.x == x_cut).all()
    assert (b_cut.y == y_cut).all()

    b_add = b + 2
    assert (b_add.x == (x + 2)).all()
    assert (b_add.y == (y + 2)).all()

    shapes = b_cut.shape
    assert shapes.x == x_cut.shape
    assert shapes.y == y_cut.shape

    sums = b_cut.sum()
    assert sums.x == x_cut.sum()
    assert sums.y == y_cut.sum()
def test_diversify():
    """``diversify`` maps each key to several new keys.

    Without ``copy`` the new items must be the original array objects; with
    ``copy=True`` they must be distinct objects holding equal values.
    """
    # sizes are ints: current NumPy raises TypeError for float sizes like 1e3
    x = n.random.normal(2, 2, 1000)
    y = n.random.normal(-2, 2, 2000)
    b = d.bundle(x=x, y=y)

    b2 = b.diversify({"x": ["x1", "x2"], "y": ["y1", "y2"]})
    assert list(b2.keys()) == ["x1", "x2", "y1", "y2"]
    # identity is checked with ``is`` rather than by comparing id() values
    assert b2.x1 is x
    assert b2.x2 is x
    assert b2.y1 is y
    assert b2.y2 is y

    b3 = b.diversify({"x": ["x1", "x2"], "y": ["y1", "y2"]}, copy=True)
    assert b3.x1 is not x
    assert b3.x2 is not x
    assert b3.y1 is not y
    assert b3.y2 is not y
    assert (b3.x1 == x).all()
    assert (b3.x2 == x).all()
    assert (b3.y1 == y).all()
    assert (b3.y2 == y).all()
def initfunc(vc, vars, weights, mask):
    """Setup callback: create the figure and draw the first ``var1`` plot.

    A new figure is created, stored as ``vc.myfig`` and made current. The
    masked ``var1`` values are histogrammed with the masked weights on 101
    points spanning [-20, 20] and drawn as lines ("r" for ``sig``, "k" for
    ``bg``).
    """
    vc.myfig = p.figure()
    p.figure(vc.myfig.number)

    selected = vars.var1[mask]
    bins = n.linspace(-20, 20, 101)
    selected_weights = weights[mask]
    histogram = hist1d(selected, bins, selected_weights)

    histogram.line(c=d.bundle(sig="r", bg="k"))
def get(self, vars, unpack_recarrays=False):
    """Read one or several variables from all datasets.

    vars is either a string or a list of strings with variable names.

    Returns a bundle (dataset -> array) for a single name, or a bundle of
    such bundles (variable -> dataset -> array) for a list of names.
    Variables that no dataset delivers are left out; None is returned when
    nothing at all could be read.

    Raises ValueError if ``vars`` is neither a string nor a list of strings.
    """
    start = time.time()

    def report_total():
        # formatted print() call: same output under Python 2 and Python 3
        print("total time: %s" % (time.time() - start))

    def get_one_variable(varname, current, total):
        "helper function that retrieves a single variable"
        # progress line is assembled with write() instead of print statements
        sys.stdout.write(" %3d/%d reading variable %s " % (current, total, varname))
        start2 = time.time()

        arrays = {}
        missing_datasets = []
        # assumes self.datasets is a dict-like mapping name -> dataset -- TODO confirm
        for name, dataset in self.datasets.items():
            tmp = None
            try:
                if varname in self.vars and (self.vars[varname].vardef is not None):
                    v = self.vars[varname]
                    tmp = dataset._ds_get(v.vardef)
                    if v.transform is not None:
                        tmp = v.transform(tmp)
                else:
                    tmp = dataset._ds_get(varname)
            except ValueError:
                missing_datasets.append(name)

            # tmp is now either None, a 1d array or a recarray with named columns
            if tmp is None:
                continue
            if unpack_recarrays and tmp.dtype.names is not None:
                # unpack the columns of the recarray into separate 1d arrays
                for column in tmp.dtype.names:
                    arrays[name + "_" + column] = tmp[column]
            else:
                arrays[name] = tmp

        if not arrays:
            sys.stdout.write("| done after %d seconds\n" % (time.time() - start2))
            sys.stdout.flush()
            return None

        # add empty arrays where necessary
        # rationale: empty arrays are easier to handle than bundles with missing keys
        # TODO: maybe make this configurable
        if missing_datasets:
            dtype = next(iter(arrays.values())).dtype
            for name in missing_datasets:
                arrays[name] = n.zeros(0, dtype=dtype)
            sys.stdout.write("| filling empty keys ")

        sys.stdout.write("| done after %d seconds\n" % (time.time() - start2))
        sys.stdout.flush()
        return d.bundle(**arrays)

    if isinstance(vars, str):
        tmp = get_one_variable(vars, 1, 1)
        report_total()
        return tmp

    if isinstance(vars, list) and all(isinstance(i, str) for i in vars):
        results = {}
        for i, varname in enumerate(vars):
            results[varname] = get_one_variable(varname, i + 1, len(vars))
        # drop the variables that could not be read from any dataset
        bundles = dict((k, v) for k, v in results.items() if v is not None)

        if not bundles:
            report_total()
            return None
        tmp = d.bundle(**bundles)
        report_total()
        return tmp

    raise ValueError("vars must be either a string or a list of strings")
def get(self, vars, unpack_recarrays=False):
    """Read one or several variables from all datasets.

    vars is either a string or a list of strings with variable names.

    Returns a bundle (dataset -> array) for a single name, or a bundle of
    such bundles (variable -> dataset -> array) for a list of names.
    Variables that no dataset delivers are left out; None is returned when
    nothing at all could be read.

    Raises ValueError if ``vars`` is neither a string nor a list of strings.
    """
    start = time.time()

    def report_total():
        # formatted print() call: same output under Python 2 and Python 3
        print("total time: %s" % (time.time() - start))

    def get_one_variable(varname, current, total):
        "helper function that retrieves a single variable"
        # progress line is assembled with write() instead of print statements
        sys.stdout.write(" %3d/%d reading variable %s " % (current, total, varname))
        start2 = time.time()

        arrays = {}
        missing_datasets = []
        # assumes self.datasets is a dict-like mapping name -> dataset -- TODO confirm
        for name, dataset in self.datasets.items():
            tmp = None
            try:
                if varname in self.vars and (self.vars[varname].vardef is not None):
                    v = self.vars[varname]
                    tmp = dataset._ds_get(v.vardef)
                    if v.transform is not None:
                        tmp = v.transform(tmp)
                else:
                    tmp = dataset._ds_get(varname)
            except ValueError:
                missing_datasets.append(name)

            # tmp is now either None, a 1d array or a recarray with named columns
            if tmp is None:
                continue
            if unpack_recarrays and tmp.dtype.names is not None:
                # unpack the columns of the recarray into separate 1d arrays
                for column in tmp.dtype.names:
                    arrays[name + "_" + column] = tmp[column]
            else:
                arrays[name] = tmp

        if not arrays:
            sys.stdout.write("| done after %d seconds\n" % (time.time() - start2))
            sys.stdout.flush()
            return None

        # add empty arrays where necessary
        # rationale: empty arrays are easier to handle than bundles with missing keys
        # TODO: maybe make this configurable
        if missing_datasets:
            dtype = next(iter(arrays.values())).dtype
            for name in missing_datasets:
                arrays[name] = n.zeros(0, dtype=dtype)
            sys.stdout.write("| filling empty keys ")

        sys.stdout.write("| done after %d seconds\n" % (time.time() - start2))
        sys.stdout.flush()
        return d.bundle(**arrays)

    if isinstance(vars, str):
        tmp = get_one_variable(vars, 1, 1)
        report_total()
        return tmp

    if isinstance(vars, list) and all(isinstance(i, str) for i in vars):
        results = {}
        for i, varname in enumerate(vars):
            results[varname] = get_one_variable(varname, i + 1, len(vars))
        # drop the variables that could not be read from any dataset
        bundles = dict((k, v) for k, v in results.items() if v is not None)

        if not bundles:
            report_total()
            return None
        tmp = d.bundle(**bundles)
        report_total()
        return tmp

    raise ValueError("vars must be either a string or a list of strings")
import numpy as n
import dashi as d

# Two plain arrays and a bundle that holds both of them under the keys
# "x" and "y".
x = n.array([1, 2, 3, 4, 5])
y = n.array([6, 7, 8, 9, 10])
bundle = d.bundle(x=x, y=y)

# The same method call on each array and on the bundle; the results are not
# stored, so this reads like an interactive-session example.
x.sum()
y.sum()
bundle.sum()