def integrate_2dfield(egrid, f, mask):
    '''
    Integrates a 2d field over a certain region given a mask and grid.

    Parameters
    ----------
    egrid : tuple or list of numpy arrays
        edge-based grid such that: xeg, yeg = egrid
    f : numpy array, 2dim
        input field that is integrated
    mask : numpy array, 2dim, bool
        mask of a certain region that is used in integration
        (same shape as f)

    Returns
    -------
    fint : float value
        integral of f for the region where mask is True
    '''

    xeg, yeg = egrid

    # grid-box sizes from the edge-based grid
    dx = gi.lmean(np.diff(xeg, axis=0), axis=1)
    dy = gi.lmean(np.diff(yeg, axis=1), axis=0)

    # area-weighted sum over the masked region
    fint = (f[mask] * dx[mask] * dy[mask]).sum()

    return fint
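# --------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original workflow): it
# assumes numpy is imported as np and that gi.lmean returns mid-point
# averages, as used throughout this module.
def _example_integrate_2dfield():

    # edge-based grid of a 100 x 50 domain with 10 x 10 boxes
    xeg, yeg = np.meshgrid(np.linspace(0., 100., 11),
                           np.linspace(0., 50., 6), indexing='ij')

    f = np.ones((10, 5))                  # unit field on box centers
    mask = np.ones_like(f, dtype=bool)    # integrate everywhere

    # for a unit field the integral equals the domain area 100 * 50
    return integrate_2dfield((xeg, yeg), f, mask)
# --------------------------------------------------------------------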
def radial_integration_2d_field(egrid, f, rbins, p0):
    '''
    Integrates a 2d field over a set of radial rings given a base
    point and the ring edges.

    Parameters
    ----------
    egrid : tuple or list of two numpy arrays
        edge-based grid such that: xeg, yeg = egrid
    f : numpy array, 2dim
        2d field
    rbins : numpy array
        array of range bins
    p0 : tuple of 2 float values
        base point, i.e. x0, y0 = p0

    Returns
    -------
    I : numpy array
        radial integral of f for each ring interval
    '''

    # get the equi-distant grids
    xeg, yeg = egrid

    xe = xeg.mean(axis=1)
    ye = yeg.mean(axis=0)

    xm = gi.lmean(xe)
    ym = gi.lmean(ye)

    # transformed function: now in cylinder coordinates
    ftrans = transform2cylinder_coords(xm, ym, f, p0=p0)

    # integration kernel includes the Jacobian r of polar coordinates
    kernel = lambda r, phi: r * ftrans(r, phi)

    nbins = rbins.shape[0]
    I = np.zeros(nbins - 1)

    for i in range(nbins - 1):
        r1 = rbins[i]
        r2 = rbins[i + 1]
        I[i] = scipy.integrate.nquad(kernel, [[r1, r2], [0, 2 * np.pi]])[0]

    return I
def radial_ring_integration_2d_field(egrid, f, r1, r2, p0):
    '''
    Integrates a 2d field over a single ring given a base point and
    the ring edges.

    Parameters
    ----------
    egrid : tuple or list of two numpy arrays
        edge-based grid such that: xeg, yeg = egrid
    f : numpy array, 2dim
        2d field
    r1 : float value
        inner ring radius
    r2 : float value
        outer ring radius
    p0 : tuple of 2 float values
        base point, i.e. x0, y0 = p0

    Returns
    -------
    fint : float value
        radial integral of f
    '''

    # get the equi-distant grids
    xeg, yeg = egrid

    xe = xeg.mean(axis=1)
    ye = yeg.mean(axis=0)

    xm = gi.lmean(xe)
    ym = gi.lmean(ye)

    # transformed function: now in cylinder coordinates
    ftrans = transform2cylinder_coords(xm, ym, f, p0=p0)

    # integration kernel includes the Jacobian r of polar coordinates
    kernel = lambda r, phi: r * ftrans(r, phi)

    I = scipy.integrate.nquad(kernel, [[r1, r2], [0, 2 * np.pi]])

    return I[0]
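# --------------------------------------------------------------------
# Usage sketch (illustrative; assumes transform2cylinder_coords from
# this module handles the interpolation onto polar coordinates): for
# a unit field, the ring integral should approach the analytic ring
# area pi * (r2**2 - r1**2).
def _example_radial_ring_integration():

    xeg, yeg = np.meshgrid(np.linspace(-500., 500., 101),
                           np.linspace(-500., 500., 101), indexing='ij')
    f = np.ones((100, 100))

    ring_area = radial_ring_integration_2d_field(
        (xeg, yeg), f, r1=100., r2=200., p0=(0., 0.))

    # analytic reference: np.pi * (200.**2 - 100.**2)
    return ring_area
# --------------------------------------------------------------------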
def hist3d(v1, v2, v3, bins):
    '''
    A wrapper for 3d histogram binning. It additionally generates the
    average bin values with the same 3d shape as the histogram itself.

    Parameters
    ----------
    v1 : np.array
        1st data vector
    v2 : np.array
        2nd data vector
    v3 : np.array
        3rd data vector
    bins : list of 3 np.arrays
        bins argument passed to the np.histogramdd function

    Returns
    -------
    bins3d : list of 3dim np.array
        mesh of bin mid-points
    h : np.array
        absolute histogram counts
    '''

    h, bs = np.histogramdd([v1, v2, v3], bins)

    # generate average bin value per dimension
    bave = []
    for b in bs:
        bave.append(gi.lmean(b))

    # expand the bin mid-points to the full 3d shape
    bins3d = np.meshgrid(*bave, indexing='ij')

    return bins3d, h
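# --------------------------------------------------------------------
# Usage sketch (illustrative): bin three random vectors and recover
# the bin mid-point mesh alongside the counts.
def _example_hist3d():

    v1, v2, v3 = np.random.rand(3, 1000)
    bins = [np.linspace(0, 1, 11)] * 3

    bins3d, h = hist3d(v1, v2, v3, bins)

    # h.shape == (10, 10, 10) and each bins3d[i] has the same shape
    return bins3d, h
# --------------------------------------------------------------------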
def calculate_pcf4clust(c,
                        egrid,
                        nd_ref,
                        weight=1.,
                        rMax=500.,
                        pcf_sensitivity=True,
                        dr=20.,
                        verbose=False):
    '''
    Calculates pair-correlation function for a cluster set.

    Parameters
    ----------
    c : dict
        set of cluster properties
    egrid : tuple or list of two numpy arrays
        edge-based grid of reference number density
    nd_ref : numpy array, 2dim
        reference number density
    weight : float, optional, default = 1.
        weight between equi-distant and equal-area range binning
        0 = equal-area
        1 = equi-distant
    rMax : float, optional, default = 500.
        largest range bin
    pcf_sensitivity : bool, optional, default = True
        switch if PCF is also calculated for alternative reference
        densities ('pcf_hom', 'pcf_hom_tconst', 'pcf_tconst')
    dr : float, optional, default = 20.
        range ring interval
    verbose : bool, optional, default = False
        switch if more standard output is provided

    Returns
    -------
    c : dict
        set of cluster props with pcf field updated
    '''

    # read input data ------------------------------------------------
    ncells = len(c['abs_time'])
    # ================================================================

    # get norm of cell density function ------------------------------
    dx = gi.lmean(np.diff(egrid[0], axis=0), axis=1)
    dy = gi.lmean(np.diff(egrid[1], axis=1), axis=0)

    ncells_per_slot = (dx * dy * nd_ref).sum()
    nd_normed = nd_ref / ncells_per_slot
    # ================================================================

    # and also define homogeneous cell density function --------------
    nd_const = 1. / (dx * dy * np.size(nd_ref))
    # ================================================================

    # get cell coordinates and cutout region -------------------------
    xt, yt = c['x_mean'], c['y_mean']
    # ================================================================

    # loop over all time ids -----------------------------------------
    if 'time_id' in c.keys():
        tname = 'time_id'
    else:
        tname = 'rel_time'

    time_ids = sorted(set(c[tname]))

    rbins = radius_ring_edges(rMax, dr, weight=weight)
    nbins = len(rbins)

    c['pcf'] = np.nan * np.ma.ones((ncells, nbins - 1))

    if pcf_sensitivity:
        for pcf_name in ['pcf_hom', 'pcf_hom_tconst', 'pcf_tconst']:
            c[pcf_name] = np.nan * np.ma.ones((ncells, nbins - 1))

    for tid in time_ids:

        # masking
        m = (c[tname] == tid)
        ccount = m.sum()

        pos = np.array([xt[m], yt[m]]).T

        if verbose:
            print(tid, m.sum())

        # correct for temporal variation of cell number
        nd_corr = ccount * nd_normed

        # pair-correlation analysis with variable BG density
        gfull, g, rbins, indices = pairCorrelationFunction_2D(
            pos, egrid, nd_corr, rMax, dr, weight=weight)

        gfull_with_edge = np.nan * np.ma.ones((ccount, nbins - 1))
        gfull_with_edge[indices] = gfull

        c['pcf'][m] = gfull_with_edge
        c['pcf'] = np.ma.masked_invalid(c['pcf'])

        if pcf_sensitivity:

            nd = {}
            nd['pcf_hom'] = ccount * nd_const
            nd['pcf_hom_tconst'] = ncells_per_slot * nd_const
            nd['pcf_tconst'] = ncells_per_slot * nd_normed

            for pcf_name in nd.keys():

                # pair-correlation analysis with the selected BG density
                gfull, g, rbins, indices = pairCorrelationFunction_2D(
                    pos, egrid, nd[pcf_name], rMax, dr, weight=weight)

                gfull_with_edge = np.nan * np.ma.ones((ccount, nbins - 1))
                gfull_with_edge[indices] = gfull

                c[pcf_name][m] = gfull_with_edge
                c[pcf_name] = np.ma.masked_invalid(c[pcf_name])

    c['rbins_pcf'] = rbins
    # ================================================================

    return c
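# --------------------------------------------------------------------
# Usage sketch (illustrative; assumes radius_ring_edges is available
# elsewhere in this module and uses made-up cluster data with the dict
# keys accessed above): attach PCFs to a synthetic cluster set with a
# homogeneous reference number density.
def _example_calculate_pcf4clust():

    n = 200
    c = {'abs_time': np.arange(n),
         'time_id': np.repeat(np.arange(10), n // 10),
         'x_mean': 1000. * np.random.rand(n),
         'y_mean': 1000. * np.random.rand(n)}

    xeg, yeg = np.meshgrid(np.linspace(0., 1000., 51),
                           np.linspace(0., 1000., 51), indexing='ij')

    # any constant works here: nd_ref is re-normalized internally
    nd_ref = np.ones((50, 50)) * 1e-6

    c = calculate_pcf4clust(c, (xeg, yeg), nd_ref, rMax=200., dr=20.)

    return c['pcf'], c['rbins_pcf']
# --------------------------------------------------------------------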
def calculate_average_numberdensity(d,
                                    smooth_sig=2.,
                                    filter_in_logspace=False,
                                    var_names=None,
                                    bins=None,
                                    itime_range=None,
                                    dx=20.,
                                    dy=20.):
    '''
    Calculates average number density.

    Parameters
    ----------
    d : dict of arrays
        set of cell properties
    smooth_sig : float, optional, default = 2.
        sigma of Gaussian filter for smoothing the results
    filter_in_logspace : bool, optional, default = False
        switch if filter is applied in log-space
    var_names : list of two strings, optional, default = None
        list of the two variables used for binning
        if None: 'x_mean' and 'y_mean' are used
    bins : list or tuple of two numpy arrays, optional, default = None
        sets the bins of histogram analysis
        if None: min and max are estimated from data and (dx, dy) is used
    itime_range : list or tuple of two int, optional, default = None
        selects a range of relative time for the analysis
        if None: all times are used
    dx : float, optional, default = 20.
        interval of x-binning if bins == None
    dy : float, optional, default = 20.
        interval of y-binning if bins == None

    Returns
    -------
    egrid : tuple of two numpy arrays
        edge-based output grid
    nd : numpy array, 2dim
        number density field

    Notes
    -----
    There might be some problems with the (x, y) versus the
    transformed (x, y) coordinates.

    ToDo:
    * implement super sampling
    '''

    # get the right variable names -----------------------------------
    if var_names is None:
        var_name1 = 'x_mean'
        var_name2 = 'y_mean'
    else:
        var_name1, var_name2 = var_names

    x, y, tid, trel = d[var_name1], d[var_name2], d['time_id'], d['rel_time']
    # ================================================================

    # prepare time masking -------------------------------------------
    if itime_range is None:
        mask = np.ones_like(trel).astype(bool)
    else:
        itime_min, itime_max = itime_range
        mask = (trel >= itime_min) & (trel <= itime_max)
    # ================================================================

    # transformed coordinate -> equator at zero
    # clon, clat = gi.xy2ll(1. * x, 1. * y)
    # xt, yt = gi.ll2xy(clon, clat, lon0 = 0, lat0 = 0)

    # mask out the spinup time
    # mask = trel > -12

    # get binning ----------------------------------------------------
    if bins is None:
        # take the values from the cell props
        xbins = np.arange(x.min(), x.max(), dx)
        ybins = np.arange(y.min(), y.max(), dy)
        bins = (xbins, ybins)
    else:
        xbins, ybins = bins
    # ================================================================

    # check for region -----------------------------------------------
    reg_xmask = (x > xbins.min()) & (x < xbins.max())
    reg_ymask = (y > ybins.min()) & (y < ybins.max())
    reg_mask = reg_xmask & reg_ymask

    mask = mask & reg_mask
    # ================================================================

    # histogram analysis ---------------------------------------------
    h, xe, ye = np.histogram2d(x[mask], y[mask], bins)

    if filter_in_logspace:
        # smooth in log-space to preserve the large dynamic range
        logh = np.ma.log(h)
        logh.data[logh.mask] = -5.

        loghs = scipy.ndimage.gaussian_filter(logh, smooth_sig)
        h = np.exp(loghs)
    else:
        h = scipy.ndimage.gaussian_filter(h, smooth_sig)
    # ================================================================

    # grid center points ---------------------------------------------
    xm = gi.lmean(xe)
    ym = gi.lmean(ye)
    egrid = np.meshgrid(xe, ye, indexing='ij')
    # ================================================================

    # get the number of cells per time slot --------------------------
    target_times = sorted(set(tid[mask]))
    ntimes = 1. * len(target_times)
    ncells = 1. * len(tid[mask])

    ncells_per_slot = ncells / ntimes
    # ================================================================

    # normalization --------------------------------------------------
    N0 = (h * dx * dy).sum()
    nd0 = ncells_per_slot * h / N0
    # ================================================================

    return egrid, nd0
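# --------------------------------------------------------------------
# Usage sketch (illustrative; cell-property keys as accessed above,
# the data itself is synthetic): derive a smoothed number density
# from random cell positions.
def _example_calculate_average_numberdensity():

    n = 5000
    d = {'x_mean': 1000. * np.random.rand(n),
         'y_mean': 500. * np.random.rand(n),
         'time_id': np.repeat(np.arange(50), n // 50),
         'rel_time': np.tile(np.arange(50), n // 50)}

    egrid, nd = calculate_average_numberdensity(d, dx=20., dy=20.)

    # the field integrates to the average number of cells per time
    # slot (see the normalization step above)
    return egrid, nd
# --------------------------------------------------------------------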
def init_reference_numbers(egrid,
                           numberDensity,
                           rbins,
                           nsub=4,
                           radial_integration=True):
    '''
    Calculates the number of expected cells / particles as function of
    distance.

    Parameters
    ----------
    egrid : tuple or list of two 2dim numpy arrays
        edge-based grid at which number density field is given
    numberDensity : numpy array
        number density field, as number of particles per area
    rbins : numpy array
        ring edge array
    nsub : int, optional, default = 4
        value that determines subsampling of the analyzed fields
    radial_integration : bool, optional, default = True
        switch if radial integration of an interpolated representation
        of the number field is used

    Returns
    -------
    xout : numpy array, 2dim with shape = (nrows, ncols)
        output x-grid
    yout : numpy array, 2dim with shape = (nrows, ncols)
        output y-grid
    n0 : numpy array, 3dim with shape = (nrows, ncols, nrbins)
        expected number of cells on grid for each ring interval
    '''

    # get grids
    # ==========
    xg, yg = egrid  # edge based grid

    xc = gi.lmean(gi.lmean(xg, axis=0), axis=1)  # center point based grid
    yc = gi.lmean(gi.lmean(yg, axis=0), axis=1)  # center point based grid

    dx = gi.lmean(np.diff(xg, axis=0), axis=1)
    dy = gi.lmean(np.diff(yg, axis=1), axis=0)

    # expected number of particles per grid box
    n_dx_dy = numberDensity * dx * dy

    # set output grid
    # =================
    xout = xc[::nsub, ::nsub]
    yout = yc[::nsub, ::nsub]

    pc = np.array([xc.flatten(), yc.flatten()]).T
    grid_tree = scipy.spatial.KDTree(pc)

    nrow_out, ncol_out = xout.shape

    # count particles per distance ring
    # ==================================
    edges = rbins
    num_increments = len(edges) - 1

    n0 = np.zeros((nrow_out, ncol_out, num_increments))
    ncum = np.zeros((nrow_out, ncol_out, num_increments))

    # calculate expected number of reference cells
    # ==============================================
    for irow in range(nrow_out):
        for icol in range(ncol_out):

            p0 = (xout[irow, icol], yout[irow, icol])

            if radial_integration:
                n0[irow, icol, :] = radial_integration_2d_field(
                    egrid, numberDensity, rbins, p0)
            else:
                # cumulative counts within each outer ring edge ...
                for ibin, r in enumerate(edges[1:]):
                    m = grid_tree.query_ball_point(p0, r)
                    ncum[irow, icol, ibin] = n_dx_dy.flatten()[m].sum()

                # ... converted to counts per ring interval
                n0[irow, icol, 0] = ncum[irow, icol, 0]
                n0[irow, icol, 1:] = np.diff(ncum[irow, icol, :])

    return xout, yout, n0
def pcf(pos, rbins, domain, refgrid, ref_number, tree_as_grid_input=False):
    """
    Compute the two-dimensional pair correlation function, also known
    as the radial distribution function, for a set of circular
    particles contained in a square region of a plane.

    Parameters
    ----------
    pos : numpy array
        position vector such that x, y = pos.T
        and pos.shape = (Nparticles, 2)
    rbins : numpy array
        ring edge vector
    domain : tuple or list
        in the form ((xmin, xmax), (ymin, ymax)) containing min & max
        coordinates for masking the edge
    refgrid : tuple of two 2dim numpy arrays OR scipy tree class instance
        grid on which reference number of cells is given
    ref_number : numpy array
        reference number of cells per ring
    tree_as_grid_input : bool, optional, default = False
        if not a reference grid, but a scipy tree class instance
        (for nearest-neighbor interpolation) is supplied

    Returns
    -------
    g(r) : numpy array
        correlation function g(r) for each particle
    g_ave(r) : numpy array
        average correlation function
    radii : numpy array
        radii of the annuli used to compute g(r)
    interior_indices : numpy array
        indices of reference particles
    """

    # g(r) = observed particles per ring / expected reference number
    # area of ring = pi * (r_outer**2 - r_inner**2)

    # Only particles far enough from the domain edge are used as
    # reference particles, so that a circle of radius rMax around them
    # will not cross any edge of the box.

    # get particle coordinates
    # =========================
    x, y = pos.T

    # check boundaries
    # ================
    xmin, xmax = domain[0]
    ymin, ymax = domain[1]

    rMax = rbins.max()

    xmask = (x > rMax + xmin) & (x < xmax - rMax)
    ymask = (y > rMax + ymin) & (y < ymax - rMax)

    interior_indices, = np.where(xmask & ymask)
    num_interior_particles = len(interior_indices)

    pos_inner = pos[interior_indices]

    if num_interior_particles < 1:
        raise RuntimeError(
            "No particles found for which a circle of radius rMax "
            "will lie entirely within a square of side length S. "
            "Decrease rMax or increase the size of the square.")

    # set distance rings
    # ==================
    edges = rbins
    num_increments = len(edges) - 1

    npart = np.zeros([num_interior_particles, num_increments])

    # compute number of observed particles per ring
    # ==============================================
    for p in range(num_interior_particles):
        index = interior_indices[p]

        d = np.sqrt((x[index] - x)**2 + (y[index] - y)**2)
        d[index] = 2 * rMax   # exclude the particle itself from binning

        (npart[p, :], bins) = np.histogram(d, bins=edges)

    # get number of reference particles
    # =================================
    if tree_as_grid_input:
        grid_tree = refgrid
    else:
        xout, yout = refgrid
        pout = np.array([xout.flatten(), yout.flatten()]).T
        grid_tree = scipy.spatial.KDTree(pout)

    # nearest reference-grid point for each interior particle
    dist, index = grid_tree.query(pos_inner)
    nref = ref_number.reshape(-1, num_increments)[index]

    g = np.ma.divide(npart, nref)
    g = np.ma.masked_invalid(g)
    g_average = g.mean(axis=0)

    radii = gi.lmean(edges)

    return (g, g_average, radii, interior_indices)
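# --------------------------------------------------------------------
# Usage sketch (illustrative): precalculate reference numbers once
# with init_reference_numbers and reuse them for this fast pcf
# variant, as the docstrings above suggest. The number density is
# chosen to match the 500 random particles in the 1000 x 1000 domain,
# so g_ave should scatter around one.
def _example_pcf():

    xeg, yeg = np.meshgrid(np.linspace(0., 1000., 51),
                           np.linspace(0., 1000., 51), indexing='ij')
    nd = np.ones((50, 50)) * 500. / 1000.**2
    rbins = np.arange(0., 220., 20.)

    xout, yout, n0 = init_reference_numbers((xeg, yeg), nd, rbins,
                                            radial_integration=False)

    pos = 1000. * np.random.rand(500, 2)
    domain = ((0., 1000.), (0., 1000.))

    g, g_ave, radii, idx = pcf(pos, rbins, domain, (xout, yout), n0)

    return g_ave, radii
# --------------------------------------------------------------------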
def pairCorrelationFunction_2D(pos,
                               egrid,
                               numberDensity,
                               rMax,
                               dr,
                               rbins=None,
                               equal_area=True,
                               weight=None,
                               constant_analytic=False,
                               mask_edge_particle=True,
                               use_radial_interpolation=False,
                               finite_size_correction=False):
    """
    Compute the two-dimensional pair correlation function, also known
    as the radial distribution function, for a set of circular
    particles contained in a square region of a plane.

    This simple function finds reference particles such that a circle
    of radius rMax drawn around the particle will fit entirely within
    the square, eliminating the need to compensate for edge effects.
    If no such particles exist, an error is returned. Try a smaller
    rMax...or write some code to handle edge effects! ;)

    Parameters
    ----------
    pos : numpy array
        position vector such that x, y = pos.T
        and pos.shape = (Nparticles, 2)
    egrid : numpy array
        edge-based grid at which number density field is given
    numberDensity : numpy array
        number density field, as number of particles per area
    rMax : float value
        maximum radius
    dr : float value
        increment for increasing radius of annulus
    rbins : numpy array, optional, default = None
        array which contains range bin values
    equal_area : bool, optional, default = True
        switch that determines if equi-distant radius rings or rings
        of equal area are chosen
    weight : float, optional, default = None
        if weight (number between 0 and 1) is set, then radius rings
        are calculated as a weighted version between equi-distant
        (weight = 0) and equal-area (weight = 1) binning
    constant_analytic : bool, optional, default = False
        use analytic form of expected reference number assuming a
        constant number density
    mask_edge_particle : bool, optional, default = True
        if particles close to the edge are ignored
    use_radial_interpolation : bool, optional, default = False
        if number field is represented by an interpolating function
        and integration is performed on that field
    finite_size_correction : bool, optional, default = False
        if a finite size correction (N - 1) / N is applied

    Returns
    -------
    g(r) : numpy array
        correlation function g(r) for each particle
    g_ave(r) : numpy array
        average correlation function
    radii : numpy array
        radii of the annuli used to compute g(r)
    interior_indices : numpy array
        indices of reference particles

    Notes
    -----
    Possibly outdated! Try the faster variant pcf that uses
    precalculated reference number fields!
    """

    # g(r) = observed particles per ring / expected reference number
    # area of ring = pi * (r_outer**2 - r_inner**2)

    # get particle coordinates
    # =========================
    x, y = pos.T

    # get grids
    # ==========
    xg, yg = egrid  # edge based grid

    xc = gi.lmean(gi.lmean(xg, axis=0), axis=1)  # center point based grid
    yc = gi.lmean(gi.lmean(yg, axis=0), axis=1)  # center point based grid

    # check boundaries
    # ================
    # overwrite rMax if rbins is given!!!
    if rbins is not None:
        rMax = rbins.max()

    xmin, xmax = xg.min(), xg.max()
    ymin, ymax = yg.min(), yg.max()

    if mask_edge_particle:
        xmask = (x > rMax + xmin) & (x < xmax - rMax)
        ymask = (y > rMax + ymin) & (y < ymax - rMax)
    else:
        xmask = (x > xmin) & (x < xmax)
        ymask = (y > ymin) & (y < ymax)

    interior_indices, = np.where(xmask & ymask)
    num_interior_particles = len(interior_indices)

    if num_interior_particles < 1:
        raise RuntimeError(
            "No particles found for which a circle of radius rMax "
            "will lie entirely within a square of side length S. "
            "Decrease rMax or increase the size of the square.")

    # count particles per distance ring
    # ==================================
    if rbins is None:
        edges = radius_ring_edges(rMax, dr, equal_area=equal_area,
                                  weight=weight)
    else:
        edges = rbins

    num_increments = len(edges) - 1
    g = np.zeros([num_interior_particles, num_increments])

    # finite size correction (N - 1) / N
    # ==================================
    ncells = 1. * pos.shape[0]

    if finite_size_correction:
        corr_factor = (ncells - 1.) / ncells
    else:
        corr_factor = 1.

    # compute pairwise correlation for each interior particle
    for p in range(num_interior_particles):
        index = interior_indices[p]

        d = np.sqrt((x[index] - x)**2 + (y[index] - y)**2)
        d[index] = 2 * rMax   # exclude the particle itself from binning

        (npart, bins) = np.histogram(d, bins=edges)

        # distance of each grid-box center to the reference particle
        dgrid = np.sqrt((x[index] - xc)**2 + (y[index] - yc)**2)

        # here, we calculate the reference number of particles
        for i in range(num_increments):
            r1 = edges[i]
            r2 = edges[i + 1]

            if constant_analytic:
                n0 = np.pi * numberDensity.mean() * (r2**2 - r1**2)
            else:
                if use_radial_interpolation:
                    p0 = (x[index], y[index])
                    n0 = radial_ring_integration_2d_field(
                        egrid, numberDensity, r1, r2, p0=p0)
                else:
                    dmask = (dgrid >= edges[i]) & (dgrid <= edges[i + 1])
                    n0 = integrate_2dfield(egrid, numberDensity, dmask)

            g[p, i] = np.ma.divide(npart[i], n0)

    # average g(r) over all interior particles
    g = corr_factor * np.ma.masked_invalid(g)
    g_average = g.mean(axis=0)

    radii = gi.lmean(edges)

    return (g, g_average, radii, interior_indices)
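# --------------------------------------------------------------------
# Usage sketch (illustrative; assumes radius_ring_edges is available
# elsewhere in this module): direct call with a number density field.
# This is slower than the pcf variant above because the reference
# number is integrated for every particle and ring. The density again
# matches the 500 random particles, so g_ave should scatter around one.
def _example_pairCorrelationFunction_2D():

    xeg, yeg = np.meshgrid(np.linspace(0., 1000., 51),
                           np.linspace(0., 1000., 51), indexing='ij')
    nd = np.ones((50, 50)) * 500. / 1000.**2

    pos = 1000. * np.random.rand(500, 2)

    g, g_ave, radii, idx = pairCorrelationFunction_2D(
        pos, (xeg, yeg), nd, rMax=200., dr=20.)

    return g_ave, radii
# --------------------------------------------------------------------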
def percentiles_from_hist(xe, ye, h, p=[25, 50, 75], axis=0, sig=0):
    '''
    Use output from the numpy 2d histogram routine to calculate
    percentiles of the y variable based on relative occurrence rates.

    Parameters
    ----------
    xe : np.array
        x-part of histogram grid (edge values)
    ye : np.array
        y-part of histogram grid (edge values)
    h : np.array
        frequency of occurrence
    p : list, optional, default = [25, 50, 75]
        list of percentile values to be calculated (in 100th)
    axis : {0, 1}, optional
        axis along which the calculations are performed
    sig : float, optional, default = 0
        sigma of a Gaussian filter applied to the histogram in advance
        Should be non-negative.

    Returns
    -------
    zperc : list of np.arrays
        list of arrays containing the percentiles
    '''

    # normalization: conditioned histogram along the chosen axis
    hn = conditioned_hist(1. * h, axis=axis)

    # get the right coordinate
    if axis == 0:
        z = xe[1:]
        hn = hn.transpose()
    elif axis == 1:
        z = ye[1:]

    # what follows assumes the axis = 1 layout
    # (the axis = 0 case was transposed above)

    # add edges so that boundary percentile crossings are captured
    z0 = z[0] - (z[1] - z[0]) / 2.
    zN = z[-1] + (z[-1] - z[-2]) / 2.
    z = np.hstack([z0, z, zN])

    h0 = np.zeros_like(hn[:, 0])
    hn = np.column_stack([h0, hn, h0])

    # optional smoothing along the percentile axis
    sigvec = (0, sig)
    hn = scipy.ndimage.gaussian_filter(hn, sigvec)

    # cumulative histogram in percent
    hcum = hn.cumsum(axis=1) * 100.

    zperc = []
    for pval in p:

        # percentile crossing
        hr = hcum - pval

        # this is the point closest to the crossing
        imax = np.abs(hr).argmin(axis=1)

        # neighboring indices
        iminus = np.where(imax - 1 < 0, 0, imax - 1)
        iplus = np.where(imax + 1 < hn.shape[1], imax + 1, imax)

        # fancy indexing along the first axis (for axis = 1 layout)
        i0 = list(range(imax.shape[0]))

        hmask = hr[i0, imax] < 0

        # lower bound of the crossing interval
        dh0 = np.where(hmask, hr[i0, imax], hr[i0, iminus])
        z0 = np.where(hmask, z[imax], z[iminus])

        # upper bound of the crossing interval
        dh1 = np.where(hmask, hr[i0, iplus], hr[i0, imax])
        z1 = np.where(hmask, z[iplus], z[imax])

        # linear interpolation between the bounds
        zp = (dh1 * z0 - dh0 * z1) / (dh1 - dh0)

        zperc.append(zp)

    return zperc
# ====================================================================
# simple test of the percentile calculation
# ====================================================================

x = 2 * np.random.randn(1000)
y = x**2 + np.random.randn(1000)
# y = x + np.random.randn(1000)

pl.figure()
pl.plot(y, x, 'o')

h, xe, ye = np.histogram2d(y, x, (18, 20), density=True)
hn = conditioned_hist(h, axis=0)
pl.pcolormesh(xe, ye, hn.transpose(), alpha=0.5)

x25, x50, x75 = percentiles_from_hist(xe, ye, h)

pl.plot(x50, gi.lmean(ye), 'r-', lw=3)
pl.plot(x25, gi.lmean(ye), 'g-', lw=3)
pl.plot(x75, gi.lmean(ye), 'g-', lw=3)

pl.figure()
pl.plot(gi.lmean(ye), x50)
pl.plot(x, x**2, 'o')

# pl.figure()
# pl.plot(x, y, 'o')
#
# h, xe, ye = np.histogram2d(x, y, (20, 8), density = True)
# hn = conditioned_hist(h, axis = 1)
def main(collection_file,
         do_output=True,
         output_file=None,
         smooth_sig=2.,
         filter_in_logspace=True,
         var_names=None,
         bins=None,
         itime_range=None,
         dx=20.,
         dy=20.,
         do_interpolation=False,
         variable_filename=None,
         variable_name=None,
         input_options={}):
    '''
    Calculates average number density.

    Parameters
    ----------
    collection_file : str
        filename of cluster collection
    do_output : bool, optional, default = True
        switch if output is written
    output_file : str, optional, default = None
        name of output file
        if None: name is generated based on collection filename
    smooth_sig : float, optional, default = 2.
        sigma of Gaussian filter for smoothing the results
    filter_in_logspace : bool, optional, default = True
        switch if filter is applied in log-space
    var_names : list of two strings, optional, default = None
        list of the two variables used for binning
        if None: 'x_mean' and 'y_mean' are used
    bins : list or tuple of two numpy arrays, optional, default = None
        sets the bins of histogram analysis
        if None: min and max are estimated from data and (dx, dy) is used
    itime_range : list or tuple of two int, optional, default = None
        selects a range of relative time for the analysis
        if None: all times are used
    dx : float, optional, default = 20.
        interval of x-binning if bins == None
    dy : float, optional, default = 20.
        interval of y-binning if bins == None
    do_interpolation : bool, optional, default = False
        if number density field is interpolated onto cluster grid
    variable_filename : str, optional, default = None
        one of the variable files to input cluster grid georef
    variable_name : str, optional, default = None
        variable name in variable file
    input_options : dict, optional, default = {}
        all possible input options used in the data_reader

    Returns
    -------
    egrid : tuple of two numpy arrays
        edge-based output grid
    nd : numpy array, 2dim
        number density field
    '''

    # ================================================================
    # input collection data ------------------------------------------
    cset = hio.read_dict_from_hdf(collection_file)
    # ================================================================

    # do number density calculation ----------------------------------
    egrid, nd_ref = calculate_average_numberdensity(
        cset,
        smooth_sig=smooth_sig,
        filter_in_logspace=filter_in_logspace,
        var_names=var_names,
        bins=bins,
        itime_range=itime_range,
        dx=dx,
        dy=dy)

    xgrid = egrid[0]
    ygrid = egrid[1]
    # ================================================================

    # do interpolation of reference number density on cluster grid ---
    if do_interpolation:
        if variable_filename is not None and variable_name is not None:
            varset = data_reader.input(variable_filename, variable_name,
                                       **input_options)

        # centered base grid
        xgridc = gi.lmean(gi.lmean(xgrid, axis=0), axis=1)
        ygridc = gi.lmean(gi.lmean(ygrid, axis=0), axis=1)

        # get target grid
        cx = varset['x']
        cy = varset['y']

        # interpolation index
        ind = gi.create_interpolation_index(xgridc, ygridc, cx, cy, xy=True)

        # get total number of cells
        ncells = (nd_ref * dx * dy).sum()

        # normalization of new nd field
        da = gi.simple_pixel_area(cx, cy, xy=True)
        norm = (nd_ref[ind] * da).sum()
        nd_int = ncells * nd_ref[ind] / norm
    # ================================================================

    # save stuff into hdf --------------------------------------------
    if do_output:
        out = {}
        out['xgrid'] = xgrid
        out['ygrid'] = ygrid
        out['number_density'] = nd_ref

        if do_interpolation:
            out['x_int'] = cx
            out['y_int'] = cy
            out['nd_int'] = nd_int

        if output_file is None:
            oname = collection_file.replace('collected_cluster_props',
                                            'average_number_density')
        else:
            oname = output_file

        print('... save data to %s' % oname)
        hio.save_dict2hdf(oname, out)
    # ================================================================

    return egrid, nd_ref
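# --------------------------------------------------------------------
# Usage sketch (illustrative; the collection filename is hypothetical):
# run the full pipeline on a cluster-collection file and write the
# result to an 'average_number_density' HDF file next to it.
def _example_main():
    return main('collected_cluster_props_example.h5',
                itime_range=(0, 24), smooth_sig=2.)
# --------------------------------------------------------------------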
def histxr(dset, varname, bins, loop_dim='time', do_log=False):
    '''
    Calculates histogram using xarray data structure. Allows to loop
    over one dimension (e.g. time).

    Parameters
    ----------
    dset : xarray Dataset
        input data container
    varname : str
        variable name for which binning is done
    bins : numpy array or list
        set of bin edges for histogram computations
    loop_dim : str, optional
        dimension name for which looping is done
        default = 'time'
    do_log : bool, optional
        switch for bin center calculations
        if True: bins are assumed to be equi-distant in log-space

    Returns
    -------
    hset : xarray Dataset
        histogram data, incl. pdf, cdf and cumulative moments
    '''

    # ----------------------------
    # (1) Preparation Part
    # ----------------------------

    # bin center points
    if do_log:
        logbins = np.log(bins)
        clogbins = gi.lmean(logbins)
        cbins = np.exp(clogbins)
    else:
        cbins = gi.lmean(bins)

    # ----------------------------
    # (2) Loop Part
    # ----------------------------
    nloop = len(dset.coords[loop_dim])

    hist_list = []
    for i in range(nloop):
        d = dset.isel({loop_dim: i})[varname].data
        h, xe = np.histogram(d.flatten(), bins)
        hist_list += [h.copy(), ]

    harray = np.vstack(hist_list)

    # ----------------------------
    # (3) Xarray Part
    # ----------------------------

    # get variable attributes
    attrs = dict(dset[varname].attrs)
    attrs['varname'] = varname
    attrs['var_longname'] = attrs.pop('longname', 'None')

    # store absolute counts
    hset = xr.Dataset(
        {
            'histogram': ([loop_dim, 'cbin'], harray, {
                'longname': 'absolute histogram counts'
            })
        },
        coords={
            loop_dim: (loop_dim, dset[loop_dim]),
            'ebin': ('ebin', bins, dict(longname='bin edges', **attrs)),
            'cbin': ('cbin', cbins, dict(longname='bin mid-points', **attrs)),
        })

    hset['delta_bin'] = xr.DataArray(hset.ebin.diff('ebin').data,
                                     dims='cbin',
                                     attrs=dict(longname='bin widths',
                                                **attrs))

    # pdf
    h_relative = hset.histogram / (1. * hset.histogram.sum(dim='cbin'))
    pdf = h_relative / hset.delta_bin
    hset['pdf'] = xr.DataArray(
        pdf, attrs={'longname': 'probability density function'})

    # cdf
    cdf = (pdf * hset.delta_bin).cumsum(dim='cbin')
    hset['cdf'] = xr.DataArray(
        cdf, attrs={'longname': 'cumulative distribution function'})

    # cumulative moments
    for i in range(1, 3):
        mom = hset.cbin**i
        cum_mom = (mom * pdf * hset.delta_bin).cumsum(dim='cbin')

        cum_mom_name = 'cum_mom%d' % i
        hset[cum_mom_name] = xr.DataArray(
            cum_mom, attrs={'longname': 'cumulative %d. moment' % i})

    return hset
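# --------------------------------------------------------------------
# Usage sketch (illustrative; the variable and dimension names are
# made up for the example, and xarray is assumed to be imported as
# xr): histogram a random field per time step with log-spaced bins.
def _example_histxr():

    dset = xr.Dataset(
        {'precip': (('time', 'x'), np.random.exponential(1., (5, 1000)))},
        coords={'time': np.arange(5)})

    bins = np.logspace(-2, 1, 21)
    hset = histxr(dset, 'precip', bins, loop_dim='time', do_log=True)

    # hset.pdf integrates to one: (hset.pdf * hset.delta_bin).sum('cbin')
    return hset
# --------------------------------------------------------------------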
def timevar_hist(d,
                 vname,
                 x=None,
                 atot=1.,
                 full_time_info=False,
                 nbins=20,
                 logbinning=False,
                 edge_values=False,
                 density=True,
                 do_cnsd=False,
                 do_ccsd=False):
    '''
    Makes time-average 1d histograms for a certain variable.

    Parameters
    ----------
    d : dict
        set of cell properties
    vname : str
        variable name used for analysis
    x : numpy array, optional, default = None
        bins for histogram
        if None: either predefined bins or bins from percentiles are used
    atot : float, optional, default = 1.
        total area of analysis domain (for cnsd)
    full_time_info : bool, optional, default = False
        switch if full time information is returned
        (rather than average and standard error)
    nbins : int, optional, default = 20
        number of bins
    logbinning : bool, optional, default = False
        switch if autogenerated bins use logarithmic bin distances
    edge_values : bool, optional, default = False
        if edge rather than mid-point values are returned
    density : bool, optional, default = True
        switch if PDF rather than absolute counts are returned
    do_cnsd : bool, optional, default = False
        if cloud number size distribution is calculated
    do_ccsd : bool, optional, default = False
        if cloud cover size distribution is calculated

    Returns
    -------
    xout : numpy array, 1d
        mid-points (or edges) of bins
    pdf : numpy array, 2d
        time-array of histograms per time slot
        (only if full_time_info == True)
    hm : numpy array, 1d
        average histogram
    dhm : numpy array, 1d
        standard error of average histogram
    '''

    # set bins -------------------------------------------------------
    bins_were_auto = x is None

    if bins_were_auto:
        if vname in ['diameter', 'nN_diameter']:
            lx = np.linspace(np.log10(10), np.log10(1000), nbins)
            x = 10**lx

        elif vname == 'nN_distance':
            lx = np.linspace(np.log10(20), np.log10(1000), nbins)
            x = 10**lx

        elif vname == 'ml':
            lx = np.linspace(np.log10(1), np.log10(10000), nbins)
            x = 10**lx

        elif vname in ['shape_bt108', 'aspect']:
            x = np.linspace(0, 1, nbins)

        elif vname in ['bt108_min']:
            x = np.linspace(180, 230, nbins)

        else:
            # use percentile bins
            v1, v99 = np.percentile(d[vname], [1, 99])

            if logbinning:
                lx = np.linspace(np.log10(v1), np.log10(v99), nbins)
                x = 10**lx
            else:
                x = np.linspace(v1, v99, nbins)

    # bin mid-points: geometric means for the log-spaced default bins
    if vname in ['diameter', 'nN_diameter', 'nN_distance'] and bins_were_auto:
        lx = np.log10(x)
        lxm = gi.lmean(lx)
        xm = 10**lxm
    else:
        xm = gi.lmean(x)
    # ================================================================

    # collect histograms ---------------------------------------------
    tlist = sorted(set(d['time_id']))

    ntime = len(tlist)
    nbins = len(xm)

    pdf = np.zeros((ntime, nbins))
    cnsd = np.zeros((ntime, nbins))
    ccsd = np.zeros((ntime, nbins))

    range_mask = (d[vname] >= x.min()) & (d[vname] <= x.max())

    for itime, t in enumerate(tlist):
        m = (d['time_id'] == t) & range_mask
        v = d[vname][m]
        Ntot = len(v)

        pdf[itime], x = np.histogram(v, x, density=density)
        # pdf[itime] = scipy.ndimage.gaussian_filter(pdf[itime], 0)  # .5)

        cnsd[itime] = Ntot * pdf[itime] / atot
        ccsd[itime] = 100. * np.pi / 4. * (cnsd[itime] * xm**3)
    # ================================================================

    pdf = np.ma.masked_invalid(pdf)
    cnsd = np.ma.masked_invalid(cnsd)
    ccsd = np.ma.masked_invalid(ccsd)

    if do_cnsd:
        pdf = cnsd
    elif do_ccsd:
        pdf = ccsd

    hm = pdf.mean(axis=0)
    hsig = pdf.std(axis=0)

    # a standard error approach
    ntimes, nbins = pdf.shape
    dhm = 2 * hsig / np.sqrt(ntimes)

    if edge_values:
        xout = x
    else:
        xout = xm

    if full_time_info:
        return xout, pdf
    else:
        return xout, hm, dhm
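# --------------------------------------------------------------------
# Usage sketch (illustrative; 'diameter' triggers the predefined
# log-spaced bins above, the data itself is synthetic): time-averaged
# size distribution with a two-sigma standard error estimate.
def _example_timevar_hist():

    n = 3000
    d = {'time_id': np.repeat(np.arange(30), n // 30),
         'diameter': 10**(1. + 2. * np.random.rand(n))}

    xm, hm, dhm = timevar_hist(d, 'diameter')

    return xm, hm, dhm
# --------------------------------------------------------------------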