def vwap(candles: np.ndarray, source_type: str = "hlc3", anchor: str = "D", sequential: bool = False) -> Union[float, np.ndarray]:
    """
    VWAP

    The running weighted average restarts whenever the candle timestamp,
    truncated to the `anchor` unit, changes.

    :param candles: np.ndarray
    :param source_type: str - default: "hlc3"
    :param anchor: str - default: "D" (a numpy datetime64 unit code)
    :param sequential: bool - default: False

    :return: float | np.ndarray (None when the latest value is NaN and
        sequential is False)
    """
    warmup_candles_num = get_config('env.data.warmup_candles_num', 240)
    if not sequential and len(candles) > warmup_candles_num:
        candles = candles[-warmup_candles_num:]

    source = get_candle_source(candles, source_type=source_type)
    # Column 5 is the weight of every candle (presumably volume - confirm
    # against the candle layout used by the caller).
    weights = candles[:, 5]

    # Column 0 is read as a millisecond timestamp and truncated to the anchor
    # unit, so every candle of the same anchor period gets the same group id.
    group_idx = candles[:, 0].astype('datetime64[ms]').astype(
        f'datetime64[{anchor}]').astype('int')

    # Named vwap_values so the local does not shadow the function itself.
    vwap_values = aggregate(group_idx, weights * source, func='cumsum')
    vwap_values /= aggregate(group_idx, weights, func='cumsum')

    if sequential:
        return vwap_values
    return None if np.isnan(vwap_values[-1]) else vwap_values[-1]
def vwap(candles: np.ndarray, source_type: str = "hlc3", anchor: str = "D", sequential: bool = False) -> Union[float, np.ndarray]:
    """
    VWAP

    :param candles: np.ndarray
    :param source_type: str - default: "hlc3"
    :param anchor: str - default: "D"
    :param sequential: bool - default: False

    :return: float | np.ndarray
    """
    candles = slice_candles(candles, sequential)
    prices = get_candle_source(candles, source_type=source_type)

    # Group id per candle: the timestamp column truncated to the anchor unit.
    timestamps = candles[:, 0].astype('datetime64[ms]')
    group_idx = timestamps.astype(f'datetime64[{anchor}]').astype('int')

    # Running weighted price sum divided by the running weight sum, both
    # restarted at every group boundary.
    vwap_values = aggregate(group_idx, candles[:, 5] * prices, func='cumsum')
    running_weight = aggregate(group_idx, candles[:, 5], func='cumsum')
    vwap_values /= running_weight

    if sequential:
        return vwap_values

    latest = vwap_values[-1]
    if np.isnan(latest):
        return None
    return latest
def plot_binned_ch(x0, ch, n_bin=9, **kw):
    """
    Plot the per-bin mean of `ch` with standard-error bars against binned `x0`.

    :param x0: values to bin with np2.quantilize
    :param ch: values averaged within each bin
    :param n_bin: number of quantile bins requested
    :param kw: forwarded to plt.errorbar
    :return: (errorbar handle, bin summary of x0, bin mean of ch,
        bin standard error of ch)
    """
    bin_idx, bin_x = np2.quantilize(x0, n_quantile=n_bin, return_summary=True)

    bin_mean = npg.aggregate(bin_idx, ch, func='mean')
    bin_count = npg.aggregate(bin_idx, 1, func='sum')
    bin_std = npg.aggregate(bin_idx, ch, func='std')

    # Standard error of the mean within each bin.
    bin_sem = bin_std / np.sqrt(bin_count)

    handle = plt.errorbar(bin_x, bin_mean, yerr=bin_sem, **kw)
    return handle, bin_x, bin_mean, bin_sem
def createDatasetWeek(dataset, look_back=60, look_ahead=3, sample_per=60, sample=.2):
    """
    Build a randomly sub-sampled training set of high/low histories and
    up/down targets from a 1-D series.

    :param dataset: 1-D array of values; NaN marks a missing point
    :param look_back: number of bars in every history window
    :param look_ahead: number of bars in the target period
    :param sample_per: number of raw points aggregated into one bar
    :param sample: probability of keeping a candidate window
    :return: (idx, dataX, dataY) arrays: start offsets of the kept windows,
        their normalised (look_back, 2) high/low histories, and booleans
        telling whether the last valid target value exceeds the first one
    """
    idx, dataX, dataY = [], [], []
    window = look_back * sample_per
    horizon = look_ahead * sample_per

    # Static re-sampling index: maps each raw point of a window to its bar
    ix = np.floor(np.linspace(0, look_back, window + 1)[0:-1]).astype('int')

    # For each bar, extract re-sampled history and target
    for i in range(dataset.shape[0] - (window + horizon) - 1):
        # Print progress
        if i % 100000 == 0:
            print(i, 'rows processed')

        end = i + window
        # The window must end on a real observation
        if np.isnan(dataset[end - 1]):
            continue

        # Randomly pick a sample
        if np.random.random() > sample:
            continue

        # Resample price history into per-bar high / low
        p = dataset[i:end]
        high = npg.aggregate(ix, p, 'nanmax', fill_value=np.nan)
        low = npg.aggregate(ix, p, 'nanmin', fill_value=np.nan)
        a = np.array([high, low])
        a = (a - p[-1]) / (a.max() - a.min())
        a = a.transpose().reshape((1, look_back, 2))

        # Determine target result - skip if there is no value within the
        # next 20 points. Explicit emptiness checks replace the former bare
        # `except:` so that unrelated errors are no longer hidden.
        next_valid = np.where(~np.isnan(dataset[end:end + 20]))[0]
        period_valid = np.where(~np.isnan(dataset[end:end + horizon]))[0]
        if next_valid.size == 0 or period_valid.size == 0:
            continue
        st = end + next_valid[0]      # next non-null value
        ed = end + period_valid[-1]   # last non-null value until period end

        # An empty bar or a flat window leaves NaN in the history
        if np.isnan(a).any():
            continue

        dataX.append(a[0])
        dataY.append(dataset[st] < dataset[ed])
        idx.append(i)

    return np.array(idx), np.array(dataX), np.array(dataY)
def strategy():
    """
    Decide on a signal from the module-level price history.

    Reads the globals Bprice, Bepoch and HistDepth.

    :return: False when Bprice contains NaN, 0 when the bar check fails,
        otherwise the result of the final element-wise comparison
    """
    # Any missing price invalidates the whole history
    if np.isnan(Bprice).any():
        return False

    # Age of every tick relative to the newest one, in rounded units of 60
    # (presumably seconds -> minutes; confirm the unit of Bepoch)
    ix = np.round((Bepoch[-1] - Bepoch) / 60).astype('int')

    # The per-bar maxima of ix must span exactly HistDepth consecutive bars
    per_bar = npg.aggregate(ix, ix, 'max')
    if np.diff(per_bar[[-HistDepth, -1]])[0] != HistDepth - 1:
        print('Incomplete time series')
        return 0

    # Calculate h,l with the bar index flipped so the newest bar comes last
    newest = max(ix)
    flipped = -ix + newest
    high = npg.aggregate(flipped, Bprice, 'max', fill_value=np.nan)[-HistDepth:]
    low = npg.aggregate(flipped, Bprice, 'min', fill_value=np.nan)[-HistDepth:]

    a = np.array([high[-HistDepth:], low[-HistDepth:]])
    spread = a.max() - a.min()
    a = (a - Bprice[-1]) / spread

    # a has one row for the highs and one for the lows, so this compares
    # column 0 with column 1 of both rows
    first_col = a[:, 0]
    second_col = a[:, 1]
    return (first_col > second_col).all()
def geneCount(self, spots):
    '''
    Produces a matrix numCells-by-numGenes where element at position (c,g)
    keeps the expected number of gene g in cell c.

    :param spots: object exposing gene_panel, data.gene_id, call.neighbors
        and call.cell_prob
    :return: xr.DataArray with dims ('cell_id', 'gene_name')
    '''
    start = time.time()

    # One extra row on top of the known cell coordinates
    nC = self.yx_coords.shape[0] + 1
    nG = spots.gene_panel.shape[0]
    _id = self.ds.index.tolist()
    nN = spots.call.neighbors.shape[1]

    CellGeneCount = np.zeros([nC, nG])
    name = spots.gene_panel.index.values
    gene_row = spots.data.gene_id.values[None, :]

    # The last neighbour column is deliberately left out of the sum
    # (presumably the background slot - confirm with the caller)
    for n in range(nN - 1):
        cell_row = spots.call.neighbors.loc[:, n].values[None, :]
        group_idx = np.vstack((cell_row, gene_row))
        weights = spots.call.cell_prob.loc[:, n]
        contribution = npg.aggregate(group_idx, weights, func="sum", size=(nC, nG))
        CellGeneCount = CellGeneCount + contribution

    end = time.time()
    print('time in geneCount: ', end - start)

    return xr.DataArray(CellGeneCount, coords=[_id, name],
                        dims=['cell_id', 'gene_name'])
def group(array: np.ndarray, groupby_cols: list, compute_functions: list, calcs_cols: list, display=True, length=None) -> np.ndarray:
    """
    Group the array according to a unique mapper of multiple columns
    (groupby_cols) by doing various calculations (compute_functions) over
    selected columns (calcs_cols).

    :param array: np.ndarray, input structured array to be grouped.
    :param groupby_cols: list, columns to be used to do the grouping.
    :param compute_functions: list, names of the aggregation functions to
        apply; each is passed to npg.aggregate as its `func`.
    :param calcs_cols: list, columns over which the computations will be done.
    :param display: bool, whether or not to display a printed HTML data frame.
    :param length: int, how many rows of the displayed HTML table to print.
    :return group_array: np.ndarray, grouped array with the group keys
        followed by one "<column>_<function>" field per combination.
    """
    # The unique keys and the row -> group mapping depend only on the grouping
    # columns, so compute them once instead of once per column/function pair.
    grouped, group_idx = np.unique(array[groupby_cols], return_inverse=True)

    args_dict = {}
    for col in calcs_cols:
        for func in compute_functions:
            args_dict[col + "_" + func] = npg.aggregate(group_idx, array[col], func)

    struct_gb = rfn.unstructured_to_structured(
        np.c_[list(args_dict.values())].T, names=list(args_dict.keys()))
    group_array = rfn.merge_arrays([grouped, struct_gb], flatten=True)

    if display:
        table(group_array, length)
    return group_array
def actionLikelihoods(data, policies, aggregateStates=True, logIn=False, logOut=False):
    """
    Computes the action likelihoods of a demonstration data set for a given
    set of stochastic policies.

    :param data: [D x 2] array containing D state-action pairs
    :param policies: [S x A x P] array containing P stochastic policies
    :param aggregateStates: flag to indicate if the likelihoods should be
        computed per demonstration pair or if they should be aggregated per
        state
    :param logIn: flag to indicate if the policies are provided in the log
        domain
    :param logOut: flag to indicate if the result should be returned in the
        log domain
    :return: [D x P] or [S x P] array (depending on the aggregateStates flag)
        containing the action likelihoods
    """
    # work in the log domain throughout
    if logIn:
        logGoalPolicies = policies
    else:
        logGoalPolicies = np.log(policies)

    states = data[:, 0]
    actions = data[:, 1]

    # log-likelihood of every demonstrated pair under every policy
    L = logGoalPolicies[states, actions, :]

    # optionally combine the pairs that share a state (aggregate's default
    # reduction, one output row per state)
    if aggregateStates:
        L = aggregate(states, L, axis=0, size=policies.shape[0])

    return L if logOut else np.exp(L)
def geneCount_upd(self):
    """
    Produces a matrix numCells-by-numGenes where element at position (c,g)
    keeps the expected counts of gene g in cell c.

    Stores the result in self.cells.geneCount (row 0 zeroed) and the row-0
    counts in self.cells.background_counts. Returns nothing.
    """
    spots = self.spots

    # Repeat every spot's gene id once per neighbour (nS-by-nN), then flatten
    # so it lines up with the flattened cell ids and probabilities.
    gene_ids = np.tile(spots.gene_id, (self.nN, 1)).T.ravel()
    cell_ids = spots.parent_cell_id.ravel()
    probs = spots.parent_cell_prob.ravel()

    # Two-row index (cell, gene) for the group-by; the result is nC-by-nG and
    # entry (c, g) accumulates the probabilities of gene g assigned to cell c.
    group_idx = np.vstack((cell_ids, gene_ids))
    N_cg = npg.aggregate(group_idx, probs, size=(self.nC, self.nG))

    # Cell at position zero is the background; actual cells are on the
    # non-zero positions, so row 0 of the output stays at zero.
    out = np.zeros([self.nC, self.nG])
    out[1:, :] = N_cg[1:, :]

    self.cells.background_counts = N_cg[0, :]
    self.cells.geneCount = out
def plot_ch_vs_coh_by_dur(ch, coh, dur):
    """
    Plot the mean of `ch` for every (dur, coh) combination.

    :param ch: per-trial values to average (cast to double)
    :param coh: per-trial values defining the second grouping axis
    :param dur: per-trial values defining the first grouping axis
    :return: (result of plt2.plotmulti, mean array indexed [dur, coh],
        unique coh values, unique dur values)
    """
    cohs, i_coh = np.unique(coh, return_inverse=True)
    durs, i_dur = np.unique(dur, return_inverse=True)

    # First index row is dur, second is coh
    group_idx = np.stack([i_dur, i_coh])
    ch_by_coh_dur = npg.aggregate(group_idx, ch.astype(np.double), 'mean')

    handles = plt2.plotmulti(cohs, ch_by_coh_dur, cmap='coolwarm')
    return handles, ch_by_coh_dur, cohs, durs
def _binned_agg( array: np.ndarray, indices: np.ndarray, num_bins: int, *, func, fill_value, dtype, ) -> np.ndarray: """NumPy helper function for aggregating over bins.""" try: import numpy_groupies except ImportError: raise ImportError( "This function requires the `numpy_groupies` package to be installed. Please install it with pip or conda." ) mask = np.logical_not(np.isnan(indices)) int_indices = indices[mask].astype(int) shape = array.shape[:-indices.ndim] + (num_bins, ) result = numpy_groupies.aggregate( int_indices, array[..., mask], func=func, size=num_bins, fill_value=fill_value, dtype=dtype, axis=-1, ) return result
def hist_ch_rt(
        self,
        ch,
        rt,
        to_plot=True,
        normalize='density',
):
    """
    Histogram of trials over (choice, response-time frame).

    @param ch: ch[trial]
    @param rt: rt[trial]; divided by self.dt and rounded to get the frame
    @param to_plot: if True, plot the histogram against self.t
    @param normalize: 'density' to divide by the total count and by self.dt,
        'None' (the string) to keep raw counts
    @return: (n[ch, frame], plot handles or None)
    @raise ValueError: for any other value of normalize
    """
    rt_frame = np.round(rt / self.dt).astype('long')
    n = npg.aggregate(
        np2.cat([ch, rt_frame]), 1., 'sum', [self.n_ch, self.nt])

    if normalize == 'density':
        n = n / np.sum(n) / self.dt
    elif normalize != 'None':
        raise ValueError('Unsupported normalize=%s' % normalize)

    h = plt.plot(self.t, n.T) if to_plot else None
    return n, h
def Overlapc(frames, Nx, Ny, mapid):  # check
    """
    Overlap frames onto an image.

    Every element of `frames` is accumulated into the pixel given by the
    matching element of `mapid` (aggregate's default reduction), and the
    result is reshaped to (Ny, Nx). The elapsed time is added to
    timers['Overlap'].
    """
    time0 = timer()

    pixel_ids = mapid.ravel()
    samples = frames.ravel()
    flat_image = numpy_groupies.aggregate(pixel_ids, samples)
    accum = np.reshape(flat_image, (Ny, Nx))

    timers['Overlap'] += timer() - time0
    return accum
def aggregate_rt_ch(
        cond1,
        rt1,
        ch1,
        n_cond,
        nt=consts.NT,
        n_ch=consts.N_CH,
):
    """
    Count trials per (condition, response-time frame, choice).

    @param cond1: [trial] condition index
    @param rt1: [trial] response-time frame index
    @param ch1: [trial] choice index
    @param n_cond: size of the condition axis of the output
    @param nt: size of the frame axis of the output
    @type nt: int
    @param n_ch: size of the choice axis of the output
    @return: counts[cond, rt_frame, ch]
    @rtype: torch.Tensor
    """
    # Use torch.index_add(dim, index, tensor) along with ravel()
    # see https://pytorch.org/docs/stable/tensors.html#torch.Tensor.index_add
    subs = np.stack([cond1, rt1, ch1])
    counts = npg.aggregate(subs, 1., 'sum', [n_cond, nt, n_ch])
    return torch.tensor(counts)
def pseudobulk_from_label(ds, agg_labels, norm_total=10000):
    """
    Sum the labelled columns of ds.vals per label and scale every label's
    profile to `norm_total`.

    :param ds: dataset exposing `vals` (matrix with an `.A` dense view) and
        `ra.Gene`
    :param agg_labels: object exposing `encoded`, `is_labelled` and
        `le.classes_`
    :param norm_total: value each label's total is scaled to
    :return: pd.DataFrame with one row per label class and one column per
        gene
    """
    labelled = ds.vals[:, agg_labels.is_labelled]

    # Per-label sums along the column axis, and per-label grand totals.
    label_marker_counts = npg.aggregate(
        agg_labels.encoded, labelled.A, func='sum', axis=1)
    column_totals = labelled.sum(0).A.ravel()
    label_total_counts = npg.aggregate(
        agg_labels.encoded, column_totals, func='sum')

    scaled = (label_marker_counts / label_total_counts) * norm_total
    return pd.DataFrame(scaled.T,
                        columns=ds.ra.Gene,
                        index=agg_labels.le.classes_)
def proj_to_grid(
    points,
    xoff,
    yoff,
    xresolution,
    yresolution,
    xsize,
    ysize,
    fill_small_holes,
):
    """
    Rasterise 3-D points onto a (ysize, xsize) grid, keeping the maximum
    third coordinate of the points that fall into each cell.

    :param points: array whose columns 0, 1, 2 are x, y and the value to grid
    :param xoff: x coordinate of the grid origin
    :param yoff: y coordinate of the grid origin (rows grow as y decreases)
    :param xresolution: cell size, see the review note in the body
    :param yresolution: cell size, see the review note in the body
    :param xsize: number of grid columns
    :param ysize: number of grid rows
    :param fill_small_holes: if truthy, every empty cell with at least one
        non-empty cell in its 3x3 neighbourhood gets the median of them
    :return: (ysize, xsize) float array, NaN where no value is available
    """
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    # NOTE(review): rows are derived from y but scaled by `xresolution`, and
    # columns from x but scaled by `yresolution`. This looks swapped; it is
    # left as-is because it only matters for non-square cells - confirm
    # against the callers before changing it.
    row = np.floor((yoff - points[:, 1]) / xresolution).astype(int)
    col = np.floor((points[:, 0] - xoff) / yresolution).astype(int)

    points_group_idx = row * xsize + col
    points_val = points[:, 2]

    # remove points that lie out of the dsm boundary
    mask = (row >= 0) & (col >= 0) & (row < ysize) & (col < xsize)
    points_group_idx = points_group_idx[mask]
    points_val = points_val[mask]

    # create a NaN place holder for all pixels in the dsm so that every cell
    # is present in the aggregation, even when no point falls into it
    group_idx = np.arange(xsize * ysize).astype(int)
    group_val = np.full(xsize * ysize, np.nan)

    # concatenate place holders with the real values, then aggregate
    group_idx = np.concatenate((group_idx, points_group_idx))
    group_val = np.concatenate((group_val, points_val))

    dsm = npg.aggregate(group_idx, group_val, func="nanmax", fill_value=np.nan)
    dsm = dsm.reshape((ysize, xsize))

    # try to fill very small holes
    if fill_small_holes:
        # Read neighbours from an untouched copy so that freshly filled cells
        # never feed into the fill of other cells.
        dsm_new = dsm.copy()
        for hole_row, hole_col in np.argwhere(np.isnan(dsm_new)):
            window = dsm_new[max(hole_row - 1, 0):hole_row + 2,
                             max(hole_col - 1, 0):hole_col + 2]
            neighbors = window[~np.isnan(window)]
            if neighbors.size:
                dsm[hole_row, hole_col] = np.median(neighbors)

    return dsm
def compute_max_yval_boundary_rc(means, val=0.9):
    """
    For every column of `means` containing a value above `val`, return the
    largest row index at which that happens.

    The former implementation returned a dense per-column array (length
    ``max(col) + 1``, zero-filled) next to the list of unique columns, so the
    two outputs were misaligned whenever the qualifying columns were not
    contiguous from 0. Both outputs now have one entry per qualifying column;
    for contiguous columns starting at 0 the result is unchanged.

    :param means: 2-D array
    :param val: threshold; only entries strictly greater than it count
    :return: (rows, cols) - equally long 1-D integer arrays, cols sorted
        ascending and rows[i] the maximum qualifying row index of cols[i]
    """
    rows, cols = np.where(means > val)

    # Sort by column so every column's rows form one contiguous run.
    order = np.argsort(cols, kind='stable')
    cols = cols[order]
    rows = rows[order]

    unique_cols, starts = np.unique(cols, return_index=True)
    if unique_cols.size == 0:
        return rows, unique_cols

    # Maximum of each run; `starts` marks where every column's run begins.
    max_rows = np.maximum.reduceat(rows, starts)
    return max_rows, unique_cols
def pseudobulk_counts(self, vals, level=10000, gene_names=None):
    """
    Sum the columns of `vals` per encoded label and scale every label's
    profile to `level`.

    :param vals: matrix exposing an `.A` dense view; its columns are grouped
        by self.encoded
    :param level: value each label's total is scaled to
    :param gene_names: column names of the returned frame
    :return: pd.DataFrame indexed by the original (decoded) label names
    """
    dense = vals[:, :].A
    label_marker_counts = npg.aggregate(self.encoded, dense, func='sum', axis=1)

    column_totals = vals.sum(0).A.ravel()
    label_total_counts = npg.aggregate(self.encoded, column_totals, func='sum')

    label_norm_counts = ((label_marker_counts / label_total_counts) * level).T

    # Map the integer codes 0..n-1 back to the original label names.
    codes = np.arange(label_norm_counts.shape[0])
    original_grp_names = self.le.inverse_transform(codes)

    return pd.DataFrame(
        label_norm_counts,
        index=original_grp_names,
        columns=gene_names,
    )
def strategy():
    """
    Decide whether to place an order, based on the module-level price
    history and the prediction of the module-level `model`.

    Reads the globals Amount, Bask, Bepoch, Tepoch, MinTradePer, Bprice,
    HistDepth and model; appends the confidence to TtradesConf when an order
    is signalled.

    :return: 0 or False when a precondition fails, otherwise the result of
        comparing the model output with .535
    """
    global TtradesConf

    # Only trade if profitable
    if (Amount - Bask[-1]) / Bask[-1] < .87:
        return 0

    # Only trade if the previous contract is over
    if Bepoch[-1] - Tepoch[-1] < MinTradePer:
        return 0

    # Check if no missing history point
    if np.isnan(Bprice).any():
        return False

    # Age of every tick relative to the newest one, in rounded units of 60
    ix = np.round((Bepoch[-1] - Bepoch) / 60).astype('int')
    oldest = max(ix)

    # Check if we have collected enough history
    if oldest - min(ix) < HistDepth - 1:
        return 0

    # Check if we have a value for all bars
    per_bar = npg.aggregate(ix, ix, 'max')
    if np.diff(per_bar[[-HistDepth, -1]])[0] != HistDepth - 1:
        print(Bepoch[-1], ': Incomplete time series')
        return 0

    # Calculate h,l with the bar index flipped so the newest bar comes last
    flipped = -ix + oldest
    high = npg.aggregate(flipped, Bprice, 'max', fill_value=np.nan)[-HistDepth:]
    low = npg.aggregate(flipped, Bprice, 'min', fill_value=np.nan)[-HistDepth:]

    a = np.array([high[-HistDepth:], low[-HistDepth:]])
    a = (a - Bprice[-1]) / (a.max() - a.min())
    # NOTE(review): the (2, HistDepth) array is reshaped, not transposed,
    # into (1, HistDepth, 2) - confirm this matches the layout the model was
    # trained on.
    a = a.reshape((1, HistDepth, 2))

    y = model.predict_on_batch([a])[0]
    confident = y > .535
    if confident:
        TtradesConf.append(y[0])
        print(Bepoch[-1], ':Order. Conf=', y)
    return y > .535
def trajs2dems(self, states, actions):
    """
    Converts state and action trajectories into a demonstration data set.

    :param states: [M x N] array containing M state trajectories of length N
    :param actions: [M x N] array containing M action trajectories of length N
    :return: [S x A] array representing S histograms over actions observed at
        the different states of the MDP
    """
    # One (state, action) index pair per visited step, each counted once.
    pairs = [states.ravel(), actions.ravel()]
    histogram_shape = (self.nStates, self.nActions)
    return aggregate(pairs, 1, size=histogram_shape)
def det2stoch(x, nCats):
    """
    Converts a collection of deterministic category assignments into a
    stochastic representation with all mass placed at the indicated
    categories.

    :param x: 1d array of integers
    :param nCats: integer indicating total number of categories available
        (must be greater than maximum value in x)
    :return: [L x nCats] array providing the stochastic representation, where
        L is the length of x
    """
    n_items = len(x)
    # Row i of the output receives a single unit at column x[i].
    positions = np.vstack((range(n_items), x))
    return aggregate(positions, 1, size=(n_items, nCats))
def segmentation_adjacency(segmentation, connectivity=4):
    """Generate an adjacency matrix out of a given segmentation.

    :param segmentation: 2-D integer label image; labels are used directly as
        matrix indices, so they are assumed to be 0..n-1 - confirm with the
        caller
    :param connectivity: 4 (edge neighbours) or 8 (edge and diagonal
        neighbours)
    :return: (adj, points, mass) - sparse COO adjacency matrix, per-label
        centroid as (y, x) rows, and per-label pixel counts
    """
    assert connectivity == 4 or connectivity == 8

    # Get centroids.
    idx = np.indices(segmentation.shape)
    labels = segmentation.flatten()
    ys = npg.aggregate(labels, idx[0].flatten(), func='mean')
    xs = npg.aggregate(labels, idx[1].flatten(), func='mean')
    points = np.concatenate(
        (np.reshape(ys, (-1, 1)), np.reshape(xs, (-1, 1))), axis=1)

    # Get mass.
    nums, mass = np.unique(segmentation, return_counts=True)
    n = nums.shape[0]

    # Get adjacency (https://goo.gl/y1xFMq).
    # `np.bool` does not exist in NumPy 1.24-1.26; the builtin `bool` works
    # on every version.
    tmp = np.zeros((n, n), bool)

    # Pairs of shifted views: vertical and horizontal neighbours ...
    shifted_pairs = [
        (segmentation[:-1, :], segmentation[1:, :]),
        (segmentation[:, :-1], segmentation[:, 1:]),
    ]
    # ... plus both diagonals for 8-connectivity.
    if connectivity == 8:
        shifted_pairs.append((segmentation[:-1, :-1], segmentation[1:, 1:]))
        shifted_pairs.append((segmentation[:-1, 1:], segmentation[1:, :-1]))

    for a, b in shifted_pairs:
        differs = a != b
        tmp[a[differs], b[differs]] = True

    # Adjacency is symmetric.
    result = (tmp | tmp.T).astype(np.uint8)
    adj = sp.coo_matrix(result)
    return adj, points, mass
def fit(self, ds: loompy.LoomConnection, plot: str = None) -> np.ndarray:
    """
    Fit a classifier and use it to determine cluster predictive power

    Side effects: writes `ds.ra._Valid`, and sets `self.le`, `self.report`
    and `self.proba`.

    Args:
        ds      Dataset
        plot    Filename for optional plot

    Returns:
        Matrix of classification probabilities, shape (n_cells, n_labels)
    """
    logging.info("Feature selection")
    # Valid genes: non-zero in more than 5 and in fewer than half of
    # ds.shape[1] entries.
    nnz = ds.map([np.count_nonzero], axis=0)[0]
    valid_genes = np.logical_and(nnz > 5, nnz < ds.shape[1] * 0.5).astype("int")
    ds.ra._Valid = valid_genes

    logging.info("Normalization")
    normalizer = cg.Normalizer(False)
    normalizer.fit(ds)

    logging.info("Feature selection")
    (_, enrichment, _) = cg.MarkerSelection(findq=False, labels_attr="Clusters").fit(ds)
    # Keep the 25 highest-scoring genes of every enrichment column.
    genes = np.zeros_like(ds.ra.Gene, dtype=bool)
    for ix in range(enrichment.shape[1]):
        genes[np.argsort(-enrichment[:, ix])[:25]] = True

    logging.info("PCA projection")
    pca = cg.PCAProjection(genes, max_n_components=50)
    transformed = pca.fit_transform(ds, normalizer)

    # Encode the cluster names as integer labels; the encoder is kept on self.
    le = LabelEncoder().fit(ds.ca.ClusterName)
    self.le = le
    labels = le.transform(ds.ca.ClusterName)

    # Train on 80% of the cells, report on the held-out 20%. No random seed
    # is passed here, so the split is not fixed by this function.
    train_X, test_X, train_Y, test_Y = train_test_split(transformed, labels, test_size=0.2)
    classifier = RandomForestClassifier(max_depth=30)
    classifier.fit(train_X, train_Y)
    self.report = classification_report(test_Y, classifier.predict(test_X), target_names=le.classes_)
    # Probabilities are predicted for all cells, training cells included.
    self.proba = classifier.predict_proba(transformed)

    if plot:
        # Mean predicted probability per observed label (rows) and predicted
        # class (columns).
        agg = npg.aggregate(labels, self.proba, axis=0, func="mean")
        plt.imshow(agg, cmap="viridis")
        plt.xticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="vertical", fontsize=7)
        plt.yticks(np.arange(le.classes_.shape[0]), le.classes_, rotation="horizontal", fontsize=7)
        plt.xlabel("Predicted cell type")
        plt.ylabel("Observed cell type")
        plt.title("Predictive power of cluster identities")
        cbar = plt.colorbar()
        cbar.set_label('Average classification probability', rotation=90)
        plt.savefig(plot, bbox_inches="tight")

    return self.proba
def aggregate(subs, val=1., *args, **kwargs):
    """
    Tensor-returning wrapper around npg.aggregate.

    :param subs: [dim, element] group indices; a tuple or list is stacked
        into one array, a tensor is converted with npy()
    :type subs: torch.LongTensor, (*torch.LongTensor)
    :param val: values to aggregate, forwarded unchanged
    :param args: extra positional arguments for npg.aggregate
    :param kwargs: extra keyword arguments for npg.aggregate
    :return: the aggregation result wrapped with tensor()
    """
    subs_type = type(subs)
    if subs_type is tuple or subs_type is list:
        subs = np.stack(subs)
    elif torch.is_tensor(subs):
        subs = npy(subs)

    aggregated = npg.aggregate(subs, val, *args, **kwargs)
    return tensor(aggregated)
def extract_features(segmentation, image, form_features=None):
    """
    Build one float32 feature row per segment: the segment's mean colour
    followed by its form features.

    :param segmentation: label image; flattened to group the pixels
    :param image: array with 1 or 3 channels on axis 2
    :param form_features: forwarded to FormFeatureExtraction.get_features
    :return: float32 array of mean colour column(s) followed by the form
        features
    :raise ValueError: if the image has neither 1 nor 3 channels
    """
    features = FormFeatureExtraction(segmentation).get_features(form_features)
    group_idx = segmentation.flatten()

    n_channels = image.shape[2]
    if n_channels == 1:
        planes = [image.flatten()]
    elif n_channels == 3:
        planes = [image[:, :, c:c + 1].flatten() for c in range(3)]
    else:
        raise ValueError

    # Prepend mean color to form features, one column per channel.
    mean_columns = [
        np.reshape(npg.aggregate(group_idx, plane, func='mean'), (-1, 1))
        for plane in planes
    ]
    features = np.concatenate((*mean_columns, features), axis=1)

    return features.astype(np.float32)
def get_coefs(dim, dif_other, dur, ch, cond, t_RDK_dur, correct_only=True):
    """
    Fit a binomial GLM of choice on condition along one dimension.

    :param dim: dimension whose choices are modelled
    :param dif_other: accepted difficulty indices on the other dimension
    :param dur: duration to select; compared against t_RDK_dur
    :param ch: [tr, dim]
    :param cond: [tr, dim]
    :param t_RDK_dur: [tr]
    :param correct_only: if True, keep only trials whose choice on the other
        dimension has the same sign as its condition
    :return: glmres.params, glmres.bse, glmres, glmmodel
    """
    # Rank of |cond| per dimension, used as a difficulty index.
    id_dif = np.empty_like(cond)
    for dim1 in range(consts.N_DIM):
        id_dif[:, dim1] = np.unique(
            np.abs(cond[:, dim1]), return_inverse=True)[1]

    odim = consts.N_DIM - 1 - dim

    incl = (t_RDK_dur == dur) & np.isin(id_dif[:, odim], dif_other)
    if correct_only:
        other_correct = np.sign(ch[:, odim] - 0.5) == np.sign(cond[:, odim])
        incl = incl & other_correct

    ch1 = ch[incl, dim]
    coh1 = cond[incl, dim]
    cohs, id_cohs = np.unique(coh1, return_inverse=True)

    if np.issubdtype(ch1.dtype, np.floating):
        # p_ch=1 is given: accumulate p and 1 - p per condition
        weight_1 = npg.aggregate(id_cohs, ch1)
        weight_0 = npg.aggregate(id_cohs, 1 - ch1)
        ch11 = np.stack([weight_1, weight_0], -1)
    else:
        # integer choices: count trials per (condition, 1 - choice)
        ch11 = npg.aggregate(np.vstack((id_cohs, 1 - ch1)), 1)

    glmmodel = sm.GLM(ch11, sm.add_constant(cohs),
                      family=sm.families.Binomial())
    glmres = glmmodel.fit()
    return glmres.params, glmres.bse, glmres, glmmodel
def expressed_fraction_from_label(ds, agg_labels, frac_of_max=0.01):
    """
    Per label, the fraction of labelled columns in which each row of ds.vals
    exceeds its detection threshold.

    :param ds: dataset exposing `vals` (matrix with an `.A` dense view) and
        `ra.Gene`
    :param agg_labels: object exposing `encoded`, `is_labelled` and
        `le.classes_`
    :param frac_of_max: threshold as a fraction of each row's maximum
    :return: pd.DataFrame with one row per label class and one column per
        gene
    """
    # Row-wise threshold; thresholds below 1 are dropped to 0.
    expr_threshold = ds.vals.max(1).A * frac_of_max
    expr_threshold[expr_threshold < 1] = 0

    is_detected = ds.vals[:, agg_labels.is_labelled].A > expr_threshold
    detected_frac = npg.aggregate(
        agg_labels.encoded, is_detected, func='mean', axis=1)

    return pd.DataFrame(detected_frac.T,
                        columns=ds.ra.Gene,
                        index=agg_labels.le.classes_)
def dat2p_dat(
        self,
        ch_tr_dim: np.ndarray,
        dur_tr: np.ndarray,
        ev_tr_dim: np.ndarray
) -> (torch.Tensor, torch.Tensor, np.ndarray, np.ndarray, np.ndarray,
      np.ndarray):
    """
    Summarise per-trial data into counts per (condition, duration, choice).

    :param ch_tr_dim: [tr, dim]
    :param dur_tr: [tr]
    :param ev_tr_dim: [tr, dim]
    :return: n_cond_dur_ch[cond, dur, ch],
        ev_cond_fr_dim_meanvar[dcond, fr, dim, (mean, var)] with var set to 0,
        ev_cond_dim[dcond, dim], dcond_tr[tr],
        durs[dur] (torch tensor), ddur_tr[tr] (torch long tensor)
    """
    n_ch_flat = self.n_ch
    nt = int(self.nt0 // self.subsample_factor)

    # Unique durations and, per trial, the index of its duration.
    durs, ddur_tr = np.unique(dur_tr, return_inverse=True)
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
    ddur_tr = ddur_tr.astype(int)
    n_dur = len(durs)
    durs = torch.tensor(durs)
    ddur_tr = torch.tensor(ddur_tr, dtype=torch.long)

    ch_tr_flat = consts.ch_by_dim2ch_flat(ch_tr_dim)

    # Unique evidence rows and, per trial, the index of its row.
    ev_cond_dim, dcond_tr = np.unique(ev_tr_dim, return_inverse=True, axis=0)
    n_cond_flat = len(ev_cond_dim)

    # Repeat every condition's evidence over all frames; variance is zero.
    ev_cond_fr_dim = torch.tensor(ev_cond_dim)[:, None, :].expand(
        [-1, nt, -1])
    ev_cond_fr_dim_meanvar = torch.stack(
        [ev_cond_fr_dim, torch.zeros_like(ev_cond_fr_dim)], -1)

    # Count trials per (condition, duration, flat choice).
    n_cond_dur_ch = npt.tensor(
        npg.aggregate(np.stack([dcond_tr, npy(ddur_tr), ch_tr_flat]),
                      1., 'sum', [n_cond_flat, n_dur, n_ch_flat]))

    return n_cond_dur_ch, ev_cond_fr_dim_meanvar, ev_cond_dim, dcond_tr, \
        durs, ddur_tr
def aggregate(xda, idx_or_size, func=np.nanmean, fill_value=np.nan):
    """Aggregates a 2D array using an index array or block size

    The input array is never modified.

    Parameters
    ----------
    xda: xarray.DataArray
        the array with the data to aggregate
    idx_or_size: xarray.DataArray, int, or tuple
        either an array with each pixel indexed based on the value in the
        source array or the size in pixels of the desired grid
    func: callable (optional)
        the numpy function used to aggregate each block
    fill_value: int or float (optional)
        the value to use for missing values when cells do not fit
        perfectly into the original array

    Returns
    -------
    numpy.array
        array containing the aggregated values
    """
    if isinstance(idx_or_size, int):
        idx_or_size = (idx_or_size, idx_or_size)

    # Coerce array to a numpy array
    arr = to_numpy_array(xda)

    # Split array into a regular grid if a block size was given
    if isinstance(idx_or_size, (list, tuple)):
        return block_reduce(arr, idx_or_size, func=func, cval=fill_value)

    # Use numpy_groupies to group on an index array
    idx = np.ravel(to_numpy_array(idx_or_size))

    # Use nodata from index if set
    nodata = fill_value
    if isinstance(idx_or_size, xr.DataArray):
        nodata = idx_or_size.rio.nodata

    # np.ravel returns a view whenever it can, so copy before masking;
    # otherwise the assignment below would write into the caller's data.
    vals = np.ravel(arr).copy()

    # Set data to fill_value where nodata in index
    vals[idx == nodata] = fill_value

    return npg.aggregate(idx, vals, func=func, fill_value=fill_value)
def quantilize(v, n_quantile=5, return_summary=False, fallback_to_unique=True):
    """Quantile index starting from 0.

    :param v: values to bin
    :param n_quantile: number of quantile bins
    :param return_summary: if True, also return the mean of v per bin
    :param fallback_to_unique: if True and v has at most n_quantile distinct
        values (per uniquetol), use the index of the distinct value instead
        of a rank-based quantile
    :return: ix, or (ix, x) when return_summary is True
    """
    v = np.array(v)

    by_rank = not fallback_to_unique
    if fallback_to_unique:
        x, ix = uniquetol(v, return_inverse=True)
        by_rank = len(x) > n_quantile

    if by_rank:
        n = v.size
        # Ordinal ranks 1..n mapped onto bins 0..n_quantile-1
        ranks = stats.rankdata(v, method='ordinal') + 0.
        ix = np.int32(np.ceil(ranks / n * n_quantile) - 1)

    if not return_summary:
        return ix

    x = npg.aggregate(ix, v, func='mean')
    return ix, x
def gridder(grid, time, lon, lat, depth, data, dt, title='ROMS Observations'):
    """
    Construct an observations set from raw observations by placing them
    onto a grid.

    Parameters
    ----------
    grid : seapy.model.grid or filename string,
        Grid to place the raw observations onto
    time : ndarray,
        Time of the observations. This can be a scalar and all values
        will be assigned to the single time; otherwise, there must be a
        corresponding time to each value in the data.
    lon : ndarray,
        longitude of the observations. This can be a scalar and all values
        will be assigned to the single location; otherwise, there must be a
        corresponding longitude to each value in the data.
    lat : ndarray,
        latitude of the observations. This can be a scalar and all values
        will be assigned to the single location; otherwise, there must be a
        corresponding latitude to each value in the data.
    depth : ndarray or None,
        depth of the observations. If None, then all values are placed on
        the surface; otherwise, must be a corresponding depth for each
        value in the data.
    data : list of named tuples of seapy.roms.obs.raw_data,
        This list is comprised of each set of observation data types that
        are to be gridded together. If there is only one type (e.g.,
        SSH observations), there is only one item. An Argo float would have
        two items in the list (temperature and salinity observations).
        The list is comprised of named tuples of the raw observations
        with the following fields:
            "type" : string (or integer) of the type from
                     seapy.roms.obs.obs_types
            "provenance"  : string (or integer) of the type from
                            seapy.roms.obs.obs_provenance
            "values" : ndarray of actual observed values in units
                       for type
            "error" : ndarray (or None) of individual observational
                      uncertainty (same units of values). If not known,
                      use None
            "min_error" : float of the minimum error that should be
                          prescribed to the observations (typically,
                          the instrument error) in the same units of
                          values.
    dt : float
        The bin size of time for observations to be considered at the
        same time. The units must be the same as the provided time.
    title : string, optional,
        Title to assign the observations structure for output

    Returns
    -------
    obs : seapy.obs class
        Resulting observations from the raw data as placed onto grid.
        None is returned when no observation lies within the grid or
        nothing could be gridded.

    Examples
    --------
    A profile of temp and salt observations at a given lat/lon:

    >>> obs = seapy.obs.gridder(grid, times, lon, lat, depth,
            [ seapy.roms.obs.raw_data("TEMP", "CTD_ARGO", temp, None, 0.1),
              seapy.roms.obs.raw_data("SALT", "CTD_ARGO", salt, None, 0.05)],
            dt = 1/24, title="Argo")

    Satellite Data from a number of lat/lons at a single time

    >>> obs = seapy.obs.gridder(grid, time, lon, lat, None,
            [ seapy.roms.obs.raw_data("ZETA", "SSH_AVISO", sla, sla_err,
                                      0.05)],
            dt = 2/24, title="SSH")

    These will generate new observation structures from the raw data.
    """
    from numpy_groupies import aggregate

    # Make sure the input is of the proper form
    grid = seapy.model.asgrid(grid)
    time = np.atleast_1d(time)
    lon = np.atleast_1d(lon)
    lat = np.atleast_1d(lat)

    # First, before relying on gridding, extract only the data that are
    # encompassed by the grid
    region_list = np.where(np.logical_and.reduce((
        lat >= np.min(grid.lat_rho),
        lat <= np.max(grid.lat_rho),
        lon >= np.min(grid.lon_rho),
        lon <= np.max(grid.lon_rho))))
    # region_list holds *indices*: test how many there are, not their
    # values. `np.any(region_list)` is False when the only observation
    # inside the grid sits at index 0, which wrongly discarded it.
    if region_list[0].size == 0:
        warn("No observations were located within grid region_list")
        return None
    lat = lat[region_list]
    lon = lon[region_list]

    # Get the appropriate k-dimension depending on whether the data
    # are 2-D or 3-D
    if depth is None:
        # Get the grid locations from the data locations
        subsurface_values = False
        (j, i) = grid.ij((lon, lat))
        depth = np.zeros(i.size)
        k = np.ma.array(np.resize(grid.n, i.size))
    else:
        # Get the grid locations from the data locations
        subsurface_values = True
        depth = np.atleast_1d(depth)[region_list]
        (k, j, i) = grid.ijk((lon, lat, depth))

    # Sub-select only the points that lie on our grid
    valid_list = np.where((~i.mask * ~j.mask * ~k.mask) == True)
    i = i.compressed()
    j = j.compressed()
    k = k[valid_list]
    depth = depth[valid_list]

    # Make sure the times are consistent and in dt-space
    if time.size == 1:
        time = np.resize(time, valid_list[0].size)
    else:
        time = time[region_list][valid_list]
    dtime = np.floor(time / dt)

    # Loop over all time intervals putting everything together. NOTE: The
    # preference is to use aggregate over the time-dimension just as we do
    # in the spatial-dimension; however, this led to crashing.
    ot = list()
    ox = list()
    oy = list()
    oz = list()
    odep = list()
    olon = list()
    olat = list()
    oval = list()
    oerr = list()
    oprov = list()
    otype = list()
    for t in seapy.progressbar.progress(np.unique(dtime)):
        time_list = np.where(dtime == t)
        mtime = np.nanmean(time[time_list])

        for v in data:
            valid_data = np.s_[:]
            if isinstance(v.values, np.ma.core.MaskedArray):
                # nonzero() keeps only entries that are both unmasked and
                # non-zero
                valid_data = \
                    (v.values[region_list][valid_list][time_list].nonzero())[0]
                if not valid_data.size:
                    continue

            # Observed values of this type that fall into this time bin
            values = v.values[region_list][valid_list][time_list][valid_data]

            # Put together the indices based on the type of data we have
            if subsurface_values:
                idx = (k[time_list][valid_data],
                       j[time_list][valid_data],
                       i[time_list][valid_data])
            else:
                idx = (j[time_list][valid_data],
                       i[time_list][valid_data])
            indices = np.floor(idx).astype(int)

            # Grid the data onto our grid and compute the mean and variance
            ii = aggregate(indices, i[time_list][valid_data], func='mean')
            jj = aggregate(indices, j[time_list][valid_data], func='mean')
            # Cells that received data have non-zero mean positions
            binned = np.where(ii * jj > 0)
            ii = ii[binned].ravel()
            jj = jj[binned].ravel()
            (latl, lonl) = grid.latlon((ii, jj))
            Nd = ii.size

            # Put the co-located values together
            nvalues = aggregate(indices, values, func='mean')

            # Get their variance
            vari = aggregate(indices, values, func='var')

            # Put together the known observation values
            if v.error is not None:
                errs = aggregate(indices,
                                 v.error[region_list][valid_list][
                                     time_list][valid_data]**2,
                                 func='mean')
                errs = errs[binned].flatten()
            else:
                errs = 0.0

            # Build the depth vectors
            if subsurface_values:
                dd = aggregate(indices, depth[time_list][valid_data],
                               func='mean')
                kk = aggregate(indices, k[time_list][valid_data],
                               func='mean')
                dd = dd[binned].ravel()
                # ROMS counts from 1 for depth layers
                kk = kk[binned].ravel() + 1
            else:
                kk = np.resize(grid.n, Nd)
                dd = np.zeros(ii.shape)

            # Put all of the data from this time into our lists
            ot.append(np.resize(mtime, Nd))
            ox.append(ii)
            oy.append(jj)
            oz.append(kk)
            odep.append(dd)
            olon.append(lonl)
            olat.append(latl)
            oval.append(nvalues[binned].flatten())
            otype.append(np.resize(seapy.roms.obs.astype(v.type), Nd))
            oprov.append(np.resize(
                seapy.roms.obs.asprovenance(v.provenance), Nd))
            # The error is the largest of the prescribed minimum, the
            # in-cell variance and the mean squared reported error
            oerr.append(np.maximum(v.min_error**2,
                                   np.maximum(vari[binned].flatten(),
                                              errs)))

    # Make sure that we have something relevant
    if not oval:
        return None

    # Put everything together and create an observation class
    return seapy.roms.obs.obs(time=np.hstack(ot).ravel(),
                              x=np.hstack(ox).ravel(),
                              y=np.hstack(oy).ravel(),
                              z=np.hstack(odep).ravel(),
                              lat=np.hstack(olat).ravel(),
                              lon=np.hstack(olon).ravel(),
                              depth=np.hstack(oz).ravel(),
                              value=np.hstack(oval).ravel(),
                              error=np.hstack(oerr).ravel(),
                              type=np.hstack(otype).ravel(),
                              provenance=np.hstack(oprov).ravel(),
                              title=title)