def cg_warmstart_reduces_iterations(A):
    options = dict(graph_iters=1, maxiter=1000)
    b = da.ones(A.shape[0], chunks=A.chunks[0])
    x, res, iters = cg.cg_graph(A, b, **options)

    # no iters if solved
    _, _, iters_ws = cg.cg_graph(A, b, x_init=x, **options)
    assert iters_ws == 0

    # iters deterministic
    x_partial, res_partial, iters_partial = cg.cg_graph(A, b, **options)
    _, _, iters_ws = cg.cg_graph(A, b, x_init=x_partial, **options)
    assert iters_partial + iters_ws == iters

    perturb_x = 0.1 * da.mean(x).compute() / (x.size**0.5)
    perturb_b = 0.1 * da.mean(b).compute() / (b.size**0.5)
    xp = x * (1 + da.random.normal(0, perturb_x, x.size, chunks=x.chunks))
    bp = b * (1 + da.random.normal(0, perturb_b, b.size, chunks=b.chunks))

    # nearby b
    _, _, iterp = cg.cg_graph(A, bp, x_init=x, **options)
    assert iters > iterp, '{} > {}'.format(iters, iterp)

    # nearby x0
    _, _, iters_perturb = cg.cg_graph(A, b, x_init=xp, **options)
    assert iters > iters_perturb, '{} > {}'.format(iters, iters_perturb)

    # nearby (b, x0)
    _, _, iters_perturb = cg.cg_graph(A, bp, x_init=xp, **options)
    assert iters > iters_perturb, '{} > {}'.format(iters, iters_perturb)

    return True
def test_0d_array():
    x = da.mean(da.ones(4, chunks=4), axis=()).compute()
    x = da.mean(da.ones(4, chunks=4), axis=0).compute()
    y = np.mean(np.ones(4))
    assert type(x) == type(y)
    x = da.sum(da.zeros(4, chunks=1)).compute()
    y = np.sum(np.zeros(4))
    assert type(x) == type(y)
def ttest_ind(a, b, axis=0, equal_var=True):
    v1 = da.var(a, axis, ddof=1)  # XXX: np -> da
    v2 = da.var(b, axis, ddof=1)  # XXX: np -> da
    n1 = a.shape[axis]
    n2 = b.shape[axis]

    if equal_var:
        df, denom = _equal_var_ttest_denom(v1, n1, v2, n2)
    else:
        df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2)

    res = _ttest_ind_from_stats(da.mean(a, axis), da.mean(b, axis), denom, df)

    return delayed(Ttest_indResult, nout=2)(*res)
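# --- Hedged usage sketch (added; not part of the original snippet) ---
# The function above mirrors dask.array.stats.ttest_ind, which returns a
# Delayed wrapping a scipy-style result; the statistic and p-value only
# materialize after .compute().
import numpy as np
import dask.array as da
import dask.array.stats

a = da.from_array(np.random.RandomState(0).normal(0.0, 1.0, 1000), chunks=100)
b = da.from_array(np.random.RandomState(1).normal(0.1, 1.0, 1000), chunks=100)
t_stat, p_value = dask.array.stats.ttest_ind(a, b).compute()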
def fit(self, X, y=None):
    # CHECKING THE TYPES
    if isinstance(X, dask.array.Array):
        import dask.array as numerical_module
        from dask.array.linalg import cholesky, inv
    else:
        import numpy as numerical_module
        from scipy.linalg import cholesky, inv

    # 1. Computes the mean vector and the covariance matrix of the training set
    mu = numerical_module.mean(X, axis=0)
    cov = numerical_module.cov(X.T)

    # 2. Computes the inverse of the covariance matrix
    inv_cov = pinv(cov) if self.pinv else inv(cov)

    # 3. Computes the Cholesky decomposition of the inverse covariance matrix
    self.weights = cholesky(
        inv_cov, lower=True
    )  # Setting lower true to have the same implementation as in the previous code
    self.input_subtract = mu
    self.input_divide = 1.0

    return self
def score_gene_sets(ds, gs, z_score_ds=True, use_dask=False):
    if use_dask:
        import dask.array as np
    else:
        import numpy as np
    # gene sets has genes on rows, sets on columns
    # ds has cells on rows, genes on columns
    gs_x = gs.x
    ds_x = ds.x
    if z_score_ds:
        ds_x = ds_x.toarray() if scipy.sparse.isspmatrix(ds_x) else ds_x
    gene_indices = (gs_x.sum(axis=1) > 0) & (
        ds_x.std(axis=0) > 0
    )  # keep genes that are in gene sets and have standard deviation > 0

    gs_x = gs_x[gene_indices]
    ds_x = ds_x[:, gene_indices]
    if z_score_ds:
        ds_x = ds_x.toarray() if scipy.sparse.isspmatrix(ds_x) else ds_x
        std = np.std(ds_x, axis=0)
        mean = np.mean(ds_x, axis=0)
        ds_x = (ds_x - mean) / std
        ds_x[ds_x < -5] = -5
        ds_x[ds_x > 5] = 5
        ds_x[np.isnan(ds_x)] = 0  # `ds_x == np.nan` is always False; use isnan to zero out NaNs

    scores = ds_x.dot(gs_x)
    ngenes_in_set = gs_x.sum(axis=0)
    ngenes_in_set[ngenes_in_set == 0] = 1  # avoid divide by zero
    scores = scores / ngenes_in_set  # scores contains cells on rows, gene sets on columns
    return wot.Dataset(x=scores, row_meta=ds.row_meta, col_meta=gs.col_meta)
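# --- Hedged sketch (added; not derived from the original source) ---
# The z-score / clipping step above, written out for a small dense dask array:
# standardize per gene (column), clip to [-5, 5], and zero out NaNs.
import dask.array as da

expr = da.random.random((100, 20), chunks=(50, 20))  # cells x genes
z = (expr - expr.mean(axis=0)) / expr.std(axis=0)
z = da.clip(z, -5, 5)
z = da.where(da.isnan(z), 0, z)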
def score(self, X, y): """ Provide score by comparing predictions and ground truth. Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) Query test data. Acceptable formats: dask CuPy/NumPy/Numba Array y : array-like (device or host) shape = (n_samples, n_features) Outputs test data. Acceptable formats: dask CuPy/NumPy/Numba Array Returns ------- score """ y_pred = self.predict(X, convert_dtype=True) if not isinstance(y_pred, da.Array): y_pred = y_pred.to_dask_array(lengths=True) if not isinstance(y, da.Array): y = y.to_dask_array(lengths=True) y_true = y.squeeze() y_mean = y_true.mean(axis=0) residual_sss = ((y_true - y_pred)**2).sum(axis=0, dtype='float64') total_sss = ((y_true - y_mean)**2).sum(axis=0, dtype='float64') r2_score = da.mean(1 - (residual_sss / total_sss)) return r2_score.compute()
def score(self, X, y): """ Provide score by comparing predictions and ground truth. Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) Query test data. Acceptable formats: dask CuPy/NumPy/Numba Array y : array-like (device or host) shape = (n_samples, n_features) Outputs test data. Acceptable formats: dask CuPy/NumPy/Numba Array Returns ------- score """ labels, _, _ = self.predict(X, convert_dtype=True) diff = (labels == y) if self.data_handler.datatype == 'cupy': mean = da.mean(diff) return mean.compute() else: raise ValueError("Only Dask arrays are supported")
def score(self, X, y, convert_dtype=True):
    """
    Provide score by comparing predictions and ground truth. The process is
    done in a multi-node multi-GPU fashion.

    Parameters
    ----------
    X : array-like (device or host) shape = (n_samples, n_features)
        Query test data. Acceptable formats: dask CuPy/NumPy/Numba Array
    y : array-like (device or host) shape = (n_samples, n_features)
        Labels test data. Acceptable formats: dask CuPy/NumPy/Numba Array

    Returns
    -------
    score
    """
    if self.data_handler.datatype == 'cupy':
        preds, _, _ = self.predict(X, convert_dtype=convert_dtype)
        diff = (preds == y)
        mean = da.mean(diff)
        return mean.compute()
    else:
        raise ValueError("Only Dask arrays are supported")
def normalize(self, groupname1, groupname2):
    # ## normalize y ## #
    with h5py.File(self.OUTPATH, mode='r+') as f:
        for atom in self.MAINCHAIN:
            # load
            train_y = da.from_array(
                f[f'/{atom}/{groupname1}/{self.RESPONSE_NAME}'],
                chunks=("auto", 3))
            val_y = da.from_array(
                f[f'/{atom}/{groupname2}/{self.RESPONSE_NAME}'],
                chunks=("auto", 3))
            total_y = da.concatenate([train_y, val_y], axis=0)

            y_mean = da.mean(total_y.reshape(-1), axis=0).compute()
            y_std = da.std(total_y.reshape(-1), axis=0).compute()

            # normalize
            train_y = da.divide(da.subtract(train_y, y_mean), y_std)
            val_y = da.divide(da.subtract(val_y, y_mean), y_std)

            # save
            da.to_hdf5(self.OUTPATH,
                       f'/{atom}/{groupname1}/{self.RESPONSE_NAME}', train_y)
            da.to_hdf5(self.OUTPATH,
                       f'/{atom}/{groupname2}/{self.RESPONSE_NAME}', val_y)

            f.create_dataset(name=f'/{atom}/normalization',
                             data=np.array([y_mean, y_std]))

            print(f'[{atom}]\tmean: {y_mean:.3f}\tstd: {y_std:.3f}')
def load_data(statistic, axis):
    import dask.array as da
    import numpy as np
    from glue.utils import view_shape

    x = da.from_zarr('/mnt/cephfs/zarr_data_full')
    f = 1500
    scale = 2
    lh = []
    for k in range(scale):
        lc = []
        for i in range(scale):
            lr = []
            for j in range(scale):
                lr.append(x[f % 3500])
                f = f + 1
            lc.append(da.concatenate(lr))
        lh.append(da.concatenate(lc, 1))
    z = da.concatenate(lh, 2)

    if statistic == 'minimum':
        return da.min(z, axis).compute()
    elif statistic == 'maximum':
        return da.max(z, axis).compute()
    elif statistic == 'mean' or statistic == 'median':
        return da.mean(z, axis).compute()
    elif statistic == 'percentile':
        # `percentile` is expected to come from the enclosing scope in the original code
        return percentile / 100
    elif statistic == 'sum':
        return da.sum(z, axis).compute()
    return 0
def test_PowerMethod_project():
    N, P = 1000, 1000
    k = 10
    svd_array = da.random.random(size=(N, P)).persist()
    proj_array = da.random.random(size=(10, P)).persist()
    mu = da.mean(svd_array, axis=0).persist()
    std = da.diag(1 / da.std(svd_array, axis=0)).persist()

    for scale in [True, False]:
        for center in [True, False]:
            svd_array1 = svd_array
            proj_array1 = proj_array
            if center:
                svd_array1 = svd_array1 - mu
                proj_array1 = proj_array1 - mu
            if scale:
                svd_array1 = svd_array1.dot(std)
                proj_array1 = proj_array1.dot(std)

            U, S, V = da.linalg.svd(svd_array1)
            U_k, S_k, V_k = svd_to_trunc_svd(U, S, V, k=k)

            PM = PowerMethod(k=k, scale=scale, center=center, factor=None,
                             tol=1e-12)
            U_PM, S_PM, V_PM = PM.svd(array=svd_array)

            np.testing.assert_array_almost_equal(
                PM.project(proj_array, onto=V_k.T), proj_array1.dot(V_k.T))
def score(self, X, y): """ Provide score by comparing predictions and ground truth. Parameters ---------- X : array-like (device or host) shape = (n_samples, n_features) Query test data. Acceptable formats: dask CuPy/NumPy/Numba Array y : array-like (device or host) shape = (n_samples, n_features) Outputs test data. Acceptable formats: dask CuPy/NumPy/Numba Array Returns ------- score """ if self.data_handler.datatype == 'cupy': preds, _, _ = self.predict(X, convert_dtype=True) y_mean = y.mean(axis=0) residual_sss = ((y - preds)**2).sum(axis=0) total_sss = ((y - y_mean)**2).sum(axis=0) r2_score = da.mean(1 - (residual_sss / total_sss)) return r2_score.compute() else: raise ValueError("Only Dask arrays are supported")
def extract(self):
    df_path = pd.read_csv('path_to_file.csv', sep=';')
    df_path = df_path.rename(columns={'Unnamed: 0': 'id'})
    df_path = df_path.set_index('id')
    print(df_path)

    ds_batch = xr.open_mfdataset(df_path['path'], parallel=True)  # loading ncdf files
    print(ds_batch)
    print("--- Total size (GB):")
    print(ds_batch.nbytes * (2**-30))  # get size of the dataset in GB

    # getting average albedos over whole time period (used for maps and scatter plots)
    darr = ds_batch['QFLAG']  # getting data for specific band
    print(darr)

    #res = darr.mean(['lon','lat'])
    #res = da.count_nonzero( da.bitwise_and(darr//2**5, 1), ['lon','lat'])
    #res = (darr==32).sum(['lon','lat'])
    #res = xr.ufunc.bitwise_and(darr, 0b100000).sum(['lon','lat'])
    func = lambda x: np.bitwise_and(np.right_shift(x, 5), np.uint64(1))
    func = lambda x: np.bitwise_and(x, np.uint64(1))
    res = xr.apply_ufunc(func, darr,
                         input_core_dims=[['lon', 'lat']],
                         dask='parallelized', vectorize=True)
    #res = bitwise_and(np.right_shift(darr, 5), 1).sum(['lon','lat'])
    #res = (darr==32).max(['lon','lat'])
    print(np.array(res))
    sys.exit()

    da_count = ((da >> 5) & 1)

    # calculate mean over time
    #da_mean_lowres = da_mean.sel(lat=slice(70, 30)).sel(lon=slice(-25, 70))  # this can be used to zoom in over Europe
    da_mean_lowres = da_mean.isel(lat=slice(None, None, 10)).isel(
        lon=slice(None, None, 10))  # downsampling for faster plotting

    # getting average, min and max albedos for each time step (used to plot timeline)
    da_timeline_mean = da.mean(['lon', 'lat'])
    da_timeline_max = da.max(['lon', 'lat'])
    da_timeline_min = da.min(['lon', 'lat'])

    # closing arrays to free memory
    DS.close()
    da.close()
    da_mean.close()

    return da_mean_lowres, da_timeline_mean, da_timeline_max, da_timeline_min

    # NOTE: the calls below are unreachable after the return above
    da_mean_lowres.close()
    da_timeline_mean.close()
    da_timeline_max.close()
    da_timeline_min.close()
def test_func(default_val, dataset_flat, shape, dataset):
    shift_up = array.hstack([
        array.zeros((shape[0], 1, shape[2])), dataset[:, :-1, :]
    ]).transpose([1, 2, 0]).reshape([shape[1] * shape[2], -1])
    shift_up_mult = dataset_flat * shift_up
    del shift_up
    return array.mean(shift_up_mult, axis=1)
def test_make_snp_array_case_binom(shape, threshold):
    assume(shape[0] > 1 and shape[1] > 1)  # Assumes not degenerate 2d Array

    arr = da.random.random(size=shape)
    arr[arr > threshold] = float('nan')

    assume(da.mean(da.mean(da.isnan(arr), axis=0) < 1) == 1)
    # Asserts that every tested arr has at least 1 non-nan value in each column

    snp_array = utils.make_snp_array(arr, mean=True, std=True,
                                     std_method='binom', dtype='float')

    mean = snp_array.mean(axis=0)
    np.testing.assert_array_almost_equal(1 + mean, np.ones(shape[1]))
def test_0d_array():
    x = da.mean(da.ones(4, chunks=4), axis=0).compute()
    y = np.mean(np.ones(4))
    assert type(x) == type(y)
    x = da.sum(da.zeros(4, chunks=1)).compute()
    y = np.sum(np.zeros(4))
    assert type(x) == type(y)
def compute_importance_gbt(x, y, x_test, y_test):
    """Compute importance based on gradient boosted trees."""
    print("Computing importance based on gradient boosted trees ... ")
    num_factors = y.shape[1]
    #num_codes = x.shape[0]
    importance_matrix = list()
    train_loss = []
    test_loss = []
    for i in range(num_factors):
        model = GradientBoostingClassifier(verbose=1)
        model.fit(x, y[:, i])
        importance_matrix.append(np.abs(model.feature_importances_))
        train_loss.append(da.mean(model.predict(x) == y[:, i]))
        test_loss.append(da.mean(model.predict(x_test) == y_test[:, i]))
    return da.vstack(importance_matrix), np.mean(train_loss), np.mean(test_loss)
def ttest_1samp(a, popmean, axis=0, nan_policy="propagate"): if nan_policy != "propagate": raise NotImplementedError( "`nan_policy` other than 'propagate' have not been implemented.") n = a.shape[axis] df = n - 1 d = da.mean(a, axis) - popmean v = da.var(a, axis, ddof=1) denom = da.sqrt(v / float(n)) with np.errstate(divide="ignore", invalid="ignore"): t = da.divide(d, denom) t, prob = _ttest_finish(df, t) return delayed(Ttest_1sampResult, nout=2)(t, prob)
def ttest_1samp(a, popmean, axis=0, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")
    n = a.shape[axis]
    df = n - 1

    d = da.mean(a, axis) - popmean
    v = da.var(a, axis, ddof=1)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide='ignore', invalid='ignore'):
        t = da.divide(d, denom)
    t, prob = _ttest_finish(df, t)
    return delayed(Ttest_1sampResult, nout=2)(t, prob)
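# --- Hedged check (added; not part of the original snippet) ---
# The statistic above is t = (mean(a) - popmean) / sqrt(var(a, ddof=1) / n);
# dask.array.stats.ttest_1samp should agree with scipy.stats.ttest_1samp on the
# materialized data.
import numpy as np
import dask.array as da
import dask.array.stats
import scipy.stats

x = np.random.RandomState(42).normal(0.5, 1.0, 200)
t_dask, p_dask = dask.array.stats.ttest_1samp(da.from_array(x, chunks=50), 0.0).compute()
t_np, p_np = scipy.stats.ttest_1samp(x, 0.0)
np.testing.assert_allclose(t_dask, t_np)
np.testing.assert_allclose(p_dask, p_np)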
def scale_drop(Xmat, h5write):
    xmeans = da.mean(Xmat, axis=0)
    print("Centering X columns")
    Xmat = Xmat - xmeans
    xnorms = da.linalg.norm(Xmat, axis=0)
    xnorms, xmeans = da.compute(xnorms, xmeans)
    keepcols = np.arange(Xmat.shape[1])[np.nonzero(xnorms)]
    dropcols = np.arange(Xmat.shape[1])[xnorms == 0]
    print("Dropping column with norm zero:")
    xcolnames = []
    for colname in xinfo['xcolnames']:
        xcolnames.append(colname)
        for dropix in dropcols:
            if xinfo['xcolnames'][colname] == dropix:
                print(colname)
    Xmat = Xmat[:, keepcols]
    xnorms = xnorms[keepcols]
    xmeans = xmeans[keepcols]
    xcolnames_keep = np.array(xcolnames)[keepcols]
    print("Standardizing X columns")
    Xmat = Xmat / xnorms
    tol = 1e-8
    ## IF using scipy QR
    #Qx, Rx, Px = scipy.linalg.qr(Xmat, mode='economic', pivoting=True)
    #dropcols_qr = Px[np.nonzero(abs(np.diag(Rx))<tol)]
    #keepcols_qr = Px[np.nonzero(abs(np.diag(Rx))>=tol)]
    #rank = np.sum(abs(np.diag(Rx)) >= tol)
    ## USING BLOCKED QR
    Qx, Rx, PImat = bk.tsqr_pivot_seq(Xmat)
    #Rx = Rx[0:rank, 0:rank]
    #Qx = Qx[:, 0:rank]
    keepcols_qr = np.argmax(PImat, axis=0)
    dropmask = np.ones(Xmat.shape[1], dtype=bool)
    dropmask[keepcols_qr] = False
    dropcols_qr = np.arange(Xmat.shape[1])[dropmask]
    rank = keepcols_qr.shape[0]
    print("Dropping columns based on pivoted QR:")
    print("\t" + "\n\t".join(xcolnames_keep[dropcols_qr]))
    xnorms = xnorms[keepcols_qr]
    xmeans = xmeans[keepcols_qr]
    Xmat = Xmat[:, keepcols_qr]
    xcolnames_keep = xcolnames_keep[keepcols_qr]
    #keepcols_store = h5write.create_array(h5write.root, 'keepcols', keepcols)
    #cols_orig_store = h5write.create_array(h5write.root, 'xcolnames_all', xcolnames)
    #cols_keep_store = h5write.create_array(h5write.root, 'xcolnames_keep', xcolnames_keep)
    #da.store([xcolnames, xcolnames_keep], [cols_orig_store, cols_keep_store])
    return Xmat, Qx, Rx
def fit(self, X, y):
    # CHECKING THE TYPES
    if isinstance(X, dask.array.Array):
        import dask.array as numerical_module
        from dask.array.linalg import cholesky, inv
    else:
        import numpy as numerical_module
        from scipy.linalg import cholesky, inv

    possible_labels = set(y)
    y_ = numerical_module.array(y)
    n_classes = len(possible_labels)

    # 1. compute the means for each label
    mu_l = numerical_module.array(
        [
            numerical_module.mean(
                X[numerical_module.where(y_ == label)[0]], axis=0
            )
            for label in possible_labels
        ]
    )

    # 2. Compute Sw
    Sw = numerical_module.zeros((X.shape[1], X.shape[1]), dtype=float)

    for label in possible_labels:
        indexes = numerical_module.where(y_ == label)[0]
        X_l_mu_l = X[indexes] - mu_l[label]
        Sw += X_l_mu_l.T @ X_l_mu_l

    # 3. Compute inv
    scaled_Sw = (1 / n_classes) * Sw
    inv_scaled_Sw = pinv(scaled_Sw) if self.pinv else inv(scaled_Sw)

    # 3. Computes the Cholesky decomposition
    self.weights = cholesky(
        inv_scaled_Sw, lower=True
    )  # Setting lower true to have the same implementation as in the previous code
    self.input_subtract = 0
    self.input_divide = 1.0

    return self
def mean(self, axis=None, dtype=None, keepdims=False, split_every=None,
         out=None) -> da.core.Array:
    if out is not None:
        raise NotImplementedError(
            f'`out` argument is not supported for {StackedArray.__name__}')
    means = (da.mean(array, axis=axis, dtype=dtype, keepdims=keepdims,
                     split_every=split_every, out=None)
             for array in expand_arrays(self.arrays))
    return self.reduce(means, da.add)
def test_make_snp_array_case_normal(shape, threshold):
    assume(shape[0] > 1 and shape[1] > 1)  # Assumes not degenerate 2d Array

    arr = da.random.random(size=shape)
    arr[arr > threshold] = float('nan')

    assume(da.mean(da.nanstd(arr, axis=0) > 0) == 1)
    # Asserts that every tested arr has a non-zero std for each column

    snp_array = utils.make_snp_array(arr, mean=True, std=True,
                                     std_method='norm', dtype='float')

    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0),
                                         np.ones(shape[1]))
def const_features_for_single_grid_single_file(grid_indx, wind_grid_indx, data):
    client = Client()
    dims = data['no2'].shape
    ntime = dims[0] - 1
    nvel = dims[2]
    data_dict = dict()

    data_hours = da.array(data['hour'][1:])
    data_dict['hour'] = da.repeat(data_hours[:, :], nvel, axis=1)

    data_dict['date'] = da.zeros((ntime, nvel)) + da.mean(data['date'][:])
    data_dict['date'] = data_dict['date']

    cum_ic_flash = da.array(data['IC_FLASHCOUNT'][:, grid_indx, :])
    cum_cg_flash = da.array(data['CG_FLASHCOUNT'][:, grid_indx, :])
    data_dict['IC_FLASHCOUNT'] = da.repeat(cum_ic_flash[1:, :] - cum_ic_flash[:-1, :],
                                           nvel, axis=1)
    data_dict['CG_FLASHCOUNT'] = da.repeat(cum_cg_flash[1:, :] - cum_cg_flash[:-1, :],
                                           nvel, axis=1)

    e_no_lower = da.array(data['E_NO'])[1:, grid_indx, :]
    e_no_upper = da.zeros((ntime, nvel - e_no_lower.shape[1]))
    data_dict['E_NO'] = da.concatenate([e_no_lower, e_no_upper], axis=1)

    data_dict['U'] = (data['U'][1:, wind_grid_indx[0][0], :]
                      + data['U'][1:, wind_grid_indx[0][1], :]) / 2
    data_dict['V'] = (data['V'][1:, wind_grid_indx[1][0], :]
                      + data['V'][1:, wind_grid_indx[1][1], :]) / 2

    match_vars = ['no2', 'pres', 'temp', 'CLDFRA']
    print('Variables read directly from wrf: {}'.format(match_vars[:]))
    for var in match_vars:
        data_dict[var] = da.array(data[var])[1:, grid_indx, :]

    reduce_dim_vars = ['elev', 'W']
    print('Variables average vertically: {}'.format(reduce_dim_vars[:]))
    for var in reduce_dim_vars:
        this_value = da.array(data[var])[1:, grid_indx, :]
        data_dict[var] = (this_value[:, 1:] + this_value[:, :-1]) / 2

    add_dim_vars = ['COSZEN', 'PBLH', 'LAI', 'HGT', 'SWDOWN', 'GLW']
    print('Variables add vertical layers: {}'.format(add_dim_vars[:]))
    for var in add_dim_vars:
        this_value = da.array(data[var])[1:, grid_indx, :]
        data_dict[var] = da.repeat(this_value, nvel, axis=1)

    print('Key of dict:{}'.format(data_dict.keys()))
    save_arr = []
    for var in data_dict.keys():
        data_dict[var] = data_dict[var].flatten()
        save_arr.append(data_dict[var])
    save_arr = da.array(save_arr).compute()
    return save_arr
def ttest_rel(a, b, axis=0, nan_policy="propagate"): if nan_policy != "propagate": raise NotImplementedError( "`nan_policy` other than 'propagate' have not been implemented.") n = a.shape[axis] df = float(n - 1) d = (a - b).astype(np.float64) v = da.var(d, axis, ddof=1) dm = da.mean(d, axis) denom = da.sqrt(v / float(n)) with np.errstate(divide="ignore", invalid="ignore"): t = da.divide(dm, denom) t, prob = _ttest_finish(df, t) return delayed(Ttest_relResult, nout=2)(t, prob)
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, blockshape=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
def ttest_rel(a, b, axis=0, nan_policy='propagate'):
    if nan_policy != 'propagate':
        raise NotImplementedError("`nan_policy` other than 'propagate' "
                                  "have not been implemented.")
    n = a.shape[axis]
    df = float(n - 1)

    d = (a - b).astype(np.float64)
    v = da.var(d, axis, ddof=1)
    dm = da.mean(d, axis)
    denom = da.sqrt(v / float(n))

    with np.errstate(divide='ignore', invalid='ignore'):
        t = da.divide(dm, denom)
    t, prob = _ttest_finish(df, t)

    return delayed(Ttest_relResult, nout=2)(t, prob)
def test_make_snp_array_case_normal(shape, max_value, mask_nans):
    assume(shape[0] > 1 and shape[1] > 1)  # Assumes not degenerate 2d Array

    arr = da.random.randint(0, max_value, size=shape)
    if mask_nans:
        arr[arr == max_value - 1] = float('nan')

    assume(da.mean(da.nanstd(arr, axis=0) > 0) == 1)
    # Asserts that every tested arr has a non-zero std for each column

    snp_array = utils.make_snp_array(arr, mean=True, std=True,
                                     std_method='norm', mask_nan=mask_nans,
                                     dtype='int8')

    np.testing.assert_array_almost_equal(1 + snp_array.mean(axis=0),
                                         np.ones(shape[1]))
def fit(
    self,
    X: ArrayLike,
    y: Optional[ArrayLike] = None,
) -> "PattersonScaler":
    """Fit scaler parameters

    Parameters
    ----------
    X : (samples, variants) array_like
        Alternate allele counts with missing values encoded as either nan
        or negative numbers.
    """
    X = da.ma.masked_array(X, mask=da.isnan(X) | (X < 0))
    self.mean_ = da.ma.filled(da.mean(X, axis=0), fill_value=np.nan)
    p = self.mean_ / self.ploidy
    self.scale_ = da.sqrt(p * (1 - p))
    self.n_features_in_ = X.shape[1]
    return self
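# --- Hedged sketch (added; assumes ploidy 2, not taken from the scaler's source) ---
# For a (samples, variants) matrix of alternate allele counts, the fitted
# parameters above reduce to the per-variant mean and sqrt(p * (1 - p)) with
# p = mean / ploidy, treating NaN or negative entries as missing.
import numpy as np
import dask.array as da

gt = da.from_array(np.array([[0., 1., 2.],
                             [1., 1., 0.],
                             [2., -1., 1.]]), chunks=(3, 3))
masked = da.ma.masked_array(gt, mask=da.isnan(gt) | (gt < 0))
mean_ = da.ma.filled(da.mean(masked, axis=0), fill_value=np.nan)
p = mean_ / 2.0                      # ploidy = 2
scale_ = da.sqrt(p * (1 - p))
print(mean_.compute(), scale_.compute())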
def __call__(self, datasets, **info):
    """Create HNCC DNB composite."""
    if len(datasets) != 4:
        raise ValueError("Expected 4 datasets, got %d" % (len(datasets),))

    dnb_data = datasets[0]
    sza_data = datasets[1]
    lza_data = datasets[2]
    # this algorithm assumes units of "W cm-2 sr-1" so if there are other
    # units we need to adjust for that
    if dnb_data.attrs.get("units", "W m-2 sr-1") == "W m-2 sr-1":
        unit_factor = 10000.
    else:
        unit_factor = 1.

    mda = dnb_data.attrs.copy()
    dnb_data = dnb_data.copy() / unit_factor

    # convert to decimal instead of %
    moon_illum_fraction = da.mean(datasets[3].data) * 0.01

    phi = da.rad2deg(da.arccos(2. * moon_illum_fraction - 1))

    vfl = 0.026 * phi + 4.0e-9 * (phi**4.)

    m_fullmoon = -12.74
    m_sun = -26.74
    m_moon = vfl + m_fullmoon

    gs_ = self.gain_factor(sza_data.data)

    r_sun_moon = 10.**((m_sun - m_moon) / -2.5)
    gl_ = r_sun_moon * self.gain_factor(lza_data.data)
    gtot = 1. / (1. / gs_ + 1. / gl_)

    dnb_data += 2.6e-10
    dnb_data *= gtot

    mda['name'] = self.attrs['name']
    mda['standard_name'] = 'ncc_radiance'

    dnb_data.attrs = mda
    return dnb_data
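# --- Hedged numeric check (added; not part of the original compositor code) ---
# At 100% lunar illumination the phase angle phi collapses to zero, so the
# estimated lunar magnitude reduces to the full-moon value of -12.74.
import numpy as np

moon_illum_fraction = 1.0
phi = np.rad2deg(np.arccos(2. * moon_illum_fraction - 1))  # 0 degrees
vfl = 0.026 * phi + 4.0e-9 * (phi ** 4.)                   # 0
m_moon = vfl + (-12.74)
assert np.isclose(m_moon, -12.74)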
def score(self, X, y=None): """Return the average log-likelihood of all samples. See. "Pattern Recognition and Machine Learning" by C. Bishop, 12.2.1 p. 574 or http://www.miketipping.com/papers/met-mppca.pdf Parameters ---------- X : array, shape(n_samples, n_features) The data. y : Ignored Returns ------- ll : float Average log-likelihood of the samples under the current model """ return da.mean(self.score_samples(X))
def cluster_centroids(data, clusters, k=None):
    """Return centroids of clusters in data.

    data is an array of observations with shape (A, B, ...).

    clusters is an array of integers of shape (A,) giving the index
    (from 0 to k-1) of the cluster to which each observation belongs.
    The clusters must all be non-empty.

    k is the number of clusters. If omitted, it is deduced from the
    values in the clusters array.

    The result is an array of shape (k, B, ...) containing the
    centroid of each cluster.

    >>> data = np.array([[12, 10, 87],
    ...                  [ 2, 12, 33],
    ...                  [68, 31, 32],
    ...                  [88, 13, 66],
    ...                  [79, 40, 89],
    ...                  [ 1, 77, 12]])
    >>> cluster_centroids(data, np.array([1, 1, 2, 2, 0, 1]))
    array([[ 79.,  40.,  89.],
           [  5.,  33.,  44.],
           [ 78.,  22.,  49.]])
    """
    if k is None:
        k = (da.max(clusters)).compute() + 1
    result = [
        da.mean(data[clusters.compute() == i], axis=0) for i in range(k)
    ]
    return da.reshape(da.concatenate(result, axis=0),
                      shape=(k,) + data.shape[1:])
def test_ScaledArray_fromArrayMoment_array():
    N1, P = 7, 10
    N2 = 5
    array1 = da.random.random(size=(N1, P)).persist()
    mu = da.mean(array1, axis=0)
    std = da.diag(1 / da.std(array1, axis=0))

    array2 = da.random.random(size=(N2, P)).persist()
    for scale in [True, False]:
        for center in [True, False]:
            for factor1 in [None, 'n', 'p']:
                sa1 = ScaledCenterArray(scale=scale, center=center,
                                        factor=factor1)
                sa1.fit(array1)

                for factor2, factor_value in zip([None, 'n', 'p'],
                                                 [1, N2, P]):
                    sa2 = ScaledCenterArray.fromScaledArray(
                        array=array2, scaled_array=sa1, factor=factor2)

                    sa2_array = array2
                    if center:
                        sa2_array = sa2_array - mu
                    if scale:
                        sa2_array = sa2_array.dot(std)

                    np.testing.assert_array_almost_equal(sa2.array, sa2_array)
def test_xarray():
    y = da.mean(xr.DataArray([1, 2, 3.0]))
    assert_eq(y, 2.0)  # mean of [1, 2, 3.0]; comparing y against itself would be vacuous