def normalize(self, groupname1, groupname2):
    # ## normalize y ## #
    with h5py.File(self.OUTPATH, mode='r+') as f:
        for atom in self.MAINCHAIN:
            # load
            train_y = da.from_array(
                f[f'/{atom}/{groupname1}/{self.RESPONSE_NAME}'],
                chunks=("auto", 3))
            val_y = da.from_array(
                f[f'/{atom}/{groupname2}/{self.RESPONSE_NAME}'],
                chunks=("auto", 3))
            total_y = da.concatenate([train_y, val_y], axis=0)

            y_mean = da.mean(total_y.reshape(-1), axis=0).compute()
            y_std = da.std(total_y.reshape(-1), axis=0).compute()

            # normalize
            train_y = da.divide(da.subtract(train_y, y_mean), y_std)
            val_y = da.divide(da.subtract(val_y, y_mean), y_std)

            # save
            da.to_hdf5(self.OUTPATH,
                       f'/{atom}/{groupname1}/{self.RESPONSE_NAME}', train_y)
            da.to_hdf5(self.OUTPATH,
                       f'/{atom}/{groupname2}/{self.RESPONSE_NAME}', val_y)

            f.create_dataset(name=f'/{atom}/normalization',
                             data=np.array([y_mean, y_std]))

            print(f'[{atom}]\tmean: {y_mean:.3f}\tstd: {y_std:.3f}')
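# The method above pools the train and validation responses so both splits
# share a single mean/std. A minimal, self-contained sketch of that pattern
# with synthetic arrays (no HDF5 or class plumbing; shapes are illustrative):

import dask.array as da

train_y = da.random.random((1000, 3), chunks=(250, 3))
val_y = da.random.random((200, 3), chunks=(200, 3))

# statistics over the pooled data, so both splits get the same scaling
total_y = da.concatenate([train_y, val_y], axis=0)
y_mean = da.mean(total_y.reshape(-1), axis=0).compute()
y_std = da.std(total_y.reshape(-1), axis=0).compute()

train_y = (train_y - y_mean) / y_std
val_y = (val_y - y_mean) / y_std

# the pooled, normalized data should now have mean ~0 and std ~1
print(da.concatenate([train_y, val_y], axis=0).mean().compute())
print(da.concatenate([train_y, val_y], axis=0).std().compute())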
def score_gene_sets(ds, gs, z_score_ds=True, use_dask=False):
    if use_dask:
        import dask.array as np
    else:
        import numpy as np
    # gene sets has genes on rows, sets on columns
    # ds has cells on rows, genes on columns
    gs_x = gs.x
    ds_x = ds.x
    if z_score_ds:
        ds_x = ds_x.toarray() if scipy.sparse.isspmatrix(ds_x) else ds_x
    # keep genes that are in gene sets and have standard deviation > 0
    gene_indices = (gs_x.sum(axis=1) > 0) & (ds_x.std(axis=0) > 0)
    gs_x = gs_x[gene_indices]
    ds_x = ds_x[:, gene_indices]
    if z_score_ds:
        ds_x = ds_x.toarray() if scipy.sparse.isspmatrix(ds_x) else ds_x
        std = np.std(ds_x, axis=0)
        mean = np.mean(ds_x, axis=0)
        ds_x = (ds_x - mean) / std
        ds_x[ds_x < -5] = -5
        ds_x[ds_x > 5] = 5
        # NaN != NaN, so the original `ds_x[ds_x == np.nan] = 0` never
        # matched anything; isnan is the correct test
        ds_x[np.isnan(ds_x)] = 0
    scores = ds_x.dot(gs_x)
    ngenes_in_set = gs_x.sum(axis=0)
    ngenes_in_set[ngenes_in_set == 0] = 1  # avoid divide by zero
    scores = scores / ngenes_in_set
    # scores contains cells on rows, gene sets on columns
    return wot.Dataset(x=scores, row_meta=ds.row_meta, col_meta=gs.col_meta)
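# Two details in the z-scoring above are easy to get wrong: zero-variance
# columns produce NaN on division, and `x == np.nan` is always False because
# NaN compares unequal to everything, including itself. A standalone sketch
# with a deliberately constant column (data is synthetic):

import numpy as np

x = np.array([[1.0, 2.0], [3.0, 2.0], [5.0, 2.0]])  # second column is constant

with np.errstate(divide='ignore', invalid='ignore'):
    z = (x - np.mean(x, axis=0)) / np.std(x, axis=0)  # constant column -> NaN

z = np.clip(z, -5, 5)   # same effect as the paired comparison assignments
z[np.isnan(z)] = 0      # this is what `z[z == np.nan] = 0` fails to do
print(z)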
def test_PowerMethod_project():
    N, P = 1000, 1000
    k = 10
    svd_array = da.random.random(size=(N, P)).persist()
    proj_array = da.random.random(size=(10, P)).persist()
    mu = da.mean(svd_array, axis=0).persist()
    std = da.diag(1 / da.std(svd_array, axis=0)).persist()

    for scale in [True, False]:
        for center in [True, False]:
            svd_array1 = svd_array
            proj_array1 = proj_array
            if center:
                svd_array1 = svd_array1 - mu
                proj_array1 = proj_array1 - mu
            if scale:
                svd_array1 = svd_array1.dot(std)
                proj_array1 = proj_array1.dot(std)

            U, S, V = da.linalg.svd(svd_array1)
            U_k, S_k, V_k = svd_to_trunc_svd(U, S, V, k=k)

            PM = PowerMethod(k=k, scale=scale, center=center,
                             factor=None, tol=1e-12)
            U_PM, S_PM, V_PM = PM.svd(array=svd_array)

            np.testing.assert_array_almost_equal(
                PM.project(proj_array, onto=V_k.T),
                proj_array1.dot(V_k.T))
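# If dask's SVD conventions are unfamiliar: `da.linalg.svd` expects an array
# chunked along one dimension only, and returns `v` already transposed, so its
# rows are right singular vectors. Projecting new rows onto the top-k
# components is then a dot with `v[:k].T`. A self-contained sketch (shapes
# and names are illustrative):

import dask.array as da

X = da.random.random((100, 20), chunks=(50, 20))     # tall-and-skinny
new_rows = da.random.random((5, 20), chunks=(5, 20))

U, S, V = da.linalg.svd(X)  # V has shape (20, 20)
k = 3
V_k = V[:k]                 # top-k right singular vectors

projected = new_rows.dot(V_k.T)
print(projected.compute().shape)  # (5, 3)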
def test_make_regression(n_samples, n_features, n_informative,
                         n_targets, bias, effective_rank,
                         tail_strength, noise, shuffle, coef,
                         random_state, n_parts, cluster):
    c = Client(cluster)
    try:
        from cuml.dask.datasets import make_regression

        result = make_regression(n_samples=n_samples, n_features=n_features,
                                 n_informative=n_informative,
                                 n_targets=n_targets, bias=bias,
                                 effective_rank=effective_rank, noise=noise,
                                 shuffle=shuffle, coef=coef,
                                 random_state=random_state, n_parts=n_parts)

        if coef:
            out, values, coefs = result
        else:
            out, values = result

        assert out.shape == (n_samples, n_features), "out shape mismatch"

        if n_targets > 1:
            assert values.shape == (n_samples, n_targets), \
                "values shape mismatch"
        else:
            assert values.shape == (n_samples,), "values shape mismatch"

        assert len(out.chunks[0]) == n_parts
        assert len(out.chunks[1]) == 1

        if coef:
            if n_targets > 1:
                assert coefs.shape == (n_features, n_targets), \
                    "coefs shape mismatch"
                assert len(coefs.chunks[1]) == 1
            else:
                assert coefs.shape == (n_features,), "coefs shape mismatch"
                assert len(coefs.chunks[0]) == 1

            test1 = da.all(da.sum(coefs != 0.0, axis=0) == n_informative)
            std_test2 = da.std(values - (da.dot(out, coefs) + bias), axis=0)

            test1, std_test2 = da.compute(test1, std_test2)

            diff = cp.abs(1.0 - std_test2)
            test2 = cp.all(diff < 1.5 * 10**(-1.))

            assert test1, "Unexpected number of informative features"
            assert test2, "Unexpectedly incongruent outputs"
    finally:
        c.close()
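# The closing assertions rest on a simple identity: the residual
# `values - (out . coefs + bias)` is the injected Gaussian noise, so its
# per-target standard deviation should be close to `noise`. A CPU-only
# analogue using scikit-learn's make_regression (no Dask cluster or GPUs;
# the parameter values here are illustrative, not the test's parametrization):

import numpy as np
from sklearn.datasets import make_regression

bias = -3.0
out, values, coefs = make_regression(n_samples=2000, n_features=10,
                                     n_informative=5, bias=bias, noise=1.0,
                                     coef=True, random_state=0)

# exactly n_informative ground-truth coefficients should be non-zero
assert (coefs != 0.0).sum() == 5

# the residual is the injected noise, so its std should be ~noise (= 1.0)
residual_std = np.std(values - (out.dot(coefs) + bias), axis=0)
assert abs(1.0 - residual_std) < 1.5e-1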
def statistics(self, data, pca_stats=None):
    # set headers
    if pca_stats:  # for pca
        if pca_stats["eigenvals"] is not None:
            self.stats_header.setText("Eigenvalue: {} ({}%)".format(
                round(pca_stats["eigenvals"][self.pc_id - 1], 2),
                round(pca_stats["eigenvals_%"][self.pc_id - 1], 2)))
            self.stats_header.setToolTip(
                "Shows the dispersion of the data with respect to its component")
        else:
            self.stats_header.setText("Eigenvalue: --")
            self.stats_header.setToolTip(
                "Only available when the components are computed with the plugin")
    else:  # for aoi
        self.stats_header.setText("Pixels in AOI: {}".format(
            round(data.size if data.size > 1 else 0, 2)))
        self.stats_header.setToolTip("")

    # restore or compute the statistics
    if self.QCBox_StatsLayer.currentText() == self.pc_name \
            and self.stats_pc is not None:
        min, max, std, p25, p50, p75 = self.stats_pc
    else:
        da_data = da.from_array(data, chunks=(8000000,))
        min = da.min(da_data).compute()
        max = da.max(da_data).compute()
        std = da.std(da_data).compute()
        p25 = da.percentile(da_data, 25).compute()[0]
        p50 = da.percentile(da_data, 50).compute()[0]
        p75 = da.percentile(da_data, 75).compute()[0]
        if self.QCBox_StatsLayer.currentText() == self.pc_name:
            self.stats_pc = (min, max, std, p25, p50, p75)

    # set in dialog
    self.stats_min.setText(str(round(min, 2)))
    self.stats_max.setText(str(round(max, 2)))
    self.stats_std.setText(str(round(std, 2)))
    self.stats_p25.setText(str(round(p25, 2)))
    self.stats_p50.setText(str(round(p50, 2)))
    self.stats_p75.setText(str(round(p75, 2)))
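# A standalone sketch of the same Dask statistics block with synthetic data.
# Note that `da.percentile` operates on 1-D arrays and returns an array of
# results, hence the trailing [0] above:

import numpy as np
import dask.array as da

data = np.random.RandomState(0).normal(size=1_000_000)
da_data = da.from_array(data, chunks=(8_000_000,))

print(da.min(da_data).compute(), da.max(da_data).compute())
print(da.std(da_data).compute())
print(da.percentile(da_data, 50).compute()[0])

# Each .compute() above walks the task graph separately; passing all the
# reductions to a single da.compute(...) call, as in some of the other
# snippets here, evaluates them in one pass over the data.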
def test_ScaledArray_fromArrayMoment_array():
    N1, P = 7, 10
    N2 = 5
    array1 = da.random.random(size=(N1, P)).persist()
    mu = da.mean(array1, axis=0)
    std = da.diag(1 / da.std(array1, axis=0))
    array2 = da.random.random(size=(N2, P)).persist()

    for scale in [True, False]:
        for center in [True, False]:
            for factor1 in [None, 'n', 'p']:
                sa1 = ScaledCenterArray(scale=scale, center=center,
                                        factor=factor1)
                sa1.fit(array1)
                for factor2, factor_value in zip([None, 'n', 'p'],
                                                 [1, N2, P]):
                    sa2 = ScaledCenterArray.fromScaledArray(
                        array=array2, scaled_array=sa1, factor=factor2)
                    sa2_array = array2
                    if center:
                        sa2_array = sa2_array - mu
                    if scale:
                        sa2_array = sa2_array.dot(std)
                    np.testing.assert_array_almost_equal(sa2.array, sa2_array)
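# Both this test and test_PowerMethod_project build the scaling as a dot with
# `da.diag(1 / std)`. Right-multiplying by that diagonal matrix divides each
# column by its standard deviation, so it is equivalent to elementwise
# division by the std row vector. A quick sketch of that equivalence:

import numpy as np
import dask.array as da

X = da.random.random((8, 4))
mu = da.mean(X, axis=0)
col_std = da.std(X, axis=0)

scaled_dot = (X - mu).dot(da.diag(1 / col_std)).compute()
scaled_div = ((X - mu) / col_std).compute()
np.testing.assert_array_almost_equal(scaled_dot, scaled_div)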
def test_make_regression(n_samples, n_features, n_informative,
                         n_targets, bias, effective_rank,
                         tail_strength, noise, shuffle, coef,
                         n_parts, order, use_full_low_rank, client):
    c = client

    from cuml.dask.datasets import make_regression

    result = make_regression(n_samples=n_samples, n_features=n_features,
                             n_informative=n_informative,
                             n_targets=n_targets, bias=bias,
                             effective_rank=effective_rank, noise=noise,
                             shuffle=shuffle, coef=coef, n_parts=n_parts,
                             use_full_low_rank=use_full_low_rank,
                             order=order)

    if coef:
        out, values, coefs = result
    else:
        out, values = result

    assert out.shape == (n_samples, n_features), "out shape mismatch"

    if n_targets > 1:
        assert values.shape == (n_samples, n_targets), \
            "values shape mismatch"
    else:
        assert values.shape == (n_samples,), "values shape mismatch"

    assert len(out.chunks[0]) == n_parts
    assert len(out.chunks[1]) == 1

    if coef:
        if n_targets > 1:
            assert coefs.shape == (n_features, n_targets), \
                "coefs shape mismatch"
            assert len(coefs.chunks[1]) == 1
        else:
            assert coefs.shape == (n_features,), "coefs shape mismatch"
            assert len(coefs.chunks[0]) == 1

        test1 = da.all(da.sum(coefs != 0.0, axis=0) == n_informative)
        std_test2 = da.std(values - (da.dot(out, coefs) + bias), axis=0)

        test1, std_test2 = da.compute(test1, std_test2)

        diff = cp.abs(1.0 - std_test2)
        test2 = cp.all(diff < 1.5 * 10**(-1.))

        assert test1, "Unexpected number of informative features"
        assert test2, "Unexpectedly incongruent outputs"

    data_ddh = DistributedDataHandler.create(data=(out, values), client=c)
    out_part, value_part = data_ddh.gpu_futures[0][1].result()

    if coef:
        coefs_ddh = DistributedDataHandler.create(data=coefs, client=c)
        coefs_part = coefs_ddh.gpu_futures[0][1].result()

    if order == 'F':
        assert out_part.flags['F_CONTIGUOUS']
        if n_targets > 1:
            assert value_part.flags['F_CONTIGUOUS']
        if coef:
            assert coefs_part.flags['F_CONTIGUOUS']
    elif order == 'C':
        assert out_part.flags['C_CONTIGUOUS']
        if n_targets > 1:
            assert value_part.flags['C_CONTIGUOUS']
        if coef:
            assert coefs_part.flags['C_CONTIGUOUS']
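# What distinguishes this variant from the earlier test is the memory-layout
# check via the `flags` attribute. The same flags exist on NumPy arrays, and
# the 1-D case explains why `values` is only checked when n_targets > 1:

import numpy as np

a = np.ones((4, 3), order='F')
assert a.flags['F_CONTIGUOUS'] and not a.flags['C_CONTIGUOUS']

b = np.ascontiguousarray(a)  # C-ordered copy of the same data
assert b.flags['C_CONTIGUOUS']

# a 1-D array is both C- and F-contiguous, so an order assertion on it
# would be vacuous
c = np.ones(5)
assert c.flags['C_CONTIGUOUS'] and c.flags['F_CONTIGUOUS']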
def __call__(self, tag='', reprocess=False):
    super().__call__(tag='tensor', reprocess=reprocess)

    if self.cache is None:
        ds = self.parent
        ds_new = xr.Dataset(coords=ds.coords, attrs=ds.attrs)

        print('Calculating tensor')
        # j = jacobianConv(ds.U, ds.V, ds.W, dx, dy, dz, sigma=1.5)
        j = jacobian(ds.U.data, ds.V.data, ds.W.data,
                     ds.attrs['piv_step_frame'],
                     ds.attrs['dx'], ds.attrs['dy'], ds.attrs['dz'])
        j = j.compute()

        ds_new['jacobian'] = (['time', 'x', 'y', 'z', 'comp', 'dims'], j)
        ds_new['jacobianNorm'] = da.sqrt(
            (ds_new['jacobian']**2.).sum(dim=['comp', 'dims']))

        jT = ds_new.jacobian.transpose('time', 'x', 'y', 'z', 'dims', 'comp')

        ds_new['strainTensor'] = (ds_new.jacobian + jT) / 2.
        ds_new['vorticityTensor'] = (ds_new.jacobian - jT) / 2.

        ds_new['strainTensorNorm'] = da.sqrt(
            (ds_new.strainTensor**2.).sum(dim=['comp', 'dims']))
        ds_new['vorticityTensorNorm'] = da.sqrt(
            (ds_new.vorticityTensor**2.).sum(dim=['comp', 'dims']))

        ds_new['dudx'] = (['time', 'x', 'y', 'z'], j[:, :, :, :, 0, 0])
        ds_new['dvdy'] = (['time', 'x', 'y', 'z'], j[:, :, :, :, 1, 1])
        ds_new['dwdz'] = (['time', 'x', 'y', 'z'], j[:, :, :, :, 2, 2])

        ds_new['divergence'] = ds_new['dudx'] + ds_new['dvdy'] + ds_new['dwdz']
        print(ds_new['divergence'])

        ds_new['vorticity'] = (
            ['time', 'x', 'y', 'z', 'comp'],
            da.stack((ds_new['vorticityTensor'][:, :, :, :, 2, 1],
                      ds_new['vorticityTensor'][:, :, :, :, 0, 2],
                      ds_new['vorticityTensor'][:, :, :, :, 1, 0]),
                     axis=-1))

        ds_new['divNorm'] = ds_new['divergence'] / ds_new['jacobianNorm']
        ds_new['divNorm_mean'] = da.mean(ds_new['divNorm'])
        ds_new['divNorm_std'] = da.std(ds_new['divNorm'])

        delta = (ds.attrs['dx'] * ds.attrs['dy'] * ds.attrs['dz'])**(1. / 3.)
        dpx = (ds.attrs['pdx'] * ds.attrs['pdy'] * ds.attrs['pdz'])**(1. / 3.)
        delta_px = delta / dpx
        dt = ds.attrs['piv_step_ensemble']

        ds_new['divRMS'] = da.mean((ds_new['divergence'] * dt)**2.)**0.5
        ds_new['velocityError'] = ds_new['divRMS'] / (
            (3. / (2. * delta_px**2.))**0.5)

        print(da.percentile(ds_new['vorticityTensorNorm'].data.ravel(), 99.))
        print(ds_new['divRMS'])
        print(ds_new['divNorm_mean'])

        ds_new['vorticityError'] = ds_new['divRMS'] / dt / da.percentile(
            ds_new['vorticityTensorNorm'].data.ravel(), 99.)

        print('saving')
        self.cache = ds_new
        self.to_file()

    return self.cache
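# A minimal sketch of the decomposition used above: splitting a velocity
# gradient tensor into its symmetric (strain) and antisymmetric (vorticity)
# parts, reading the vorticity vector off the antisymmetric entries, and
# taking the divergence as the trace. The field here is synthetic and the
# axis layout (time, x, y, z, comp, dims) mirrors the dataset above:

import dask.array as da

j = da.random.random((2, 4, 4, 4, 3, 3), chunks=(1, 4, 4, 4, 3, 3))
jT = da.swapaxes(j, -1, -2)          # transpose comp <-> dims

strain = (j + jT) / 2.0              # symmetric part
vorticity_tensor = (j - jT) / 2.0    # antisymmetric part

# vorticity vector from the off-diagonal antisymmetric entries
vorticity = da.stack((vorticity_tensor[..., 2, 1],
                      vorticity_tensor[..., 0, 2],
                      vorticity_tensor[..., 1, 0]), axis=-1)

# divergence is the trace of the jacobian
divergence = j[..., 0, 0] + j[..., 1, 1] + j[..., 2, 2]
print(da.std(divergence).compute())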