def _rmatvec(self, x):
    # apply forward fft
    x = da.reshape(x, self.dimsd)
    y = sqrt(1. / self.nt) * da.fft.rfft(x, n=self.nt, axis=0)
    y = y.astype(self.cdtype)
    y = y[:self.nfmax]
    # apply batched matrix mult
    y = y.rechunk((self.G.chunks[0], self.nr, self.nv))
    if self.saveGt:
        if self.conj:
            y = y.conj()
        y = da.matmul(self.GT, y)
        if self.conj:
            y = y.conj()
    else:
        if self.conj:
            y = da.matmul(y.transpose(0, 2, 1), self.G).transpose(0, 2, 1)
        else:
            y = da.matmul(y.transpose(0, 2, 1).conj(), self.G).transpose(0, 2, 1).conj()
    if not self.prescaled:
        y *= self.dr * self.dt * np.sqrt(self.nt)
    # apply inverse fft
    y = da.pad(y, ((0, self.nfft - self.nfmax), (0, 0), (0, 0)), mode='constant')
    y = y.rechunk(self.dimsdf)
    y = sqrt(self.nt) * da.fft.irfft(y, n=self.nt, axis=0)
    if self.twosided:
        y = da.fft.fftshift(y, axes=0)
    y = y.astype(self.dtype)
    y = da.real(y)
    return y.ravel()
def _rmatvec(self, x):
    x = np.squeeze(x.reshape(self.nsl, self.nx, self.nz))
    if self.chunks[0] is not None:
        x = x.rechunk(self.chunks[0])
    if self.nz == 1:
        x = x[..., np.newaxis]
    if hasattr(self, 'GT'):
        y = da.matmul(self.GT, x)
    else:
        if self.transposeG:
            y = da.matmul(self.G.transpose((0, 2, 1)).conj(), x)
        else:
            y = da.matmul(x.transpose(0, 2, 1).conj(), self.G).transpose(0, 2, 1).conj()
    return y.ravel()
def power_dask(data, x_init):
    # power iteration for the leading eigenvector of data @ data.T
    A = da.matmul(data, da.transpose(data))
    A = A.compute()  # materialize the Gram matrix once, up front
    T = 150
    y = x_init
    for t in range(T):
        v = np.matmul(A, y)
        y = v / np.linalg.norm(v)
    return y
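# A minimal usage sketch for power_dask above; the data sizes, chunking and
# variable names here are illustrative assumptions. The returned vector
# approximates the leading eigenvector of data @ data.T.
import numpy as np
import dask.array as da

data = da.random.random((500, 200), chunks=(250, 200))
x_init = np.random.random(500)
leading_vec = power_dask(data, x_init)
print(leading_vec[:5])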
def _matvec(self, x):
    x = da.squeeze(x.reshape(self.nsl, self.ny, self.nz))
    if self.chunks[0] is not None:
        x = x.rechunk(self.chunks[0])
    if self.nz == 1:
        x = x[..., np.newaxis]
    y = da.matmul(self.G, x)
    return y.ravel()
def test_optimize_blockwise_duplicate_dependency(optimize_graph):
    # Two blockwise operations in a row with duplicate name
    # (See: https://github.com/dask/dask/issues/8535)
    xx = da.from_array(np.array([[1, 1], [2, 2]]), chunks=1)
    xx = xx * 2
    z = da.matmul(xx, xx)

    # Compare to known answer
    result = z.compute(optimize_graph=optimize_graph)
    assert assert_eq(result, [[12, 12], [24, 24]])
def test_matmul(x_shape, y_shape):
    np.random.seed(3732)

    x = np.random.random(x_shape)[()]
    y = np.random.random(y_shape)[()]

    a = da.from_array(x, chunks=tuple((i // 2) for i in x.shape))
    b = da.from_array(y, chunks=tuple((i // 2) for i in y.shape))

    expected = None
    try:
        expected = np.matmul(x, y)
    except ValueError:
        pass

    for d1, d2 in itertools.product([a, x], [b, y]):
        if x.ndim == 0 or y.ndim == 0:
            with pytest.raises(ValueError):
                da.matmul(d1, d2)
        else:
            assert_eq(expected, da.matmul(d1, d2))
def test_matmul():
    x = np.random.random((5, 5))
    y = np.random.random((5, 2))
    a = da.from_array(x, chunks=(1, 5))
    b = da.from_array(y, chunks=(5, 1))
    assert_eq(np.matmul(x, y), da.matmul(a, b))
    assert_eq(np.matmul(a, y), da.matmul(x, b))
    assert_eq(np.matmul(x, b), da.matmul(a, y))

    list_vec = list(range(1, 6))
    assert_eq(np.matmul(x, list_vec), da.matmul(a, list_vec))
    assert_eq(np.matmul(list_vec, y), da.matmul(list_vec, b))

    z = np.random.random((5, 5, 5))
    c = da.from_array(z, chunks=(1, 5, 1))
    with pytest.raises(NotImplementedError):
        da.matmul(a, z)
    assert_eq(np.matmul(z, x), da.matmul(c, a))
cph.fit(pheno[[T_name, event_name] + covname], T_name, event_col=event_name)
# res_surv = cph.compute_residuals(pheno[[T_name, event_name] + covname], 'deviance').sort_index()['deviance']
res_surv = cph.compute_residuals(pheno[[T_name, event_name] + covname], 'martingale').sort_index()['martingale']

# This is the most memory-intensive part. Might need to change if we are dealing with biobank-scale data
if args.apr_flag == 'N':
    logger.info('Calculating Null covariance matrix\n')
    mat = cph.predict_cumulative_hazard(pheno)
    P = np.diff(mat, axis=0)
    for isubj in range(P.shape[1]):
        idx = np.abs(mat.index - pheno[T_name][isubj]).argmin()
        P[idx:, isubj] = 0
    V = da.diag(np.array(pheno[event_name] - res_surv)) - da.dot(P.transpose(), P)
    X = np.array(pheno[covname])
    C = V - da.matmul(
        da.matmul(da.matmul(V, X),
                  da.linalg.inv(da.matmul(da.matmul(X.transpose(), V), X))),
        da.matmul(X.transpose(), V))
else:
    logger.info('Using first order approximations for testing statistics\n')

# auto chunk to reduce the query time
chunk_array = [bim.i.values[i:i + chunk_size]
               for i in range(0, len(bim.i.values), chunk_size)]
nchunk = len(chunk_array)
chunk_ind = 1

#################################### Create HDF temporary file #########
# The idea is to use I/O to prevent memory overload
# Initialize the temporary files
tmp_f = outpath + '.tmp.hdf5'
try:
    os.remove(tmp_f)
except OSError:
h5read = tables.open_file('knockoff-data.h5', mode='r')
h5regression = tables.open_file('regression-data.h5', mode='r')

X = da.from_array(h5read.root.X)
pdim = X.shape[1]
Xtilde = da.from_array(h5read.root.Xtilde)
Y = da.from_array(h5regression.root.Y)

keepcols_svd = list(h5read.root.keepcols)
xcolnames_pdim = []
for k in xinfo['xcolnames']:
    if xinfo['xcolnames'][k] in keepcols_svd:
        xcolnames_pdim.append(k)

with Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler() as cprof:
    Xaug = da.hstack([X, Xtilde])
    betahat_aug = da.linalg.solve(da.matmul(Xaug.T, Xaug),
                                  da.matmul(Xaug.T, Y)).compute()
    Wstat = [abs(betahat_aug[i]) - abs(betahat_aug[i + pdim])
             for i in range(pdim)]
    threshold = knockoff_threshold(Wstat, FDR, offset=1)
    sel = [Wstat[j] >= threshold for j in range(pdim)]
    Xdrop = X[:, sel]
    betahat_final = da.linalg.solve(da.matmul(Xdrop.T, Xdrop),
                                    da.matmul(Xdrop.T, Y)).compute()
    colnames_final = [i for i, j in zip(xcolnames_pdim, sel) if j]
    colnames_dropped = [i for i, j in zip(xcolnames_pdim, sel) if not j]

print("desired FDR: ")
print(FDR)
print("\nKnockoff drops these columns:\n")
print(colnames_dropped)
import dask.array as da
from dask.dot import dot_graph

image_1 = da.zeros((5, 5), chunks=(5, 5))
image_2 = da.ones((5, 5), chunks=(5, 5))
dot_graph(image_1.dask)

# In[15]:

image_4 = (image_1 - 10) + (image_2 * 50)
dot_graph(image_4.dask)

# In[16]:

image_5 = da.matmul(image_1, image_4)
dot_graph(image_5.dask)

# ## Image Processing
# The initial examples were shown on very simple image problems. Here we can
# see how it looks for real imaging issues.

# In[17]:

import dask.array as da
from dask.dot import dot_graph
import numpy as np
from skimage.io import imread
import matplotlib.pyplot as plt
from skimage.util import montage as montage2d  # for showing results
def coclustering(Z, nclusters_row, nclusters_col, errobj, niters, epsilon,
                 col_clusters_init=None, row_clusters_init=None,
                 run_on_worker=False):
    """
    Run the co-clustering, Dask implementation

    :param Z: m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param run_on_worker: whether the function is submitted to a Dask worker
    :return: has converged, number of iterations performed, final row and
             column clustering, error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [m, n] = Z.shape
    row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) & (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate cluster-based averages.
        # nel_clusters is a matrix with the number of elements per co-cluster,
        # originally computed as: da.dot(da.dot(R.T, da.ones((m, n))), C)
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        logger.debug('num of populated clusters: row {}, col {}'.format(
            da.sum(nel_row_clusters > 0).compute(),
            da.sum(nel_col_clusters > 0).compute()))
        nel_clusters = da.outer(nel_row_clusters, nel_col_clusters)
        CoCavg = (da.matmul(da.matmul(R.T, Z), C) + Gavg * epsilon) / \
                 (nel_clusters + epsilon)

        # Calculate distance based on row approximation
        d_row = _distance(Z, da.matmul(C, CoCavg.T), epsilon)
        # Assign to best row cluster
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # Calculate distance based on column approximation
        d_col = _distance(Z.T, da.matmul(R, CoCavg), epsilon)
        # Assign to best column cluster
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # Error value (actually just the column components really)
        old_e = e
        minvals = da.min(d_col, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))
        row_clusters, R, col_clusters, C, e = client.persist(
            [row_clusters, R, col_clusters, C, e])

        if run_on_worker:
            # this is a workaround for e.compute() for a function that runs
            # on a worker with multiple threads
            # https://github.com/dask/distributed/issues/3827
            e = client.compute(e)
            secede()
            e = e.result()
            rejoin()
        else:
            e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Coclustering converged in {s} iterations')
    else:
        logger.debug(f'Coclustering did not converge in {s} iterations')
    return converged, s, row_clusters, col_clusters, e
def _distance(Z, Y, epsilon):
    """ Distance function """
    Y = Y + epsilon
    # The first term below is equal to one row of: da.dot(da.ones(m, n), Y)
    # with Z.shape = (m, n) and Y.shape = (n, k)
    return Y.sum(axis=0, keepdims=True) - da.matmul(Z, da.log(Y))
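# A small shape check for _distance above, with illustrative sizes: for Z of
# shape (m, n) and cluster prototypes Y of shape (n, k), the result is an
# (m, k) matrix of distances used for the argmin cluster assignment.
import dask.array as da

m, n, k = 6, 4, 3
Z = da.random.random((m, n), chunks=(3, 2))
Y = da.random.random((n, k), chunks=(2, 3))
print(_distance(Z, Y, epsilon=1e-8).shape)  # (6, 3)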
import operator
import time
import timeit
import datetime
import sys

import dask.array as da
from dask.distributed import Client

K = 1 << 10
mx = 64 * K
cx = 4 * K

name = 'matrixmul_64k_x_4k'
client = Client('10.255.23.115:8786', name=name)

x = da.random.random((mx, mx), chunks=(cx, cx))
y = da.random.random((mx, mx), chunks=(cx, cx))
z = da.matmul(x, y)

# Start the computation.
start = datetime.datetime.now()
results = z.compute(scheduler='distributed')
end = datetime.datetime.now()
print(f'Matrix multiplication is done in {end - start}')
z.visualize(filename=f'{name}.png')
X = np.random.random((batch_size, 3**2 * input_channels, input_size**2))
kernels_mean = np.random.random((total_kernels, 3**2 * input_channels))
cov_list = [random_cov(3**2 * input_channels) for number in range(total_kernels)]
kernels_cov = np.stack(cov_list)

X = da.from_array(X)
kernels_mean = da.from_array(kernels_mean)
kernels_cov = da.from_array(kernels_cov)

batch_out = []
for i in range(batch_size):
    kernel_out = []
    for j in range(total_kernels):
        mean = da.matmul(kernels_mean[j, :], X[i, :, :])
        cov = da.matmul(da.transpose(X[i, :, :]),
                        da.matmul(kernels_cov[j, :, :], X[i, :, :]))
        z = mvn_random_DASK(mean, cov, total_samples, input_size**2)
        g = relu(z)
        mean_g = da.mean(g, axis=1)
        kernel_out.append(mean_g)
    kernels_out = da.stack(kernel_out, axis=0)
    batch_out.append(kernels_out)
batches_out = da.stack(batch_out, axis=0)
print('task graph complete')

mean_g.visualize(rankdir="LR", filename="task_graph_mean_g.pdf", cmap='viridis')
kernels_out.visualize(rankdir="LR", filename="task_graph_conv_out.pdf")
# validate = True
validate = False
if validate:
    numpy_validation_list = va.single_mean_single_covariance_validator(
        X.compute(), kernels_mean.compute(), kernels_cov.compute(),
        batch_size, total_kernels, input_size)

times = []  # list for storing execution times
cluster = 'localhost:8001'  # address of compute cluster
with Client(cluster) as client:  # connect to the cluster as a client
    for n in range(itrs):  # repeat for itrs timed runs
        start = time.time()  # save start time
        batch_out = []  # create list for batch output
        for i in range(batch_size):  # for each image
            kernel_out = []  # create list for kernel outputs
            mean = da.matmul(kernels_mean, X[i, :, :])  # compute all kernel means
            for j in range(total_kernels):  # for each kernel
                cov = da.matmul(
                    da.transpose(X[i, :, :]),  # compute covariance
                    da.matmul(kernels_cov[j, :, :], X[i, :, :]))
                z = mvn_random_DASK(
                    mean[j, :], cov, total_samples,
                    input_size**2)  # sample from transformed distribution
                g = relu(z)  # pass samples through relu
                mean_g = da.mean(g, axis=1)  # compute ensemble mean from samples
                kernel_out.append(mean_g)  # add ensemble mean to kernel outputs list
            kernels_out = da.stack(kernel_out, axis=0)  # stack all kernel outputs
            batch_out.append(
import dask.array as da
from dask.dot import dot_graph

image_1 = da.zeros((1024, 1024), chunks=(512, 512))
image_2 = da.ones((1024, 1024), chunks=(512, 512))
dot_graph(image_1.dask)

# In[36]:

image_4 = (image_1 - 10) + (image_2 * 50)
dot_graph(image_4.dask)

# In[37]:

image_5 = da.matmul(image_1, image_2)
dot_graph(image_5.dask)

# In[38]:

image_6 = (da.matmul(image_1, image_2) + image_1) * image_2
dot_graph(image_6.dask)

# In[39]:

import dask_ndfilters as da_ndfilt

image_7 = da_ndfilt.convolve(image_6, image_1)
dot_graph(image_7.dask)

# # Deep Learning
import pandas as pd
import dask.array as da
import sys
import os

#fluidity_fp = '/mnt/c/Users/julia/fluidity/fluidity-master/python'
DA_project_fp = '/mnt/c/Users/julia/Documents/Imperial/DA_project'
#sys.path.append(fluidity_fp)
sys.path.append(DA_project_fp)

import vtktools

ug1 = vtktools.vtu(DA_project_fp + '/data/LSBU_c_0.vtu')
ug2 = vtktools.vtu(DA_project_fp + '/data/LSBU_0.vtu')
print(ug1.GetFieldNames())
print(ug2.GetFieldNames())

# read the values of the tracers and copy them into a vector named p
p = ug2.GetVectorField('Velocity')

# create a dask array
p = da.from_array(p, chunks=[5000, 63])
n = len(p)
print(n)

# create the background matrix and materialize it
Background = da.matmul(p, p.T)
Background = Background.compute()
print(Background[0, 0])
def cov_mult(conv_matrix, cov_matrix):
    # returns conv_matrix.T @ cov_matrix @ conv_matrix, i.e. the covariance
    # propagated through the transposed linear map
    conv_matrix = da.transpose(conv_matrix)
    return da.matmul(da.matmul(conv_matrix, cov_matrix),
                     da.transpose(conv_matrix))
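# Minimal usage sketch for cov_mult above, with assumed shapes: a (p, q)
# conv_matrix and a (p, p) cov_matrix give a (q, q) propagated covariance.
import dask.array as da

p, q = 8, 5
conv = da.random.random((p, q), chunks=(4, 5))
sigma = da.eye(p, chunks=4)
print(cov_mult(conv, sigma).shape)  # (5, 5)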
for i in range(itr1):
    x = 1000 * i + 1000
    y = 1000 * i + 1000
    z = 1000 * i + 1000
    A = np.random.random((x, y))
    B = np.random.random((y, z))
    Adask = da.from_array(A, chunks='auto')
    Bdask = da.from_array(B, chunks='auto')

    start = time.time()
    results = []
    for j in range(itr2):
        for k in range(z):
            results.append(da.matmul(Adask, Bdask[:, k]))
    results_stacked = da.stack(results)
    result = results_stacked.compute()
    execution_time_dask_dot.append((time.time() - start) / itr2)

    start = time.time()
    for j in range(itr2):
        C = da.matmul(Adask, Bdask)
        result2 = C.compute()
    execution_time_dask_matmul.append((time.time() - start) / itr2)

#%%
data_set = {
    'Operation': [
        "matrix-vector", "matrix-vector",
itr = 1
A = np.random.random((x, y))
B = np.random.random((y, z))
start = time.time()
for i in range(itr):
    C = np.matmul(A, B)
execution_time_np_matmul = (time.time() - start) / itr

Adask = da.from_array(A, chunks='auto')
Bdask = da.from_array(B, chunks='auto')
start = time.time()
for i in range(itr):
    C = da.matmul(Adask, Bdask)
    result = C.compute()
execution_time_dask_matmul = (time.time() - start) / itr

#%% Cholesky test
with threadpool_limits(limits=1, user_api='blas'):
    x = 10000
    itr = 1
    A = np.random.random((x, x))
    A = np.matmul(A, A.transpose())
    start = time.time()
    for i in range(itr):
        B = np.linalg.cholesky(A)
    execution_time_np_cholesky = (time.time() - start) / itr
def convolution_mean_DASK(X, mu_W, batch_size, input_kernels):
    mu_z = np.empty((batch_size, input_kernels, X.shape[2]))
    for i in range(batch_size):
        for j in range(input_kernels):
            # assigning into the NumPy buffer materializes each Dask result eagerly
            mu_z[i, j, :] = da.matmul(mu_W[j, :], X[i, :, :])
    return mu_z
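# Illustrative call of convolution_mean_DASK above with small assumed
# dimensions; the output is a plain NumPy array of per-kernel means.
import numpy as np
import dask.array as da

batch_size, input_kernels, feat, npix = 2, 3, 9, 16
X = da.from_array(np.random.random((batch_size, feat, npix)))
mu_W = da.from_array(np.random.random((input_kernels, feat)))
print(convolution_mean_DASK(X, mu_W, batch_size, input_kernels).shape)  # (2, 3, 16)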