def test_matrix_multiply():
    """Compare ``gpu_op.matrix_multiply`` with ``np.dot`` for three layouts.

    The cases are: no transpose, transpose of the second operand only, and
    transpose of both operands.
    """
    ctx = ndarray.gpu(0)
    # (x shape, transpose x?, y shape, transpose y?, result shape)
    cases = (
        ((500, 700), False, (700, 1000), False, (500, 1000)),
        ((1000, 500), False, (2000, 500), True, (1000, 2000)),
        ((500, 1000), True, (2000, 500), True, (1000, 2000)),
    )
    for x_shape, trans_x, y_shape, trans_y, z_shape in cases:
        x = np.random.uniform(0, 10, size=x_shape).astype(np.float32)
        y = np.random.uniform(0, 10, size=y_shape).astype(np.float32)
        arr_x = ndarray.array(x, ctx=ctx)
        arr_y = ndarray.array(y, ctx=ctx)
        arr_z = ndarray.empty(z_shape, ctx=ctx)

        gpu_op.matrix_multiply(arr_x, trans_x, arr_y, trans_y, arr_z)

        lhs = np.transpose(x) if trans_x else x
        rhs = np.transpose(y) if trans_y else y
        np.testing.assert_allclose(np.dot(lhs, rhs), arr_z.asnumpy(), rtol=1e-5)
def _copy_to_gpu(params):
    """Rebind each parameter's ``const`` to an ndarray on ``ndarray.gpu(0)``.

    The parameter objects themselves are modified in place. The return value
    is a new list containing those same objects in their original order.
    """
    device = ndarray.gpu(0)
    moved = list(params)
    for item in moved:
        item.const = ndarray.array(item.const, ctx=device)
    return moved
def test_matrix_elementwise_sqrt():
    """Compare an in-place ``gpu_op.matrix_elementwise_sqrt`` with ``np.sqrt``."""
    device = ndarray.gpu(0)
    host_values = np.random.uniform(0, 10, size=(500, 200)).astype(np.float32)
    buffer = ndarray.array(host_values, ctx=device)

    # The same array is passed as input and output.
    gpu_op.matrix_elementwise_sqrt(buffer, buffer)

    np.testing.assert_allclose(np.sqrt(host_values), buffer.asnumpy(), rtol=1e-5)
def test_softmax():
    """Compare ``gpu_op.softmax`` with ``au.nn.softmax_func`` on the same input."""
    device = ndarray.gpu(0)
    dims = (400, 1000)
    host_in = np.random.uniform(-5, 5, dims).astype(np.float32)
    dev_in = ndarray.array(host_in, ctx=device)
    dev_out = ndarray.empty(dims, ctx=device)

    gpu_op.softmax(dev_in, dev_out)

    expected = au.nn.softmax_func(host_in)
    np.testing.assert_allclose(expected, dev_out.asnumpy(), rtol=1e-5)
def test_relu():
    """Compare ``gpu_op.relu`` with ``np.maximum(x, 0)``."""
    dims = (2000, 2500)
    device = ndarray.gpu(0)
    host_in = np.random.uniform(-1, 1, dims).astype(np.float32)
    dev_in = ndarray.array(host_in, ctx=device)
    dev_out = ndarray.empty(dims, ctx=device)

    gpu_op.relu(dev_in, dev_out)

    expected = np.maximum(host_in, 0).astype(np.float32)
    np.testing.assert_allclose(expected, dev_out.asnumpy())
def test_matrix_elementwise_add_by_const():
    """Compare ``gpu_op.matrix_elementwise_add_by_const`` with ``x + val``."""
    dims = (2000, 3000)
    device = ndarray.gpu(0)
    # Draw the matrix first, then the scalar, so the RNG sequence is stable.
    host_in = np.random.uniform(0, 10, size=dims).astype(np.float32)
    offset = np.random.uniform(-5, 5)
    dev_in = ndarray.array(host_in, ctx=device)
    dev_out = ndarray.empty(dims, ctx=device)

    gpu_op.matrix_elementwise_add_by_const(dev_in, offset, dev_out)

    np.testing.assert_allclose(host_in + offset, dev_out.asnumpy(), rtol=1e-5)
def test_broadcast_to():
    """Compare ``gpu_op.broadcast_to`` with ``np.broadcast_to``.

    A (200, 300) input is broadcast to (130, 200, 300).
    """
    device = ndarray.gpu(0)
    src_shape = (200, 300)
    dst_shape = (130,) + src_shape
    host_in = np.random.uniform(-1, 1, src_shape).astype(np.float32)
    dev_in = ndarray.array(host_in, ctx=device)
    dev_out = ndarray.empty(dst_shape, ctx=device)

    gpu_op.broadcast_to(dev_in, dev_out)

    expected = np.broadcast_to(host_in, dst_shape)
    np.testing.assert_allclose(expected, dev_out.asnumpy())
def __init__(self, cost, params, lr=0.1, momentum=0.9, use_gpu=False):
    """Set up the momentum coefficient and one zero velocity buffer per parameter.

    :param cost: forwarded to the parent initializer
    :param params: parameters; each must expose a ``const`` array
    :param lr: forwarded to the parent initializer
    :param momentum: stored as ``self.momentum``
    :param use_gpu: when true, velocity buffers are created on
        ``ndarray.gpu(0)``; otherwise they are NumPy arrays
    """
    super().__init__(cost, params, lr=lr, use_gpu=use_gpu)
    self.momentum = momentum

    if not use_gpu:
        self.velocity = [np.zeros_like(p.const) for p in params]
        return

    buffers = []
    for p in params:
        # The zeros are built on the host from a copy of the parameter,
        # then moved to the device.
        host_zeros = np.zeros_like(p.const.asnumpy())
        buffers.append(ndarray.array(host_zeros, ctx=ndarray.gpu(0)))
    self.velocity = buffers
def test_matrix_elementwise_multiply():
    """Compare ``gpu_op.matrix_elementwise_multiply`` with ``x * y``."""
    device = ndarray.gpu(0)
    dims = (500, 200)
    lhs = np.random.uniform(0, 10, size=dims).astype(np.float32)
    rhs = np.random.uniform(0, 10, size=dims).astype(np.float32)
    dev_lhs = ndarray.array(lhs, ctx=device)
    dev_rhs = ndarray.array(rhs, ctx=device)
    dev_out = ndarray.empty(dims, ctx=device)

    gpu_op.matrix_elementwise_multiply(dev_lhs, dev_rhs, dev_out)

    np.testing.assert_allclose(lhs * rhs, dev_out.asnumpy(), rtol=1e-5)
def test_relu_gradient():
    """Compare ``gpu_op.relu_gradient`` with ``(x > 0) * grad_x``."""
    dims = (2000, 2500)
    device = ndarray.gpu(0)
    # Draw the input first, then the upstream gradient, so the RNG sequence is stable.
    host_in = np.random.uniform(-1, 1, dims).astype(np.float32)
    host_grad = np.random.uniform(-5, 5, dims).astype(np.float32)
    dev_in = ndarray.array(host_in, ctx=device)
    dev_grad = ndarray.array(host_grad, ctx=device)
    dev_out = ndarray.empty(dims, ctx=device)

    gpu_op.relu_gradient(dev_in, dev_grad, dev_out)

    expected = ((host_in > 0) * host_grad).astype(np.float32)
    np.testing.assert_allclose(expected, dev_out.asnumpy())
def test_array_set():
    """Fill one device array with 1.0 and then 0.0 via ``gpu_op.array_set``.

    After each fill the array is compared with ``np.ones`` / ``np.zeros``.
    """
    device = ndarray.gpu(0)
    dims = (5000, 2000)
    target = ndarray.empty(dims, ctx=device)

    # First the "ones-like" fill, then the "zeros-like" fill on the same buffer.
    for fill_value, make_expected in ((1., np.ones), (0., np.zeros)):
        gpu_op.array_set(target, fill_value)
        np.testing.assert_allclose(make_expected(dims), target.asnumpy())
def __init__(self, cost, params, lr=1e-3, beta1=0.9, beta2=0.995, eps=1e-5,
             use_gpu=False):
    """Store the hyper-parameters and create zero-filled state per parameter.

    :param cost: forwarded to the parent initializer
    :param params: parameters; each must expose a ``const`` array
    :param lr: forwarded to the parent initializer
    :param beta1: stored as ``self.beta1``
    :param beta2: stored as ``self.beta2``
    :param eps: stored as ``self.eps``
    :param use_gpu: forwarded to the parent initializer; the branch below
        reads ``self.use_gpu`` afterwards

    With ``self.use_gpu`` set, ``velocity``, ``momentum``, ``vec_hat`` and
    ``mom_hat`` are created on ``ndarray.gpu(0)``. Otherwise only
    ``velocity`` and ``momentum`` are created, as NumPy arrays.
    """
    super().__init__(cost, params, lr, use_gpu=use_gpu)
    self.beta1, self.beta2 = beta1, beta2
    self.eps = eps
    self.time = 0

    def device_zeros(param):
        # Zeros shaped like the parameter, built on the host and moved to the device.
        host_zeros = np.zeros_like(param.const.asnumpy())
        return ndarray.array(host_zeros, ctx=ndarray.gpu(0))

    if self.use_gpu:
        self.velocity = [device_zeros(p) for p in params]
        self.momentum = [device_zeros(p) for p in params]
        self.vec_hat = [device_zeros(p) for p in self.params]
        self.mom_hat = [device_zeros(p) for p in self.params]
    else:
        self.velocity = [np.zeros_like(p.const) for p in params]
        self.momentum = [np.zeros_like(p.const) for p in params]
def test_reduce_sum_axis_zero():
    """Compare ``gpu_op.reduce_sum_axis_zero`` with ``np.sum(x, axis=0)``.

    Before asserting, every element whose relative error exceeds 1e-4 is
    printed as a debugging aid.
    """
    device = ndarray.gpu(0)
    src_shape = (500, 200, 100)
    dst_shape = (200, 100)
    host_in = np.random.uniform(0, 20, src_shape).astype(np.float32)
    dev_in = ndarray.array(host_in, ctx=device)
    dev_out = ndarray.empty(dst_shape, ctx=device)

    gpu_op.reduce_sum_axis_zero(dev_in, dev_out)

    actual = dev_out.asnumpy()
    expected = np.sum(host_in, axis=0)

    # Walk the result in row-major order and report the outliers.
    for position in np.ndindex(actual.shape):
        got = actual[position]
        want = expected[position]
        if abs((got - want) / want) > 1e-4:
            print(position, got, want)

    np.testing.assert_allclose(expected, actual, rtol=1e-5)
def __init__(self, eval_list, use_gpu=False):
    """
    Executor computes values for a given subset of nodes in a computation graph.

    Parameters:
    -----------
    :param eval_list: Nodes whose values need to be computed
    :param use_gpu: When true, ``self.ctx`` is set to ``ndarray.gpu(0)``;
        otherwise it stays ``None``
    """
    self.eval_node_list = eval_list
    self.ctx = ndarray.gpu(0) if use_gpu else None
    self.topo_order = find_topo_sort(self.eval_node_list)

    # These start out unset.
    self.node_to_arr_map = None
    self.node_to_shape_map = None
    self.feed_shapes = None
def step(self, feed_dict):
    """Run the executor once and apply one Adam-style update to every parameter.

    ``self.executor.run(feed_dict)`` must return a sequence whose first item
    is the cost and whose item ``i + 1`` is the gradient for
    ``self.params[i]``. Each parameter's ``momentum`` and ``velocity`` state
    is updated, bias-corrected with ``self.time``, and used to adjust
    ``param.const`` in place.

    :param feed_dict: passed unchanged to ``self.executor.run``
    :return: the cost; on the GPU path it is converted with ``asnumpy()``

    On the GPU path the gradient arrays returned by the executor are
    overwritten (squared and scaled in place).
    """
    exe_output = self.executor.run(feed_dict)
    self.time += 1

    # The bias-correction denominators are the same for every parameter.
    mom_correction = 1 - self.beta1**self.time
    vec_correction = 1 - self.beta2**self.time

    if self.use_gpu:
        # Reset the bias-corrected buffers (kept from the original code).
        for vec_hat, mom_hat in zip(self.vec_hat, self.mom_hat):
            gpu_op.matrix_elementwise_multiply_by_const(vec_hat, 0.0, vec_hat)
            gpu_op.matrix_elementwise_multiply_by_const(mom_hat, 0.0, mom_hat)

        for i, param in enumerate(self.params):
            grad = exe_output[i + 1]
            momentum = self.momentum[i]
            velocity = self.velocity[i]
            mom_hat = self.mom_hat[i]
            vec_hat = self.vec_hat[i]

            # momentum = beta1 * momentum + (1 - beta1) * grad
            gpu_op.matrix_elementwise_multiply_by_const(
                momentum, self.beta1, momentum)
            # mom_hat is fully overwritten by the bias correction just below,
            # so it doubles as scratch space for the scaled gradient. This
            # avoids copying the gradient device -> host -> device only to
            # obtain a temporary buffer.
            gpu_op.matrix_elementwise_multiply_by_const(
                grad, 1 - self.beta1, mom_hat)
            gpu_op.matrix_elementwise_add(momentum, mom_hat, momentum)
            gpu_op.matrix_elementwise_div_by_const(
                momentum, mom_correction, mom_hat)

            # velocity = beta2 * velocity + (1 - beta2) * grad**2
            # (grad is squared and scaled in place from here on.)
            gpu_op.matrix_elementwise_multiply_by_const(
                velocity, self.beta2, velocity)
            gpu_op.matrix_elementwise_multiply(grad, grad, grad)
            gpu_op.matrix_elementwise_multiply_by_const(
                grad, 1 - self.beta2, grad)
            gpu_op.matrix_elementwise_add(velocity, grad, velocity)
            gpu_op.matrix_elementwise_div_by_const(
                velocity, vec_correction, vec_hat)

            # param += -lr * mom_hat / (sqrt(vec_hat) + eps)
            gpu_op.matrix_elementwise_sqrt(vec_hat, vec_hat)
            gpu_op.matrix_elementwise_add_by_const(vec_hat, self.eps, vec_hat)
            gpu_op.matrix_elementwise_multiply_by_const(
                mom_hat, -1 * self.lr, mom_hat)
            gpu_op.matrix_elementwise_division(mom_hat, vec_hat, mom_hat)
            gpu_op.matrix_elementwise_add(param.const, mom_hat, param.const)
    else:
        for i, param in enumerate(self.params):
            grad = exe_output[i + 1]
            self.momentum[i] = (self.beta1 * self.momentum[i]
                                + (1 - self.beta1) * grad)
            self.velocity[i] = (self.beta2 * self.velocity[i]
                                + (1 - self.beta2) * (grad**2))
            mom_hat = self.momentum[i] / mom_correction
            vec_hat = self.velocity[i] / vec_correction
            param.const += -self.lr * mom_hat / (np.sqrt(vec_hat) + self.eps)

    cost = exe_output[0]
    return cost.asnumpy() if self.use_gpu else cost