# NOTE: assumed import for these excerpts; the `ndarray`/`gpu_op` modules and
# the top-level package alias `au` are provided by the surrounding repo, whose
# exact module paths are not shown here.
import numpy as np


def test_matrix_elementwise_multiply():
    ctx = ndarray.gpu(0)
    shape = (500, 200)
    x = np.random.uniform(0, 10, size=shape).astype(np.float32)
    y = np.random.uniform(0, 10, size=shape).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty(shape, ctx=ctx)
    gpu_op.matrix_elementwise_multiply(arr_x, arr_y, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(x * y, z, rtol=1e-5)
def test_relu_gradient():
    shape = (2000, 2500)
    ctx = ndarray.gpu(0)
    x = np.random.uniform(-1, 1, shape).astype(np.float32)
    grad_x = np.random.uniform(-5, 5, shape).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_grad_x = ndarray.array(grad_x, ctx=ctx)
    arr_y = ndarray.empty(shape, ctx=ctx)
    gpu_op.relu_gradient(arr_x, arr_grad_x, arr_y)
    y = arr_y.asnumpy()
    np.testing.assert_allclose(((x > 0) * grad_x).astype(np.float32), y)
def _copy_to_gpu(params):
    # Move each parameter's backing array onto GPU 0. This mutates the
    # Parameter objects in place; the returned list holds the same objects.
    ctx = ndarray.gpu(0)
    gpu_arrays = []
    for param in params:
        param.const = ndarray.array(param.const, ctx=ctx)
        gpu_arrays.append(param)
    return gpu_arrays
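# Usage sketch (an illustration; `params` is a list of Parameter-like objects
# whose `.const` holds the weights):
#
#     params = _copy_to_gpu(params)   # weights now live on GPU 0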
def test_matrix_elementwise_sqrt():
    ctx = ndarray.gpu(0)
    shape = (500, 200)
    x = np.random.uniform(0, 10, size=shape).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    # In-place: the output is written over the input buffer.
    gpu_op.matrix_elementwise_sqrt(arr_x, arr_x)
    z = arr_x.asnumpy()
    np.testing.assert_allclose(np.sqrt(x), z, rtol=1e-5)
def test_softmax():
    ctx = ndarray.gpu(0)
    shape = (400, 1000)
    x = np.random.uniform(-5, 5, shape).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_y = ndarray.empty(shape, ctx=ctx)
    gpu_op.softmax(arr_x, arr_y)
    y = arr_y.asnumpy()
    np.testing.assert_allclose(au.nn.softmax_func(x), y, rtol=1e-5)
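# For reference, `au.nn.softmax_func` above is expected to compute a row-wise
# softmax. A minimal NumPy equivalent (a sketch, assuming the usual
# max-subtraction trick for numerical stability):
def softmax_reference(x):
    # Shifting by the row max does not change the result but avoids overflow.
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)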
def test_relu():
    shape = (2000, 2500)
    ctx = ndarray.gpu(0)
    x = np.random.uniform(-1, 1, shape).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_y = ndarray.empty(shape, ctx=ctx)
    gpu_op.relu(arr_x, arr_y)
    y = arr_y.asnumpy()
    np.testing.assert_allclose(np.maximum(x, 0).astype(np.float32), y)
def test_matrix_elementwise_add_by_const():
    shape = (2000, 3000)
    ctx = ndarray.gpu(0)
    x = np.random.uniform(0, 10, size=shape).astype(np.float32)
    val = np.random.uniform(-5, 5)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_y = ndarray.empty(shape, ctx=ctx)
    gpu_op.matrix_elementwise_add_by_const(arr_x, val, arr_y)
    y = arr_y.asnumpy()
    np.testing.assert_allclose(x + val, y, rtol=1e-5)
def test_broadcast_to():
    ctx = ndarray.gpu(0)
    shape = (200, 300)
    to_shape = (130, 200, 300)
    x = np.random.uniform(-1, 1, shape).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_y = ndarray.empty(to_shape, ctx=ctx)
    gpu_op.broadcast_to(arr_x, arr_y)
    y = arr_y.asnumpy()
    np.testing.assert_allclose(np.broadcast_to(x, to_shape), y)
def __init__(self, cost, params, lr=0.1, momentum=0.9, use_gpu=False):
    super().__init__(cost, params, lr=lr, use_gpu=use_gpu)
    self.momentum = momentum
    # One velocity buffer per parameter, allocated on the same device as the
    # parameters themselves.
    if use_gpu:
        self.velocity = [
            ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
            for param in params
        ]
    else:
        self.velocity = [np.zeros_like(param.const) for param in params]
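# For reference, the `velocity` buffers allocated above support the classical
# momentum update. A NumPy sketch of one step (an illustration only, not this
# repo's actual step(); `grads` stands for the parameter gradients):
def _momentum_step_sketch(params, velocity, grads, lr=0.1, momentum=0.9):
    for i in range(len(params)):
        # Decay the running update direction and fold in the new gradient ...
        velocity[i] = momentum * velocity[i] - lr * grads[i]
        # ... then move the parameter along it.
        params[i].const += velocity[i]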
def test_matrix_multiply():
    ctx = ndarray.gpu(0)

    # Case 1: no transpose.
    x = np.random.uniform(0, 10, size=(500, 700)).astype(np.float32)
    y = np.random.uniform(0, 10, size=(700, 1000)).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((500, 1000), ctx=ctx)
    gpu_op.matrix_multiply(arr_x, False, arr_y, False, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(np.dot(x, y), z, rtol=1e-5)

    # Case 2: transpose the right-hand operand.
    x = np.random.uniform(0, 10, size=(1000, 500)).astype(np.float32)
    y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((1000, 2000), ctx=ctx)
    gpu_op.matrix_multiply(arr_x, False, arr_y, True, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(np.dot(x, np.transpose(y)), z, rtol=1e-5)

    # Case 3: transpose both operands.
    x = np.random.uniform(0, 10, size=(500, 1000)).astype(np.float32)
    y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_y = ndarray.array(y, ctx=ctx)
    arr_z = ndarray.empty((1000, 2000), ctx=ctx)
    gpu_op.matrix_multiply(arr_x, True, arr_y, True, arr_z)
    z = arr_z.asnumpy()
    np.testing.assert_allclose(np.dot(np.transpose(x), np.transpose(y)), z, rtol=1e-5)
def __init__(self, cost, params, lr=1e-3, beta1=0.9, beta2=0.995, eps=1e-5,
             use_gpu=False):
    super().__init__(cost, params, lr, use_gpu=use_gpu)
    self.beta1 = beta1
    self.beta2 = beta2
    if self.use_gpu:
        # First- and second-moment accumulators plus scratch buffers for
        # their bias-corrected counterparts, all allocated on GPU 0.
        self.velocity = [
            ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
            for param in params
        ]
        self.momentum = [
            ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
            for param in params
        ]
        self.vec_hat = [
            ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
            for param in self.params
        ]
        self.mom_hat = [
            ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
            for param in self.params
        ]
    else:
        self.velocity = [np.zeros_like(param.const) for param in params]
        self.momentum = [np.zeros_like(param.const) for param in params]
    self.time = 0
    self.eps = eps
def test_reduce_sum_axis_zero():
    ctx = ndarray.gpu(0)
    shape = (500, 200, 100)
    to_shape = (200, 100)
    x = np.random.uniform(0, 20, shape).astype(np.float32)
    arr_x = ndarray.array(x, ctx=ctx)
    arr_y = ndarray.empty(to_shape, ctx=ctx)
    gpu_op.reduce_sum_axis_zero(arr_x, arr_y)
    y = arr_y.asnumpy()
    y_ = np.sum(x, axis=0)
    # Print any elements whose relative error exceeds 1e-4 before asserting,
    # to make kernel failures easier to debug.
    for index, _ in np.ndenumerate(y):
        v = y[index]
        v_ = y_[index]
        if abs((v - v_) / v_) > 1e-4:
            print(index, v, v_)
    np.testing.assert_allclose(y_, y, rtol=1e-5)
def run(self, feed_dict, convert_to_numpy_ret_vals=False):
    """Evaluate the nodes in eval_node_list against the values in feed_dict.

    Parameters
    ----------
    feed_dict : dict
        Maps placeholder nodes to the values supplied by the user.
    convert_to_numpy_ret_vals : bool
        If True (and running on GPU), copy results back to np.ndarray.

    Returns
    -------
    list
        Values of the nodes in eval_node_list.
    """
    def are_feed_shapes_equal(sa, sb):
        if (not isinstance(sa, dict)) or (not isinstance(sb, dict)):
            return False
        unmatched_item = set(sa.items()) ^ set(sb.items())
        return len(unmatched_item) == 0

    # Assume self.ctx is None implies numpy array and numpy ops.
    use_numpy = self.ctx is None
    node_to_val_map = {}
    for node, value in feed_dict.items():
        if use_numpy:
            # all values passed in feed_dict must be np.ndarray
            assert isinstance(value, np.ndarray)
            node_to_val_map[node] = value
        else:
            # convert values to ndarray.NDArray if necessary
            if isinstance(value, np.ndarray):
                node_to_val_map[node] = ndarray.array(value, ctx=self.ctx)
            elif isinstance(value, ndarray.NDArray):
                node_to_val_map[node] = value
            else:
                assert False, "feed_dict value type not supported"

    # collect shapes for all placeholders
    feed_shapes = {}
    for node in node_to_val_map:
        feed_shapes[node] = node_to_val_map[node].shape

    # infer shape if feed_shapes changed since last run,
    # e.g. call run() on test data after training
    if not are_feed_shapes_equal(feed_shapes, self.feed_shapes):
        self.infer_shape(feed_shapes)
        self.feed_shapes = feed_shapes
        # plan memory if using GPU
        if not use_numpy:
            self.memory_plan(feed_shapes)

    # Traverse graph in topo order and compute values for all nodes.
    for node in self.topo_order:
        if node in node_to_val_map:
            # Skip placeholder nodes. Values already provided by feed_dict.
            continue
        # TODO (upul): following if condition looks like a hack. Find a better approach.
        if isinstance(node.op, PlaceholderOp) and node.const is not None:
            node_to_val_map[node] = node.const
            continue
        input_vals = [node_to_val_map[n] for n in node.inputs]
        if use_numpy:
            node_val = np.empty(shape=self.node_to_shape_map[node])
        else:
            node_val = self.node_to_arr_map[node]
        # node_val is modified in-place whether np.ndarray or NDArray
        node.op.compute(node, input_vals, node_val, use_numpy)
        node_to_val_map[node] = node_val

    # Collect node values.
    if not use_numpy and convert_to_numpy_ret_vals:
        return [node_to_val_map[n].asnumpy() for n in self.eval_node_list]
    return [node_to_val_map[n] for n in self.eval_node_list]
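# A minimal usage sketch for run() (hypothetical node names; the Executor is
# assumed to have been constructed with eval_node_list=[loss, grad_w]):
#
#     loss_val, grad_w_val = executor.run(feed_dict={X: x_batch, y: y_batch})
#
# On the NumPy path (ctx is None) all fed values must be np.ndarray; with a
# GPU context they may be np.ndarray (copied to the device) or ndarray.NDArray,
# and convert_to_numpy_ret_vals=True copies the results back to NumPy.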
def step(self, feed_dict):
    exe_output = self.executor.run(feed_dict)
    self.time += 1
    if self.use_gpu:
        # Reset the bias-corrected scratch buffers to zero.
        for i in range(len(self.vec_hat)):
            gpu_op.matrix_elementwise_multiply_by_const(
                self.vec_hat[i], 0.0, self.vec_hat[i])
            gpu_op.matrix_elementwise_multiply_by_const(
                self.mom_hat[i], 0.0, self.mom_hat[i])
        for i in range(len(self.params)):
            # First moment: momentum = beta1 * momentum + (1 - beta1) * grad.
            gpu_op.matrix_elementwise_multiply_by_const(
                self.momentum[i], self.beta1, self.momentum[i])
            # TODO: (upul) copying dev->host->dev is expensive. We need a better approach.
            tem_gpu_array = ndarray.array(exe_output[i + 1].asnumpy(),
                                          ctx=ndarray.gpu(0))
            gpu_op.matrix_elementwise_multiply_by_const(
                exe_output[i + 1], (1 - self.beta1), tem_gpu_array)
            gpu_op.matrix_elementwise_add(self.momentum[i], tem_gpu_array,
                                          self.momentum[i])
            # Bias-corrected first moment: mom_hat = momentum / (1 - beta1**t).
            gpu_op.matrix_elementwise_div_by_const(
                self.momentum[i], (1 - self.beta1**self.time), self.mom_hat[i])
            # Second moment: velocity = beta2 * velocity + (1 - beta2) * grad**2.
            gpu_op.matrix_elementwise_multiply_by_const(
                self.velocity[i], self.beta2, self.velocity[i])
            gpu_op.matrix_elementwise_multiply(exe_output[i + 1], exe_output[i + 1],
                                               exe_output[i + 1])
            gpu_op.matrix_elementwise_multiply_by_const(
                exe_output[i + 1], (1 - self.beta2), exe_output[i + 1])
            gpu_op.matrix_elementwise_add(self.velocity[i], exe_output[i + 1],
                                          self.velocity[i])
            # Bias-corrected second moment: vec_hat = velocity / (1 - beta2**t).
            gpu_op.matrix_elementwise_div_by_const(
                self.velocity[i], (1 - self.beta2**self.time), self.vec_hat[i])
        for i in range(len(self.params)):
            # Parameter update: param += -lr * mom_hat / (sqrt(vec_hat) + eps).
            gpu_op.matrix_elementwise_sqrt(self.vec_hat[i], self.vec_hat[i])
            gpu_op.matrix_elementwise_add_by_const(self.vec_hat[i], self.eps,
                                                   self.vec_hat[i])
            gpu_op.matrix_elementwise_multiply_by_const(
                self.mom_hat[i], -1 * self.lr, self.mom_hat[i])
            gpu_op.matrix_elementwise_division(self.mom_hat[i], self.vec_hat[i],
                                               self.mom_hat[i])
            gpu_op.matrix_elementwise_add(self.params[i].const, self.mom_hat[i],
                                          self.params[i].const)
    else:
        vec_hat = [np.zeros_like(param.const) for param in self.params]
        mom_hat = [np.zeros_like(param.const) for param in self.params]
        for i in range(len(self.params)):
            # exe_output[0] is the cost; exe_output[i + 1] is the gradient
            # of the i-th parameter.
            self.momentum[i] = self.beta1 * self.momentum[i] + (
                1 - self.beta1) * exe_output[i + 1]
            mom_hat[i] = self.momentum[i] / (1 - self.beta1**self.time)
            self.velocity[i] = self.beta2 * self.velocity[i] + (
                1 - self.beta2) * (exe_output[i + 1]**2)
            vec_hat[i] = self.velocity[i] / (1 - self.beta2**self.time)
        for i in range(len(self.params)):
            self.params[i].const += -self.lr * mom_hat[i] / (
                np.sqrt(vec_hat[i]) + self.eps)
    cost = exe_output[0]
    if self.use_gpu:
        cost = cost.asnumpy()
    return cost
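# For reference, the kernel sequence above implements the standard Adam update,
# with g_t the gradient of a parameter at step t:
#
#     m_t   = beta1 * m_{t-1} + (1 - beta1) * g_t          # first moment
#     v_t   = beta2 * v_{t-1} + (1 - beta2) * g_t**2       # second moment
#     m_hat = m_t / (1 - beta1**t)                         # bias correction
#     v_hat = v_t / (1 - beta2**t)
#     param -= lr * m_hat / (sqrt(v_hat) + eps)
#
# Typical driver loop (a sketch; the enclosing class name `Adam` and the data
# names are assumptions, not taken from this file):
#
#     optimizer = Adam(cost, params, lr=1e-3, use_gpu=True)
#     for batch in batches:
#         loss = optimizer.step(feed_dict={X: batch_images, y: batch_labels})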