def matmul(a: Variable, b: Variable) -> Variable:
    "Matrix multiplication."
    value = np.matmul(a.array, b.array)
    a_, b_ = enable_broadcast(a, b, matmul=True)
    local_gradients = [
        # dL/dA = dL/dC @ B^T (swap the last two axes of B).
        (a_, lambda path_value: np.matmul(path_value, np.swapaxes(b.array, -2, -1))),
        # dL/dB = A^T @ dL/dC (swap the last two axes of A).
        (b_, lambda path_value: np.matmul(np.swapaxes(a.array, -2, -1), path_value)),
    ]
    return Variable(value, local_gradients)
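# A minimal sketch (not part of the library) exercising matmul's local
# gradients directly; it assumes Variable stores its (parent, gradient_fn)
# pairs on `.local_gradients` and accepts an empty list for leaf nodes.
def _matmul_backward_sketch():
    A, B = np.ones((2, 3)), np.ones((3, 4))
    c = matmul(Variable(A, []), Variable(B, []))
    G = np.ones_like(c.array)  # upstream gradient dL/dC, shape (2, 4)
    (_, grad_a), (_, grad_b) = c.local_gradients
    assert grad_a(G).shape == A.shape  # G @ B^T -> (2, 3)
    assert grad_b(G).shape == B.shape  # A^T @ G -> (3, 4)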
def maxax(a: Variable, axis: int) -> Variable:
    "Reduce an axis, `axis`, to its max value."
    # Note: the implementation is more complicated than it would otherwise be,
    # because CuPy doesn't have put_along_axis.
    axis = axis if axis >= 0 else a.ndim + axis
    # Move `axis` to the end and flatten the leading axes,
    # so the max is taken along the last axis.
    value = np.swapaxes(a.array, axis, -1)
    value = value.reshape([-1, value.shape[-1]])
    flatshape = value.shape
    idx = np.argmax(value, axis=-1)
    value = np.take_along_axis(value, idx[..., np.newaxis], -1)
    value = value.reshape(
        tuple(1 if i == axis else v for i, v in enumerate(a.shape)))

    def multiply_by_locgrad(path_value):
        # Build a one-hot mask at the argmax positions, then undo the
        # reshape/swap so it lines up with `a`; gradient flows only to the max.
        result = np.zeros(flatshape)
        result[np.arange(result.shape[0]), idx] = 1
        swapped_shape = list(a.shape)
        swapped_shape[axis], swapped_shape[-1] = swapped_shape[-1], swapped_shape[axis]
        result = result.reshape(swapped_shape)
        result = np.swapaxes(result, axis, -1)
        return path_value * result

    local_gradients = ((a, multiply_by_locgrad),)
    return Variable(value, local_gradients)
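# A hedged sketch (not library code): with the one-hot mask built in
# multiply_by_locgrad, the upstream gradient is routed only to the argmax
# positions along `axis`; assumes the same Variable conventions as above.
def _maxax_backward_sketch():
    x = np.array([[1.0, 5.0, 3.0]])
    m = maxax(Variable(x, []), axis=1)  # m.array == [[5.0]]
    (_, grad_fn), = m.local_gradients
    g = grad_fn(np.ones_like(m.array))  # upstream gradient of ones
    assert np.array_equal(g, np.array([[0.0, 1.0, 0.0]]))  # only the max gets gradient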
def matrix_transpose(a: Variable) -> Variable:
    "Swap the end two axes."
    value = np.swapaxes(a.array, -2, -1)
    local_gradients = [(a, lambda path_value: np.swapaxes(path_value, -2, -1))]
    return Variable(value, local_gradients)
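# A small sketch (assumption, as above): the local gradient of a transpose is
# just the transpose of the upstream gradient, restoring the input's shape.
def _matrix_transpose_sketch():
    x = np.arange(6.0).reshape(2, 3)
    t = matrix_transpose(Variable(x, []))
    assert t.array.shape == (3, 2)
    (_, grad_fn), = t.local_gradients
    assert grad_fn(np.ones((3, 2))).shape == (2, 3)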