def test_row_wise_sparse_adagrad_empty(self, inputs, lr, epsilon,
                                        data_strategy, gc, dc):
    param = inputs[0]
    lr = np.array([lr], dtype=np.float32)

    # Create a 1D row-wise moment tensor matching the first dimension of param.
    momentum = data_strategy.draw(
        hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0],
                    elements=hu.elements_of_type(dtype=np.float32))
    )
    momentum = np.abs(momentum)

    # Empty gradient and index tensors: the operator should be a no-op.
    grad = np.empty(shape=(0,) + param.shape[1:], dtype=np.float32)
    indices = np.empty(shape=(0,), dtype=np.int64)

    hypothesis.note('indices.shape: %s' % str(indices.shape))

    op = core.CreateOperator(
        "RowWiseSparseAdagrad",
        ["param", "momentum", "indices", "grad", "lr"],
        ["param", "momentum"],
        epsilon=epsilon,
        device_option=gc)

    def ref_row_wise_sparse(param, momentum, indices, grad, lr):
        # With no indices and no gradients, param and momentum are unchanged.
        param_out = np.copy(param)
        momentum_out = np.copy(momentum)
        return (param_out, momentum_out)

    self.assertReferenceChecks(
        gc, op,
        [param, momentum, indices, grad, lr],
        ref_row_wise_sparse)
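# NOTE: the tests in this section rely on a hypothesis @given decorator to
# supply `inputs`, the learning-rate and epsilon values, `data_strategy`,
# `gc`, and `dc`. The decorators are not shown in this excerpt; a minimal
# sketch of what such a parameterization could look like, assuming the
# strategies exported by caffe2.python.hypothesis_test_util (the exact
# bounds in the real test file may differ):
#
#     from hypothesis import given
#     import hypothesis.strategies as st
#     import caffe2.python.hypothesis_test_util as hu
#
#     @given(inputs=hu.tensors(n=2),
#            lr=st.floats(min_value=0.01, max_value=0.99,
#                         allow_nan=False, allow_infinity=False),
#            epsilon=st.floats(min_value=0.01, max_value=0.99,
#                              allow_nan=False, allow_infinity=False),
#            data_strategy=st.data(),
#            **hu.gcs)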
def test_row_wise_sparse_adagrad(self, inputs, lr, epsilon,
                                 data_strategy, gc, dc):
    param, grad = inputs
    lr = np.array([lr], dtype=np.float32)

    # Create a 1D row-wise average sum of squared gradients tensor.
    momentum = data_strategy.draw(
        hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0],
                    elements=hu.elements_of_type(dtype=np.float32))
    )
    momentum = np.abs(momentum)

    # Create an indexing array containing values which index into grad
    indices = data_strategy.draw(
        hu.tensor(dtype=np.int64,
                  elements=st.sampled_from(np.arange(grad.shape[0]))),
    )

    # Note that unlike SparseAdagrad, RowWiseSparseAdagrad uses a moment
    # tensor that is strictly 1-dimensional and equal in length to the
    # first dimension of the parameters, so indices must also be
    # 1-dimensional.
    indices = indices.flatten()

    hypothesis.note('indices.shape: %s' % str(indices.shape))

    # The indices must be unique
    hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices)))

    # Sparsify grad
    grad = grad[indices]

    op = core.CreateOperator(
        "RowWiseSparseAdagrad",
        ["param", "momentum", "indices", "grad", "lr"],
        ["param", "momentum"],
        epsilon=epsilon,
        device_option=gc)

    def ref_row_wise_sparse(param, momentum, indices, grad, lr):
        param_out = np.copy(param)
        momentum_out = np.copy(momentum)
        for i, index in enumerate(indices):
            param_out[index], momentum_out[index] = self.ref_row_wise_adagrad(
                param[index], momentum[index], grad[i], lr, epsilon)
        return (param_out, momentum_out)

    self.assertReferenceChecks(
        gc, op,
        [param, momentum, indices, grad, lr],
        ref_row_wise_sparse)
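# The test above delegates the per-row update to self.ref_row_wise_adagrad,
# which is not shown in this excerpt. A minimal sketch of that reference,
# assuming the usual row-wise Adagrad rule (one scalar moment per row,
# accumulated with the mean of the squared gradient, and a gradient that is
# added to the parameter as in the dense Adagrad reference):

def ref_row_wise_adagrad(self, param, momentum, grad, lr, epsilon):
    # Accumulate the mean squared gradient for this row into its scalar moment.
    momentum_out = momentum + np.mean(np.square(grad))
    # Apply the effective learning rate lr / (sqrt(moment) + epsilon).
    param_out = param + lr * grad / (np.sqrt(momentum_out) + epsilon)
    return (param_out, momentum_out)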
def test_row_wise_sparse_adam(self, inputs, ITER, LR, beta1, beta2,
                              epsilon, data_strategy, gc, dc):
    param, mom1, grad = inputs
    ITER = np.array([ITER], dtype=np.int64)
    LR = np.array([LR], dtype=np.float32)

    # Create a 1D row-wise average 2nd moment tensor.
    mom2 = data_strategy.draw(
        hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0],
                    elements=hu.elements_of_type(dtype=np.float32))
    )
    mom2 = np.absolute(mom2)

    # Create an indexing array containing values which index into grad
    indices = data_strategy.draw(
        hu.tensor(
            max_dim=1,
            min_value=1,
            max_value=grad.shape[0],
            dtype=np.int64,
            elements=st.sampled_from(np.arange(grad.shape[0])),
        ),
    )

    # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment
    # tensor that is strictly 1-dimensional and equal in length to the
    # first dimension of the parameters, so indices must also be
    # 1-dimensional.
    indices = indices.flatten()

    hypothesis.note('indices.shape: %s' % str(indices.shape))

    # Verify that the generated indices are unique
    hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices)))

    # Sparsify grad
    grad = grad[indices]

    op = core.CreateOperator(
        "RowWiseSparseAdam",
        ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
        ["param", "mom1", "mom2"],
        beta1=beta1, beta2=beta2, epsilon=epsilon)

    def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER):
        param_out = np.copy(param)
        mom1_out = np.copy(mom1)
        mom2_out = np.copy(mom2)
        for i, index in enumerate(indices):
            param_out[index], mom1_out[index], mom2_out[index] = \
                self.ref_row_wise_adam(param[index], mom1[index], mom2[index],
                                       grad[i], LR, ITER,
                                       beta1, beta2, epsilon)
        return (param_out, mom1_out, mom2_out)

    # Iter lives on the CPU
    input_device_options = {'iter': hu.cpu_do}
    self.assertReferenceChecks(
        gc, op,
        [param, mom1, mom2, indices, grad, LR, ITER],
        ref_row_wise_sparse,
        input_device_options=input_device_options)
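# Likewise, self.ref_row_wise_adam is not shown in this excerpt. A minimal
# sketch, assuming the bias-corrected Adam update with a single scalar
# second moment per row (updated with the mean of the squared gradient),
# mirroring the dense Adam reference:

def ref_row_wise_adam(self, param, mom1, mom2, grad, LR, ITER,
                      beta1, beta2, epsilon):
    t = ITER + 1
    # Bias-corrected step size for iteration t.
    corrected_local_rate = LR * np.sqrt(1.0 - np.power(beta2, t)) / \
        (1.0 - np.power(beta1, t))
    # First moment is per-element; second moment is one scalar per row.
    mom1_out = (beta1 * mom1) + (1.0 - beta1) * grad
    mom2_out = (beta2 * mom2) + (1.0 - beta2) * np.mean(np.square(grad))
    param_out = param + corrected_local_rate * mom1_out / \
        (np.sqrt(mom2_out) + epsilon)
    return (param_out, mom1_out, mom2_out)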