# Shared imports assumed for the test snippets below; the original Caffe2
# test modules each import their own subset of these.
import functools
import os
import unittest

import hypothesis
import hypothesis.strategies as st
import numpy as np
from hypothesis import HealthCheck, given, settings

import caffe2.python.hypothesis_test_util as hu
import caffe2.python.serialized_test.serialized_test_util as serial
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.core import CreatePythonOperator
from caffe2.python.fakelowp.test_utils import print_test_debug_info
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.operator_test.adagrad_test_helper import (
    adagrad_sparse_test_helper,
    ref_adagrad,
)


class DistanceTest(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2, min_dim=1, max_dim=4, dtype=np.float32),
           **hu.gcs)
    def test_L1_distance(self, inputs, gc, dc):
        X, Y = inputs
        # avoid kinks by moving away from 0
        X += 0.02 * np.sign(X - Y)
        X[(X - Y) == 0.0] += 0.02
        self.ws.create_blob("X").feed(X)
        self.ws.create_blob("Y").feed(Y)
        op = core.CreateOperator(
            'L1Distance',
            ['X', 'Y'],
            ['l1_dist'],
        )
        self.ws.run(op)
        np.testing.assert_allclose(self.ws.blobs["l1_dist"].fetch(),
                                   np.linalg.norm((X - Y).flatten(), ord=1),
                                   rtol=1e-4, atol=1e-4)
        self.assertDeviceChecks(dc, op, [X, Y], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc, op, [X, Y], 0, [0],
                                  stepsize=1e-2, threshold=1e-2)
        # Gradient check wrt Y
        self.assertGradientChecks(gc, op, [X, Y], 1, [0],
                                  stepsize=1e-2, threshold=1e-2)

    @given(inputs=hu.tensors(n=2, min_dim=1, max_dim=2, dtype=np.float32),
           **hu.gcs)
    def test_dot_product(self, inputs, gc, dc):
        X, Y = inputs
        op = core.CreateOperator(
            'DotProduct',
            ['X', 'Y'],
            ['DOT'],
        )

        def dot_ref(X, Y):
            return ([np.dot(x, y) for x, y in zip(X, Y)], )

        # Check against numpy dot reference
        self.assertReferenceChecks(gc, op, [X, Y], dot_ref)
        # Check over multiple devices
        self.assertDeviceChecks(dc, op, [X, Y], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc, op, [X, Y], 0, [0])
        # Gradient check wrt Y
        self.assertGradientChecks(gc, op, [X, Y], 1, [0])
class TestSparseNormalize(hu.HypothesisTestCase):
    @staticmethod
    def ref_normalize(param_in, use_max_norm, norm):
        param_norm = np.linalg.norm(param_in) + 1e-12
        if (use_max_norm and param_norm > norm) or not use_max_norm:
            param_in = param_in * norm / param_norm
        return param_in

    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much])
    @given(inputs=hu.tensors(n=2, min_dim=2, max_dim=2),
           use_max_norm=st.booleans(),
           norm=st.floats(min_value=1.0, max_value=4.0),
           data_strategy=st.data(),
           **hu.gcs_cpu_only)
    def test_sparse_normalize(self, inputs, use_max_norm, norm,
                              data_strategy, gc, dc):
        param, grad = inputs
        param += 0.02 * np.sign(param)
        param[param == 0.0] += 0.02

        # Create an indexing array containing values that are lists of
        # indices, which index into grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64, min_dim=1, max_dim=1,
                      elements=st.sampled_from(np.arange(grad.shape[0]))),
        )
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # For now, the indices must be unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseNormalize",
            ["param", "indices", "grad"],
            ["param"],
            use_max_norm=use_max_norm,
            norm=norm,
        )

        def ref_sparse_normalize(param, indices, grad):
            param_out = np.copy(param)
            for index in indices:
                param_out[index] = self.ref_normalize(
                    param[index],
                    use_max_norm,
                    norm,
                )
            return (param_out, )

        # self.assertDeviceChecks(dc, op, [param, indices, grad], [0])
        self.assertReferenceChecks(gc, op, [param, indices, grad],
                                   ref_sparse_normalize)
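# A minimal NumPy sketch (independent of Caffe2) of the per-row max-norm
# projection that ref_normalize above implements: with use_max_norm a row is
# rescaled only if its L2 norm exceeds `norm`; otherwise every touched row is
# rescaled to have exactly that norm. Names here are illustrative only.
import numpy as np


def project_rows(param, indices, norm=1.0, use_max_norm=True):
    out = param.copy()
    for i in indices:
        row_norm = np.linalg.norm(out[i]) + 1e-12
        if not use_max_norm or row_norm > norm:
            out[i] = out[i] * norm / row_norm
    return out


rows = np.array([[3.0, 4.0], [0.3, 0.4]])
print(project_rows(rows, [0, 1], norm=1.0, use_max_norm=True))
# row 0 (norm 5.0) is scaled down to unit norm; row 1 (norm 0.5) is untouched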
class TestWeightScale(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=1),
           ITER=st.integers(min_value=0, max_value=100),
           stepsize=st.integers(min_value=20, max_value=50),
           upper_bound_iter=st.integers(min_value=5, max_value=100),
           scale=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_weight_scale(self, inputs, ITER, stepsize, upper_bound_iter,
                          scale, gc, dc):
        ITER = np.array([ITER], dtype=np.int64)
        op = core.CreateOperator(
            "WeightScale",
            ["w", "iter"],
            ["nw"],
            stepsize=stepsize,
            upper_bound_iter=upper_bound_iter,
            scale=scale)

        def ref_weight_scale(w, iter, stepsize, upper_bound_iter, scale):
            iter = iter + 1
            return [w * scale
                    if iter % stepsize == 0 and iter < upper_bound_iter
                    else w]

        self.assertReferenceChecks(
            gc, op, [inputs[0], ITER],
            functools.partial(ref_weight_scale,
                              stepsize=stepsize,
                              upper_bound_iter=upper_bound_iter,
                              scale=scale))
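# The reference above multiplies the weights by `scale` only on iterations
# where (ITER + 1) is a multiple of `stepsize` and still below
# `upper_bound_iter`; otherwise the weights pass through unchanged. A quick
# standalone illustration of that schedule:
import numpy as np


def weight_scale(w, it, stepsize, upper_bound_iter, scale):
    it = it + 1
    return w * scale if it % stepsize == 0 and it < upper_bound_iter else w


w = np.ones(3, dtype=np.float32)
print(weight_scale(w, 19, stepsize=20, upper_bound_iter=100, scale=0.5))  # scaled
print(weight_scale(w, 20, stepsize=20, upper_bound_iter=100, scale=0.5))  # unchanged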
kEpsilon = 1e-8  # assumed value; defined at module level in the original test


class ReluTest(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=1, min_dim=1, max_dim=3, dtype=np.float32),
           **hu.gcs)
    def test_relu(self, inputs, gc, dc):
        X = inputs[0]
        # First dimension is the batch size
        print(X.shape)

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(core.CreateOperator("Relu", ["X"], ["Y"]))

        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "ref"
        pred_net_ref.external_input.extend(["X"])
        pred_net_ref.external_output.append("Y_ref")
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(
                "ReluFakeFp16",
                ["X"],
                ["Y_ref"],
            ))

        shape_hints = {"X": X.shape}
        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                                shape_hints,
                                                debug=True,
                                                adjust_batch=True,
                                                use_onnx=False)
        print(pred_net_onnxified)
        num_onnxified_ops = sum(
            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.SwitchWorkspace("glow_test_ws", True)
        workspace.FeedBlob("X", X)

        workspace.CreateNet(pred_net_ref)
        workspace.CreateNet(pred_net_onnxified)
        workspace.FeedBlob("X", X)

        # Run caffe2 net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob("Y_ref")

        # Run Glow net
        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        # Results should be identical since we are comparing with the C2
        # emulation
        if not np.allclose(Y_c2, Y_glow):
            diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
            print_test_debug_info("Relu", {
                "X": X,
                "Y_glow": Y_glow,
                "Y_c2": Y_c2,
                "diff": diff
            })
            assert 0
class LpnormTest(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=1, min_dim=1, max_dim=3, dtype=np.float32),
           **hu.gcs_cpu_only)
    def test_Lp_Norm(self, inputs, gc, dc):
        X = inputs[0]
        # avoid kinks by moving away from 0
        X += 0.02 * np.sign(X)
        X[X == 0.0] += 0.02
        self.ws.create_blob("X").feed(X)
        op = core.CreateOperator(
            'LpNorm',
            ['X'],
            ['l1_norm'],
            p=1,
        )
        self.ws.run(op)

        np.testing.assert_allclose(self.ws.blobs["l1_norm"].fetch(),
                                   np.linalg.norm(X.flatten(), ord=1),
                                   rtol=1e-4, atol=1e-4)

        self.assertDeviceChecks(dc, op, [X], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc, op, [X], 0, [0],
                                  stepsize=1e-2, threshold=1e-2)

        op = core.CreateOperator(
            'LpNorm',
            ['X'],
            ['l2_norm'],
            p=2,
        )
        self.ws.run(op)

        # Note: with p=2 the operator returns the sum of squares,
        # i.e. the *squared* L2 norm, hence the ** 2 below.
        np.testing.assert_allclose(self.ws.blobs["l2_norm"].fetch(),
                                   np.linalg.norm(X.flatten(), ord=2) ** 2,
                                   rtol=1e-4, atol=1e-4)

        self.assertDeviceChecks(dc, op, [X], [0])
        # Gradient check wrt X
        self.assertGradientChecks(gc, op, [X], 0, [0],
                                  stepsize=1e-2, threshold=1e-2)
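# A quick standalone illustration of the two quantities the test above
# compares against: the L1 norm for p=1, and the sum of squares (squared
# L2 norm) for p=2.
import numpy as np

X = np.array([1.0, -2.0, 2.0], dtype=np.float32)
l1 = np.abs(X).sum()         # == np.linalg.norm(X, ord=1) == 5.0
sum_sq = np.square(X).sum()  # == np.linalg.norm(X, ord=2) ** 2 == 9.0
print(l1, sum_sq)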
class TestAdam(hu.HypothesisTestCase):
    @staticmethod
    def ref_adam(param, mom1, mom2, grad, LR, ITER, beta1, beta2, epsilon):
        t = ITER + 1
        corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \
            (1 - np.power(beta1, t))
        mom1_out = (beta1 * mom1) + (1 - beta1) * grad
        mom2_out = (beta2 * mom2) + (1 - beta2) * np.square(grad)
        param_out = param + corrected_local_rate * mom1_out / \
            (np.sqrt(mom2_out) + epsilon)
        return param_out, mom1_out, mom2_out

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc):
        param, mom1, mom2, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)
        op = core.CreateOperator(
            "Adam",
            ["param", "mom1", "mom2", "grad", "lr", "iter"],
            ["output_param", "output_mom1", "output_mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, grad, LR, ITER],
            functools.partial(
                self.ref_adam,
                beta1=beta1, beta2=beta2, epsilon=epsilon),
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    def test_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon,
                         gc, dc):
        param, mom1, mom2, grad = inputs
        mom1 = np.absolute(mom1)
        mom2 = np.absolute(mom2)
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        indices = np.arange(grad.shape[0])
        indices = indices[indices % 2 == 0]
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index] = \
                    self.ref_adam(param[index], mom1[index], mom2[index],
                                  grad[i], LR, ITER,
                                  beta1, beta2, epsilon)
            return (param_out, mom1_out, mom2_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            ref_sparse,
            input_device_options=input_device_options)
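# The `corrected_local_rate` in ref_adam folds Adam's bias correction for
# both moment estimates into the learning rate:
#     lr_t = lr * sqrt(1 - beta2**t) / (1 - beta1**t)
# A tiny standalone check that one step of this folded form matches the
# textbook formulation m_hat = m/(1-beta1**t), v_hat = v/(1-beta2**t); the
# two differ only in where epsilon enters the denominator.
import numpy as np

lr, beta1, beta2, eps, t = 0.1, 0.9, 0.999, 1e-8, 1
g = np.array([0.5, -0.25])
m = (1 - beta1) * g                 # first step from zero moments
v = (1 - beta2) * np.square(g)
m_hat = m / (1 - beta1 ** t)
v_hat = v / (1 - beta2 ** t)
step_textbook = lr * m_hat / (np.sqrt(v_hat) + eps)
lr_t = lr * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
step_folded = lr_t * m / (np.sqrt(v) + eps)
print(np.allclose(step_textbook, step_folded))  # True up to eps placement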
class TestAdam(hu.HypothesisTestCase):
    @staticmethod
    def ref_adam(param, mom1, mom2, grad, LR, ITER, beta1, beta2, epsilon):
        t = ITER + 1
        corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \
            (1 - np.power(beta1, t))
        mom1_out = (beta1 * mom1) + (1 - beta1) * grad
        mom2_out = (beta2 * mom2) + (1 - beta2) * np.square(grad)
        param_out = param + corrected_local_rate * mom1_out / \
            (np.sqrt(mom2_out) + epsilon)
        return param_out, mom1_out, mom2_out

    @staticmethod
    def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER,
                          beta1, beta2, epsilon):
        t = ITER + 1
        corrected_local_rate = LR * np.sqrt(1 - np.power(beta2, t)) / \
            (1 - np.power(beta1, t))
        mom1_out = (beta1 * mom1) + (1 - beta1) * grad
        mom2_out = (beta2 * mom2) + (1 - beta2) * np.mean(np.square(grad))
        param_out = param + corrected_local_rate * mom1_out / \
            (np.sqrt(mom2_out) + epsilon)
        return (param_out, mom1_out, mom2_out)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc):
        param, mom1, mom2, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)
        op = core.CreateOperator(
            "Adam",
            ["param", "mom1", "mom2", "grad", "lr", "iter"],
            ["output_param", "output_mom1", "output_mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, grad, LR, ITER],
            functools.partial(self.ref_adam,
                              beta1=beta1, beta2=beta2, epsilon=epsilon),
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=4),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon,
                         data_strategy, gc, dc):
        param, mom1, mom2, grad = inputs
        mom2 = np.absolute(mom2)
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ),
        )

        # Verify that the generated indices are unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index] = \
                    self.ref_adam(param[index], mom1[index], mom2[index],
                                  grad[i], LR, ITER,
                                  beta1, beta2, epsilon)
            return (param_out, mom1_out, mom2_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            ref_sparse,
            input_device_options=input_device_options)

    @given(inputs=hu.tensors(n=3),
           ITER=st.integers(min_value=0, max_value=10000),
           LR=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           beta1=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           beta2=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs_cpu_only)
    def test_row_wise_sparse_adam(self, inputs, ITER, LR, beta1, beta2,
                                  epsilon, data_strategy, gc, dc):
        param, mom1, grad = inputs
        ITER = np.array([ITER], dtype=np.int64)
        LR = np.array([LR], dtype=np.float32)

        # Create a 1D row-wise average 2nd moment tensor.
        mom2 = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        mom2 = np.absolute(mom2)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ),
        )

        # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment
        # tensor that is strictly 1-dimensional and equal in length to the
        # first dimension of the parameters, so indices must also be
        # 1-dimensional.
        indices = indices.flatten()
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # Verify that the generated indices are unique
        hypothesis.assume(np.array_equal(np.unique(indices),
                                         np.sort(indices)))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "RowWiseSparseAdam",
            ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"],
            ["param", "mom1", "mom2"],
            beta1=beta1, beta2=beta2, epsilon=epsilon)

        def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER):
            param_out = np.copy(param)
            mom1_out = np.copy(mom1)
            mom2_out = np.copy(mom2)
            for i, index in enumerate(indices):
                param_out[index], mom1_out[index], mom2_out[index] = \
                    self.ref_row_wise_adam(param[index], mom1[index],
                                           mom2[index], grad[i], LR, ITER,
                                           beta1, beta2, epsilon)
            return (param_out, mom1_out, mom2_out)

        # Iter lives on the CPU
        input_device_options = {'iter': hu.cpu_do}

        self.assertReferenceChecks(
            gc, op,
            [param, mom1, mom2, indices, grad, LR, ITER],
            ref_row_wise_sparse,
            input_device_options=input_device_options)
class TestAdagrad(hu.HypothesisTestCase):
    @staticmethod
    def ref_adagrad(param_in, mom_in, grad, lr, epsilon):
        mom_out = mom_in + np.square(grad)
        grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon)
        param_out = param_in + grad_adj
        return (param_out, mom_out)

    @staticmethod
    def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon):
        mom_out = mom_in + np.mean(np.square(grad))
        grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon)
        param_out = param_in + grad_adj
        return (param_out, mom_out)

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    def test_adagrad(self, inputs, lr, epsilon, gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)
        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc,
        )
        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(self.ref_adagrad, epsilon=epsilon))

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adagrad(self, inputs, lr, epsilon, data_strategy, gc, dc):
        param, momentum, grad = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)

        # Create an indexing array containing values that are lists of
        # indices, which index into grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      elements=st.sampled_from(np.arange(grad.shape[0]))),
        )
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # For now, the indices must be unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "SparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            for i, index in enumerate(indices):
                param_out[index], momentum_out[index] = self.ref_adagrad(
                    param[index], momentum[index], grad[i], lr, epsilon)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_sparse)

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_adagrad_empty(self, inputs, lr, epsilon, data_strategy,
                                  gc, dc):
        param, momentum = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)
        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "SparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_sparse)

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_row_wise_sparse_adagrad(self, inputs, lr, epsilon,
                                     data_strategy, gc, dc):
        param, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        # Create a 1D row-wise average sum of squared gradients tensor.
        momentum = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        momentum = np.abs(momentum)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      elements=st.sampled_from(np.arange(grad.shape[0]))),
        )

        # Note that unlike SparseAdagrad, RowWiseSparseAdagrad uses a moment
        # tensor that is strictly 1-dimensional and equal in length to the
        # first dimension of the parameters, so indices must also be
        # 1-dimensional.
        indices = indices.flatten()
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # The indices must be unique
        hypothesis.assume(np.array_equal(np.unique(indices),
                                         np.sort(indices)))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "RowWiseSparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_row_wise_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            for i, index in enumerate(indices):
                param_out[index], momentum_out[index] = \
                    self.ref_row_wise_adagrad(param[index], momentum[index],
                                              grad[i], lr, epsilon)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_row_wise_sparse)

    @given(inputs=hu.tensors(n=1),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_row_wise_sparse_adagrad_empty(self, inputs, lr, epsilon,
                                           data_strategy, gc, dc):
        param = inputs[0]
        lr = np.array([lr], dtype=np.float32)
        momentum = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        momentum = np.abs(momentum)
        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "RowWiseSparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_row_wise_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_row_wise_sparse)
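# Row-wise Adagrad keeps a single accumulator per row (the mean of squared
# gradients over the row) instead of one per element. A small standalone
# sketch of the difference, mirroring ref_adagrad / ref_row_wise_adagrad:
import numpy as np


def adagrad_step(param, mom, grad, lr, eps, row_wise=False):
    mom = mom + (np.mean(np.square(grad)) if row_wise else np.square(grad))
    return param + lr * grad / (np.sqrt(mom) + eps), mom


g = np.array([0.1, 0.2, 0.4])
print(adagrad_step(np.zeros(3), np.zeros(3), g, 0.1, 1e-2)[1])
# -> [0.01 0.04 0.16]  per-element accumulator
print(adagrad_step(np.zeros(3), 0.0, g, 0.1, 1e-2, row_wise=True)[1])
# -> 0.07              one scalar (the mean) per row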
class TestMomentumSGD(serial.SerializedTestCase):
    @serial.given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs)
    def test_momentum_sgd(self, n, nesterov, gc, dc):
        param = np.random.rand(n).astype(np.float32)
        grad = np.random.rand(n).astype(np.float32)
        lr = np.random.rand(1).astype(np.float32)
        param_momentum = np.random.rand(n).astype(np.float32)
        momentum = 0.9

        def momentum_sgd(grad, param_momentum, lr, param=None):
            if not nesterov:
                adjusted_gradient = lr * grad + momentum * param_momentum
                if param is None:
                    return [adjusted_gradient, adjusted_gradient]
                else:
                    paramup = param - adjusted_gradient
                    return [adjusted_gradient, adjusted_gradient, paramup]
            else:
                m_new = momentum * param_momentum + lr * grad
                grad_new = (1 + momentum) * m_new - momentum * param_momentum
                if param is None:
                    return [grad_new, m_new]
                else:
                    paramup = param - grad_new
                    return [grad_new, m_new, paramup]

        op = core.CreateOperator(
            "MomentumSGDUpdate",
            ["grad", "param_momentum", "lr", "param"],
            ["grad", "param_momentum", "param"],
            momentum=momentum,
            nesterov=int(nesterov),
        )
        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[grad, param_momentum, lr, param],
            reference=momentum_sgd)

        op_noparam = core.CreateOperator(
            "MomentumSGD",
            ["grad", "param_momentum", "lr"],
            ["grad", "param_momentum"],
            momentum=momentum,
            nesterov=int(nesterov),
        )
        self.assertReferenceChecks(
            device_option=gc,
            op=op_noparam,
            inputs=[grad, param_momentum, lr],
            reference=momentum_sgd)

    @serial.given(inputs=hu.tensors(n=3),
                  momentum=st.floats(min_value=0.1, max_value=0.9),
                  nesterov=st.booleans(),
                  lr=st.floats(min_value=0.1, max_value=0.9),
                  data_strategy=st.data(),
                  **hu.gcs)
    def test_sparse_momentum_sgd(self, inputs, momentum, nesterov, lr,
                                 data_strategy, gc, dc):
        w, grad, m = inputs

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(
                max_dim=1,
                min_value=1,
                max_value=grad.shape[0],
                dtype=np.int64,
                elements=st.sampled_from(np.arange(grad.shape[0])),
            ),
        )

        # Verify that the generated indices are unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        # Make momentum >= 0
        m = np.abs(m)

        # Convert lr to a numpy array
        lr = np.asarray([lr], dtype=np.float32)

        op = core.CreateOperator(
            "SparseMomentumSGDUpdate",
            ["grad", "m", "lr", "param", "indices"],
            ["adjusted_grad", "m", "param"],
            momentum=momentum,
            nesterov=int(nesterov),
            device_option=gc)

        # Reference
        def momentum_sgd(grad, m, lr):
            lr = lr[0]
            if not nesterov:
                adjusted_gradient = lr * grad + momentum * m
                return (adjusted_gradient, adjusted_gradient)
            else:
                m_new = momentum * m + lr * grad
                return ((1 + momentum) * m_new - momentum * m, m_new)

        def sparse(grad, m, lr, param, i):
            grad_new, m_new = momentum_sgd(grad, m[i], lr)
            m[i] = m_new
            param[i] -= grad_new
            return (grad_new, m, param)

        self.assertReferenceChecks(gc, op, [grad, m, lr, w, indices], sparse)

    @given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs_gpu_only)
    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
    def test_fp16momentum_sgd(self, n, nesterov, gc, dc):
        gpuvers = workspace.GetDeviceProperties(0)["major"]
        if gpuvers < 6:
            print("No FP16 support because major version {} < 6".format(
                gpuvers))
            return

        param = np.random.rand(n).astype(np.float16)
        grad = np.random.rand(n).astype(np.float16)
        lr = np.random.rand(1).astype(np.float32)
        param_momentum = np.random.rand(n).astype(np.float16)
        momentum = 0.9
        nesterov = True

        def momentum_sgd(grad, param_momentum, lr, param=None):
            if not nesterov:
                adjusted_gradient = lr * grad + momentum * param_momentum
                paramup = param - adjusted_gradient
                return [adjusted_gradient, adjusted_gradient, paramup]
            else:
                m_new = momentum * param_momentum + lr * grad
                grad_new = (1 + momentum) * m_new - momentum * param_momentum
                paramup = param - grad_new
                return [grad_new, m_new, paramup]

        op = core.CreateOperator(
            "FP16MomentumSGDUpdate",
            ["grad", "param_momentum", "lr", "param"],
            ["grad", "param_momentum", "param"],
            momentum=momentum,
            nesterov=int(nesterov),
            weight_decay=0.0,
        )
        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[grad, param_momentum, lr, param],
            reference=momentum_sgd)
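# The Nesterov branch above rewrites the NAG update so it can be applied to
# the parameters directly. A quick identity check, using the same symbols as
# the reference: (1 + mu) * m_new - mu * m == lr * grad + mu * m_new.
import numpy as np

mu, lr = 0.9, 0.05
m = np.array([0.1, -0.2])
grad = np.array([0.3, 0.7])
m_new = mu * m + lr * grad
lhs = (1 + mu) * m_new - mu * m
rhs = lr * grad + mu * m_new
print(np.allclose(lhs, rhs))  # True: the two forms are algebraically equal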
class TestAdagrad(serial.SerializedTestCase):
    @staticmethod
    def ref_row_wise_adagrad(param_in, mom_in, grad, lr, epsilon):
        mom_out = mom_in + np.mean(np.square(grad))
        grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon)
        param_out = param_in + grad_adj
        return (param_out, mom_out)

    @serial.given(inputs=hu.tensors(n=3),
                  lr=st.floats(min_value=0.01, max_value=0.99,
                               allow_nan=False, allow_infinity=False),
                  epsilon=st.floats(min_value=0.01, max_value=0.99,
                                    allow_nan=False, allow_infinity=False),
                  **hu.gcs)
    def test_adagrad(self, inputs, lr, epsilon, gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)
        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc,
        )
        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(ref_adagrad, epsilon=epsilon))

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_adagrad_output_effective_lr(self, inputs, lr, epsilon, gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)
        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum", "effective_lr"],
            epsilon=epsilon,
            device_option=gc,
        )
        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(ref_adagrad, epsilon=epsilon,
                              output_effective_lr=True))

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_adagrad_output_effective_lr_and_update(self, inputs, lr,
                                                    epsilon, gc, dc):
        param, momentum, grad = inputs
        lr = np.array([lr], dtype=np.float32)
        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum", "effective_lr", "update"],
            epsilon=epsilon,
            device_option=gc,
        )
        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(ref_adagrad, epsilon=epsilon,
                              output_effective_lr_and_update=True))

    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much])
    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    def test_sparse_adagrad(self, inputs, lr, epsilon, gc, dc):
        return adagrad_sparse_test_helper(self, inputs, lr, epsilon, None,
                                          ref_adagrad, gc, dc)

    @serial.given(inputs=hu.tensors(n=2),
                  lr=st.floats(min_value=0.01, max_value=0.99,
                               allow_nan=False, allow_infinity=False),
                  epsilon=st.floats(min_value=0.01, max_value=0.99,
                                    allow_nan=False, allow_infinity=False),
                  data_strategy=st.data(),
                  **hu.gcs)
    def test_sparse_adagrad_empty(self, inputs, lr, epsilon, data_strategy,
                                  gc, dc):
        param, momentum = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)
        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "SparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            return (param_out, momentum_out)

        ref_using_fp16_values = [False]
        if dc == hu.gpu_do:
            ref_using_fp16_values.append(True)

        for ref_using_fp16 in ref_using_fp16_values:
            if ref_using_fp16:
                print('test_sparse_adagrad_empty with half precision '
                      'embedding')
                momentum_i = momentum.astype(np.float16)
                param_i = param.astype(np.float16)
            else:
                print('test_sparse_adagrad_empty with full precision '
                      'embedding')
                momentum_i = momentum.astype(np.float32)
                param_i = param.astype(np.float32)

            self.assertReferenceChecks(
                gc, op, [param_i, momentum_i, indices, grad, lr], ref_sparse)

    @unittest.skipIf("IN_CIRCLECI" in os.environ,
                     "FIXME: flaky test in CircleCI")
    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much])
    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           data_strategy=st.data(),
           **hu.gcs)
    def test_row_wise_sparse_adagrad(self, inputs, lr, epsilon,
                                     data_strategy, gc, dc):
        param, grad = inputs
        lr = np.array([lr], dtype=np.float32)

        # Create a 1D row-wise average sum of squared gradients tensor.
        momentum = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        momentum = np.abs(momentum)

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      elements=st.sampled_from(np.arange(grad.shape[0]))),
        )

        # Note that unlike SparseAdagrad, RowWiseSparseAdagrad uses a moment
        # tensor that is strictly 1-dimensional and equal in length to the
        # first dimension of the parameters, so indices must also be
        # 1-dimensional.
        indices = indices.flatten()
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # The indices must be unique
        hypothesis.assume(np.array_equal(np.unique(indices),
                                         np.sort(indices)))

        # Sparsify grad
        grad = grad[indices]

        op = core.CreateOperator(
            "RowWiseSparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_row_wise_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            for i, index in enumerate(indices):
                param_out[index], momentum_out[index] = \
                    self.ref_row_wise_adagrad(param[index], momentum[index],
                                              grad[i], lr, epsilon)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_row_wise_sparse)

    @serial.given(inputs=hu.tensors(n=1),
                  lr=st.floats(min_value=0.01, max_value=0.99,
                               allow_nan=False, allow_infinity=False),
                  epsilon=st.floats(min_value=0.01, max_value=0.99,
                                    allow_nan=False, allow_infinity=False),
                  data_strategy=st.data(),
                  **hu.gcs)
    def test_row_wise_sparse_adagrad_empty(self, inputs, lr, epsilon,
                                           data_strategy, gc, dc):
        param = inputs[0]
        lr = np.array([lr], dtype=np.float32)
        momentum = data_strategy.draw(
            hu.tensor1d(min_len=param.shape[0],
                        max_len=param.shape[0],
                        elements=hu.elements_of_type(dtype=np.float32)))
        momentum = np.abs(momentum)
        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0, ), dtype=np.int64)
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "RowWiseSparseAdagrad",
            ["param", "momentum", "indices", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            device_option=gc)

        def ref_row_wise_sparse(param, momentum, indices, grad, lr):
            param_out = np.copy(param)
            momentum_out = np.copy(momentum)
            return (param_out, momentum_out)

        self.assertReferenceChecks(gc, op,
                                   [param, momentum, indices, grad, lr],
                                   ref_row_wise_sparse)
class TestATen(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2), **hu.gcs)
    def test_add(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["X", "Y"], ["Z"], operator="add")

        def ref(X, Y):
            return [X + Y]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=2, dtype=np.float16), **hu.gcs_gpu_only)
    def test_add_half(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["X", "Y"], ["Z"], operator="add")

        def ref(X, Y):
            return [X + Y]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_pow(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"],
                                 operator="pow", exponent=2.0)

        def ref(X):
            return [np.square(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
    def test_sort(self, x, gc, dc):
        inputs = [np.random.permutation(x)]
        op = core.CreateOperator("ATen", ["S"], ["Z", "I"], operator="sort")

        def ref(X):
            return [np.sort(X), np.argsort(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_sum(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"], operator="sum")

        def ref(X):
            return [np.sum(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(**hu.gcs)
    def test_ones(self, gc, dc):
        op = core.CreateOperator("ATen", [], ["Z"],
                                 operator="ones", type="float",
                                 size=[2, 4])

        def ref():
            return [np.ones([2, 4])]

        self.assertReferenceChecks(gc, op, [], ref)

    @given(**hu.gcs)
    def test_index_put(self, gc, dc):
        op = core.CreateOperator("ATen", ['self', 'indices', 'values'],
                                 ["Z"], operator="index_put")

        def ref(self, indices, values):
            self[indices] = values
            return (self, )

        tensor = np.random.randn(3, 3).astype(np.float32)
        mask = np.array([[True, True, True],
                         [True, False, False],
                         [True, True, False]])
        values = np.random.randn(6).astype(np.float32)

        self.assertReferenceChecks(gc, op, [tensor, mask, values], ref)
class TestAdagrad(serial.SerializedTestCase):
    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           weight_decay=st.sampled_from([0.0, 0.1]),
           **hu.gcs)
    @settings(deadline=10000)
    def test_adagrad(self, inputs, lr, epsilon, weight_decay, gc, dc):
        param, momentum, grad = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)
        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum"],
            epsilon=epsilon,
            weight_decay=weight_decay,
            device_option=gc,
        )
        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(ref_adagrad, epsilon=epsilon,
                              weight_decay=weight_decay),
        )

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           weight_decay=st.sampled_from([0.0, 0.1]),
           **hu.gcs_cpu_only)
    @settings(deadline=10000)
    def test_adagrad_output_effective_lr(self, inputs, lr, epsilon,
                                         weight_decay, gc, dc):
        param, momentum, grad = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)
        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum", "effective_lr"],
            epsilon=epsilon,
            weight_decay=weight_decay,
            device_option=gc,
        )
        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(
                ref_adagrad,
                epsilon=epsilon,
                output_effective_lr=True,
                weight_decay=weight_decay,
            ),
        )

    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    @settings(deadline=10000)
    def test_adagrad_output_effective_lr_and_update(self, inputs, lr,
                                                    epsilon, gc, dc):
        param, momentum, grad = inputs
        momentum = np.abs(momentum)
        lr = np.array([lr], dtype=np.float32)
        op = core.CreateOperator(
            "Adagrad",
            ["param", "momentum", "grad", "lr"],
            ["param", "momentum", "effective_lr", "update"],
            epsilon=epsilon,
            device_option=gc,
        )
        self.assertReferenceChecks(
            gc, op, [param, momentum, grad, lr],
            functools.partial(ref_adagrad, epsilon=epsilon,
                              output_effective_lr_and_update=True),
        )

    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much],
              deadline=10000)
    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           weight_decay=st.sampled_from([0.0, 0.1]),
           **hu.gcs)
    def test_sparse_adagrad(self, inputs, lr, epsilon, weight_decay, gc, dc):
        adagrad_sparse_test_helper(
            self, inputs, lr, epsilon, None, ref_adagrad, gc, dc,
            weight_decay=weight_decay,
        )

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    @settings(deadline=10000)
    def test_sparse_adagrad_empty(self, inputs, lr, epsilon, gc, dc):
        param, momentum = inputs
        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)

        ref_using_fp16_values = [False]
        if gc == hu.gpu_do:
            ref_using_fp16_values.append(True)

        for ref_using_fp16 in ref_using_fp16_values:
            if ref_using_fp16:
                print("test_sparse_adagrad_empty with half precision "
                      "embedding")
                momentum_i = momentum.astype(np.float16)
                param_i = param.astype(np.float16)
            else:
                print("test_sparse_adagrad_empty with full precision "
                      "embedding")
                momentum_i = momentum.astype(np.float32)
                param_i = param.astype(np.float32)

            adagrad_sparse_test_helper(
                self, [param_i, momentum_i, grad], lr, epsilon, None,
                ref_adagrad, gc, dc,
            )

    # Suppress filter_too_much health check.
    # Likely caused by `assume` call falling through too often.
    @settings(suppress_health_check=[HealthCheck.filter_too_much],
              deadline=10000)
    @given(inputs=hu.tensors(n=3),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           weight_decay=st.sampled_from([0.0, 0.1]),
           **hu.gcs)
    def test_row_wise_sparse_adagrad(self, inputs, lr, epsilon, weight_decay,
                                     gc, dc):
        adagrad_sparse_test_helper(
            self, inputs, lr, epsilon, None,
            functools.partial(ref_adagrad, row_wise=True),
            gc, dc, row_wise=True, weight_decay=weight_decay,
        )

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs)
    @settings(deadline=None)
    def test_row_wise_sparse_adagrad_empty(self, inputs, lr, epsilon, gc, dc):
        param, momentum = inputs
        grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32)
        adagrad_sparse_test_helper(
            self, [param, momentum, grad], lr, epsilon, None, ref_adagrad,
            gc, dc, row_wise=True,
        )
class TestATen(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2), **hu.gcs)
    def test_add(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["X", "Y"], ["Z"], operator="add")

        def ref(X, Y):
            return [X + Y]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=2, dtype=np.float16), **hu.gcs_gpu_only)
    def test_add_half(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["X", "Y"], ["Z"], operator="add")

        def ref(X, Y):
            return [X + Y]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_pow(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"],
                                 operator="pow", exponent=2.0)

        def ref(X):
            return [np.square(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
    def test_sort(self, x, gc, dc):
        inputs = [np.random.permutation(x)]
        op = core.CreateOperator("ATen", ["S"], ["Z", "I"], operator="sort")

        def ref(X):
            return [np.sort(X), np.argsort(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(inputs=hu.tensors(n=1), **hu.gcs)
    def test_sum(self, inputs, gc, dc):
        op = core.CreateOperator("ATen", ["S"], ["Z"], operator="sum")

        def ref(X):
            return [np.sum(X)]

        self.assertReferenceChecks(gc, op, inputs, ref)

    @given(**hu.gcs)
    def test_index_uint8(self, gc, dc):
        # Indexing with uint8 is deprecated, but we need to provide backward
        # compatibility for some old models exported through ONNX
        op = core.CreateOperator("ATen", ['self', 'mask'], ["Z"],
                                 operator="index")

        def ref(self, mask):
            return (self[mask.astype(bool)], )

        tensor = np.random.randn(2, 3, 4).astype(np.float32)
        mask = np.array([[1, 0, 0], [1, 1, 0]]).astype(np.uint8)

        self.assertReferenceChecks(gc, op, [tensor, mask], ref)

    @given(**hu.gcs)
    def test_index_put(self, gc, dc):
        op = core.CreateOperator("ATen", ['self', 'indices', 'values'],
                                 ["Z"], operator="index_put")

        def ref(self, indices, values):
            self[indices] = values
            return (self, )

        tensor = np.random.randn(3, 3).astype(np.float32)
        mask = np.array([[True, True, True],
                         [True, False, False],
                         [True, True, False]])
        values = np.random.randn(6).astype(np.float32)

        self.assertReferenceChecks(gc, op, [tensor, mask, values], ref)

    @given(**hu.gcs)
    def test_unique(self, gc, dc):
        op = core.CreateOperator(
            "ATen",
            ['self'],
            ["output"],
            sorted=True,
            return_inverse=True,
            # return_counts=False,
            operator="_unique")

        def ref(self):
            index, _ = np.unique(self,
                                 return_index=False,
                                 return_inverse=True,
                                 return_counts=False)
            return (index, )

        tensor = np.array([1, 2, 6, 4, 2, 3, 2])
        print(ref(tensor))
        self.assertReferenceChecks(gc, op, [tensor], ref)
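# The `_unique` reference above keeps only the first element of the tuple
# returned by np.unique. For context, a quick look at what
# return_inverse=True actually produces:
import numpy as np

vals, inverse = np.unique([1, 2, 6, 4, 2, 3, 2], return_inverse=True)
print(vals)     # [1 2 3 4 6]      sorted unique values
print(inverse)  # [0 1 4 3 1 2 1]  index of each input element in `vals`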
class PythonOpTest(hu.HypothesisTestCase):
    @given(x=hu.tensor())
    def test_feed(self, x):
        def f(inputs, _):
            self.assertEqual(x.shape, inputs[0].shape)
            self.assertEqual(type(inputs[0].shape), tuple)
            self.assertEqual(type(inputs[0].data), np.ndarray)
            np.testing.assert_almost_equal(x, inputs[0].data)

        op = CreatePythonOperator(f, ["x"], [])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)

    @given(x=hu.tensor())
    def test_feed_with_helper_function(self, x):
        def f(inputs, _):
            self.assertEqual(x.shape, inputs[0].shape)
            self.assertEqual(type(inputs[0].shape), tuple)
            self.assertEqual(type(inputs[0].data), np.ndarray)
            np.testing.assert_almost_equal(x, inputs[0].data)

        net = core.Net("test")
        net.Python(f)(["x"], [])
        workspace.FeedBlob("x", x)
        workspace.RunNetOnce(net)

    @given(x=hu.tensor())
    def test_feed_with_gc(self, x):
        def f(inputs, _):
            self.assertEqual(x.shape, inputs[0].shape)
            np.testing.assert_almost_equal(x, inputs[0].data)

        op = CreatePythonOperator(f, ["x"], [])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)
        del f
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)

    @given(x=hu.tensor())
    def test_reshape(self, x):
        def f(inputs, outputs):
            outputs[0].reshape(inputs[0].shape)
            self.assertEqual(x.shape, inputs[0].shape)
            self.assertEqual(x.shape, outputs[0].shape)
            outputs[0].data[...] = inputs[0].data

        op = CreatePythonOperator(f, ["x"], ["y"])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)
        y = workspace.FetchBlob("y")
        np.testing.assert_almost_equal(x, y)

    @given(x=hu.tensor())
    def test_caught_exception_doesnt_terminate(self, x):
        def f(inputs, outputs):
            try:
                raise Exception("Exception in handler")
            except Exception:
                pass

        op = CreatePythonOperator(f, ["x"], ["y"])
        workspace.FeedBlob("x", x)
        workspace.RunOperatorOnce(op)

    @given(x=hu.tensor(),
           n=st.integers(min_value=1, max_value=20),
           w=st.integers(min_value=1, max_value=20))
    def test_multithreaded_evaluation(self, x, n, w):
        def f(inputs, outputs):
            outputs[0].reshape(inputs[0].shape)
            outputs[0].data[...] = inputs[0].data

        ops = [CreatePythonOperator(f, ["x"], [str(i)]) for i in range(n)]
        net = core.Net("net")
        net.Proto().op.extend(ops)
        net.Proto().type = "dag"
        net.Proto().num_workers = w
        iters = 100
        plan = core.Plan("plan")
        plan.AddStep(core.ExecutionStep("test-step", net, iters))
        workspace.FeedBlob("x", x)
        workspace.RunPlan(plan.Proto().SerializeToString())
        for i in range(n):
            y = workspace.FetchBlob(str(i))
            np.testing.assert_almost_equal(x, y)

    @given(x=hu.tensor(), in_place=st.booleans())
    def test_gradient(self, x, in_place):
        def f(inputs, outputs):
            outputs[0].reshape(inputs[0].shape)
            outputs[0].data[...] = inputs[0].data * 2

        def grad_f(inputs, outputs):
            # Ordering is [inputs, outputs, grad_outputs]
            grad_output = inputs[2]
            grad_input = outputs[0]
            grad_input.reshape(grad_output.shape)
            grad_input.data[...] = grad_output.data * 2

        op = CreatePythonOperator(f, ["x"], ["x" if in_place else "y"],
                                  grad_f=grad_f)
        self.assertGradientChecks(hu.cpu_do, op, [x], 0, [0])

    @given(inputs=hu.tensors(n=2))
    def test_gradient_multiple(self, inputs):
        (x1, x2) = inputs

        def f(inputs, outputs):
            for idx in [0, 1]:
                self.assertEqual(type(inputs[idx].shape), tuple)
                outputs[idx].reshape(inputs[idx].shape)
                outputs[idx].data[...] = inputs[idx].data * 2

        def grad_f(inputs, outputs):
            # Ordering is [inputs, outputs, grad_outputs]
            self.assertEqual(len(inputs), 6)
            self.assertEqual(len(outputs), 2)
            for (grad_output_idx, grad_input_idx) in [(4, 0), (5, 1)]:
                grad_output = inputs[grad_output_idx]
                grad_input = outputs[grad_input_idx]
                grad_input.reshape(grad_output.shape)
                grad_input.data[...] = grad_output.data * 2

        op = CreatePythonOperator(f, ["x1", "x2"], ["y1", "y2"],
                                  grad_f=grad_f)
        for idx in [0, 1]:
            self.assertGradientChecks(hu.cpu_do, op, [x1, x2], idx, [0, 1])
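# In grad_f the runtime passes [inputs, outputs, grad_outputs] concatenated:
# with two inputs and two outputs, positions 0-1 are x1/x2, 2-3 are y1/y2,
# and 4-5 are the gradients of y1/y2, which is why the index pairs above are
# (4, 0) and (5, 1). A tiny sketch of computing those positions in general:
def grad_blob_layout(num_inputs, num_outputs):
    grad_output_pos = [num_inputs + num_outputs + i
                       for i in range(num_outputs)]
    return list(zip(grad_output_pos, range(num_outputs)))


print(grad_blob_layout(2, 2))  # -> [(4, 0), (5, 1)]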
class TestMomentumSGD(hu.HypothesisTestCase):
    @given(n=st.integers(4, 8), **hu.gcs)
    def test_momentum_sgd(self, n, gc, dc):
        param = np.random.rand(n).astype(np.float32)
        grad = np.random.rand(n).astype(np.float32)
        lr = np.random.rand(1).astype(np.float32)
        param_momentum = np.random.rand(n).astype(np.float32)
        momentum = 0.9

        def momentum_sgd(grad, param_momentum, lr, param=None):
            adjgrad = lr * grad + momentum * param_momentum
            if param is None:
                return [adjgrad, adjgrad]
            else:
                paramup = param - adjgrad
                return [adjgrad, adjgrad, paramup]

        op = core.CreateOperator(
            "MomentumSGDUpdate",
            ["grad", "param_momentum", "lr", "param"],
            ["grad", "param_momentum", "param"],
            momentum=momentum,
            nesterov=0,
        )
        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=[grad, param_momentum, lr, param],
            reference=momentum_sgd)

        op_noparam = core.CreateOperator(
            "MomentumSGD",
            ["grad", "param_momentum", "lr"],
            ["grad", "param_momentum"],
            momentum=momentum,
            nesterov=0,
        )
        self.assertReferenceChecks(
            device_option=gc,
            op=op_noparam,
            inputs=[grad, param_momentum, lr],
            reference=momentum_sgd)

    @given(inputs=hu.tensors(n=3),
           momentum=st.floats(min_value=0.1, max_value=0.9),
           nesterov=st.booleans(),
           lr=st.floats(min_value=0.1, max_value=0.9),
           data_strategy=st.data(),
           **hu.gcs)
    def test_sparse_momentum_sgd(self, inputs, momentum, nesterov, lr,
                                 data_strategy, gc, dc):
        w, grad, m = inputs

        # Create an indexing array containing values which index into grad
        indices = data_strategy.draw(
            hu.tensor(dtype=np.int64,
                      elements=st.sampled_from(np.arange(grad.shape[0]))),
        )
        hypothesis.note('indices.shape: %s' % str(indices.shape))

        # For now, the indices must be unique
        hypothesis.assume(
            np.array_equal(np.unique(indices.flatten()),
                           np.sort(indices.flatten())))

        # Sparsify grad
        grad = grad[indices]

        # Make momentum >= 0
        m = np.abs(m)

        # Convert lr to a numpy array
        lr = np.asarray([lr], dtype=np.float32)

        op = core.CreateOperator(
            "SparseMomentumSGDUpdate",
            ["grad", "m", "lr", "param", "indices"],
            ["adjusted_grad", "m", "param"],
            momentum=momentum,
            nesterov=int(nesterov),
            device_option=gc)

        # Reference
        def momentum_sgd(grad, m, lr):
            lr = lr[0]
            if not nesterov:
                adjusted_gradient = lr * grad + momentum * m
                return (adjusted_gradient, adjusted_gradient)
            else:
                m_new = momentum * m + lr * grad
                return ((1 + momentum) * m_new - momentum * m, m_new)

        def sparse(grad, m, lr, param, i):
            grad_new, m_new = momentum_sgd(grad, m[i], lr)
            m[i] = m_new
            param[i] -= grad_new
            return (grad_new, m, param)

        self.assertReferenceChecks(gc, op, [grad, m, lr, w, indices], sparse)
class TestMaskedAdagrad(hu.HypothesisTestCase): @given( inputs=hu.tensors(n=3), lr=st.floats( min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False ), epsilon=st.floats( min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False ), ) def test_masked_adagrad(self, inputs, lr, epsilon): param, moment, grad = inputs moment = np.abs(moment) lr = np.array([lr], dtype=np.float32) mask = np.random.randint(2, size=param.shape).astype(np.float32) workspace.FeedBlob("param", param) workspace.FeedBlob("moment", moment) workspace.FeedBlob("grad", grad) workspace.FeedBlob("lr", lr) workspace.FeedBlob("mask", mask) ref_op = core.CreateOperator( "Adagrad", ["param", "moment", "grad", "lr"], ["out_param_ref", "out_moment_ref"], epsilon=epsilon, ) op = core.CreateOperator( "MaskedAdagrad", ["param", "moment", "grad", "lr", "mask"], ["out_param", "out_moment"], epsilon=epsilon, ) workspace.RunOperatorOnce(ref_op) workspace.RunOperatorOnce(op) out_param_ref = workspace.FetchBlob("out_param_ref") out_moment_ref = workspace.FetchBlob("out_moment_ref") out_param_ref = np.multiply(mask, out_param_ref) out_moment_ref = np.multiply(mask, out_moment_ref) out_param = workspace.FetchBlob("out_param") out_moment = workspace.FetchBlob("out_moment") np.testing.assert_array_equal(out_param_ref, out_param) np.testing.assert_array_equal(out_moment_ref, out_moment) @given( inputs=hu.tensors(n=3), lr=st.floats( min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False ), epsilon=st.floats( min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False ), has_mask_input=st.booleans(), has_mask_out=st.booleans(), block_size=st.integers(1, 4), row_wise=st.booleans(), ) def test_masked_sparse_adagrad( self, inputs, lr, epsilon, has_mask_input, has_mask_out, block_size, row_wise, ): param, moment, grad = inputs num_rows = param.shape[0] if row_wise: moment = np.resize(moment, num_rows) moment = np.abs(moment) lr = np.array([lr], dtype=np.float32) param_ref = np.copy(param) moment_ref = np.copy(moment) indices = np.random.randint(num_rows, size=grad.shape[0]) workspace.ResetWorkspace() row_size = int(np.prod(param.shape[1:])) num_blocks_per_row = (row_size + block_size - 1) // block_size bitmask_bytes_per_row = (num_blocks_per_row + 7) // 8 if has_mask_input: # Generate a random bit pattern mask = np.random.randint( np.iinfo(np.uint8).min, np.iinfo(np.uint8).max + 1, size=[num_rows, bitmask_bytes_per_row], dtype=np.uint8, ) workspace.FeedBlob("mask", mask) else: delays = np.array([1, 2, 3]).astype(np.int32) # Make sure to use numbers that can be exactly represented in # float32 to avoid potentially different ways of handling floats # between Python and C++. 
prune_ratios = np.array([0.5, 0.75, 0.875]).astype(np.float32) # Feed empty mask workspace.FeedBlob("mask", np.array([]).astype(np.uint8)) workspace.FeedBlob("param_ref", param_ref) workspace.FeedBlob("moment_ref", moment_ref) workspace.FeedBlob("param", param) workspace.FeedBlob("moment", moment) workspace.FeedBlob("indices", indices) workspace.FeedBlob("grad", grad) workspace.FeedBlob("lr", lr) net = core.Net("test_net") prefix = "RowWise" if row_wise else "" ref_op = core.CreateOperator( prefix + "SparseAdagrad", ["param_ref", "moment_ref", "indices", "grad", "lr"], ["param_ref", "moment_ref"], epsilon=epsilon, ) inputs = ["param", "moment", "indices", "grad", "lr", "mask", "mask_changed"] outputs = ["param", "moment"] if not has_mask_input: inputs += ["iter"] if has_mask_out: outputs += ["mask_out"] op = core.CreateOperator( "Masked" + prefix + "SparseAdagrad", inputs, outputs, epsilon=epsilon, block_size=block_size, delays=[] if has_mask_input else delays, prune_ratios=[] if has_mask_input else prune_ratios, ) net.Proto().op.extend([ref_op, op]) workspace.FeedBlob("mask_changed", np.array([0]).astype(np.bool)) workspace.FeedBlob("iter", np.array([0]).astype(np.int64)) workspace.CreateNet(net) if has_mask_input: # Test1: if mask_changed == false, only the rows we're updating are masked workspace.RunNet(net) param_ref = workspace.FetchBlob("param_ref") moment_ref = workspace.FetchBlob("moment_ref") param = workspace.FetchBlob("param") moment = workspace.FetchBlob("moment") param_ref = param_ref.reshape(num_rows, -1) if not row_wise: moment_ref = moment_ref.reshape(num_rows, -1) for i in range(grad.shape[0]): row = indices[i] for j in range(row_size): j_block = j // block_size byte = j_block // 8 bit = j_block % 8 m = mask[row][byte] & (1 << bit) if not m: param_ref[row, j] = 0 if not row_wise: moment_ref[row, j] = 0 np.testing.assert_array_equal(param_ref, param.reshape(num_rows, -1)) np.testing.assert_array_equal( moment_ref, moment if row_wise else moment.reshape(num_rows, -1) ) # Test2: mask_changed == true workspace.FeedBlob("param_ref", param_ref) workspace.FeedBlob("moment_ref", moment_ref) workspace.FeedBlob("mask_changed", np.array([1]).astype(np.bool)) workspace.RunNet(net) param_ref = workspace.FetchBlob("param_ref") moment_ref = workspace.FetchBlob("moment_ref") for i in range(num_rows): for j in range(row_size): j_block = j // block_size byte = j_block // 8 bit = j_block % 8 m = mask[i][byte] & (1 << bit) if not m: param_ref[i, j] = 0 if not row_wise: moment_ref[i, j] = 0 param = workspace.FetchBlob("param") moment = workspace.FetchBlob("moment") np.testing.assert_array_equal(param_ref, param.reshape(num_rows, -1)) np.testing.assert_array_equal( moment_ref, moment if row_wise else moment.reshape(num_rows, -1) ) else: # Test1: in the first iteration, there shouldn't be any masking workspace.RunNet(net) param_ref = workspace.FetchBlob("param_ref") moment_ref = workspace.FetchBlob("moment_ref") param = workspace.FetchBlob("param") moment = workspace.FetchBlob("moment") np.testing.assert_array_equal(param_ref, param) np.testing.assert_array_equal(moment_ref, moment) # Test2: for each pruning delay, masks should be updated accordingly for i in range(len(delays)): mask = _get_mask(param_ref, block_size, prune_ratios[i]) workspace.FeedBlob("iter", np.array([delays[i]]).astype(np.int64)) workspace.RunNet(net) param_ref = workspace.FetchBlob("param_ref") moment_ref = workspace.FetchBlob("moment_ref") param = workspace.FetchBlob("param") moment = workspace.FetchBlob("moment") 
param_ref = mask * param_ref.reshape(num_rows, row_size) if not row_wise: moment_ref = mask * moment_ref.reshape(num_rows, row_size) np.testing.assert_array_equal(param_ref.flatten(), param.flatten()) np.testing.assert_array_equal(moment_ref.flatten(), moment.flatten()) # Test3: after finishing delay, mask should be fixed workspace.FeedBlob("iter", np.array([delays[-1] + 1]).astype(np.int64)) workspace.RunNet(net) param_ref = workspace.FetchBlob("param_ref") moment_ref = workspace.FetchBlob("moment_ref") param = workspace.FetchBlob("param") moment = workspace.FetchBlob("moment") param_ref = mask * param_ref.reshape(num_rows, row_size) if not row_wise: moment_ref = mask * moment_ref.reshape(num_rows, row_size) np.testing.assert_array_equal(param_ref.flatten(), param.flatten()) np.testing.assert_array_equal(moment_ref.flatten(), moment.flatten())
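# The bitmask layout exercised above packs one bit per block of `block_size`
# consecutive elements, eight blocks per mask byte. A minimal standalone
# sketch of that decoding (an illustrative helper, not part of the operators
# under test):
def _example_decode_block_bitmask(mask_row, row_size, block_size):
    # Expand each mask bit to cover its whole block, mirroring the
    # byte/bit arithmetic in the assertions above.
    out = np.zeros(row_size, dtype=np.float32)
    for j in range(row_size):
        j_block = j // block_size
        byte, bit = j_block // 8, j_block % 8
        if mask_row[byte] & (1 << bit):
            out[j] = 1.0
    # e.g. a mask byte 0b101 with block_size=2 and row_size=6 keeps
    # elements 0-1 and 4-5: [1, 1, 0, 0, 1, 1]
    return out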
class DistanceTest(serial.SerializedTestCase): @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_cosine_similarity(self, n, dim, gc, dc): X = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) Y = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) self.ws.create_blob("X").feed(X) self.ws.create_blob("Y").feed(Y) kEps = 1e-12 cos_op = core.CreateOperator("CosineSimilarity", ["X", "Y"], ["cos"]) self.ws.run(cos_op) cos = np.divide(np.multiply(X, Y).sum(axis=1), np.multiply(np.linalg.norm(X, axis=1) + kEps, np.linalg.norm(Y, axis=1) + kEps)) np.testing.assert_allclose(self.ws.blobs[("cos")].fetch(), cos, rtol=1e-4, atol=1e-4) self.assertGradientChecks(gc, cos_op, [X, Y], 0, [0], stepsize=1e-2, threshold=1e-2) self.assertGradientChecks(gc, cos_op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) @serial.given(inputs=hu.tensors(n=2, min_dim=1, max_dim=2, dtype=np.float32), **hu.gcs) def test_dot_product(self, inputs, gc, dc): X, Y = inputs op = core.CreateOperator( 'DotProduct', ['X', 'Y'], ['DOT'], ) def dot_ref(X, Y): return ([np.dot(x, y) for x, y in zip(X, Y)],) # Check against numpy dot reference self.assertReferenceChecks(gc, op, [X, Y], dot_ref) # Check over multiple devices self.assertDeviceChecks(dc, op, [X, Y], [0]) # Gradient check wrt X self.assertGradientChecks(gc, op, [X, Y], 0, [0]) # Gradient check wrt Y self.assertGradientChecks(gc, op, [X, Y], 1, [0]) @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L1_distance(self, n, dim, gc, dc): X = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) Y = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) # avoid kinks by moving away from 0 X += 0.02 * np.sign(X - Y) X[(X - Y) == 0.0] += 0.02 self.ws.create_blob("X").feed(X) self.ws.create_blob("Y").feed(Y) op = core.CreateOperator( 'L1Distance', ['X', 'Y'], ['l1_dist'], ) self.ws.run(op) np.testing.assert_allclose(self.ws.blobs[("l1_dist")].fetch(), [np.linalg.norm(x - y, ord=1) for x, y in zip(X, Y)], rtol=1e-4, atol=1e-4) self.assertDeviceChecks(dc, op, [X, Y], [0]) # Gradient check wrt X self.assertGradientChecks(gc, op, [X, Y], 0, [0], stepsize=1e-2, threshold=1e-2) # Gradient check wrt Y self.assertGradientChecks(gc, op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2) @serial.given(n=st.integers(1, 3), dim=st.integers(4, 16), **hu.gcs) def test_L2_distance(self, n, dim, gc, dc): X = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) Y = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) self.ws.create_blob("X").feed(X) self.ws.create_blob("Y").feed(Y) l2_op = core.CreateOperator("SquaredL2Distance", ["X", "Y"], ["l2_dist"]) self.ws.run(l2_op) np.testing.assert_allclose(self.ws.blobs[("l2_dist")].fetch(), np.square(X - Y).sum(axis=1) * 0.5, rtol=1e-4, atol=1e-4) self.assertGradientChecks(gc, l2_op, [X, Y], 0, [0], stepsize=1e-2, threshold=1e-2) self.assertGradientChecks(gc, l2_op, [X, Y], 1, [0], stepsize=1e-2, threshold=1e-2)
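# Quick numeric sanity check (plain NumPy, no workspace needed) of the
# conventions the assertions above rely on: SquaredL2Distance is
# 0.5 * ||x - y||^2 per row, and CosineSimilarity is the row-wise normalized
# dot product. Illustrative only:
def _example_distance_conventions():
    x = np.array([[1.0, 2.0]], dtype=np.float32)
    y = np.array([[2.0, 1.0]], dtype=np.float32)
    l2 = 0.5 * np.square(x - y).sum(axis=1)                      # [1.0]
    cos = np.multiply(x, y).sum(axis=1) / (
        np.linalg.norm(x, axis=1) * np.linalg.norm(y, axis=1))   # [0.8]
    return l2, cos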
class TestAdagrad(hu.HypothesisTestCase): @staticmethod def ref_adagrad(param_in, mom_in, grad, lr, epsilon): mom_out = mom_in + np.square(grad) grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon) param_out = param_in + grad_adj return (param_out, mom_out) @given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs) def test_adagrad(self, inputs, lr, epsilon, gc, dc): param, momentum, grad = inputs lr = np.array([lr], dtype=np.float32) op = core.CreateOperator( "Adagrad", ["param", "momentum", "grad", "lr"], ["param", "momentum"], epsilon=epsilon, device_option=gc, ) self.assertReferenceChecks( gc, op, [param, momentum, grad, lr], functools.partial(self.ref_adagrad, epsilon=epsilon)) @given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs) def test_sparse_adagrad(self, inputs, lr, epsilon, gc, dc): param, momentum, grad = inputs indices = np.arange(grad.shape[0]) indices = indices[indices % 2 == 0] grad = grad[indices] momentum = np.abs(momentum) lr = np.array([lr], dtype=np.float32) op = core.CreateOperator( "SparseAdagrad", ["param", "momentum", "indices", "grad", "lr"], ["param", "momentum"], epsilon=epsilon, device_option=gc) def ref_sparse(param, momentum, indices, grad, lr): param_out = np.copy(param) momentum_out = np.copy(momentum) for i, index in enumerate(indices): param_out[index], momentum_out[index] = self.ref_adagrad( param[index], momentum[index], grad[i], lr, epsilon) return (param_out, momentum_out) self.assertReferenceChecks(gc, op, [param, momentum, indices, grad, lr], ref_sparse)
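# A concrete step of ref_adagrad above, with values chosen so every quantity
# is exact in float32: moment 7 + grad^2 9 -> 16, sqrt -> 4, and with lr=0.5,
# epsilon=1.0 the parameter moves by 0.5 * 3 / 5 = 0.3. Illustrative only:
def _example_adagrad_step():
    param, moment, grad = 1.0, 7.0, 3.0
    lr, epsilon = 0.5, 1.0
    moment_out = moment + grad * grad                                # 16.0
    param_out = param + lr * grad / (np.sqrt(moment_out) + epsilon)  # 1.3
    return param_out, moment_out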
class TestLayerNormOp(serial.SerializedTestCase): @serial.given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_grad_op(self, X, gc, dc): X = X[0] if len(X.shape) == 1: X = np.expand_dims(X, axis=0) axis = np.random.randint(0, len(X.shape)) epsilon = 1e-4 op = core.CreateOperator( "LayerNormGradient", ["gout", "out", "mean", "stdev", "in"], ["gin"], axis=axis, epsilon=epsilon, ) norm, mean, stdev = _layer_norm_ref(axis, epsilon, X) gout = norm self.assertReferenceChecks(device_option=gc, op=op, inputs=[gout, norm, mean, stdev, X], reference=partial(_layer_norm_grad_ref, axis)) self.assertDeviceChecks( device_options=dc, op=op, inputs=[gout, norm, mean, stdev, X], outputs_to_check=[0], ) @given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_op(self, X, gc, dc): X = X[0] if len(X.shape) == 1: X = np.expand_dims(X, axis=0) axis = np.random.randint(0, len(X.shape)) epsilon = 1e-4 op = core.CreateOperator( "LayerNorm", ["input"], ["output", "mean", "stdev"], axis=axis, epsilon=epsilon, ) self.assertReferenceChecks(device_option=gc, op=op, inputs=[X], reference=partial(_layer_norm_ref, axis, epsilon)) self.assertDeviceChecks( device_options=dc, op=op, inputs=[X], outputs_to_check=[0, 1, 2], ) @given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_op_pytorch(self, X, gc, dc): X = X[0] if len(X.shape) == 1: X = np.expand_dims(X, axis=0) axis = np.random.randint(0, len(X.shape)) epsilon = 1e-4 expected_norm, expected_mean, expected_stdev = _layer_norm_ref( axis, epsilon, X) actual_norm, actual_mean, actual_stdev = torch.ops.caffe2.layer_norm_dont_use_this_op_yet( torch.tensor(X), axis, epsilon) torch.testing.assert_allclose(expected_norm, actual_norm) torch.testing.assert_allclose(expected_mean, actual_mean) torch.testing.assert_allclose(expected_stdev, actual_stdev) @given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_brew_wrapper(self, X, gc, dc): X = X[0] if len(X.shape) == 1: X = np.expand_dims(X, axis=0) axis = np.random.randint(0, len(X.shape)) scale_dim = [1] * np.ndim(X) scale_dim[axis] = X.shape[axis] self.ws.create_blob('input').feed(X) model = ModelHelper(name='test_layer_norm_brew_wrapper') brew.layer_norm( model, 'input', 'output', dim_in=X.shape[axis], axis=axis, epsilon=1e-4, ) self.ws.create_net(model.param_init_net).run() self.ws.create_net(model.net).run()
class TestLayerNormOp(hu.HypothesisTestCase): @given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_grad_op(self, X, gc, dc): X = X[0] if len(X.shape) == 1: X = np.expand_dims(X, axis=0) axis = np.random.randint(0, len(X.shape)) epsilon = 1e-4 op = core.CreateOperator( "LayerNormGradient", ["gout", "out", "mean", "stdev", "in"], ["gin"], axis=axis, epsilon=epsilon, ) def layer_norm_ref(X): left = int(np.prod(X.shape[:axis])) reshaped = np.reshape(X, [left, -1]) mean = np.mean(reshaped, axis=1).reshape([left, 1]) stdev = np.sqrt( np.mean(np.square(reshaped), axis=1).reshape([left, 1]) - np.power(mean, 2) + epsilon) norm = (reshaped - mean) / (stdev) norm = np.reshape(norm, X.shape) mean = np.reshape(mean, X.shape[:axis] + (1, )) stdev = np.reshape(stdev, X.shape[:axis] + (1, )) return [norm, mean, stdev] norm, mean, stdev = layer_norm_ref(X) gout = norm def layer_norm_grad_ref(gout_full, norm, mean_full, stdev_full, X_full): left = int(np.prod(X_full.shape[:axis])) right = int(np.prod(X_full.shape[axis:])) X = np.reshape(X_full, [left, right]) stdev = np.reshape(stdev_full, [left, 1]) mean = np.reshape(mean_full, [left, 1]) gout = np.reshape(gout_full, [left, right]) dstdev_end = (-1.0) / np.power(stdev, 2.0) \ * np.sum((X - mean) * gout, axis=1).reshape([left, 1]) dmean_end = np.sum(-1.0 / stdev * gout, axis=1).reshape([left, 1]) dx_end = 1.0 / stdev * gout # stdev block dmean_stdev = -1.0 * mean / stdev * dstdev_end dx_stdev = X / (right * stdev) * dstdev_end # mean block dmean = dmean_end + dmean_stdev dxmean = (1.0 / right) * dmean # final outputs dx = dx_end + dx_stdev + dxmean dx = dx.reshape(X_full.shape) return [dx] self.assertReferenceChecks(device_option=gc, op=op, inputs=[gout, norm, mean, stdev, X], reference=layer_norm_grad_ref) self.assertDeviceChecks( device_options=dc, op=op, inputs=[gout, norm, mean, stdev, X], outputs_to_check=[0], ) @given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_op(self, X, gc, dc): X = X[0] if len(X.shape) == 1: X = np.expand_dims(X, axis=0) axis = np.random.randint(0, len(X.shape)) epsilon = 1e-4 op = core.CreateOperator( "LayerNorm", ["input"], ["output", "mean", "stdev"], axis=axis, epsilon=epsilon, ) def layer_norm_ref(X): left = int(np.prod(X.shape[:axis])) reshaped = np.reshape(X, [left, -1]) mean = np.mean(reshaped, axis=1).reshape([left, 1]) stdev = np.sqrt( np.mean(np.power(reshaped, 2), axis=1).reshape([left, 1]) - np.power(mean, 2) + epsilon) norm = (reshaped - mean) / (stdev) norm = np.reshape(norm, X.shape) mean = np.reshape(mean, X.shape[:axis] + (1, )) stdev = np.reshape(stdev, X.shape[:axis] + (1, )) return [norm, mean, stdev] self.assertReferenceChecks(device_option=gc, op=op, inputs=[X], reference=layer_norm_ref) self.assertDeviceChecks( device_options=dc, op=op, inputs=[X], outputs_to_check=[0, 1, 2], ) @given(X=hu.tensors(n=1), **hu.gcs) def test_layer_norm_brew_wrapper(self, X, gc, dc): X = X[0] if len(X.shape) == 1: X = np.expand_dims(X, axis=0) axis = np.random.randint(0, len(X.shape)) scale_dim = [1] * np.ndim(X) scale_dim[axis] = X.shape[axis] self.ws.create_blob('input').feed(X) model = ModelHelper(name='test_layer_norm_brew_wrapper') brew.layer_norm( model, 'input', 'output', dim_in=X.shape[axis], axis=axis, epsilon=1e-4, ) self.ws.create_net(model.param_init_net).run() self.ws.create_net(model.net).run()
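# Standalone check of the layer-norm reference used above, on a tiny 2x2
# input with axis=1: each row ends up with mean 0 and (biased) stdev 1, up to
# epsilon. Illustrative only:
def _example_layer_norm():
    X = np.array([[1.0, 3.0], [2.0, 6.0]], dtype=np.float32)
    mean = X.mean(axis=1, keepdims=True)                  # [[2.], [4.]]
    stdev = np.sqrt(np.square(X).mean(axis=1, keepdims=True)
                    - np.square(mean) + 1e-4)             # ~[[1.], [2.]]
    return (X - mean) / stdev                             # ~[[-1, 1], [-1, 1]]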
class TestAdam(hu.HypothesisTestCase): @staticmethod def ref_adam(param, mom1, mom2, grad, LR, ITER, beta1, beta2, epsilon, output_grad=False): t = ITER + 1 corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \ (1 - np.power(beta1, t)) mom1_out = (beta1 * mom1) + (1 - beta1) * grad mom2_out = (beta2 * mom2) + (1 - beta2) * np.square(grad) grad_out = corrected_local_rate * mom1_out / \ (np.sqrt(mom2_out) + epsilon) param_out = param + LR * grad_out if output_grad: return param_out, mom1_out, mom2_out, grad_out else: return param_out, mom1_out, mom2_out @staticmethod def ref_smart_decay_adam(param, mom1, mom2, last_seen, grad, LR, ITER, beta1, beta2, epsilon): t = ITER + 1 k = int(np.array(t - last_seen).flatten()[0]) last_seen_out = t if beta1 == 0.0: mom1_out = grad mom2_out = (beta2**k * mom2) + (1 - beta2) * np.square(grad) grad_out = mom1_out / (np.sqrt(mom2_out) + epsilon) param_out = param + LR * grad_out return param_out, mom1_out, mom2_out, last_seen_out # Make up for lost minibatches. else: mom2_out = (beta2**k * mom2) + (1 - beta2) * np.square(grad) p_out = param m = mom1 # For catchup for i in range(k - 1): m *= beta1 update = m / (np.sqrt(mom2_out) + epsilon) p_out += LR * update # For the single step update mom1_out = m * beta1 + grad * (1 - beta1) grad_out = mom1_out / (np.sqrt(mom2_out) + epsilon) param_out = p_out + LR * grad_out return param_out, mom1_out, mom2_out, last_seen_out @staticmethod def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, beta1, beta2, epsilon, output_grad=False): t = ITER + 1 corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \ (1 - np.power(beta1, t)) mom1_out = (beta1 * mom1) + (1 - beta1) * grad mom2_out = (beta2 * mom2) + (1 - beta2) * np.mean(np.square(grad)) grad_out = corrected_local_rate * mom1_out / (np.sqrt(mom2_out) + epsilon) param_out = param + LR * grad_out if output_grad: return param_out, mom1_out, mom2_out, grad_out else: return param_out, mom1_out, mom2_out @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta1=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta2=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs) def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): param, mom1, mom2, grad = inputs mom2 = np.abs(mom2) ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) op = core.CreateOperator( "Adam", ["param", "mom1", "mom2", "grad", "lr", "iter"], ["output_param", "output_mom1", "output_mom2"], beta1=beta1, beta2=beta2, epsilon=epsilon) # Iter lives on the CPU input_device_options = {'iter': hu.cpu_do} self.assertReferenceChecks( gc, op, [param, mom1, mom2, grad, LR, ITER], functools.partial( self.ref_adam, beta1=beta1, beta2=beta2, epsilon=epsilon), input_device_options=input_device_options) @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta1=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta2=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs_cpu_only) def test_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, 
epsilon, gc, dc): param, mom1, mom2, grad = inputs mom2 = np.abs(mom2) ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) op = core.CreateOperator( "Adam", ["param", "mom1", "mom2", "grad", "lr", "iter"], ["output_param", "output_mom1", "output_mom2", "output_grad"], beta1=beta1, beta2=beta2, epsilon=epsilon) # Iter lives on the CPU input_device_options = {'iter': hu.cpu_do} self.assertReferenceChecks( gc, op, [param, mom1, mom2, grad, LR, ITER], functools.partial( self.ref_adam, beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), input_device_options=input_device_options) @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta1=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta2=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), data_strategy=st.data(), **hu.gcs) def test_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, data_strategy, gc, dc): param, mom1, mom2, grad = inputs mom2 = np.absolute(mom2) ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) # Create an indexing array containing values which index into grad indices = data_strategy.draw( hu.tensor( max_dim=1, min_value=1, max_value=grad.shape[0], dtype=np.int64, elements=st.sampled_from(np.arange(grad.shape[0])), ), ) # Verify that the generated indices are unique hypothesis.assume( np.array_equal( np.unique(indices.flatten()), np.sort(indices.flatten()))) # Sparsify grad grad = grad[indices] op = core.CreateOperator( "SparseAdam", ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], ["param", "mom1", "mom2"], beta1=beta1, beta2=beta2, epsilon=epsilon) def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER): param_out = np.copy(param) mom1_out = np.copy(mom1) mom2_out = np.copy(mom2) for i, index in enumerate(indices): param_out[index], mom1_out[index], mom2_out[index] = \ self.ref_adam(param[index], mom1[index], mom2[index], grad[i], LR, ITER, beta1, beta2, epsilon) return (param_out, mom1_out, mom2_out) # Iter lives on the CPU input_device_options = {'iter': hu.cpu_do} self.assertReferenceChecks( gc, op, [param, mom1, mom2, indices, grad, LR, ITER], ref_sparse, input_device_options=input_device_options) @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta1=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta2=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), data_strategy=st.data(), **hu.gcs) def test_smart_decay_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, data_strategy, gc, dc): param, mom1, mom2, grad = inputs mom2 = np.absolute(mom2) ITER = np.array([ITER], dtype=np.int64) # Here we will define the last_seen tensor as being randomly from 0 to ITER # (the value of t to be tested will be ITER+1) last_seen = np.random.randint(low=0, high=ITER + 1, size=param.shape, dtype=np.int64) LR = np.array([LR], dtype=np.float32) # Create an indexing array containing values which index into grad indices = data_strategy.draw( hu.tensor( max_dim=1, min_value=1, max_value=grad.shape[0], dtype=np.int64, 
elements=st.sampled_from(np.arange(grad.shape[0])), ), ) # Verify that the generated indices are unique hypothesis.assume( np.array_equal( np.unique(indices.flatten()), np.sort(indices.flatten()))) # Sparsify grad grad = grad[indices] op = core.CreateOperator( "SmartDecaySparseAdam", ["param", "mom1", "mom2", "last_seen", "indices", "grad", "lr", "iter"], ["param", "mom1", "mom2", "last_seen"], beta1=beta1, beta2=beta2, epsilon=epsilon) def ref_sparse(param, mom1, mom2, last_seen, indices, grad, LR, ITER): param_out = np.copy(param) mom1_out = np.copy(mom1) mom2_out = np.copy(mom2) last_seen_out = np.copy(last_seen) for i, index in enumerate(indices): param_out[index], mom1_out[index], mom2_out[index], last_seen_out[index] = \ self.ref_smart_decay_adam(param[index], mom1[index], mom2[index], last_seen[index], grad[i], LR, ITER, beta1, beta2, epsilon) return (param_out, mom1_out, mom2_out, last_seen_out) # Iter lives on the CPU input_device_options = {'iter': hu.cpu_do} self.assertReferenceChecks( gc, op, [param, mom1, mom2, last_seen, indices, grad, LR, ITER], ref_sparse, input_device_options=input_device_options) @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta1=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta2=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), data_strategy=st.data(), **hu.gcs) def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, data_strategy, gc, dc): param, mom1, mom2, grad = inputs mom2 = np.absolute(mom2) ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) # Create an indexing array containing values which index into grad indices = data_strategy.draw( hu.tensor( max_dim=1, min_value=1, max_value=grad.shape[0], dtype=np.int64, elements=st.sampled_from(np.arange(grad.shape[0])), ), ) # Verify that the generated indices are unique hypothesis.assume( np.array_equal( np.unique(indices.flatten()), np.sort(indices.flatten()))) # Sparsify grad grad = grad[indices] op = core.CreateOperator( "SparseAdam", ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], ["param", "mom1", "mom2", "output_grad"], beta1=beta1, beta2=beta2, epsilon=epsilon) def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, beta1, beta2, epsilon, output_grad): param_out = np.copy(param) mom1_out = np.copy(mom1) mom2_out = np.copy(mom2) grad_out = np.copy(grad) for i, index in enumerate(indices): param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ self.ref_adam(param[index], mom1[index], mom2[index], grad[i], LR, ITER, beta1, beta2, epsilon, output_grad) return (param_out, mom1_out, mom2_out, grad_out) # Iter lives on the CPU input_device_options = {'iter': hu.cpu_do} self.assertReferenceChecks( gc, op, [param, mom1, mom2, indices, grad, LR, ITER], functools.partial( ref_sparse_output_grad, beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), input_device_options=input_device_options) @given(inputs=hu.tensors(n=3), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta1=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta2=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), 
epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), data_strategy=st.data(), **hu.gcs) def test_row_wise_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, data_strategy, gc, dc): param, mom1, grad = inputs ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) # Create a 1D row-wise average 2nd moment tensor. mom2 = data_strategy.draw( hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0], elements=hu.elements_of_type(dtype=np.float32)) ) mom2 = np.absolute(mom2) # Create an indexing array containing values which index into grad indices = data_strategy.draw( hu.tensor( max_dim=1, min_value=1, max_value=grad.shape[0], dtype=np.int64, elements=st.sampled_from(np.arange(grad.shape[0])), ), ) # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment # tensor that is strictly 1-dimensional and equal in length to the # first dimension of the parameters, so indices must also be # 1-dimensional. indices = indices.flatten() hypothesis.note('indices.shape: %s' % str(indices.shape)) # Verify that the generated indices are unique hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices))) # Sparsify grad grad = grad[indices] op = core.CreateOperator( "RowWiseSparseAdam", ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], ["param", "mom1", "mom2"], beta1=beta1, beta2=beta2, epsilon=epsilon) def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER): param_out = np.copy(param) mom1_out = np.copy(mom1) mom2_out = np.copy(mom2) for i, index in enumerate(indices): param_out[index], mom1_out[index], mom2_out[index] = \ self.ref_row_wise_adam(param[index], mom1[index], mom2[index], grad[i], LR, ITER, beta1, beta2, epsilon) return (param_out, mom1_out, mom2_out) # Iter lives on the CPU input_device_options = {'iter': hu.cpu_do} self.assertDeviceChecks( dc, op, [param, mom1, mom2, indices, grad, LR, ITER], [0, 1, 2], input_device_options=input_device_options) self.assertReferenceChecks( gc, op, [param, mom1, mom2, indices, grad, LR, ITER], ref_row_wise_sparse, input_device_options=input_device_options) @given(inputs=hu.tensors(n=3), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta1=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta2=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), data_strategy=st.data(), **hu.gcs) def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, data_strategy, gc, dc): param, mom1, grad = inputs ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) # Create a 1D row-wise average 2nd moment tensor. mom2 = data_strategy.draw( hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0], elements=hu.elements_of_type(dtype=np.float32)) ) mom2 = np.absolute(mom2) # Create an indexing array containing values which index into grad indices = data_strategy.draw( hu.tensor( max_dim=1, min_value=1, max_value=grad.shape[0], dtype=np.int64, elements=st.sampled_from(np.arange(grad.shape[0])), ), ) # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment # tensor that is strictly 1-dimensional and equal in length to the # first dimension of the parameters, so indices must also be # 1-dimensional. 
indices = indices.flatten() hypothesis.note('indices.shape: %s' % str(indices.shape)) # Verify that the generated indices are unique hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices))) # Sparsify grad grad = grad[indices] op = core.CreateOperator( "RowWiseSparseAdam", ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], ["param", "mom1", "mom2", "output_grad"], beta1=beta1, beta2=beta2, epsilon=epsilon) def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, beta1, beta2, epsilon, output_grad): param_out = np.copy(param) mom1_out = np.copy(mom1) mom2_out = np.copy(mom2) grad_out = np.copy(grad) for i, index in enumerate(indices): param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ self.ref_row_wise_adam(param[index], mom1[index], mom2[index], grad[i], LR, ITER, beta1, beta2, epsilon, output_grad) return (param_out, mom1_out, mom2_out, grad_out) # Iter lives on the CPU input_device_options = {'iter': hu.cpu_do} self.assertDeviceChecks( dc, op, [param, mom1, mom2, indices, grad, LR, ITER], [0, 1, 2, 3], input_device_options=input_device_options) self.assertReferenceChecks( gc, op, [param, mom1, mom2, indices, grad, LR, ITER], functools.partial( ref_row_wise_sparse_output_grad, beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), input_device_options=input_device_options)
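# The corrected_local_rate in ref_adam above is the standard Adam bias
# correction sqrt(1 - beta2^t) / (1 - beta1^t); at t=1 with beta1=0.9 and
# beta2=0.99 it is exactly sqrt(0.01) / 0.1 = 1.0. Illustrative only:
def _example_adam_bias_correction(beta1=0.9, beta2=0.99, t=1):
    return np.sqrt(1 - np.power(beta2, t)) / (1 - np.power(beta1, t))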
class LpnormTest(hu.HypothesisTestCase): @given(inputs=hu.tensors(n=1, min_dim=1, max_dim=3, dtype=np.float32), **hu.gcs) @settings(deadline=1000) def test_Lp_Norm(self, inputs, gc, dc): X = inputs[0] # avoid kinks by moving away from 0 X += 0.02 * np.sign(X) X[X == 0.0] += 0.02 self.ws.create_blob("X").feed(X) op = core.CreateOperator( 'LpNorm', ['X'], ['l1_norm'], p=1, ) self.ws.run(op) np.testing.assert_allclose(self.ws.blobs[("l1_norm")].fetch(), np.linalg.norm((X).flatten(), ord=1), rtol=1e-4, atol=1e-4) self.assertDeviceChecks(dc, op, [X], [0]) # Gradient check wrt X self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=1e-2, threshold=1e-2) op = core.CreateOperator( 'LpNorm', ['X'], ['l2_norm'], p=2, ) self.ws.run(op) np.testing.assert_allclose(self.ws.blobs[("l2_norm")].fetch(), np.linalg.norm((X).flatten(), ord=2)**2, rtol=1e-4, atol=1e-4) self.assertDeviceChecks(dc, op, [X], [0]) # Gradient check wrt X self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=1e-2, threshold=1e-2) op = core.CreateOperator('LpNorm', ['X'], ['l2_averaged_norm'], p=2, average=True) self.ws.run(op) np.testing.assert_allclose(self.ws.blobs[("l2_averaged_norm")].fetch(), np.linalg.norm( (X).flatten(), ord=2)**2 / X.size, rtol=1e-4, atol=1e-4) @given(x=hu.tensor(min_dim=1, max_dim=10, dtype=np.float32, elements=st.integers(min_value=-100, max_value=100)), p=st.integers(1, 2), average=st.integers(0, 1)) def test_lpnorm_shape_inference(self, x, p, average): workspace.FeedBlob('x', x) net = core.Net("lpnorm_test") result = net.LpNorm(['x'], p=p, average=bool(average)) (shapes, types) = workspace.InferShapesAndTypes([net]) workspace.RunNetOnce(net) self.assertEqual(shapes[result], list(workspace.blobs[result].shape)) self.assertEqual(types[result], core.DataType.FLOAT)
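# Note the convention the assertions above depend on: LpNorm with p=2 returns
# the *squared* L2 norm (a sum of squares, not its square root), and
# average=True divides by the element count. Plain-NumPy equivalents,
# illustrative only:
def _example_lp_norms():
    X = np.array([3.0, 4.0], dtype=np.float32)
    l1 = np.abs(X).sum()           # 7.0
    l2_sq = np.square(X).sum()     # 25.0, i.e. np.linalg.norm(X, ord=2) ** 2
    l2_avg = l2_sq / X.size        # 12.5
    return l1, l2_sq, l2_avg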
class TestAdagrad(hu.HypothesisTestCase): @staticmethod def ref_adagrad(param_in, mom_in, grad, lr, epsilon): mom_out = mom_in + np.square(grad) grad_adj = lr * grad / (np.sqrt(mom_out) + epsilon) param_out = param_in + grad_adj return (param_out, mom_out) @given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs) def test_adagrad(self, inputs, lr, epsilon, gc, dc): param, momentum, grad = inputs lr = np.array([lr], dtype=np.float32) op = core.CreateOperator( "Adagrad", ["param", "momentum", "grad", "lr"], ["param", "momentum"], epsilon=epsilon, device_option=gc, ) self.assertReferenceChecks( gc, op, [param, momentum, grad, lr], functools.partial(self.ref_adagrad, epsilon=epsilon)) @given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), data_strategy=st.data(), **hu.gcs) def test_sparse_adagrad(self, inputs, lr, epsilon, data_strategy, gc, dc): param, momentum, grad = inputs momentum = np.abs(momentum) lr = np.array([lr], dtype=np.float32) # Create an indexing array containing values which index into grad indices = data_strategy.draw( hu.tensor(dtype=np.int64, elements=st.sampled_from(np.arange(grad.shape[0]))), ) hypothesis.note('indices.shape: %s' % str(indices.shape)) # For now, the indices must be unique hypothesis.assume( np.array_equal(np.unique(indices.flatten()), np.sort(indices.flatten()))) # Sparsify grad grad = grad[indices] op = core.CreateOperator( "SparseAdagrad", ["param", "momentum", "indices", "grad", "lr"], ["param", "momentum"], epsilon=epsilon, device_option=gc) def ref_sparse(param, momentum, indices, grad, lr): param_out = np.copy(param) momentum_out = np.copy(momentum) for i, index in enumerate(indices): param_out[index], momentum_out[index] = self.ref_adagrad( param[index], momentum[index], grad[i], lr, epsilon) return (param_out, momentum_out) self.assertReferenceChecks(gc, op, [param, momentum, indices, grad, lr], ref_sparse) @given(inputs=hu.tensors(n=2), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), data_strategy=st.data(), **hu.gcs) def test_sparse_adagrad_empty(self, inputs, lr, epsilon, data_strategy, gc, dc): param, momentum = inputs momentum = np.abs(momentum) lr = np.array([lr], dtype=np.float32) grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32) indices = np.empty(shape=(0, ), dtype=np.int64) hypothesis.note('indices.shape: %s' % str(indices.shape)) op = core.CreateOperator( "SparseAdagrad", ["param", "momentum", "indices", "grad", "lr"], ["param", "momentum"], epsilon=epsilon, device_option=gc) def ref_sparse(param, momentum, indices, grad, lr): param_out = np.copy(param) momentum_out = np.copy(momentum) return (param_out, momentum_out) self.assertReferenceChecks(gc, op, [param, momentum, indices, grad, lr], ref_sparse)
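# How the sparse tests above shape their inputs: `indices` selects rows of
# the parameter, and `grad` carries exactly one gradient row per index, so
# parameter row indices[i] is updated from grad row i and all other rows are
# left untouched. A minimal sketch with hypothetical values:
def _example_sparse_inputs():
    param = np.zeros((4, 2), dtype=np.float32)
    full_grad = np.ones((4, 2), dtype=np.float32)
    indices = np.array([0, 2], dtype=np.int64)
    grad = full_grad[indices]      # shape (2, 2): rows for params 0 and 2
    return param, indices, grad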
class TestLearningRateAdaption(hu.HypothesisTestCase):
    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           lr_alpha=st.floats(min_value=0.01, max_value=0.99,
                              allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_learning_rate_adaption_op_normalization(self, inputs, lr,
                                                     lr_alpha, gc, dc):
        grad, effgrad = inputs
        lr = np.array([lr], dtype=np.float32)
        op = core.CreateOperator(
            'LearningRateAdaption',
            ['lr', 'grad', 'effgrad'],
            ['output_lr'],
            lr_alpha=lr_alpha)

        def ref(lr, grad, effgrad):
            flattened_grad = grad.flatten()
            flattened_effgrad = effgrad.flatten()
            x = np.dot(flattened_grad, flattened_effgrad)
            kEps = 1e-12
            y = np.linalg.norm(flattened_grad, ord=2)
            y = np.maximum(y, kEps)
            z = np.linalg.norm(flattened_effgrad, ord=2)
            z = np.maximum(z, kEps)
            # Copy so the reference does not mutate its input in place.
            output_lr = np.copy(lr)
            output_lr[0] -= lr[0] * lr_alpha * float(x / (y * z))
            return output_lr,

        self.assertReferenceChecks(gc, op, [lr, grad, effgrad], ref)

    @given(inputs=hu.tensors(n=2),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           lr_alpha=st.floats(min_value=0.01, max_value=0.99,
                              allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_learning_rate_adaption_op_without_normalization(
            self, inputs, lr, lr_alpha, gc, dc):
        grad, effgrad = inputs
        lr = np.array([lr], dtype=np.float32)
        op = core.CreateOperator(
            'LearningRateAdaption',
            ['lr', 'grad', 'effgrad'],
            ['output_lr'],
            lr_alpha=lr_alpha,
            normalized_lr_adaption=False)

        def ref(lr, grad, effgrad):
            flattened_grad = grad.flatten()
            flattened_effgrad = effgrad.flatten()
            x = np.dot(flattened_grad, flattened_effgrad)
            # Copy so the reference does not mutate its input in place.
            output_lr = np.copy(lr)
            output_lr[0] -= lr_alpha * x
            return output_lr,

        self.assertReferenceChecks(gc, op, [lr, grad, effgrad], ref)
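# The normalized variant above implements
#   lr -= lr * lr_alpha * <g, g_eff> / (max(||g||, eps) * max(||g_eff||, eps)),
# i.e. the rate shrinks in proportion to the cosine similarity between the
# gradient and the effective gradient. A concrete, illustrative evaluation:
def _example_lr_adaption(lr=0.5, lr_alpha=0.1):
    g = np.array([1.0, 0.0], dtype=np.float32)
    g_eff = np.array([1.0, 0.0], dtype=np.float32)   # perfectly aligned
    cos = np.dot(g, g_eff) / (np.linalg.norm(g) * np.linalg.norm(g_eff))
    return lr - lr * lr_alpha * cos                  # 0.5 - 0.05 = 0.45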
class PythonOpTest(hu.HypothesisTestCase): @given(x=hu.tensor()) def test_feed(self, x): def f(inputs, _): self.assertEqual(x.shape, inputs[0].shape) self.assertEqual(type(inputs[0].shape), tuple) self.assertEqual(type(inputs[0].data), np.ndarray) np.testing.assert_almost_equal(x, inputs[0].data) op = CreatePythonOperator(f, ["x"], []) workspace.FeedBlob("x", x) workspace.RunOperatorOnce(op) def test_exception(self): op = CreatePythonOperator(MainOpFunctionThatThrowsRuntimeError, [], []) with self.assertRaisesRegexp( RuntimeError, "This is an intentional exception." ): workspace.RunOperatorOnce(op) @given(x=hu.tensor()) def test_feed_with_helper_function(self, x): def f(inputs, _): self.assertEqual(x.shape, inputs[0].shape) self.assertEqual(type(inputs[0].shape), tuple) self.assertEqual(type(inputs[0].data), np.ndarray) np.testing.assert_almost_equal(x, inputs[0].data) net = core.Net("test") net.Python(f)(["x"], []) workspace.FeedBlob("x", x) workspace.RunNetOnce(net) def test_builder_tuple(self): net = core.Net("builder_template") iter_blob = 'iter' net.Python((op_builder, ['name', 5], {'extra': 4.2}))([iter_blob], []) net.Python((op_builder, ['name', 5], {'extra': 4.2}))([iter_blob], []) for repeat in range(2): # check that the builder will be called exactly once for each # PythonOp constructor. Cloning the net will also trigger a call # to the builder when the net is created. cloned_net = net.Clone('builder_%d' % repeat) workspace.FeedBlob(iter_blob, np.array([0])) # Builder gets called once per python op in the line below workspace.CreateNet(cloned_net) for i in range(10): workspace.FeedBlob(iter_blob, np.array([i])) workspace.RunNet(cloned_net) @given(x=hu.tensor()) def test_feed_with_gc(self, x): def f(inputs, _): self.assertEqual(x.shape, inputs[0].shape) np.testing.assert_almost_equal(x, inputs[0].data) op = CreatePythonOperator(f, ["x"], []) workspace.FeedBlob("x", x) workspace.RunOperatorOnce(op) del f workspace.FeedBlob("x", x) workspace.RunOperatorOnce(op) @given(x=hu.tensor()) def test_reshape(self, x): def f(inputs, outputs): outputs[0].reshape(inputs[0].shape) self.assertEqual(x.shape, inputs[0].shape) self.assertEqual(x.shape, outputs[0].shape) outputs[0].data[...] = inputs[0].data op = CreatePythonOperator(f, ["x"], ["y"]) workspace.FeedBlob("x", x) workspace.RunOperatorOnce(op) y = workspace.FetchBlob("y") np.testing.assert_almost_equal(x, y) @given(x=hu.tensor()) def test_workspace_manipulation(self, x): """ Verify that python op can manipulate workspace directly """ def f(inputs, outputs, ws): fetched = ws.blobs['internal'].fetch() np.testing.assert_almost_equal(fetched, x) ws = workspace.C.Workspace() net = core.Net("test") net.GivenTensorFill([], ['internal'], values=x, shape=x.shape) net.Python(f, pass_workspace=True)([], []) ws.run(net) @given(x=hu.tensor()) def test_caught_exception_doesnt_terminate(self, x): def f(inputs, outputs): try: raise Exception("Exception in handler") except Exception: pass op = CreatePythonOperator(f, ["x"], ["y"]) workspace.FeedBlob("x", x) workspace.RunOperatorOnce(op) @given(x=hu.tensor(), n=st.integers(min_value=1, max_value=20), w=st.integers(min_value=1, max_value=20)) def test_multithreaded_evaluation(self, x, n, w): def f(inputs, outputs): outputs[0].reshape(inputs[0].shape) outputs[0].data[...] 
= inputs[0].data ops = [CreatePythonOperator(f, ["x"], [str(i)]) for i in range(n)] net = core.Net("net") net.Proto().op.extend(ops) net.Proto().type = "dag" net.Proto().num_workers = w iters = 100 plan = core.Plan("plan") plan.AddStep(core.ExecutionStep("test-step", net, iters)) workspace.FeedBlob("x", x) workspace.RunPlan(plan.Proto().SerializeToString()) for i in range(n): y = workspace.FetchBlob(str(i)) np.testing.assert_almost_equal(x, y) @given(x=hu.tensor(), in_place=st.booleans(), **hu.gcs) def test_gradient(self, x, in_place, gc, dc): def f(inputs, outputs): outputs[0].reshape(inputs[0].shape) outputs[0].data[...] = inputs[0].data * 2 def grad_f(inputs, outputs): # Ordering is [inputs, outputs, grad_outputs] grad_output = inputs[2] grad_input = outputs[0] grad_input.reshape(grad_output.shape) grad_input.data[...] = grad_output.data * 2 op = CreatePythonOperator( f, ["x"], ["x" if in_place else "y"], grad_f=grad_f) self.assertGradientChecks(gc, op, [x], 0, [0]) self.assertDeviceChecks(dc, op, [x], [0]) @given(inputs=hu.tensors(n=2), **hu.gcs) def test_gradient_multiple(self, inputs, gc, dc): (x1, x2) = inputs def f(inputs, outputs): for idx in [0, 1]: self.assertEqual(type(inputs[idx].shape), tuple) outputs[idx].reshape(inputs[idx].shape) outputs[idx].data[...] = inputs[idx].data * 2 def grad_f(inputs, outputs): # Ordering is [inputs, outputs, grad_outputs] self.assertEqual(len(inputs), 6) self.assertEqual(len(outputs), 2) for (grad_output_idx, grad_input_idx) in [(4, 0), (5, 1)]: grad_output = inputs[grad_output_idx] grad_input = outputs[grad_input_idx] grad_input.reshape(grad_output.shape) grad_input.data[...] = grad_output.data * 2 op = CreatePythonOperator(f, ["x1", "x2"], ["y1", "y2"], grad_f=grad_f) for idx in [0, 1]: self.assertGradientChecks(gc, op, [x1, x2], idx, [0, 1]) self.assertDeviceChecks(dc, op, [x1, x2], [0, 1]) @given(inputs=hu.tensors(n=3), **hu.gcs) def test_gradient_multiple_with_indices(self, inputs, gc, dc): (x1, x2, x3) = inputs def f(inputs, outputs): for idx in [0, 1, 2]: self.assertEqual(type(inputs[idx].shape), tuple) outputs[idx].reshape(inputs[idx].shape) outputs[idx].data[...] = inputs[idx].data * 2 def grad_f(inputs, outputs): # Ordering is [inputs, outputs, grad_outputs] self.assertEqual(len(inputs), 8) self.assertEqual(len(outputs), 1) for (grad_output_idx, grad_input_idx) in [(6, 0)]: grad_output = inputs[grad_output_idx] grad_input = outputs[grad_input_idx] grad_input.reshape(grad_output.shape) grad_input.data[...] = grad_output.data * 2 op = CreatePythonOperator( f, ["x1", "x2", "x3"], ["y1", "y2", "y3"], grad_f=grad_f, grad_output_indices=[0, 2], # Receive grad outputs for y1 and y3 grad_input_indices=[0] # Produce grad inputs for x1 ) self.assertGradientChecks(gc, op, [x1, x2, x3], 0, [0, 2]) self.assertDeviceChecks(dc, op, [x1, x2, x3], [0, 1, 2])
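# `op_builder` above is defined elsewhere in this file. The builder-tuple
# form net.Python((builder, args, kwargs)) invokes builder(*args, **kwargs)
# once per op construction and uses its return value as the op function. A
# hypothetical builder of that shape (names illustrative, not the one used
# above):
def _example_op_builder(name, index, extra=0.0):
    def op(inputs, outputs):
        # A real builder would close over (name, index, extra) and act on
        # the blobs here; this sketch does nothing.
        pass
    return op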
class TestAdadelta(serial.SerializedTestCase): @staticmethod def ref_adadelta(param_in, mom_in, mom_delta_in, grad, lr, epsilon, decay, using_fp16=False): param_in_f32 = param_in mom_in_f32 = mom_in mom_delta_in_f32 = mom_delta_in if (using_fp16): param_in_f32 = param_in.astype(np.float32) mom_in_f32 = mom_in.astype(np.float32) mom_delta_in_f32 = mom_delta_in.astype(np.float32) mom_out = decay * mom_in_f32 + (1.0 - decay) * grad * grad new_grad = (np.sqrt(mom_delta_in_f32 + epsilon) / np.sqrt(mom_out + epsilon)) * grad param_out = param_in_f32 + lr * new_grad mom_delta_out = decay * mom_delta_in_f32 + ( 1.0 - decay) * new_grad * new_grad if (using_fp16): return (param_out.astype(np.float16), mom_out.astype(np.float16), mom_delta_out.astype(np.float16)) else: return (param_out.astype(np.float32), mom_out.astype(np.float32), mom_delta_out.astype(np.float32)) @given(inputs=hu.tensors(n=4), lr=hu.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=hu.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), decay=hu.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs) @settings(deadline=1000) def test_adadelta(self, inputs, lr, epsilon, decay, gc, dc): param, moment, moment_delta, grad = inputs moment = np.abs(moment) moment_delta = np.abs(moment_delta) lr = np.array([lr], dtype=np.float32) op = core.CreateOperator( "Adadelta", ["param", "moment", "moment_delta", "grad", "lr"], ["param", "moment", "moment_delta"], epsilon=epsilon, decay=decay, device_option=gc, ) self.assertReferenceChecks( gc, op, [param, moment, moment_delta, grad, lr], functools.partial(self.ref_adadelta, epsilon=epsilon, decay=decay)) # Suppress filter_too_much health check. # Likely caused by `assume` call falling through too often. 
@settings(suppress_health_check=[HealthCheck.filter_too_much], deadline=10000) @given(inputs=hu.tensors(n=4), lr=hu.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=hu.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), decay=hu.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs) def test_sparse_adadelta(self, inputs, lr, epsilon, decay, gc, dc): param, moment, moment_delta, grad = inputs moment = np.abs(moment) moment_delta = np.abs(moment_delta) lr = np.array([lr], dtype=np.float32) # Create an indexing array containing values that are lists of indices, # which index into grad indices = np.random.choice(np.arange(grad.shape[0]), size=np.random.randint(grad.shape[0]), replace=False) # Sparsify grad grad = grad[indices] op = core.CreateOperator( "SparseAdadelta", ["param", "moment", "moment_delta", "indices", "grad", "lr"], ["param", "moment", "moment_delta"], epsilon=epsilon, decay=decay, device_option=gc) def ref_sparse(param, moment, moment_delta, indices, grad, lr, decay, ref_using_fp16): param_out = np.copy(param) moment_out = np.copy(moment) moment_delta_out = np.copy(moment_delta) for i, index in enumerate(indices): param_out[index], moment_out[index], moment_delta_out[ index] = self.ref_adadelta(param[index], moment[index], moment_delta[index], grad[i], lr, epsilon, decay, ref_using_fp16) return (param_out, moment_out, moment_delta_out) ref_using_fp16_values = [False] if gc == hu.gpu_do: ref_using_fp16_values.append(True) for ref_using_fp16 in ref_using_fp16_values: moment_i = None moment_delta_i = None param_i = None if (ref_using_fp16): moment_i = moment.astype(np.float16) moment_delta_i = moment_delta.astype(np.float16) param_i = param.astype(np.float16) else: moment_i = moment.astype(np.float32) moment_delta_i = moment_delta.astype(np.float32) param_i = param.astype(np.float32) self.assertReferenceChecks(gc, op, [ param_i, moment_i, moment_delta_i, indices, grad, lr, decay, ref_using_fp16 ], ref_sparse) @given(inputs=hu.tensors(n=3), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), decay=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs) @settings(deadline=1000) def test_sparse_adadelta_empty(self, inputs, lr, epsilon, decay, gc, dc): param, moment, moment_delta = inputs moment = np.abs(moment) lr = np.array([lr], dtype=np.float32) grad = np.empty(shape=(0, ) + param.shape[1:], dtype=np.float32) indices = np.empty(shape=(0, ), dtype=np.int64) hypothesis.note('indices.shape: %s' % str(indices.shape)) op = core.CreateOperator( "SparseAdadelta", ["param", "moment", "moment_delta", "indices", "grad", "lr"], ["param", "moment", "moment_delta"], epsilon=epsilon, decay=decay, device_option=gc) def ref_sparse_empty(param, moment, moment_delta, indices, grad, lr, decay): param_out = np.copy(param) moment_out = np.copy(moment) moment_delta_out = np.copy(moment_delta) return (param_out, moment_out, moment_delta_out) ref_using_fp16_values = [False] if gc == hu.gpu_do: ref_using_fp16_values.append(True) for ref_using_fp16 in ref_using_fp16_values: moment_i = None moment_delta_i = None param_i = None if (ref_using_fp16): moment_i = moment.astype(np.float16) moment_delta_i = moment_delta.astype(np.float16) param_i = param.astype(np.float16) else: moment_i = moment.astype(np.float32) moment_delta_i = 
moment_delta.astype(np.float32) param_i = param.astype(np.float32) self.assertReferenceChecks( gc, op, [param_i, moment_i, moment_delta_i, indices, grad, lr, decay], ref_sparse_empty)
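# A concrete step of ref_adadelta above with decay=0.5, epsilon=1.0, lr=0.1
# and scalar inputs param=1, moment=2, moment_delta=0, grad=2:
# moment -> 0.5*2 + 0.5*4 = 3, update = sqrt(1/4) * 2 = 1, param -> 1.1,
# moment_delta -> 0.5. Illustrative only:
def _example_adadelta_step():
    decay, epsilon, lr = 0.5, 1.0, 0.1
    param, mom, mom_delta, grad = 1.0, 2.0, 0.0, 2.0
    mom_out = decay * mom + (1.0 - decay) * grad * grad
    new_grad = (np.sqrt(mom_delta + epsilon) /
                np.sqrt(mom_out + epsilon)) * grad
    param_out = param + lr * new_grad
    mom_delta_out = decay * mom_delta + (1.0 - decay) * new_grad * new_grad
    return param_out, mom_out, mom_delta_out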
class TestAdamOps(hu.HypothesisTestCase): @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta1=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta2=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **mu.gcs) def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): param, mom1, mom2, grad = inputs ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) mom2 = np.absolute(mom2) op = core.CreateOperator( "Adam", ["param", "mom1", "mom2", "grad", "lr", "iter"], ["output_param", "output_mom1", "output_mom2"], beta1=beta1, beta2=beta2, epsilon=epsilon) # Iter lives on the CPU input_device_options = {'iter': hu.cpu_do, 'lr': hu.cpu_do} self.assertDeviceChecks( dc, op, [param, mom1, mom2, grad, LR, ITER], [0], input_device_options=input_device_options, threshold=0.001) @given(inputs=hu.tensors(n=4), ITER=st.integers(min_value=0, max_value=10000), LR=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta1=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), beta2=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **mu.gcs) def test_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): param, mom1, mom2, grad = inputs ITER = np.array([ITER], dtype=np.int64) LR = np.array([LR], dtype=np.float32) mom2 = np.absolute(mom2) op = core.CreateOperator( "Adam", ["param", "mom1", "mom2", "grad", "lr", "iter"], ["output_param", "output_mom1", "output_mom2", "output_grad"], beta1=beta1, beta2=beta2, epsilon=epsilon) # Iter lives on the CPU input_device_options = {'iter': hu.cpu_do, 'lr': hu.cpu_do} self.assertDeviceChecks( dc, op, [param, mom1, mom2, grad, LR, ITER], [0], input_device_options=input_device_options, threshold=0.001)
class TestWngrad(serial.SerializedTestCase): @serial.given(inputs=hu.tensors(n=2), seq_b=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs_cpu_only) def test_wngrad_dense_base(self, inputs, seq_b, lr, epsilon, gc, dc): param, grad = inputs seq_b = np.array([seq_b, ], dtype=np.float32) lr = np.array([lr], dtype=np.float32) op = core.CreateOperator( "Wngrad", ["param", "seq_b", "grad", "lr"], ["param", "seq_b"], epsilon=epsilon, device_option=gc, ) self.assertReferenceChecks( gc, op, [param, seq_b, grad, lr], functools.partial(ref_wngrad, epsilon=epsilon)) @given(inputs=hu.tensors(n=2), seq_b=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs_cpu_only) def test_wngrad_dense_output_effective_lr(self, inputs, seq_b, lr, epsilon, gc, dc): param, grad = inputs seq_b = np.array([seq_b, ], dtype=np.float32) lr = np.array([lr], dtype=np.float32) op = core.CreateOperator( "Wngrad", ["param", "seq_b", "grad", "lr"], ["param", "seq_b", "effective_lr"], epsilon=epsilon, device_option=gc, ) self.assertReferenceChecks( gc, op, [param, seq_b, grad, lr], functools.partial(ref_wngrad, epsilon=epsilon, output_effective_lr=True)) @given(inputs=hu.tensors(n=2), seq_b=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), lr=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), epsilon=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), **hu.gcs_cpu_only) def test_wngrad_dense_output_effective_lr_and_update( self, inputs, seq_b, lr, epsilon, gc, dc): param, grad = inputs seq_b = np.abs(np.array([seq_b, ], dtype=np.float32)) lr = np.array([lr], dtype=np.float32) op = core.CreateOperator( "Wngrad", ["param", "seq_b", "grad", "lr"], ["param", "seq_b", "effective_lr", "update"], epsilon=epsilon, device_option=gc, ) self.assertReferenceChecks( gc, op, [param, seq_b, grad, lr], functools.partial(ref_wngrad, epsilon=epsilon, output_effective_lr_and_update=True)) # Suppress filter_too_much health check. # Likely caused by `assume` call falling through too often. 
    @settings(suppress_health_check=[HealthCheck.filter_too_much])
    @given(inputs=hu.tensors(n=2),
           seq_b=st.floats(min_value=0.01, max_value=0.99,
                           allow_nan=False, allow_infinity=False),
           lr=st.floats(min_value=0.01, max_value=0.99,
                        allow_nan=False, allow_infinity=False),
           epsilon=st.floats(min_value=0.01, max_value=0.99,
                             allow_nan=False, allow_infinity=False),
           **hu.gcs_cpu_only)
    def test_sparse_wngrad(self, inputs, seq_b, lr, epsilon, gc, dc):
        return wngrad_sparse_test_helper(self, inputs, seq_b, lr, epsilon,
                                         None, gc, dc)

    @serial.given(inputs=hu.tensors(n=1),
                  lr=st.floats(min_value=0.01, max_value=0.99,
                               allow_nan=False, allow_infinity=False),
                  seq_b=st.floats(min_value=0.01, max_value=0.99,
                                  allow_nan=False, allow_infinity=False),
                  epsilon=st.floats(min_value=0.01, max_value=0.99,
                                    allow_nan=False, allow_infinity=False),
                  data_strategy=st.data(),
                  **hu.gcs_cpu_only)
    def test_sparse_wngrad_empty(self, inputs, seq_b, lr, epsilon,
                                 data_strategy, gc, dc):
        param = inputs[0]
        seq_b = np.array([seq_b, ], dtype=np.float32)
        lr = np.array([lr], dtype=np.float32)

        grad = np.empty(shape=(0,) + param.shape[1:], dtype=np.float32)
        indices = np.empty(shape=(0,), dtype=np.int64)

        hypothesis.note('indices.shape: %s' % str(indices.shape))

        op = core.CreateOperator(
            "SparseWngrad",
            ["param", "seq_b", "indices", "grad", "lr"],
            ["param", "seq_b"],
            epsilon=epsilon,
            device_option=gc)

        def ref_sparse(param, seq_b, indices, grad, lr):
            param_out = np.copy(param)
            seq_b_out = np.copy(seq_b)
            return (param_out, seq_b_out)

        print('test_sparse_wngrad_empty with full precision embedding')
        seq_b_i = seq_b.astype(np.float32)
        param_i = param.astype(np.float32)

        self.assertReferenceChecks(
            gc, op, [param_i, seq_b_i, indices, grad, lr], ref_sparse
        )
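# A runnable sketch of the empty-update case exercised above: with zero
# indices and a zero-row gradient, SparseWngrad should leave both param and
# seq_b unchanged (assumes a Caffe2 build with the operator registered):
def _example_sparse_wngrad_empty():
    param = np.random.rand(4, 3).astype(np.float32)
    seq_b = np.array([0.5], dtype=np.float32)
    lr = np.array([0.1], dtype=np.float32)
    grad = np.empty(shape=(0, 3), dtype=np.float32)
    indices = np.empty(shape=(0,), dtype=np.int64)
    for name, value in [("param", param), ("seq_b", seq_b),
                        ("indices", indices), ("grad", grad), ("lr", lr)]:
        workspace.FeedBlob(name, value)
    workspace.RunOperatorOnce(core.CreateOperator(
        "SparseWngrad", ["param", "seq_b", "indices", "grad", "lr"],
        ["param", "seq_b"], epsilon=0.01))
    # No rows were selected, so nothing should have moved.
    np.testing.assert_array_equal(workspace.FetchBlob("param"), param)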