def test_grad_scale():
    x = scalar()
    z = grad(grad_scale(x, 2) ** 2, x)
    z2 = grad(x ** 2, x)
    f = aesara.function([x], outputs=[z, z2])

    if config.mode != "FAST_COMPILE":
        topo = f.maker.fgraph.toposort()
        assert not any(isinstance(node.op, GradScale) for node in topo)

    out = f(2.0)
    assert np.allclose(out, (8, 4))
def test_grad_clip():
    x = scalar()
    z = grad(grad_clip(x, -1, 1) ** 2, x)
    z2 = grad(x ** 2, x)
    f = aesara.function([x], outputs=[z, z2])

    if config.mode != "FAST_COMPILE":
        topo = f.maker.fgraph.toposort()
        assert not any(isinstance(node.op, GradClip) for node in topo)

    out = f(2.0)
    assert np.allclose(out, (1, 4))
    assert not np.allclose(out[0], out[1])
def test_known_grads():
    # Tests that `grad` with no `known_grads` matches the result of
    # passing each variable's own gradient back in via `known_grads`.

    full_range = aet.arange(10)
    x = scalar("x")
    t = iscalar("t")
    ft = full_range[t]
    ft.name = "ft"
    coeffs = vector("c")
    ct = coeffs[t]
    ct.name = "ct"
    p = x ** ft
    p.name = "p"
    y = ct * p
    y.name = "y"
    cost = sqr(y)
    cost.name = "cost"

    layers = [[cost], [y], [ct, p], [ct, x, ft], [coeffs, t, full_range, x]]

    inputs = [coeffs, t, x]
    rng = np.random.default_rng([2012, 11, 15])
    values = [
        rng.standard_normal((10)),
        rng.integers(10),
        rng.standard_normal(),
    ]
    values = [np.cast[ipt.dtype](value) for ipt, value in zip(inputs, values)]

    true_grads = grad(cost, inputs, disconnected_inputs="ignore")
    true_grads = aesara.function(inputs, true_grads)
    true_grads = true_grads(*values)

    for layer in layers:
        first = grad(cost, layer, disconnected_inputs="ignore")
        known = OrderedDict(zip(layer, first))
        full = grad(
            cost=None, known_grads=known, wrt=inputs, disconnected_inputs="ignore"
        )
        full = aesara.function(inputs, full)
        full = full(*values)
        assert len(true_grads) == len(full)
        for a, b, var in zip(true_grads, full, inputs):
            assert np.allclose(a, b)
def test_xent_thing_int32(self):
    x = matrix("x")
    y = lvector("y")
    yi = aet.cast(y, "int32")
    expressions = [
        aet_sum(-log(softmax(x)[aet.arange(yi.shape[0]), yi])),
        -aet_sum(log(softmax(x)[aet.arange(yi.shape[0]), yi])),
        -aet_sum(log(softmax(x))[aet.arange(yi.shape[0]), yi]),
        aet_sum(-log(softmax(x))[aet.arange(yi.shape[0]), yi]),
    ]

    for expr in expressions:
        fgraph = FunctionGraph([x, y], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)
        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 5
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

        # Also verify the gradient wrt x
        fgraph = FunctionGraph([x, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)
        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 3
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops
def test_connection_pattern_override(self, cls_ofg):
    x, y = vectors("xy")

    def f1(x, y):
        del x
        # We pretend to know how to backpropagate wrt x anyway,
        # and we don't care about the gradient wrt y.
        return y + aet_round(y)

    def f1_back(inputs, output_gradients):
        return [output_gradients[0], disconnected_type()]

    op = cls_ofg(
        inputs=[x, y],
        outputs=[f1(x, y)],
        grad_overrides=f1_back,
        connection_pattern=[[True], [False]],  # This is new
        on_unused_input="ignore",
    )  # This is new

    c = op(x, y)
    g1 = grad(c.sum(), x)

    out = g1.eval(
        {x: np.ones((5,), dtype=np.float32), y: np.ones((5,), dtype=np.float32)}
    )
    assert np.allclose(out, [1.0] * 5)
def test_zero_gradient_shape(self):
    # Ensure that a zero gradient has the proper shape.
    x = dmatrix()
    f = aesara.function([x], grad(dscalar(), x, disconnected_inputs="ignore"))
    a = np.ones((3, 7))
    assert (f(a) == 0).all()  # Zero gradient
    assert a.shape == f(a).shape  # With proper shape
def test_grad_name(self):
    A = matrix("A")
    x = vector("x")
    f = dot(x, dot(A, x))
    f.name = "f"
    g = grad(f, x)
    assert g.name == "(df/dx)"
def test_scipy_paper_example2(self):
    """This just checks that things compile and run."""
    rng = np.random

    x = matrix()
    y = vector()
    w = shared(rng.randn(100))
    b = shared(np.zeros(()))

    # Construct Aesara expression graph
    p_1 = 1 / (1 + exp(-dot(x, w) - b))
    xent = -y * log(p_1) - (1 - y) * log(1 - p_1)
    prediction = p_1 > 0.5
    cost = xent.mean() + 0.01 * (w ** 2).sum()
    gw, gb = grad(cost, [w, b])

    # Compile expressions to functions
    train = function(
        inputs=[x, y],
        outputs=[prediction, xent],
        updates=[(w, w - 0.1 * gw), (b, b - 0.1 * gb)],
    )
    function(inputs=[x], outputs=prediction)

    N = 4
    feats = 100
    D = (rng.randn(N, feats), rng.randint(size=4, low=0, high=2))
    training_steps = 10
    for i in range(training_steps):
        pred, err = train(D[0], D[1])
def test_Nparam(self):
    # grad: Test passing multiple variable params
    o = TestGrad.Obj1()
    a1 = o.make_node()
    g0, g1 = grad(a1.outputs[0], a1.inputs)
    g0.name = None
    assert o.gval0 is g0
    assert o.gval1 is g1
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    # This is a basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.
    # We check that we loop when there are too many threads.

    n_in = 1000
    batch_size = 4097
    n_out = 1250

    if not isinstance(mode_with_gpu, aesara.compile.debugmode.DebugMode):
        n_in = 4098
        n_out = 4099

    y = lvector("y")

    b = fvector("b")

    # We precompute the dot product with a big shape beforehand so that the
    # test of GpuCrossentropySoftmax1HotWithBiasDx does not fail with
    # "the launch timed out and was terminated" on GPU cards that are not
    # powerful enough. We need the big shape to check for a corner case.
    dot_result = fmatrix("dot_result")

    xx = np.asarray(np.random.rand(batch_size, n_in), dtype=np.float32)
    yy = np.ones((batch_size,), dtype="int32")

    b_values = np.zeros((n_out,), dtype="float32")
    W_values = np.asarray(np.random.rand(n_in, n_out), dtype="float32")

    dot_value = np.asarray(np.dot(xx, W_values), dtype="float32")
    del W_values
    p_y_given_x = aesara.tensor.nnet.softmax(dot_result + b)
    y_pred = argmax(p_y_given_x, axis=-1)
    loss = -mean(log(p_y_given_x)[aet.arange(y.shape[0]), y])
    dW = grad(loss, dot_result)
    classify = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_without_gpu
    )
    classify_gpu = aesara.function(
        inputs=[y, b, dot_result], outputs=[loss, y_pred, dW], mode=mode_with_gpu
    )

    assert any(
        isinstance(
            node.op, aesara.tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias
        )
        for node in classify.maker.fgraph.toposort()
    )
    assert any(
        isinstance(node.op, GpuCrossentropySoftmaxArgmax1HotWithBias)
        for node in classify_gpu.maker.fgraph.toposort()
    )

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    utt.assert_allclose(out[0], gout[0])
    utt.assert_allclose(out[2], gout[2], atol=3e-6)
    utt.assert_allclose(out[1], gout[1])
def test_disconnected_paths(self):
    # Test that taking the gradient through a disconnected path raises
    # an exception.
    a = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)

    x = matrix("x")

    # This MUST raise a DisconnectedInputError error.
    # This also raises an additional warning from gradients.py.
    with pytest.raises(DisconnectedInputError):
        grad(disconnected_grad(x).sum(), x)

    # This MUST NOT raise a DisconnectedInputError error.
    y = grad((x + disconnected_grad(x)).sum(), x)

    a = matrix("a")
    b = matrix("b")
    y = a + disconnected_grad(b)

    # This MUST raise a DisconnectedInputError error.
    # This also raises an additional warning from gradients.py.
    with pytest.raises(DisconnectedInputError):
        grad(y.sum(), b)

    # This MUST NOT raise a DisconnectedInputError error.
    grad(y.sum(), a)
def test_compute_test_value(self):
    x = scalar("x")
    x.tag.test_value = np.array(1.0, dtype=config.floatX)
    op = OpFromGraph([x], [x ** 3])
    y = scalar("y")
    y.tag.test_value = np.array(1.0, dtype=config.floatX)
    f = op(y)
    grad_f = grad(f, y)
    assert grad_f.tag.test_value is not None
def test_shared_grad(self, cls_ofg):
    x, y, z = matrices("xyz")
    s = shared(np.random.rand(2, 2).astype(config.floatX))
    e = x + y * z + s
    op = cls_ofg([x, y, z], [e])
    f = op(x, y, z)
    f = f - grad(tt_sum(f), y)
    fn = function([x, y, z], f)
    xv = np.ones((2, 2), dtype=config.floatX)
    yv = np.ones((2, 2), dtype=config.floatX) * 3
    zv = np.ones((2, 2), dtype=config.floatX) * 5
    assert np.allclose(11.0 + s.get_value(), fn(xv, yv, zv))

    # Take the gradient again, this time with respect to the shared variable
    f = op(x, y, z)
    f = f - grad(tt_sum(f), s)
    fn = function([x, y, z], f)
    assert np.allclose(15.0 + s.get_value(), fn(xv, yv, zv))
def test_Rop_dot_bug_18Oct2013_Jeremiah(self):
    # This test refers to a bug reported by Jeremiah Lowin on 18th Oct
    # 2013. The bug occurs when, through a dot operation, there is only
    # one differentiable path (i.e. there is no gradient wrt one of
    # the inputs).
    x = aet.arange(20.0).reshape([1, 20])
    v = aesara.shared(np.ones([20]))
    d = dot(x, v).sum()
    Rop(grad(d, v), v, v)
def test_downsample(self):
    rng = np.random.RandomState(utt.fetch_seed())
    # ws, shp
    examples = (
        ((2,), (16,)),
        ((2,), (4, 16)),
        ((2,), (4, 2, 16)),
        ((1, 1), (4, 2, 16, 16)),
        ((2, 2), (4, 2, 16, 16)),
        ((3, 3), (4, 2, 16, 16)),
        ((3, 2), (4, 2, 16, 16)),
        ((3, 2, 2), (3, 2, 16, 16, 16)),
        ((2, 3, 2), (3, 2, 16, 16, 16)),
        ((2, 2, 3), (3, 2, 16, 16, 16)),
        ((2, 2, 3, 2), (3, 2, 6, 6, 6, 5)),
    )

    for example, ignore_border in itertools.product(examples, [True, False]):
        (ws, shp) = example
        vx = rng.rand(*shp)
        vex = rng.rand(*shp)

        x = aesara.shared(vx)
        ex = aesara.shared(vex)

        maxpool_op = Pool(ignore_border, ndim=len(ws))
        a_pooled = maxpool_op(x, ws).flatten()
        yv = Rop(a_pooled, x, ex)
        mode = None
        if aesara.config.mode == "FAST_COMPILE":
            mode = "FAST_RUN"
        rop_f = function([], yv, on_unused_input="ignore", mode=mode)
        sy, _ = aesara.scan(
            lambda i, y, x, v: (grad(y[i], x) * v).sum(),
            sequences=aet.arange(a_pooled.shape[0]),
            non_sequences=[a_pooled, x, ex],
            mode=mode,
        )
        scan_f = function([], sy, on_unused_input="ignore", mode=mode)

        v1 = rop_f()
        v2 = scan_f()
        assert np.allclose(v1, v2), f"Rop mismatch: {v1} {v2}"
def test_1D_grad(self):
    c = vector()
    p_y = exp(c) / exp(c).sum()

    # test that function contains softmax and no div.
    g = aesara.function([c], grad(p_y.sum(), c), mode=self.mode)

    g_ops = [n.op for n in g.maker.fgraph.toposort()]
    assert len(g_ops) == 2
    assert isinstance(g_ops[0], Softmax)
    assert isinstance(g_ops[1], SoftmaxGrad)
def __init__(
    self,
    input=None,
    target=None,
    n_input=1,
    n_hidden=1,
    n_output=1,
    lr=1e-3,
    **kw,
):
    super().__init__(**kw)

    if input is None:
        input = dvector("input")
    if target is None:
        target = dvector("target")

    self.input = input
    self.target = target
    self.lr = shared(lr, "learning_rate")
    self.w1 = shared(np.zeros((n_hidden, n_input)), "w1")
    self.w2 = shared(np.zeros((n_output, n_hidden)), "w2")
    # print self.lr.type

    self.hidden = sigmoid(dot(self.w1, self.input))
    self.output = dot(self.w2, self.hidden)
    self.cost = aet_sum((self.output - self.target) ** 2)

    self.sgd_updates = {
        self.w1: self.w1 - self.lr * grad(self.cost, self.w1),
        self.w2: self.w2 - self.lr * grad(self.cost, self.w2),
    }

    self.sgd_step = pfunc(
        params=[self.input, self.target],
        outputs=[self.output, self.cost],
        updates=self.sgd_updates,
    )

    self.compute_output = pfunc([self.input], self.output)

    self.output_from_hidden = pfunc([self.hidden], self.output)
def test_shared_grad(self, cls_ofg):
    x, y, z = matrices("xyz")
    s = shared(np.random.random((2, 2)).astype(config.floatX))
    e = x + y * z + s
    op = cls_ofg([x, y, z], [e])
    f = op(x, y, z)
    f = f - grad(at_sum(f), y)
    fn = function([x, y, z], f)
    xv = np.ones((2, 2), dtype=config.floatX)
    yv = np.ones((2, 2), dtype=config.floatX) * 3
    zv = np.ones((2, 2), dtype=config.floatX) * 5
    np.testing.assert_array_almost_equal(11.0 + s.get_value(), fn(xv, yv, zv), 4)

    # Take the gradient again, this time with respect to the shared variable
    f = op(x, y, z)
    f = f - grad(at_sum(f), s)
    fn = function([x, y, z], f)
    np.testing.assert_array_almost_equal(15.0 + s.get_value(), fn(xv, yv, zv), 4)
def test_transpose_grad(self):
    # this should be a transposed softmax
    c = matrix()
    p_y = exp(c) / exp(c).sum(axis=0)

    # test that function contains softmax and no div.
    g = aesara.function([c], grad(p_y.sum(), c), mode=self.mode)

    g_ops = [n.op for n in g.maker.fgraph.toposort()]
    assert len(g_ops) == 2
    assert isinstance(g_ops[0], Softmax)
    assert isinstance(g_ops[1], SoftmaxGrad)
def get_outputs(x, w):
    features, _ = scan(
        outer_scan_step,
        sequences=[x],
        non_sequences=[w],
        strict=True,
        name="the_outer_scan",
    )

    return_val = grad(features.sum(), w)
    return return_val
def test_grad_constant(self):
    # Test that the gradient handles Constants and consider_constant variables
    # consistently

    x = scalar()
    y = scalar()
    z_x = x + y
    z_one = one + y
    g_x = grad(z_x, x, consider_constant=[x])
    g_one = grad(z_one, one)

    f = aesara.function([x, y], [g_x, g_one])

    g_x, g_one = f(1, 0.5)

    if not np.allclose(g_x, g_one):
        raise AssertionError(
            "Gradient using consider_constant is "
            + str(g_x)
            + " but gradient with respect to the same Constant is "
            + str(g_one)
        )
def test_grad(self, cls_ofg):
    x, y, z = matrices("xyz")
    e = x + y * z
    op = cls_ofg([x, y, z], [e])
    f = op(x, y, z)
    f = f - grad(tt_sum(f), y)
    fn = function([x, y, z], f)
    xv = np.ones((2, 2), dtype=config.floatX)
    yv = np.ones((2, 2), dtype=config.floatX) * 3
    zv = np.ones((2, 2), dtype=config.floatX) * 5
    assert np.all(11.0 == fn(xv, yv, zv))
def test_undefined_grad_grad(self):
    # Tests that undefined grads are caught in the grad method

    class DummyOp(Op):
        __props__ = ()

        def make_node(self, x):
            return Apply(self, [x], [x.type()])

        def grad(self, inputs, output_grads):
            return [grad_undefined(self, 0, inputs[0])]

        def perform(self, *args, **kwargs):
            raise NotImplementedError()

    a = scalar()
    b = DummyOp()(a)

    with pytest.raises(TypeError):
        grad(b, a)
def test_NNone_rval(self):
    # grad: Test returning some zero value from grad
    o = TestGrad.Obj1()
    a1 = o.make_node()
    g0, g1, g2 = grad(
        a1.outputs[0], a1.inputs + [scalar("z")], disconnected_inputs="ignore"
    )
    assert o.gval0 is g0
    assert o.gval1 is g1
    assert g2.owner.op == aet.fill
    assert g2.owner.inputs[1].data == 0
def setup_method(self):
    self.k = iscalar("k")
    self.A = vector("A")
    result, _ = scan(
        fn=lambda prior_result, A: prior_result * A,
        outputs_info=ones_like(self.A),
        non_sequences=self.A,
        n_steps=self.k,
    )
    result_check, _ = scan_checkpoints(
        fn=lambda prior_result, A: prior_result * A,
        outputs_info=ones_like(self.A),
        non_sequences=self.A,
        n_steps=self.k,
        save_every_N=100,
    )
    self.result = result[-1]
    self.result_check = result_check[-1]
    self.grad_A = grad(self.result.sum(), self.A)
    self.grad_A_check = grad(self.result_check.sum(), self.A)
def test_disconnected_cost_grad():
    # Tests that if we say the cost is disconnected via the
    # known_grads mechanism, it is treated as such by the rest of the
    # system.
    # This is so that Ops that are built around mini-graphs, like OpFromGraph
    # and scan, can implement Op.grad by passing ograds to known_grads.

    x = iscalar()
    y = iscalar()
    cost = x + y
    assert cost.dtype in discrete_dtypes

    try:
        grad(
            cost,
            [x, y],
            known_grads={cost: DisconnectedType()()},
            disconnected_inputs="raise",
        )
    except DisconnectedInputError:
        return
    raise AssertionError("A disconnected gradient has been ignored.")
def setup_gpu_op(self, activations, labels, input_length, compute_grad=True):
    gpu_ctc_cost = gpu_ctc(activations, labels, input_length)
    outputs = [gpu_ctc_cost]
    if compute_grad:
        # Symbolic gradient of CTC cost
        gpu_ctc_grad = grad(mean(gpu_ctc_cost), activations)
        outputs += [gpu_ctc_grad]
    return aesara.function([], outputs, mode=mode_with_gpu)
def test_gradient_scan():
    # Test for a crash when using MRG inside scan and taking the gradient
    # See https://groups.google.com/d/msg/theano-dev/UbcYyU5m-M8/UO9UgXqnQP0J
    aesara_rng = MRG_RandomStream(10)
    w = shared(np.ones(1, dtype="float32"))

    def one_step(x):
        return x + aesara_rng.uniform((1,), dtype="float32") * w

    x = vector(dtype="float32")
    values, updates = scan(one_step, outputs_info=x, n_steps=10)

    gw = grad(aet_sum(values[-1]), w)
    f = function([x], gw)
    f(np.arange(1, dtype="float32"))
def test_observed():
    rv_var = normal(0, 1, size=3)
    obs_var = observed(rv_var, np.array([0.2, 0.1, -2.4], dtype=config.floatX))

    assert obs_var.owner.inputs[0] is rv_var

    with raises(TypeError):
        observed(rv_var, np.array([1, 2], dtype=int))

    with raises(TypeError):
        observed(rv_var, np.array([[1.0, 2.0]], dtype=rv_var.dtype))

    obs_rv = observed(None, np.array([0.2, 0.1, -2.4], dtype=config.floatX))

    assert isinstance(obs_rv.owner.inputs[0].type, NoneTypeT)

    rv_val = vector()
    rv_val.tag.test_value = np.array([0.2, 0.1, -2.4], dtype=config.floatX)

    obs_var = observed(rv_var, rv_val)

    with raises(NullTypeGradError):
        grad(obs_var.sum(), [rv_val])
def test_dxdx():
    # Tests that the gradient of a scalar with respect to itself is 1.
    # I use an integer in this case because people keep changing this
    # gradient to be 0 on integers but, according to our interpretation
    # of the gradient as defined in the Op contract, it should be 1. If
    # you feel the need to change this unit test you are probably
    # modifying the Op contract and should definitely get the approval
    # of multiple people on aesara-dev.

    x = iscalar()
    g = grad(x, x)

    g = g.eval({x: 12})

    assert np.allclose(g, 1.0)