def test_xent_thing_int32(self):
    x = matrix("x")
    y = lvector("y")
    yi = aet.cast(y, "int32")
    expressions = [
        aet_sum(-log(softmax(x)[aet.arange(yi.shape[0]), yi])),
        -aet_sum(log(softmax(x)[aet.arange(yi.shape[0]), yi])),
        -aet_sum(log(softmax(x))[aet.arange(yi.shape[0]), yi]),
        aet_sum(-log(softmax(x))[aet.arange(yi.shape[0]), yi]),
    ]

    for expr in expressions:
        fgraph = FunctionGraph([x, y], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 5
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

        # Also verify the gradient wrt x
        fgraph = FunctionGraph([x, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 3
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops
def __init__(self, input, n_in, n_out, name_prefix=""):
    """Initialize the parameters of the logistic regression.

    :type input: TensorType
    :param input: symbolic variable that describes the input of the
                  architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie
    """
    # initialize the weights W as a zero matrix of shape (n_in, n_out)
    self.W = aesara.shared(
        value=np.zeros((n_in, n_out), dtype=aesara.config.floatX),
        name=name_prefix + "W",
    )

    # compute the vector of class-membership probabilities in symbolic form
    self.p_y_given_x = softmax(dot(input, self.W))

    # compute the prediction as the class whose probability is maximal,
    # in symbolic form
    self.y_pred = argmax(self.p_y_given_x, axis=1)

    # parameters of the model
    self.params = [self.W]
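# A minimal usage sketch, not part of the original file: assuming the
# constructor above belongs to a class named `LogisticRegression` (the class
# name and the shapes below are illustrative assumptions), this shows how its
# symbolic attributes are typically turned into a cost and a predictor.
def _example_logreg_usage():
    import aesara
    import aesara.tensor as aet

    x = aet.matrix("x")   # minibatch of inputs, shape (batch, n_in)
    y = aet.lvector("y")  # integer class labels, shape (batch,)
    clf = LogisticRegression(input=x, n_in=784, n_out=10)  # hypothetical class name

    # mean negative log-likelihood of the correct classes
    nll = -aet.mean(aet.log(clf.p_y_given_x)[aet.arange(y.shape[0]), y])

    cost_fn = aesara.function([x, y], nll)
    predict_fn = aesara.function([x], clf.y_pred)
    return cost_fn, predict_fn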
def test_softmax_with_bias_trace(self):
    rng = np.random.default_rng(utt.fetch_seed())
    a = aesara.shared(rng.standard_normal((3,)).astype(config.floatX))
    b = aesara.shared(np.float32(rng.standard_normal()))
    sm = softmax(a + b)
    f = aesara.function([], sm)

    assert check_stack_trace(f, ops_to_check="last")
def test_vector_perform(self):
    x = vector()
    f = aesara.function([x], softmax(x, axis=None))

    rng = np.random.default_rng(utt.fetch_seed())
    xv = rng.standard_normal((6,)).astype(config.floatX)
    assert np.allclose(f(xv), sp.softmax(xv))
def test_perform(self, axis):
    x = tensor4("x")
    rng = np.random.default_rng(utt.fetch_seed())
    xv = rng.standard_normal((2, 3, 4, 5)).astype(config.floatX)

    f = aesara.function([x], softmax(x, axis=axis))
    assert np.allclose(f(xv), sp.softmax(xv, axis=axis))
def test_local_logsoftmax_opt(self, axis):
    # Test the LogSoftmax substitution
    #
    # Check that Log(Softmax(x)) is substituted with LogSoftmax(x). Note that
    # only the forward pass is checked (i.e., the gradient is not checked).
    x = matrix("x")
    sm = softmax(x, axis=axis)
    logsm = log(sm)
    f = aesara.function([x], logsm)

    assert isinstance(f.maker.fgraph.outputs[0].owner.op, LogSoftmax)
    assert check_stack_trace(f, ops_to_check=LogSoftmax)
def test_stabilize_log_softmax():
    mode = aesara.compile.mode.get_default_mode()
    mode = mode.including("local_log_softmax", "specialize")

    x = matrix()
    y = softmax(x)
    z = log(y)

    f = aesara.function([x], z, mode=mode)
    assert check_stack_trace(f, ops_to_check="all")

    # check that the softmax has been optimized out
    for node in f.maker.fgraph.toposort():
        assert not isinstance(node.op, y.owner.op.__class__)

    # call the function so debug mode can verify that the optimized
    # version matches the unoptimized version
    rng = np.random.default_rng([2012, 8, 22])
    f(np.cast[config.floatX](rng.random((2, 3))))
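# Background sketch in plain NumPy (an illustration added here, not an Aesara
# test from this file): why the log(softmax(x)) -> LogSoftmax(x) rewrite
# exercised above matters numerically. With a large logit, exp() overflows and
# the naive form produces -inf/nan, while the max-shifted form stays finite.
def _example_logsoftmax_stability():
    import numpy as np

    x = np.array([0.0, 1000.0])  # the large logit overflows exp()
    with np.errstate(over="ignore", invalid="ignore"):
        naive = np.log(np.exp(x) / np.exp(x).sum())  # -> [-inf, nan]
    shifted = x - x.max()
    stable = shifted - np.log(np.exp(shifted).sum())  # -> [-1000., 0.]
    return naive, stable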
def test_asymptotic_32():
    # This test makes sure that our functions behave sensibly when
    # huge values are present

    # TODO: consider adding the optimization of crossentropy into the current
    # mode for the purpose of running this test

    for dtype in "float32", "float64":
        if dtype == "float32":
            x = fmatrix()
            x2 = fvector()
        else:
            x = dmatrix()
            x2 = dvector()
        y = lvector()

        c = categorical_crossentropy(softmax(x + x2), y)
        f = aesara.function([x, y, x2], [c.sum(), grad(c.sum(), x)], mode="FAST_RUN")

        xval = np.zeros((5, 5), dtype=dtype).astype(dtype)
        x2val = np.zeros(5, dtype=xval.dtype).astype(dtype)
        for i in range(100):
            cval, gxval = f(xval, np.arange(5), x2val)
            xval -= 100.3 * gxval

        assert cval == 0  # no problem going to zero error

        # what about when x gets really big?
        xval = np.zeros((5, 5), dtype=dtype)
        x2val = np.zeros(5, dtype=xval.dtype)
        for i in range(100):
            cval, gxval = f(xval, np.arange(5), x2val)
            xval += 100000.3 * gxval

        assert cval > 61750000
        assert gxval[0, 0] == -1.0
        assert gxval[0, 1] == 0.25
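# Side-note sketch (plain NumPy, an illustration added here, not original test
# code): the gradient values asserted above follow from
# d/dx sum(CE(softmax(x), y)) = softmax(x) - onehot(y). Once the ascent loop
# has pushed all probability mass off the target class, the target column
# gives -1.0 and the remaining 4 of the 5 columns share the mass at 0.25 each.
def _example_ce_gradient_row():
    import numpy as np

    p = np.array([0.0, 0.25, 0.25, 0.25, 0.25])   # softmax row after divergence
    onehot = np.array([1.0, 0.0, 0.0, 0.0, 0.0])  # target class 0
    return p - onehot  # -> [-1.0, 0.25, 0.25, 0.25, 0.25]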
def test_logsoftmax_grad_true_div_elemwise(self):
    # Checks that the gradient of an expression similar to a log(softmax)
    # but with a different elemwise operation than true_div is not
    # optimized.
    x = matrix("x")
    y = log(softmax(x))
    g = grad(y.sum(), x)

    softmax_grad_node = g.owner
    assert softmax_grad_node.op == softmax_grad_legacy
    true_div_node = softmax_grad_node.inputs[0].owner
    assert true_div_node.op == true_div

    # We replace the elemwise true_div op by an elemwise add.
    new_g = softmax_grad_legacy(
        add(*true_div_node.inputs), softmax_grad_node.inputs[1]
    )

    fgraph = FunctionGraph([x], [new_g])
    optdb.query(OPT_FAST_RUN).optimize(fgraph)

    assert softmax_grad_legacy in [n.op for n in fgraph.toposort()]
def test_matrix_perform_and_opt(self):
    m = config.mode
    m = aesara.compile.get_mode(m)
    m.check_isfinite = False
    x, y = matrices("xy")

    # regular softmax and crossentropy
    sm = softmax(x)
    cm = categorical_crossentropy(sm, y)

    # numerically stable log-softmax with crossentropy
    logsm = logsoftmax(x)
    sm2 = exp(logsm)  # just used to show equivalence with sm
    cm2 = -aet_sum(y * logsm, axis=1)
    grad_node = grad(cm2.mean(), x)

    # create some inputs into a softmax that are large, and labels
    a = np.exp(10 * np.random.random((5, 10)).astype(config.floatX))
    # create some one-hot coded labels
    b = np.eye(5, 10).astype(config.floatX)

    # show equivalence of softmax and exponentiated numerically stable
    # log-softmax
    f1 = aesara.function([x], [sm, sm2])
    sm_, sm2_ = f1(a)
    utt.assert_allclose(sm_, sm2_)

    # now show that the two versions result in the same crossentropy cost;
    # this indicates that the forward function does provide some numerical
    # stability
    f2 = aesara.function([x, y], [cm, cm2], mode=m)
    cm_, cm2_ = f2(a, b)
    utt.assert_allclose(cm_, cm2_)

    # now, show that in the standard softmax case the gradients blow up
    # while in the log-softmax case they don't
    f3 = aesara.function([x, y], [grad_node])
    grad_ = f3(a, b)
    assert not np.any(np.isnan(grad_))
def test_softmax_with_bias_trace(self):
    a = aesara.shared(np.random.randn(3).astype(config.floatX))
    b = aesara.shared(np.float32(np.random.randn()))
    sm = softmax(a + b)
    f = aesara.function([], sm)

    assert check_stack_trace(f, ops_to_check="last")
def test_crossentropy_softmax_1hot_with_bias_dxcale_cost(self):
    x = matrix("x")
    y = lvector("y")
    a = scalar("a")

    def validate_grad_graph(func):
        # The graph of the gradient should not have softmax_grad anymore
        has_cx1hotdx = False
        has_softmax = False
        has_softmaxdx = False
        for node in func.maker.fgraph.toposort():
            if node.op == crossentropy_softmax_1hot_with_bias_dx:
                has_cx1hotdx = True
            if node.op == softmax_legacy:
                has_softmax = True
            if node.op == softmax_grad_legacy:
                has_softmaxdx = True

        assert has_cx1hotdx
        assert has_softmax
        assert not has_softmaxdx

    # Cases to test
    expressions = [
        a * aet_sum(-log(softmax(x)[aet.arange(y.shape[0]), y])),
        -a * aet_sum(log(softmax(x)[aet.arange(y.shape[0]), y])),
        a * (-aet_sum(log(softmax(x)[aet.arange(y.shape[0]), y]))),
        a * aet_sum(log(softmax(x)[aet.arange(y.shape[0]), y])),
        a * aet_sum(-log(softmax(x))[aet.arange(y.shape[0]), y]),
        -a * aet_sum(log(softmax(x))[aet.arange(y.shape[0]), y]),
        a * (-aet_sum(log(softmax(x))[aet.arange(y.shape[0]), y])),
        a * aet_sum(log(softmax(x))[aet.arange(y.shape[0]), y]),
        a * mean(-log(softmax(x)[aet.arange(y.shape[0]), y])),
        -a * mean(log(softmax(x)[aet.arange(y.shape[0]), y])),
        a * (-mean(log(softmax(x)[aet.arange(y.shape[0]), y]))),
        a * mean(log(softmax(x)[aet.arange(y.shape[0]), y])),
        a * mean(-log(softmax(x))[aet.arange(y.shape[0]), y]),
        -a * mean(log(softmax(x))[aet.arange(y.shape[0]), y]),
        a * (-mean(log(softmax(x))[aet.arange(y.shape[0]), y])),
        a * mean(log(softmax(x))[aet.arange(y.shape[0]), y]),
    ]

    for expr in expressions:
        fgraph = FunctionGraph([x, y, a], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        assert 5 <= len(fgraph.toposort()) <= 10
        ops = {node.op for node in fgraph.toposort()}
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert softmax_legacy not in ops

        # Verify the gradient wrt x
        fgraph = FunctionGraph([x, y, a], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        assert 3 <= len(fgraph.toposort()) <= 6
        ops = {node.op for node in fgraph.toposort()}
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops

        # Verify the gradient when providing an output gradient
        fgraph = FunctionGraph(
            [x, y, a], [grad(expr, x, known_grads={expr: a * x.sum()})]
        )
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        assert 6 <= len(fgraph.toposort()) <= 8
        ops = {node.op for node in fgraph.toposort()}
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops
def test_perform(self, axis):
    x = tensor4("x")
    xv = np.random.randn(2, 3, 4, 5).astype(config.floatX)

    f = aesara.function([x], softmax(x, axis=axis))
    assert np.allclose(f(xv), sp.softmax(xv, axis=axis))
def test_get_rid_of_advanced_indexing_version_of_xent(self):
    x = matrix("x")
    b = vector("b")
    y = lvector("y")

    # Basic case
    expressions = [
        aet_sum(-log(softmax(x)[aet.arange(y.shape[0]), y])),
        -aet_sum(log(softmax(x)[aet.arange(y.shape[0]), y])),
        -aet_sum(log(softmax(x))[aet.arange(y.shape[0]), y]),
        aet_sum(-log(softmax(x))[aet.arange(y.shape[0]), y]),
    ]
    for expr in expressions:
        fgraph = FunctionGraph([x, y], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 4
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

        # Also verify the gradient wrt x
        fgraph = FunctionGraph([x, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 2
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops

    # Test that a biased softmax is optimized correctly
    bias_expressions = [
        aet_sum(-log(softmax(x + b)[aet.arange(y.shape[0]), y])),
        -aet_sum(log(softmax(b + x)[aet.arange(y.shape[0]), y])),
        -aet_sum(log(softmax(x + b))[aet.arange(y.shape[0]), y]),
        aet_sum(-log(softmax(b + x))[aet.arange(y.shape[0]), y]),
    ]
    for expr in bias_expressions:
        fgraph = FunctionGraph([x, b, y], [expr, x])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 2  # [big_op, sum]
        assert crossentropy_softmax_argmax_1hot_with_bias in ops

        fgraph = FunctionGraph([x, b, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 2
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_with_bias in ops
        assert softmax_grad_legacy not in ops

    # Test that using "mean" instead of "sum" works, too
    mean_expressions = [
        mean(-log(softmax(x)[aet.arange(y.shape[0]), y])),
        -mean(log(softmax(x)[aet.arange(y.shape[0]), y])),
        -mean(log(softmax(x))[aet.arange(y.shape[0]), y]),
        mean(-log(softmax(x))[aet.arange(y.shape[0]), y]),
    ]
    for expr in mean_expressions:
        fgraph = FunctionGraph([x, y], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 6
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

        fgraph = FunctionGraph([x, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        # there's an extra dimshuffle in there,
        # but I can't think of a good rule to get rid of it
        assert len(ops) == 5
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_legacy in ops
        assert softmax_grad_legacy not in ops

    mean_bias_expressions = [
        mean(-log(softmax(x + b)[aet.arange(y.shape[0]), y])),
        -mean(log(softmax(b + x)[aet.arange(y.shape[0]), y])),
        -mean(log(softmax(x + b))[aet.arange(y.shape[0]), y]),
        mean(-log(softmax(b + x))[aet.arange(y.shape[0]), y]),
    ]
    for expr in mean_bias_expressions:
        fgraph = FunctionGraph([x, b, y], [expr])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 4
        assert crossentropy_softmax_argmax_1hot_with_bias in ops
        assert not [1 for o in ops if isinstance(o, AdvancedSubtensor)]

        fgraph = FunctionGraph([x, b, y], [grad(expr, x)])
        optdb.query(OPT_FAST_RUN).optimize(fgraph)

        ops = [node.op for node in fgraph.toposort()]
        assert len(ops) == 5
        assert crossentropy_softmax_1hot_with_bias_dx in ops
        assert softmax_with_bias in ops
        assert softmax_grad_legacy not in ops
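# Reference sketch in plain NumPy (an illustration added here with assumed
# semantics, not part of the original tests): the per-row quantity that the
# advanced-indexing expressions above build, and which the fused
# crossentropy/softmax op these tests look for is expected to reproduce.
def _example_xent_reference(x, b, y):
    import numpy as np

    z = x + b                                 # biased logits, shape (n, k)
    z = z - z.max(axis=1, keepdims=True)      # shift for numerical stability
    log_sm = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    nll = -log_sm[np.arange(y.shape[0]), y]   # cross-entropy per row
    return nll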
def myfunc(x):
    sm = softmax(x, axis=axis)
    logsm = log(sm)
    return logsm
def f(a):
    return softmax(a, axis=axis)[:, column]
def test_vector_perform(self):
    x = vector()
    f = aesara.function([x], softmax(x, axis=None))

    xv = np.random.randn(6).astype(config.floatX)
    assert np.allclose(f(xv), sp.softmax(xv))
def f(a):
    return softmax(a, axis=None)