def test_random_sample_with_replacement(weights, num_samples, tolerance,
                                        raises_exception, device_id, precision):
    # AA is the test-suite helper that converts to a NumPy array of the given precision.
    weights = AA(weights, precision)
    expected_relative_frequency = weights / np.sum(weights)
    num_calls = 10
    identity = np.identity(weights.size)
    allow_duplicates = True  # sample with replacement

    if raises_exception:
        with pytest.raises(ValueError):
            result = random_sample(weights, num_samples, allow_duplicates)
            result.eval()
    else:
        # Accumulate counts; must start from zero, not from uninitialized memory.
        observed_frequency = np.zeros_like(weights)
        for i in range(num_calls):
            result = random_sample(weights, num_samples, allow_duplicates)
            dense_result = times(result, identity)
            observed_frequency += np.sum(dense_result.eval(), 0)
        observed_relative_frequency = observed_frequency / (num_calls * num_samples)
        assert np.allclose(observed_relative_frequency,
                           expected_relative_frequency, atol=tolerance)
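# A NumPy-only sketch (illustrative, not part of the test suite) of the invariant the
# test above checks: when sampling with replacement, the relative frequency of class i
# converges to weights[i] / sum(weights). Plain np.random.choice stands in for CNTK's
# random_sample here; the array values are arbitrary.
import numpy as np

weights = np.array([1.0, 2.0, 3.0, 4.0])
probs = weights / weights.sum()
rs = np.random.RandomState(123)
draws = rs.choice(weights.size, size=100000, p=probs)
observed = np.bincount(draws, minlength=weights.size) / float(draws.size)
assert np.allclose(observed, probs, atol=1e-2)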
def test_random_sample_with_explicit_seed(device_id, precision):
    weights = AA([x for x in range(0, 10)], precision)
    identity = np.identity(weights.size)
    allow_duplicates = False  # sample without replacement
    num_samples = 5
    seed = 123
    to_dense = lambda x: times(x, identity).eval()
    result1 = to_dense(random_sample(weights, num_samples, allow_duplicates, seed))
    result2 = to_dense(random_sample(weights, num_samples, allow_duplicates, seed))
    result3 = to_dense(random_sample(weights, num_samples, allow_duplicates, seed + 1))
    result4 = to_dense(random_sample(weights, num_samples, allow_duplicates))
    # The same seed reproduces the sample; a different or auto-selected seed does not.
    assert np.allclose(result1, result2)
    assert not np.allclose(result1, result3)
    assert not np.allclose(result1, result4)
def test_random_sample_without_replacement(weights, num_samples, expected_count,
                                           tolerance, raises_exception, device_id,
                                           precision):
    weights = AA(weights, precision)
    identity = np.identity(weights.size)
    allow_duplicates = False  # sample without replacement

    if raises_exception:
        with pytest.raises(ValueError):
            result = random_sample(weights, num_samples, allow_duplicates)
            result.eval()
    else:
        result = random_sample(weights, num_samples, allow_duplicates)
        dense_result = times(result, identity)
        observed_count = np.sum(dense_result.eval(), 0)
        assert np.allclose(observed_count, expected_count, atol=tolerance)
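# A hedged sketch (illustrative, not part of the test suite): the expected_count
# values this test is parametrized with correspond to what CNTK's
# random_sample_inclusion_frequency op reports, i.e. the expected number of
# occurrences of each class. The uniform 10-class setup below is an assumption.
import numpy as np
from cntk import random_sample_inclusion_frequency

weights = np.ones(10, dtype=np.float32)
# Sampling 5 of 10 equally weighted classes without replacement includes each
# class with probability 5/10.
expected = random_sample_inclusion_frequency(weights, 5, False).eval()
assert np.allclose(expected, 0.5)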
def cross_entropy_with_sampled_softmax(
        hidden_vector,            # Node providing the output of the recurrent layers
        target_vector,            # Node providing the expected labels (as sparse vectors)
        vocab_dim,                # Vocabulary size
        hidden_dim,               # Dimension of the hidden vector
        num_samples,              # Number of samples to use for sampled softmax
        sampling_weights,         # Node providing weights to be used for the weighted sampling
        allow_duplicates=False,   # Boolean flag: sample with replacement (True) or without replacement (False)
        use_sparse=True           # Keep the sample selector sparse; the dense path is for debugging only
        ):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim),
                          init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(
        sampling_weights, num_samples, allow_duplicates)  # sparse matrix [num_samples x vocab_dim]
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation for all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(
        sampling_weights, num_samples, allow_duplicates)  # dense row [1 x vocab_dim]
    log_prior = C.log(inclusion_probs)  # dense row [1 x vocab_dim]

    wS = C.times(sample_selector, weights, name='wS')  # [num_samples x hidden_dim]
    zS = C.times_transpose(wS, hidden_vector, name='zS1') \
        + C.times(sample_selector, bias, name='zS2') \
        - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim.
    wT = C.times(target_vector, weights, name='wT')  # [1 x hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') \
        + C.times(target_vector, bias, name='zT2') \
        - C.times_transpose(target_vector, log_prior, name='zT3')  # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the
    # true label, so it might happen that the true class is counted twice in the
    # normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax.
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=vocab_dim)

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
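# A minimal, hypothetical wiring of the criterion above into a trainer. The dimensions,
# the uniform sampling weights, the input variables, and the learning rate below are
# all illustrative assumptions, not taken from the original example, which builds the
# hidden vector from its recurrent model.
import numpy as np
import cntk as C

vocab_dim, hidden_dim, num_samples = 100, 25, 8
# random_sample expects the sampling weights as a dense row of shape (1, vocab_dim).
sampling_weights = C.Constant(np.ones((1, vocab_dim), dtype=np.float32))
hidden = C.sequence.input_variable(hidden_dim)
labels = C.sequence.input_variable(vocab_dim, is_sparse=True)

z, ce, err = cross_entropy_with_sampled_softmax(
    hidden, labels, vocab_dim, hidden_dim, num_samples, sampling_weights)

learner = C.sgd(z.parameters, lr=C.learning_rate_schedule(0.1, C.UnitType.sample))
trainer = C.Trainer(z, (ce, err), [learner])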
def test_set_rng_seed_attribute():
    from cntk import random_sample, input

    random_sample_node = random_sample(input(1), 1, True, seed=123)
    key = 'rngSeed'

    root = random_sample_node.root_function
    assert root.attributes[key] == 123

    root.set_attribute(key, 11530328594546889191)
    assert root.attributes[key] == 11530328594546889191

    random_sample_node.set_attribute(key, 2**31)
    assert root.attributes[key] == 2**31
def test_nce_backward_indices(classes, xdim, batch, expected_value, device_id, precision):
    """
    Simple test that makes sure that the derivatives have the correct sparsity pattern
    """
    # ignore precision, only the sparsity pattern matters for this test
    dt = np.float32

    from cntk.losses import nce_loss
    import scipy.sparse

    trials = 10

    # Establish baseline
    expected_count = np.zeros(classes)
    I = C.constant(np.eye(classes, dtype=dt))
    q = np.arange(classes, dtype=dt) + 1
    z = C.reduce_sum(C.times(C.random_sample(q, 32, True, seed=98052), I), axis=0)
    for i in range(trials):
        expected_count[np.nonzero(z.eval().ravel())] += 1

    # Set things up to measure the same thing with nce_loss
    x = C.input_variable(xdim, needs_gradient=True)
    y = C.input_variable(classes, is_sparse=True)

    x0 = np.arange(batch * xdim, dtype=dt).reshape((batch, xdim)) / (batch * xdim)
    data = np.ones(batch, dtype=dt)
    indices = list(range(10, 10 * batch + 1, 10))
    indptr = list(range(batch + 1))
    # One-hot labels: row r has a single 1 at column 10 * (r + 1).
    y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))

    b = C.parameter((classes, 1))
    W = C.parameter((classes, C.InferredDimension))

    gb = np.zeros(classes)
    vb = C.input_variable((classes, 1), dtype=dt)
    Ib = C.constant(np.eye(1, dtype=dt))
    zb = C.times(vb, Ib)

    loss = C.nce_loss(W, b, x, y, q, seed=98052)
    for i in range(trials):
        v = loss.grad({x: x0, y: y0}, wrt=loss.parameters, as_numpy=False)
        gb[np.nonzero(zb.eval({vb: v[b]}).ravel())] += 1
    for i in range(classes):
        assert gb[i] == expected_count[i] or (i in indices and gb[i] == trials)
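# A minimal hedged sketch of calling nce_loss directly, mirroring the call shape used
# in the test above. The dimensions and the unigram-like noise distribution q are
# illustrative assumptions.
import numpy as np
import cntk as C

classes, xdim = 100, 10
q = np.arange(classes, dtype=np.float32) + 1  # noise distribution, e.g. unigram counts
x = C.input_variable(xdim)
y = C.input_variable(classes, is_sparse=True)
b = C.parameter((classes, 1))
W = C.parameter((classes, C.InferredDimension))  # input dim inferred on first use
loss = C.nce_loss(W, b, x, y, q, seed=98052)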
def cross_entropy_with_sampled_softmax(
        hidden_vector,
        label_vector,
        vocab_dim,
        hidden_dim,
        num_samples,
        sampling_weights,
        allow_duplicates=False
        ):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim),
                          init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates)
    sample_selector = sample_selector_sparse

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples,
                                                          allow_duplicates)
    log_prior = C.log(inclusion_probs)

    wS = C.times(sample_selector, weights, name='wS')
    zS = C.times_transpose(wS, hidden_vector, name='zS1') \
        + C.times(sample_selector, bias, name='zS2') \
        - C.times_transpose(sample_selector, log_prior, name='zS3')

    # Getting the weight vector for the true label. Dimension hidden_dim.
    wT = C.times(label_vector, weights, name='wT')
    zT = C.times_transpose(wT, hidden_vector, name='zT1') \
        + C.times(label_vector, bias, name='zT2') \
        - C.times_transpose(label_vector, log_prior, name='zT3')

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax.
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=vocab_dim)

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)