def test_to_sequence_basic(device_id):
    """Exercise ``C.to_sequence`` on a dense input and on a sparse input.

    The dense case checks that the converted node has two dynamic axes and
    evaluates back to the data that was fed in.  The sparse case also feeds
    per-sample sequence lengths (2 and 1) and checks that the first sequence
    comes back whole while the second comes back as a single step.
    """
    dev = cntk_device(device_id)

    # --- dense input, no explicit sequence lengths -----------------------
    dense_input = C.input_variable((C.FreeDimension, 2))
    dense_as_seq = C.to_sequence(dense_input)
    assert len(dense_as_seq.dynamic_axes) == 2

    dense_batch = np.asarray(
        [[[1, 2], [-1000, -1000]], [[3, 4], [5, 6]]],
        dtype=np.float32)
    dense_result = dense_as_seq.eval({dense_input: dense_batch}, device=dev)
    assert np.array_equal(dense_result, dense_batch)

    # --- sparse input with explicit sequence lengths ---------------------
    sparse_input = C.input_variable((C.FreeDimension, 2, 3), is_sparse=True)
    lengths_input = C.input_variable(())
    sparse_as_seq = C.to_sequence(sparse_input, lengths_input)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    zero_step = [[0, 0, 0], [0, 0, 0]]

    view_shape = (2, 2, 3)
    view1 = C.NDArrayView.from_csr(
        _to_csr(seq1_data), shape=view_shape, device=C.cpu())
    # The second sample is padded with an all-zero step up to two steps.
    view2 = C.NDArrayView.from_csr(
        _to_csr([seq2_data, zero_step]), shape=view_shape, device=C.cpu())

    packed = C.Value.create(
        C.input_variable((2, 2, 3), is_sparse=True),
        [view1, view2],
        device=dev)
    sparse_batch = packed.data
    lengths = np.asarray([2, 1], dtype=np.float32)

    sparse_result = sparse_as_seq.eval(
        {sparse_input: sparse_batch, lengths_input: lengths},
        device=dev,
        as_numpy=False)
    dense_view = _to_dense(sparse_result, True)
    assert np.array_equal(dense_view[0], seq1_data)
    assert np.array_equal(dense_view[1], [seq2_data])
def test_sequence_unpack_backprop(device_id):
    """Compare two ways of summing a sequence model's output over time.

    The first loss is built from ``sequence.last(Recurrence(plus)(model))``
    and the second from ``sequence.reduce_sum(model)``.  The test asserts
    that both produce close loss values and close gradients for every
    parameter whose gradient is not sparse.
    """
    dev = cntk_device(device_id)

    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2

    features = C.sequence.input_variable(
        input_vocab_size, is_sparse=True, name='features')
    labels = C.input_variable(num_labels, is_sparse=True, name='labels')

    with C.default_options(initial_state=0.1):
        embedded = C.layers.Embedding(emb_dim, name='embed')(features)
        recurrent = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(embedded)
        model = C.layers.Dense(num_labels, name='classify')(recurrent)

    seq1_data = [[0, 1, 1], [0, 1, 0], [1, 0, 0]]
    seq2_data = [[0, 0, 1], [0, 1, 1]]

    # Variant 1: running sum through a recurrence, then take the last step.
    summed_via_recurrence = C.sequence.last(C.layers.Recurrence(C.plus)(model))
    loss_1 = C.cross_entropy_with_softmax(summed_via_recurrence, labels)
    label_data = _to_csr([[0, 1], [1, 0]])
    param_grads_1, loss_result_1 = loss_1.grad(
        {
            features: [_to_csr(seq1_data), _to_csr(seq2_data)],
            labels: label_data,
        },
        wrt=loss_1.parameters,
        outputs=[loss_1],
        as_numpy=False)

    # Variant 2: sum over the sequence axis directly.
    summed_via_reduce = C.sequence.reduce_sum(model)
    loss_2 = C.cross_entropy_with_softmax(summed_via_reduce, labels)
    param_grads_2, loss_result_2 = loss_2.grad(
        {
            features: [_to_csr(seq1_data), _to_csr(seq2_data)],
            labels: label_data,
        },
        wrt=loss_2.parameters,
        outputs=[loss_2],
        as_numpy=False)

    assert np.allclose(loss_result_1.asarray(), loss_result_2.asarray())

    for param in param_grads_1:
        # Sparse gradients are skipped, exactly as in the dense-only check.
        if param_grads_1[param].is_sparse:
            continue
        expected_grad = param_grads_1[param].asarray()
        actual_grad = param_grads_2[param].asarray()
        assert np.allclose(expected_grad, actual_grad)
def test_sequence_unpack_backprop(device_id):
    """Check that two sequence-summing formulations backprop alike.

    One loss uses ``sequence.last`` over a ``Recurrence(plus)`` of the model
    output, the other uses ``sequence.reduce_sum``.  Loss values and all
    non-sparse parameter gradients are asserted to be close.
    """
    dev = cntk_device(device_id)

    input_vocab_size, emb_dim, hidden_dim, num_labels = 3, 2, 2, 2

    seq_features = C.sequence.input_variable(
        input_vocab_size, is_sparse=True, name='features')
    target = C.input_variable(num_labels, is_sparse=True, name='labels')

    with C.default_options(initial_state=0.1):
        net = C.layers.Embedding(emb_dim, name='embed')(seq_features)
        net = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(net)
        net = C.layers.Dense(num_labels, name='classify')(net)

    first_seq = [[0, 1, 1], [0, 1, 0], [1, 0, 0]]
    second_seq = [[0, 0, 1], [0, 1, 1]]

    # Reference formulation: accumulate with a recurrence and keep the tail.
    reference_loss = C.cross_entropy_with_softmax(
        C.sequence.last(C.layers.Recurrence(C.plus)(net)), target)
    target_data = _to_csr([[0, 1], [1, 0]])
    reference_feed = {
        seq_features: [_to_csr(first_seq), _to_csr(second_seq)],
        target: target_data,
    }
    param_grads_1, loss_result_1 = reference_loss.grad(
        reference_feed,
        wrt=reference_loss.parameters,
        outputs=[reference_loss],
        as_numpy=False)

    # Alternative formulation: reduce over the sequence axis.
    reduced_loss = C.cross_entropy_with_softmax(
        C.sequence.reduce_sum(net), target)
    reduced_feed = {
        seq_features: [_to_csr(first_seq), _to_csr(second_seq)],
        target: target_data,
    }
    param_grads_2, loss_result_2 = reduced_loss.grad(
        reduced_feed,
        wrt=reduced_loss.parameters,
        outputs=[reduced_loss],
        as_numpy=False)

    assert np.allclose(loss_result_1.asarray(), loss_result_2.asarray())

    for param in param_grads_1:
        reference_grad = param_grads_1[param]
        if not reference_grad.is_sparse:
            # Only dense gradients are compared.
            assert np.allclose(
                reference_grad.asarray(), param_grads_2[param].asarray())
def test_to_sequence_basic(device_id):
    """Verify ``C.to_sequence`` for dense data and for length-masked sparse data.

    First a dense ``(FreeDimension, 2)`` input is converted and evaluated;
    the output must equal the input and carry two dynamic axes.  Then a
    sparse ``(FreeDimension, 2, 3)`` input is converted together with a
    lengths input fed as ``[2, 1]``; the densified result must contain both
    steps of the first sample and only the first step of the second.
    """
    dev = cntk_device(device_id)

    # Dense conversion.
    inp = C.input_variable((C.FreeDimension, 2))
    as_seq = C.to_sequence(inp)
    assert len(as_seq.dynamic_axes) == 2

    batch = np.asarray(
        [[[1, 2], [-1000, -1000]], [[3, 4], [5, 6]]], dtype=np.float32)
    evaluated = as_seq.eval({inp: batch}, device=dev)
    assert np.array_equal(evaluated, batch)

    # Sparse conversion with sequence lengths.
    inp = C.input_variable((C.FreeDimension, 2, 3), is_sparse=True)
    seq_lens = C.input_variable(())
    as_seq = C.to_sequence(inp, seq_lens)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    padded_seq2 = [seq2_data, [[0, 0, 0], [0, 0, 0]]]

    views = [
        C.NDArrayView.from_csr(
            _to_csr(seq1_data), shape=(2, 2, 3), device=C.cpu()),
        C.NDArrayView.from_csr(
            _to_csr(padded_seq2), shape=(2, 2, 3), device=C.cpu()),
    ]
    holder = C.input_variable((2, 2, 3), is_sparse=True)
    batch = C.Value.create(holder, views, device=dev).data
    seq_lens_data = np.asarray([2, 1], dtype=np.float32)

    feed = {inp: batch, seq_lens: seq_lens_data}
    evaluated = as_seq.eval(feed, device=dev, as_numpy=False)
    densified = _to_dense(evaluated, True)
    assert np.array_equal(densified[0], seq1_data)
    assert np.array_equal(densified[1], [seq2_data])
def test_2d_sparse_sequences_value(device_id):
    """Build a ``C.Value`` from two sparse views of different lengths.

    A two-step and a one-step CSR-backed ``NDArrayView`` are packed into a
    value for a ``(2, 3)`` sequence input.  The densified data is expected to
    hold the first sequence unchanged and the second one followed by an
    all-zero step.
    """
    dev = cntk_device(device_id)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]

    long_view = C.NDArrayView.from_csr(
        _to_csr(seq1_data), shape=(2, 2, 3), device=C.cpu())
    short_view = C.NDArrayView.from_csr(
        _to_csr(seq2_data), shape=(1, 2, 3), device=C.cpu())

    seq_input = C.sequence.input_variable((2, 3))
    packed = C.Value.create(seq_input, [long_view, short_view], device=dev)

    zero_step = [[0, 0, 0], [0, 0, 0]]
    expected = [seq1_data, [seq2_data, zero_step]]
    assert np.array_equal(_to_dense(packed.data), expected)
def test_to_sequence_backprop(device_id):
    """Compare gradients of a sequence model against a ``to_sequence`` clone.

    A small Embedding -> LSTM -> Dense model is differentiated once with a
    native sequence input, and once through a clone whose input is a
    non-sequence tensor converted with ``C.to_sequence``.  Per-sequence
    losses and all non-sparse parameter gradients must be exactly equal.
    """
    dev = cntk_device(device_id)

    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2

    x_seq_input = C.sequence.input_variable(
        input_vocab_size, is_sparse=True, name='features')
    with C.default_options(initial_state=0.1):
        embedded = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        recurrent = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(embedded)
        z = C.layers.Dense(num_labels, name='classify')(recurrent)

    label_seq_input = C.sequence.input_variable(
        num_labels, is_sparse=True, name='labels')
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[0, 1, 1], [0, 1, 0], [1, 0, 0]]
    seq2_data = [[0, 0, 1], [0, 1, 1]]
    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]

    reference_feed = {
        x_seq_input: [_to_csr(seq1_data), _to_csr(seq2_data)],
        label_seq_input: label_seq_data,
    }
    param_grads_1, loss_result_1 = ce.grad(
        reference_feed, wrt=ce.parameters, outputs=[ce], as_numpy=False)

    # Create a clone of the model that uses a non-sequence input
    # and converts it to a sequence using to_sequence.
    non_seq_placeholder = C.input_variable(
        (C.FreeDimension, input_vocab_size),
        is_sparse=True,
        name='non_seq_features')
    lengths_placeholder = C.input_variable((), name='sequence_lengths')
    converted = C.to_sequence(non_seq_placeholder, lengths_placeholder)
    identity = np.eye(input_vocab_size, dtype=np.float32)
    converted = C.reconcile_dynamic_axes(
        C.times(converted, identity), label_seq_input)
    ce_clone = ce.clone('share', {x_seq_input: converted})

    # The shorter second sample is padded with one all-zero row.
    padded_batch = [seq1_data, seq2_data + [[0, 0, 0]]]
    x_non_seq_data = C.NDArrayView.from_csr(
        _to_csr(padded_batch), shape=(2, 3, 3))
    x_seq_lens_data = np.asarray([3, 2], dtype=np.float32)

    def _clone_argument(wanted_name):
        # First argument of the clone carrying the requested name.
        return next(
            argument for argument in ce_clone.arguments
            if argument.name == wanted_name)

    clone_features = _clone_argument('non_seq_features')
    clone_labels = _clone_argument('labels')
    clone_lengths = _clone_argument('sequence_lengths')

    clone_feed = {
        clone_features: x_non_seq_data,
        clone_lengths: x_seq_lens_data,
        clone_labels: label_seq_data,
    }
    param_grads_2, loss_result_2 = ce_clone.grad(
        clone_feed,
        wrt=ce_clone.parameters,
        outputs=[ce_clone],
        as_numpy=False)

    for seq_index in (0, 1):
        assert np.array_equal(
            loss_result_1.as_sequences()[seq_index],
            loss_result_2.as_sequences()[seq_index])

    for param in param_grads_1:
        if param_grads_1[param].is_sparse:
            continue
        expected_grad = param_grads_1[param].asarray()
        actual_grad = param_grads_2[param].asarray()
        assert np.array_equal(expected_grad, actual_grad)
def test_to_sequence_backprop(device_id):
    """Check that routing input through ``C.to_sequence`` leaves backprop unchanged.

    The reference loss takes a sequence input directly.  The cloned loss
    shares parameters but receives a padded non-sequence batch plus sequence
    lengths, converted via ``C.to_sequence``.  Losses for both sequences and
    every non-sparse gradient are compared with ``np.array_equal``.
    """
    dev = cntk_device(device_id)

    input_vocab_size, emb_dim, hidden_dim, num_labels = 3, 2, 2, 2

    x_seq_input = C.sequence.input_variable(
        input_vocab_size, is_sparse=True, name='features')
    with C.default_options(initial_state=0.1):
        net = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        net = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(net)
        net = C.layers.Dense(num_labels, name='classify')(net)

    z = net
    label_seq_input = C.sequence.input_variable(
        num_labels, is_sparse=True, name='labels')
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[0, 1, 1], [0, 1, 0], [1, 0, 0]]
    seq2_data = [[0, 0, 1], [0, 1, 1]]
    label_seq_data = [
        _to_csr([[0, 1], [0, 1], [1, 0]]),
        _to_csr([[1, 0], [0, 1]]),
    ]

    param_grads_1, loss_result_1 = ce.grad(
        {
            x_seq_input: [_to_csr(seq1_data), _to_csr(seq2_data)],
            label_seq_input: label_seq_data,
        },
        wrt=ce.parameters,
        outputs=[ce],
        as_numpy=False)

    # Create a clone of the model that uses a non-sequence input
    # and converts it to a sequence using to_sequence.
    x_non_seq_input = C.input_variable(
        (C.FreeDimension, input_vocab_size),
        is_sparse=True,
        name='non_seq_features')
    x_seq_lens = C.input_variable((), name='sequence_lengths')
    x_seq = C.to_sequence(x_non_seq_input, x_seq_lens)
    x_seq = C.times(x_seq, np.eye(input_vocab_size, dtype=np.float32))
    x_seq = C.reconcile_dynamic_axes(x_seq, label_seq_input)
    ce_clone = ce.clone('share', {x_seq_input: x_seq})

    # Pad the two-step sample with a zero row so both samples have 3 rows.
    x_non_seq_data = C.NDArrayView.from_csr(
        _to_csr([seq1_data, seq2_data + [[0, 0, 0]]]), shape=(2, 3, 3))
    x_seq_lens_data = np.asarray([3, 2], dtype=np.float32)

    def _by_name(name):
        # Look the argument up on the clone rather than reusing the originals.
        return next(
            argument for argument in ce_clone.arguments
            if argument.name == name)

    x_non_seq_input = _by_name('non_seq_features')
    label_seq_input = _by_name('labels')
    x_seq_lens = _by_name('sequence_lengths')

    param_grads_2, loss_result_2 = ce_clone.grad(
        {
            x_non_seq_input: x_non_seq_data,
            x_seq_lens: x_seq_lens_data,
            label_seq_input: label_seq_data,
        },
        wrt=ce_clone.parameters,
        outputs=[ce_clone],
        as_numpy=False)

    assert np.array_equal(
        loss_result_1.as_sequences()[0], loss_result_2.as_sequences()[0])
    assert np.array_equal(
        loss_result_1.as_sequences()[1], loss_result_2.as_sequences()[1])

    for param in param_grads_1:
        reference = param_grads_1[param]
        if not reference.is_sparse:
            assert np.array_equal(
                reference.asarray(), param_grads_2[param].asarray())
def test_lstm_over_lstm_thought_vectors_2(device_id):
    """Run an LSTM-over-LSTM model fed with a flat batch of utterances.

    Utterances are embedded, run through an LSTM and reduced to their last
    state; ``UtteranceBatchReshape`` and ``C.to_sequence_like`` then produce
    the input for a second LSTM and a Dense classifier.  The per-step loss of
    the three conversations is compared against stored reference values with
    an absolute tolerance of 0.01.
    """
    dev = cntk_device(device_id)

    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2

    utterances_input = C.sequence.input_variable(
        (input_vocab_size), is_sparse=True, name='utterances')
    conversation_lengths_input = C.input_variable(
        (), name='conversation_sequence_lengths')
    label_input = C.sequence.input_variable(
        num_labels,
        is_sparse=True,
        sequence_axis=C.Axis('label_sequence'),
        name='labels')

    with C.default_options(initial_state=0.1):
        net = C.layers.Embedding(emb_dim, name='embed')(utterances_input)
        net = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(net)
        net = C.sequence.last(net)
        net = C.user_function(
            UtteranceBatchReshape(net, conversation_lengths_input))
        net = C.to_sequence_like(net, label_input)
        net = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(net)
        net = C.layers.Dense(num_labels, name='classify')(net)

    ce = C.cross_entropy_with_softmax(net, label_input)

    def _utterance(rows):
        # CPU-resident sparse view for one utterance given as dense rows.
        return C.NDArrayView.from_csr(_to_csr(rows), device=C.cpu())

    sentinel = _utterance([[0, 0, 1]])
    c1_utt1 = _utterance([[0, 1, 1], [0, 1, 0], [1, 0, 0]])
    c1_utt2 = _utterance([[0, 1, 0], [0, 1, 1]])
    c1_utt3 = _utterance([[0, 1, 1], [0, 1, 0]])
    c2_utt1 = _utterance([[0, 1, 1]])
    c3_utt1 = _utterance([[0, 1, 0], [0, 1, 1], [1, 0, 0]])
    c3_utt2 = _utterance([[0, 1, 0]])

    # Three slots per conversation; unused slots hold the sentinel utterance.
    flat_utterances = [
        c1_utt1, c1_utt2, c1_utt3,
        c2_utt1, sentinel, sentinel,
        c3_utt1, c3_utt2, sentinel,
    ]
    all_utt_data = C.Value.create(
        C.sequence.input_variable((input_vocab_size), is_sparse=True),
        flat_utterances,
        device=C.cpu()).data
    conversation_lengths_data = np.asarray([3, 1, 2], dtype=np.float32)

    label_data = [
        _to_csr([[0, 1], [0, 1], [1, 0]]),
        _to_csr([[1, 0]]),
        _to_csr([[1, 0], [0, 1]]),
    ]

    feed = {
        utterances_input: all_utt_data,
        label_input: label_data,
        conversation_lengths_input: conversation_lengths_data,
    }
    param_grads, loss_value = ce.grad(
        feed, wrt=ce.parameters, outputs=[ce], as_numpy=False)
    loss_result = loss_value.as_sequences()

    absolute_tolerance = 0.01
    expected_losses = [
        [[0.678914], [0.668076], [0.728129]],
        [[0.679029]],
        [[0.705393], [0.674243]],
    ]
    for index, expected in enumerate(expected_losses):
        assert np.allclose(
            loss_result[index], expected, atol=absolute_tolerance)
def test_lstm_over_lstm_thought_vectors_2(device_id):
    """Check losses of a two-level LSTM over a padded utterance batch.

    Nine utterance slots (three per conversation, padded with a sentinel
    utterance) are fed together with conversation lengths ``[3, 1, 2]``.
    The resulting per-step losses for each conversation are compared with
    hard-coded reference values using ``atol=0.01``.
    """
    dev = cntk_device(device_id)

    input_vocab_size, emb_dim, hidden_dim, num_labels = 3, 2, 2, 2

    utterances_input = C.sequence.input_variable(
        (input_vocab_size), is_sparse=True, name='utterances')
    conversation_lengths_input = C.input_variable(
        (), name='conversation_sequence_lengths')
    label_input = C.sequence.input_variable(
        num_labels,
        is_sparse=True,
        sequence_axis=C.Axis('label_sequence'),
        name='labels')

    with C.default_options(initial_state=0.1):
        # Inner LSTM: one vector per utterance.
        inner = C.layers.Embedding(emb_dim, name='embed')(utterances_input)
        inner = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(inner)
        utterance_vectors = C.sequence.last(inner)
        # Regroup utterance vectors and lay them out along the label axis.
        regrouped = C.user_function(
            UtteranceBatchReshape(
                utterance_vectors, conversation_lengths_input))
        outer = C.to_sequence_like(regrouped, label_input)
        # Outer LSTM and classifier.
        outer = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(outer)
        z = C.layers.Dense(num_labels, name='classify')(outer)

    ce = C.cross_entropy_with_softmax(z, label_input)

    cpu = C.cpu()
    sentinel_utt_data = C.NDArrayView.from_csr(
        _to_csr([[0, 0, 1]]), device=cpu)
    conversation_1 = [
        C.NDArrayView.from_csr(
            _to_csr([[0, 1, 1], [0, 1, 0], [1, 0, 0]]), device=cpu),
        C.NDArrayView.from_csr(_to_csr([[0, 1, 0], [0, 1, 1]]), device=cpu),
        C.NDArrayView.from_csr(_to_csr([[0, 1, 1], [0, 1, 0]]), device=cpu),
    ]
    conversation_2 = [
        C.NDArrayView.from_csr(_to_csr([[0, 1, 1]]), device=cpu),
        sentinel_utt_data,
        sentinel_utt_data,
    ]
    conversation_3 = [
        C.NDArrayView.from_csr(
            _to_csr([[0, 1, 0], [0, 1, 1], [1, 0, 0]]), device=cpu),
        C.NDArrayView.from_csr(_to_csr([[0, 1, 0]]), device=cpu),
        sentinel_utt_data,
    ]

    all_utt_data = C.Value.create(
        C.sequence.input_variable((input_vocab_size), is_sparse=True),
        conversation_1 + conversation_2 + conversation_3,
        device=C.cpu()).data
    conversation_lengths_data = np.asarray([3, 1, 2], dtype=np.float32)

    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0]]
    seq3_label_data = [[1, 0], [0, 1]]
    label_data = [
        _to_csr(labels)
        for labels in (seq1_label_data, seq2_label_data, seq3_label_data)
    ]

    param_grads, loss_result = ce.grad(
        {
            utterances_input: all_utt_data,
            label_input: label_data,
            conversation_lengths_input: conversation_lengths_data,
        },
        wrt=ce.parameters,
        outputs=[ce],
        as_numpy=False)
    loss_result = loss_result.as_sequences()

    absolute_tolerance = 0.01
    assert np.allclose(
        loss_result[0],
        [[0.678914], [0.668076], [0.728129]],
        atol=absolute_tolerance)
    assert np.allclose(
        loss_result[1], [[0.679029]], atol=absolute_tolerance)
    assert np.allclose(
        loss_result[2],
        [[0.705393], [0.674243]],
        atol=absolute_tolerance)
def test_ndarrayview_from_csr(device_id):
    """Check ``NDArrayView.from_csr`` round-trips data and rejects bad shapes.

    A 2x2x3 dense array is converted to CSR, wrapped with the matching shape
    and densified again; the result must equal the original data.  Two
    mismatching shapes are then each expected to raise ``ValueError``.
    """
    dev = cntk_device(device_id)

    data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    csr_data = _to_csr(data)

    view = C.NDArrayView.from_csr(csr_data, shape=(2, 2, 3))
    assert np.array_equal(_to_dense(view), data)

    # Shapes that do not match the CSR data must be rejected.
    for mismatched_shape in ((3, 2, 3), (2, 2, 4)):
        with pytest.raises(ValueError):
            C.NDArrayView.from_csr(csr_data, shape=mismatched_shape)
def test_lstm_over_lstm_thought_vectors(device_id):
    """Run an LSTM-over-LSTM model on a sequence input with a free dimension.

    The input is a sparse sequence of ``(FreeDimension, vocab)`` items.  It
    is embedded, run through an LSTM, reduced with ``sequence.last``, laid
    out like the label sequence via ``C.to_sequence_like`` and passed through
    a second LSTM and a Dense classifier.  The per-step losses of the two
    samples are compared against stored reference values with ``atol=0.02``.
    """
    dev = cntk_device(device_id)

    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2

    features = C.sequence.input_variable(
        (C.FreeDimension, input_vocab_size), is_sparse=True, name='features')
    labels = C.sequence.input_variable(
        num_labels,
        is_sparse=True,
        sequence_axis=C.Axis('label_sequence'),
        name='labels')

    with C.default_options(initial_state=0.1):
        net = C.layers.Embedding(emb_dim, name='embed')(features)
        net = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(net)
        net = C.sequence.last(net)
        net = C.to_sequence_like(net, labels)
        net = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(net)
        net = C.layers.Dense(num_labels, name='classify')(net)

    ce = C.cross_entropy_with_softmax(net, labels)

    seq1_data = [
        [[0, 1, 1], [0, 1, 0], [1, 0, 0]],
        [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
        [[1, 0, 0], [0, 0, 1], [1, 1, 0]],
    ]
    seq2_data = [
        [[0, 0, 1], [0, 1, 1], [1, 0, 1]],
        [[0, 1, 0], [1, 0, 1], [0, 0, 0]],
    ]
    view1 = C.NDArrayView.from_csr(
        _to_csr(seq1_data), shape=(3, 3, 3), device=C.cpu())
    view2 = C.NDArrayView.from_csr(
        _to_csr(seq2_data), shape=(2, 3, 3), device=C.cpu())

    feature_data = C.Value.create(
        C.sequence.input_variable((3, 3), is_sparse=True),
        [view1, view2],
        device=C.cpu()).data

    label_data = [
        _to_csr([[0, 1], [0, 1], [1, 0]]),
        _to_csr([[1, 0], [0, 1]]),
    ]

    param_grads, loss_value = ce.grad(
        {features: feature_data, labels: label_data},
        wrt=ce.parameters,
        outputs=[ce],
        as_numpy=False)
    loss_result = loss_value.as_sequences()

    absolute_tolerance = 0.02
    expected_losses = [
        [[0.67126], [0.676331], [0.765814]],
        [[0.685199], [0.681736]],
    ]
    for index, expected in enumerate(expected_losses):
        assert np.allclose(
            loss_result[index], expected, atol=absolute_tolerance)
def test_lstm_over_lstm_thought_vectors(device_id):
    """Check reference losses for a two-level LSTM over nested sparse input.

    Two samples (three and two steps, each step a 3x3 sparse matrix) are fed
    to an Embedding -> LSTM -> last -> to_sequence_like -> LSTM -> Dense
    stack.  The cross-entropy per label step must match hard-coded values
    within an absolute tolerance of 0.02.
    """
    dev = cntk_device(device_id)

    input_vocab_size, emb_dim, hidden_dim, num_labels = 3, 2, 2, 2

    x_seq_input = C.sequence.input_variable(
        (C.FreeDimension, input_vocab_size), is_sparse=True, name='features')
    label_seq_input = C.sequence.input_variable(
        num_labels,
        is_sparse=True,
        sequence_axis=C.Axis('label_sequence'),
        name='labels')

    with C.default_options(initial_state=0.1):
        # Inner level: one vector per outer step.
        inner = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        inner = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(inner)
        thought_vectors = C.sequence.last(inner)
        # Outer level: recur over the label sequence axis.
        outer = C.to_sequence_like(thought_vectors, label_seq_input)
        outer = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(outer)
        z = C.layers.Dense(num_labels, name='classify')(outer)

    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [
        [[0, 1, 1], [0, 1, 0], [1, 0, 0]],
        [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
        [[1, 0, 0], [0, 0, 1], [1, 1, 0]],
    ]
    csr_seq1 = _to_csr(seq1_data)
    ndarrayview1 = C.NDArrayView.from_csr(
        csr_seq1, shape=(3, 3, 3), device=C.cpu())

    seq2_data = [
        [[0, 0, 1], [0, 1, 1], [1, 0, 1]],
        [[0, 1, 0], [1, 0, 1], [0, 0, 0]],
    ]
    csr_seq2 = _to_csr(seq2_data)
    ndarrayview2 = C.NDArrayView.from_csr(
        csr_seq2, shape=(2, 3, 3), device=C.cpu())

    value_template = C.sequence.input_variable((3, 3), is_sparse=True)
    x_seq_data = C.Value.create(
        value_template, [ndarrayview1, ndarrayview2], device=C.cpu()).data

    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [
        _to_csr(labels) for labels in (seq1_label_data, seq2_label_data)
    ]

    feed = {x_seq_input: x_seq_data, label_seq_input: label_seq_data}
    param_grads, loss_result = ce.grad(
        feed, wrt=ce.parameters, outputs=[ce], as_numpy=False)
    loss_result = loss_result.as_sequences()

    absolute_tolerance = 0.02
    assert np.allclose(
        loss_result[0],
        [[0.67126], [0.676331], [0.765814]],
        atol=absolute_tolerance)
    assert np.allclose(
        loss_result[1],
        [[0.685199], [0.681736]],
        atol=absolute_tolerance)
def test_lstm_over_lstm_thought_vectors(device_id):
    """Check reference losses for a two-level LSTM with a fixed random seed.

    The global CNTK random seed is reset to 0 for the duration of the test
    and restored afterwards.  The restore happens in a ``finally`` clause so
    that a failing assertion or an exception does not leave seed 0 in place
    for tests that run later in the same process.

    The model is Embedding -> LSTM -> ``sequence.last`` ->
    ``to_sequence_like`` -> LSTM -> Dense; the per-step cross-entropy of the
    two samples is compared with stored values using ``atol=0.02``.
    """
    previous_random_seed = C.cntk_py.get_random_seed()
    C.cntk_py.reset_random_seed(0)
    try:
        dev = cntk_device(device_id)
        input_vocab_size = 3
        emb_dim = 2
        hidden_dim = 2
        num_labels = 2

        x_seq_input = C.sequence.input(
            (C.FreeDimension, input_vocab_size),
            is_sparse=True,
            name='features')
        label_seq_input = C.sequence.input(
            num_labels,
            is_sparse=True,
            sequence_axis=Axis('label_sequence'),
            name='labels')

        with C.default_options(initial_state=0.1):
            model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
            model = C.layers.Recurrence(
                C.layers.LSTM(hidden_dim), go_backwards=False)(model)
            model = C.sequence.last(model)
            model = C.to_sequence_like(model, label_seq_input)
            model = C.layers.Recurrence(
                C.layers.LSTM(hidden_dim), go_backwards=False)(model)
            model = C.layers.Dense(num_labels, name='classify')(model)

        z = model
        ce = C.cross_entropy_with_softmax(z, label_seq_input)

        seq1_data = [
            [[0, 1, 1], [0, 1, 0], [1, 0, 0]],
            [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
            [[1, 0, 0], [0, 0, 1], [1, 1, 0]],
        ]
        csr_seq1 = _to_csr(seq1_data)
        ndarrayview1 = C.NDArrayView.from_csr(
            csr_seq1, shape=(3, 3, 3), device=C.cpu())

        seq2_data = [
            [[0, 0, 1], [0, 1, 1], [1, 0, 1]],
            [[0, 1, 0], [1, 0, 1], [0, 0, 0]],
        ]
        csr_seq2 = _to_csr(seq2_data)
        ndarrayview2 = C.NDArrayView.from_csr(
            csr_seq2, shape=(2, 3, 3), device=C.cpu())

        x_seq_data = C.Value.create(
            C.sequence.input((3, 3), is_sparse=True),
            [ndarrayview1, ndarrayview2],
            device=C.cpu()).data

        seq1_label_data = [[0, 1], [0, 1], [1, 0]]
        seq2_label_data = [[1, 0], [0, 1]]
        label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]

        param_grads, loss_result = ce.grad(
            {
                x_seq_input: x_seq_data,
                label_seq_input: label_seq_data,
            },
            wrt=ce.parameters,
            outputs=[ce],
            as_numpy=False)
        loss_result = loss_result.as_sequences()

        # TODO: The tolerance here is inordinately high due to the
        # non-determinism in initialization of parameters as the individual
        # tests are not run in separate processes resulting in the addition
        # or removal of tests to affect the random initialization of
        # parameters in all other tests that do not explicitly specify the
        # random seed. The tolerance should be lowered to 0.01 after this
        # issue in the test infrastructure has been fixed.
        absolute_tolerance = 0.02
        assert np.allclose(
            loss_result[0],
            [[0.63504], [0.673343], [0.698446]],
            atol=absolute_tolerance)
        assert np.allclose(
            loss_result[1],
            [[0.772344], [0.64295]],
            atol=absolute_tolerance)
    finally:
        # Always put the caller's seed back, including on failure.
        C.cntk_py.reset_random_seed(previous_random_seed)