def test_to_sequence_basic(device_id):
    """Exercise ``C.to_sequence`` on a dense input and on a sparse input.

    The dense half converts without explicit lengths and expects the data
    back unchanged.  The sparse half supplies per-sample lengths ``[2, 1]``
    and expects the second sample to come back as a single step.
    """
    dev = cntk_device(device_id)

    # --- dense operand, no explicit sequence lengths -----------------------
    dense_input = C.input_variable((C.FreeDimension, 2))
    dense_seq = C.to_sequence(dense_input)
    assert len(dense_seq.dynamic_axes) == 2

    dense_batch = np.asarray(
        [[[1, 2], [-1000, -1000]],
         [[3, 4], [5, 6]]],
        dtype=np.float32)
    dense_result = dense_seq.eval({dense_input: dense_batch}, device=dev)
    assert np.array_equal(dense_result, dense_batch)

    # --- sparse operand with explicit sequence lengths ---------------------
    sparse_input = C.input_variable((C.FreeDimension, 2, 3), is_sparse=True)
    lengths_input = C.input_variable(())
    sparse_seq = C.to_sequence(sparse_input, lengths_input)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    # The second sample is filled up to two steps with an all-zero step.
    seq2_padded = [seq2_data, [[0, 0, 0], [0, 0, 0]]]

    view1 = C.NDArrayView.from_csr(
        _to_csr(seq1_data), shape=(2, 2, 3), device=C.cpu())
    view2 = C.NDArrayView.from_csr(
        _to_csr(seq2_padded), shape=(2, 2, 3), device=C.cpu())

    batch_value = C.Value.create(
        C.input_variable((2, 2, 3), is_sparse=True),
        [view1, view2],
        device=dev)
    lengths = np.asarray([2, 1], dtype=np.float32)

    sparse_result = sparse_seq.eval(
        {sparse_input: batch_value.data, lengths_input: lengths},
        device=dev,
        as_numpy=False)
    dense_view = _to_dense(sparse_result, True)
    assert np.array_equal(dense_view[0], seq1_data)
    assert np.array_equal(dense_view[1], [seq2_data])
def inner(a):
    """Build a graph that yields sequence ``a`` with its steps flipped.

    The sequence is unpacked (padding value 0) together with its validity
    mask, both are sliced along axis 0 with stride -1, turned back into
    sequences, and the mask is then used to gather only the valid steps.
    """
    unpacked = C.sequence.unpack(a, padding_value=0)
    values, valid = unpacked.outputs

    # Stride -1 along the leading static axis flips data and mask alike.
    flipped_values = C.slice(values, 0, 0, 0, -1)
    flipped_valid = C.slice(valid, 0, 0, 0, -1)

    values_as_seq = C.to_sequence(flipped_values)
    # The mask gets a trailing axis before being converted to a sequence.
    valid_as_seq = C.to_sequence(C.expand_dims(flipped_valid, axis=-1))

    return C.sequence.gather(values_as_seq, valid_as_seq)
def inner(a, b):
    """Build a graph that appends sequence ``b`` after sequence ``a``.

    Both operands are unpacked with padding value 0, their values and masks
    are spliced, and the spliced mask is used to gather the valid steps out
    of the spliced (still padded) data.
    """
    a_values, a_valid = C.sequence.unpack(a, padding_value=0).outputs
    b_values, b_valid = C.sequence.unpack(b, padding_value=0).outputs

    joined_values = C.splice(a_values, b_values, axis=0)
    joined_valid = C.splice(a_valid, b_valid)
    # Trailing axis added so the mask can be converted to a sequence.
    joined_valid = C.expand_dims(joined_valid, axis=-1)

    padded_seq = C.to_sequence(joined_values)
    keep_seq = C.to_sequence(joined_valid)

    return C.sequence.gather(padded_seq, keep_seq)
def test_sequence_unpack_basic(device_id):
    """Round-trip a batch through ``to_sequence`` and ``sequence.unpack``.

    Two samples with lengths ``[2, 1]`` are converted to sequences and then
    unpacked with ``padding_value=-1000.0``.  The test checks the unpacked
    output's axes/shape, that the step beyond the second sample's length is
    replaced by the padding value, and that the mask is ``[[1, 1], [1, 0]]``.
    """
    dev = cntk_device(device_id)

    # `C.input` is the deprecated spelling; `C.input_variable` is the
    # supported one and takes the same arguments.
    x = C.input_variable((C.FreeDimension, 2, 3), is_sparse=False)
    x_seq_lens = C.input_variable(())
    x_seq = C.to_sequence(x, x_seq_lens)

    x_seq_unpacked = C.sequence.unpack(x_seq, padding_value=-1000.0)
    x_seq_unpacked_value_output = x_seq_unpacked.outputs[0]
    x_seq_unpacked_mask_output = x_seq_unpacked.outputs[1]
    assert len(x_seq_unpacked_value_output.dynamic_axes) == 1
    assert x_seq_unpacked_value_output.shape == (C.FreeDimension, 2, 3)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    # The filler step fed in (-100) differs on purpose from the unpack
    # padding value (-1000) so the assertion below can tell them apart.
    x_data = [
        np.asarray(seq1_data, dtype=np.float32),
        np.asarray(
            [seq2_data, [[-100.0, -100.0, -100.0], [-100.0, -100.0, -100.0]]],
            dtype=np.float32),
    ]
    x_seq_lens_data = np.asarray([2, 1], dtype=np.float32)

    result = x_seq_unpacked.eval(
        {x: x_data, x_seq_lens: x_seq_lens_data}, device=dev)
    value = result[x_seq_unpacked_value_output]
    mask = result[x_seq_unpacked_mask_output]

    assert np.array_equal(value[0], seq1_data)
    assert np.array_equal(value[1], [
        seq2_data,
        [[-1000.0, -1000.0, -1000.0], [-1000.0, -1000.0, -1000.0]],
    ])
    assert np.array_equal(mask, [[1, 1], [1, 0]])
def test_sequence_unpack_basic(device_id):
    """Check ``C.sequence.unpack`` on a placeholder and on real data.

    First verifies that unpacking a placeholder exposes two outputs.  Then
    converts a two-sample batch (lengths ``[2, 1]``) to sequences, unpacks it
    with ``padding_value=-1000.0`` and compares values and mask.
    """
    dev = cntk_device(device_id)

    # Unpacking a placeholder must already expose both outputs.
    placeholder_outputs = C.sequence.unpack(
        C.placeholder(), padding_value=0).outputs
    assert len(placeholder_outputs) == 2

    x = C.input_variable((C.FreeDimension, 2, 3), is_sparse=False)
    x_seq_lens = C.input_variable(())
    unpack_fn = C.sequence.unpack(
        C.to_sequence(x, x_seq_lens), padding_value=-1000.0)
    value_output, mask_output = unpack_fn.outputs[0], unpack_fn.outputs[1]

    assert len(value_output.dynamic_axes) == 1
    assert value_output.shape == (C.FreeDimension, 2, 3)

    seq1_data = [[[0, 1, 1], [0, 1, 0]], [[1, 0, 0], [1, 0, 1]]]
    seq2_data = [[0, 1, 1], [1, 1, 0]]
    filler_step = [[-100.0, -100.0, -100.0], [-100.0, -100.0, -100.0]]
    padding_step = [[-1000.0, -1000.0, -1000.0], [-1000.0, -1000.0, -1000.0]]

    feed = {
        x: [
            np.asarray(seq1_data, dtype=np.float32),
            np.asarray([seq2_data, filler_step], dtype=np.float32),
        ],
        x_seq_lens: np.asarray([2, 1], dtype=np.float32),
    }
    outputs = unpack_fn.eval(feed, device=dev)
    value = outputs[value_output]
    mask = outputs[mask_output]

    assert np.array_equal(value[0], seq1_data)
    # The filler step is replaced by the unpack padding value.
    assert np.array_equal(value[1], [seq2_data, padding_step])
    assert np.array_equal(mask, [[1, 1], [1, 0]])
def pad(x, pattern, mode=C.CONSTANT_PAD, constant_value=0, name=''):
    """
    Pads a tensor in the sequence axis according to the specified patterns.
    Three padding modes are supported: CONSTANT / REFLECT / SYMMETRIC.

    Arguments:
        x: tensor to be padded.
        pattern (tuple with 2 integers): how many values to add before and after the contents
          in the sequence axis.
        mode (int): padding mode: C.ops.CONSTANT_PAD, C.ops.REFLECT_PAD and C.ops.SYMMETRIC_PAD
        constant_value: the value used to fill the padding cells, only meaningful under CONSTANT mode.
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Raises:
        ValueError: if ``pattern`` is not a tuple of exactly two integers.
    """
    # Check the container type first so that a non-iterable argument gets the
    # documented ValueError rather than a TypeError from iterating it, and
    # enforce the length the error message has always promised.
    if (not isinstance(pattern, tuple)
            or len(pattern) != 2
            or not all(isinstance(i, int) for i in pattern)):
        raise ValueError(f"pattern {pattern} must be a tuple with 2 integers")

    # After unpacking, the sequence axis becomes the leading static axis;
    # only that axis is padded, every original static axis gets (0, 0).
    ndim = len(x.shape)
    null_pattern = [(0, 0)] * ndim
    final_pattern = [pattern] + null_pattern

    b, valid = C.sequence.unpack(x, padding_value=0).outputs
    c = C.pad(b, final_pattern, mode=mode, constant_value=constant_value)

    # New per-sample length = number of valid steps + total padding added.
    # NOTE(review): padding is applied to the unpacked tensor, so for samples
    # shorter than the longest one the trailing pad presumably lands after the
    # unpack filler (0) rather than right after the real data - confirm
    # behaviour for non-zero constant_value / REFLECT / SYMMETRIC.
    seq_length = C.reduce_sum(valid, axis=0) + C.Constant(sum(pattern))
    d = C.to_sequence(c, seq_length, name=name)
    return d
def test_gather_op(device_id, precision):
    """Check ``C.gather`` forward values, gradients and a training step.

    Covers: gathering rows of a parameter by index, the gradient w.r.t. the
    gathered parameter, indices that flow through a learnable parameter (its
    gradient is expected to be zero), two-column indices, and a small model
    (to_sequence -> gather -> unpack) trained for one minibatch.
    """
    dtype = PRECISION_TO_TYPE[precision]

    a_data = [AA([[0], [1]], dtype=dtype),
              AA([[3], [4]], dtype=dtype)]
    a = C.input_variable((2, 1))
    r_data = np.arange(12).reshape(6, 2).astype('f')
    # The shape must be the array's shape tuple; `r_data.data` is the raw
    # buffer (a memoryview), not a shape.
    r = C.parameter(shape=r_data.shape, init=r_data)

    res = C.gather(r, a).eval({a: a_data})
    expected = np.asarray([[[[0., 1.]], [[2., 3.]]],
                           [[[6., 7.]], [[8., 9.]]]])
    assert np.array_equal(res, expected)

    grads = C.gather(r, a).grad({a: a_data}, [r])
    expected_grad = np.asarray(
        [[1, 1], [1, 1], [0, 0], [1, 1], [1, 1], [0, 0]], dtype=np.float32)
    assert np.array_equal(grads, expected_grad)

    # gather with indices from a learnable parameter: no gradient should pass
    # through the indices -- zeros should be returned for them
    indices_params = C.parameter(shape=(1,), init=1.0)
    grads = C.gather(r, (indices_params * a)).grad(
        {a: a_data}, [r, indices_params])
    assert np.array_equal(grads[r], expected_grad)
    assert np.array_equal(grads[indices_params],
                          np.asarray([0.0], dtype=np.float32))

    b_data = [AA([[0, 2], [1, 3]], dtype=dtype),
              AA([[2, 4], [3, 5]], dtype=dtype)]
    b = C.input_variable((2, 2))
    res2 = C.gather(r, b).eval({b: b_data})
    expected2 = np.asarray([[[[0., 1.], [4., 5.]], [[2., 3.], [6., 7.]]],
                            [[[4., 5.], [8., 9.]], [[6., 7.], [10., 11.]]]])
    assert np.array_equal(res2, expected2)

    # The following small model tests the memory reuse issue of the gather
    # node.  `C.input` is deprecated, so `C.input_variable` is used instead.
    x = C.input_variable((3, 4))
    x1 = C.to_sequence(x)
    w = C.parameter((5, 6), init=1)
    z = C.gather(w, x1)
    assert z.shape == (4, 6)

    # need the unpack node to trigger memory reuse.
    f = C.sequence.unpack(z, 0, no_mask_output=True)
    y = C.input_variable((3, 4, 6))
    loss = C.reduce_mean(C.square(f - y), axis=-1)
    loss = C.reduce_mean(loss, axis=C.Axis.all_axes())

    g = C.constant(0, shape=w.shape)
    u = C.assign(w, g + 1)
    learner = C.cntk_py.universal_learner([w], [g], u)
    trainer = C.trainer.Trainer(loss, [loss], [learner])

    indices = np.asarray([[[1, 2, 1, 2]]])
    # Named to avoid shadowing the builtin `input`.
    index_batch = np.repeat(np.repeat(indices, 3, axis=1), 10, axis=0)
    labels = np.full((10, 3, 4, 6), 2)
    trainer.train_minibatch({x: index_batch, y: labels})

    # the 2nd and 3rd rows should be updated by gradients.
    assert np.mean(w.value[1, :]) < 1
    assert np.mean(w.value[2, :]) < 1
    # the other three rows should stay at 1
    assert np.isclose(np.mean(w.value[0, :]), 1)
    assert np.isclose(np.mean(w.value[3, :]), 1)
    assert np.isclose(np.mean(w.value[4, :]), 1)
def zeros_like(x, seq_length: int):
    """ helper function to construct a sequence of zeros

    For ``seq_length`` greater than 1 the zeros are shaped like the first
    ``seq_length`` steps of ``x``; for exactly 1 they are shaped like the
    first step of ``x`` and wrapped back into a sequence.

    Raises:
        ValueError: if ``seq_length`` is neither greater than nor equal to 1.
    """
    # Guard clause: reject everything the two branches below do not handle.
    if not (seq_length > 1 or seq_length == 1):
        raise ValueError(f"length ({seq_length}) must be larger than 0")

    if seq_length > 1:
        return C.zeros_like(C.sequence.slice(x, 0, seq_length))

    # Single step: take the first element, add a leading axis and convert
    # that axis back into a sequence axis.
    first_step_zeros = C.zeros_like(C.sequence.first(x))
    with_leading_axis = C.expand_dims(
        first_step_zeros, axis=C.Axis.new_leading_axis())
    return C.to_sequence(with_leading_axis)
def test_to_sequence_backprop(device_id):
    """Compare gradients of a sequence model fed natively vs via ``to_sequence``.

    A small Embedding -> LSTM recurrence -> Dense model with a cross-entropy
    loss is evaluated once with a native sparse sequence input, and once
    through a clone whose sequence input is produced by ``C.to_sequence``
    from a non-sequence input plus explicit lengths.  Losses and all
    non-sparse parameter gradients must be equal.
    """
    dev = cntk_device(device_id)

    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2

    # ---- reference model with a native sequence input ---------------------
    x_seq_input = C.sequence.input_variable(
        input_vocab_size, is_sparse=True, name='features')
    with C.default_options(initial_state=0.1):
        embedded = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        recurrent = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(embedded)
        z = C.layers.Dense(num_labels, name='classify')(recurrent)

    label_seq_input = C.sequence.input_variable(
        num_labels, is_sparse=True, name='labels')
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[0, 1, 1], [0, 1, 0], [1, 0, 0]]
    seq2_data = [[0, 0, 1], [0, 1, 1]]
    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]

    reference_feed = {
        x_seq_input: [_to_csr(seq1_data), _to_csr(seq2_data)],
        label_seq_input: label_seq_data,
    }
    param_grads_1, loss_result_1 = ce.grad(
        reference_feed, wrt=ce.parameters, outputs=[ce], as_numpy=False)

    # ---- clone fed by a non-sequence input converted with to_sequence -----
    non_seq_var = C.input_variable(
        (C.FreeDimension, input_vocab_size),
        is_sparse=True, name='non_seq_features')
    lengths_var = C.input_variable((), name='sequence_lengths')
    converted = C.to_sequence(non_seq_var, lengths_var)
    identity = np.eye(input_vocab_size, dtype=np.float32)
    converted = C.reconcile_dynamic_axes(
        C.times(converted, identity), label_seq_input)
    ce_clone = ce.clone('share', {x_seq_input: converted})

    # The shorter second sample is filled up with one all-zero row.
    x_non_seq_data = C.NDArrayView.from_csr(
        _to_csr([seq1_data, seq2_data + [[0, 0, 0]]]), shape=(2, 3, 3))
    x_seq_lens_data = np.asarray([3, 2], dtype=np.float32)

    # Look the clone's own argument variables up by name.
    def _clone_argument(arg_name):
        return next(argument for argument in ce_clone.arguments
                    if argument.name == arg_name)

    clone_features = _clone_argument('non_seq_features')
    clone_labels = _clone_argument('labels')
    clone_lengths = _clone_argument('sequence_lengths')

    clone_feed = {
        clone_features: x_non_seq_data,
        clone_lengths: x_seq_lens_data,
        clone_labels: label_seq_data,
    }
    param_grads_2, loss_result_2 = ce_clone.grad(
        clone_feed, wrt=ce_clone.parameters, outputs=[ce_clone],
        as_numpy=False)

    # ---- both paths must agree --------------------------------------------
    assert np.array_equal(loss_result_1.as_sequences()[0],
                          loss_result_2.as_sequences()[0])
    assert np.array_equal(loss_result_1.as_sequences()[1],
                          loss_result_2.as_sequences()[1])

    for param in param_grads_1:
        if param_grads_1[param].is_sparse:
            continue
        reference_grad_value = param_grads_1[param].asarray()
        grad_value = param_grads_2[param].asarray()
        assert np.array_equal(reference_grad_value, grad_value)
def test_to_sequence_error_for_operand_with_sequence_axis():
    """``C.to_sequence`` must reject an operand that already is a sequence."""
    # The shape is passed as one tuple: written as two positional arguments,
    # the `2` would bind to the `dtype` parameter instead of being part of
    # the shape.  `sequence.input_variable` replaces the deprecated
    # `sequence.input`.
    x = C.sequence.input_variable((C.FreeDimension, 2))
    with pytest.raises(ValueError):
        C.to_sequence(x)
def batchmatmul(left, right, output_rank=1,
                infer_input_rank_to_map=C.TIMES_NO_INFERRED_INPUT_RANK,
                name=''):
    """ Batch Matrix Multiplication

    Multiplies two batches of matrices (or tensors) pairwise. The leading
    static axis of both operands is treated as the static batch axis; only a
    single static batch axis is supported. Similar in spirit to
    tensorflow.matmul.

    Example:
        a = C.sequence.input_variable((3, 4, 5))     # batch matrix
        b = C.sequence.input_variable((3, 5, 6))     # batch matrix
        c = Cx.batchmatmul(a, b)
        assert c.shape == (3, 4, 6)                  # 3 is treated as a batch axis

        a = C.sequence.input_variable((3, 4, 5))     # batch matrix
        b = C.sequence.input_variable((3, 5, 6, 7))  # batch tensor
        c = Cx.batchmatmul(a, b, output_rank=2)
        assert c.shape == (3, 4, 6, 7)               # 3 is treated as a batch axis

        a = C.input_variable((3, 4, 5))              # batch matrix
        b = C.input_variable((3, 5, 6, 7))           # batch tensor
        c = Cx.batchmatmul(a, b, output_rank=2)
        assert c.shape == (3, 4, 6, 7)

    Arguments:
        left: left side matrix or tensor
        right: right side matrix or tensor
        output_rank (int): in case we have tensors as arguments, output_rank represents
            the number of axes to be collapsed in order to transform the tensors
            into matrices, perform the operation and then reshape back (explode the axes)
        infer_input_rank_to_map (int): meant for internal use only. Always use default value
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Raises:
        ValueError: if the leading axes of the operands differ, or if the
            leading axis is a free axis while ``left`` has a sequence axis.
    """
    lhs_shape, rhs_shape = left.shape, right.shape
    has_seq_axis = len(left.dynamic_axes) == 2
    batch_dim = lhs_shape[0]  # the first axis is taken as the static batch axis

    if lhs_shape[0] != rhs_shape[0]:
        raise ValueError(
            "first axis of left operand and right operand must be the same")

    if has_seq_axis and (lhs_shape[0] < 0 or rhs_shape[0] < 0):
        raise ValueError(
            "Static batch axis cannot be a free axis when dynamic sequence axis is also present")

    # Merge the dynamic sequence axis (if any) with the static batch axis.
    if has_seq_axis:
        lhs_flat = C.sequence.unpack(left, padding_value=0, no_mask_output=True)
        rhs_flat = C.sequence.unpack(right, padding_value=0, no_mask_output=True)
        lhs_flat = C.reshape(lhs_flat, (-1, ) + lhs_shape[1:])
        rhs_flat = C.reshape(rhs_flat, (-1, ) + rhs_shape[1:])
    else:
        lhs_flat = left
        rhs_flat = right

    # Turn the merged leading axis into a sequence axis.  No explicit length
    # is given for the left side; the right side is tied to the left side's
    # sequence axis so both are known to share it.
    lhs_seq = C.to_sequence(lhs_flat)
    rhs_seq = C.to_sequence_like(rhs_flat, lhs_seq)

    # Ordinary times, now that no static batch axis is left.
    product = C.times(lhs_seq, rhs_seq,
                      output_rank=output_rank,
                      infer_input_rank_to_map=infer_input_rank_to_map)

    # Undo the folding: back to static batch axis (+ original sequence axis).
    product_flat = C.sequence.unpack(product, padding_value=0,
                                     no_mask_output=True)
    if has_seq_axis:
        unfolded = C.reshape(product_flat, (-1, batch_dim) + product.shape)
        packed = C.to_sequence_like(unfolded, left)
    else:
        packed = C.reshape(product_flat, (batch_dim, ) + product.shape)

    return _inject_name(packed, name)
def test_to_sequence_error_for_operand_with_sequence_axis():
    """``C.to_sequence`` must reject an operand that already is a sequence."""
    # The shape is passed as one tuple: written as two positional arguments,
    # the `2` would bind to the `dtype` parameter instead of being part of
    # the shape.
    x = C.sequence.input_variable((C.FreeDimension, 2))
    with pytest.raises(ValueError):
        C.to_sequence(x)
def attention_layer(self, context, query, dim):
    """Build a two-level attention block over ``context`` and ``query``.

    Two bias-free ReLU Dense layers of size ``self.hidden_dim`` project the
    two inputs. A sub-graph written against two free placeholders
    (``cln_mem_ph``, ``cln_inp_ph``) then computes:

    * ``q_over_c``: per step of the "memory" sequence, a softmax-weighted
      sum over the steps of the "input" sequence (level 1);
    * ``c_over_q``: per valid step of the "input" sequence, a
      softmax-weighted sum over memory steps of ``splice(memory, q_over_c)``
      (level 2).

    The sub-graph is cloned twice: ``c_over_q`` with memory = projected
    query and input = projected context, and ``q_over_c`` with both
    placeholders bound to the projected context. The result is the splice
    of the raw context placeholder and the two clones, wrapped in a block.

    The bracketed trailing comments are the original author's shape
    notation (presumably ``[dynamic axes][static axes]``); they are kept
    as-is and have not been verified here.

    Arguments:
        context: operand bound to the first block argument (``input_ph``).
        query: operand bound to the second block argument (``input_mem``).
        dim (int): static dimension declared for both block placeholders.

    Returns:
        the function returned by ``C.as_block`` named 'attention_layer'.
    """
    input_ph = C.placeholder(shape=(dim, ))
    input_mem = C.placeholder(shape=(dim, ))
    with C.layers.default_options(bias=False, activation=C.relu):
        attn_proj_enc = C.layers.Dense(self.hidden_dim,
                                       init=glorot_uniform(),
                                       input_rank=1,
                                       name="Wqu")
        attn_proj_dec = C.layers.Dense(self.hidden_dim,
                                       init=glorot_uniform(),
                                       input_rank=1)
    inputs_ = attn_proj_enc(input_ph)  # [#,c][d]
    memory_ = attn_proj_dec(input_mem)  # [#,q][d]

    # Free placeholders: the sub-graph below is bound to real operands only
    # by the two clone() calls near the end.
    cln_mem_ph = C.placeholder()  # [#,q][?=d]
    cln_inp_ph = C.placeholder()  # [#,c][?=d]

    # ---- level 1: each memory step attends over all input steps ----------
    unpack_inputs, inputs_mask = C.sequence.unpack(
        cln_inp_ph, 0).outputs  # [#][*=c,d] [#][*=c]
    expand_inputs = C.sequence.broadcast_as(unpack_inputs,
                                            cln_mem_ph)  # [#,q][*=c,d]
    # Dot-product scores scaled by sqrt(hidden_dim).
    matrix = C.reshape(
        C.times_transpose(cln_mem_ph, expand_inputs) /
        (self.hidden_dim**0.5), (-1, ))  # [#,q][*=c]
    # Where the input mask is 0 the score is replaced by -1e30, so those
    # positions get (near) zero weight in the softmax.
    matrix = C.element_select(
        C.sequence.broadcast_as(inputs_mask, cln_mem_ph), matrix,
        C.constant(-1e30))
    logits = C.softmax(matrix, axis=0, name='level 1 weight')  # [#,q][*=c]
    trans_expand_inputs = C.transpose(expand_inputs,
                                      [1, 0])  # [#,q][d,*=c]
    q_over_c = C.reshape(
        C.reduce_sum(logits * trans_expand_inputs, axis=1),
        (-1, )) / (self.hidden_dim**0.5)  # [#,q][d]
    new_q = C.splice(cln_mem_ph, q_over_c)  # [#,q][2*d]

    # ---- level 2: each valid input step attends over memory steps --------
    # over
    unpack_matrix, matrix_mask = C.sequence.unpack(
        matrix, 0).outputs  # [#][*=q,*=c] [#][*=q]
    # The input mask is turned into a sequence so it can drive the gather
    # that drops padded input steps.
    inputs_mask_s = C.to_sequence(C.reshape(inputs_mask,
                                            (-1, 1)))  # [#,c'][1]
    trans_matrix = C.to_sequence_like(C.transpose(unpack_matrix, [1, 0]),
                                      inputs_mask_s)  # [#,c'][*=q]
    trans_matrix = C.sequence.gather(trans_matrix,
                                     inputs_mask_s)  # [#,c2][*=q]
    # Same -1e30 masking as level 1, this time driven by the memory mask.
    trans_matrix = C.element_select(
        C.sequence.broadcast_as(matrix_mask, trans_matrix), trans_matrix,
        C.constant(-1e30))
    logits2 = C.softmax(trans_matrix, axis=0,
                        name='level 2 weight')  # [#,c2][*=c]
    unpack_new_q, new_q_mask = C.sequence.unpack(
        new_q, 0).outputs  # [#][*=q,2*d] [#][*=q]
    expand_new_q = C.transpose(
        C.sequence.broadcast_as(unpack_new_q, trans_matrix),
        [1, 0])  # [#,c2][2d,*=q]
    c_over_q = C.reshape(C.reduce_sum(logits2 * expand_new_q, axis=1),
                         (-1, )) / (2 * self.hidden_dim)**0.5  # [#,c2][2d]
    # Declare that the gathered sequence shares the input placeholder's
    # dynamic axes so it can be spliced with the context below.
    c_over_q = C.reconcile_dynamic_axes(c_over_q, cln_inp_ph)

    # ---- bind the placeholder sub-graph to the projected operands --------
    weighted_q = c_over_q.clone(C.CloneMethod.share, {
        cln_mem_ph: memory_,
        cln_inp_ph: inputs_
    })  # [#,c][2d]
    # Both placeholders bound to the projected context.
    c2c = q_over_c.clone(C.CloneMethod.share, {
        cln_mem_ph: inputs_,
        cln_inp_ph: inputs_
    })  # [#,c][2d]

    att_context = C.splice(input_ph, weighted_q, c2c)  # 2d+2d+2d

    return C.as_block(att_context, [(input_ph, context),
                                    (input_mem, query)], 'attention_layer',
                      'attention_layer')