def seqcla():
    """Train a small LSTM sequence classifier and check its accuracy.

    The network is embedding -> LSTM layer -> linear layer. It is trained
    inside a local execution context, the predictions are written out, and
    the accuracy computed by ``calc_accuracy`` is compared against a
    recorded reference value.
    """
    # LSTM params
    input_dim = 50
    output_dim = 128
    cell_dim = 128

    # model
    num_labels = 5
    vocab = 2000
    embed_dim = 50

    seq_axis = C.dynamic_axis(name='t')
    features = C.sparse_input(vocab, dynamic_axis=seq_axis, name='features')
    labels = C.input(num_labels, name='labels')

    train_reader = C.CNTKTextFormatReader(train_file)

    def reader_input_map():
        # Bind the reader aliases 'x' and 'y' to the feature and label nodes.
        return train_reader.map(
            features, alias='x', dim=vocab, format='Sparse').map(
            labels, alias='y', dim=num_labels, format='Dense')

    # setup embedding matrix (loaded from file, learning rate multiplier 0)
    embedding = C.parameter((embed_dim, vocab),
                            learning_rate_multiplier=0.0,
                            init_from_file_path=embedding_file)

    # get the vector representing the word
    sequence = C.times(embedding, features, name='sequence')

    # add an LSTM layer
    lstm_out = lstm_layer(output_dim, cell_dim, sequence, input_dim)

    # add a softmax layer on top
    w = C.parameter((num_labels, output_dim), name='w')
    b = C.parameter((num_labels), name='b')
    z = C.times(w, lstm_out) + b
    z.name = 'z'
    z.tag = "output"

    # and reconcile the shared dynamic axis
    pred = C.reconcile_dynamic_axis(z, labels, name='pred')

    ce = C.cross_entropy_with_softmax(labels, pred)
    ce.tag = "criterion"

    my_sgd = C.SGDParams(epoch_size=0,
                         minibatch_size=10,
                         learning_rates_per_mb=0.1,
                         max_epochs=3)

    with C.LocalExecutionContext('seqcla') as ctx:
        # train the model
        ctx.train(root_nodes=[ce],
                  training_params=my_sgd,
                  input_map=reader_input_map())

        # write out the predictions
        ctx.write(input_map=reader_input_map())

        # do some manual accuracy testing
        acc = calc_accuracy(train_file, ctx.output_filename_base)

        # and test for the same number...
        TOLERANCE_ABSOLUTE = 1E-02
        assert np.allclose(acc, 0.6006415396952687, atol=TOLERANCE_ABSOLUTE)
def test_op_times_reduce_sequence_axis(device_id, precision):
    """Check ``times`` with the reduce-sequence-axis flag on one-hot input.

    The forward result is compared against a hand-written expectation, and
    the dynamic axes are compared with those of
    ``sequence.reduce_sum(times(...))``.
    """
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, Value, TIMES_REDUCE_SEQUENCE_AXIS_WITHOUT_INFERRED_INPUT_RANK
    from cntk import sequence

    dim = 10
    seq = [[0, 1, 2], [3], [4, 5, 6, 7, 8, 9]]

    right_data = Value.one_hot(seq, dim, dtype=dt_precision)
    right_var = sequence.input_variable(shape=(dim), is_sparse=True, dtype=dt_precision)

    # One vector of ones per sequence, as long as that sequence.
    left_data = [AA([1] * len(indices), dtype=dt_precision) for indices in seq]
    left_var = sequence.input_variable(shape=(1), dtype=dt_precision)

    func = times(
        left_var,
        right_var,
        infer_input_rank_to_map=TIMES_REDUCE_SEQUENCE_AXIS_WITHOUT_INFERRED_INPUT_RANK)
    func2 = sequence.reduce_sum(times(left_var, right_var))

    assert func.dynamic_axes == func2.dynamic_axes

    feed = {left_var: left_data, right_var: right_data}
    _, forward_output = func.forward(feed)
    actual_forward = forward_output[func.output]

    expected_forward = AA([[[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]],
                           [[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]],
                           [[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]])

    assert np.allclose(actual_forward, expected_forward)
def test_validation_before_eval():
    """Expect ``eval`` on ``times(w, v)`` to raise ValueError.

    Both parameters are declared with one ``C.InferredDimension`` entry.
    """
    left_param = C.parameter((4, C.InferredDimension))
    right_param = C.parameter((C.InferredDimension, 5))
    product = C.times(left_param, right_param)

    # Two further products reuse the same parameters with fully specified
    # inputs. They are only constructed, never evaluated.
    p = C.input((4, 1))
    wp = C.times(left_param, p)
    q = C.input((1, 5))
    qv = C.times(q, right_param)

    with pytest.raises(ValueError):
        product.eval()
def test_free_static_axis_in_recurrence():
    """Run a recurrence over an input with a ``C.FreeDimension`` static axis.

    The output value and the gradient with respect to ``wx`` are compared
    against fixed reference numbers.
    """
    x = C.sequence.input_variable((C.FreeDimension, 2))

    # The recurrence is closed further down by replacing this placeholder.
    out_placeholder = C.placeholder()
    out_past = C.sequence.past_value(out_placeholder)

    wh_init = np.asarray([[2, 5], [1, 3]], dtype=np.float32)
    wh = C.parameter(init=wh_init)
    wx_init = np.asarray([[1, 4], [2, 5]], dtype=np.float32)
    wx = C.parameter(init=wx_init)

    out = C.times(x, wx) + C.times(out_past, wh)
    out.replace_placeholders({out_placeholder: out})

    x_data = np.asarray([[0.5, 0.2], [-0.7, 1.2]], np.float32)
    w_grad, out_val = out.grad({x: x_data}, wrt=[wh, wx], outputs=[out])

    expected_out = [[[[0.9, 3.], [1.7, 3.2]]]]
    expected_wx_grad = [[-0.2, -0.2], [1.4, 1.4]]
    assert np.allclose(out_val, expected_out)
    assert np.allclose(w_grad[wx], expected_wx_grad)
def cross_entropy_with_sampled_softmax(
        hidden_vector,
        target_vector,
        vocab_dim,
        hidden_dim,
        num_samples,
        sampling_weights,
        allow_duplicates = False
        ):
    """Build the sampled-softmax criterion nodes.

    Args:
        hidden_vector: Node providing the output of the recurrent layers.
        target_vector: Node providing the expected labels (as sparse vectors).
        vocab_dim: Vocabulary size.
        hidden_dim: Dimension of the hidden vector.
        num_samples: Number of samples to use for sampled softmax.
        sampling_weights: Node providing weights to be used for the weighted
            sampling.
        allow_duplicates: Whether to sample with replacement (True) or
            without replacement (False).

    Returns:
        Tuple ``(z, cross_entropy_on_samples, error_on_samples)``. ``z`` is
        the input for the full softmax; the other two are the criterion and
        error nodes computed on the random samples.

    Note:
        Reads the ``use_sparse`` flag from the enclosing scope and prints the
        shapes of ``hidden_vector`` and ``wS`` while building the graph.
    """
    bias = C.Parameter(shape = (vocab_dim, 1), init = 0)
    weights = C.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())

    # sparse matrix [num_samples * vocab_size]
    sample_selector_sparse = C.random_sample(sampling_weights, num_samples, allow_duplicates)
    if use_sparse:
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # If we want a dense representation for all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    # dense row [1 * vocab_size]
    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates)
    # dense row [1 * vocab_dim]
    log_prior = C.log(inclusion_probs)

    print("hidden_vector: "+str(hidden_vector.shape))
    # [num_samples * hidden_dim]
    wS = C.times(sample_selector, weights, name='wS')
    print("ws:"+str(wS.shape))
    # [num_samples]
    zS = (C.times_transpose(wS, hidden_vector, name='zS1')
          + C.times(sample_selector, bias, name='zS2')
          - C.times_transpose(sample_selector, log_prior, name='zS3'))

    # Getting the weight vector for the true label. Dimension hidden_dim
    # [1 * hidden_dim]
    wT = C.times(target_vector, weights, name='wT')
    # [1]
    zT = (C.times_transpose(wT, hidden_vector, name='zT1')
          + C.times(target_vector, bias, name='zT2')
          - C.times_transpose(target_vector, log_prior, name='zT3'))

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides
    # with the true label, so it might happen that the true class is counted
    # twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape = (vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)
    return (z, cross_entropy_on_samples, error_on_samples)
def test_replace_placeholder_s():
    """Check ``replace_placeholders`` (mapping) and ``replace_placeholder`` (single).

    Both ways of substituting the constant must give the same product.
    """
    left_val = [[10, 2]]
    right_val = [[2], [3]]
    expected = 26  # 10 * 2 + 2 * 3

    p = C.placeholder(shape=(1, 2))
    c = C.constant(left_val)

    # Variant 1: explicit placeholder -> replacement mapping.
    op = C.times(p, right_val)
    op.replace_placeholders({p: c})
    assert op.eval() == expected

    # Variant 2: single-placeholder shortcut.
    op = C.times(p, right_val)
    op.replace_placeholder(c)
    assert op.eval() == expected
def test_clone_freeze():
    """Check that ``clone('freeze')`` copies are unaffected by later updates.

    After training ``z`` and overwriting the constants of ``c``, the
    originals must differ from their frozen clones, and the clones must
    still hold the values they had right after cloning.
    """
    inputs = 3
    outputs = 5

    features = C.input_variable((inputs), np.float32)
    label = C.input_variable((outputs), np.float32)
    weights = C.parameter((inputs, outputs))
    const_weights = C.constant(weights.value)
    z = C.times(features, weights)
    c = C.times(features, const_weights)
    z_clone = z.clone('freeze')
    c_clone = c.clone('freeze')

    # check that z and z_clone are the same
    assert all(np.array_equal(orig.value, frozen.value)
               for orig, frozen in zip(z.parameters, z_clone.constants))

    # check that c and c_clone are the same
    assert all(np.array_equal(orig.value, frozen.value)
               for orig, frozen in zip(c.constants, c_clone.constants))

    # keep copies of the old values
    z_copies = [frozen.value for frozen in z_clone.constants]
    c_copies = [frozen.value for frozen in c_clone.constants]

    # update z
    learner = C.sgd(z.parameters, C.learning_rate_schedule(1.0, C.UnitType.minibatch))
    trainer = C.Trainer(z, C.squared_error(z, label), learner)
    x = np.random.randn(16, 3).astype('f')
    y = np.random.randn(16, 5).astype('f')
    trainer.train_minibatch({features: x, label: y})

    # update c
    for const in c.constants:
        const.value = np.random.randn(*const.value.shape).astype('f')

    # check that z changed
    assert not any(np.array_equal(orig.value, frozen.value)
                   for orig, frozen in zip(z.parameters, z_clone.constants))

    # check that z_clone did not change
    assert all(np.array_equal(saved, frozen.value)
               for saved, frozen in zip(z_copies, z_clone.constants))

    # check that c changed
    assert not any(np.array_equal(orig.value, frozen.value)
                   for orig, frozen in zip(c.constants, c_clone.constants))

    # check that c_clone did not change
    assert all(np.array_equal(saved, frozen.value)
               for saved, frozen in zip(c_copies, c_clone.constants))
def _graph_dict():
    """Return a dict of named nodes forming a small graph for traversal tests.

    The graph has no real meaning other than providing something to
    traverse. It contains an unnamed parameter and two nodes that share
    the name 'op3'. 'root' and 'first' refer to the same node.
    """
    i1 = C.sequence.input_variable(shape=(2, 3), sequence_axis=Axis('ia'), name='i1')
    c1 = C.constant(shape=(2, 3), value=6, name='c1')
    p1 = C.parameter(shape=(3, 2), init=7, name='p1')

    op1 = C.plus(i1, c1, name='op1')
    op2 = C.times(op1, p1, name='op2')

    # no name
    p2 = C.parameter(shape=(2, 2))

    # duplicate names
    op3a = C.plus(op2, p2, name='op3')
    op3b = C.plus(op3a, p2, name='op3')

    first = C.sequence.first(op3b, name='past')

    return {
        'i1': i1,
        'c1': c1,
        'p1': p1,
        'op1': op1,
        'op2': op2,
        'p2': p2,
        'op3a': op3a,
        'op3b': op3b,
        'first': first,
        'root': first,
    }
def test_op_gather_sparse(device_id):
    """Check ``sequence.last`` and ``sequence.slice`` on sparse one-hot input.

    The sparse results are multiplied by an identity matrix so they can be
    compared with dense expectations.
    """
    input_sparse_indices = [[1, 3, 5, 5], [2, 4], [0, 2]]
    vocab_size = 6

    input_data = Value.one_hot(input_sparse_indices, vocab_size)

    a = C.sequence.input_variable(shape=(vocab_size,), is_sparse=True, name='a')

    # Last element of every sequence.
    last_step = C.sequence.last(a)
    last_step_dense = C.times(last_step, np.eye(vocab_size))
    res = last_step_dense.eval({a: input_data})
    expected_last = [[0, 0, 0, 0, 0, 1],
                     [0, 0, 0, 0, 1, 0],
                     [0, 0, 1, 0, 0, 0]]
    assert np.array_equal(res, expected_last)

    # Last two elements of every sequence.
    last_two = C.sequence.slice(a, -2, 0)
    last_two_dense = C.times(last_two, np.eye(vocab_size))
    res = last_two_dense.eval({a: input_data})
    expected_last_two = [[[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]],
                         [[0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0]],
                         [[1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]]]
    assert np.array_equal(res, expected_last_two)
def test_trainer_with_some_params_not_learned():
    """Train with a learner that covers only ``W`` and check ``B`` stays fixed.

    After every minibatch ``B`` must equal its initial value while ``W``
    must differ from its value before that minibatch.
    """
    input_dim = 2
    proj_dim = 2

    x = input_variable(shape=(input_dim,))
    W = parameter(shape=(input_dim, proj_dim), init=glorot_uniform())
    B = parameter(shape=(proj_dim,), init=glorot_uniform())
    t = times(x, W)
    z = t + B

    W_orig_value = W.value
    B_orig_value = B.value

    labels = input_variable(shape=(proj_dim,))
    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)

    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    # Only W is handed to the learner; B is part of the model but not learned.
    trainer = Trainer(z, (ce, pe), sgd([W], lr_per_sample))

    arguments = {
        x: [[1, 1], [2, 2]],
        labels: [[0, 1], [1, 0]],
    }

    num_iters = 3
    for _ in range(num_iters):
        trainer.train_minibatch(arguments)

        assert np.array_equal(B.value, B_orig_value)
        assert not np.array_equal(W.value, W_orig_value)
        W_orig_value = W.value

    trainer.test_minibatch(arguments)
def test_op_batch_times_grad_with_beta_equals_to_one(left_operand, right_operand, device_id, precision):
    """Check the gradients of ``input1 + input2 + times(input1, input2)``.

    Each input receives gradient from the plus nodes and from ``times``.
    The expected gradients are computed with NumPy, starting from an
    all-ones array to account for the plus contribution.
    """
    dt_precision = PRECISION_TO_TYPE[precision]

    a = AA(left_operand, dtype=dt_precision)
    b = AA(right_operand, dtype=dt_precision)

    root_gradient = np.ones_like(a)

    input1 = C.input_variable((2, 2), needs_gradient=True)
    input2 = C.input_variable((2, 2), needs_gradient=True)

    z = input1 + input2 + C.times(input1, input2)

    state, actual_forward = z.forward({input1: a, input2: b}, [z.output], {z.output}, cntk_device(device_id))
    actual_backwards = z.backward(state, {z.output: root_gradient}, [input1, input2])

    batch_size = a.shape[0]

    # Neither of these depends on the sample index, so build them once.
    transpose_axes = list(np.roll(np.arange(len(b.shape[1:])), -1))
    sum_axes = tuple(np.arange(0, len(a.shape) - len(b.shape) + 1))

    left_backward = np.ones_like(a)
    right_backward = np.ones_like(b)
    for idx in range(batch_size):
        left_backward[idx, ...] += b[idx].sum(axis=-1)
        right_backward[idx, ...] += np.transpose(
            AA([a[idx].sum(axis=sum_axes)]), axes=transpose_axes)

    assert np.allclose(actual_backwards[input1], left_backward)
    assert np.allclose(actual_backwards[input2], right_backward)
def create_fast_rcnn_predictor(conv_out, rois, fc_layers):
    """Build the Fast R-CNN classification and box-regression heads.

    Args:
        conv_out: Node fed into ROI pooling together with ``rois``.
        rois: Node holding the regions of interest.
        fc_layers: Callable applied to the ROI-pooled output.

    Returns:
        Tuple ``(cls_score, bbox_pred)``.

    Note:
        Reads ``roi_dim`` and ``globalvars['num_classes']`` from the
        enclosing scope.
    """
    num_classes = globalvars['num_classes']

    # RCNN
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING, (roi_dim, roi_dim), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # prediction head
    W_pred = parameter(shape=(4096, num_classes), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=num_classes, init=0, name="cls_score.b")
    cls_logits = times(fc_out, W_pred)
    cls_score = plus(cls_logits, b_pred, name='cls_score')

    # regression head
    W_regr = parameter(shape=(4096, num_classes*4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=num_classes*4, init=0, name="bbox_regr.b")
    bbox_raw = times(fc_out, W_regr)
    bbox_pred = plus(bbox_raw, b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def test_data_type_inference():
    """Check that a parameter with unknown dtype takes the dtype of its operand.

    The parameter reports ``DataType_Unknown`` until it is used in a
    ``times`` with a float64 input; afterwards its dtype is float64.
    """
    unknown_type = C.cntk_py.DataType_Unknown

    x_float = C.input_variable((1,), dtype=np.float64)
    param1 = C.parameter((C.InferredDimension, 1), init=C.glorot_uniform(), dtype=unknown_type)
    assert param1.get_data_type() == C.cntk_py.DataType_Unknown

    # Only building the node matters here; its value is never evaluated.
    C.times(x_float, param1)
    assert param1.dtype == np.float64
def session(is_sparse):
    """Run a fixed sequence of training steps and return the final weights.

    Args:
        is_sparse: Whether the input variable is declared sparse.

    Returns:
        The value of the weight parameter after all minibatches.

    Note:
        Uses ``learner``, ``checkpoint`` and ``tmpdir`` from the enclosing
        scope. When ``checkpoint`` is truthy, a checkpoint is saved after
        the first three steps, three more steps are run, and the checkpoint
        is restored before the final three steps.
    """
    x = C.input_variable((200,), is_sparse=is_sparse)
    w = C.parameter((200, 100))
    y = C.times(x, w)

    # Half zeros, half ones, shuffled by a deterministic index sequence.
    mask = [0] * 100 + [1] * 100
    for i in range(200):
        j = (3 * i * i + 5 * i + 1) % 200  # just a random looking index
        mask[i], mask[j] = mask[j], mask[i]

    import scipy.sparse
    x11 = scipy.sparse.csr_matrix(np.array([1] * 200).astype('f'))
    x01 = scipy.sparse.csr_matrix(np.array(mask).astype('f'))

    t = C.Trainer(y, y, learner(y.parameters))

    def run_steps(samples):
        # One minibatch per sample, in the given order.
        for sample in samples:
            t.train_minibatch({x: [sample]})

    w.value = 0 * w.value
    run_steps((x11, x01, x01))
    if checkpoint:
        t.save_checkpoint(str(tmpdir.join('checkpoint')))
        run_steps((x11, x01, x01))
        t.restore_from_checkpoint(str(tmpdir.join('checkpoint')))
    run_steps((x01, x01, x11))
    return w.value
def create_fast_rcnn_predictor(conv_out, rois, fc_layers, cfg):
    """Build the Fast R-CNN classification and box-regression heads.

    Args:
        conv_out: Node fed into ROI pooling together with ``rois``.
        rois: Node holding the regions of interest.
        fc_layers: Callable applied to the ROI-pooled output.
        cfg: Configuration mapping; ``cfg["MODEL"].ROI_DIM`` and
            ``cfg["DATA"].NUM_CLASSES`` are read.

    Returns:
        Tuple ``(cls_score, bbox_pred)``.
    """
    roi_dim = cfg["MODEL"].ROI_DIM
    num_classes = cfg["DATA"].NUM_CLASSES

    # RCNN
    roi_out = roipooling(conv_out, rois, cntk.MAX_POOLING, (roi_dim, roi_dim), spatial_scale=1/16.0)
    fc_out = fc_layers(roi_out)

    # prediction head
    W_pred = parameter(shape=(4096, num_classes), init=normal(scale=0.01), name="cls_score.W")
    b_pred = parameter(shape=num_classes, init=0, name="cls_score.b")
    cls_logits = times(fc_out, W_pred)
    cls_score = plus(cls_logits, b_pred, name='cls_score')

    # regression head
    W_regr = parameter(shape=(4096, num_classes*4), init=normal(scale=0.001), name="bbox_regr.W")
    b_regr = parameter(shape=num_classes*4, init=0, name="bbox_regr.b")
    bbox_raw = times(fc_out, W_regr)
    bbox_pred = plus(bbox_raw, b_regr, name='bbox_regr')

    return cls_score, bbox_pred
def create_model(self):
    """Create a sparse-input projection model and store it on the instance.

    Sets ``self.input_dim``, ``self.embed_dim``, the parameter ``self.p``
    (initialised to 1) and ``self.z``, the ``reduce_sum`` of the product of
    the input and ``self.p``.
    """
    self.input_dim = 1000
    self.embed_dim = 30

    sparse_in = C.input_variable((self.input_dim,), is_sparse=True)
    self.p = C.parameter(shape=(self.input_dim, self.embed_dim), init=1)
    projected = C.times(sparse_in, self.p)
    self.z = C.reduce_sum(projected)
def test_large_model_serialization_double(tmpdir):
    """Save and reload a float64 model whose parameter exceeds 2 GiB.

    Checks that the saved file is larger than 2 GiB and that the reloaded
    function has parameters of the same shape and value as the original.
    """
    import os

    two_gb = 2**31
    type_size = np.dtype(np.float64).itemsize
    # Floor division keeps ``size`` an int; on Python 3 ``/`` would yield a
    # float, which is not a valid tensor dimension.
    size = two_gb // type_size + 10
    assert size * type_size > two_gb

    device = C.device.cpu()
    i = C.sequence.input(size, dtype=np.float64)
    w = C.Parameter((size,), dtype=np.float64,
                    init=C.uniform(3.0, seed=12345), device=device)
    z = C.times(i, w)

    filename = str(tmpdir / 'test_large_model_serialization_double.out')
    z.save(filename)

    assert os.path.getsize(filename) > two_gb

    y = C.Function.load(filename, device=device)

    assert len(z.parameters) == len(y.parameters)

    for original, loaded in zip(z.parameters, y.parameters):
        assert original.shape == loaded.shape
        assert np.allclose(original.value, loaded.value)
def linear_layer(input_var, output_dim):
    """Return ``bias + times(input_var, weights)`` with freshly created parameters.

    Args:
        input_var: Input node; its first shape entry is used as the input
            dimension.
        output_dim: Output dimension of the layer.
    """
    in_features = input_var.shape[0]

    times_param = C.parameter(shape=(in_features, output_dim))
    bias_param = C.parameter(shape=(output_dim))

    projected = C.times(input_var, times_param)
    return bias_param + projected
def attention_pooling(inputs, inputs_mask, inputs_weights, decode, decode_weights, keys):
    """Pool ``inputs`` into one vector using softmax attention scores.

    Shapes as stated by the original author (not checked by the code):
        inputs:         shape=(n, dim)
        inputs_weights: shape=(dim, dim)
        decode:         shape=(1, dec_dim)
        decode_weights: shape=(dec_dim, dim)
        keys:           shape=(dim, 1)

    ``inputs_mask`` is the condition passed to ``element_select``: where it
    is true the score input is kept, otherwise -1e+30 is used.

    Returns:
        ``reduce_sum(inputs * scores, axis=0)``.
    """
    w_in = C.times(inputs, inputs_weights)  # shape=(n, dim)
    # NOTE(review): the original comment gave shape=(dim, 1); with the
    # shapes documented above this product would be (1, dim) - confirm.
    w_dec = C.times(decode, decode_weights)

    hidden = C.tanh(w_in + C.sequence.broadcast_as(w_dec, w_in))  # shape=(n, dim)
    masked = C.element_select(inputs_mask, hidden, C.constant(-1e+30))
    scores = C.times(masked, keys)  # shape=(n) per the original comment
    weights = C.ops.sequence.softmax(scores, name="softmax")

    attention = C.reduce_sum(inputs * weights, axis=0)
    return attention
def _sparse_to_dense_network_cache(input_shape, is_sequence, device):
    """Build a network that multiplies a sparse input by an identity matrix.

    Args:
        input_shape: Shape of the sparse input; its last entry sets the size
            of the identity matrix.
        is_sequence: If true, the input is a sequence input variable.
        device: Not used by this function.

    Returns:
        The ``times`` node.
    """
    make_input = C.sequence.input_variable if is_sequence else C.input_variable
    temp_input = make_input(input_shape, is_sparse=True)

    identity = np.eye(input_shape[-1])
    return C.times(temp_input, identity)
def _to_dense(val, is_sequence=False):
    """Evaluate ``val`` through an identity ``times`` and return the result.

    Args:
        val: Value to convert; its ``shape`` and ``device`` are read.
        is_sequence: If true, the first two entries of ``val.shape`` are
            dropped for the input shape (otherwise only the first) and a
            sequence input variable is used.
    """
    if is_sequence:
        sparse_in = C.sequence.input_variable(val.shape[2:], is_sparse=True)
    else:
        sparse_in = C.input_variable(val.shape[1:], is_sparse=True)

    identity = C.constant(value=np.eye(val.shape[-1], dtype=np.float32))
    to_dense_fn = C.times(sparse_in, identity)
    return to_dense_fn.eval({sparse_in: val}, device=val.device)
def linear_layer(input_var, output_dim):
    """Return ``times(input_var, W) + b`` and record both parameters.

    The new weight and bias parameters are stored in ``param_dict`` (from
    the enclosing scope) under the keys 'w' and 'b'.

    Args:
        input_var: Input node; its first shape entry is used as the input
            dimension.
        output_dim: Output dimension of the layer.
    """
    in_features = input_var.shape[0]

    weight_param = C.parameter(shape=(in_features, output_dim))
    bias_param = C.parameter(shape=(output_dim))

    param_dict['w'] = weight_param
    param_dict['b'] = bias_param

    projected = C.times(input_var, weight_param)
    return projected + bias_param
def test_nce_backward_indices(classes, xdim, batch, expected_value, device_id, precision):
    """
    Simple test that makes sure that the derivatives have the correct sparsity pattern

    A baseline count of which classes ``C.random_sample`` hits is collected
    over several trials. The same count is then collected from the non-zero
    entries of the bias gradient of ``C.nce_loss`` and compared per class.
    """
    # ignore precision, only sparsity pattern matters for this test
    dt = np.float32

    # NOTE(review): nce_loss is imported but the call below goes through
    # C.nce_loss; kept as in the original.
    from cntk.losses import nce_loss
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.sparse`` is loaded.
    import scipy.sparse

    trials = 10

    # Establish baseline
    expected_count = np.zeros(classes)
    I = C.constant(np.eye(classes, dtype=dt))
    q = np.arange(classes, dtype=dt) + 1
    z = C.reduce_sum(C.times(C.random_sample(q, 32, True, seed=98052), I), axis=0)
    for i in range(trials):
        expected_count[np.nonzero(z.eval().ravel())] += 1

    # Set things up to measure the same thing with nce_loss
    x = C.input_variable(xdim, needs_gradient=True)
    y = C.input_variable(classes, is_sparse=True)

    x0 = np.arange(batch * xdim, dtype=dt).reshape((batch, xdim))/(batch * xdim)
    data = np.ones(batch, dtype=dt)
    indices = list(range(10, 10*batch+1, 10))
    indptr = list(range(batch+1))
    y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))

    b = C.parameter((classes, 1))
    W = C.parameter((classes, C.InferredDimension))

    gb = np.zeros(classes)
    # The bias gradient is requested with as_numpy=False; it is fed through
    # a times-by-identity node to obtain an array.
    vb = C.input_variable((classes, 1), dtype=dt)
    Ib = C.constant(np.eye(1, dtype=dt))
    zb = C.times(vb, Ib)

    loss = C.nce_loss(W, b, x, y, q, seed=98052)
    for i in range(trials):
        v = loss.grad({x: x0, y: y0}, wrt=loss.parameters, as_numpy=False)
        gb[np.nonzero(zb.eval({vb: v[b]}).ravel())] += 1

    # A class in ``indices`` (a label column of y0) may instead have a
    # non-zero gradient in every trial.
    for i in range(classes):
        assert gb[i] == expected_count[i] or (i in indices and gb[i] == trials)
def test_2d_sparse_csr_batch_input(device_id):
    """Feed a batch of 2x3 CSR matrices through ``times`` and check the result."""
    dev = cntk_device(device_id)

    features = C.input_variable((2, 3), is_sparse=True)
    w_init = np.asarray([[0.5, 1], [-.5, 2], [1., 1.5]], dtype=np.float32)
    w = C.parameter(init=w_init, device=dev)
    t = C.times(features, w)

    # Every row is one-hot, so each output row is the matching row of w.
    features_data = [
        sp.sparse.csr_matrix(np.asarray([[1., 0., 0.], [0., 1., 0.]], dtype=np.float32)),
        sp.sparse.csr_matrix(np.asarray([[0., 0., 1.], [1., 0., 0.]], dtype=np.float32)),
    ]
    result = t.eval({features: features_data}, device=dev)

    expected = [[[.5, 1], [-.5, 2]], [[1, 1.5], [.5, 1]]]
    assert np.array_equal(result, expected)
def returnFunction():
    """Build ``times(placeholder, right_val)`` and return it with the placeholder replaced.

    The return value is whatever ``replace_placeholders`` returns.
    """
    left_val = [[10, 2]]
    right_val = [[2], [3]]

    p = placeholder(shape=(1, 2))
    op = times(p, right_val)
    c = constant(left_val)

    substitutions = {p: c}
    return op.replace_placeholders(substitutions)
def test_eval_sparse_dense(tmpdir, device_id):
    """Check that reader, CSR and one-hot inputs evaluate to the same result.

    The same two sequences are fed to a ``times``-by-identity node in three
    ways: read from a CTF file by a minibatch source, as CSR matrices, and
    as a ``Value.one_hot`` batch.
    """
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import input_variable, times

    input_vocab_dim = label_vocab_dim = 69

    # Two sequences (ids 0 and 2); stream S0 holds the features, S1 the labels.
    ctf_data = '''\
0 |S0 3:1 |# <s> |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features = StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels = StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True)
    )), randomize=False, epoch_size = 2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
            input_map={raw_input : mbs.streams.features},
            device=cntk_device(device_id))

    # Multiplying by the identity gives a dense result that can be compared.
    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=cntk_device(device_id))

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
            [3, 4, 5, 4, 7, 12, 1],
            [60, 61]
            ]
    data = [csr(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data, num_classes=input_vocab_dim, device=cntk_device(device_id))
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a,b in zip(e_reader, e_hot)])
def test_op_gather_grad(device_id):
    """Compare the parameter gradient of ``gather`` with a one-hot ``times``."""
    dim = 10

    ii = C.sequence.input_variable(())
    param_init = np.reshape(np.arange(dim), (dim, 1)).astype(np.float32)
    param = C.parameter((dim, 1), init=param_init)

    data = [[0], [0, 1, 2], [1, 2, 3, 4, 5, 6]]

    # Gradient through gather ...
    gathered = C.gather(param, ii)
    grad1 = gathered.grad(data, wrt=[param])

    # ... and through the dense one-hot product.
    one_hot_rows = C.one_hot(ii, num_classes=dim, sparse_output=False)
    looked_up = C.times(one_hot_rows, param)
    grad2 = looked_up.grad(data, wrt=[param])

    assert np.array_equal(grad1, grad2)
def test_ext_eval_5_times():
    """Evaluate a user function (``MyPlus``) that feeds into ``times``."""
    dim = 2
    p_init = 10

    p = C.parameter(shape=(dim,), init=p_init, name='p')
    m = C.user_function(MyPlus(p, C.constant(3)))
    weights = C.parameter(shape=(2, 50), init=2)
    z = C.times(m, weights)

    result = z.eval()

    # No batch dimension since we have no input
    expected = ((p_init * np.ones_like(result)) + 3) * 2 * 2
    assert np.allclose(result, expected)
def LSTMCell(x, y, dh, dc):
    '''LightLSTM Cell

    Applies the same LSTM step twice with shared parameters: first to ``x``
    with the previous state ``(dh, dc)``, then to ``y`` with the state
    produced by the first step.

    Returns ``(ht, ct, ht2, ct2)``: hidden and cell state after each step.
    Reads ``cell_dim`` and ``input_dim`` from the enclosing scope.
    '''
    b = C.parameter(shape=(4 * cell_dim), init=0)
    W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
    H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

    def step(inp, prev_h, prev_c):
        # projected contribution from the input, hidden state, and bias
        proj4 = b + C.times(inp, W) + C.times(prev_h, H)

        # Four equally sized chunks along the last axis.
        it_proj, bit_proj, ft_proj, ot_proj = [
            C.slice(proj4, -1, k * cell_dim, (k + 1) * cell_dim)
            for k in range(4)
        ]

        in_gate = C.sigmoid(it_proj)  # input gate
        gated_input = in_gate * C.tanh(bit_proj)

        forget_gate = C.sigmoid(ft_proj)  # forget gate
        kept_cell = forget_gate * prev_c

        new_c = kept_cell + gated_input

        out_gate = C.sigmoid(ot_proj)  # output gate
        new_h = out_gate * C.tanh(new_c)
        return new_h, new_c

    ht, ct = step(x, dh, dc)
    ht2, ct2 = step(y, ht, ct)
    return (ht, ct, ht2, ct2)
def test_eval_sparse_no_seq(batch_index_data, device_id):
    """Evaluate CSR input through ``times`` for sparse and dense input variables."""
    dim = 10
    multiplier = 2

    for var_is_sparse in (True, False):
        in1 = input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier*np.eye(dim))

        # One-hot rows selected by the index data.
        batch = np.eye(dim)[batch_index_data]
        expected = batch * multiplier

        sparse_val = csr(batch.astype('f'))
        result = z.eval({in1: [sparse_val]}, device=cntk_device(device_id))

        assert np.allclose(result, [expected])
def test_op_scatter_sparse(device_id):
    """Scatter the last element of each sparse sequence onto its first position.

    The expected result per sequence has the last element in row 0 and
    zeros in every other row.
    """
    input_sparse_indices = [[1, 3, 5, 5], [2, 4], [0, 2]]
    vocab_size = 6

    input_data = Value.one_hot(input_sparse_indices, vocab_size)

    a = C.sequence.input_variable(shape=(vocab_size,), is_sparse=True, name='a')

    scattered = C.sequence.scatter(C.sequence.last(a), C.sequence.is_first(a))
    scattered_dense = C.times(scattered, np.eye(vocab_size))
    res = scattered_dense.eval({a: input_data})

    expected_per_sequence = [
        [[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
        [[0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0]],
        [[0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
    ]
    assert np.array_equal(res[0], np.asarray(expected_per_sequence[0]))
    assert np.array_equal(res[1], np.asarray(expected_per_sequence[1]))
    assert np.array_equal(res[2], np.asarray(expected_per_sequence[2]))
def func(x_var):
    """Wrap ``update * T + (1 - T) * x`` in a block named 'SingleInner'.

    ``T`` is a sigmoid transform gate and ``update`` a tanh layer; each has
    its own weight and bias. ``dim``, ``name`` and the initializers come
    from the enclosing scope.
    """
    x = C.placeholder()
    weight_shape = (dim, dim,)

    WT = C.Parameter(weight_shape, init=transform_weight_initializer, name=name + '_WT')
    bT = C.Parameter(dim, init=transform_bias_initializer, name=name + '_bT')
    WU = C.Parameter(weight_shape, init=update_weight_initializer, name=name + '_WU')
    bU = C.parameter(dim, init=update_bias_initializer, name=name + '_bU')

    transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
    update = C.tanh(C.times(x, WU, name=name + '_U') + bU)

    mixed = update * transform_gate + (1 - transform_gate) * x
    return C.as_block(mixed,
                      [(x, x_var)],
                      'SingleInner',
                      'SingleInner' + name)
def cross_entropy_with_sampled_softmax(
        hidden_vector,
        label_vector,
        vocab_dim,
        hidden_dim,
        num_samples,
        sampling_weights,
        allow_duplicates = False
        ):
    """Build the sampled-softmax criterion nodes.

    Args:
        hidden_vector: Node multiplied with the softmax weights.
        label_vector: Node selecting the weights, bias and prior of the
            true label.
        vocab_dim: Number of rows of the weight and bias parameters.
        hidden_dim: Number of columns of the weight parameter.
        num_samples: Passed to ``C.random_sample``.
        sampling_weights: Passed to ``C.random_sample``.
        allow_duplicates: Passed to ``C.random_sample``.

    Returns:
        Tuple ``(z, cross_entropy_on_samples, error_on_samples)``. ``z`` is
        the input for the full softmax.
    """
    bias = C.layers.Parameter(shape = (vocab_dim, 1), init = 0)
    weights = C.layers.Parameter(shape = (vocab_dim, hidden_dim), init = C.initializer.glorot_uniform())

    sample_selector = C.random_sample(sampling_weights, num_samples, allow_duplicates)

    inclusion_probs = C.random_sample_inclusion_frequency(sampling_weights, num_samples, allow_duplicates)
    log_prior = C.log(inclusion_probs)

    # Scores of the sampled classes.
    wS = C.times(sample_selector, weights, name='wS')
    zS = (C.times_transpose(wS, hidden_vector, name='zS1')
          + C.times(sample_selector, bias, name='zS2')
          - C.times_transpose(sample_selector, log_prior, name='zS3'))

    # Getting the weight vector for the true label. Dimension hidden_dim
    wT = C.times(label_vector, weights, name='wT')
    zT = (C.times_transpose(wT, hidden_vector, name='zT1')
          + C.times(label_vector, bias, name='zT2')
          - C.times_transpose(label_vector, log_prior, name='zT3'))

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape = (vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
def test_eval_one_hot_seq(one_hot_batch, device_id):
    """Evaluate one-hot sequences through ``times`` for sparse and dense inputs."""
    dim = 10
    multiplier = 2

    for var_is_sparse in (True, False):
        in1 = sequence.input_variable(shape=(dim,), is_sparse=var_is_sparse)

        # Convert CNTK node value to dense so that we can compare it later
        z = times(in1, np.eye(dim)*multiplier)

        # Convert expectation to dense
        expected = [np.eye(dim)[indices]*multiplier for indices in one_hot_batch]

        batch = Value.one_hot(one_hot_batch, num_classes=dim, device=cntk_device(device_id))
        result = z.eval({in1: batch}, device=cntk_device(device_id))

        matches = [np.allclose(got, want) for got, want in zip(result, expected)]
        assert np.all(matches)
def createNetwork(self, inputEmb, preHidden, preMem):
    """Build one LSTM step and return ``(CurH, CurMem)``.

    The input and hidden projections are each split into four chunks of
    ``self.hiddenSize`` along the last axis. Chunks 0, 1 and 2 go through
    a sigmoid, chunk 3 through tanh.

    Args:
        inputEmb: Node multiplied with ``self.W`` (plus ``self.Wb``).
        preHidden: Node multiplied with ``self.U`` (plus ``self.Ub``).
        preMem: Previous memory, scaled by the third sigmoid chunk.
    """
    WX = C.times(inputEmb, self.W) + self.Wb
    UH = C.times(preHidden, self.U) + self.Ub

    def chunk_sum(k):
        # Sum of the k-th hiddenSize-wide slice of both projections.
        begin = self.hiddenSize * k
        end = self.hiddenSize * (k + 1)
        return C.slice(WX, -1, begin, end) + C.slice(UH, -1, begin, end)

    I = C.sigmoid(chunk_sum(0))
    O = C.sigmoid(chunk_sum(1))
    F = C.sigmoid(chunk_sum(2))
    N = C.tanh(chunk_sum(3))

    NI = C.element_times(N, I)
    FM = C.element_times(F, preMem)
    CurMem = NI + FM
    CurH = C.element_times(C.tanh(CurMem), O)
    return (CurH, CurMem)
def test_disallow_seq_starts_with_Value_objects():
    """Expect ValueError when a ``Value`` batch is paired with sequence-start flags.

    Two ways of attaching the flags are tried: alongside the argument
    dict, and as a tuple inside it.
    """
    one_hot_batch = [[2, 5], [0, 1, 6]]
    dim = 10

    in1 = input(shape=(dim, ), is_sparse=True)
    z = times(in1, np.eye(dim))
    batch = Value.one_hot(one_hot_batch, num_classes=dim)

    # Flags passed next to the argument dict.
    with pytest.raises(ValueError):
        result = z.eval(({in1: batch}, len(batch) * [True]))

    # Flags passed together with the batch inside the dict.
    with pytest.raises(ValueError):
        result = z.eval({in1: (batch, len(batch) * [True])})
def func(x_var):
    """Wrap ``x + T * (update - x)`` in a block named 'HighwayBlock'.

    ``T`` is a sigmoid transform gate and ``update`` a ReLU layer; each has
    its own weight and bias. ``dim``, ``name`` and the initializers come
    from the enclosing scope.
    """
    x = C.placeholder()
    weight_shape = (dim, dim,)

    WT = C.Parameter(weight_shape, init=transform_weight_initializer, name=name + '_WT')
    bT = C.Parameter(dim, init=transform_bias_initializer, name=name + '_bT')
    WU = C.Parameter(weight_shape, init=update_weight_initializer, name=name + '_WU')
    bU = C.Parameter(dim, init=update_bias_initializer, name=name + '_bU')

    transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
    update = C.relu(C.times(x, WU, name=name + '_U') + bU)

    highway_out = x + transform_gate * (update - x)
    return C.as_block(highway_out,
                      [(x, x_var)],
                      'HighwayBlock',
                      'HighwayBlock' + name)
def test_gather_implementation_using_one_hot_and_times():
    """Look up rows of a parameter via sparse ``one_hot`` followed by ``times``.

    Each index in the input must yield the row of ``w`` at that index.
    """
    num_classes = 4
    w_init = np.asarray([[0, 1], [2, 3], [4, 5], [6, 7]]).astype(np.float32)
    w = C.parameter(init=w_init)

    x = C.input_variable((2, ))
    sparse_one_hot = C.one_hot(x, num_classes, sparse_output=True)
    lookup = C.times(sparse_one_hot, w)

    indices = np.asarray([[0, 3], [2, 1]], dtype=np.float32)
    result = lookup.eval({x: indices})

    expected_result = np.asarray([[[0., 1.], [6., 7.]],
                                  [[4., 5.], [2., 3.]]])
    assert np.array_equal(result, expected_result)
def test_op_times_reduce_sequence_axis(device_id, precision):
    """Check ``times`` with the reduce-sequence-axis flag on one-hot input.

    The forward result is compared against a hand-written expectation, and
    the dynamic axes are compared with those of
    ``sequence.reduce_sum(times(...))``.
    """
    dt_precision = PRECISION_TO_TYPE[precision]

    from cntk import times, Value, TIMES_REDUCE_SEQUENCE_AXIS_WITHOUT_INFERRED_INPUT_RANK
    from cntk import sequence

    dim = 10
    seq = [[0, 1, 2], [3], [4, 5, 6, 7, 8, 9]]

    right_data = Value.one_hot(seq, dim, dtype=dt_precision)
    right_var = sequence.input_variable(shape=(dim), is_sparse=True, dtype=dt_precision)

    # One vector of ones per sequence, as long as that sequence.
    left_data = [AA([1] * len(indices), dtype=dt_precision) for indices in seq]
    left_var = sequence.input_variable(shape=(1), dtype=dt_precision)

    reduce_flag = TIMES_REDUCE_SEQUENCE_AXIS_WITHOUT_INFERRED_INPUT_RANK
    func = times(left_var, right_var, infer_input_rank_to_map=reduce_flag)
    func2 = sequence.reduce_sum(times(left_var, right_var))

    assert func.dynamic_axes == func2.dynamic_axes

    _, forward_output = func.forward({
        left_var: left_data,
        right_var: right_data,
    })
    actual_forward = forward_output[func.output]

    # Per sequence: a single row with a 1 at every index in that sequence.
    expected_forward = AA([
        [[1 if col in indices else 0 for col in range(dim)]]
        for indices in seq
    ])

    assert np.allclose(actual_forward, expected_forward)
def test_debug_multi_output():
    """Run ``debug_model`` on a ``times`` node fed by a two-output ``combine``.

    After one training run the recorded debugger output must have eight
    entries: a 'forward' header with three lines and a 'backward' header
    with three lines.
    """
    input_dim = 2
    num_output_classes = 2

    f_input = input_variable(input_dim, np.float32, needs_gradient=True, name='features')
    p = parameter(shape=(input_dim, ), init=10, name='p')

    comb = combine([f_input, p])

    ins = InStream(['n', 'n', 'n', 'n', 'n'])
    outs = OutStream()

    z = times(comb.outputs[0], comb.outputs[1], name='z')
    z = debug_model(z, ins, outs)

    l_input = input_variable(num_output_classes, np.float32, name='labels')
    loss = cross_entropy_with_softmax(z, l_input)
    eval_error = classification_error(z, l_input)

    _train(z, loss, eval_error,
           loss.find_by_name('features'),
           loss.find_by_name('labels'),
           num_output_classes, 1)

    # outs.written contains something like
    # =================================== forward ===================================
    # Parameter('p', [], [2]) with uid 'Parameter4'
    # Input('features', [#, *], [2]) with uid 'Input3'
    # Times: Output('UserDefinedFunction12_Output_0', [#, *], [2]), Output('UserDefinedFunction15_Output_0', [], [2]) -> Output('z', [#, *], [2 x 2]) with uid 'Times21'
    # =================================== backward ===================================
    # Times: Output('UserDefinedFunction12_Output_0', [#, *], [2]), Output('UserDefinedFunction15_Output_0', [], [2]) -> Output('z', [#, *], [2 x 2]) with uid 'Times21'
    # Input('features', [#, *], [2]) with uid 'Input3'
    # Parameter('p', [], [2]) with uid 'Parameter4'

    # NOTE(review): out_stuff is not defined in this function; presumably a
    # module-level reference value - confirm.
    assert outs.written == out_stuff

    assert len(outs.written) == 8

    v_p = "Parameter('p', "
    v_i = "Input('features'"
    v_t = 'Times: '

    forward_header = outs.written[0]
    assert forward_header.startswith('=') and 'forward' in forward_header
    line_1, line_2, line_3 = outs.written[1:4]

    backward_header = outs.written[4]
    assert backward_header.startswith('=') and 'backward' in backward_header
    line_5, line_6, line_7 = outs.written[5:8]
    assert line_5.startswith(v_t)
    # The parameter and input lines may appear in either order.
    assert (line_6.startswith(v_p) and line_7.startswith(v_i)) or \
           (line_6.startswith(v_i) and line_7.startswith(v_p))
def test_to_sequence_backprop(device_id):
    """Compare gradients of a sequence model against a ``to_sequence`` clone.

    A small Embedding -> LSTM -> Dense model is trained on a native
    sequence input.  A clone sharing the same parameters is then fed a
    padded non-sequence tensor plus explicit sequence lengths, converted
    with ``C.to_sequence``.  Losses and all dense parameter gradients of
    the two variants must be exactly equal.
    """
    dev = cntk_device(device_id)

    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2

    x_seq_input = C.sequence.input_variable(
        input_vocab_size, is_sparse=True, name='features')

    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        model = C.layers.Recurrence(
            C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    label_seq_input = C.sequence.input_variable(
        num_labels, is_sparse=True, name='labels')
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[0, 1, 1], [0, 1, 0], [1, 0, 0]]
    seq2_data = [[0, 0, 1], [0, 1, 1]]
    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]

    # Reference run on the genuine sequence input.
    reference_feed = {
        x_seq_input: [_to_csr(seq1_data), _to_csr(seq2_data)],
        label_seq_input: label_seq_data,
    }
    param_grads_1, loss_result_1 = ce.grad(
        reference_feed, wrt=ce.parameters, outputs=[ce], as_numpy=False)

    # Create a clone of the model that uses a non-sequence input
    # and converts it to a sequence using to_sequence
    x_non_seq_input = C.input_variable(
        (C.FreeDimension, input_vocab_size), is_sparse=True,
        name='non_seq_features')
    x_seq_lens = C.input_variable((), name='sequence_lengths')
    x_seq = C.to_sequence(x_non_seq_input, x_seq_lens)
    x_seq = C.reconcile_dynamic_axes(
        C.times(x_seq, np.eye(input_vocab_size, dtype=np.float32)),
        label_seq_input)
    ce_clone = ce.clone('share', {x_seq_input: x_seq})

    # The shorter sequence is padded with an all-zero row to length 3.
    x_non_seq_data = C.NDArrayView.from_csr(
        _to_csr([seq1_data, seq2_data + [[0, 0, 0]]]), shape=(2, 3, 3))
    x_seq_lens_data = np.asarray([3, 2], dtype=np.float32)

    # Look the inputs up again by name among the clone's arguments.
    x_non_seq_input = next(
        argument for argument in ce_clone.arguments
        if argument.name == 'non_seq_features')
    label_seq_input = next(
        argument for argument in ce_clone.arguments
        if argument.name == 'labels')
    x_seq_lens = next(
        argument for argument in ce_clone.arguments
        if argument.name == 'sequence_lengths')

    clone_feed = {
        x_non_seq_input: x_non_seq_data,
        x_seq_lens: x_seq_lens_data,
        label_seq_input: label_seq_data,
    }
    param_grads_2, loss_result_2 = ce_clone.grad(
        clone_feed, wrt=ce_clone.parameters, outputs=[ce_clone],
        as_numpy=False)

    assert np.array_equal(loss_result_1.as_sequences()[0],
                          loss_result_2.as_sequences()[0])
    assert np.array_equal(loss_result_1.as_sequences()[1],
                          loss_result_2.as_sequences()[1])

    for param in param_grads_1:
        # Sparse gradients are skipped; only dense ones are compared.
        if param_grads_1[param].is_sparse:
            continue
        reference_grad_value = param_grads_1[param].asarray()
        grad_value = param_grads_2[param].asarray()
        assert np.array_equal(reference_grad_value, grad_value)
def test_eval_sparse_dense(tmpdir, device_id):
    """Evaluate the same sparse input supplied three different ways.

    The input is fed (1) through a CTF minibatch reader, (2) as a list of
    CSR matrices and (3) as a ``Value.one_hot`` batch.  Multiplying each by
    an identity matrix must give matching results.
    """
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.ops import times

    input_vocab_dim = label_vocab_dim = 69

    # NOTE(review): one record per line; the separators between fields may
    # have been tabs in the original file -- confirm.
    ctf_data = '''\
0 |S0 3:1 |# <s> |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    stream_defs = StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True))
    mbs = MinibatchSource(CTFDeserializer(ctf_file, stream_defs),
                          randomize=False, max_samples=2)

    raw_input = sequence.input_variable(
        shape=input_vocab_dim, sequence_axis=Axis('inputAxis'),
        name='raw_input', is_sparse=True)

    device = cntk_device(device_id)
    mb_valid = mbs.next_minibatch(
        minibatch_size_in_samples=100,
        input_map={raw_input: mbs.streams.features},
        device=device)

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid, device=device)

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
        [3, 4, 5, 4, 7, 12, 1],
        [60, 61]
    ]
    identity = np.eye(input_vocab_dim, dtype=np.float32)
    data = [csr(identity[indices]) for indices in one_hot_data]
    e_csr = z.eval({raw_input: data}, device=device)
    assert all(np.allclose(a, b) for a, b in zip(e_reader, e_csr))

    # One-hot with the raw_input encoding in ctf_data
    data = Value.one_hot(one_hot_data, num_classes=input_vocab_dim,
                         device=device)
    e_hot = z.eval({raw_input: data}, device=device)
    assert all(np.allclose(a, b) for a, b in zip(e_reader, e_hot))
def GenMatMul_1k():
    """Generate and save a 1024x1024 matrix-multiplication test case.

    Builds ``times(feature, parameter)`` with a Glorot-uniform initialised
    weight, evaluates it on one random float32 sample and hands the model
    together with its input and output to ``Save``.
    """
    side = 1024
    feature = C.input_variable((side, side), np.float32)
    weight = C.parameter((side, side), init=C.glorot_uniform())
    model = C.times(feature, weight)

    # Leading 1 is the batch dimension of the single random sample.
    sample_shape = (1,) + tuple(feature.shape)
    data_feature = np.random.rand(*sample_shape).astype(np.float32)
    data_output = model.eval(data_feature)

    Save("test_MatMul_1k", model, data_feature, data_output)
def __init__(self):
    """Build a sparse-input embedding model and its distributed trainer.

    Stores the dimensions on the instance, creates an all-ones parameter
    ``self.p`` of shape ``(input_dim, embed_dim)`` and a ``C.Trainer``
    (``self.trainer``) that minimises the sum of ``times(input, p)`` with
    a data-parallel distributed SGD learner.
    """
    self.input_dim = 40000
    self.embed_dim = 100
    self.batch_size = 20

    sparse_in = C.input_variable((self.input_dim, ), is_sparse=True)
    self.p = C.parameter(shape=(self.input_dim, self.embed_dim), init=1)
    z = C.reduce_sum(C.times(sparse_in, self.p))

    # Per-sample learning rate of 0.01 for plain SGD.
    lr = C.learning_rate_schedule(0.01, unit=C.learners.UnitType.sample)
    sgd = C.sgd(z.parameters, lr)
    learner = C.data_parallel_distributed_learner(sgd)

    self.trainer = C.Trainer(z, (z, None), learner, [])
def createNetwork(self, length):
    """Unroll the target-side decoder and accumulate the masked loss.

    For steps ``0 .. length - 2`` the hidden state is projected with
    ``self.Wt``/``self.Wtb``, scored against the next target step with
    cross entropy, and weighted by the step's mask row.  Step 0 uses
    ``self.firstHidden`` as both hidden and memory state; later steps
    come from ``self.Decoder.createNetwork``.

    Returns the accumulated loss node, or the integer ``0`` when
    ``length`` is 1 or less.
    """
    inputTrg = C.reshape(
        self.inputMatrixTrg,
        shape=(Config.TrgMaxLength, Config.BatchSize, Config.TrgVocabSize))

    total_ce = 0
    hidden = None
    memory = None
    for step in range(length - 1):
        if step == 0:
            hidden = self.firstHidden
            memory = hidden
        else:
            # Only the previous step's state is needed to advance.
            hidden, memory = self.Decoder.createNetwork(
                self.Emb(inputTrg[step]), hidden, memory)

        preSoftmax = C.times(hidden, self.Wt) + self.Wtb
        ce = C.cross_entropy_with_softmax(preSoftmax, inputTrg[step + 1], 2)

        ce_row = C.reshape(ce, shape=(1, Config.BatchSize))
        mask_col = C.reshape(self.maskMatrixTrg[step],
                             shape=(Config.BatchSize, 1))
        total_ce += C.times(ce_row, mask_col)

    return total_ce
def train_eval_logistic_regression_from_file(criterion_name=None,
                                             eval_name=None, device_id=-1):
    """Train and test a 3-class logistic regression from text files.

    Reads ``Train-3Classes.txt`` and ``Test-3Classes.txt`` from this
    module's directory, trains ``W * X + b`` with cross entropy for three
    epochs and evaluates on the test file.

    Args:
        criterion_name: name assigned to the cross-entropy node.
        eval_name: name assigned to the square-error evaluation node.
        device_id: value assigned to the execution context's ``device_id``.

    Returns:
        Whatever ``ctx.test`` returns for the criterion and eval nodes.
    """
    cur_dir = os.path.dirname(__file__)

    # Using data from https://github.com/Microsoft/CNTK/wiki/Tutorial
    train_file = os.path.join(cur_dir, "Train-3Classes.txt")
    test_file = os.path.join(cur_dir, "Test-3Classes.txt")

    X = C.input(2)
    y = C.input(3)

    W = C.parameter(value=np.zeros(shape=(3, 2)))
    b = C.parameter(value=np.zeros(shape=(3, 1)))

    out = C.times(W, X) + b
    out.tag = 'output'

    ce = C.cross_entropy_with_softmax(y, out)
    ce.name = criterion_name
    ce.tag = 'criterion'

    # Named `eval_node` (not `eval`) so the builtin is not shadowed.
    eval_node = C.ops.square_error(y, out)
    eval_node.tag = 'eval'
    eval_node.name = eval_name

    # training data readers
    train_reader = C.CNTKTextFormatReader(train_file, randomize=None)

    # testing data readers
    test_reader = C.CNTKTextFormatReader(test_file, randomize=None)

    my_sgd = C.SGDParams(
        epoch_size=0, minibatch_size=25, learning_rates_per_mb=0.1,
        max_epochs=3)

    with C.LocalExecutionContext('logreg') as ctx:
        ctx.device_id = device_id

        ctx.train(
            root_nodes=[ce, eval_node],
            training_params=my_sgd,
            input_map=train_reader.map(X, alias='I', dim=2).map(
                y, alias='L', dim=3))

        result = ctx.test(
            root_nodes=[ce, eval_node],
            input_map=test_reader.map(X, alias='I', dim=2).map(
                y, alias='L', dim=3))

        return result
def linear_units(input_var, output_dim):
    """Build a linear layer ``W * x + b`` and return it as a 1-D tensor.

    The weight and bias parameters are also stored in the module-level
    ``params`` dict under the keys ``'w'`` and ``'b'``.

    Args:
        input_var: input node; its first shape entry is the input size.
        output_dim: number of output units.

    Returns:
        A node of shape ``(output_dim)`` holding the affine projection.
    """
    input_dim = input_var.shape[0]

    # Introduce model parameters
    weight_param = C.parameter(shape=(output_dim, input_dim), name="weights")
    bias_param = C.parameter(shape=(output_dim, 1), name="biases")

    # Reshape to facilitate matrix multiplication
    input_reshaped = C.reshape(input_var, (input_dim, 1))

    # Weighted sums
    params['w'], params['b'] = weight_param, bias_param
    part1 = C.times(weight_param, input_reshaped)

    # Add biases
    part2 = part1 + bias_param

    # Return 1-D representation.  The (output_dim, 1) result is flattened
    # with the layer's own width; this previously used the global
    # `num_classes`, which only worked when it equalled `output_dim`.
    return C.reshape(part2, (output_dim))
def test_input_without_dynamic_axes():
    """Check forward and gradient values for an input with no dynamic axes.

    First differentiates an element-wise affine expression with respect to
    the input itself, then a ``times`` with a parameter with respect to
    that parameter.
    """
    x = C.input_variable(shape=(2,), dynamic_axes=[], needs_gradient=True,
                         name='x')
    assert len(x.dynamic_axes) == 0

    x_value = np.asarray([.6, -.8], dtype=np.float32)

    # d(x * .01 + 3) / dx == .01 for every element.
    affine = x * .01 + 3.0
    grad_result, eval_result = affine.grad(
        {x: x_value}, outputs=[affine], wrt=[x])
    assert np.allclose(eval_result, [3.006, 2.992])
    assert np.allclose(grad_result, [.01, .01])

    # d(x . w + 3) / dw == x, laid out as a column like w.
    w = C.parameter(init=np.asarray([[0.5], [-1.5]], dtype=np.float32))
    projected = C.times(x, w) + 3.0
    grad_result, eval_result = projected.grad(
        {x: x_value}, outputs=[projected], wrt=[w])
    assert np.allclose(eval_result, [4.5])
    assert np.allclose(grad_result, [[.6], [-.8]])
def test_gather_2D_using_one_hot_and_times():
    """Emulate a gather of 2-D slices with a sparse one-hot and ``times``.

    Multiplying a sparse one-hot index by a rank-3 parameter with
    ``output_rank=2`` must select the parameter slice at that index for
    every step of every sequence.
    """
    i = C.sequence.input_variable((1, ))
    indices = [[2, 0], [1]]

    sparse_one_hot = C.one_hot(i, num_classes=3, sparse_output=True)
    w = C.parameter((-1, 2, 3), init=C.glorot_uniform())
    t = C.times(sparse_one_hot, w, output_rank=2)

    result = t.eval({i: indices})
    w_value = w.value

    # Reference: stack the selected slices, each with a leading unit axis.
    expected_result = []
    for seq in indices:
        picked = [np.expand_dims(np.asarray(w_value[idx]), axis=0)
                  for idx in seq]
        expected_result.append(np.stack(picked))

    assert np.array_equal(result[0], expected_result[0])
    assert np.array_equal(result[1], expected_result[1])
def test_eval_sparse_seq_1(batch, device_id):
    """Evaluate a scaled identity product on a sparse batch.

    The check is repeated with the input variable declared sparse and
    dense.  ``batch`` holds either one sparse matrix per sequence or a
    list of sparse matrices per sequence.
    """
    dim = 4
    multiplier = 2
    device = cntk_device(device_id)

    for var_is_sparse in (True, False):
        in1 = sequence.input_variable(shape=(dim,), is_sparse=var_is_sparse)
        z = times(in1, multiplier * np.eye(dim))

        if isinstance(batch[0], list):
            # A sequence given as a list of matrices is stacked row-wise.
            expected = [
                np.vstack([m.todense() * multiplier for m in seq])
                for seq in batch
            ]
        else:
            expected = [seq.todense() * multiplier for seq in batch]

        result = z.eval({in1: batch}, device=device)

        matches = [np.allclose(a, b) for a, b in zip(result, expected)]
        assert np.all(matches), \
            "%s != %s" % (result, expected)
def createAttentionNet(self, hiddenSrc, curHiddenTrg, srcLength):
    """Build an additive attention over the source hidden states.

    Source and target hidden states are projected with ``self.Was`` and
    ``self.Wat``, summed, passed through ``tanh`` and scored with
    ``self.Wav``.  Scores are offset by the source mask, softmax-normalised
    over axis 0, and used to weight and sum ``hiddenSrc``.

    Returns:
        ``(contextVector, attProb)`` where ``contextVector`` is reshaped to
        ``(1, BatchSize, 2 * SrcHiddenSize)`` and ``attProb`` to
        ``(srcLength, BatchSize, 1)``.
    """
    batch = Config.BatchSize
    trg_hidden = Config.TrgHiddenSize
    srcHiddenSize = Config.SrcHiddenSize * 2

    hsw = C.times(hiddenSrc, self.Was)
    htw = C.times(curHiddenTrg, self.Wat)

    # Flatten so the single target row is added to each source position.
    hsw_flat = C.reshape(hsw, shape=(srcLength, batch * trg_hidden))
    htw_flat = C.reshape(htw, shape=(1, batch * trg_hidden))
    hst = hsw_flat + htw_flat

    hstT = C.reshape(C.tanh(hst), shape=(srcLength * batch, trg_hidden))
    attScore = C.reshape(C.times(hstT, self.Wav), shape=(srcLength, batch))

    # (mask - 1) * 99999999 is 0 where the mask is 1 and hugely negative
    # where it is 0 -- presumably a 0/1 padding mask; confirm with caller.
    maskOut = (C.slice(self.maskMatrixSrc, 0, 0, srcLength) - 1) * 99999999
    nAttScore = attScore + maskOut

    attProb = C.reshape(C.softmax(nAttScore, axis=0),
                        shape=(srcLength, batch, 1))
    attVector = hiddenSrc * attProb

    weighted = C.reshape(attVector, shape=(srcLength, batch * srcHiddenSize))
    contextVector = C.reduce_sum(weighted, axis=0)
    contextVector = C.reshape(contextVector,
                              shape=(1, batch, srcHiddenSize))
    return (contextVector, attProb)
def test_as_composite():
    """Check that ``C.as_composite`` keeps the root function's name.

    The name must survive whether the composite is built from a primitive
    root function, from another composite, or from the original function.
    """
    input_dim = 1
    proj_dim = 2
    func_name = 't_plus_b'

    x = C.input_variable((input_dim, ))
    b = C.parameter((proj_dim))
    w = C.parameter((input_dim, proj_dim))
    t_plus_b = C.plus(C.times(x, w), b, name=func_name)
    assert t_plus_b.root_function.name == func_name

    # From the primitive root function.
    composite = C.as_composite(t_plus_b.root_function)
    assert composite.root_function.name == func_name

    # From an existing composite.
    composite = C.as_composite(composite)
    assert composite.root_function.name == func_name

    # From the original function object.
    composite = C.as_composite(t_plus_b)
    assert composite.root_function.name == func_name
def _simple_dict():
    """Return a dict of named nodes forming a tiny test graph.

    Keys: ``i1`` (input), ``c1`` (constant), ``p1`` (parameter), ``op1``
    (``i1 + c1``), ``op2`` (``op1 * p1``), ``root`` (same node as ``op2``),
    ``target`` (scalar input named ``label``) and ``all`` (a ``combine`` of
    the root and ``target - 1``).
    """
    i1 = C.input_variable(shape=(2, 3), name='i1')
    c1 = C.constant(shape=(2, 3), value=6, name='c1')
    p1 = C.parameter(shape=(3, 2), init=7, name='p1')

    op1 = C.plus(i1, c1, name='op1')
    op2 = C.times(op1, p1, name='op2')

    target = C.input_variable((), name='label')
    minus = C.minus(target, C.constant(1, name='c2'), name='minus')
    everything = C.combine([op2, minus], name='all')

    return {
        'i1': i1,
        'c1': c1,
        'p1': p1,
        'op1': op1,
        'op2': op2,
        'root': op2,
        'target': target,
        'all': everything,
    }
def test_2d_sparse_csr_batch_input(device_id):
    """Feed a batch of 2-D CSR matrices into ``times`` and check the product.

    Each sample is a (2, 3) one-hot row selector, so the result rows are
    the corresponding rows of the (3, 2) weight matrix.
    """
    dev = cntk_device(device_id)

    features = C.input_variable((2, 3), is_sparse=True)
    weights = np.asarray([[0.5, 1], [-.5, 2], [1., 1.5]], dtype=np.float32)
    w = C.parameter(init=weights, device=dev)
    t = C.times(features, w)

    dense_samples = [
        [[1., 0., 0.], [0., 1., 0.]],
        [[0., 0., 1.], [1., 0., 0.]],
    ]
    features_data = [
        sp.sparse.csr_matrix(np.asarray(sample, dtype=np.float32))
        for sample in dense_samples
    ]

    result = t.eval({features: features_data}, device=dev)
    assert np.array_equal(result,
                          [[[.5, 1], [-.5, 2]], [[1, 1.5], [.5, 1]]])
def test_free_static_axis_times_free_static_axis(output_rank, x_input_shape,
                                                 x_data, y_input_shape,
                                                 y_data):
    """Compare ``C.times`` on two inputs with a per-sample NumPy reference.

    The reference contracts each pair of batch items with ``np.tensordot``
    over ``len(x_item.shape) - output_rank`` axes and stacks the results.
    """
    x = C.input_variable(x_input_shape)
    y = C.input_variable(y_input_shape)
    t = C.times(x, y, output_rank=output_rank)
    cntk_result = t.eval({x: x_data, y: y_data})[0]

    # zip over the batch axis
    per_item = [
        np.tensordot(x_item, y_item, axes=len(x_item.shape) - output_rank)
        for x_item, y_item in zip(x_data, y_data)
    ]
    np_result = np.vstack(per_item)

    np.testing.assert_allclose(np_result, cntk_result)
def cumsum(x, axis=0):
    """Cumulative sum of ``x`` along ``axis``, expressed with CNTK ops.

    The sum is computed as a matrix product with an upper-triangular
    matrix of ones, applied on the last axis; other axes are swapped into
    the last position first and swapped back afterwards.

    Args:
        x: CNTK node; ``x.shape[axis]`` and ``x.dtype`` are read.
        axis: axis to accumulate over (default 0).

    Returns:
        A CNTK node holding the cumulative sums.
    """
    dim = x.shape[axis]

    # Column j of the upper-triangular ones matrix selects entries 0..j,
    # so (x @ U)[..., j] is the running sum up to and including j.
    U = C.constant(np.triu(np.ones((dim, dim))).astype(x.dtype))

    # Debug prints were removed here; in particular `print(x())` forced an
    # evaluation of the graph and raised whenever `x` had unbound inputs.
    if axis != -1:
        x = C.swapaxes(x, -1, axis)
    out = C.times(x, U)
    if axis != -1:
        out = C.swapaxes(out, -1, axis)
    return out
def test_unpack_axis_times_transpose_unpack_axis(output_rank, x_input_shape,
                                                 x_data, y_input_shape,
                                                 y_data):
    """Test ``times`` on free axes produced by ``unpack_batch``.

    Both inputs are unpacked, the second one is fully transposed, and the
    product is compared with ``np.tensordot`` of ``x_data`` and the
    transposed ``y_data``.
    """
    x = C.input_variable(x_input_shape)
    y = C.input_variable(y_input_shape)

    x_unpacked = C.unpack_batch(x)
    y_unpacked = C.unpack_batch(y)

    # Reverse every axis of the unpacked y (a full transpose).
    reversed_axes = range(len(y_unpacked.shape) - 1, -1, -1)
    y_transposed = C.transpose(y_unpacked, reversed_axes)

    t = C.times(x_unpacked, y_transposed, output_rank=output_rank)
    cntk_result = t.eval({x: x_data, y: y_data})

    contracted_axes = len(x_data.shape) - output_rank
    np_result = np.tensordot(x_data, np.transpose(y_data),
                             axes=contracted_axes)
    np.testing.assert_allclose(np_result, cntk_result)
def test_op_times_sparse_grad(device_id, precision):
    """Check the parameter gradient when ``times`` has a sparse input.

    An identity-initialised parameter ``e`` is used twice, in
    ``times_transpose(e, times(input, e))``; with two one-hot sequences
    covering every index, each gradient entry must equal 4.
    """
    from cntk import times, times_transpose, parameter, reshape, one_hot

    dt_precision = PRECISION_TO_TYPE[precision]

    dim = 5
    num_sequences = 2
    seq = list(range(dim))
    identity = np.identity(dim, dtype=np.float32)

    input_data = one_hot([seq] * num_sequences, dim)
    input_var = I(shape=(dim), is_sparse=True, needs_gradient=False)
    e = parameter(shape=(dim, dim), init=identity)

    projected = times(input_var, e)
    z = reshape(times_transpose(e, projected), dim)

    e_grad = z.grad({input_var: input_data}, [e])
    assert np.allclose(e_grad, np.ones((dim, dim)) * 4)
def scale_dot_product_attention_block(self, contextQ, contextV, contextK,
                                      name):
    """Build a scaled dot-product attention block.

    Query, value and key are each projected by a ``Dense(100)`` layer.  The
    key and value sequences are unpacked, the query/key scores are divided
    by ``sqrt(100)``, masked positions are set to ``-1e+30`` before the
    softmax, and the result weights the unpacked values.

    Returns:
        A block named ``'sdp_attention_block' + name`` whose placeholders
        are bound to ``contextQ``, ``contextV`` and ``contextK``.
    """
    width = 2 * self.hidden_dim
    axes = [self.b_axis, self.q_axis]

    Q = C.placeholder(shape=(width,), dynamic_axes=axes)
    V = C.placeholder(shape=(width,), dynamic_axes=axes)
    K = C.placeholder(shape=(width,), dynamic_axes=axes)

    Ql = C.layers.Dense(100)(Q)
    Vl = C.layers.Dense(100)(V)
    Kl = C.layers.Dense(100)(K)

    # Unpack keys (keeping the validity mask) and values into padded tensors.
    kvw, kvw_mask = C.sequence.unpack(Kl, padding_value=0).outputs
    vvw, _ = C.sequence.unpack(Vl, padding_value=0).outputs
    KT = C.swapaxes(kvw)

    scores = C.times(Ql, KT) / math.sqrt(100)
    S = C.reshape(scores, -1)

    # Padded key positions get a very negative score before the softmax.
    kvw_mask_expanded = C.sequence.broadcast_as(kvw_mask, Ql)
    masked = C.element_select(kvw_mask_expanded, S, C.constant(-1e+30))
    S = C.softmax(masked)

    att = C.times(S, vvw)

    block_name = 'sdp_attention_block' + name
    return C.as_block(
        att,
        [(Q, contextQ), (V, contextV), (K, contextK)],
        block_name,
        block_name)
def rnet_output_layer(self, attention_context, query):
    """Build the pointer-style output layer producing start/end scores.

    The query is pooled with attention into ``rQ``, which scores the
    attention context to give the first score sequence ``ts``.  A GRU step
    on ``rQ`` and the pooled context yields ``ha1``, which scores the
    context again to give ``s1``.

    Returns:
        A block named ``'output_layer'`` combining ``ts`` and ``s1``, with
        placeholders bound to ``attention_context`` and ``query``.
    """
    width = 2 * self.hidden_dim

    att_context = C.placeholder(shape=(width,))
    q_processed = C.placeholder(shape=(width,))

    # Parameters are created in the same order as the original listing.
    wuq = C.parameter(shape=(width, width), init=C.glorot_uniform())
    whp = C.parameter(shape=(width, width), init=C.glorot_uniform())
    wha = C.parameter(shape=(width, width), init=C.glorot_uniform())
    v = C.parameter(shape=(width, 1), init=C.glorot_uniform())
    bias = C.parameter(shape=(width), init=C.glorot_uniform())

    whp_end = C.parameter(shape=(width, width), init=C.glorot_uniform())
    wha_end = C.parameter(shape=(width, width), init=C.glorot_uniform())
    v_end = C.parameter(shape=(width, 1), init=C.glorot_uniform())

    # sequence[tensor[1]] q_len x 1
    query_hidden = C.tanh(C.times(q_processed, wuq) + bias)
    s0 = C.times(query_hidden, v)
    a0 = C.sequence.softmax(s0)
    rQ = C.sequence.reduce_sum(a0 * q_processed)

    # sequence[tensor[1]] plen x 1
    rQ_over_context = C.sequence.broadcast_as(rQ, att_context)
    start_hidden = C.tanh(
        C.times(att_context, whp) + C.times(rQ_over_context, wha))
    ts = C.reshape(C.times(start_hidden, v), (-1))

    # sequence[tensor[1]]
    ta = C.sequence.softmax(ts)

    # sequence[2d] 1 x 2d
    c0 = C.reshape(C.sequence.reduce_sum(ta * att_context), (width))

    # sequence[tensor[2d]]
    ha1 = C.layers.blocks.GRU(width)(rQ, c0)

    # sequence[tensor[1]] plen x 1
    ha1_over_context = C.sequence.broadcast_as(ha1, att_context)
    end_hidden = C.tanh(
        C.times(att_context, whp_end) + C.times(ha1_over_context, wha_end))
    s1 = C.reshape(C.times(end_hidden, v_end), (-1))

    # sequence[tensor[1]] plen x 1
    a1 = C.sequence.softmax(s1)

    return C.as_block(
        C.combine([ts, s1]),
        [(att_context, attention_context), (q_processed, query)],
        'output_layer',
        'output_layer')