def test_trainer_with_some_params_not_learned():
    input_dim = 2
    proj_dim = 2
    x = C.input_variable(shape=(input_dim,))
    W = parameter(shape=(input_dim, proj_dim), init=C.glorot_uniform())
    B = parameter(shape=(proj_dim,), init=C.glorot_uniform())
    t = times(x, W)
    z = t + B

    W_orig_value = W.value
    B_orig_value = B.value

    labels = C.input_variable(shape=(proj_dim,))
    ce = cross_entropy_with_softmax(z, labels)
    pe = classification_error(z, labels)

    lr_per_sample = C.learning_parameter_schedule(0.1, minibatch_size=1)
    trainer = C.Trainer(z, (ce, pe), C.sgd([W], lr_per_sample))

    x_value = [[1, 1], [2, 2]]
    label_value = [[0, 1], [1, 0]]
    arguments = {x: x_value, labels: label_value}

    num_iters = 3
    for i in range(num_iters):
        trainer.train_minibatch(arguments)

        assert np.array_equal(B.value, B_orig_value)
        assert not np.array_equal(W.value, W_orig_value)
        W_orig_value = W.value

    trainer.test_minibatch(arguments)
def test_convert_optimized_rnnstack(num_layers, bidirectional, recurrent_op, device_id):
    if device_id == -1:
        pytest.skip('only runs on GPU')
    input_dim = 5
    hidden_dim = 3
    data = [np.random.random((20, input_dim)).astype(np.float32),
            np.random.random((10, input_dim)).astype(np.float32),
            np.random.random((40, input_dim)).astype(np.float32)]
    input_var = C.sequence.input_variable(shape=(input_dim,))
    W1 = C.parameter((-1, 1), init=C.glorot_uniform())
    W2 = C.parameter((-1, 1), init=C.glorot_uniform())
    cudnn_rnn1 = C.optimized_rnnstack(input_var, W1, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, recurrent_op=recurrent_op)
    dense1 = C.layers.Dense(hidden_dim)(cudnn_rnn1)
    cudnn_rnn2 = C.optimized_rnnstack(dense1, W2, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, recurrent_op=recurrent_op)
    dense2 = C.layers.Dense(hidden_dim)(cudnn_rnn2)
    # test shared parameter W2
    cudnn_rnn3 = C.optimized_rnnstack(dense2, W2, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, recurrent_op=recurrent_op)

    def blocked(d):
        blocked_W = C.parameter((-1, d), init=C.glorot_uniform())

        @C.layers.BlockFunction('', '')
        def func(x):
            return C.optimized_rnnstack(x, blocked_W, d, 1, recurrent_op='lstm')
        return func

    cudnn_model = C.layers.Sequential([blocked(hidden_dim), blocked(2 * hidden_dim), blocked(3 * hidden_dim)])(cudnn_rnn3)

    cudnn_out = cudnn_model.eval({input_var: data})

    model = C.misc.convert_optimized_rnnstack(cudnn_model)

    # make sure original cudnn model is intact
    cudnn_out2 = cudnn_model.eval({input_var: data})
    assert all(np.allclose(cudnn_out[i], cudnn_out2[i]) for i in range(len(cudnn_out)))

    model_out = model.eval({model.arguments[0]: data})
    assert all(np.allclose(cudnn_out[i], model_out[i]) for i in range(len(cudnn_out)))
def ffnet(learner, trainer=None):
    inputs = 5
    outputs = 3
    layers = 2
    hidden_dimension = 3

    if trainer is None:
        # input variables denoting the features and label data
        features = C.input_variable((inputs), np.float32)
        label = C.input_variable((outputs), np.float32)

        # Instantiate the feedforward classification model
        my_model = Sequential([
            Dense(hidden_dimension, activation=C.sigmoid, init=C.glorot_uniform(seed=98052)),
            Dense(outputs, init=C.glorot_uniform(seed=98052))])
        z = my_model(features)

        ce = C.cross_entropy_with_softmax(z, label)
        pe = C.classification_error(z, label)

        # Instantiate the trainer object to drive the model training
        progress_printer = ProgressPrinter(0)
        trainer = C.Trainer(z, (ce, pe), [learner(z.parameters)], [progress_printer])
    else:
        features = trainer.loss_function.arguments[0]
        label = trainer.loss_function.arguments[1]

    # Get minibatches of training data and perform model training
    minibatch_size = 25
    num_minibatches_to_train = 100

    aggregate_loss = 0.0
    for i in range(num_minibatches_to_train):
        train_features, labels = generate_random_data(minibatch_size, inputs, outputs)
        # Specify the mapping of input variables in the model to actual minibatch data to be trained with
        trainer.train_minibatch({features: train_features, label: labels})
        sample_count = trainer.previous_minibatch_sample_count
        aggregate_loss += trainer.previous_minibatch_loss_average * sample_count

    last_avg_error = aggregate_loss / trainer.total_number_of_samples_seen

    test_features, test_labels = generate_random_data(minibatch_size, inputs, outputs)
    avg_error = trainer.test_minibatch({features: test_features, label: test_labels})
    print(' error rate on an unseen minibatch: {}'.format(avg_error))
    return last_avg_error, avg_error, trainer
def attention_layer(self, context, query, layer):
    q_processed = C.placeholder(shape=(2*self.hidden_dim,))
    p_processed = C.placeholder(shape=(2*self.hidden_dim,))

    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    wq = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
    wp = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
    wg = C.parameter(shape=(8*self.hidden_dim, 8*self.hidden_dim), init=C.glorot_uniform())
    v = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())

    # seq[tensor[2d]] p_len x 2d
    wpt = C.reshape(C.times(p_processed, wp), (-1, 2*self.hidden_dim))
    # q_len x 2d
    wqt = C.reshape(C.times(qvw, wq), (-1, 2*self.hidden_dim))
    # seq[tensor[q_len]]
    S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)
    # seq[tensor[q_len]]
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    # seq[tensor[q_len]]
    A = C.softmax(S, axis=0)
    # seq[tensor[2d]]
    swap_qvw = C.swapaxes(qvw)
    cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))
    # seq[tensor[4d]]
    uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)
    # seq[tensor[4d]]
    gt = C.tanh(C.times(uc_concat, wg))
    # seq[tensor[4d]]
    uc_concat_star = gt * uc_concat
    # seq[tensor[4d]]
    vp = C.layers.Sequential([
        C.layers.Dropout(self.dropout),
        OptimizedRnnStack(self.hidden_dim, bidirectional=True, use_cudnn=self.use_cudnn, name=layer+'_attention_rnn')])(uc_concat_star)

    return C.as_block(vp,
                      [(p_processed, context), (q_processed, query)],
                      'attention_layer',
                      'attention_layer')
def MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Multi-head attention block as described in "Attention Is All You Need", https://arxiv.org/abs/1706.03762

    The multi-head attention block comes with a residual connection and a layer norm.

    Example:
        a = C.sequence.input_variable(10)
        b = MultiHeadAttentionBlock(2, 10)(a, a, a)

        assert b.shape == (10, )

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:
    """
    attention_layer = MultiHeadAttention(num_heads, model_dim, obey_sequence_order, max_seq_len,
                                         key_init=key_init, key_init_bias=key_init_bias,
                                         query_init=query_init, query_init_bias=query_init_bias,
                                         value_init=value_init, value_init_bias=value_init_bias,
                                         init=init, init_bias=init_bias,
                                         name='MultiheadAttention')

    layernorm = LayerNormalization(initial_scale=initial_scale, initial_bias=initial_bias, name='LayerNorm')

    @C.Function
    def inner(query, key, value):
        attended = attention_layer(query, key, value)
        skip_connect_attended = attended + query
        normed_skip_connect_attended = layernorm(skip_connect_attended)
        return normed_skip_connect_attended

    return _inject_name(inner, name)
def linear_layer(input_var, output_dim):
    input_dim = input_var.shape[0]

    # Introduce model parameters
    weight_param = C.parameter(shape=(output_dim, input_dim), name="weights", init=C.glorot_uniform())
    bias_param = C.parameter(shape=(output_dim, 1), name="biases", init=C.glorot_uniform())

    # Reshape to facilitate matrix multiplication
    input_reshaped = C.reshape(input_var, (input_dim, 1))

    # Weighted sums
    part1 = C.times(weight_param, input_reshaped)

    # Add biases
    part2 = part1 + bias_param

    # Return 1-D representation
    return C.reshape(part2, (output_dim))
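# A minimal usage sketch for linear_layer above; the dimensions and input
# data here are illustrative, not from the original source.
import numpy as np
import cntk as C

x = C.input_variable(4)                 # 4-dimensional input
y = linear_layer(x, output_dim=3)       # projects to 3 outputs
out = y.eval({x: np.random.rand(2, 4).astype(np.float32)})  # batch of 2
assert out.shape == (2, 3)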
def test_convert_optimized_rnnstack(num_layers, bidirectional, recurrent_op, device_id):
    if device_id == -1:
        pytest.skip('only runs on GPU')
    input_dim = 5
    hidden_dim = 3
    data = [
        np.random.random((20, input_dim)).astype(np.float32),
        np.random.random((10, input_dim)).astype(np.float32),
        np.random.random((40, input_dim)).astype(np.float32)
    ]
    input_var = C.sequence.input_variable(shape=(input_dim,))
    W1 = C.parameter((-1, 1), init=C.glorot_uniform())
    W2 = C.parameter((-1, 1), init=C.glorot_uniform())
    cudnn_rnn1 = C.optimized_rnnstack(input_var, W1, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, recurrent_op=recurrent_op)
    dense1 = C.layers.Dense(hidden_dim)(cudnn_rnn1)
    cudnn_rnn2 = C.optimized_rnnstack(dense1, W2, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, recurrent_op=recurrent_op)
    dense2 = C.layers.Dense(hidden_dim)(cudnn_rnn2)
    # test shared parameter W2
    cudnn_model = C.optimized_rnnstack(dense2, W2, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, recurrent_op=recurrent_op)

    cudnn_out = cudnn_model.eval({input_var: data})

    model = C.misc.convert_optimized_rnnstack(cudnn_model)

    # make sure original cudnn model is intact
    cudnn_out2 = cudnn_model.eval({input_var: data})
    assert all(np.allclose(cudnn_out[i], cudnn_out2[i]) for i in range(len(cudnn_out)))

    model_out = model.eval({model.arguments[0]: data})
    assert all(np.allclose(cudnn_out[i], model_out[i]) for i in range(len(cudnn_out)))
def attention_layer(self, context, query):
    q_processed = C.placeholder(shape=(2 * self.hidden_dim,))
    c_processed = C.placeholder(shape=(2 * self.hidden_dim,))

    # convert query's sequence axis to static
    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    # This part deserves some explanation.
    # It is the attention layer.
    # In the paper they use a 6 * dim dimensional vector;
    # here we split it in three parts because the different parts
    # participate in very different operations,
    # so W * [h; u; h .* u] becomes w1 * h + w2 * u + w3 * (h .* u)
    ws1 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws2 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws3 = C.parameter(shape=(1, 2 * self.hidden_dim), init=C.glorot_uniform())
    att_bias = C.parameter(shape=(), init=0)

    wh = C.times(c_processed, ws1)
    wu = C.reshape(C.times(qvw, ws2), (-1,))
    whu = C.reshape(C.reduce_sum(c_processed * C.sequence.broadcast_as(qvw * ws3, c_processed), axis=1), (-1,))
    S = wh + whu + C.sequence.broadcast_as(wu, c_processed) + att_bias

    # mask out values outside of Query, and fill in gaps with -1e+30 as a
    # neutral value for both reduce_log_sum_exp and reduce_max
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    q_attn = C.reshape(C.softmax(S), (-1, 1))
    # q_attn = print_node(q_attn)
    c2q = C.reshape(C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn, axis=0), (-1))

    max_col = C.reduce_max(S)
    c_attn = C.sequence.softmax(max_col)

    htilde = C.sequence.reduce_sum(c_processed * c_attn)
    q2c = C.sequence.broadcast_as(htilde, c_processed)
    q2c_out = c_processed * q2c

    att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

    return C.as_block(att_context,
                      [(c_processed, context), (q_processed, query)],
                      'attention_layer', 'attention_layer')
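# Standalone NumPy check (not part of the model above) of the decomposition
# claimed in the comments: for a similarity weight vector W = [w1; w2; w3],
# W . [h; u; h .* u] equals w1 . h + w2 . u + w3 . (h .* u). All names and
# sizes here are illustrative.
import numpy as np

d = 4                               # stands in for 2 * hidden_dim
h = np.random.rand(d)               # one context vector
u = np.random.rand(d)               # one query vector
w1, w2, w3 = np.random.rand(3, d)   # the three slices of W

lhs = np.concatenate([w1, w2, w3]) @ np.concatenate([h, u, h * u])
rhs = w1 @ h + w2 @ u + w3 @ (h * u)
assert np.isclose(lhs, rhs)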
def LinearAttentionModel(hidden_dim: int, model_dim: int,
                         key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                         query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                         value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                         name=''):
    """ Convenience wrapper in the style of cntk.layers.AttentionModel """
    attention = LinearAttention(hidden_dim=hidden_dim, model_dim=model_dim,
                                key_init=key_init, key_init_bias=key_init_bias,
                                query_init=query_init, query_init_bias=query_init_bias,
                                value_init=value_init, value_init_bias=value_init_bias,
                                name=name)

    def model(encoder_hidden_state, decoder_hidden_state):
        return attention(decoder_hidden_state, encoder_hidden_state, encoder_hidden_state)

    return model
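# Hedged usage sketch for LinearAttentionModel above. Like
# cntk.layers.AttentionModel, the returned `model` is called with the
# encoder's hidden states and the decoder's hidden state; LinearAttention
# itself is defined in the surrounding source, and all dims are illustrative.
import cntk as C

encoder_states = C.sequence.input_variable(64)  # sequence of encoder hidden states
decoder_state = C.sequence.input_variable(64)   # decoder hidden state(s)

attend = LinearAttentionModel(hidden_dim=64, model_dim=64)
context = attend(encoder_states, decoder_state)  # attention-weighted context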
def ffnet(optimizer, num_minibatches_to_train, learning_rate_func, lr_args, learner_kwargs):
    inputs = 2
    outputs = 2
    hidden_dimension = 50

    # input variables denoting the features and label data
    features = C.input_variable((inputs), np.float32)
    label = C.input_variable((outputs), np.float32)

    # Instantiate the feedforward classification model
    my_model = Sequential([
        Dense(hidden_dimension, activation=C.sigmoid, init=C.glorot_uniform(seed=SEED)),
        Dense(outputs, init=C.glorot_uniform(seed=SEED))])
    z = my_model(features)

    ce = C.cross_entropy_with_softmax(z, label)
    pe = C.classification_error(z, label)

    # Instantiate the trainer object to drive the model training
    lr = learning_rate_func(0.125, *lr_args)
    progress_printer = ProgressPrinter(0)
    learner = optimizer(z.parameters, lr) if optimizer != sgd else sgd(z.parameters, lr, **learner_kwargs)
    trainer = C.Trainer(z, (ce, pe), [learner], progress_printer)

    # Get minibatches of training data and perform model training
    minibatch_size = 25

    for i in range(num_minibatches_to_train):
        train_features, labels = generate_random_data(minibatch_size, inputs, outputs)
        # Specify the mapping of input variables in the model to actual
        # minibatch data to be trained with
        trainer.train_minibatch({features: train_features, label: labels})

    test_features, test_labels = generate_random_data(minibatch_size, inputs, outputs)
    avg_error = trainer.test_minibatch({features: test_features, label: test_labels})
    print(' error rate on an unseen minibatch: {}'.format(avg_error))
    return z.parameters
def test_output_subset_evaluation(device_id):
    try:
        gpu_device = C.gpu(0)
    except ValueError:
        pytest.skip('Test only runs when GPU available')
    device = cntk_device(device_id)
    x1 = C.input_variable(shape=())
    op1 = C.constant(value=1, shape=(1), device=device) + (C.constant(value=1, shape=(1), device=device) + x1)

    x2 = C.input_variable(shape=(1))

    # Deliberately locate the parameter on a different device
    # instead of the actual compute target device, so that
    # if we try to use this parameter, it results in an error
    if device.type() == 0:
        parameter_device = gpu_device
    else:
        parameter_device = C.cpu()
    p = C.parameter(shape=(1), init=C.glorot_uniform(), device=parameter_device)
    op2 = (x2 - C.constant(value=10, shape=(1), device=device)) - p

    op = C.combine([op1, op2])

    _, result = op.forward({x1: np.asarray([1, 2, 3])}, [op1], device=device)
    assert np.array_equal(result[op1], np.asarray([[3], [4], [5]]))
def _create_convolution_model():
    with C.layers.default_options(init=C.glorot_uniform(), activation=C.relu):
        h = feature_var
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=8, strides=(2, 2), pad=True, name='first_convo')(h)
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=16, strides=(2, 2), pad=True, name='second_convo')(h)
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=16, strides=(1, 1), pad=True, name='thrid_convo')(h)
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=16, strides=(1, 1), pad=True, name='fourth_convo')(h)
        r = C.layers.Dense(num_classes, activation=None, name='classify')(h)
        return r
def create_model(features, num_hidden_layers, hidden_layer_dim):
    with C.layers.default_options(init=C.glorot_uniform(), activation=C.sigmoid):
        h = features
        for _ in range(num_hidden_layers):
            h = C.layers.Dense(hidden_layer_dim)(h)
        last_layer = C.layers.Dense(num_classes, activation=None)
        return last_layer(h)
def BinaryConvolution(operand,
                      filter_shape,
                      num_filters=1,
                      channels=1,
                      init=C.glorot_uniform(),
                      pad=False,
                      strides=1,
                      bias=True,
                      init_bias=0,
                      op_name='BinaryConvolution', name=''):
    """
    Arguments:
        operand: tensor to convolve
        filter_shape: tuple indicating filter size
        num_filters: number of filters to use
        channels: number of incoming channels
        init: type of initialization to use for weights
        pad: whether to pad the input
        strides: stride of the convolution
        bias: whether a bias term is used
        init_bias: initial value of the bias
    """
    kernel_shape = (num_filters, channels) + filter_shape
    W = C.parameter(shape=kernel_shape, init=init, name="filter")

    binary_convolve_operand_p = C.placeholder(operand.shape, operand.dynamic_axes, name="operand")
    binary_convolve = C.convolution(CustomMultibit(W, 1),
                                    CustomMultibit(binary_convolve_operand_p, 1),
                                    auto_padding=[False, pad, pad],
                                    strides=[strides])
    r = C.as_block(binary_convolve, [(binary_convolve_operand_p, operand)], 'binary_convolve')

    bias_shape = (num_filters, 1, 1)
    b = C.parameter(shape=bias_shape, init=init_bias, name="bias")
    r = r + b

    # apply learnable param relu
    P = C.parameter(shape=r.shape, init=init, name="prelu")
    r = C.param_relu(P, r)
    return r
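# Hedged usage sketch for BinaryConvolution above. CustomMultibit is a
# user-defined binarization op from the surrounding project, so this only
# runs where that op is registered; the shapes are illustrative.
import cntk as C

x = C.input_variable((3, 32, 32))  # channels x height x width
y = BinaryConvolution(x, filter_shape=(3, 3), num_filters=8, channels=3,
                      pad=True, strides=1)
# Note: unlike a layer factory, BinaryConvolution is applied to `operand`
# directly and returns the convolved (and PReLU-activated) tensor.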
def test_rnn(device_id):
    if device_id == -1:
        pytest.skip('Test only runs on GPU')

    batch_size = 8
    sequence_len = 100
    vocab_dim = 20
    embed_dim = 10
    hidden_dim = 7
    input = C.cast(C.sequence.input_variable(()), np.float16)
    with C.default_options(dtype=np.float16):
        embed = C.layers.Embedding(embed_dim)(C.one_hot(input, num_classes=vocab_dim, sparse_output=False))
        z = C.layers.Recurrence(C.layers.LSTM(hidden_dim))(embed)

    feed = np.floor(np.random.rand(batch_size, sequence_len).astype(np.float32) * (vocab_dim - 1))
    z.grad(feed, wrt=z.parameters)

    num_layers = 2
    W = C.parameter((C.InferredDimension, embed_dim), init=C.glorot_uniform(), dtype=np.float16)
    with C.default_options(dtype=np.float16):
        z = C.optimized_rnnstack(embed, W, hidden_dim, num_layers)

    feed = np.floor(np.random.rand(batch_size, sequence_len).astype(np.float32) * (vocab_dim - 1))
    z.grad(feed, wrt=z.parameters)
def create_model_with_pooling(features):
    with C.layers.default_options(init=C.glorot_uniform(), activation=C.leaky_relu):
        h = features
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=8, strides=(1, 1), pad=True, name='first_conv')(h)
        h = C.layers.AveragePooling(filter_shape=(5, 5), strides=(2, 2), name='first_pool')(h)
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=16, strides=(1, 1), pad=True, name='second_conv')(h)
        h = C.layers.AveragePooling(filter_shape=(5, 5), strides=(2, 2), name='second_pool')(h)
        r = C.layers.Dense(num_output_classes, activation=None, name='classify')(h)
        return r
def test_cntk_cudnn():
    try:
        import tensorflow
        has_tensorflow = True
    except:
        has_tensorflow = False

    if has_tensorflow:
        tf_baseline_lstm()
    else:
        cntk_baseline_lstm()

    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance

    input_var = C.sequence.input(shape=(in_dim))
    data = {input_var: data_cntk}
    ci.set_data(data)
    ci.set_workdir(workdir)

    W = C.parameter((-1, dim,), init=C.glorot_uniform())
    cudnn_fwbw = C.optimized_rnnstack(input_var, W, dim, 1, bidirectional=True, recurrent_op='lstm')
    ci.watch(cudnn_fwbw, 'cntk_birnn_cudnn', var_type=cstk.RnnAttr,
             attr=cstk.RnnAttr(bidirectional=True,
                               op_type='lstm',
                               input_dim=in_dim,
                               hidden_dim=dim,
                               forget_bias=0))
    ci.watch(cudnn_fwbw, 'cntk_birnn_cudnn_out')

    ci.assign('cntk_birnn_cudnn', load=True, load_name='cntk_birnn')
    assert ci.compare('cntk_birnn_cudnn_out', compare_name='cntk_birnn_out')

    ci.fetch('cntk_birnn_cudnn', save=True)
    ci.assign('cntk_birnn_cudnn', load=True)
    assert ci.compare('cntk_birnn_cudnn_out', compare_name='cntk_birnn_out')

    ci.reset()
def test_nce_loss(classes, xdim, batch, expected_value, device_id, precision):
    dt = PRECISION_TO_TYPE[precision]

    from cntk.losses import nce_loss
    import scipy

    x = C.input_variable(xdim, needs_gradient=True)
    y = C.input_variable(classes, is_sparse=True)

    x0 = np.arange(batch * xdim, dtype=dt).reshape((batch, xdim)) / (batch * xdim)
    data = np.ones(batch, dtype=dt)
    indices = list(range(10, 10 * batch + 1, 10))
    indptr = list(range(batch + 1))
    y0 = scipy.sparse.csr_matrix((data, indices, indptr), shape=(batch, classes))

    q = np.arange(classes, dtype=dt) + 1

    b = C.parameter((classes, 1), init=-np.log(classes))
    W = C.parameter((classes, C.InferredDimension), init=C.glorot_uniform(seed=98052))

    loss = C.nce_loss(W, b, x, y, q, seed=98052)
    v = loss.grad({x: x0, y: y0}, wrt=loss.parameters, as_numpy=False)
    for key in v:
        assert v[key].is_sparse, "gradient of nce_loss with respect to %s is not sparse" % key
    losses = np.zeros((100, batch))
    for i in range(100):
        losses[i, :] = loss.eval({x: x0, y: y0})
    assert np.allclose(np.mean(losses, axis=0), AA(expected_value))
def create_model(self, features):
    with cntk.layers.default_options(init=cntk.glorot_uniform(), activation=cntk.relu):
        h = features
        h = cntk.layers.Convolution2D(filter_shape=(3, 3), num_filters=16, strides=(1, 1), pad=True, name="first_conv")(h)
        h = cntk.layers.MaxPooling(filter_shape=(2, 2), strides=(2, 2), name="first_max")(h)
        h = cntk.layers.Convolution2D(filter_shape=(3, 3), num_filters=32, strides=(1, 1), pad=True, name="second_conv")(h)
        h = cntk.layers.MaxPooling(filter_shape=(2, 2), strides=(2, 2), name="second_max")(h)
        h = cntk.layers.Convolution2D(filter_shape=(3, 3), num_filters=64, strides=(1, 1), pad=True, name="third_conv")(h)
        h = cntk.layers.MaxPooling(filter_shape=(2, 2), strides=(2, 2), name="third_max")(h)
        h = cntk.layers.Dense(500, name="fc0")(h)
        r = cntk.layers.Dense(self.num_output_classes, activation=None, name="classify")(h)
        return r
def embed(self):
    npglove = np.zeros((self.wg_dim, 1024 + 300), dtype=np.float32)
    hf = h5py.File(os.path.join(self.abs_path, '../data/elmo_embedding.bin'), 'r')

    with open(os.path.join(self.abs_path, '../data/glove.840B.300d.txt'), encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            word = parts[0].lower()
            if word in self.vocab:
                try:
                    if len(parts) == 301:
                        npglove[self.vocab[word], :300] = np.asarray([float(p) for p in parts[-300:]])
                        npglove[self.vocab[word], 300:] = np.average(hf[word][:], axis=0)
                except:
                    npglove[self.vocab[word], 300:] = np.average(hf['<UNK>'][:], axis=0)

    glove = C.constant(npglove)
    nonglove = C.parameter(shape=(self.wn_dim, 1024 + 300), init=C.glorot_uniform(), name='TrainableE')

    def func(wg, wn):
        return C.times(wg, glove) + C.times(wn, nonglove)

    return func
def word_glove(self):
    # load glove
    if os.path.isfile('glove300.model'):
        print('[BUILD] load glove300.model')
        return C.load_model('glove300.model')
    npglove = np.zeros((self.wg_dim, self.word_emb_dim), dtype=np.float32)
    with open(os.path.join(self.abs_path, self.word_embed_file), encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            word = parts[0].lower()
            if self.vocab.get(word, self.wg_dim) < self.wg_dim:
                npglove[self.vocab[word], :] = np.asarray([float(p) for p in parts[-300:]])
    glove = C.constant(npglove)
    nonglove = C.parameter(shape=(len(self.vocab) - self.wg_dim, self.word_emb_dim), init=C.glorot_uniform(), name='TrainableE')

    @C.Function
    def func(wg, wn):
        return C.times(wg, glove) + C.times(wn, nonglove)

    func.save('glove300.model')
    print('[BUILD] save glove300.model')
    return func
def create_network(para, verbose=False):
    with cntk.layers.default_options(init=cntk.glorot_uniform(), activation=cntk.ops.relu):
        # To speed up debugging, we choose a simple structure whose sizes
        # are controlled by the entries of `para`
        h = cntk.layers.Convolution2D(filter_shape=(5, 5), num_filters=para[0], strides=(1, 1), pad=True, name='C1')(network_input / 255.0)
        h = cntk.layers.MaxPooling(filter_shape=(5, 5), strides=(2, 2))(h)
        h = cntk.layers.Convolution2D(filter_shape=(5, 5), num_filters=para[1], strides=(1, 1), pad=True, name='C2')(h)
        h = cntk.layers.MaxPooling(filter_shape=(5, 5), strides=(2, 2))(h)
        h = cntk.layers.Convolution2D(filter_shape=(3, 3), num_filters=para[2], strides=(1, 1), pad=True, name='C3')(h)
        h = cntk.layers.Dense(para[3])(h)
        h = cntk.layers.Dropout(0.25)(h)
        z = cntk.layers.Dense(10, activation=None, name='R')(h)

    loss = cntk.cross_entropy_with_softmax(z, network_label)
    label_error = cntk.classification_error(z, network_label)
    lr_schedule = cntk.learning_rate_schedule(0.1, cntk.UnitType.minibatch)
    learner = cntk.momentum_sgd(z.parameters, lr_schedule, cntk.momentum_schedule(0.9))
    trainer = cntk.Trainer(z, (loss, label_error), [learner])

    if verbose:
        log = cntk.logging.ProgressPrinter(100)
    for _ in range(20000):
        data = train_reader.next_minibatch(100, input_map=mapping(train_reader))
        trainer.train_minibatch(data)
        if verbose:
            log.update_with_trainer(trainer)
    return trainer
def _create_convolution_model():
    with C.layers.default_options(init=C.glorot_uniform(), activation=C.relu):
        h = feature_var
        # The first two layers have bias=False to test that the conversion
        # works with and without bias in the Convolution.
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=64, strides=(2, 2), pad=True, bias=False, name='first_convo')(h)
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=64, strides=(2, 2), pad=True, bias=False, name='second_convo')(h)
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=64, strides=(1, 1), pad=True, name='thrid_convo')(h)
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=64, strides=(1, 1), pad=True, name='fourth_convo')(h)
        r = C.layers.Dense(num_classes, activation=None, name='classify')(h)
        return r
def test_data_type_inference():
    x_float = C.input_variable((1,), dtype=np.float64)
    param1 = C.parameter((C.InferredDimension, 1), init=C.glorot_uniform(), dtype=C.cntk_py.DataType_Unknown)
    assert param1.get_data_type() == C.cntk_py.DataType_Unknown

    x_times_param1 = C.times(x_float, param1)
    # the parameter's data type is inferred from the float64 input
    assert param1.dtype == np.float64
def CreatRNN(cell_dim,
             activation,
             initial_state,
             direction,
             num_layers,
             init=C.default_override_or(C.glorot_uniform()),
             init_bias=C.default_override_or(0)):
    if direction == 'bidirectional':
        return C.layers.Sequential([
            C.layers.For(range(num_layers), lambda i: [
                (C.layers.Recurrence(C.layers.RNNStep(cell_dim,
                                                      activation=activation,
                                                      init=init,
                                                      init_bias=init_bias),
                                     initial_state=initial_state,
                                     return_full_state=False,
                                     go_backwards=False),
                 C.layers.Recurrence(C.layers.RNNStep(cell_dim,
                                                      activation=activation,
                                                      init=init,
                                                      init_bias=init_bias),
                                     initial_state=initial_state,
                                     return_full_state=False,
                                     go_backwards=True)),
                C.splice])])
    else:
        go_backward = False if direction == 'forward' else True
        return C.layers.Sequential([
            C.layers.For(range(num_layers), lambda i: [
                C.layers.Recurrence(C.layers.RNNStep(cell_dim,
                                                     activation=activation,
                                                     init=init,
                                                     init_bias=init_bias),
                                    initial_state=initial_state,
                                    return_full_state=False,
                                    go_backwards=go_backward)])])
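# A minimal usage sketch for CreatRNN above; cell_dim and the input dim
# are illustrative. In the bidirectional case each layer splices the
# forward and backward outputs, so the per-step output is 2 * cell_dim.
import cntk as C

x = C.sequence.input_variable(10)
birnn = CreatRNN(cell_dim=16, activation=C.tanh, initial_state=0,
                 direction='bidirectional', num_layers=2)
h = birnn(x)
assert h.shape == (32,)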
def create_model(features):
    with cntk.layers.default_options(init=cntk.glorot_uniform(), activation=cntk.ops.relu):
        input = features
        for _ in range(num_hidden_layers):
            input = cntk.layers.Dense(hidden_layers_dim)(input)
        r = cntk.layers.Dense(num_output_classes, activation=None)(input)
        return r
def OptimizedRnnStack(hidden_dim, num_layers=1, recurrent_op='gru', bidirectional=False, use_cudnn=True, name=''):
    if use_cudnn:
        W = C.parameter(_INFERRED + (hidden_dim,), init=C.glorot_uniform())

        def func(x):
            return C.optimized_rnnstack(x, W, hidden_dim, num_layers, bidirectional, recurrent_op=recurrent_op, name=name)
        return func
    else:
        # note: this non-cuDNN fallback is always a single bidirectional LSTM
        # layer, regardless of recurrent_op, num_layers and bidirectional
        def func(x):
            return C.splice(C.layers.Recurrence(C.layers.LSTM(hidden_dim))(x),
                            C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=True)(x),
                            name=name)
        return func
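# A small usage sketch for OptimizedRnnStack above; the dims are
# illustrative. Because the CPU fallback ignores recurrent_op, results
# will not match a cuDNN 'gru' stack exactly.
import cntk as C

x = C.sequence.input_variable(10)
rnn = OptimizedRnnStack(16, bidirectional=True, use_cudnn=False, name='encoder')
h = rnn(x)  # per-step output: forward and backward halves spliced
assert h.shape == (32,)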
def cntk_baseline_lstm():
    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance
    input_var = C.sequence.input_variable(shape=(in_dim))
    fwbw = C.splice(C.layers.Recurrence(C.layers.LSTM(dim, init_bias=C.glorot_uniform()))(input_var),
                    C.layers.Recurrence(C.layers.LSTM(dim), go_backwards=True)(input_var))
    ci.watch(fwbw, 'birnn', var_type=cstk.RnnAttr,
             attr=cstk.RnnAttr(bidirectional=True,
                               op_type='lstm',
                               input_dim=in_dim,
                               hidden_dim=dim,
                               forget_bias=0))
    ci.watch(fwbw, 'birnn_out')

    data = {input_var: data_cntk}
    ci.set_data(data)
    ci.set_workdir(workdir)
    ci.fetch('birnn', save=True)
    ci.fetch('birnn_out', save=True)
    ci.reset()
def create_model(self):
    w = cntk.Parameter((self.number_features, self.number_labels), init=cntk.glorot_uniform(), name='W')
    b = cntk.Parameter((self.number_labels,), init=0, name='b')
    self.model = cntk.times(self.input_transform, w) + b
def _create_model(net_input, num_output_classes, num_hidden_layers, hidden_layers_dim):
    h = net_input
    with C.layers.default_options(init=C.glorot_uniform()):
        for i in range(num_hidden_layers):
            h = C.layers.Dense(hidden_layers_dim, activation=C.relu)(h)
        return C.layers.Dense(num_output_classes, activation=None)(h)
def create_model(features):
    with C.layers.default_options(init=C.glorot_uniform()):
        # We scale the input pixels to 0-1 range
        encode = C.layers.Dense(encoding_dim, activation=C.relu)(features / 255.0)
        decode = C.layers.Dense(input_dim, activation=C.sigmoid)(encode)
    return decode
def test_gather_2D_using_one_hot_and_times():
    i = C.sequence.input_variable((1,))
    indices = [[2, 0], [1]]
    sparse_one_hot = C.one_hot(i, num_classes=3, sparse_output=True)
    w = C.parameter((-1, 2, 3), init=C.glorot_uniform())
    t = C.times(sparse_one_hot, w, output_rank=2)
    result = t.eval({i: indices})
    w_value = w.value
    expected_result = [np.stack([np.expand_dims(np.asarray(w_value[idx]), axis=0) for idx in seq]) for seq in indices]
    assert np.array_equal(result[0], expected_result[0])
    assert np.array_equal(result[1], expected_result[1])
def create_model(input):
    with C.layers.default_options(init=C.glorot_uniform(), activation=C.relu):
        model = C.layers.Sequential([
            C.layers.For(range(3), lambda i: [
                C.layers.Convolution((5, 5), [32, 32, 64][i], pad=True),
                C.layers.MaxPooling((3, 3), strides=(2, 2))
            ]),
            C.layers.Dense(64),
            C.layers.Dense(10, activation=None)
        ])
    return model(input)
def OptimizedRnnStack(hidden_dim, num_layers=1, recurrent_op='gru', bidirectional=False, use_cudnn=True, name=''):
    if use_cudnn:
        W = C.parameter(_INFERRED + (hidden_dim,), init=C.glorot_uniform())

        def func(x):
            return C.optimized_rnnstack(x, W, hidden_dim, num_layers, bidirectional, recurrent_op=recurrent_op, name=name)
        return func
    else:
        def func(x):
            return C.splice(C.layers.Recurrence(C.layers.GRU(hidden_dim))(x),
                            C.layers.Recurrence(C.layers.GRU(hidden_dim), go_backwards=True)(x),
                            name=name)
        return func
def create_model(features):
    with C.layers.default_options(init=C.glorot_uniform(), activation=C.relu):
        h = features
        h = C.layers.Convolution2D(filter_shape=(util.KSIZE_CONV1, util.KSIZE_CONV1), num_filters=util.FILTERS_CONV1, strides=util.CONV1_STRIDE, pad=True, name='first_conv')(h)
        h = C.layers.MaxPooling(filter_shape=(util.POOL_SIZE1, util.POOL_SIZE1), name='first_max')(h)
        h = C.layers.Convolution2D(filter_shape=(util.KSIZE_CONV2, util.KSIZE_CONV2), num_filters=util.FILTERS_CONV2, strides=util.CONV2_STRIDE, pad=True, name='second_conv')(h)
        h = C.layers.MaxPooling(filter_shape=(util.POOL_SIZE2, util.POOL_SIZE2), name='second_max')(h)
        r = C.layers.Dense(num_output_classes, activation=None, name='classify')(h)
        return r
def _create_convolution_model_with_skip_level_links():
    with C.layers.default_options(init=C.glorot_uniform(), activation=C.relu):
        h = feature_var
        # The first two layers have bias=False to test that the conversion
        # works with and without bias in the Convolution.
        a = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=64, strides=(2, 2), pad=True, bias=False, name='first_convo')(h)
        a = BatchNormalization(map_rank=1, normalization_time_constant=4096, use_cntk_engine=True, init_scale=1, disable_regularization=True)(a)
        b = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=64, strides=(2, 2), pad=True, bias=False, name='second_convo')(h)
        b = BatchNormalization(map_rank=1, normalization_time_constant=4096, use_cntk_engine=True, init_scale=1, disable_regularization=True)(b)
        h = a + b
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=64, strides=(1, 1), pad=True, name='thrid_convo')(h)
        h = BatchNormalization(map_rank=1, normalization_time_constant=4096, use_cntk_engine=True, init_scale=1, disable_regularization=True)(h)
        h = C.layers.Convolution2D(filter_shape=(5, 5), num_filters=64, strides=(1, 1), pad=True, name='fourth_convo')(h)
        h = BatchNormalization(map_rank=1, normalization_time_constant=4096, use_cntk_engine=True, init_scale=1, disable_regularization=True)(h)
        r = C.layers.Dense(num_classes, activation=None, name='classify')(h)
        return r
def create_basic_model_with_batch_normalization(input, out_dims):
    with C.layers.default_options(activation=C.relu, init=C.glorot_uniform()):
        model = C.layers.Sequential([
            C.layers.For(range(3), lambda i: [
                C.layers.Convolution((5, 5), [image_width, image_height, 64][i], pad=True),
                C.layers.BatchNormalization(map_rank=1),
                C.layers.MaxPooling((3, 3), strides=(2, 2))
            ]),
            C.layers.Dense(64),
            C.layers.BatchNormalization(map_rank=1),
            C.layers.Dense(out_dims, activation=None)
        ])
    return model(input)
def cntk_baseline_conv2d():
    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance
    input_var = C.input_variable(shape=sample_shape)
    input_reshaped = C.reshape(input_var, (1,) + sample_shape)
    conv_out = C.layers.Convolution2D(filter_shape, num_filters, init_bias=C.glorot_uniform())(input_reshaped)
    ci.watch(conv_out, 'conv2d', var_type=cstk.Conv2DAttr,
             attr=cstk.Conv2DAttr(filter_shape=filter_shape, num_filters=num_filters))
    ci.watch(conv_out, 'conv2d_out')

    data = {input_var: input_data}
    ci.set_data(data)
    ci.set_workdir(workdir)
    ci.fetch('conv2d', save=True)
    ci.fetch('conv2d_out', save=True)
    ci.reset()
def test_conv_cudnn_batch_size_change(device_id):
    if device_id == -1:
        pytest.skip('Test only runs on GPU')

    np.random.seed(0)

    input_shape = (1, 16, 100)
    input1 = C.sequence.input_variable(input_shape, needs_gradient=True, sequence_axis=C.Axis.new_unique_dynamic_axis('c'))
    input2 = C.sequence.input_variable(input_shape, needs_gradient=True, sequence_axis=C.Axis.new_unique_dynamic_axis('q'))
    conv = C.layers.Convolution2D((5, 8), 100, activation=C.relu, init=C.glorot_uniform(), bias=True, init_bias=0)
    output = C.reduce_sum(conv(input1), axis=C.Axis.all_axes()) + C.reduce_sum(conv(input2), axis=C.Axis.all_axes())

    num_batches = 100  # change to greater value for a more thorough test
    batch_size = 1
    max_seq_len = [100, 10]
    for batch in range(num_batches):
        seq_lens = [[int(x * msl + 1) for x in np.random.random((batch_size))] for msl in max_seq_len]
        output.grad({input1: [np.random.random((sl,) + input_shape).astype(np.float32) for sl in seq_lens[0]],
                     input2: [np.random.random((sl,) + input_shape).astype(np.float32) for sl in seq_lens[1]]})
def test_cntk_cudnn():
    try:
        import tensorflow
        has_tensorflow = True
    except:
        has_tensorflow = False

    if has_tensorflow:
        tf_baseline_lstm()
    else:
        cntk_baseline_lstm()

    import cntk as C
    import cntk.contrib.crosstalk.crosstalk_cntk as crct
    ci = crct.instance

    input_var = C.sequence.input_variable(shape=(in_dim))
    data = {input_var: data_cntk}
    ci.set_data(data)
    ci.set_workdir(workdir)

    W = C.parameter((-1, dim,), init=C.glorot_uniform())
    cudnn_fwbw = C.optimized_rnnstack(input_var, W, dim, 1, bidirectional=True, recurrent_op='lstm')
    ci.watch(cudnn_fwbw, 'cntk_birnn_cudnn', var_type=cstk.RnnAttr,
             attr=cstk.RnnAttr(bidirectional=True,
                               op_type='lstm',
                               input_dim=in_dim,
                               hidden_dim=dim,
                               forget_bias=0))
    ci.watch(cudnn_fwbw, 'cntk_birnn_cudnn_out')

    ci.assign('cntk_birnn_cudnn', load=True, load_name='birnn')
    assert ci.compare('cntk_birnn_cudnn_out', compare_name='birnn_out', rtol=1e-4, atol=1e-6)

    ci.fetch('cntk_birnn_cudnn', save=True)
    ci.assign('cntk_birnn_cudnn', load=True)
    assert ci.compare('cntk_birnn_cudnn_out', compare_name='birnn_out', rtol=1e-4, atol=1e-6)

    # test assign with value
    num_gates = 4
    ci.assign('cntk_birnn_cudnn', value=cstk.RnnArgs(fw_W=np.random.random((in_dim, num_gates * dim)).astype(np.float32),
                                                     fw_H=np.random.random((dim, num_gates * dim)).astype(np.float32),
                                                     fw_b=np.random.random((num_gates * dim,)).astype(np.float32),
                                                     bw_W=np.random.random((in_dim, num_gates * dim)).astype(np.float32),
                                                     bw_H=np.random.random((dim, num_gates * dim)).astype(np.float32),
                                                     bw_b=np.random.random((num_gates * dim,)).astype(np.float32)))
    ci.reset()
def test_saving_and_loading_int16_ndarray_as_attribute(tmpdir):
    model_file = str(tmpdir / 'test_model_int16.bin')
    delete_if_file_exists(model_file)

    data = np.arange(0, 64, dtype=np.int16).reshape(16, 4)
    dict_val = C._to_cntk_dict_value(data)

    W = C.Parameter((C.InferredDimension, 42), init=C.glorot_uniform(), dtype=np.float64)
    x = C.input_variable(12, dtype=np.float64)
    y = C.times(x, W)
    y.custom_attributes = {'int16_nd': dict_val}
    y.save(model_file)
    assert os.path.isfile(model_file)

    z = C.load_model(model_file)
    int16_data = z.custom_attributes['int16_nd']
    assert int16_data.shape == (16, 4)
    assert np.array_equal(int16_data, data)
def binary_convolution(filter_shape,
                       num_filters=1,
                       channels=1,
                       init=C.glorot_uniform(),
                       pad=False,
                       strides=1,
                       name='BinaryConvolution'):
    '''
    Creates a binary convolution function based on the input parameters.

    Args:
        filter_shape: shape of the filter
        num_filters: number of filters to use
        channels: number of incoming channels
        init: initialization function for the filter
        pad: padding enabled or not for the filter
        strides: overlap for this filter
        name: name given to the binary convolution.

    Returns:
        a function for performing binary convolution
    '''
    kernel_shape = (num_filters, channels) + filter_shape
    W = C.Parameter(shape=kernel_shape, init=init, name="filter")

    def convolution(operand):
        bcv_operand_p = C.placeholder(operand.shape, operand.dynamic_axes, name="operand")

        bcv = C.convolution(CustomMultibit(W, 1),
                            CustomMultibit(bcv_operand_p, 1),
                            auto_padding=[False, pad, pad],
                            strides=[strides])

        return C.as_block(bcv, [(bcv_operand_p, operand)], name)

    return convolution
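# Hedged usage sketch for binary_convolution above; as with BinaryConvolution
# earlier, CustomMultibit must be provided by the surrounding project, and
# the shapes are illustrative. Note the factory form: configure once, then
# apply to an input (this variant adds no bias or PReLU).
import cntk as C

bconv = binary_convolution((3, 3), num_filters=8, channels=3, pad=True)
x = C.input_variable((3, 32, 32))
y = bconv(x)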
np.random.seed(0)

def generate_synthetic_data(N):
    Y = np.random.randint(size=N, low=0, high=num_classes)       # labels
    X = (np.random.randn(N, input_dim) + 3) * (Y[:, None] + 1)   # data
    # Our model expects float32 features, and cross-entropy
    # expects one-hot encoded labels.
    Y = scipy.sparse.csr_matrix((np.ones(N, np.float32), (range(N), Y)), shape=(N, num_classes))
    X = X.astype(np.float32)
    return X, Y

X_train, Y_train = generate_synthetic_data(20000)
X_test, Y_test = generate_synthetic_data(1024)

# Define the CNTK model function. The model function maps input data to
# predictions (here: 2-dimensional inputs --> 2 scores).
# This simple logistic-regression model just uses a linear transform.
data = cntk.input_variable(input_dim)
W = cntk.Parameter((input_dim, num_classes), init=cntk.glorot_uniform(), name='W')
b = cntk.Parameter((num_classes,), init=0, name='b')
model = cntk.times(data, W) + b

# Define the CNTK criterion function. A criterion function maps
# (input vectors, labels) to a loss function and an optional additional
# metric. The loss function is used to train the model parameters.
# We use cross entropy as a loss function.
label_one_hot = cntk.input_variable(num_classes, is_sparse=True)
loss = cntk.cross_entropy_with_softmax(model, label_one_hot)  # this applies softmax to model's output under the hood
metric = cntk.classification_error(model, label_one_hot)
criterion = cntk.combine([loss, metric])  # criterion is a tuple-valued function (loss, metric)

# Learner object. The learner implements the update algorithm,
# in this case plain SGD.
learning_rate = 0.1
learner = cntk.sgd(model.parameters, cntk.learning_parameter_schedule(learning_rate))
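# A minimal sketch (not part of the original script) of how the loss,
# metric, and learner above would typically be wired into a training loop;
# minibatch_size is an assumed value.
trainer = cntk.Trainer(model, (loss, metric), [learner])

minibatch_size = 32
for i in range(0, len(X_train), minibatch_size):
    x = X_train[i:i + minibatch_size]          # float32 features
    y = Y_train[i:i + minibatch_size]          # sparse one-hot labels
    trainer.train_minibatch({data: x, label_one_hot: y})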
def deconv_mnist(max_epochs=3):
    image_height = 28
    image_width = 28
    num_channels = 1
    input_dim = image_height * image_width * num_channels
    num_output_classes = 10

    # Input variable and normalization
    input_var = cntk.ops.input_variable((num_channels, image_height, image_width), np.float32)
    scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)

    # Define the auto encoder model
    cMap = 1
    conv1 = cntk.layers.Convolution2D((5, 5), cMap, pad=True, activation=cntk.ops.relu)(scaled_input)
    pool1 = cntk.layers.MaxPooling((4, 4), (4, 4))(conv1)
    unpool1 = cntk.layers.MaxUnpooling((4, 4), (4, 4))(pool1, conv1)
    z = cntk.layers.ConvolutionTranspose2D((5, 5), num_channels, pad=True, bias=False, init=cntk.glorot_uniform(0.001))(unpool1)

    # define rmse loss function (should be 'err = cntk.ops.minus(deconv1, scaled_input)')
    f2 = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)
    err = cntk.ops.reshape(cntk.ops.minus(z, f2), (784))
    sq_err = cntk.ops.element_times(err, err)
    mse = cntk.ops.reduce_mean(sq_err)
    rmse_loss = cntk.ops.sqrt(mse)
    rmse_eval = cntk.ops.sqrt(mse)

    reader_train = create_reader(os.path.join(data_path, 'Train-28x28_cntk_text.txt'), True, input_dim, num_output_classes)

    # training config
    epoch_size = 60000
    minibatch_size = 64

    # Set learning parameters
    lr_schedule = cntk.learning_rate_schedule([0.00015], cntk.learner.UnitType.sample, epoch_size)
    mm_schedule = cntk.learner.momentum_as_time_constant_schedule([600], epoch_size)

    # Instantiate the trainer object to drive the model training
    learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule, unit_gain=True)
    progress_printer = cntk.utils.ProgressPrinter(tag='Training')
    trainer = cntk.Trainer(z, (rmse_loss, rmse_eval), learner, progress_printer)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var: reader_train.streams.features
    }

    cntk.utils.log_number_of_parameters(z)
    print()

    # Get minibatches of images to train with and perform model training
    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(min(minibatch_size, epoch_size - sample_count), input_map=input_map)  # fetch minibatch.
            trainer.train_minibatch(data)                # update model with it
            sample_count += data[input_var].num_samples  # count samples processed so far

        trainer.summarize_training_progress()
        z.save(os.path.join(model_path, "07_Deconvolution_PY_{}.model".format(epoch)))

    # rename final model
    last_model_name = os.path.join(model_path, "07_Deconvolution_PY_{}.model".format(max_epochs - 1))
    final_model_name = os.path.join(model_path, "07_Deconvolution_PY.model")
    try:
        os.remove(final_model_name)
    except OSError:
        pass
    os.rename(last_model_name, final_model_name)

    # Load test data
    reader_test = create_reader(os.path.join(data_path, 'Test-28x28_cntk_text.txt'), False, input_dim, num_output_classes)

    input_map = {
        input_var: reader_test.streams.features
    }

    # Test data for trained model
    epoch_size = 10000
    minibatch_size = 1024

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)
        # Fetch next test minibatch.
        data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch
        # Keep track of the number of samples processed so far.
        sample_count += data[input_var].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index + 1, (metric_numer * 100.0) / metric_denom, metric_denom))
    print("")

    return metric_numer / metric_denom
def embed(self):
    # load glove
    npglove = np.zeros((self.wg_dim, self.w2v_hidden_dim), dtype=np.float32)
    with open(os.path.join(self.abs_path, 'glove.6B.100d.txt'), encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            word = parts[0].lower()
            if word in self.vocab:
                npglove[self.vocab[word], :] = np.asarray([float(p) for p in parts[1:]])
    glove = C.constant(npglove)
    nonglove = C.parameter(shape=(len(self.vocab) - self.wg_dim, self.w2v_hidden_dim), init=C.glorot_uniform(), name='TrainableE')

    def func(wg, wn):
        return C.times(wg, glove) + C.times(wn, nonglove)

    return func
def blocked(d):
    blocked_W = C.parameter((-1, d), init=C.glorot_uniform())

    @C.layers.BlockFunction('', '')
    def func(x):
        return C.optimized_rnnstack(x, blocked_W, d, 1, recurrent_op='lstm')
    return func
def simple_mnist():
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), feature)

    # z = Sequential([
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(hidden_layers_dim, activation=relu),
    #     Dense(num_output_classes)])(scaled_input)

    with default_options(activation=relu, init=C.glorot_uniform()):
        z = Sequential([For(range(num_hidden_layers),
                            lambda i: Dense(hidden_layers_dim)),
                        Dense(num_output_classes, activation=None)])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    # setup the data
    path = os.path.join(abs_path, "Train-28x28_cntk_text.txt")

    reader_train = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    progress_writers = [ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    # Instantiate the trainer object to drive the model training
    lr = learning_rate_schedule(1, UnitType.sample)
    trainer = Trainer(z, (ce, pe), [adadelta(z.parameters, lr)], progress_writers)

    training_session(
        trainer=trainer,
        mb_source=reader_train,
        mb_size=minibatch_size,
        model_inputs_to_streams=input_map,
        max_samples=num_samples_per_sweep * num_sweeps_to_train_with,
        progress_frequency=num_samples_per_sweep
    ).train()

    # Load test data
    path = os.path.join(abs_path, "Test-28x28_cntk_text.txt")

    reader_test = MinibatchSource(CTFDeserializer(path, StreamDefs(
        features=StreamDef(field='features', shape=input_dim),
        labels=StreamDef(field='labels', shape=num_output_classes))))

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size, input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
def charcnn(self, x):
    conv_out = C.layers.Sequential([
        C.layers.Embedding(self.char_emb_dim),
        C.layers.Dropout(self.dropout),
        C.layers.Convolution2D((5, self.char_emb_dim),
                               self.convs,
                               activation=C.relu,
                               init=C.glorot_uniform(),
                               bias=True,
                               init_bias=0,
                               name='charcnn_conv')])(x)
    return C.reduce_max(conv_out, axis=1)  # workaround cudnn failure in GlobalMaxPooling