def HighwayBlock(dim,  # ideally this would be inferred, but times does not allow an inferred x inferred parameter for now
                 transform_weight_initializer=0,
                 transform_bias_initializer=0,
                 update_weight_initializer=0,
                 update_bias_initializer=0,
                 name=''):
    WT = C.Parameter((dim, dim,), init=transform_weight_initializer, name=name + '_WT')
    bT = C.Parameter(dim, init=transform_bias_initializer, name=name + '_bT')
    WU = C.Parameter((dim, dim,), init=update_weight_initializer, name=name + '_WU')
    bU = C.Parameter(dim, init=update_bias_initializer, name=name + '_bU')

    @C.Function
    def func(x_var):
        x = C.placeholder()
        transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
        update = C.relu(C.times(x, WU, name=name + '_U') + bU)
        return C.as_block(
            x + transform_gate * (update - x),  # t(x)*u(x) + (1 - t(x))*x
            [(x, x_var)],
            'HighwayBlock',
            'HighwayBlock' + name)

    return func
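# A minimal usage sketch for HighwayBlock above (not from the original source);
# the dimension 64 and the random input are illustrative.
import numpy as np
import cntk as C

x = C.input_variable(64)
hw = HighwayBlock(64, name='hw0')(x)
assert hw.shape == (64,)
print(hw.eval({x: np.random.rand(2, 64).astype(np.float32)}))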
def test_BatchNormalization(tmpdir):
    dtype = np.float32

    sample = [  # 5 samples, each with 4 classes
        [1, 1, 2, 3],
        [0, 0, 0, 0],
        [3, 3, 4, 4],
        [1000, 1000, 1000, 1000],
        [10000, 10000, 10000, 10000]]

    epsilon = 0.00001
    t = np.asarray(sample, dtype=dtype).reshape(-1, 1)
    mean = 1
    var = 2
    init_scale = 3
    init_bias = 4

    scale = C.Parameter(init=np.asarray([init_scale], dtype=dtype), dtype=dtype)
    bias = C.Parameter(init=np.asarray([init_bias], dtype=dtype), dtype=dtype)
    run_mean = C.ops.constant(mean, shape=(1), dtype=dtype)
    run_variance = C.ops.constant(var, shape=(1), dtype=dtype)
    run_count = C.ops.constant(0, dtype=dtype)

    a = C.input_variable(shape=(1), dtype=dtype, needs_gradient=False, name='a')

    op_node = C.batch_normalization(a, scale, bias, run_mean, run_variance,
                                    running_count=run_count, spatial=False,
                                    epsilon=epsilon)

    verify_one_input(op_node, t, tmpdir, 'BatchNormalization')
def create_model(self):
    w = cntk.Parameter((self.number_features, self.number_labels),
                       init=cntk.glorot_uniform(), name='W')
    b = cntk.Parameter((self.number_labels,), init=0, name='b')
    self.model = cntk.times(self.input_transform, w) + b
def cross_entropy_with_sampled_softmax(
        hidden_vector,           # Node providing the output of the recurrent layers
        target_vector,           # Node providing the expected labels (as sparse vectors)
        vocab_dim,               # Vocabulary size
        hidden_dim,              # Dimension of the hidden vector
        num_samples,             # Number of samples to use for sampled softmax
        sampling_weights,        # Node providing weights to be used for the weighted sampling
        allow_duplicates=False   # Whether to sample with replacement (allow_duplicates == True) or without
        ):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    sample_selector_sparse = C.random_sample(
        sampling_weights, num_samples, allow_duplicates)  # sparse matrix [num_samples * vocab_size]
    if use_sparse:  # `use_sparse` is a module-level flag in the surrounding script
        sample_selector = sample_selector_sparse
    else:
        # Note: Sampled softmax with dense data is only supported for debugging purposes.
        # It might easily run into memory issues as the matrix 'I' below might be quite large.
        # In case we want a dense representation for all data we have to convert the sample selector.
        I = C.Constant(np.eye(vocab_dim, dtype=np.float32))
        sample_selector = C.times(sample_selector_sparse, I)

    inclusion_probs = C.random_sample_inclusion_frequency(
        sampling_weights, num_samples, allow_duplicates)  # dense row [1 * vocab_size]
    log_prior = C.log(inclusion_probs)  # dense row [1 * vocab_dim]

    print("hidden_vector: " + str(hidden_vector.shape))
    wS = C.times(sample_selector, weights, name='wS')  # [num_samples * hidden_dim]
    print("wS: " + str(wS.shape))
    zS = C.times_transpose(wS, hidden_vector, name='zS1') \
        + C.times(sample_selector, bias, name='zS2') \
        - C.times_transpose(sample_selector, log_prior, name='zS3')  # [num_samples]

    # Getting the weight vector for the true label. Dimension hidden_dim.
    wT = C.times(target_vector, weights, name='wT')  # [1 * hidden_dim]
    zT = C.times_transpose(wT, hidden_vector, name='zT1') \
        + C.times(target_vector, bias, name='zT2') \
        - C.times_transpose(target_vector, log_prior, name='zT3')  # [1]

    zSReduced = C.reduce_log_sum_exp(zS)

    # Compute the cross entropy that is used for training.
    # We don't check whether any of the classes in the random samples coincides with the true
    # label, so the true class may be counted twice in the normalizing denominator of sampled softmax.
    cross_entropy_on_samples = C.log_add_exp(zT, zSReduced) - zT

    # For applying the model we also output a node providing the input for the full softmax.
    z = C.times_transpose(weights, hidden_vector) + bias
    z = C.reshape(z, shape=(vocab_dim))

    zSMax = C.reduce_max(zS)
    error_on_samples = C.less(zT, zSMax)

    return (z, cross_entropy_on_samples, error_on_samples)
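# Hedged usage sketch for cross_entropy_with_sampled_softmax; all dimensions
# are illustrative, `use_sparse` must be set as a module-level flag, and the
# uniform sampling weights below stand in for real word frequencies.
import numpy as np
import cntk as C

use_sparse = True
vocab_dim, hidden_dim, num_samples = 1000, 300, 25

hidden = C.sequence.input_variable(hidden_dim)
target = C.sequence.input_variable(vocab_dim, is_sparse=True)
# Sampling weights are expected as a dense row vector [1 x vocab_dim].
sampling_weights = C.reshape(C.Constant(np.ones(vocab_dim, dtype=np.float32)), (1, vocab_dim))

z, ce, err = cross_entropy_with_sampled_softmax(
    hidden, target, vocab_dim, hidden_dim, num_samples, sampling_weights)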
def func(x_var):
    x = C.placeholder()
    WT = C.Parameter((dim, dim,), init=transform_weight_initializer, name=name + '_WT')
    bT = C.Parameter(dim, init=transform_bias_initializer, name=name + '_bT')
    WU = C.Parameter((dim, dim,), init=update_weight_initializer, name=name + '_WU')
    bU = C.Parameter(dim, init=update_bias_initializer, name=name + '_bU')
    transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
    update = C.relu(C.times(x, WU, name=name + '_U') + bU)
    return C.as_block(
        x + transform_gate * (update - x),
        [(x, x_var)],
        'HighwayBlock',
        'HighwayBlock' + name)
def BiRecurrence(step_function: C.Function,
                 initial_state=0,
                 dropout_rate_input=None,
                 dropout_rate_output=None,
                 weight_tie: bool = False,
                 seed=SentinelValueForAutoSelectRandomSeed,
                 name=''):
    """
    Wrapper to create a bidirectional rnn

    Also comes with the option to halve the number of parameters required by a
    bidirectional recurrent layer. This is done by using only one recurrent unit
    for both the forward and backward computation instead of the usual two.
    A forward and a backward token are used to initialise the hidden state so that
    the recurrent unit can tell the directionality.

    More details can be found in the paper 'Efficient Bidirectional Neural Machine
    Translation' (https://arxiv.org/abs/1908.09329)

    Example:
        a = C.sequence.input_variable(10)
        b = BiRecurrence(LSTM(100), weight_tie=True)(a)

        assert b.shape == (200, )

    Arguments:
        step_function (:class:`~cntk.ops.functions.Function` or equivalent Python function):
         This function must have N+1 inputs and N outputs, where N is the number of
         state variables (typically 1 for GRU and plain RNNs, and 2 for LSTMs).
        initial_state: initial state shared by the forward and backward recurrences
        dropout_rate_input: variational dropout on input
        dropout_rate_output: variational dropout on output
        weight_tie (bool): whether to use only one recurrent function for computation in both directions.
        seed (int): seed for randomisation
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function that accepts one argument (which must be a sequence) and performs
        the recurrent operation on it
    """
    fxn1 = step_function
    fxn2 = step_function.clone(C.CloneMethod.clone, {}) if not weight_tie else fxn1

    forward_token = initial_state
    backward_token = initial_state
    if weight_tie:
        forward_token = C.Parameter(shape=(-1,), init=C.glorot_normal(), name='f_token')
        backward_token = C.Parameter(shape=(-1,), init=C.glorot_normal(), name='b_token')

    forward = Recurrence(fxn1, dropout_rate_input=dropout_rate_input,
                         dropout_rate_output=dropout_rate_output,
                         initial_state=forward_token, seed=seed)
    backward = Recurrence(fxn2, dropout_rate_input=dropout_rate_input,
                          dropout_rate_output=dropout_rate_output,
                          initial_state=backward_token, seed=seed, go_backwards=True)

    @C.Function
    def inner(x):
        output = C.splice(forward(x), backward(x), axis=-1)
        return C.layers.Label(name)(output) if name else output

    return inner
def test_initializer_scale():
    # this should work fine:
    p = C.Parameter(shape=(1,), init=initializer.uniform(1))

    with pytest.raises(ValueError) as excinfo:
        name = 'uniform_zero'
        p = C.Parameter(shape=(1,), init=initializer.uniform(0), name=name)
    assert 'CreateInitializer' in str(excinfo.value)
    assert name in str(excinfo.value)

    with pytest.raises(ValueError) as excinfo:
        name = 'glorot_negative_one'
        p = C.Parameter(shape=(1,), init=initializer.glorot_uniform(-1), name=name)
    assert 'CreateInitializer' in str(excinfo.value)
    assert name in str(excinfo.value)
def test_large_model_serialization_double(tmpdir):
    import os

    two_gb = 2**31
    type_size = np.dtype(np.float64).itemsize
    size = two_gb // type_size + 10  # integer division, so the shape stays an int
    assert size * type_size > two_gb

    device = C.device.cpu()
    i = C.sequence.input(size, dtype=np.float64)
    w = C.Parameter((size,), dtype=np.float64,
                    init=C.uniform(3.0, seed=12345), device=device)
    z = C.times(i, w)

    filename = str(tmpdir / 'test_large_model_serialization_double.out')
    z.save(filename)
    assert os.path.getsize(filename) > two_gb

    y = C.Function.load(filename, device=device)
    assert len(z.parameters) == len(y.parameters)

    for param_pair in zip(z.parameters, y.parameters):
        assert param_pair[0].shape == param_pair[1].shape
        assert np.allclose(param_pair[0].value, param_pair[1].value)
def build(self):
    self.input_kernel = C.Parameter(shape=(self._input_size, self._hidden_dim),
                                    init=self._input_initializer)
    self.recur_kernel = C.Parameter(shape=(self._hidden_dim,),
                                    init=self._recurrent_initializer)
    self.bias = C.Parameter(shape=(self._hidden_dim,), init=0)

    @C.Function
    def runit(h, x):
        ht = self._activation(C.times(x, self.input_kernel)
                              + h * self.recur_kernel + self.bias)
        return ht

    return runit
def simi_attention(self, input, memory):
    '''
    return:
        memory-weighted vectors over input [#,c][d]
        weight
    '''
    input_ph = C.placeholder()  # [#,c][d]
    mem_ph = C.placeholder()    # [#,q][d]

    input_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    mem_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    bias = C.Parameter(shape=(2 * self.hidden_dim,), init=0.0)
    weight_dense = Dense(1, bias=False, input_rank=1)

    proj_inp = input_dense(input_ph)  # [#,c][d]
    proj_mem = mem_dense(mem_ph)      # [#,q][d]
    unpack_memory, mem_mask = C.sequence.unpack(proj_mem, 0).outputs  # [#][*=q, d], [#][*=q]
    expand_mem = C.sequence.broadcast_as(unpack_memory, proj_inp)  # [#,c][*=q,d]
    expand_mask = C.sequence.broadcast_as(mem_mask, proj_inp)      # [#,c][*=q]
    matrix = C.reshape(weight_dense(C.tanh(proj_inp + expand_mem + bias)), (-1,))  # [#,c][*=q]
    matrix = C.element_select(expand_mask, matrix, -1e30)
    logits = C.softmax(matrix, axis=0)  # [#,c][*=q]
    weight_mem = C.reduce_sum(C.reshape(logits, (-1, 1)) * expand_mem, axis=0)  # [#,c][d]
    weight_mem = C.reshape(weight_mem, (-1,))

    return C.as_block(C.combine(weight_mem, logits),
                      [(input_ph, input), (mem_ph, memory)],
                      'simi_attention', 'simi_attention')
def convolution(input, name, **kwargs):
    dim = __weights_dict[name]['weights'].ndim

    # Move the spatial dimensions behind (out_channels, in_channels).
    weight = np.transpose(__weights_dict[name]['weights'],
                          [dim - 1, dim - 2] + list(range(0, dim - 2)))
    w = cntk.Parameter(init=weight, name=name + '_weight')

    input = cntk.transpose(input, [dim - 2] + list(range(0, dim - 2)))
    layer = ops.convolution(w, input, **kwargs)
    if 'bias' in __weights_dict[name]:
        bias = np.reshape(__weights_dict[name]['bias'], [-1] + [1] * (dim - 2))
        b = cntk.Parameter(init=bias, name=name + '_bias')
        layer = layer + b
    layer = cntk.transpose(layer, list(range(1, dim - 1)) + [0])
    return layer
def cross_entropy_with_full_softmax(
        hidden_vector,  # Node providing the output of the recurrent layers
        target_vector,  # Node providing the expected labels (as sparse vectors)
        vocab_dim,      # Vocabulary size
        hidden_dim      # Dimension of the hidden vector
        ):
    bias = C.Parameter(shape=(vocab_dim, 1), init=0)
    weights = C.Parameter(shape=(vocab_dim, hidden_dim), init=C.initializer.glorot_uniform())

    z = C.reshape(C.times_transpose(weights, hidden_vector) + bias, (1, vocab_dim))
    zT = C.times_transpose(z, target_vector)
    ce = C.reduce_log_sum_exp(z) - zT
    zMax = C.reduce_max(z)
    error_on_samples = C.less(zT, zMax)

    return (z, ce, error_on_samples)
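# Hypothetical usage sketch for cross_entropy_with_full_softmax; the
# dimensions below are illustrative only.
import cntk as C

vocab_dim, hidden_dim = 1000, 300
hidden = C.sequence.input_variable(hidden_dim)
target = C.sequence.input_variable(vocab_dim, is_sparse=True)
z, ce, errs = cross_entropy_with_full_softmax(hidden, target, vocab_dim, hidden_dim)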
def build(self, require_train=False):
    gamma = C.Parameter(1, init=1)
    scales = C.Parameter(3, init=C.glorot_uniform(), name='scales')
    encoder = self.encoder_fac.build()
    bilm = self.bilm_fac.build()

    @C.Function
    def _func(x):
        ph = C.placeholder()

        first_out = encoder(ph)
        second_out, third_out = bilm(first_out).outputs  # [#,*][1024]
        dup_first_out = C.splice(first_out, first_out)   # [#,*][1024]

        s = C.softmax(scales)
        out = gamma * (s[0] * dup_first_out + s[1] * second_out + s[2] * third_out)
        return C.as_block(out, [(ph, x)], 'Elmo', 'Elmo')

    return _func
def test_custom_op_with_int8_params(tmpdir):
    model_file = str(tmpdir / 'test_model_params.bin')
    delete_if_file_exists(model_file)

    W1 = C.Parameter((1, 42), dtype=np.int8)
    W1.value = np.arange(42).reshape(1, 42)
    W2 = C.Parameter((1, 42), dtype=np.int8)
    W3 = C.Parameter((1, 42), dtype=np.float)
    X = C.input_variable((1, 42), dtype=np.float)

    # custom_op, output_shape, output_data_type, *operands, **kw_name
    z = C.custom_proxy_op("times", (21, 2), np.int8, X, W1, W2, W3, name="custom_proxy")

    z.save(model_file)
    newz = C.load_model(model_file)
    assert newz.parameters[0].shape == (1, 42)
    assert newz.output.shape == (21, 2)
    assert np.array_equal(W1.value, newz.parameters[0].value)
def resblock_basic(inp, num_filters):
    c1 = C.layers.Convolution((3, 3), num_filters, init=C.he_normal(),
                              pad=True, bias=False)(inp)
    c1 = C.layers.BatchNormalization(map_rank=1)(c1)
    c1 = C.param_relu(C.Parameter(c1.shape, init=C.he_normal()), c1)

    c2 = C.layers.Convolution((3, 3), num_filters, init=C.he_normal(),
                              pad=True, bias=False)(c1)
    c2 = C.layers.BatchNormalization(map_rank=1)(c2)
    return inp + c2
def SRResNet(h0):
    print('Generator input shape: ', h0.shape)

    with C.layers.default_options(init=C.he_normal(), bias=False):
        h1 = C.layers.Convolution((9, 9), 64, pad=True)(h0)
        h1 = C.param_relu(C.Parameter(h1.shape, init=C.he_normal()), h1)

        h2 = resblock_basic_stack(h1, 16, 64)

        h3 = C.layers.Convolution((3, 3), 64, activation=None, pad=True)(h2)
        h3 = C.layers.BatchNormalization(map_rank=1)(h3)

        h4 = h1 + h3  # skip connection over the residual stack

        h5 = C.layers.ConvolutionTranspose2D((3, 3), 64, pad=True, strides=(2, 2),
                                             output_shape=(224, 224))(h4)
        h5 = C.param_relu(C.Parameter(h5.shape, init=C.he_normal()), h5)

        h6 = C.layers.Convolution((3, 3), 3, pad=True)(h5)
        return h6
def func(x_var):
    x = C.placeholder()
    WT = C.Parameter((dim, dim,), init=transform_weight_initializer, name=name + '_WT')
    bT = C.Parameter(dim, init=transform_bias_initializer, name=name + '_bT')
    WU = C.Parameter((dim, dim,), init=update_weight_initializer, name=name + '_WU')
    bU = C.Parameter(dim, init=update_bias_initializer, name=name + '_bU')
    transform_gate = C.sigmoid(C.times(x, WT, name=name + '_T') + bT)
    update = C.tanh(C.times(x, WU, name=name + '_U') + bU)
    return C.as_block(update * transform_gate + (1 - transform_gate) * x,
                      [(x, x_var)],
                      'SingleInner', 'SingleInner' + name)
def InstanceNormalization(num_channel,
                          initial_scale=1,
                          initial_bias=0,
                          epsilon=C.default_override_or(0.00001),
                          name=''):
    """
    Instance Normalization (Ulyanov et al., 2016): normalises each channel of each
    sample over its spatial dimensions, then applies a learned per-channel scale and bias.
    """
    epsilon = C.get_default_override(InstanceNormalization, epsilon=epsilon)
    dtype = C.get_default_override(None, dtype=C.default_override_or(np.float32))

    scale = C.Parameter(num_channel, init=initial_scale, name='scale')
    bias = C.Parameter(num_channel, init=initial_bias, name='bias')
    epsilon = np.asarray(epsilon, dtype=dtype)

    @C.BlockFunction('InstanceNormalization', name)
    def instance_normalization(x):
        mean = C.reduce_mean(x, axis=(1, 2))
        x0 = x - mean
        std = C.sqrt(C.reduce_mean(x0 * x0, axis=(1, 2)))
        if epsilon != 0:
            std += epsilon
        x_hat = x0 / std
        return x_hat * C.reshape(scale, (-1, 1, 1)) + C.reshape(bias, (-1, 1, 1))

    return instance_normalization
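# Hypothetical usage sketch: instance-normalise a CHW image tensor.
# The (3, 32, 32) shape is illustrative; InstanceNormalization is defined above.
import cntk as C

x = C.input_variable((3, 32, 32))
inorm = InstanceNormalization(3)(x)
assert inorm.shape == (3, 32, 32)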
def build(self):
    input_kernel = C.Parameter(shape=(self._input_size, self._hidden_dim),
                               init=self._input_initializer)
    recur_kernel = C.Parameter(shape=(self._hidden_dim,),
                               init=self._recurrent_initializer)
    bias = C.Parameter(shape=(self._hidden_dim,), init=0)

    if self._recurrent_min_abs > 0:
        # Clamp the magnitude of the recurrent weights from below, keeping their sign.
        abs_kernel = C.abs(recur_kernel)
        min_abs_kernel = C.element_max(abs_kernel, self._recurrent_min_abs)
        recur_kernel = min_abs_kernel * C.element_select(
            C.greater_equal(recur_kernel, C.constant(0)),
            C.constant(1), C.constant(-1))

    if self._recurrent_max_abs:
        recur_kernel = C.clip(recur_kernel,
                              -self._recurrent_max_abs, self._recurrent_max_abs)

    @C.Function
    def runit(h, x):
        h_t = C.times(x, input_kernel) + bias + recur_kernel * h
        return h_t

    return runit
def MyBLSTMLayer(hidden_size=128, num_layers=2):
    W = C.Parameter((C.InferredDimension, hidden_size),
                    init=C.he_normal(1.0), name='rnn_parameters')

    def _func(operand):
        return C.optimized_rnnstack(operand, weights=W, hidden_size=hidden_size,
                                    num_layers=num_layers, bidirectional=True,
                                    recurrent_op='lstm')
    return _func
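# Hypothetical usage sketch; the feature dimension 80 is illustrative. Note
# that C.optimized_rnnstack is backed by cuDNN and therefore requires a GPU.
import cntk as C

features = C.sequence.input_variable(80)
blstm = MyBLSTMLayer(hidden_size=128, num_layers=2)(features)
# The bidirectional output concatenates both directions: 2 * hidden_size dims.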
def test_parameter_set_value():
    p = C.Parameter(shape=(2, 3), init=1)
    n = np.random.randn(2, 3)
    p.value = n
    assert np.all(p.value == n.astype(p.dtype))

    n = np.reshape(np.arange(6), (2, 3))
    p.value = n
    op = plus(p, p)
    state, output = op.forward({}, op.outputs, op.outputs)
    value = output[op.output]
    assert np.all(value == 2 * n.astype(p.dtype))

    p.value = C.internal.sanitize_value(p.shape, 1.0, np.float32, None)
    assert np.all(p.value == np.ones((2, 3)))
def test_saving_and_loading_int8_ndarray_as_attribute(tmpdir):
    model_file = str(tmpdir / 'test_model.bin')
    delete_if_file_exists(model_file)

    data = np.arange(0, 64, dtype=np.int8).reshape(16, 4)
    dict_val = C._to_cntk_dict_value(data)

    W = C.Parameter((C.InferredDimension, 42), init=C.glorot_uniform(), dtype=np.float)
    x = C.input_variable(12, dtype=np.float)
    y = C.times(x, W)
    y.custom_attributes = {'int8_nd': dict_val}
    y.save(model_file)

    assert os.path.isfile(model_file)

    z = C.load_model(model_file)
    int8_data = z.custom_attributes['int8_nd']
    assert int8_data.shape == (16, 4)
    assert np.array_equal(int8_data, data)
def test_recurrence_with_udf_without_layers():
    name = "SimpleUdf"

    def udf(a):
        return C.user_function(SimpleUdf(a, name=name))

    # input variable and the data
    x = C.sequence.input_variable(needs_gradient=True, shape=(2,))
    x0 = np.reshape(np.arange(16.0, dtype=np.float32), (2, 4, 2))
    print(x0)

    # create a recurrent loop
    p = C.placeholder(shape=(2,))
    past = C.sequence.past_value(p)
    z = udf(x) * udf(past) + C.Parameter((2,), init=[1, 1])
    z.replace_placeholders({p: z.outputs[0]})
    # C.logging.graph.plot(z, "recurrent.pdf")

    out = z.eval({x: x0})
    print(out)
    expected_out = [
        np.array([1, 1, 3, 4, 13, 21, 79, 148], dtype=np.float32).reshape(4, 2),
        np.array([1, 1, 11, 12, 133, 157, 1863, 2356], dtype=np.float32).reshape(4, 2)
    ]
    assert np.array_equal(out, expected_out)

    gradient, result = z.grad({x: x0}, wrt=[x], outputs=[z.output])
    print(result)
    assert np.array_equal(result, expected_out)

    expected_grad = [
        np.array([0, 0, 29, 41, 21, 32, 13, 21], dtype=np.float32).reshape(4, 2),
        np.array([0, 0, 181, 209, 165, 192, 133, 157], dtype=np.float32).reshape(4, 2)
    ]
    print(gradient)
    assert np.array_equal(gradient, expected_grad)
def binary_convolution(filter_shape,
                       num_filters=1,
                       channels=1,
                       init=C.glorot_uniform(),
                       pad=False,
                       strides=1,
                       name='BinaryConvolution'):
    '''
    Creates a binary convolution function based on the input parameters.

    Args:
        filter_shape : shape of the filter
        num_filters  : number of filters to use
        channels     : number of input channels
        init         : initialization function for the filter
        pad          : padding enabled or not for the filter
        strides      : overlap for this filter
        name         : name given to the binary convolution

    Returns:
        a function for performing binary convolution
    '''
    kernel_shape = (num_filters, channels) + filter_shape
    W = C.Parameter(shape=kernel_shape, init=init, name="filter")

    def convolution(operand):
        bcv_operand_p = C.placeholder(operand.shape, operand.dynamic_axes, name="operand")
        bcv = C.convolution(
            CustomMultibit(W, 1),
            CustomMultibit(bcv_operand_p, 1),
            auto_padding=[False, pad, pad],
            strides=[strides])
        return C.as_block(bcv, [(bcv_operand_p, operand)], name)

    return convolution
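# Hypothetical usage sketch. CustomMultibit is a user-defined binarisation op
# from the original sample and must be in scope; the shapes below are illustrative.
import cntk as C

x = C.input_variable((1, 28, 28))
bconv = binary_convolution((3, 3), num_filters=32, channels=1, pad=True)(x)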
def test_large_model_serialization_float(tmpdir):
    import os
    from cntk.layers import Recurrence, LSTM, Dense

    type_size = np.dtype(np.float32).itemsize
    two_gb = 2**31
    size = (2097152 + 4, 256, 512, 4096)
    assert size[0] * size[1] * type_size > two_gb

    device = C.device.cpu()
    i = C.sequence.input(size[0])
    w = C.Parameter((size[0], size[1]), init=C.uniform(3.0, seed=12345), device=device)
    e = C.times(i, w)

    h_fwd = Recurrence(LSTM(size[2]))(e)
    h_bwd = Recurrence(LSTM(size[2]), go_backwards=True)(e)

    h_last_fwd = C.sequence.last(h_fwd)
    h_first_bwd = C.sequence.first(h_bwd)
    t = C.splice(h_last_fwd, h_first_bwd)

    z1 = Dense(size[2], activation=C.relu)(t)
    z = Dense(2, activation=None)(z1)

    filename = str(tmpdir / 'test_large_model_serialization_float.out')
    z.save(filename)
    assert os.path.getsize(filename) > two_gb

    y = C.Function.load(filename, device=device)
    assert len(z.parameters) == len(y.parameters)

    for param_pair in zip(z.parameters, y.parameters):
        assert param_pair[0].shape == param_pair[1].shape
        assert np.allclose(param_pair[0].value, param_pair[1].value)
import cntk as C
import numpy as np

dataset_size = 200000

X = np.random.rand(dataset_size, 2)
labels = np.zeros((dataset_size, 3))
labels[X[:, 0] > X[:, 1]] = [0, 0, 1]
labels[X[:, 0] <= X[:, 1]] = [1, 0, 0]
labels[X[:, 1] + X[:, 0] > 1] = [0, 1, 0]

init = C.initializer.normal(0.01)
theta1 = C.Parameter(shape=(2, 12), init=init)
bias1 = C.Parameter(shape=(1, 12), init=init)
theta2 = C.Parameter(shape=(12, 3), init=init)
bias2 = C.Parameter(shape=(1, 3), init=init)

x = C.input_variable(shape=(2,), needs_gradient=False)
t = C.input_variable(shape=(3,), needs_gradient=False)


def forward(x):
    y = C.times(x, theta1) + C.squeeze(bias1, 0)
    y = C.element_max(y, 0.)
    return C.times(y, theta2) + C.squeeze(bias2, 0)
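# Hedged continuation sketch (not from the original source): wire the forward
# pass into a cross-entropy loss and evaluate it on a small minibatch.
z = forward(x)
loss = C.cross_entropy_with_softmax(z, t)
print(loss.eval({x: X[:4].astype(np.float32),
                 t: labels[:4].astype(np.float32)}))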
    # Our model expects float32 features, and cross-entropy expects one-hot encoded labels.
    Y = scipy.sparse.csr_matrix((np.ones(N, np.float32), (range(N), Y)), shape=(N, num_classes))
    X = X.astype(np.float32)
    return X, Y

X_train, Y_train = generate_synthetic_data(20000)
X_test, Y_test = generate_synthetic_data(1024)

# Define the CNTK model function. The model function maps input data to
# predictions (here: 2-dimensional inputs --> 2 scores).
# This simple logistic-regression model just uses a linear transform.
data = cntk.input_variable(input_dim)
W = cntk.Parameter((input_dim, num_classes), init=cntk.glorot_uniform(), name='W')
b = cntk.Parameter((num_classes,), init=0, name='b')
model = cntk.times(data, W) + b

# Define the CNTK criterion function. A criterion function maps
# (input vectors, labels) to a loss and an optional additional metric.
# The loss is used to train the model parameters. We use cross entropy as the loss.
label_one_hot = cntk.input_variable(num_classes, is_sparse=True)
loss = cntk.cross_entropy_with_softmax(model, label_one_hot)  # applies softmax to model's output under the hood
metric = cntk.classification_error(model, label_one_hot)
criterion = cntk.combine([loss, metric])  # criterion is a tuple-valued function (loss, metric)
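# Hedged sketch of how this criterion might be trained; the learning rate and
# logging frequency are illustrative, not taken from the original tutorial.
learner = cntk.sgd(model.parameters,
                   cntk.learning_rate_schedule(0.1, cntk.UnitType.minibatch))
progress_writer = cntk.logging.ProgressPrinter(50)
criterion.train((X_train, Y_train), parameter_learners=[learner],
                callbacks=[progress_writer])
# Evaluate the metric (classification error) on held-out data.
test_metric = criterion.test((X_test, Y_test)).metric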
def test_parameter_value(value):
    c = C.Parameter(init=value)
    assert np.allclose(c.value, value)
def main():
    print("version", C.__version__)
    bs = 1
    n_chans = 1

    sigma_s = 16
    sigma_r = 12

    # 4x4x1024x1024
    # 4x12x64x64

    sz = 256
    # sz = 1024
    small_sz = sz // sigma_s

    yy, xx = np.meshgrid(np.arange(0, sz), np.arange(0, sz))
    cc, bb = np.meshgrid(np.arange(0, n_chans), np.arange(0, bs))

    xx = np.expand_dims(xx, 0)
    xx = np.expand_dims(xx, 0)
    yy = np.expand_dims(yy, 0)
    yy = np.expand_dims(yy, 0)

    bb = np.expand_dims(bb, 2)
    bb = np.expand_dims(bb, 3)
    cc = np.expand_dims(cc, 2)
    cc = np.expand_dims(cc, 3)

    # Compute graph
    grid = C.Parameter([bs, n_chans, sigma_r, small_sz, small_sz])
    # grid = C.input_variable(
    #     [bs, n_chans, sigma_r, small_sz, small_sz],
    #     dynamic_axes=[], needs_gradient=True)
    guide = C.input_variable([bs, sz, sz], dynamic_axes=[], needs_gradient=True)
    guide_non_diff = C.input_variable([bs, sz, sz], dynamic_axes=[])

    # Coordinates
    xx = C.Constant(xx, xx.shape)
    yy = C.Constant(yy, yy.shape)
    cc = C.Constant(cc, cc.shape)
    bb = C.Constant(bb, bb.shape)

    gx_d, gy_d, gz_d, fx_d, fy_d, fz_d, _, _, _ = grid_coord(
        guide, xx, yy, sz, small_sz, sigma_r, bs)

    # Trilerp weights
    wx = (gx_d - 0.5 - fx_d)
    wy = (gy_d - 0.5 - fy_d)
    wz = C.abs(gz_d - 0.5 - fz_d)

    # Enclosing cell
    gx, gy, gz, fx, fy, fz, cx, cy, cz = grid_coord(
        guide_non_diff, xx, yy, sz, small_sz, sigma_r, bs)

    output_components = []
    for ix, x in enumerate([fx, cx]):
        wx_ = (1 - wx) if ix == 0 else wx
        for iy, y in enumerate([fy, cy]):
            wy_ = (1 - wy) if iy == 0 else wy
            for iz, z in enumerate([fz, cz]):
                wz_ = (1 - wz) if iz == 0 else wz
                linear_idx = x + small_sz * (y + small_sz * (z + sigma_r * (cc + n_chans * bb)))

                # Flatten data for gather op
                flat_grid = C.reshape(grid, [bs * small_sz * small_sz * sigma_r * n_chans])
                flat_linear_idx = C.reshape(linear_idx, [bs * n_chans * sz * sz])

                # Slice
                interp = C.gather(flat_grid, flat_linear_idx)
                interp_fsz = C.reshape(interp, [bs, n_chans, sz, sz])
                output_components.append(interp_fsz * wz_ * wx_ * wy_)

    out = sum(output_components)
    loss = C.squared_error(out, guide)

    # svg = C.logging.graph.plot(out, "/output/graph.svg")

    grid_data = np.random.uniform(
        size=(bs, n_chans, sigma_r, small_sz, small_sz)).astype(np.float32)

    # guide_data = np.random.uniform(size=(bs, sz, sz)).astype(np.float32)
    guide_data = skio.imread("/data/rgb.png").mean(2)[:sz, :sz].astype(np.float32)
    guide_data = np.expand_dims(guide_data, 0) / 255.0

    inputs = {guide: guide_data, guide_non_diff: guide_data}
def BilateralSlice(sz, i_chans, o_chans, grid_sz=64, sigma_r=8):
    gsize = [(i_chans + 1) * o_chans, sigma_r, grid_sz, grid_sz]
    grid = C.Parameter(gsize, name="grid", init=np.random.uniform(size=gsize))
    guide_scale = C.Parameter((1,), name="guide_scale", init=np.ones((1,)))
    grid_scale = C.Parameter((1,), name="grid_scale", init=np.ones((1,)))
    im_scale = C.Parameter((1,), name="im_scale", init=np.ones((1,)))

    yy, xx = np.meshgrid(np.arange(0, sz), np.arange(0, sz))
    xx = np.expand_dims(xx, 0)
    yy = np.expand_dims(yy, 0)
    cc = np.arange(0, i_chans + 1)
    cc = np.expand_dims(cc, 1)
    cc = np.expand_dims(cc, 2)
    xx = C.Constant(xx, xx.shape)
    yy = C.Constant(yy, yy.shape)
    cc = C.Constant(cc, cc.shape)

    @C.functions.BlockFunction("BilateralSlice", "bilateral_slice")
    def bilateral_slice(im, guide, guide_no_grad):
        # Flatten data for gather op
        flat_grid = grid_scale * C.reshape(grid, [grid_sz * grid_sz * sigma_r * o_chans * (i_chans + 1)])
        # flat_grid_u = C.unpack_batch(flat_grid)

        # Make sure we do something that requires the gradient w.r.t. the guide
        scaled_guide = guide_scale * guide
        gx_d, gy_d, gz_d, fx_d, fy_d, fz_d, _, _, _ = grid_coord(
            scaled_guide, xx, yy, sz, grid_sz, sigma_r)
        wx = C.abs(gx_d - 0.5 - fx_d)
        wy = C.abs(gy_d - 0.5 - fy_d)
        wz = C.abs(gz_d - 0.5 - fz_d)

        # Enclosing cell
        gx, gy, gz, fx, fy, fz, cx, cy, cz = grid_coord(
            guide_no_grad, xx, yy, sz, grid_sz, sigma_r)

        out_chans = []
        for chan in range(o_chans):
            output_components = []
            for ix, x in enumerate([fx, cx]):
                wx_ = (1 - wx) if ix == 0 else wx
                for iy, y in enumerate([fy, cy]):
                    wy_ = (1 - wy) if iy == 0 else wy
                    for iz, z in enumerate([fz, cz]):
                        wz_ = (1 - wz) if iz == 0 else wz

                        linear_idx = x + grid_sz * (y + grid_sz * (z + sigma_r * (cc + chan * (i_chans + 1))))
                        flat_linear_idx = C.reshape(linear_idx, [(i_chans + 1) * sz * sz])

                        # Slice
                        interp = C.gather(flat_grid, flat_linear_idx)
                        interp_fsz = C.reshape(interp, [i_chans + 1, sz, sz]) * wx_ * wy_ * wz_
                        output_components.append(interp_fsz)

            out_coeffs = sum(output_components)
            out_chan = C.reduce_sum(out_coeffs[:i_chans] * (im_scale * im) + out_coeffs[-1], 0)
            out_chans.append(out_chan)
        out = C.splice(*out_chans, axis=0)

        return out

    return bilateral_slice
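# Hypothetical usage sketch: assumes the grid_coord helper from the original
# script is in scope. Image size and channel counts are illustrative.
import cntk as C

slice_fn = BilateralSlice(256, i_chans=3, o_chans=3)
im = C.input_variable((3, 256, 256))
guide = C.input_variable((256, 256))
out = slice_fn(im, guide, guide)  # guide passed twice: once differentiable, once not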