def criterion(self):
    """Build the training loss and the classification-error metric.

    The margin term is ``labels * relu(0.9 - length)^2`` plus
    ``0.5 * (1 - labels) * relu(length - 0.1)^2``, summed over axis 0 and
    averaged over the default batch axis.  When ``self.use_reconstruction``
    is truthy, the mean squared difference between the flattened
    ``self.training_model`` and ``self.features`` is added with weight
    ``0.0005 * 784``.

    Returns:
        tuple: ``(total_loss, error)``.
    """
    down_weight = 0.5  # weight of the "class absent" half of the margin loss
    batch_axis = ct.axis.Axis.default_batch_axis()

    # Margin loss: one penalty for lengths below 0.9, one for lengths above 0.1.
    too_short = ct.reshape(ct.square(ct.relu(0.9 - self.length)), -1)
    too_long = ct.reshape(ct.square(ct.relu(self.length - 0.1)), -1)
    per_class = self.labels * too_short + down_weight * (1 - self.labels) * too_long
    margin_loss = ct.reduce_mean(ct.reduce_sum(per_class, axis=0), axis=batch_axis)

    # Classification error on the softmax of the lengths.
    probabilities = ct.softmax(self.length, axis=0)
    error = ct.classification_error(ct.reshape(probabilities, 10), self.labels)

    if not self.use_reconstruction:
        return margin_loss, error

    # Optional reconstruction penalty on the flattened tensors.
    target = ct.reshape(self.features, shape=(-1,))
    rebuilt = ct.reshape(self.training_model, shape=(-1,))
    reconstruction_err = ct.reduce_mean(ct.square(rebuilt - target), axis=0)
    reconstruction_err = ct.reduce_mean(reconstruction_err, axis=batch_axis)
    return margin_loss + (0.0005 * 784) * reconstruction_err, error
def true_density(z):
    """Return ``exp(-u(z))`` for a two-component input ``z``.

    The energy ``u`` combines a ring term, ``0.5 * ((|z| - 4) / 0.4)^2``,
    with the negative log of two Gaussian bumps in ``z[0]`` centred at
    ``+2`` and ``-2`` (width 0.8).
    """
    first, second = z[0], z[1]
    radius = C.sqrt(C.square(first) + C.square(second))
    bump_pos = C.exp(-0.5 * C.square((first - 2) / 0.8))
    bump_neg = C.exp(-0.5 * C.square((first + 2) / 0.8))
    ring = 0.5 * C.square(((radius - 4) / 0.4))
    energy = ring - C.log(bump_pos + bump_neg)
    return C.exp(-energy)
def model(seq_image, decoded):
    """Read an ``n`` x ``n`` patch from ``seq_image`` through Gaussian filters.

    The attention parameters come from ``attention_parameters(dense(decoded))``.
    ``n``, ``dense``, ``attention_parameters`` and ``image_height`` are taken
    from the enclosing scope.

    Returns:
        The attended patch, laid out as [#, *] [filters, n (x), n (y)].
    """
    g_x, g_y, sigma2, delta, gamma = attention_parameters(dense(decoded))

    # 1-based indices of the patch columns and rows
    patch_cols = C.Constant(np.arange(n) + 1, )
    patch_rows = C.Constant(np.arange(n) + 1, )

    # Filter centres along each direction.
    # centre_x: [#, *] [n, 1]
    # centre_y: [#, *] [n, 1]
    centre_x = C.expand_dims(g_x + (patch_cols - n / 2 - 0.5) * delta, axis=-1)
    centre_y = C.expand_dims(g_y + (patch_rows - n / 2 - 0.5) * delta, axis=-1)

    # unpacked: [#] [*image_width, filters, image_height]
    unpacked = C.sequence.unpack(seq_image, padding_value=0, no_mask_output=True)

    # Position of every step along the sequence (width) axis.
    # positions: [#, *] [1]
    positions = Cx.sequence.position(seq_image)
    # positions_unpacked: [#] [*image_width, 1]; padding gets a far-away index
    positions_unpacked = C.sequence.unpack(
        positions, padding_value=999_999, no_mask_output=True)

    # x_index: [#, *] [1, *image_width] -- x position index of the image
    x_index = C.sequence.broadcast_as(C.swapaxes(positions_unpacked), centre_x)
    # y_index: [] [1, image_height] -- y position index of the image
    y_index = C.Constant(np.arange(image_height).reshape((1, -1)))

    # Unnormalised Gaussian filters: which part of the image is attended.
    # filt_x: [#, *] [n, *image_width]
    # filt_y: [#, *] [n, image_height]
    filt_x = C.exp(-0.5 * C.square(x_index - centre_x) / sigma2)
    filt_y = C.exp(-0.5 * C.square(y_index - centre_y) / sigma2)

    # Normalise each filter by its sum over axis 1.
    filt_x = filt_x / C.reduce_sum(filt_x, axis=1)
    filt_y = filt_y / C.reduce_sum(filt_y, axis=1)

    # Combine the x and y filters with the image.
    image_per_step = C.sequence.broadcast_as(unpacked, filt_y)
    along_y = C.times_transpose(image_per_step, filt_y)
    # patch: [#, *] [n, filters, n]
    patch = gamma * C.times(filt_x, along_y, output_rank=2)

    # [#, *] [filters, n (x), n (y)]
    return C.swapaxes(patch)
def total_variation_loss(x):
    """Average the mean squared first differences of ``x``.

    ``x`` is reshaped to ``(1,) + x.shape`` and convolved without padding
    with a ``[-1, 1]`` kernel placed along the last axis and along the
    second-to-last axis.  The result is the mean of the two mean-squared
    responses.
    """
    step = np.array([-1, 1], dtype=np.float32)
    with_leading_axis = C.reshape(x, (1,) + x.shape)

    kernel_last = C.constant(value=step.reshape(1, 1, 1, 1, 2))
    kernel_prev = C.constant(value=step.reshape(1, 1, 1, 2, 1))

    diff_last = C.convolution(kernel_last, with_leading_axis, auto_padding=[False])
    diff_prev = C.convolution(kernel_prev, with_leading_axis, auto_padding=[False])

    energy_prev = C.reduce_mean(C.square(diff_prev))
    energy_last = C.reduce_mean(C.square(diff_last))
    return 0.5 * (energy_prev + energy_last)
def create_model(self):
    """Build the dense network with its MSE loss, learner and trainer.

    ``self._num_hidden_layers`` ReLU layers sized by ``self._hidden_layers``
    are stacked on ``self._input``, followed by a linear output layer of
    ``self._output_size`` units.  Loss and metric are both the squared error
    against ``self._output`` averaged over axis 0.

    Returns:
        tuple: ``(model, loss, learner, trainer)``.
    """
    def mean_squared_error(prediction):
        # Built once for the loss and once for the metric, as separate nodes.
        return C.reduce_mean(C.square(prediction - self._output), axis=0)

    layer_sizes = self._hidden_layers
    with C.layers.default_options(init=C.layers.glorot_uniform(),
                                  activation=C.ops.relu):
        net = self._input
        for layer_idx in range(self._num_hidden_layers):
            hidden = C.layers.Dense(layer_sizes[layer_idx])
            net = hidden(net)
        model = C.layers.Dense(self._output_size, activation=None)(net)

    loss = mean_squared_error(model)
    meas = mean_squared_error(model)
    learner = C.adadelta(model.parameters, self._lr_schedule)
    trainer = C.Trainer(model, (loss, meas), learner)
    return model, loss, learner, trainer
def __get_trainer_loss(self, model, action_count):
    """Create an SGD trainer that fits ``model`` to a Q-target input.

    Args:
        model: CNTK function whose output is compared with the target.
        action_count: shape of the float32 sequence input that holds the target.

    Returns:
        tuple: ``(trainer, loss)``.
    """
    target = C.sequence.input_variable(action_count, np.float32)

    def squared_error():
        # mean squared error over axis 0
        return C.reduce_mean(C.square(model - target), axis=0)

    loss = squared_error()
    metric = squared_error()

    # Plain SGD with a per-minibatch learning rate and gradient clipping.
    schedule = C.learning_rate_schedule(LEARNING_RATE, C.UnitType.minibatch)
    optimizer = C.sgd(model.parameters, schedule,
                      gradient_clipping_threshold_per_sample=10)
    return C.Trainer(model, (loss, metric), optimizer), loss
def create_model(self):
    """Build the dense network with its MSE loss, learner and trainer.

    ``self._num_hidden_layers`` ReLU layers sized by ``self._hidden_layers``
    are stacked on ``self._input``, followed by a linear output layer of
    ``self._output_size`` units.  The learner is AdaDelta with an L2
    regularisation weight of 0.01.

    Returns:
        tuple: ``(model, loss, learner, trainer)``.
    """
    def mean_squared_error(prediction):
        # Built once for the loss and once for the metric, as separate nodes.
        return cntk.reduce_mean(cntk.square(prediction - self._output), axis=0)

    widths = self._hidden_layers
    defaults = cntk.layers.default_options(init=cntk.layers.glorot_uniform(),
                                           activation=cntk.ops.relu)
    with defaults:
        net = self._input
        for layer_idx in range(self._num_hidden_layers):
            hidden = cntk.layers.Dense(widths[layer_idx], activation=cntk.ops.relu)
            net = hidden(net)
        model = cntk.layers.Dense(self._output_size, activation=None)(net)

    loss = mean_squared_error(model)
    meas = mean_squared_error(model)
    learner = cntk.adadelta(model.parameters, self._lr_schedule,
                            l2_regularization_weight=0.01)
    trainer = cntk.Trainer(model, (loss, meas), learner)
    return model, loss, learner, trainer
def window_weight(a, b, k, u):
    """Compute the window weight phi of a character sequence.

    Evaluates ``phi = sum over mixtures of a * exp(-b * (k - u)^2)``, summing
    over axis 0, and then swaps the axes of the result.

    Args:
        a: importance of each window within the mixture (not normalised).
        b: width of the attention window.
        k: location of the window.
        u: integer position of each item in the sequence.

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    squared_offset = C.square(k - u)
    per_mixture = a * C.exp(-1 * b * squared_offset)
    mixture_sum = C.reduce_sum(per_mixture, axis=0)  # collapse the mixture axis
    return C.swapaxes(mixture_sum)
def run_cntk(image_path, model_path):
    """Run gradient ascent on an image to amplify pooling-layer activations.

    The model at ``model_path`` is loaded and its pooling nodes are collected.
    The loss is a weighted, size-normalised sum of squared activations of the
    third and fourth pooling nodes.  The image at ``image_path`` is resized to
    224x224, scaled to ``[-1, 1]``, passed to ``gradient_ascent_cntk`` and
    converted back to an 8-bit image.

    Args:
        image_path: path of the image file to read with OpenCV.
        model_path: path of the saved CNTK model.

    Returns:
        The processed image as a ``uint8`` array in height/width/channel order.

    Raises:
        IndexError: if the model has fewer than four pooling nodes.
        cv2.error: if the image cannot be read.
    """
    import functools

    import cv2

    model = cntk.load_model(model_path)

    pool_nodes = list()
    for node in cntk.logging.depth_first_search(model, lambda x: True, depth=0):
        # Exact type match kept on purpose: the indices used below depend on
        # precisely which nodes pass this filter.
        if type(node) is cntk.ops.functions.Function:
            if 'Pooling' in str(node):
                pool_nodes.append(node)
                print(node)
    print(pool_nodes)

    if len(pool_nodes) < 4:
        # Same exception type the bare indexing would raise, with a usable message.
        raise IndexError(
            "expected at least 4 pooling nodes in the model, found {0}".format(
                len(pool_nodes)))

    # node contributions to the loss metric
    layer_contributions = {
        pool_nodes[2]: 1,
        pool_nodes[3]: 3,
    }

    # Define the loss: weighted mean of squared activations per layer.
    loss = None
    for layer, coeff in layer_contributions.items():
        activation = layer.output
        scaling = functools.reduce(lambda x, y: x * y, activation.shape)
        sum_squares = cntk.reduce_sum(cntk.square(activation))
        scaled_sum_squares = (coeff / scaling) * sum_squares
        if loss is None:
            loss = scaled_sum_squares
        else:
            loss += scaled_sum_squares

    # Swap the model input for one that requests gradients, freezing the weights.
    dream = cntk.input_variable(shape=model.arguments[0].shape,
                                needs_gradient=True, name='features')
    model = cntk.ops.combine(loss).clone(
        cntk.ops.CloneMethod.freeze,
        substitutions={model.arguments[0]: dream})

    step = 0.1  # Gradient ascent step size
    iterations = 5  # Number of ascent steps per scale

    # Load the image into a Numpy array
    img = cv2.imread(image_path)
    if img is None:
        # imread signals failure by returning None; report it here instead of
        # letting cv2.resize fail with an unrelated assertion message.
        raise cv2.error("could not read image: {0}".format(image_path))
    img = cv2.resize(img, (224, 224))
    img = img.astype(np.float32)
    img = np.transpose(img, (2, 0, 1))  # to channel-first layout
    img /= 127.5
    img -= 1

    img = gradient_ascent_cntk(model, img, iterations=iterations, step=step)

    # Undo the scaling and layout change.
    img = np.transpose(img, (1, 2, 0))
    img /= 2.
    img += 0.5
    img *= 255.
    img = np.clip(img, 0, 255).astype('uint8')
    return img
def test_gather_op(device_id, precision):
    """Check forward values, gradients and a training step of ``C.gather``.

    Covers gathering with a column of indices, indices scaled by a learnable
    parameter (whose gradient must be zero), a 2x2 index matrix, and a small
    sequence model that exercises memory reuse of the gather node.
    """
    dtype = PRECISION_TO_TYPE[precision]

    a_data = [AA([[0], [1]], dtype=dtype),
              AA([[3], [4]], dtype=dtype)]
    a = C.input_variable((2, 1))
    r_data = np.arange(12).reshape(6, 2).astype('f')
    # Fixed: the shape argument used to receive ``r_data.data`` (a memoryview).
    r = C.parameter(shape=r_data.shape, init=r_data)
    res = C.gather(r, a).eval({a: a_data})
    expected = np.asarray([[[[0., 1.]], [[2., 3.]]], [[[6., 7.]], [[8., 9.]]]])
    assert np.array_equal(res, expected)

    grads = C.gather(r, a).grad({a: a_data}, [r])
    expected_grad = np.asarray([[1, 1], [1, 1], [0, 0], [1, 1], [1, 1], [0, 0]],
                               dtype=np.float32)
    assert np.array_equal(grads, expected_grad)

    # Gather with indices derived from a learnable parameter: no gradient
    # should pass through the indices (zeros are expected).
    indices_params = C.parameter(shape=(1,), init=1.0)
    grads = C.gather(r, (indices_params * a)).grad({a: a_data}, [r, indices_params])
    assert np.array_equal(grads[r], expected_grad)
    assert np.array_equal(grads[indices_params], np.asarray([0.0], dtype=np.float32))

    b_data = [AA([[0, 2], [1, 3]], dtype=dtype),
              AA([[2, 4], [3, 5]], dtype=dtype)]
    b = C.input_variable((2, 2))
    res2 = C.gather(r, b).eval({b: b_data})
    expected2 = np.asarray([[[[0., 1.], [4., 5.]], [[2., 3.], [6., 7.]]],
                            [[[4., 5.], [8., 9.]], [[6., 7.], [10., 11.]]]])
    assert np.array_equal(res2, expected2)

    # The following small model tests the memory reuse issue of the gather node.
    # ``C.input`` is a deprecated alias; ``C.input_variable`` is used instead.
    x = C.input_variable((3, 4))
    x1 = C.to_sequence(x)
    w = C.parameter((5, 6), init=1)
    z = C.gather(w, x1)
    assert z.shape == (4, 6)
    # The unpack node is needed to trigger memory reuse.
    f = C.sequence.unpack(z, 0, no_mask_output=True)
    y = C.input_variable((3, 4, 6))
    loss = C.reduce_mean(C.square(f - y), axis=-1)
    loss = C.reduce_mean(loss, axis=C.Axis.all_axes())

    g = C.constant(0, shape=w.shape)
    u = C.assign(w, g + 1)
    learner = C.cntk_py.universal_learner([w], [g], u)
    trainer = C.trainer.Trainer(loss, [loss], [learner])
    indices = np.asarray([[[1, 2, 1, 2]]])
    index_batch = np.repeat(np.repeat(indices, 3, axis=1), 10, axis=0)
    labels = np.full((10, 3, 4, 6), 2)
    trainer.train_minibatch({x: index_batch, y: labels})
    # The 2nd and 3rd rows should be updated by gradients.
    assert np.mean(w.value[1, :]) < 1
    assert np.mean(w.value[2, :]) < 1
    # The other three rows should stay at 1.
    assert np.isclose(np.mean(w.value[0, :]), 1)
    assert np.isclose(np.mean(w.value[3, :]), 1)
    assert np.isclose(np.mean(w.value[4, :]), 1)
def squash(input):
    """Rescale ``input`` by ``|s|^2 / (1 + |s|^2) / sqrt(|s|^2 + epsilon)``.

    ``|s|^2`` is the sum of squares of ``input`` over ``axis``.  Both ``axis``
    and ``epsilon`` come from the enclosing scope.
    """
    # ||Sj||^2
    squared_norm = ct.reduce_sum(ct.square(input), axis=axis)
    # ||Sj||^2 / (1 + ||Sj||^2)
    shrink = ct.element_divide(squared_norm, ct.plus(1, squared_norm))
    # ||Sj||, with epsilon added under the root
    norm = ct.sqrt(ct.plus(squared_norm, epsilon))
    scale = ct.element_divide(shrink, norm)
    return scale * input
def __local_response_normalization(self, k, n, alpha, beta, name=''):
    """Build a local-response-normalisation function over a placeholder.

    The squared input is convolved with a constant window of ``2 * n + 1``
    taps, each equal to ``alpha / (2 * n + 1)``.  The placeholder is then
    divided by ``(k + window_sum) ** beta``.  ``name`` is accepted but not used.

    Returns:
        The CNTK function, with the placeholder ``'lrn_arg'`` left unbound.
    """
    taps = 2 * n + 1
    arg = cntk.placeholder(name='lrn_arg')

    # Add a singleton axis so the convolution has a (fake) reduction dimension.
    squared = cntk.reshape(cntk.square(arg), (1, cntk.InferredDimension), 0, 1)
    window = cntk.constant(alpha / taps, (1, taps, 1, 1), name='W')
    summed = cntk.convolution(window, squared)
    # Drop the singleton axis again.
    summed = cntk.reshape(summed, cntk.InferredDimension, 0, 2)

    # (k + summed) ** beta, written with exp/log
    denominator = cntk.exp(beta * cntk.log(k + summed))
    return cntk.element_divide(arg, denominator)
def lrn(x, depth_radius, bias, alpha, beta, name=''):
    """Apply local response normalisation to ``x``.

    Computes ``x / (bias + window_sum) ** beta``, where ``window_sum`` is the
    squared input convolved with a constant window of ``2 * depth_radius + 1``
    taps of value ``alpha / (2 * depth_radius + 1)``.  ``dtype`` comes from the
    enclosing scope; ``name`` is accepted but not used.
    """
    taps = 2 * depth_radius + 1

    # Reshape to insert a fake singleton reduction dimension after the 3rd axis
    # (channel axis).  Note Python axis order and BrainScript are reversed.
    squared = C.reshape(C.square(x), (1, C.InferredDimension), 0, 1)
    window = C.constant(alpha / taps, shape=(1, taps, 1, 1), dtype=dtype, name='W')
    # 3D convolution whose filter is non-unit only in the 3rd axis; nothing is
    # reduced because the reduction dimension is the fake singleton.
    summed = C.convolution(window, squared)
    # Reshape back to remove the fake singleton reduction dimension.
    summed = C.reshape(summed, C.InferredDimension, 0, 2)

    denominator = C.exp(beta * C.log(bias + summed))
    return C.element_divide(x, denominator)
def LocalResponseNormalization(k, n, alpha, beta, name=''):
    """Build a local-response-normalisation function over a placeholder.

    The result is ``x / (k + window_sum) ** beta``, where ``window_sum`` is
    the squared input convolved with ``2 * n + 1`` constant taps of value
    ``alpha / (2 * n + 1)``.  ``name`` is accepted but not used.

    Returns:
        The CNTK function, with the placeholder ``'lrn_arg'`` left unbound.
    """
    width = 2 * n + 1
    placeholder = C.placeholder(name='lrn_arg')

    # Insert a singleton axis, convolve across the window, then remove it.
    squared = C.square(placeholder)
    expanded = C.reshape(squared, (1, C.InferredDimension), 0, 1)
    kernel = C.constant(alpha / width, (1, width, 1, 1), name='W')
    convolved = C.convolution(kernel, expanded)
    window_sum = C.reshape(convolved, C.InferredDimension, 0, 2)

    # (k + window_sum) ** beta, written with exp/log
    scale = C.exp(beta * C.log(k + window_sum))
    return C.element_divide(placeholder, scale)
def true_density(z):
    """Return ``exp(-u(z))`` for a two-component input ``z``.

    Where ``z[0] <= 4`` the energy is
    ``0.5 * ((z[1] - sin(2 * pi * z[0] / 4)) / 0.4)^2``; elsewhere it is
    replaced by ``1e7``.
    """
    first, second = z[0], z[1]
    ridge = C.sin(2 * np.pi * first / 4)
    energy = 0.5 * C.square((second - ridge) / 0.4)
    # Very large energy used outside the allowed region.
    penalty = C.ones_like(energy) * 1e7
    inside = C.less_equal(first, 4)
    return C.exp(-C.element_select(inside, energy, penalty))
def LocalResponseNormalization(k, n, alpha, beta, name=''):
    """Build a local-response-normalisation function over a placeholder.

    The result is ``x / (k + window_sum) ** beta``, where ``window_sum`` is
    the squared input convolved with ``2 * n + 1`` constant taps of value
    ``alpha / (2 * n + 1)``.  ``name`` is accepted but not used.

    Returns:
        The CNTK function, with the placeholder ``'lrn_arg'`` left unbound.
    """
    span = 2 * n + 1
    arg = C.placeholder(name='lrn_arg')

    # Reshape to insert a fake singleton reduction dimension after the 3rd axis
    # (channel axis).  Note Python axis order and BrainScript are reversed.
    energy = C.reshape(C.square(arg), (1, C.InferredDimension), 0, 1)
    weights = C.constant(alpha / span, (1, span, 1, 1), name='W')
    # 3D convolution whose filter is non-unit only in the 3rd axis; nothing is
    # reduced because the reduction dimension is the fake singleton.
    smoothed = C.convolution(weights, energy)
    # Reshape back to remove the fake singleton reduction dimension.
    smoothed = C.reshape(smoothed, C.InferredDimension, 0, 2)

    divisor = C.exp(beta * C.log(k + smoothed))
    return C.element_divide(arg, divisor)
def LocalResponseNormalization(k, n, alpha, beta, name=''):
    """Build a local-response-normalisation function over a placeholder.

    The result is ``x / (k + window_sum) ** beta``, where ``window_sum`` is
    the squared input convolved with ``2 * n + 1`` constant taps of value
    ``alpha / (2 * n + 1)``.  ``name`` is accepted but not used.

    Returns:
        The CNTK function, with the placeholder ``'lrn_arg'`` left unbound.
    """
    num_taps = 2 * n + 1
    source = C.placeholder(name='lrn_arg')
    source_squared = C.square(source)

    # Reshape to insert a fake singleton reduction dimension after the 3rd axis
    # (channel axis).  Note Python axis order and BrainScript are reversed.
    with_fake_axis = C.reshape(source_squared, (1, C.InferredDimension), 0, 1)
    box_filter = C.constant(alpha / num_taps, (1, num_taps, 1, 1), name='W')

    # 3D convolution whose filter is non-unit only in the 3rd axis; nothing is
    # reduced because the reduction dimension is the fake singleton.
    neighbourhood = C.convolution(box_filter, with_fake_axis)

    # Reshape back to remove the fake singleton reduction dimension.
    neighbourhood = C.reshape(neighbourhood, C.InferredDimension, 0, 2)

    normaliser = C.exp(beta * C.log(k + neighbourhood))
    return C.element_divide(source, normaliser)
def var(array,W=_W,B=None,square=0,sqrt=0,V=False,sizz=0):
    """Reduce the deviation of ``array * W`` from its windowed mean.

    Args:
        array: tensor multiplied element-wise with ``W``.
            NOTE(review): the final assert reads ``array.shape[0:3]``; the
            full expected layout is not visible here -- confirm with callers.
        W: weight tensor; defaults to the module-level ``_W``.
        B: optional bias; a zero constant of shape ``W.shape[0:2]`` is used
            when omitted.
        square: when truthy, the deviation is squared before reduction.
        sqrt: when truthy, the square root of the reduced value is returned.
        V: verbosity; any truthy value prints intermediate values, ``2`` also
            prints ``i`` and ``di``.
        sizz: when ``1`` the mean divides by the sum of ``W`` over its last
            two axes, otherwise by the window area ``W.shape[-2] * W.shape[-1]``.

    Returns:
        The reduced tensor after swapping axes 3 and 1.
    """
    # Legacy note kept from the original:
    #W=tf.transpose(W, [0,2,3,1])
    arrs=array.shape
    ashp=W.shape
    sb=(W.shape[1],1,1)
    WV=W.shape[-2:]  # the last two dimensions of W
    xi=(-2,-1)  # axes reduced for the mean and for `size`
    x2=(-2,-1,-3)  # axes reduced for the final output
    if V:
        print(W.eval())
        print(arrs,ashp)
    mul=(array*W)
    if V:
        print('Wsamp',W[-1,-1].eval())
        print('array*w',(mul.eval())[0,-1])
    size=C.reduce_sum(W,axis=xi)#shape=(outputs, channel)
    if V:
        print("sizesamp",size.shape,size.eval())
    if B is None:
        B=C.constant(0,shape=W.shape[0:2],dtype=np.float32)#channel
    # Pad B's shape with trailing 1s up to the rank of W.
    B=C.reshape(B,(*B.shape,*[1 for _ in range(len(ashp)-len(B.shape))]))
    if sizz==1:
        mean=C.reduce_sum(mul,axis=xi)/size
    else:
        mean=C.reduce_sum(mul,axis=xi)/C.constant(value=WV[0]*WV[1],shape=sb,dtype=np.float32)
    if V:
        print("meansamp",mean.eval()[0,-1])
    if square:
        i=(C.square(mul-mean)+B)
    else:
        i=(((mul)-mean)+B)
    # `di` is only read by the V==2 debug print below.
    di=i/size
    if V==2:
        print("i",i.eval(),"i")
        print("di",di.eval(),"di")
    if V:
        print('isamp',i.shape,i.eval()[-1,-1,])
    # NOTE(review): B was already added into `i` above, so it is added a second
    # time here -- confirm whether that is intended.
    out=C.reduce_sum(i+B,axis=x2)
    # NumPy equivalent noted in the original:
    #out=np.rollaxis(np.sum(i+B,axis=x2),-1,1)
    # NOTE(review): this print and the one below run regardless of V.
    print(out.shape)
    if sqrt:
        out=C.sqrt(out)
    out=C.swapaxes(C.reshape(out,out.shape[:4]), 3, 1)
    print(out.shape)
    assert out.shape==(arrs[0],ashp[0],arrs[1],arrs[2])
    return(out)
def gaussian_mdn_phi(target, mu, sigma, ndim: int):
    """Evaluate the Gaussian kernel phi between a target and mixture means.

    Computes
    ``exp(-|target - mu|^2 / (2 * sigma^2)) / ((2 * pi)^(ndim / 2) * sigma^ndim)``,
    with the norm taken over the last axis.  ``pi`` comes from the enclosing
    module.

    Arguments:
        target: target tensor with shape (ndim, )
        mu: means of the gaussian mdn with shape (nmix, ndim)
        sigma: sigma of the gaussian mdn
        ndim (int): number of dimensions in the gaussian

    Returns:
        :class:`~cntk.ops.functions.Function`

    Raises:
        ValueError: if ``mu`` is not rank 2.
    """
    if len(mu.shape) != 2:
        raise ValueError("mu {0} must have shape (nmix, ndim)".format(mu.shape))

    # Add a leading axis to the target so it lines up with the mixture axis.
    offset = C.expand_dims(target, axis=0) - mu
    squared_distance = C.square(C.reduce_l2(offset, axis=-1))
    exponent = C.negate(squared_distance / (2 * C.square(sigma)))
    kernel = C.exp(exponent)

    normaliser = C.reciprocal((2 * pi) ** (ndim / 2) * C.pow(sigma, ndim))
    return normaliser * kernel
def _create(self, hidden):
    """Build a one-hidden-layer Q-network with its SGD trainer.

    Args:
        hidden: number of units in the ReLU hidden layer.

    Returns:
        tuple: ``(model, trainer, loss)``; the loss is the mean squared
        difference between the network output and the input named ``"q"``.
    """
    state_input = C.input_variable(STATE_COUNT, name="s")
    target_input = C.input_variable(ACTION_COUNT, name="q")

    hidden_out = C.layers.Dense(hidden, activation=C.relu)(state_input)
    network = C.layers.Dense(ACTION_COUNT)(hidden_out)

    # Mean squared error reduced over all axes.
    mse = C.reduce_mean(C.square(network - target_input))

    # Plain SGD with gradient clipping.
    step_size = 0.00025
    sgd = C.sgd(network.parameters,
                C.learning_parameter_schedule(step_size),
                gradient_clipping_threshold_per_sample=10)
    return network, C.Trainer(network, (mse, None), sgd), mse
def square(x, name=''):
    '''
    Element-wise square of `x`.

    Example:
        >>> C.eval(C.square([1., 10.]))
        [array([[ 1. , 100.]])]

    Args:
        x: numpy array or any :class:`cntk.Function` that outputs a tensor
        name (str): the name of the node in the network
    Returns:
        :class:`cntk.Function`
    '''
    # NOTE(review): this imports a `square` from `cntk` under a local alias;
    # confirm it resolves to the low-level op and not to this wrapper.
    from cntk import square as square_op
    operand = sanitize_input(x)
    node = square_op(operand, name)
    return node.output()
def create_binary_convolution_model():
    """Build the binary-convolution classifier with its loss and error.

    A full-precision 64-filter convolution is followed by two 128-filter
    binary convolutions, each stage ending in max pooling and batch
    normalisation.  A 1x1 binary convolution then maps to ``num_classes``
    and is average-pooled.  A regulariser on the parameters named
    ``"filter"`` is added to the cross-entropy loss.

    Returns:
        ``C.combine([z, ce, pe])``: the scaled outputs, the loss and the
        classification error.
    """
    def pool_and_normalize(tensor):
        pooled = C.layers.MaxPooling((3, 3), strides=(2, 2))(tensor)
        return C.layers.BatchNormalization(map_rank=1)(pooled)

    # Input variables denoting the features and label data
    feature_var = C.input((num_channels, image_height, image_width))
    label_var = C.input((num_classes))

    # Scale the input by 1/256.
    net = C.element_times(C.constant(0.00390625), feature_var)

    # The first layer is allowed to be full precision.
    net = C.layers.Convolution((3, 3), 64, pad=True, activation=C.relu)(net)
    net = pool_and_normalize(net)

    for in_channels in (64, 128):
        net = BinaryConvolution(net, (3, 3), 128, channels=in_channels, pad=True)
        net = pool_and_normalize(net)

    net = BinaryConvolution(net, (1, 1), num_classes, channels=128, pad=True)
    net = C.layers.AveragePooling((net.shape[1], net.shape[2]))(net)
    net = C.reshape(net, (num_classes, ))

    # Binary regularization (ala Gang Hua): sum of (1 - w^2) over all filters.
    penalty = C.constant(0)
    for param in net.parameters:
        if param.name != "filter":
            continue
        penalty = C.plus(penalty, C.reduce_sum(C.minus(1, C.square(param))))
    bin_reg = C.element_times(.000005, penalty)

    # After the last layer, apply a learnable scale.
    scale = C.parameter(shape=net.shape, init=0.001)
    net = C.element_times(net, scale)

    # Loss and metric
    ce = C.plus(C.cross_entropy_with_softmax(net, label_var), bin_reg)
    pe = C.classification_error(net, label_var)
    return C.combine([net, ce, pe])
def std_normalized_l2_loss(output, target):
    """Mean squared error with each component scaled by a fixed weight.

    The difference ``output - target`` is multiplied element-wise by 46
    hard-coded weights (named ``std_inv`` in the original, presumably inverse
    standard deviations), then squared and averaged over all axes.
    """
    inverse_std = np.array([
        6.6864805402, 5.2904440280, 3.7165409939, 4.1421640454,
        8.1537399389, 7.0312877415, 2.6712380967, 2.6372177876,
        8.4253649884, 6.7482162880, 9.0849960354, 10.2624412692,
        3.1325531319, 3.1091179819, 2.7337937590, 2.7336441031,
        4.3542467871, 5.4896293687, 6.2003761588, 3.1290341469,
        5.7677042738, 11.5460919611, 9.9926451700, 5.4259818848,
        20.5060642486, 4.7692101480, 3.1681517575, 3.8582905289,
        3.4222250436, 4.6828286809, 3.0070785113, 2.8936539301,
        4.0649030157, 25.3068458731, 6.0030623160, 3.1151977458,
        7.7773542649, 6.2057372469, 9.9494258692, 4.6865422850,
        5.3300697628, 2.7722027974, 4.0658663003, 18.1101618617,
        3.5390113731, 2.7794520068,
    ], dtype=np.float32)
    scale = C.constant(value=inverse_std)
    residual = output - target
    scaled_residual = C.element_times(residual, scale)
    return C.reduce_mean(C.square(scaled_residual))
def create_binary_convolution_model():
    """Build the binary-convolution classifier with its loss and error.

    A full-precision 32-filter convolution is followed by two 128-filter
    binary convolutions, each stage ending in max pooling and batch
    normalisation.  A 1x1 binary convolution then maps to ``num_classes``
    and is average-pooled.  A regulariser on the parameters named
    ``"filter"`` is added to the cross-entropy loss.

    Returns:
        ``C.combine([z, ce, pe])``: the scaled outputs, the loss and the
        classification error.
    """
    def downsample(tensor):
        # 3x3 max pooling with stride 2, then batch normalisation
        pooled = C.layers.MaxPooling((3, 3), strides=(2, 2))(tensor)
        return C.layers.BatchNormalization(map_rank=1)(pooled)

    # Input variables denoting the features and label data
    feature_var = C.input((num_channels, image_height, image_width))
    label_var = C.input((num_classes))

    # Scale the input by 1/256.
    scaled_input = C.element_times(C.constant(0.00390625), feature_var)

    # The first layer is allowed to be full precision.
    stem = C.layers.Convolution((3, 3), 32, pad=True, activation=C.relu)
    z = downsample(stem(scaled_input))

    z = downsample(BinaryConvolution(z, (3, 3), 128, channels=32, pad=True))
    z = downsample(BinaryConvolution(z, (3, 3), 128, channels=128, pad=True))

    z = BinaryConvolution(z, (1, 1), num_classes, channels=128, pad=True)
    global_pool = C.layers.AveragePooling((z.shape[1], z.shape[2]))
    z = C.reshape(global_pool(z), (num_classes,))

    # Binary regularization (ala Gang Hua): sum of (1 - w^2) over all filters.
    weight_sum = C.constant(0)
    for kernel in (p for p in z.parameters if p.name == "filter"):
        deviation = C.reduce_sum(C.minus(1, C.square(kernel)))
        weight_sum = C.plus(weight_sum, deviation)
    bin_reg = C.element_times(.000005, weight_sum)

    # After the last layer, apply a learnable scale.
    z = C.element_times(z, C.parameter(shape=z.shape, init=0.001))

    # Loss and metric
    ce = C.cross_entropy_with_softmax(z, label_var)
    ce = C.plus(ce, bin_reg)
    pe = C.classification_error(z, label_var)
    return C.combine([z, ce, pe])
def layer_normalization(inputs: C.Function, name='layer_normalization') -> C.Function:
    """Wrap a learnable layer normalisation of ``inputs`` in a CNTK block.

    The input is centred by its mean and divided by the root of its mean
    squared deviation (both reduced over all static axes), then multiplied
    by a ``scale`` parameter (initialised to 1) and shifted by a ``bias``
    parameter (initialised to 0), each shaped like ``inputs``.

    NOTE(review): no epsilon is added to the denominator, so a constant
    input divides by zero.
    """
    axes = (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis())
    holder = C.placeholder(inputs.shape, axes, name=name + '_ph')

    mean = C.reduce_mean(holder, name='mu')
    std = C.sqrt(C.reduce_mean(C.square(holder - mean)), name='sigma')
    normalised = (holder - mean) / std

    # Learnable scale and bias
    gain = C.parameter(inputs.shape, init=1, name='scale')
    shift = C.parameter(inputs.shape, init=0, name='bias')
    normalised = normalised * gain + shift

    return C.as_block(normalised, [(holder, holder)], name)(inputs)
def create_binary_convolution_model():
    """Build the binary-convolution classifier with its loss and error.

    ``BinaryConvolution`` is used here as a layer factory: it is constructed
    first and then applied to the running tensor.  A full-precision 32-filter
    convolution is followed by two 128-filter binary convolutions, each stage
    ending in max pooling and batch normalisation.  A 1x1 binary convolution
    then maps to ``num_classes`` and is average-pooled.

    Returns:
        ``C.combine([z, ce, pe])``: the scaled outputs, the loss and the
        classification error.
    """
    def shrink(tensor):
        # 3x3 max pooling with stride 2, then batch normalisation
        pooled = C.layers.MaxPooling((3, 3), strides=(2, 2))(tensor)
        return C.layers.BatchNormalization(map_rank=1)(pooled)

    feature_var = C.input((num_channels, image_height, image_width))
    label_var = C.input((num_classes))

    # Scale the input by 1/256.
    out = C.element_times(C.constant(0.00390625), feature_var)

    # Full-precision first layer
    out = C.layers.Convolution((3, 3), 32, pad=True, activation=C.relu)(out)
    out = shrink(out)

    for in_channels in (32, 128):
        binary_layer = BinaryConvolution((3, 3), 128, channels=in_channels, pad=True)
        out = shrink(binary_layer(out))

    out = BinaryConvolution((1, 1), num_classes, channels=128, pad=True)(out)
    out = C.layers.AveragePooling((out.shape[1], out.shape[2]))(out)
    out = C.reshape(out, (num_classes, ))

    # Regulariser: sum of (1 - w^2) over every parameter named "filter".
    total = C.constant(0)
    for param in out.parameters:
        if param.name == "filter":
            total = C.plus(total, C.reduce_sum(C.minus(1, C.square(param))))
    bin_reg = C.element_times(.000005, total)

    # Learnable output scale
    out = C.element_times(out, C.parameter(shape=out.shape, init=0.001))

    ce = C.plus(C.cross_entropy_with_softmax(out, label_var), bin_reg)
    pe = C.classification_error(out, label_var)
    return C.combine([out, ce, pe])
def build_SRResNet_graph(lr_image_shape, hr_image_shape, net):
    """Wire up inputs, generator, MSE loss, Adam learner and trainer.

    Args:
        lr_image_shape: shape of the input named ``"real_X"``.
        hr_image_shape: shape of the input named ``"real_Y"``.
        net: callable applied to the scaled ``real_X`` to build the generator.

    Returns:
        tuple: ``(real_X, real_Y, genG, real_X_scaled, real_Y_scaled,
        G_optim, G_G_trainer)``.
    """
    batch_only = [C.Axis.default_batch_axis()]
    low_res = C.input(lr_image_shape, dynamic_axes=batch_only, name="real_X")
    high_res = C.input(hr_image_shape, dynamic_axes=batch_only, name="real_Y")

    # Divide both inputs by 255.
    low_res_scaled = low_res / 255
    high_res_scaled = high_res / 255

    generated = net(low_res_scaled)
    mse = C.reduce_mean(C.square(high_res_scaled - generated))

    # Piecewise learning rate; each entry is (number of epochs, rate) and the
    # epoch size is 10000.
    rate_schedule = C.learning_rate_schedule(
        [(1, 0.01), (1, 0.001), (98, 0.0001)], C.UnitType.minibatch, 10000)
    optimizer = C.adam(mse.parameters,
                       lr=rate_schedule,
                       momentum=C.momentum_schedule(0.9),
                       gradient_clipping_threshold_per_sample=1.0)
    trainer = C.Trainer(generated, (mse, None), optimizer)

    return (low_res, high_res, generated, low_res_scaled, high_res_scaled,
            optimizer, trainer)
def Loss(self):
    """Build the clipped-surrogate actor-critic loss and its Adam trainer.

    The actor term is the mean of ``-min(ratio * A, clip(ratio) * A)`` plus
    ``-0.01 * entropy``; the critic term is half the mean squared difference
    between the state value and the ``rewards`` input.

    Returns:
        tuple: ``(loss, chunk, trainer)``, where ``chunk`` maps names to the
        individual loss components.
    """
    # Evaluate the current policy.
    log_probs, value, entropy = self.policy.evaluate()

    # Importance-sampling ratio pi_theta / pi_theta_old
    old_log_probs = C.input_variable(log_probs.shape, name='old_log_probs')
    ratio = C.exp(log_probs - C.stop_gradient(old_log_probs))

    rewards = C.input_variable(1, name='rewards')
    advantage = rewards - C.stop_gradient(value)

    # Surrogate loss: the smaller of the unclipped and clipped objectives.
    unclipped = ratio * advantage
    clipped = C.clip(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantage
    neglog_loss = -C.element_min(unclipped, clipped)
    entropy_loss = -0.01 * entropy

    actor_loss = C.reduce_mean(neglog_loss + entropy_loss)
    critic_loss = 0.5 * C.reduce_mean(C.square(value - rewards))
    loss = actor_loss + critic_loss

    chunk = {
        'neglog_loss': neglog_loss,
        'entropy_loss': entropy_loss,
        'actor_loss': actor_loss,
        'critic_loss': critic_loss
    }

    optimizer = C.adam(
        loss.parameters,
        C.learning_parameter_schedule_per_sample(self.lr),
        C.momentum_schedule_per_sample(self.betas[0]),
        variance_momentum=C.momentum_schedule_per_sample(self.betas[1]))
    trainer = C.Trainer(loss, (loss, None), optimizer)

    return loss, chunk, trainer
def flow_forward(input_dim: int, act_func_pair: tuple = (None, None), batch_norm: bool = False):
    """Build one forward flow step over a placeholder.

    The step multiplies the input by a square parameter matrix and then
    applies an affine coupling: the first half is scaled and shifted by
    functions of the second half.

    Args:
        input_dim: size of the placeholder and of the square weight matrix.
        act_func_pair: ``(log_s_func, t_func)``; an entry left as ``None`` is
            replaced by a three-layer dense network built here.
        batch_norm: when true, an extra scale/shift stage is applied first.

    Returns:
        tuple: ``(_Y, log_det_J, chunk)`` -- the output, the accumulated
        log-determinant term, and a dict of the pieces created along the way.
    """
    chunk = {}
    log_det_J = 0

    chunk['input_dim'] = input_dim
    _ph = C.placeholder(input_dim, name='place_holder')
    _out = _ph

    if batch_norm:
        # Earlier variant using the built-in layer:
        # _bn = C.layers.BatchNormalization(name='batch_norm')(_ph)
        # chunk['scale'] = _bn.parameters[0]
        # chunk['bias'] = _bn.parameters[1]
        chunk['mu'] = C.Constant(np.zeros(shape=input_dim))
        chunk['var'] = C.Constant(np.ones(shape=input_dim))

        _eps = C.Constant(1e-7)
        # Batch statistics over the default batch axis.
        _mu = C.reduce_mean(_ph, axis=C.Axis.default_batch_axis())
        _var = C.reduce_mean(C.square(_ph-_mu), axis=C.Axis.default_batch_axis())

        chunk['muB'] = _mu
        chunk['varB'] = _var

        # NOTE(review): the transform uses the constants chunk['var'] and
        # chunk['mu'], while the log-det term below uses the batch variance
        # _var -- confirm this asymmetry is intended.
        # _bn = (_ph-chunk['mu'])/C.sqrt(chunk['var']+_eps)
        _bn = C.sqrt(chunk['var']+_eps)*_ph + chunk['mu']
        _ph = _bn

        log_det_J += -0.5*C.reduce_sum(C.log((_var+_eps)))
        # log_det_J += C.reduce_sum(C.log())

    # Square weight matrix, overwritten with a sample from special_ortho_group.
    chunk['W_rot_mat'] = _W = C.parameter((input_dim, input_dim))
    _W.value = random_rotation_matrix = special_ortho_group.rvs(input_dim)
    # _W.value = np.roll(np.eye(input_dim),input_dim//2,axis=0)
    _out = _ph@_W
    # NOTE(review): C.det is assumed to be provided elsewhere -- verify.
    log_det_J += C.log(C.abs(C.det(_W))) # or
    # log_det_J += C.slogdet(_W)[1]

    # Split the output into two parts for the coupling step.
    _half_dim = input_dim//2
    _x1 = _out[:_half_dim]
    _x2 = _out[_half_dim:]

    _log_s_func, _t_func = act_func_pair
    if _log_s_func is None: # basic network
        _log_s_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim, C.tanh),
        ])#(C.placeholder(input_dim, name='place_holder'))
    if _t_func is None: # basic network
        _t_func = C.layers.Sequential([
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(256, C.leaky_relu),
            C.layers.Dense(_half_dim),
        ])#(C.placeholder(input_dim, name='place_holder'))

    chunk['log_s_func'] = _log_s_func
    chunk['t_func'] = _t_func

    # Both the scale and the shift are computed from the second part only.
    _log_s, _t = _log_s_func(_x2), _t_func(_x2)

    _s = C.exp(_log_s)

    # Affine transform of the first part; the second part passes through.
    _y1 = _s*_x1 + _t
    _y2 = _x2

    _Y = C.splice(_y1, _y2)
    chunk['output'] = _Y

    # The coupling step contributes the sum of the log-scales.
    log_det_J += C.reduce_sum(_log_s)

    return _Y, log_det_J, chunk
# Prefer the shared examples data directory; fall back to a local data/MNIST.
data_dir = os.path.join("..", "Examples", "Image", "DataSets", "MNIST")
if not os.path.exists(data_dir):
    data_dir = os.path.join("data", "MNIST")

# Write the train and test sets as CNTK text-format files.
# NOTE(review): `savetxt`, `train` and `test` are defined outside this fragment.
print('Writing train text file...')
savetxt(os.path.join(data_dir, "Train-28x28_cntk_text.txt"), train)

print('Writing test text file...')
savetxt(os.path.join(data_dir, "Test-28x28_cntk_text.txt"), test)

print('Done')

# Network inputs.  NOTE(review): `input` shadows the builtin of the same name;
# it is left unchanged because code outside this fragment may reference it.
input = C.input_variable(input_dim)
label = C.input_variable(num_output_classes)

# Three views of the input divided by 255: as is, squared, and square-rooted.
normalize_input = input / 255.0
squared_input = C.square(input / 255.0)
sqrt_input = C.sqrt(input / 255.0)

# The model receives the three views spliced together.
z = create_model(C.splice(normalize_input, squared_input, sqrt_input))

# Cross-entropy loss, with classification error as the metric.
loss = C.cross_entropy_with_softmax(z, label)
label_error = C.classification_error(z, label)

# Plain SGD trainer.
lr_schedule = C.learning_parameter_schedule(learning_rate)
learner = C.sgd(z.parameters, lr_schedule)
trainer = C.Trainer(z, (loss, label_error), [learner])

data_found = False
def loss_fun(output, label):
    """Squared error summed over a sequence, divided by the sequence length.

    The length is obtained by building a value of 1 for every step
    (``reduce_sum(output) * 0 + 1``) and summing it along the sequence axis.
    """
    ones_per_step = C.reduce_sum(output) * 0 + 1
    step_count = C.sequence.reduce_sum(ones_per_step)
    error_per_step = C.reduce_sum(C.square(output - label))
    total_error = C.sequence.reduce_sum(error_per_step)
    return total_error / step_count
def true_density(z):
    """Return ``exp(-u(z))`` for a two-component input ``z``.

    Where ``z[0] <= 4`` the energy is
    ``0.5 * ((z[1] - sin(2 * pi * z[0] / 4)) / 0.4)^2``; elsewhere it is
    replaced by ``1e7``.
    """
    z1, z2 = z[0], z[1]
    w1 = lambda x: C.sin(2 * np.pi * x/4)
    u = 0.5 * C.square((z2 - w1(z1))/0.4)
    # Very large energy used where the condition below is false.
    dummy = C.ones_like(u) * 1e7
    # u = C.element_select(C.less_equal(z1,4), u, dummy)
    cond = C.less_equal(z1,4)
    u = C.element_select(cond, u, dummy)  # u = cond*u + (1-cond)*dummy
    return C.exp(-u)

#%%
# Activation and its derivative: d/dx tanh(x) = 1 - tanh(x)^2.
h = lambda x: C.tanh(x)
h_prime = lambda x: 1 - C.square(C.tanh(x))

# Base distribution with zero location and unit diagonal scale; the flow input
# is sized from it.
base_dist = MultivariateNormalDiag(loc=[0., 0.], scale_diag=[1., 1.])
z_0 = C.input_variable(base_dist.size(), name='sampled')
z_prev = z_0
sum_log_det_jacob = 0.
initializer = C.initializer.uniform(1)
# One set of parameters (u, w, b) per step; K is defined outside this fragment.
# NOTE(review): the loop body may continue beyond the visible source.
for i in range(K):
    u = C.parameter((2), name='u', init=initializer)
    w = C.parameter((2), name='w', init=initializer)
    b = C.parameter((1), name='b', init=initializer)
    psi = h_prime(C.dot(w, z_prev)+b) * w
    det_jacob = C.abs(1 + C.dot(u, psi))