def __init__(self, learning_rate=0.1, epsilon=None, use_locking=False):
    super(RayGrad, self).__init__(learning_rate,
                                  epsilon=epsilon,
                                  use_locking=use_locking)
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.memory_size = S("optimizer.memory_size")
    self.loss_collect_last = S("optimizer.collect_last")
def _RedoRestFilters(graph):
    """Finds Conv2D/MatMul/DepthwiseConv2dNative filters without a succeeding
    batch norm and replaces their weights with sampled hidden variables.

    Replacing only affects the following layers: Conv2D, fully connected,
    depthwise convolution.

    Args:
      graph: Graph to walk and modify.

    Raises:
      ValueError: When rerouting the replaced layer output fails.
    """
    matches = _FindRestFilters(graph)
    print("Replacing", len(matches),
          "Conv|Mul|DepthwiseConv2dNative-Filters (without a succeeding BatchNorm)")
    for match in matches:
        scope, sep, _ = match['layer_op'].name.rpartition('/')
        # Make sure new ops are added to `graph` and put on the same device as
        # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
        # named `scope`. Otherwise, TF creates a unique scope whose name starts
        # with `scope`.
        with graph.as_default(), graph.name_scope(scope + sep):
            with graph.name_scope(scope + sep + '_psb' + sep):
                weight = match['weight_tensor']

                # >>>>> CUSTOM >>>>>>>>>>>>>>
                # use hidden variable instead
                sampled_weight = variableFromSettings([], hiddenVar=weight)[0]
                # <<<<<<<<<<<<<<<<<<<<<<<<<<<

                new_layer_tensor = _CloneWithNewOperands(
                    match['layer_op'], match['input_tensor'], sampled_weight,
                    False)

                if S("util.variable.fixed_point.use"):
                    new_layer_tensor = fixed_point(
                        new_layer_tensor,
                        S("util.variable.fixed_point.bits"),
                        max=S("util.variable.fixed_point.max"),
                        min=S("util.variable.fixed_point.min"))

                nodes_modified_count = common.RerouteTensor(
                    new_layer_tensor, match['output_tensor'])
                if nodes_modified_count == 0:
                    raise ValueError(
                        'Folding batch norms failed, %s had no outputs.' %
                        match['output_tensor'].name)
def preprocess_image(image_buffer, bbox, output_height, output_width,
                     num_channels, is_training=False):
    """Preprocesses the given image.

    Preprocessing includes decoding, cropping, and resizing for both training
    and eval images. Training preprocessing, however, introduces some random
    distortion of the image to improve accuracy.

    Args:
      image_buffer: scalar string Tensor representing the raw JPEG image buffer.
      bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
        where each coordinate is [0, 1) and the coordinates are arranged as
        [ymin, xmin, ymax, xmax].
      output_height: The height of the image after preprocessing.
      output_width: The width of the image after preprocessing.
      num_channels: Integer depth of the image buffer for decoding.
      is_training: `True` if we're preprocessing the image for training and
        `False` otherwise.

    Returns:
      A preprocessed image.
    """
    if is_training:
        # For training, we want to randomize some of the distortions.
        image = _decode_crop_and_flip(image_buffer, bbox, num_channels)
        image = _resize_image(image, output_height, output_width)
    else:
        # For validation, we want to decode, resize, then just crop the middle.
        image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
        image = _aspect_preserving_resize(image, _RESIZE_MIN)
        image = _central_crop(image, output_height, output_width)

    image.set_shape([output_height, output_width, num_channels])

    if S("dataset_mean_image_subtraction") == "pytorch":
        print("pytorch-mode: scale to [-1, 1]")
        image /= 255.0
        image -= 0.5
        image *= 2.0
    elif S("dataset_mean_image_subtraction"):
        print("tensorflow-mode: mean image subtraction")
        image = _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)

    return image
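# Illustrative sketch, not part of the original module: the "pytorch" branch
# above maps pixel values from [0, 255] to [-1, 1] via x/255 - 0.5 (giving
# [-0.5, 0.5]) followed by *2. The helper name below is made up for the demo.
def _demo_pytorch_scaling():
    import numpy as np
    x = np.array([0.0, 127.5, 255.0])
    scaled = (x / 255.0 - 0.5) * 2.0
    assert np.allclose(scaled, [-1.0, 0.0, 1.0])
    return scaled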
def next_base2(x, strict_positive=False, stochastic=False, min=1e-8, binom_n=64):
    with tf.name_scope('next_base2'):
        x_start = x
        if strict_positive:
            sign = 1
        else:
            sign = tf.sign(x)

        if stochastic:
            # x_next_base2 = tf.floor(tf.log(tf.abs(x+eps))/tf.log(2.0))
            x_next_base2 = tf.floor(
                tf.log(tf.maximum(tf.abs(x), min)) / tf.log(2.0))
            x_perc_missing = tf.abs(x) / 2**x_next_base2 - 1
            # w_add = where_binarize[0,1]->{0,1}(x+eps)
            print("next_base2: stochastic-mode '" + str(stochastic) + "'")
            if stochastic == "binomial" or stochastic == "binom":
                memory_size = binom_n
                w_add = sample_binomial(x_perc_missing, memory_size,
                                        S('binom.log_eps')) / memory_size
                tf.summary.histogram("w_add", w_add)
            else:
                w_add = tf.where(
                    tf.random.uniform(x.get_shape().as_list()) <= x_perc_missing,
                    tf.ones_like(x), tf.zeros_like(x))
            x_next_base2 += w_add
        else:
            x_next_base2 = tf.ceil(
                tf.log(tf.maximum(tf.abs(x), min)) / tf.log(2.0))

        return pass_gradient(x_start,
                             lambda x: sign * 2**x_next_base2,
                             name='next_base2')
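# Illustrative sketch, not part of the original module: in the deterministic
# case `next_base2` rounds each value up to the next power of two in magnitude,
# keeps its sign, and (via `pass_gradient`) backpropagates as if it were the
# identity. The helper name below is made up for the demo.
def _demo_next_base2():
    w = tf.constant([0.3, -0.7, 1.5])
    w_p2 = next_base2(w)  # forward value: approx [0.5, -1.0, 2.0]
    # straight-through estimator: d(sum(w_p2))/dw evaluates to all ones
    grad = tf.gradients(tf.reduce_sum(w_p2), w)[0]
    return w_p2, grad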
def network_inner(data, labels_one_hot, mode):
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    id = lambda net, name=None: net
    GLOBAL["weight_counter"] = 0
    print("is_training:" + str(is_training))
    batch_normalization = lambda net, name=None: tfl.batch_normalization(
        net, name=name, reuse=is_training, training=is_training)
    numclasses = GLOBAL["dataset"].num_classes()

    data = to_picture_shape(data)
    net = data
    use_bias = False

    # stack convs
    for i, channels in enumerate(S("model.resnet.conv_blocks")):
        with tf.variable_scope("conv" + str(i)):
            net = tfl.conv2d(net, 64, 3, strides=2, padding="SAME",
                             use_bias=use_bias)
            net = batch_normalization(net)
            net = activation(net)

    # end
    net = tf.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
    tf.summary.histogram("pre_fc", net)
    if S("model.resnet.last_layer_real"):
        net = tfl.dense(net, numclasses)
    else:
        net = tfl.conv2d(net, numclasses, 1, strides=1, padding="SAME",
                         use_bias=use_bias)
    net = tf.reshape(net, [-1, numclasses])

    return net
def network(data, labels_one_hot, mode):
    model_name = S("model.classification_models.model")
    dataset = S("model.classification_models.dataset")

    # keras.backend.set_learning_phase(1 if mode==tf.estimator.ModeKeys.TRAIN else 0)  # 0: Test(default), 1: Train
    keras.backend.set_learning_phase(0)  # 0: Test(default), 1: Train
    classifier, preprocess_input = Classifiers.get(model_name)

    # overwrite preprocess_input for mobilenet (workaround for a bug in keras_applications)
    if "mobilenet" in model_name:
        from keras.applications import imagenet_utils
        preprocess_input = lambda data: imagenet_utils.preprocess_input(
            data, mode='tf')

    # apply model
    data = preprocess_input(data)
    GLOBAL["keras_model_preprocess"] = preprocess_input
    model = classifier((224, 224, 3), input_tensor=data, weights=dataset)
    GLOBAL["keras_model"] = model
    logits = model.output

    # keras-models do not use empty-class
    logits = tf.concat([tf.expand_dims(logits[:, 0] * 0, 1), logits], axis=-1)

    return logits
def get_filenames(self):
    if S("dataset_join_train_val"):
        if self.subset == 'train':
            print([
                os.path.join(self.data_dir, 'train.tfrecords'),
                os.path.join(self.data_dir, 'validation.tfrecords')
            ])
            print("joining training and validation set "
                  "(leaving only the test set for testing)")
            return [
                os.path.join(self.data_dir, 'train.tfrecords'),
                os.path.join(self.data_dir, 'validation.tfrecords')
            ]
    if self.subset in ['train', 'validation', 'eval']:
        return [os.path.join(self.data_dir, self.subset + '.tfrecords')]
    else:
        raise ValueError('Invalid data subset "%s"' % self.subset)
def lossfn(net_out, data, labels_one_hot, mode):
    with tf.name_scope('cross_entropy'):
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels_one_hot,
                                                      logits=net_out)
        tf.summary.scalar("loss", loss)

    with tf.name_scope('regularization'):
        reg_variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        if len(reg_variables) > 0:
            reg = tf.reduce_mean(reg_variables, name="regularization_loss")
            tf.summary.scalar('regularization', reg)
        else:
            reg = 0

    with tf.name_scope('total_loss'):
        if reg != 0:
            total = loss + S("util.variable.regularizer.weight", alt=1.0) * reg
            tf.summary.scalar("total_loss", total)
            return total

    return loss
def variableFromSettings(shape, S=S, hiddenVar=None):
    # local variables for mini-parser
    V = {}

    # initializer
    base_initializer = getattr(tf.initializers, S("initializer"))
    initializer = TransformInitializer(base_initializer,
                                       S("transformation.init", alt=[]),
                                       dtype=getattr(tf, S("dtype")),
                                       seed=S("seed"))

    with tf.name_scope("var"):
        var_name = S("name")

        # define variable
        if hiddenVar is None:
            p = tf.get_variable(name=var_name,
                                shape=shape,
                                initializer=initializer,
                                regularizer=None,
                                trainable=True)
        else:
            p = hiddenVar

        # apply pruning
        if S("pruning.activate", scope=""):
            p = tf.contrib.model_pruning.apply_mask(
                p, scope=tf.contrib.framework.get_name_scope())
        V["p"] = p

        # check for shared tensors
        localvars_used = []
        for T in [
                S("transformation.hidden", alt=[]),
                S("transformation.weight", alt=[]),
                S("transformation.regularizer.weight_transformation", alt=[])
        ]:
            for i, t in enumerate(T):
                localvars = []
                t_orig = t
                if isinstance(t, tuple):
                    if t[0] not in ["w", "p", "x"]:
                        localvars_used.append(t[0])
                    t = t[1]

                def parse_func(t, fn, string_fn):
                    if not isinstance(t, str):
                        return str(t)
                    if t.startswith(fn + "(") and t.endswith(")"):
                        names = t[len(fn) + 1:-1].split(",")
                        t = string_fn(*names)
                    return t

                # parse predefined functions
                t = parse_func(
                    t, "relaxed_binarize_wolog(0±ε)->[0,1]",
                    lambda var: "tf.sigmoid((%s+eps + tf.log(rng) - tf.log(1-rng))/%s)"
                    % (var, "relaxation_temp"))
                t = parse_func(
                    t, "relaxed_binarize_wlog(1±ε)->[0,1]",
                    lambda var:
                    "tf.sigmoid((tf.log(tf.abs(%s+eps)) + tf.log(rng) - tf.log(1-rng))/%s)"
                    % (var, "relaxation_temp"))
                t = parse_func(
                    t, "gumbel_binarize_wolog(1±ε)->[0,1]",
                    lambda var: "tf.abs(%s) + tf.log(rng) - tf.log(1-rng)" % var)
                t = parse_func(
                    t, "where_binarize[0,1]->{0,1}",
                    lambda var: "tf.where(rng <= " + var +
                    ", ones, zeros, name='sampled_filter')")
                t = parse_func(
                    t, "pass_through_binarize[0,1]->{-1,1}",
                    lambda var: "pass_gradient(" + var +
                    ", lambda p, localvars: tf.where(rng <= p, ones, -ones, name='sampled_filter'))"
                )
                t = parse_func(
                    t, "pass_through_binarize[0,1]->{0,1}",
                    lambda var: "pass_gradient(" + var +
                    ", lambda p, localvars: tf.where(rng <= p, ones, zeros, name='sampled_filter'))"
                )
                t = parse_func(
                    t, "softround",
                    lambda var: var + " - tf.sin(2*np.pi*" + var + ")/(2*np.pi)")
                t = parse_func(
                    t, "passed_round",
                    lambda var: "2**pass_gradient(" + var +
                    ", lambda x: x - tf.sin(2*np.pi*x)/(2*np.pi))")
                t = parse_func(
                    t, "lecun_normalize",
                    lambda var: "tf.identity((" + var + "-tf.nn.moments(" + var +
                    ",axes=None)[0])/tf.nn.moments(" + var +
                    ",axes=None)[1]*np.sqrt(1/np.prod(" + var +
                    ".get_shape().as_list()[:-1])),name=\"lecun\")")
                t = parse_func(
                    t, "lecun_normalize_no_mean",
                    lambda var: "tf.identity((" + var + ")/tf.nn.moments(" + var +
                    ",axes=None)[1]*np.sqrt(1/np.prod(" + var +
                    ".get_shape().as_list()[:-1])),name=\"lecun\")")

                # get variables
                V["eps"] = 1e-5
                if "ones" in t and "ones" not in V:
                    localvars.append("ones")
                    V["ones"] = tf.ones(shape)
                if "zeros" in t and "zeros" not in V:
                    localvars.append("zeros")
                    V["zeros"] = tf.zeros(shape)
                if "rng" in t and "rng" not in V:
                    localvars.append("rng")
                    V["rng"] = tf.random_uniform(shape, name="rng")

                # independent
                for var in localvars_used:
                    if var in t and var not in localvars:
                        localvars.append(var)

                # replace localvars
                if "localvars" in t:
                    t = t.replace("localvars",
                                  ",".join([v + "=" + v for v in localvars]))

                # save modified t again
                if isinstance(t_orig, tuple):
                    T[i] = (t_orig[0], t)
                else:
                    T[i] = t

        # hidden variable transformations
        for t in S("transformation.hidden", alt=[]):
            if isinstance(t, tuple):
                name = t[0]
                V[name] = eval(t[1], {**G, **V})
                if name.lower() == "assert":
                    try:
                        assert V[name]
                    except AssertionError:
                        raise AssertionError(t[1])
            else:
                V["p"] = eval(t, {**G, **V})

        # map hidden weight to weight
        V["w"] = p
        for t in S("transformation.weight", alt=[]):
            if isinstance(t, tuple):
                name = t[0]
                V[name] = eval(t[1], {**G, **V})
                if name.lower() == "assert":
                    try:
                        assert V[name]
                    except AssertionError:
                        raise AssertionError(t[1])
            else:
                V["w"] = eval(t, {**G, **V})

        # add regularizer
        if S("regularizer.type") is not None:
            if all(var_name not in s
                   for s in S("regularizer.exclude_names", alt=[])):
                tf.contrib.layers.apply_regularization(
                    getattr(tf.contrib.layers,
                            S("regularizer.type"))(S("regularizer.weight")),
                    [eval(S("regularizer.weight_transformation"), {**G, **V})])
            else:
                print("excluding:", var_name)

        GLOBAL["weight_counter"] += 1

    # return sampled weight / hidden variable - combo
    return V["w"], V["p"]
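# Illustrative sketch, not part of the original module: the mini-parser above
# expands shorthand settings strings into eval'able TF expressions whose free
# names (`p`, `rng`, `ones`, `zeros`, ...) are resolved from the dicts G and V.
# The settings string below is one of the predefined patterns; the helper name
# is made up for the demo.
def _demo_mini_parser():
    def parse_func(t, fn, string_fn):
        if not isinstance(t, str):
            return str(t)
        if t.startswith(fn + "(") and t.endswith(")"):
            names = t[len(fn) + 1:-1].split(",")
            t = string_fn(*names)
        return t

    t = "where_binarize[0,1]->{0,1}(p)"
    t = parse_func(
        t, "where_binarize[0,1]->{0,1}",
        lambda var: "tf.where(rng <= " + var + ", ones, zeros, name='sampled_filter')")
    # t is now "tf.where(rng <= p, ones, zeros, name='sampled_filter')",
    # ready to be eval'd against {**G, **V}
    return t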
from util.helpers import pass_gradient, sample_binomial, next_base2, fixed_point
from util.initializer import TransformInitializer

# global variables for mini-parser
G = {
    "tf": tf,
    "np": np,
    "S": S,
    "pass_gradient": pass_gradient,
    "sample_binomial": sample_binomial,
    "next_base2": next_base2,
    "fixed_point": fixed_point,
    "GLOBAL": GLOBAL,
}
S = S(scope="util.variable")
def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
    """Finds fused batch norm layers and folds them into preceding layers.

    Folding only affects the following layers: Conv2D, fully connected,
    depthwise convolution.

    Args:
      graph: Graph to walk and modify.
      is_training: Bool, true if training.
      freeze_batch_norm_delay: How many steps to wait before freezing moving mean
        and variance and using them for batch normalization.

    Raises:
      ValueError: When batch norm folding fails.
    """
    matches = list(_FindFusedBatchNorms(graph))
    print("Folding", len(matches), "FusedBatchNorms")
    for match in matches:
        scope, sep, _ = match.layer_op.name.rpartition('/')
        # Make sure new ops are added to `graph` and put on the same device as
        # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
        # named `scope`. Otherwise, TF creates a unique scope whose name starts
        # with `scope`.
        with graph.as_default(), graph.name_scope(scope + sep):
            with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
                # new weights = old weights * gamma / sqrt(variance + epsilon)
                # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
                multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
                    match.variance_tensor + match.bn_op.get_attr('epsilon'))
                bias_tensor = math_ops.subtract(match.beta_tensor,
                                                match.mean_tensor * multiplier_tensor,
                                                name='bias')

                correction_scale, correction_recip, correction_offset = None, None, None
                if is_training:
                    correction_scale, correction_recip, correction_offset = (
                        _ComputeBatchNormCorrections(
                            context='',
                            match=match,
                            freeze_batch_norm_delay=freeze_batch_norm_delay))

                # The shape of depthwise weights is different, so we need to reshape the
                # multiplier_tensor to ensure that the scaled_weight_tensor has the
                # expected shape.
                weights = match.weight_tensor

                # remember for the other loops
                matched_layer_set.add(match.layer_op)
                matched_layer_set.add(match.bn_op)

                if match.layer_op.type == 'DepthwiseConv2dNative':
                    new_shape = [
                        match.weight_tensor.get_shape().as_list()[2],
                        match.weight_tensor.get_shape().as_list()[3]
                    ]
                    multiplier_tensor = array_ops.reshape(multiplier_tensor,
                                                          new_shape,
                                                          name='scale_reshape')
                    if correction_scale is not None:
                        correction_scale = array_ops.reshape(
                            correction_scale, new_shape, name='correction_reshape')

                if correction_scale is not None:
                    weights = math_ops.multiply(correction_scale,
                                                weights,
                                                name='correction_mult')

                scaled_weight_tensor = math_ops.multiply(weights,
                                                         multiplier_tensor,
                                                         name='mul_fold')

                # >>>>> CUSTOM >>>>>>>>>>>>>>
                # use hidden variable instead
                scaled_weight_tensor = variableFromSettings(
                    [], hiddenVar=scaled_weight_tensor)[0]
                # bias_tensor = variableFromSettings([],hiddenVar=bias_tensor)[0]
                # bias_tensor = next_base2(bias_tensor, strict_positive=False, min=1e-8)
                if S("util.variable.fixed_point.use"):
                    bias_tensor = fixed_point(
                        bias_tensor,
                        S("util.variable.fixed_point.bits"),
                        max=S("util.variable.fixed_point.max"),
                        min=S("util.variable.fixed_point.min"))
                # <<<<<<<<<<<<<<<<<<<<<<<<<<<

                new_layer_tensor = _CloneWithNewOperands(
                    match.layer_op, match.input_tensor, scaled_weight_tensor,
                    match.batch_to_space_op)

                if correction_recip is not None:
                    new_layer_tensor = math_ops.multiply(correction_recip,
                                                         new_layer_tensor,
                                                         name='post_conv_mul')
                    new_layer_tensor = math_ops.add(new_layer_tensor,
                                                    (correction_offset),
                                                    'correction_add')

                if S("util.variable.fixed_point.use"):
                    new_layer_tensor = fixed_point(
                        new_layer_tensor,
                        S("util.variable.fixed_point.bits"),
                        max=S("util.variable.fixed_point.max"),
                        min=S("util.variable.fixed_point.min"))

                new_layer_tensor = math_ops.add(new_layer_tensor,
                                                bias_tensor,
                                                name='add_fold')

                if S("util.variable.fixed_point.use"):
                    new_layer_tensor = tf.clip_by_value(
                        new_layer_tensor, S("util.variable.fixed_point.min"),
                        S("util.variable.fixed_point.max"))

                nodes_modified_count = common.RerouteTensor(
                    new_layer_tensor, match.output_tensor)
                if nodes_modified_count == 0:
                    raise ValueError(
                        'Folding batch norms failed, %s had no outputs.' %
                        match.output_tensor.name)
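# Illustrative sketch, not part of the original module: the folding algebra in
# the comments above can be checked numerically. For any pre-batch-norm
# activation y,
#   gamma * (y - mean) / sqrt(var + eps) + beta == y * m + b
# with m = gamma / sqrt(var + eps) (multiplier_tensor) and b = beta - mean * m
# (bias_tensor); by linearity of the convolution, m can then be folded into the
# weights. The helper name below is made up for the demo.
def _check_bn_fold_algebra():
    import numpy as np
    rnd = np.random.RandomState(0)
    y = rnd.randn(4, 8)                        # pre-batch-norm activations
    gamma, beta = rnd.randn(8), rnd.randn(8)
    mean, var, eps = rnd.randn(8), rnd.rand(8) + 0.1, 1e-3
    bn = gamma * (y - mean) / np.sqrt(var + eps) + beta
    m = gamma / np.sqrt(var + eps)             # multiplier_tensor
    b = beta - mean * m                        # bias_tensor
    assert np.allclose(bn, y * m + b)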
def _RedoRestBatchnorms(graph, is_training):
    """Finds batch norm layers that are not preceded by a foldable layer
    (Conv2D, fully connected, depthwise convolution) and rewrites them with
    sampled parameters.

    Args:
      graph: Graph to walk and modify.
      is_training: Bool, true if training.

    Raises:
      ValueError: When rerouting the rewritten batch norm output fails.
    """
    matches = _FindRestBatchNorms(graph)
    print("Replacing", len(matches), "BatchNorms (without a preceding Conv2D)")
    for match in matches:
        scope, sep, _ = match.bn_op.name.rpartition('/')
        # Make sure new ops are added to `graph` and put on the same device as
        # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
        # named `scope`. Otherwise, TF creates a unique scope whose name starts
        # with `scope`.
        with graph.as_default(), graph.name_scope(scope + sep):
            with graph.name_scope(scope + sep + '_psb' + sep):
                mean = match.mean_tensor
                variance = match.variance_tensor
                beta = match.beta_tensor
                gamma = match.gamma_tensor
                eps = match.batch_epsilon

                # new gamma = gamma / sqrt(variance + epsilon)
                # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
                multfac = gamma / math_ops.sqrt(variance + eps)
                gamma = multfac
                beta = -multfac * mean + beta
                mean = array_ops.zeros_like(mean)
                variance = array_ops.ones_like(variance)
                eps = array_ops.zeros_like(eps)

                gamma = variableFromSettings([], hiddenVar=gamma)[0]
                # gamma = fixed_point(gamma,S("util.variable.fixed_point.bits"),max=S("util.variable.fixed_point.max"),min=S("util.variable.fixed_point.min"))
                # gamma = next_base2(gamma,strict_positive=False)
                # gamma = 1/variableFromSettings([],hiddenVar=1/gamma)[0]
                # variance = variableFromSettings([],hiddenVar=math_ops.sqrt(variance+eps))[0]**2
                # beta = variableFromSettings([],hiddenVar=beta)[0]
                if S("util.variable.fixed_point.use"):
                    beta = fixed_point(beta,
                                       S("util.variable.fixed_point.bits"),
                                       max=S("util.variable.fixed_point.max"),
                                       min=S("util.variable.fixed_point.min"))
                # gamma = fixed_point(gamma,S("util.variable.fixed_point.bits"),max=S("util.variable.fixed_point.max"),min=S("util.variable.fixed_point.min"))
                # mean = fixed_point(mean,S("util.variable.fixed_point.bits"),max=S("util.variable.fixed_point.max"),min=S("util.variable.fixed_point.min"))
                # variance = fixed_point(variance,S("util.variable.fixed_point.bits"),max=S("util.variable.fixed_point.max"),min=S("util.variable.fixed_point.min"))

                # fixed_point division could be ok
                # silly silly_idiv(silly x, silly y) {
                #     uint64_t sign_bit = 1UL<<63;
                #     // unsetting the sign bit to ignore it
                #     silly res = ((x & ~sign_bit) / (y & sign_bit)) << 32;
                #     // setting the sign bit iff only one of sign bits is set
                #     res |= (x & sign_bit) ^ (y & sign_bit);
                #     return res;
                # }

                new_layer_tensor = nn.batch_normalization(
                    match.input_tensor,
                    mean,
                    variance,
                    beta,
                    gamma,
                    eps,
                    name=match.bn_op.name.split("/")[-1] + "_psb")

                if S("util.variable.fixed_point.use"):
                    new_layer_tensor = fixed_point(
                        new_layer_tensor,
                        S("util.variable.fixed_point.bits"),
                        max=S("util.variable.fixed_point.max"),
                        min=S("util.variable.fixed_point.min"))

                nodes_modified_count = common.RerouteTensor(
                    new_layer_tensor, match.output_tensor)
                if nodes_modified_count == 0:
                    raise ValueError(
                        'Folding batch norms failed, %s had no outputs.' %
                        match.output_tensor.name)
def sample_binomial(p, n, eps=S('binom.log_eps')):
    # sample from a binomial distribution
    if S("binom.probability_mode") == "tfp":
        P = tf.stack([p, 1.0 - p], axis=-1)
        weight_binom = tfp.distributions.Multinomial(total_count=n,
                                                     probs=P).sample()[..., 0]
        # weight_binom = tfp.distributions.Binomial(total_count=n,probs=p).sample()
        weight_binom = tf.cast(weight_binom, tf.float32)
    elif S("binom.probability_mode") == "gumbel":
        with tf.variable_scope("p"):
            # p = weight_p
            p = tf.clip_by_value(p, 0.0, 1.0)
            P = tf.stack([
                binomialCoeff(n, k) * p**k * (1 - p)**(n - k)
                for k in range(n + 1)
            ], axis=-1)
            # reduces numerical instabilities
            P = tf.clip_by_value(P, eps, 1.0)
            gumbel = -tf.log(
                tf.maximum(
                    -tf.log(tf.maximum(tf.random.uniform(P.get_shape()), eps)),
                    eps))
            # gumbel = -tf.log(-tf.log(tf.random.uniform(P.get_shape())))
            # tf.summary.histogram("binom_p",p)
            # tf.summary.histogram("binom_P",P)
            # tf.summary.histogram("binom_logP",tf.log(P))
            weight_binom = tf.argmax(tf.log(P) + gumbel, axis=-1)
            weight_binom = tf.cast(weight_binom, tf.float32)
    elif S("binom.probability_mode") == "gumbel_log":
        with tf.variable_scope("p"):
            # p = weight_p
            p = tf.clip_by_value(p, eps, 1.0 - eps)
            logP = tf.stack([
                np.log(binomialCoeff(n, k)) + k * tf.log(p) +
                (n - k) * tf.log(1 - p) for k in range(n + 1)
            ], axis=-1)
            # reduces numerical instabilities
            gumbel = -tf.log(
                tf.maximum(
                    -tf.log(tf.maximum(tf.random.uniform(logP.get_shape()), eps)),
                    eps))
            weight_binom = tf.argmax(logP + gumbel, axis=-1)
            weight_binom = tf.cast(weight_binom, tf.float32)

    if S("binom.gradient_correction") == "pass":
        weight_binom = pass_gradient(p, lambda p: weight_binom, lambda p: n * p)
    elif S("binom.gradient_correction") == "gumbel":
        weight_binom = pass_gradient(
            p, lambda p: weight_binom, lambda p: tf.squeeze(
                tf.batch_gather(
                    P, tf.cast(tf.expand_dims(weight_binom, -1), tf.int32))))
    else:
        raise ValueError("Gradient not defined for tf.cast. TODO")

    return weight_binom
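# Illustrative sketch, not part of the original module: the "gumbel" modes use
# the Gumbel-max trick, i.e. argmax_k(log P_k + g_k) with g_k ~ Gumbel(0, 1) is
# a sample from the categorical distribution P over the n+1 binomial outcomes.
# Framework-free version with made-up p and n; the helper names are made up
# for the demo.
def _demo_gumbel_max_binomial(p=0.3, n=8, num_samples=100000):
    import numpy as np
    from math import factorial

    def binomial_coeff(n, k):
        return factorial(n) // (factorial(k) * factorial(n - k))

    # binomial pmf over the n+1 outcomes
    P = np.array([
        binomial_coeff(n, k) * p**k * (1 - p)**(n - k) for k in range(n + 1)
    ])
    gumbel = -np.log(-np.log(np.random.rand(num_samples, n + 1)))
    samples = np.argmax(np.log(P) + gumbel, axis=-1)
    return samples.mean(), n * p  # empirical mean should be close to n * p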
import tensorflow as tf
import numpy as np

from template.misc import S, GLOBAL, print_info as print

if S("binom.probability_mode") == "tfp":
    import tensorflow_probability as tfp


# -------------- #
# tensor helpers #
# -------------- #

# get shape (excluding input shape)
def getshape(x):
    return x.get_shape().as_list()[1:]


# pass gradient around non-differentiable function
def pass_gradient(x, backward_fn, forward_fn=lambda x: x, name=None):
    fnx = forward_fn(x)
    return tf.add(fnx, tf.stop_gradient(backward_fn(x) - fnx), name=name)


# guess picture shape and reshape
def to_picture_shape(input):
    current_shape = input.get_shape().as_list()[1:]
    current_dim = np.prod(current_shape)
    shape = None
    for i in range(256, 3, -1):
        if current_dim % (i * i) == 0:
            shape = [-1, i, i, int(current_dim / (i * i))]
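# Illustrative sketch, not part of the original module: `pass_gradient` (defined
# above) is a straight-through estimator. Note that the forward *value* comes
# from `backward_fn`, while gradients flow through `forward_fn` (identity by
# default). The helper name below is made up for the demo.
def _demo_pass_gradient():
    x = tf.constant([0.2, 0.8, 1.4])
    y = pass_gradient(x, lambda v: tf.round(v))   # forward value: [0., 1., 1.]
    grad = tf.gradients(tf.reduce_sum(y), x)[0]   # gradient: all ones
    return y, grad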
def minimize(self,
             loss,
             global_step=None,
             var_list=None,
             aggregation_method=None,
             colocate_gradients_with_ops=False,
             name=None,
             grad_loss=None):
    # compute (meaned) gradients for a batch
    grads_and_vars = self.compute_gradients(
        loss,
        var_list=var_list,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)

    # check if any trainable variables provided
    for g, v in grads_and_vars:
        if g is None:
            print("Gradient of '" + v.name + "' is 'None'. Ignoring")
    grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]

    # default adam does:
    # return self.apply_gradients(grads_and_vars, global_step=global_step, name=name)

    # get all trainable variables
    variables = [v for g, v in grads_and_vars]

    # create a copy of all trainable variables with `0` as initial values
    with tf.name_scope("optimizer"):
        gradient_sum = [
            tf.get_variable(v.name.replace(":0", "_sum"),
                            initializer=tf.zeros_like(v.initialized_value()),
                            trainable=False) for v in variables
        ]

        def capacity_gradient(grad_sum, grad, name, var):
            if "hiddenWeight" in name and "weight_gradient" in GLOBAL:
                return GLOBAL["weight_gradient"](grad_sum, grad, var)
            return grad_sum + grad

        with tf.control_dependencies([GLOBAL["memory_step"]]):
            # collect the batch gradient into accumulated vars
            gradient_sum_update = [
                gs.assign(
                    tf.where(GLOBAL["memory_step"] > 0,
                             capacity_gradient(gs, g, v.name, v), g))
                for gs, (g, v) in zip(gradient_sum, grads_and_vars)
            ]

        with tf.control_dependencies(gradient_sum_update):
            train_step = tf.cond(
                GLOBAL["memory_step"] >= S("optimizer.memory_size") - 1,
                true_fn=lambda: self.apply_gradients(
                    [(gs / S("optimizer.memory_size"), v)
                     for gs, (g, v) in zip(gradient_sum, grads_and_vars)],
                    global_step),
                false_fn=lambda: tf.no_op())

    return train_step
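# Illustrative sketch, not part of the original module: the accumulation above
# averages `memory_size` consecutive batch gradients before a single
# `apply_gradients` call. Framework-free version of the same control flow, with
# made-up names:
def _demo_gradient_accumulation(batch_grads, memory_size):
    applied = []
    grad_sum = 0.0
    for step, g in enumerate(batch_grads):
        memory_step = step % memory_size
        grad_sum = g if memory_step == 0 else grad_sum + g
        if memory_step >= memory_size - 1:
            applied.append(grad_sum / memory_size)  # what apply_gradients sees
    return applied

# e.g. _demo_gradient_accumulation([1.0, 2.0, 3.0, 4.0], 2) -> [1.5, 3.5]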
def batch_normalization(inputs,
                        axis=-1,
                        momentum=0.99,
                        epsilon=1e-3,
                        center=True,
                        scale=True,
                        beta_initializer=init_ops.zeros_initializer(),
                        gamma_initializer=init_ops.ones_initializer(),
                        moving_mean_initializer=init_ops.zeros_initializer(),
                        moving_variance_initializer=init_ops.ones_initializer(),
                        beta_regularizer=None,
                        gamma_regularizer=None,
                        beta_constraint=None,
                        gamma_constraint=None,
                        training=False,
                        trainable=True,
                        name=None,
                        reuse=None,
                        renorm=False,
                        renorm_clipping=None,
                        renorm_momentum=0.99,
                        fused=None,
                        virtual_batch_size=None,
                        adjustment=None):
    layer = BatchNormalization(
        axis=axis,
        momentum=momentum,
        epsilon=epsilon,
        center=center,
        scale=scale,
        beta_initializer=beta_initializer,
        gamma_initializer=gamma_initializer,
        moving_mean_initializer=moving_mean_initializer,
        moving_variance_initializer=moving_variance_initializer,
        beta_regularizer=beta_regularizer,
        gamma_regularizer=gamma_regularizer,
        beta_constraint=beta_constraint,
        gamma_constraint=gamma_constraint,
        renorm=renorm,
        renorm_clipping=renorm_clipping,
        renorm_momentum=renorm_momentum,
        fused=fused,
        trainable=trainable,
        virtual_batch_size=virtual_batch_size,
        adjustment=adjustment,
        name=name,
        _reuse=reuse,
        _scope=name)
    res = layer.apply(inputs, training=training)

    if not S("batch_norm.transform"):
        return res

    # get moving mean and variance
    moving_mean, moving_variance = layer.moving_mean, layer.moving_variance
    beta_offset, gamma_scale = layer.beta, layer.gamma

    if GLOBAL["first_layer"]:
        GLOBAL["first_layer"] = False
    else:
        pass
        # print("reformulate batchnorm")

    # apply transformation
    # --------------------
    # moving_mean = variableFromSettings([],hiddenVar=moving_mean)[0]
    # moving_variance = variableFromSettings([],hiddenVar=moving_variance)[0]
    # beta_offset = variableFromSettings([],hiddenVar=beta_offset)[0]
    # gamma_scale = variableFromSettings([],hiddenVar=gamma_scale)[0]

    # apply transformation (no var)
    # --------------------
    # sample_size = S("binom.sample_size")
    # S("binom.sample_size",set=sample_size*4)
    # gamma_scale = gamma_scale/tf.sqrt(moving_variance+layer.epsilon)
    # gamma_scale = variableFromSettings([],hiddenVar=gamma_scale/tf.sqrt(moving_variance+layer.epsilon))[0]
    # moving_variance = 0*moving_variance+1
    # moving_variance = tf.ones_like(moving_variance)
    # S("binom.sample_size",set=sample_size)
    # moving_variance = 1.0/variableFromSettings([],hiddenVar=1.0/moving_variance)[0]
    # moving_mean = fixed_point(moving_mean,8)
    # moving_mean, _ = variableFromSettings([],hiddenVar=moving_mean)
    # moving_variance = next_base2(moving_variance, strict_positive=True)
    # moving_variance = 2**tf.ceil(tf.log(tf.maximum(tf.abs(moving_variance),0))/tf.log(2.0))
    # tf.summary.histogram("bn_mean",moving_mean)
    # tf.summary.histogram("bn_var",moving_variance)

    # set moving mean and variance
    layer.moving_mean, layer.moving_variance = moving_mean, moving_variance
    layer.beta, layer.gamma = beta_offset, gamma_scale

    # reapply
    res = layer.apply(inputs, training=training)
    return res