def sg_regularizer_loss(scale=1.0):
    r"""Gets the scaled regularizer loss.

    Args:
      scale: A scalar. A weight applied to the regularizer loss.

    Returns:
      `scale` times the mean of the tensors found in the
      `tf.GraphKeys.REGULARIZATION_LOSSES` collection, or a constant zero
      tensor when that collection is empty.
    """
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    if not reg_losses:
        # the mean of an empty list is NaN, which would poison any loss
        # this term is added to
        return tf.constant(0.0)
    return scale * tf.reduce_mean(reg_losses)
def get_loss(opt):
    r"""Builds the training loss: CTC loss plus a scaled regularizer loss.

    Args:
      opt: Options object. `opt.input`, `opt.target` and `opt.seq_len` are
        each indexed by `opt.gpu_index`.

    Returns:
      The CTC loss of the logits against the target, plus
      `tf.sg_regularizer_loss(0.03)`.
    """
    # encode audio feature
    logit = get_logit(opt.input[opt.gpu_index], voca_size=voca_size)

    # debug output: list the regularization losses registered by the network
    for reg_loss in tf.get_collection("regularization_losses"):
        print(reg_loss)
    print('--------------------')

    # CTC loss
    loss = logit.sg_ctc(target=opt.target[opt.gpu_index],
                        seq_len=opt.seq_len[opt.gpu_index])

    # add the regularization term (weight 0.03)
    loss += tf.sg_regularizer_loss(0.03)

    return loss
def sg_optim(loss, **kwargs):
    r"""Applies gradients to variables.

    Args:
      loss: A 0-D `Tensor` containing the value to minimize, or a list/tuple
        of 0-D tensors (one per GPU tower) for the multiple GPU case.
      kwargs:
        optim: A name for optimizer. 'MaxProp' (default), 'AdaMax', 'Adam', 'RMSProp' or 'sgd'.
        lr: A Python Scalar (optional). Learning rate. Default is .001.
        beta1: A Python Scalar (optional). Default is .9.
        beta2: A Python Scalar (optional). Default is .99.
        momentum : A Python Scalar for RMSProp optimizer (optional). Default is 0.
        category: A string or string list. Specifies the variables that should be trained (optional).
          Only if the name of a trainable variable starts with `category`, its value is updated.
          Default is '', which means all trainable variables are updated.
        clip_grad_norm: A Python Scalar (optional). When given, the gradients are clipped
          by this global norm before they are applied. Default is None (no clipping).

    Returns:
      An op that groups the gradient update op with every op in the
      `UPDATE_OPS` collection whose name starts with `category`.
    """
    opt = tf.sg_opt(kwargs)

    # default training options
    opt += tf.sg_opt(optim='MaxProp', lr=0.001, beta1=0.9, beta2=0.99, momentum=0.,
                     category='', clip_grad_norm=None)

    # select optimizer
    if opt.optim == 'MaxProp':
        optim = tf.sg_optimize.MaxPropOptimizer(learning_rate=opt.lr, beta2=opt.beta2)
    elif opt.optim == 'AdaMax':
        optim = tf.sg_optimize.AdaMaxOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif opt.optim == 'Adam':
        optim = tf.train.AdamOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif opt.optim == 'RMSProp':
        optim = tf.train.RMSPropOptimizer(learning_rate=opt.lr, decay=opt.beta1, momentum=opt.momentum)
    else:
        optim = tf.train.GradientDescentOptimizer(learning_rate=opt.lr)

    # a single category string is handled as a one-element list
    if isinstance(opt.category, (tuple, list)):
        categories = opt.category
    else:
        categories = [opt.category]

    # get trainable variables ( grouped per category, in category order )
    var_list = [t for cat in categories
                for t in tf.trainable_variables() if t.name.startswith(cat)]

    #
    # calc gradient
    #

    # multiple GPUs case
    if isinstance(loss, (tuple, list)):
        gradients = []
        # loop for each GPU tower
        for i, loss_ in enumerate(loss):
            # specify device
            with tf.device('/gpu:%d' % i):
                # give new scope only to operation
                with tf.name_scope('gpu_%d' % i):
                    # add gradient calculation operation for each GPU tower
                    gradients.append(tf.gradients(loss_, var_list))

        # averaging gradient
        gradient = [tf.add_n(grad) / len(loss) for grad in zip(*gradients)]
    # single GPU case
    else:
        gradient = tf.gradients(loss, var_list)

    # clip only when a norm was requested; without this guard a missing
    # `clip_grad_norm` is passed to tf.clip_by_global_norm as None
    # NOTE(review): clipping is applied to the final gradient list for both the
    # single- and multi-GPU paths - confirm this is the intended scope.
    if opt.clip_grad_norm is not None:
        gradient, _ = tf.clip_by_global_norm(gradient, opt.clip_grad_norm)

    # gradient update op
    with tf.device('/gpu:0'):
        grad_var = [(g, v) for g, v in zip(gradient, var_list)]
        grad_op = optim.apply_gradients(grad_var, global_step=tf.sg_global_step())

    # add summary using last tower value
    for g, v in grad_var:
        # exclude batch normal statics
        if 'mean' not in v.name and 'variance' not in v.name \
                and 'beta' not in v.name and 'gamma' not in v.name:
            tf.sg_summary_gradient(v, g)

    # extra update ops within category ( for example, batch normal running stat update )
    update_op = [t for cat in categories
                 for t in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if t.name.startswith(cat)]

    return tf.group(*([grad_op] + update_op))
def wrapper(tensor, **kwargs):
    r"""Runs the wrapped layer function `func` with defaulted options.

    Args:
      tensor: The input tensor handed to `func`. Its static shape supplies the
        defaults for `shape`, `in_dim` and `dim`.
      kwargs: Layer options, merged with `_context`. Options read here are
        `name`, `reuse`, `ln` and `dout`.

    Returns:
      The layer output, renamed to 'out', carrying a `_sugar` record of how it
      was built and an `sg_reuse` bound method.
    """
    # merge call-time options with the surrounding context
    opt = tf.sg_opt(kwargs) + _context

    # defaults taken from the input shape; dropout is off unless requested
    in_shape = tensor.get_shape().as_list()
    opt += tf.sg_opt(shape=in_shape, in_dim=in_shape[-1], dim=in_shape[-1], dout=0)
    # no bias when layer normalization is on
    opt += tf.sg_opt(bias=not opt.ln)

    # automatic layer naming
    if opt.name is None:
        # start from the layer function name
        opt.name = func.__name__.replace('sg_', '')

        # search key for layers already created under the current scope
        scope_name = tf.get_variable_scope().name
        prefix = scope_name + '/' if scope_name else ''
        key = prefix + 'layers/' + opt.name

        # collect the distinct existing layer names matching the key
        taken = set()
        for var in tf.get_collection(tf.GraphKeys.VARIABLES):
            pos = var.name.rfind(key)
            if pos >= 0:
                taken.add(var.name[pos:].split('/')[-2])

        # number the new layer one past the highest existing suffix
        if taken:
            opt.name += '_%d' % (max(int(n.split('_')[-1]) for n in taken) + 1)
        else:
            opt.name += '_1'

    # all layer variables start with 'layers/' prefix
    with tf.variable_scope('layers', reuse=opt.reuse):
        with tf.variable_scope(opt.name):
            # call layer function
            result = func(tensor, opt)

            # dropout is applied only in the `_phase`-true branch
            if opt.dout:
                pre_dropout = result
                result = tf.cond(_phase,
                                 lambda: tf.nn.dropout(pre_dropout, 1 - opt.dout),
                                 lambda: pre_dropout)

            # rename tensor
            result = tf.identity(result, 'out')

            # summarize the final output unless variables are being reused
            if not opt.reuse:
                tf.sg_summary_activation(result)

            # save node info for reuse
            result._sugar = tf.sg_opt(func=func, arg=tf.sg_opt(kwargs) + _context,
                                      prev=tensor, is_layer=True, name=opt.name)
            # inject reuse function
            result.sg_reuse = types.MethodType(sg_reuse, result)

    return result
def wrapper(tensor, **kwargs):
    r"""Runs the wrapped layer function `func`, then applies optional post-processing.

    After `func(tensor, opt)` the following steps run in order, each only when
    its option is set: batch normalization (`bn`), layer normalization (`ln`),
    activation (`act`) and dropout (`dout`).

    Args:
      tensor: The input tensor handed to `func`. Its static shape supplies the
        defaults for `shape`, `in_dim` and `dim`.
      kwargs: Layer options, merged with `_context`. Options read here are
        `name`, `reuse`, `bn`, `ln`, `act` and `dout`.

    Returns:
      The layer output, renamed to 'out', carrying a `_sugar` record of how it
      was built and an `sg_reuse` bound method.
    """
    import sg_initializer as init
    import sg_activation

    # kwargs parsing
    opt = tf.sg_opt(kwargs) + _context

    # set default argument
    try:
        shape = tensor.get_shape().as_list()
        # batch normalization off, layer normalization off, dropout off
        opt += tf.sg_opt(shape=shape, in_dim=shape[-1], dim=shape[-1], bn=False, ln=False, dout=0)
        # batch and layer normalization are mutually exclusive
        assert not ( opt.bn and opt.ln ), 'one of batch normalization and layer normalization is available.'
        # disable bias when normalization on
        opt += tf.sg_opt(bias=not (opt.bn or opt.ln))
    finally:
        pass

    # automatic layer naming
    if opt.name is None:

        # layer function name will be used as layer name
        opt.name = func.__name__.replace('sg_', '')

        # find existing layer names
        exist_layers = []
        for t in tf.get_collection(tf.GraphKeys.VARIABLES):
            scope_name = tf.get_variable_scope().name
            prefix = scope_name + '/' if len(scope_name) > 0 else ''
            i = t.name.rfind(prefix + 'layers/' + opt.name)
            if i >= 0:
                # second-to-last path component of the matched tail
                exist_layers.append(t.name[i:].split('/')[-2])
        exist_layers = list(set(exist_layers))

        # layer name numbering: one past the highest existing numeric suffix
        if len(exist_layers) == 0:
            opt.name += '_1'
        else:
            opt.name += '_%d' % ( max([int(n.split('_')[-1]) for n in exist_layers]) + 1)

    # all layer variables start with 'layers/' prefix
    with tf.variable_scope('layers', reuse=opt.reuse):

        with tf.variable_scope(opt.name):

            # call layer function
            out = func(tensor, opt)

            # apply batch normalization
            if opt.bn:
                # offset, scale parameter
                beta = init.constant('beta', opt.dim)
                gamma = init.constant('gamma', opt.dim, value=1)

                # running mean, variance ( assigned to in update_running_stat )
                mean_running = init.constant('mean', opt.dim)
                variance_running = init.constant('variance', opt.dim, value=1)

                # calc batch mean, variance over every axis except the last
                mean, variance = tf.nn.moments( out, axes=range(len(out.get_shape()) - 1))

                # update running mean, variance
                def update_running_stat():
                    # weight kept from the previous running value on each update
                    decay = 0.99
                    update_op = [ mean_running.assign(mean_running * decay + mean * (1 - decay)), variance_running.assign(variance_running * decay + variance * (1 - decay)) ]
                    # the returned batch statistics depend on the running-stat assignments
                    with tf.control_dependencies(update_op):
                        return tf.identity(mean), tf.identity(variance)

                # select mean, variance by training phase
                m, v = tf.cond( _phase, update_running_stat,  # updated running stat and batch mean, variance
                                lambda: (mean_running, variance_running) )  # saved mean, variance

                # apply batch normalization
                out = tf.nn.batch_normalization(out, m, v, beta, gamma, tf.sg_eps)

            # apply layer normalization
            if opt.ln:
                # offset, scale parameter
                beta = init.constant('beta', opt.dim)
                gamma = init.constant('gamma', opt.dim, value=1)

                # calc layer mean, variance for final axis
                mean, variance = tf.nn.moments( out, axes=[len(out.get_shape()) - 1], keep_dims=True)

                # apply normalization
                out = (out - mean) / tf.sqrt(variance + tf.sg_eps)

                # apply parameter
                out = gamma * out + beta

            # apply activation: looks up `sg_<act>` ( lower-cased ) in sg_activation
            if opt.act:
                out = getattr(sg_activation, 'sg_' + opt.act.lower())(out)

            # apply dropout ( only in the branch taken when `_phase` is true )
            if opt.dout:
                out = tf.cond(_phase, lambda: tf.nn.dropout(out, 1 - opt.dout), lambda: out)

            # rename tensor
            out = tf.identity(out, 'out')

            # add final output summary ( skipped when variables are being reused )
            if opt.reuse is None or not opt.reuse:
                tf.sg_summary_activation(out)

            # save node info for reuse
            out._sugar = tf.sg_opt(func=func, arg=tf.sg_opt(kwargs) + _context, prev=tensor, is_layer=True, name=opt.name)
            # inject reuse function
            out.sg_reuse = types.MethodType(sg_reuse, out)

    return out
def sg_optim(loss, **kwargs):
    r"""Applies gradients to variables.

    Args:
      loss: A 0-D `Tensor` containing the value to minimize.
      kwargs:
        optim: A name for optimizer. 'MaxProp' (default), 'AdaMax', 'Adam', 'DP_GD' or 'sgd'.
        lr: A Python Scalar (optional). Learning rate. Default is .001.
        beta1: A Python Scalar (optional). Default is .9.
        beta2: A Python Scalar (optional). Default is .99.
        category: A string or string list. Specifies the variables that should be trained (optional).
          Only if the name of a trainable variable starts with `category`, its value is updated.
          Default is '', which means all trainable variables are updated.
        eps, delta, gaussian_sanitizer, sigma, batches_per_lot: Read only when
          `optim` is 'DP_GD' and passed on to the DP optimizer. No defaults are set here.

    Returns:
      A list holding the gradient update op followed by every op in the
      `UPDATE_OPS` collection whose name starts with `category`.
    """
    opt = tf.sg_opt(kwargs)

    # default training options
    opt += tf.sg_opt(optim='MaxProp', lr=0.001, beta1=0.9, beta2=0.99, category='')

    use_dp = opt.optim == 'DP_GD'

    # select optimizer
    if opt.optim == 'MaxProp':
        optim = tf.sg_optimize.MaxPropOptimizer(learning_rate=opt.lr, beta2=opt.beta2)
    elif opt.optim == 'AdaMax':
        optim = tf.sg_optimize.AdaMaxOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif opt.optim == 'Adam':
        optim = tf.train.AdamOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif use_dp:
        optim = tf.sg_optimize.DPGradientDescentOptimizer(
            opt.lr,
            [opt.eps, opt.delta],
            opt.gaussian_sanitizer,
            sigma=opt.sigma,
            batches_per_lot=opt.batches_per_lot)
    else:
        optim = tf.train.GradientDescentOptimizer(learning_rate=opt.lr)

    # a single category string is handled as a one-element list
    if isinstance(opt.category, (tuple, list)):
        prefixes = opt.category
    else:
        prefixes = [opt.category]

    # trainable variables, grouped per category in category order
    var_list = [t for prefix in prefixes
                for t in tf.trainable_variables() if t.name.startswith(prefix)]

    # calc gradient
    if use_dp:
        # only handle 1 batch per lot
        print(type(loss))
        print(loss)
        # NOTE(review): the sanitized gradients are handed to apply_gradients as
        # returned - confirm compute_sanitized_gradients yields what it expects.
        grads = optim.compute_sanitized_gradients(loss, var_list=var_list)
    else:
        grads = optim.compute_gradients(loss, var_list=var_list)

    # add summary, excluding batch normal statics
    skipped = ('mean', 'variance', 'beta', 'gamma')
    for v, g in zip(var_list, grads):
        if not any(tag in v.name for tag in skipped):
            tf.sg_summary_gradient(v, g)

    # gradient update op
    grad_op = optim.apply_gradients(grads, global_step=tf.sg_global_step())

    # extra update ops within category ( for example, batch normal running stat update )
    update_op = [t for prefix in prefixes
                 for t in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if t.name.startswith(prefix)]

    return [grad_op] + update_op
targ = tf.placeholder(dtype=tf.int32, shape=(1, None)) #corpus.label.shape)#corpus.label loss = logit.sg_ctc(target=targ, seq_len=seq_len) opt = tf.train.GradientDescentOptimizer(learning_rate=lr) optimizer = opt.minimize(loss, var_list=(noise, )) new_target = np.array(str2index(fool)) # run network with tf.Session() as sess: # init variables tf.sg_init(sess) all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + \ tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS) vars_to_train = [el for el in all_vars if 'noise' not in el.name] # restore parameters saver = tf.train.Saver(vars_to_train) saver.restore(sess, tf.train.latest_checkpoint('asset/train')) # run session for i in xrange(10000): new_loss, _, noise_out = sess.run([loss, optimizer, noise], feed_dict={ x: mfccs[index], targ: new_target.reshape((1, -1)) })
def tower_loss2_old(xx, scope, reuse_vars=False):
    r"""Builds the total loss of one tower over consecutive sentence pairs.

    `xx` is unstacked along axis 1 into sentences. For every adjacent pair the
    earlier sentence is encoded and the later one is the decoding target; the
    mean cross entropy of each pair is added to the 'losses' collection.

    NOTE(review): `self` ( used as `self._dev` ) is not a parameter of this
    function and `pool` is not defined in it - presumably both come from an
    enclosing scope; confirm before relying on this function.

    Args:
      xx: Input tensor; unstacked along axis 1 into per-sentence tensors.
      scope: Scope used to filter the 'losses' collection for this tower.
      reuse_vars: Passed as the reuse flag to the variable scope and layers.

    Returns:
      The sum of the tensors in the 'losses' collection under `scope`.
    """
    # make embedding matrix for source and target
    with tf.variable_scope('embs', reuse=reuse_vars):
        emb_x = tf.sg_emb(name='emb_x', voca_size=Hp.vs, dim=Hp.hd, dev=self._dev)
        emb_y = tf.sg_emb(name='emb_y', voca_size=Hp.vs, dim=Hp.hd, dev=self._dev)

    x_sents = tf.unstack(xx, axis=1)  # each element is (batch, sentlen)

    # generate first an unconditioned sentence
    n_input = Hp.hd

    subrec1 = subrec_zero_state(Hp.bs, Hp.hd)
    subrec2 = subrec_zero_state(Hp.bs, Hp.hd)

    rnn_cell = LSTMCell(in_dim=n_input, dim=Hp.hd)
    (rnn_state, rnn_h) = rnn_cell.zero_state(Hp.bs)

    crnn_cell = ConvLSTMCell(in_dim=n_input, dim=Hp.hd)
    (crnn_state, crnn_h) = crnn_cell.zero_state(n_input)

    for sent in range(len(x_sents) - 1):
        # index with the sentence loop variable; `i` belongs to the inner block
        # loops below and is not defined yet on the first pass
        y = x_sents[sent + 1]
        x = x_sents[sent]  # (batch, sentlen) = (16, 200)

        # shift target by one step for training source
        y_src = tf.concat([tf.zeros((Hp.bs, 1), tf.sg_intx), y[:, :-1]], 1)

        # embed table lookup
        enc = x.sg_lookup(emb=emb_x)  # (batch, sentlen, dim1)

        # loop dilated conv block
        for i in range(num_blocks):
            enc = (enc
                   .sg_res_block(size=5, rate=1, name="enc1_%d" % (i), reuse_vars=reuse_vars)
                   .sg_res_block(size=5, rate=2, name="enc2_%d" % (i), reuse_vars=reuse_vars)
                   .sg_res_block(size=5, rate=4, name="enc4_%d" % (i), reuse_vars=reuse_vars)
                   .sg_res_block(size=5, rate=8, name="enc8_%d" % (i), reuse_vars=reuse_vars)
                   .sg_res_block(size=5, rate=16, name="enc16_%d" % (i), reuse_vars=reuse_vars))

        # quasi rnn layer  [batch * 3, t, dim2 ]
        conv = enc.sg_quasi_conv1d(is_enc=True, size=2, name="conv1", reuse_vars=reuse_vars)

        # attention layer
        # recurrent layer  # 1 + final encoder hidden state
        # NOTE(review): `concat` is assigned but never read - confirm whether it
        # was meant to feed the recurrent layer.
        concat = subrec1.sg_concat(target=conv, dim=0)
        subrec1 = conv.sg_quasi_rnn(is_enc=True, att=True)

        # NOTE(review): `pool` is not defined in this function - confirm the
        # intended input of this second convolution.
        conv = pool.sg_quasi_conv1d(is_enc=True, size=2, name="conv2", reuse_vars=reuse_vars)
        concat = subrec2.sg_concat(target=conv, dim=0)
        subrec2 = conv.sg_quasi_rnn(is_enc=True, att=True)

        # conv LSTM
        (crnn_state, crnn_h) = crnn_cell(subrec2, (crnn_state, crnn_h), 5)

        # recurrent block
        (rnn_state, rnn_h) = rnn_cell(crnn_h, (rnn_state, rnn_h))

        # CNN decoder
        dec = crnn_h.sg_concat(target=y_src.sg_lookup(emb=emb_y), name="dec")

        for i in range(num_blocks):
            dec = (dec
                   .sg_res_block(size=3, rate=1, causal=True, name="dec1_%d" % (i), reuse_vars=reuse_vars)
                   .sg_res_block(size=3, rate=2, causal=True, name="dec2_%d" % (i), reuse_vars=reuse_vars)
                   .sg_res_block(size=3, rate=4, causal=True, name="dec4_%d" % (i), reuse_vars=reuse_vars)
                   .sg_res_block(size=3, rate=8, causal=True, name="dec8_%d" % (i), reuse_vars=reuse_vars)
                   .sg_res_block(size=3, rate=16, causal=True, name="dec16_%d" % (i), reuse_vars=reuse_vars))

        # final fully convolution layer for softmax
        dec = dec.sg_conv1d_gpus(size=1, dim=Hp.vs, name="out", summary=False,
                                 dev=self._dev, reuse=reuse_vars)

        ce_array = dec.sg_ce(target=y, mask=True, name="cross_ent_example")
        cross_entropy_mean = tf.reduce_mean(ce_array, name='cross_entropy')
        tf.add_to_collection('losses', cross_entropy_mean)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    return total_loss