def sg_optim(loss, **kwargs):
    r"""Builds the op that applies the gradients of `loss` to the selected variables.

    Args:
      loss: The tensor to minimize; it is handed to `compute_gradients()`.
      kwargs:
        optim: A name for optimizer. 'MaxProp' (default) or 'AdaMax'. Any other
          name selects plain gradient descent.
        lr: Learning rate. Default is .001.
        beta1: Used by 'AdaMax' only. Default is .9.
        beta2: Used by 'MaxProp' and 'AdaMax'. Default is .99.
        category: A string. Only trainable variables whose name starts with
          `category` are updated. Default is '', which matches every variable.

    Returns:
      The op returned by `apply_gradients()`; it also advances `tf.sg_global_step()`.
    """
    opt = tf.sg_opt(kwargs)

    # default training options
    opt += tf.sg_opt(optim='MaxProp', lr=0.001, beta1=0.9, beta2=0.99, category='')

    # select optimizer
    if opt.optim == 'MaxProp':
        optim = tf.sg_optimize.MaxPropOptimizer(learning_rate=opt.lr, beta2=opt.beta2)
    elif opt.optim == 'AdaMax':
        optim = tf.sg_optimize.AdaMaxOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    else:
        # Without this branch `optim` stayed unbound for every other name and the
        # function died later with a NameError.
        optim = tf.train.GradientDescentOptimizer(learning_rate=opt.lr)

    # get trainable variables
    # Names are compared as text; encoding them to bytes first breaks
    # `startswith` with a text prefix on Python 3.
    var_list = [t for t in tf.trainable_variables() if t.name.startswith(opt.category)]

    # calc gradient
    gradient = optim.compute_gradients(loss, var_list=var_list)

    # add summary
    for v, g in zip(var_list, gradient):
        tf.sg_summary_gradient(v, g)

    # gradient update op
    return optim.apply_gradients(gradient, global_step=tf.sg_global_step())
def alt_train(sess, opt):
    """Runs one discriminator step and returns the mean of its loss.

    `opt` is accepted for the training-loop calling convention but is not read.
    """
    step = sess.run(tf.sg_global_step())

    if step % 1 != 0:
        # evaluate the discriminator loss without training
        # (never reached for an integer step, since the modulus is 1)
        disc_loss = sess.run(loss_d)
    else:
        # training discriminator
        disc_loss, _ = sess.run([loss_d_r, train_disc])

    # NOTE: the generator step is disabled here, so only the discriminator
    # loss contributes to the returned value.
    return np.mean(disc_loss)
# # run network # with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: # init variables tf.sg_init(sess) # restore parameters saver = tf.train.Saver() saver.restore(sess, tf.train.latest_checkpoint('asset/train')) # logging tf.sg_info('Testing started on %s set at global step[%08d].' % (tf.sg_arg().set.upper(), sess.run(tf.sg_global_step()))) with tf.sg_queue_context(): # create progress bar iterator = tqdm(range(0, int(data.num_batch * tf.sg_arg().frac)), total=int(data.num_batch * tf.sg_arg().frac), initial=0, desc='test', ncols=70, unit='b', leave=False) # batch loop loss_avg = 0. for _ in iterator: # run session batch_loss = sess.run(loss) # loss history update
def sg_optim(loss, **kwargs):
    r"""Applies gradients to variables.

    Args:
        loss: A 0-D `Tensor` containing the value to minimize, or a list/tuple of
          0-D tensors (one per GPU tower) for multiple GPUs.
        kwargs:
          optim: A name for optimizer. 'MaxProp' (default), 'AdaMax', 'Adam', 'RMSProp' or 'sgd'.
            Any name other than the first four selects plain gradient descent.
          lr: A Python Scalar (optional). Learning rate. Default is .001.
          beta1: A Python Scalar (optional). Default is .9. Also used as `decay` for RMSProp.
          beta2: A Python Scalar (optional). Default is .99.
          momentum: A Python Scalar for RMSProp optimizer (optional). Default is 0.
          clip_grad_norm: A Python Scalar (optional). When given, the gradients are clipped
            with `tf.clip_by_global_norm` to this norm. No clipping when omitted.
          category: A string or string list. Specifies the variables that should be trained (optional).
            Only if the name of a trainable variable starts with `category`, its value is updated.
            Default is '', which means all trainable variables are updated.

    Returns:
        A single op grouping the gradient update op with the `UPDATE_OPS`
        collection entries whose name starts with `category`.
    """
    opt = tf.sg_opt(kwargs)

    # default training options
    opt += tf.sg_opt(optim='MaxProp', lr=0.001, beta1=0.9, beta2=0.99, momentum=0., category='')

    # select optimizer
    if opt.optim == 'MaxProp':
        optim = tf.sg_optimize.MaxPropOptimizer(learning_rate=opt.lr, beta2=opt.beta2)
    elif opt.optim == 'AdaMax':
        optim = tf.sg_optimize.AdaMaxOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif opt.optim == 'Adam':
        optim = tf.train.AdamOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif opt.optim == 'RMSProp':
        optim = tf.train.RMSPropOptimizer(learning_rate=opt.lr, decay=opt.beta1, momentum=opt.momentum)
    else:
        optim = tf.train.GradientDescentOptimizer(learning_rate=opt.lr)

    def _in_category(items):
        # Keep the items whose name starts with a category prefix. With several
        # prefixes the result is ordered prefix by prefix, and an item matching
        # more than one prefix appears once per match.
        if isinstance(opt.category, (tuple, list)):
            prefixes = opt.category
        else:
            prefixes = [opt.category]
        return [t for prefix in prefixes for t in items if t.name.startswith(prefix)]

    # get trainable variables
    var_list = _in_category(tf.trainable_variables())

    #
    # calc gradient
    #

    # multiple GPUs case
    if isinstance(loss, (tuple, list)):
        tower_grads = []
        # loop for each GPU tower
        for i, loss_ in enumerate(loss):
            # specify device
            with tf.device('/gpu:%d' % i):
                # give new scope only to operation
                with tf.name_scope('gpu_%d' % i):
                    # add gradient calculation operation for each GPU tower
                    tower_grads.append(tf.gradients(loss_, var_list))

        # averaging gradient
        gradient = [tf.add_n(grad) / len(loss) for grad in zip(*tower_grads)]
    # single GPU case
    else:
        gradient = tf.gradients(loss, var_list)

    # Clip only when a norm was requested: `clip_grad_norm` has no default, and
    # clipping against a missing value fails inside `tf.clip_by_global_norm`.
    # NOTE(review): the original layout was lost, so it is unclear whether
    # clipping was meant for the single-GPU path only; here it is applied to
    # the final gradient list on both paths - confirm.
    clip_norm = getattr(opt, 'clip_grad_norm', None)
    if clip_norm is not None:
        gradient, _ = tf.clip_by_global_norm(gradient, clip_norm)

    # gradient update op
    with tf.device('/gpu:0'):
        grad_var = list(zip(gradient, var_list))
        grad_op = optim.apply_gradients(grad_var, global_step=tf.sg_global_step())

    # add summary using last tower value
    for g, v in grad_var:
        # exclude batch normal statics
        if 'mean' not in v.name and 'variance' not in v.name \
                and 'beta' not in v.name and 'gamma' not in v.name:
            tf.sg_summary_gradient(v, g)

    # extra update ops within category
    # ( for example, batch normal running stat update )
    update_op = _in_category(tf.get_collection(tf.GraphKeys.UPDATE_OPS))

    return tf.group(*([grad_op] + update_op))
def console_log(sess_):
    """Logs the current epoch, global step and running loss through `tf.sg_info`.

    `epoch` and `loss` are read from the enclosing scope; nothing is logged
    while `epoch` is negative.
    """
    if epoch < 0:
        return

    fields = (
        epoch,
        sess_.run(tf.sg_global_step()),
        '%8.6f' % loss if loss is not None else 'NA',
    )
    tf.sg_info('\tEpoch[%03d:gs=%d] - loss = %s' % fields)
def wrapper(**kwargs):
    r""" Manages arguments of `tf.sg_opt`.

    Runs the training loop under a `tf.train.Supervisor`. After every epoch the
    validation metrics are evaluated through `show_metrics`, and the model is
    saved separately whenever the returned score improves on the best so far.

    Args:
      **kwargs:
        lr: A Python Scalar (optional). Learning rate. Default is .001.

        save_dir: A string. The root path to which checkpoint and log files are saved.
          Default is `asset/train`.
        max_ep: A positive integer. Maximum number of epochs. Default is 1000.
        ep_size: A positive integer. Number of Total batches in an epoch.
          For proper display of log. Default is 1e5.

        save_interval: A Python scalar. The interval of saving checkpoint files.
          By default, for every 600 seconds, a checkpoint file is written.
        log_interval: A Python scalar. The interval of recoding logs.
          By default, for every 60 seconds, logging is executed.
        max_keep: A positive integer. Maximum number of recent checkpoints to keep. Default is 5.
        keep_interval: A Python scalar. How often to keep checkpoints. Default is 1 hour.

        eval_metric: A list of tensors containing the value to evaluate. Default is [].
          Every entry gets a metric summary; entries 2 and 3 are also passed to
          `show_metrics` for the validation pass, so the default empty list
          does not work with this loop.
        test_metric: A list whose entries 0 and 1 are passed to `show_metrics`
          for the test pass. No default.
        val_ep_size, test_ep_size: Passed to `show_metrics` for the validation
          and test pass. No default.

        tqdm: Boolean. If True (Default), progress bars are shown. If False, a series of loss
            will be shown on the console.
    """
    opt = tf.sg_opt(kwargs)

    # default training options
    opt += tf.sg_opt(lr=0.001,
                     save_dir='asset/train',
                     max_ep=1000, ep_size=100000,
                     save_interval=600, log_interval=60,
                     eval_metric=[],
                     max_keep=5, keep_interval=1,
                     tqdm=True)

    # training epoch and loss ( read by console_log below )
    epoch, loss = -1, None

    # checkpoint saver
    saver = tf.train.Saver(max_to_keep=opt.max_keep,
                           keep_checkpoint_every_n_hours=opt.keep_interval)

    # add evaluation summary
    for m in opt.eval_metric:
        tf.sg_summary_metric(m)

    # summary writer ( one run directory per start time: month, day, hour, minute )
    log_dir = opt.save_dir + '/run-%02d%02d-%02d%02d' % tuple(tf.time.localtime(tf.time.time()))[1:5]
    summary_writer = tf.summary.FileWriter(log_dir)

    # console logging function
    def console_log(sess_):
        if epoch >= 0:
            tf.sg_info('\tEpoch[%03d:gs=%d] - loss = %s' %
                       (epoch, sess_.run(tf.sg_global_step()),
                        ('NA' if loss is None else '%8.6f' % loss)))

    local_init_op = tf.group(tf.sg_phase().assign(True),
                             tf.tables_initializer(),
                             tf.local_variables_initializer())

    # create supervisor
    sv = tf.train.Supervisor(logdir=opt.save_dir,
                             saver=saver,
                             save_model_secs=opt.save_interval,
                             summary_writer=summary_writer,
                             save_summaries_secs=opt.log_interval,
                             global_step=tf.sg_global_step(),
                             local_init_op=local_init_op)

    # create session
    with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        # console logging loop
        if not opt.tqdm:
            sv.loop(opt.log_interval, console_log, args=(sess,))

        # get start epoch
        _step = sess.run(tf.sg_global_step())
        ep = _step // opt.ep_size

        best_f1 = 0

        # check if already finished
        if ep <= opt.max_ep:

            # logging
            tf.sg_info('Training started from epoch[%03d]-step[%d].' % (ep, _step))

            # epoch loop
            for ep in range(ep, opt.max_ep + 1):

                # update epoch info
                start_step = sess.run(tf.sg_global_step()) % opt.ep_size
                epoch = ep

                # create progressbar iterator
                if opt.tqdm:
                    iterator = tf.tqdm(range(start_step, opt.ep_size), total=opt.ep_size, initial=start_step,
                                       desc='train', ncols=70, unit='b', leave=False)
                else:
                    iterator = range(start_step, opt.ep_size)

                # batch loop
                for _ in iterator:

                    # exit loop
                    if sv.should_stop():
                        break

                    # call train function
                    batch_loss = func(sess, opt)

                    # loss history update
                    # Skip batches holding NaN or Inf so one bad batch cannot
                    # poison the moving average. The previous test applied
                    # isnan/isinf to `batch_loss.all()`, a boolean, and so
                    # never rejected anything.
                    if batch_loss is not None and np.all(np.isfinite(batch_loss)):
                        if loss is None:
                            loss = np.mean(batch_loss)
                        else:
                            loss = loss * 0.9 + np.mean(batch_loss) * 0.1

                # log epoch information
                console_log(sess)

                # validation pass
                f1_stat = show_metrics(sv, sess, opt.eval_metric[2], opt.eval_metric[3], ep,
                                       opt.val_ep_size, 'val', use_tqdm=True)

                # NOTE(review): the original indentation was lost; the test
                # pass is assumed to run only when the validation score
                # improves - confirm.
                if f1_stat > best_f1:
                    best_f1 = f1_stat
                    max_model_file = opt.save_dir + max_model_name

                    # save best version
                    saver.save(sess, max_model_file)
                    print("Improved F1 score, max model saved in file: %s" % max_model_file)

                    print('Test metrics:')
                    show_metrics(sv, sess, opt.test_metric[0], opt.test_metric[1], ep,
                                 opt.test_ep_size, 'test', use_tqdm=True)

            # save last version
            saver.save(sess, opt.save_dir + '/model.ckpt', global_step=sess.run(tf.sg_global_step()))

            # logging
            tf.sg_info('Training finished at epoch[%d]-step[%d].' % (ep, sess.run(tf.sg_global_step())))
        else:
            tf.sg_info('Training already finished at epoch[%d]-step[%d].' %
                       (ep - 1, sess.run(tf.sg_global_step())))
def wrapper(**kwargs):
    r"""Runs the training loop around `func`, with checkpointing, logging and
    learning-rate decay.

    Training resumes from the newest checkpoint under `<save_dir>/ckpt` when
    one exists. A final checkpoint is always written on exit, including when
    the loop ends with an exception.

    Args:
      **kwargs:
        lr: Learning rate. Default is .001.
        save_dir: Root path; checkpoints go to `<save_dir>/ckpt` and summaries
          to `<save_dir>/log`. Default is `asset/train`.
        max_ep: Maximum number of epochs. Default is 1000.
        ep_size: Number of batches in an epoch. Default is 100000.
        save_interval: Seconds between checkpoints. Default is 600.
        log_interval: Seconds between logging passes. Default is 60.
        early_stop: If True (default), each logging pass where the running loss
          is not below .95 * the previously logged loss halves the learning
          rate, or stops training once the learning rate is below 5e-6.
        lr_reset: If True, the learning rate is set to `lr` even when resuming.
          Default is False.
        eval_metric: A list of tensors evaluated at every logging pass. Default is [].
        max_keep: Maximum number of recent checkpoints to keep. Default is 5.
        keep_interval: Passed to the saver as `keep_checkpoint_every_n_hours`. Default is 1.
        tqdm: If True (default), a progress bar is shown.
        console_log: If True, the loss is logged to the console instead of
          being written as a summary. Default is False.
        sess: An existing session to train in (optional). When omitted a new
          session is created and closed at the end.
    """
    opt = tf.sg_opt(kwargs)

    # default training options
    opt += tf.sg_opt(lr=0.001,
                     save_dir='asset/train',
                     max_ep=1000, ep_size=100000,
                     save_interval=600, log_interval=60,
                     early_stop=True, lr_reset=False,
                     eval_metric=[],
                     max_keep=5, keep_interval=1,
                     tqdm=True, console_log=False)

    # make directory if not exist
    if not os.path.exists(opt.save_dir + '/log'):
        os.makedirs(opt.save_dir + '/log')
    if not os.path.exists(opt.save_dir + '/ckpt'):
        os.makedirs(opt.save_dir + '/ckpt')

    # find last checkpoint
    last_file = tf.train.latest_checkpoint(opt.save_dir + '/ckpt')
    if last_file:
        # Checkpoints are named model-<epoch>-<global step>. Parse the file
        # name only: splitting the full path breaks as soon as a directory in
        # `save_dir` contains a hyphen.
        name_parts = os.path.basename(last_file).split('-')
        ep = start_ep = int(name_parts[1]) + 1
        start_step = int(name_parts[2])
    else:
        ep = start_ep = 1
        start_step = 0

    # checkpoint saver
    saver = tf.train.Saver(max_to_keep=opt.max_keep,
                           keep_checkpoint_every_n_hours=opt.keep_interval)

    # summary writer
    summary_writer = tf.train.SummaryWriter(opt.save_dir + '/log', graph=tf.get_default_graph())

    # add learning rate summary
    with tf.name_scope('summary'):
        tf.scalar_summary('60. learning_rate/learning_rate', _learning_rate)

    # add evaluation metric summary
    for m in opt.eval_metric:
        tf.sg_summary_metric(m)

    # summary op
    summary_op = tf.merge_all_summaries()

    # create session
    if opt.sess:
        sess = opt.sess
    else:
        # session with multiple GPU support
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # initialize variables
    # NOTE(review): assumed to run for a caller-supplied session as well; the
    # original indentation was lost - confirm.
    sg_init(sess)

    # restore last checkpoint
    if last_file:
        saver.restore(sess, last_file)

    # set learning rate
    if start_ep == 1 or opt.lr_reset:
        sess.run(_learning_rate.assign(opt.lr))

    # logging
    tf.sg_info('Training started from epoch[%03d]-step[%d].' % (start_ep, start_step))

    try:
        # start data queue runner
        with tf.sg_queue_context(sess):

            # set session mode to train
            tf.sg_set_train(sess)

            # loss history for learning rate decay
            loss, loss_prev, early_stopped = None, None, False

            # time stamp for saving and logging
            last_saved = last_logged = time.time()

            # epoch loop
            for ep in range(start_ep, opt.max_ep + 1):

                # show progressbar
                if opt.tqdm:
                    iterator = tqdm(range(opt.ep_size), desc='train', ncols=70, unit='b', leave=False)
                else:
                    iterator = range(opt.ep_size)

                # batch loop
                for _ in iterator:

                    # call train function
                    batch_loss = func(sess, opt)

                    # loss history update ( exponential moving average )
                    if batch_loss is not None:
                        if loss is None:
                            loss = np.mean(batch_loss)
                        else:
                            loss = loss * 0.9 + np.mean(batch_loss) * 0.1

                    # saving
                    if time.time() - last_saved > opt.save_interval:
                        last_saved = time.time()
                        saver.save(sess, opt.save_dir + '/ckpt/model-%03d' % ep,
                                   write_meta_graph=False,
                                   global_step=sess.run(tf.sg_global_step()))

                    # logging
                    if time.time() - last_logged > opt.log_interval:
                        last_logged = time.time()

                        # set session mode to infer
                        tf.sg_set_infer(sess)

                        # run evaluation op
                        if len(opt.eval_metric) > 0:
                            sess.run(opt.eval_metric)

                        if opt.console_log:  # console logging
                            # log epoch information
                            tf.sg_info('\tEpoch[%03d:lr=%7.5f:gs=%d] - loss = %s' %
                                       (ep, sess.run(_learning_rate), sess.run(tf.sg_global_step()),
                                        ('NA' if loss is None else '%8.6f' % loss)))
                        else:  # tensorboard logging
                            # run logging op
                            summary_writer.add_summary(sess.run(summary_op),
                                                       global_step=sess.run(tf.sg_global_step()))

                        # learning rate decay
                        if opt.early_stop and loss_prev:
                            # if loss stalling
                            if loss >= 0.95 * loss_prev:
                                # early stopping
                                current_lr = sess.run(_learning_rate)
                                if current_lr < 5e-6:
                                    early_stopped = True
                                    break
                                else:
                                    # decrease learning rate by half
                                    sess.run(_learning_rate.assign(current_lr / 2.))

                        # update loss history
                        loss_prev = loss

                        # revert session mode to train
                        tf.sg_set_train(sess)

                # log epoch information
                if not opt.console_log:
                    tf.sg_info('\tEpoch[%03d:lr=%7.5f:gs=%d] - loss = %s' %
                               (ep, sess.run(_learning_rate), sess.run(tf.sg_global_step()),
                                ('NA' if loss is None else '%8.6f' % loss)))

                if early_stopped:
                    tf.sg_info('\tEarly stopped ( no loss progress ).')
                    break
    finally:
        # save last epoch
        saver.save(sess, opt.save_dir + '/ckpt/model-%03d' % ep,
                   write_meta_graph=False,
                   global_step=sess.run(tf.sg_global_step()))

        # set session mode to infer
        tf.sg_set_infer(sess)

        # logging
        tf.sg_info('Training finished at epoch[%d]-step[%d].' % (ep, sess.run(tf.sg_global_step())))

        # close session ( only when it was created here )
        if opt.sess is None:
            sess.close()
def wrapper(**kwargs):
    r""" Manages arguments of `tf.sg_opt`.

    Training resumes from the newest checkpoint in `save_dir` when one exists.
    A final checkpoint is always written on exit, including when the loop ends
    with an exception.

    Args:
      **kwargs:
        lr: A Python Scalar (optional). Learning rate. Default is .001.

        eval_metric: A list of tensors containing the value to evaluate. Default is [].
        early_stop: Boolean. If True (default), every logging pass compares the
          running loss with the previously logged one. When the current loss is
          not below .95 * the previous loss:
          i. training stops if the current learning rate is less than 5e-6;
          ii. otherwise the learning rate is halved.
        lr_reset: Boolean. If True, learning rate is set to opt.lr. when training restarts.
          Otherwise (Default), the value of the stored `_learning_rate` is taken.
        save_dir: A string. The root path to which checkpoint and log files are saved.
          Default is `asset/train`.
        max_ep: A positive integer. Maximum number of epochs. Default is 1000.
        ep_size: A positive integer. Number of Total batches in an epoch.
          For proper display of log. Default is 1e5.

        save_interval: A Python scalar. The interval of saving checkpoint files.
          By default, for every 600 seconds, a checkpoint file is written.
        log_interval: A Python scalar. The interval of recoding logs.
          By default, for every 60 seconds, logging is executed.
        max_keep: A positive integer. Maximum number of recent checkpoints to keep. Default is 5.
        keep_interval: A Python scalar. How often to keep checkpoints. Default is 1 hour.

        tqdm: Boolean. If True (Default), progress bars are shown.
        console_log: Boolean. If True, a series of loss will be shown
          on the console instead of tensorboard. Default is False.
        sess: An existing session to train in (optional). When omitted a new
          session is created and closed at the end.
    """
    opt = tf.sg_opt(kwargs)

    # default training options
    opt += tf.sg_opt(lr=0.001,
                     save_dir='asset/train',
                     max_ep=1000, ep_size=100000,
                     save_interval=600, log_interval=60,
                     early_stop=True, lr_reset=False,
                     eval_metric=[],
                     max_keep=5, keep_interval=1,
                     tqdm=True, console_log=False)

    # make directory if not exist
    if not os.path.exists(opt.save_dir):
        os.makedirs(opt.save_dir)

    # find last checkpoint
    last_file = tf.train.latest_checkpoint(opt.save_dir)
    if last_file:
        # Checkpoints are named model-<epoch>-<global step>. Parse the file
        # name only: splitting the full path breaks as soon as a directory in
        # `save_dir` contains a hyphen.
        name_parts = os.path.basename(last_file).split('-')
        ep = start_ep = int(name_parts[1]) + 1
        start_step = int(name_parts[2])
    else:
        ep = start_ep = 1
        start_step = 0

    # checkpoint saver
    saver = tf.train.Saver(max_to_keep=opt.max_keep,
                           keep_checkpoint_every_n_hours=opt.keep_interval)

    # summary writer
    summary_writer = tf.summary.FileWriter(opt.save_dir, graph=tf.get_default_graph())

    # add learning rate summary
    tf.summary.scalar('learning_r', _learning_rate)

    # add evaluation metric summary
    for m in opt.eval_metric:
        tf.sg_summary_metric(m)

    # summary op
    summary_op = tf.summary.merge_all()

    # create session
    if opt.sess:
        sess = opt.sess
    else:
        # session with multiple GPU support
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # initialize variables
    # NOTE(review): assumed to run for a caller-supplied session as well; the
    # original indentation was lost - confirm.
    sg_init(sess)

    # restore last checkpoint
    if last_file:
        saver.restore(sess, last_file)

    # set learning rate
    if start_ep == 1 or opt.lr_reset:
        sess.run(_learning_rate.assign(opt.lr))

    # logging
    tf.sg_info('Training started from epoch[%03d]-step[%d].' % (start_ep, start_step))

    try:
        # start data queue runner
        with tf.sg_queue_context(sess):

            # set session mode to train
            tf.sg_set_train(sess)

            # loss history for learning rate decay
            loss, loss_prev, early_stopped = None, None, False

            # time stamp for saving and logging
            last_saved = last_logged = time.time()

            # epoch loop
            for ep in range(start_ep, opt.max_ep + 1):

                # show progressbar
                if opt.tqdm:
                    iterator = tqdm(range(opt.ep_size), desc='train', ncols=70, unit='b', leave=False)
                else:
                    iterator = range(opt.ep_size)

                # batch loop
                for _ in iterator:

                    # call train function
                    batch_loss = func(sess, opt)

                    # loss history update ( exponential moving average )
                    if batch_loss is not None:
                        if loss is None:
                            loss = np.mean(batch_loss)
                        else:
                            loss = loss * 0.9 + np.mean(batch_loss) * 0.1

                    # saving
                    if time.time() - last_saved > opt.save_interval:
                        last_saved = time.time()
                        saver.save(sess, opt.save_dir + '/model-%03d' % ep,
                                   write_meta_graph=False,
                                   global_step=sess.run(tf.sg_global_step()))

                    # logging
                    if time.time() - last_logged > opt.log_interval:
                        last_logged = time.time()

                        # set session mode to infer
                        tf.sg_set_infer(sess)

                        # run evaluation op
                        if len(opt.eval_metric) > 0:
                            sess.run(opt.eval_metric)

                        if opt.console_log:  # console logging
                            # log epoch information
                            tf.sg_info('\tEpoch[%03d:lr=%7.5f:gs=%d] - loss = %s' %
                                       (ep, sess.run(_learning_rate), sess.run(tf.sg_global_step()),
                                        ('NA' if loss is None else '%8.6f' % loss)))
                        else:  # tensorboard logging
                            # run logging op
                            summary_writer.add_summary(sess.run(summary_op),
                                                       global_step=sess.run(tf.sg_global_step()))

                        # learning rate decay
                        if opt.early_stop and loss_prev:
                            # if loss stalling
                            if loss >= 0.95 * loss_prev:
                                # early stopping
                                current_lr = sess.run(_learning_rate)
                                if current_lr < 5e-6:
                                    early_stopped = True
                                    break
                                else:
                                    # decrease learning rate by half
                                    sess.run(_learning_rate.assign(current_lr / 2.))

                        # update loss history
                        loss_prev = loss

                        # revert session mode to train
                        tf.sg_set_train(sess)

                # log epoch information
                if not opt.console_log:
                    tf.sg_info('\tEpoch[%03d:lr=%7.5f:gs=%d] - loss = %s' %
                               (ep, sess.run(_learning_rate), sess.run(tf.sg_global_step()),
                                ('NA' if loss is None else '%8.6f' % loss)))

                if early_stopped:
                    tf.sg_info('\tEarly stopped ( no loss progress ).')
                    break
    finally:
        # save last epoch
        saver.save(sess, opt.save_dir + '/model-%03d' % ep,
                   write_meta_graph=False,
                   global_step=sess.run(tf.sg_global_step()))

        # set session mode to infer
        tf.sg_set_infer(sess)

        # logging
        tf.sg_info('Training finished at epoch[%d]-step[%d].' % (ep, sess.run(tf.sg_global_step())))

        # close session ( only when it was created here )
        if opt.sess is None:
            sess.close()
def sg_optim(loss, **kwargs):
    r"""Applies gradients to variables.

    Args:
        loss: A 0-D `Tensor` containing the value to minimize.
        kwargs:
          optim: A name for optimizer. 'MaxProp' (default), 'AdaMax', 'Adam', 'DP_GD' or 'sgd'.
            Any name other than the first four selects plain gradient descent.
          lr: A Python Scalar (optional). Learning rate. Default is .001.
          beta1: A Python Scalar (optional). Default is .9.
          beta2: A Python Scalar (optional). Default is .99.
          eps, delta, gaussian_sanitizer, sigma, batches_per_lot: Read only for
            'DP_GD' and handed to `DPGradientDescentOptimizer`. No defaults are
            set here.
          category: A string or string list. Specifies the variables that should be trained (optional).
            Only if the name of a trainable variable starts with `category`, its value is updated.
            Default is '', which means all trainable variables are updated.

    Returns:
        A list holding the gradient update op followed by the `UPDATE_OPS`
        collection entries whose name starts with `category`.
    """
    opt = tf.sg_opt(kwargs)

    # default training options
    opt += tf.sg_opt(optim='MaxProp', lr=0.001, beta1=0.9, beta2=0.99, category='')

    # select optimizer
    if opt.optim == 'MaxProp':
        optim = tf.sg_optimize.MaxPropOptimizer(learning_rate=opt.lr, beta2=opt.beta2)
    elif opt.optim == 'AdaMax':
        optim = tf.sg_optimize.AdaMaxOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif opt.optim == 'Adam':
        optim = tf.train.AdamOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
    elif opt.optim == 'DP_GD':
        optim = tf.sg_optimize.DPGradientDescentOptimizer(
            opt.lr,
            [opt.eps, opt.delta],
            opt.gaussian_sanitizer,
            sigma=opt.sigma,
            batches_per_lot=opt.batches_per_lot)
    else:
        optim = tf.train.GradientDescentOptimizer(learning_rate=opt.lr)

    # get trainable variables
    if isinstance(opt.category, (tuple, list)):
        var_list = []
        for cat in opt.category:
            var_list.extend([t for t in tf.trainable_variables() if t.name.startswith(cat)])
    else:
        var_list = [t for t in tf.trainable_variables() if t.name.startswith(opt.category)]

    # calc gradient
    # The two optimizer families differ only in how gradients are computed;
    # the summary and update steps below are shared. ( The leftover debug
    # prints of `loss` were dropped. )
    if opt.optim == 'DP_GD':
        # only handle 1 batch per lot
        gradient = optim.compute_sanitized_gradients(loss, var_list=var_list)
    else:
        gradient = optim.compute_gradients(loss, var_list=var_list)

    # add summary
    for v, g in zip(var_list, gradient):
        # exclude batch normal statics
        if 'mean' not in v.name and 'variance' not in v.name \
                and 'beta' not in v.name and 'gamma' not in v.name:
            tf.sg_summary_gradient(v, g)

    # gradient update op
    grad_op = optim.apply_gradients(gradient, global_step=tf.sg_global_step())

    # extra update ops within category
    # ( for example, batch normal running stat update )
    if isinstance(opt.category, (tuple, list)):
        update_op = []
        for cat in opt.category:
            update_op.extend([t for t in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if t.name.startswith(cat)])
    else:
        update_op = [t for t in tf.get_collection(tf.GraphKeys.UPDATE_OPS) if t.name.startswith(opt.category)]

    return [grad_op] + update_op
def sg_optim(loss, **kwargs):
    r"""Builds the training op that applies the gradients of `loss`.

    Args:
        loss: A 0-D `Tensor` containing the value to minimize.
        kwargs:
          optim: Optimizer name. 'MaxProp' (default), 'AdaMax' or 'Adam';
            every other name ('sgd' for instance) selects plain gradient descent.
          lr: Learning rate. Default is .001.
          beta1: Default is .9.
          beta2: Default is .99.
          category: A string or a list/tuple of strings. Only trainable
            variables whose name starts with one of them are updated.
            Default is '', which matches all trainable variables.

    Returns:
        The op returned by `apply_gradients()`.
    """
    opt = tf.sg_opt(kwargs)

    # fill in the training defaults
    opt += tf.sg_opt(optim='MaxProp', lr=0.001, beta1=0.9, beta2=0.99, category='')

    def _build_optimizer():
        # instantiate only the optimizer that was asked for
        name = opt.optim
        if name == 'MaxProp':
            return tf.sg_optimize.MaxPropOptimizer(learning_rate=opt.lr, beta2=opt.beta2)
        if name == 'AdaMax':
            return tf.sg_optimize.AdaMaxOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
        if name == 'Adam':
            return tf.train.AdamOptimizer(learning_rate=opt.lr, beta1=opt.beta1, beta2=opt.beta2)
        return tf.train.GradientDescentOptimizer(learning_rate=opt.lr)

    optim = _build_optimizer()

    # variables to train: one pass over the trainable variables per prefix
    if isinstance(opt.category, (tuple, list)):
        prefixes = opt.category
    else:
        prefixes = [opt.category]
    var_list = [t
                for prefix in prefixes
                for t in tf.trainable_variables()
                if t.name.startswith(prefix)]

    # gradients of the loss w.r.t. the selected variables
    gradient = optim.compute_gradients(loss, var_list=var_list)

    # gradient summaries, leaving out batch normalization statistics
    excluded = ('mean', 'variance', 'beta', 'gamma')
    for v, g in zip(var_list, gradient):
        if not any(key in v.name for key in excluded):
            tf.sg_summary_gradient(v, g)

    # gradient update op
    return optim.apply_gradients(gradient, global_step=tf.sg_global_step())
# greedy search policy label = dec.sg_argmax() loss = dec.sg_ce(target=y, mask=True) # run graph for translating with tf.Session() as sess: # init session vars tf.sg_init(sess) # restore parameters saver = tf.train.Saver() saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir)) _step = sess.run(tf.sg_global_step()) for fname in [test_file]: assert (len(test_sources) == len(users) and len(users) == len(ids)) test_targets = test_sources num_its = len(test_sources) // batch_size for t in range(num_its): res_list = [] lock_name = '%s_%s_%d_%d.lck' % (trg_dir, user, _step, t) csv_name = '%s_%s_%d_%d.csv' % (trg_dir, user, _step, t) lock_path = os.path.join(out_dir, trg_dir, lock_name) csv_path = os.path.join(out_dir, trg_dir, csv_name) if not os.path.exists(lock_path):
# get optimizer train_op = sg_optim(opt.loss, optim=opt.optim, lr=lr, beta1=opt.beta1, beta2=opt.beta2, category=opt.category) # checkpoint saver saver = tf.train.Saver(max_to_keep=opt.max_keep, keep_checkpoint_every_n_hours=opt.keep_interval) # create supervisor sv = tf.train.Supervisor(logdir=opt.save_dir, saver=saver, save_model_secs=opt.save_interval, summary_writer=None, save_summaries_secs=opt.log_interval, global_step=tf.sg_global_step(), local_init_op=tf.sg_phase().assign(True)) # training epoch and loss epoch, loss_val = -1, None # training epoch and loss epoch, loss_val = -1, None # create session print "Starting session..." with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
# CTC loss ( disabled; cross entropy with L2 regularization is used instead )
#loss = logit.sg_ctc(target=y, seq_len=seq_len)

# L2 penalty over every trainable variable, scaled by reg_lambda
reg_lambda = 0.0002
trainable = tf.trainable_variables()
lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in trainable]) * reg_lambda

# total loss: cross entropy plus the L2 penalty
loss = logit.sg_ce(target=y, one_hot=True) + lossL2

# train
config = tf.ConfigProto(
    allow_soft_placement=True,
    inter_op_parallelism_threads=6,
    intra_op_parallelism_threads=6,
)
sess = tf.Session(config=config)
tf.sg_init(sess)

# learning rate decayed by a factor of 0.95 per 100 global steps ( not staircased )
learning_rate = tf.train.exponential_decay(
    0.00001,
    tf.sg_global_step(),
    100,
    0.95,
    staircase=False,
)

with tf.name_scope('summaries'):
    tf.summary.scalar('global_step', tf.sg_global_step())
    tf.summary.scalar('real_lr', learning_rate)

# NOTE(review): the original indentation was lost; the training call is
# assumed to sit outside the name scope above - confirm.
tf.sg_train(
    log_interval=30,
    lr=learning_rate,
    loss=loss,
    ep_size=data.num_batch,
    max_ep=8,
    early_stop=False,
    lr_reset=True,
)