def train(loss, init_fn, hparams):
  """Creates a train op for the given loss.

  Args:
    loss: a loss tensor.
    init_fn: A callable to be executed after all other initialization is done.
    hparams: model hyperparameters.

  Returns:
    The train op produced by slim.learning.create_train_op.
  """
  optimizer = create_optimizer(hparams)

  if FLAGS.sync_replicas:
    replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
    optimizer = tf.LegacySyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        replica_id=replica_id,
        total_num_replicas=FLAGS.total_num_replicas)
    sync_optimizer = optimizer
    startup_delay_steps = 0
  else:
    startup_delay_steps = 0
    sync_optimizer = None

  # Note: init_fn, sync_optimizer and startup_delay_steps are unused here;
  # this variant only builds and returns the train op, the caller runs the loop.
  train_op = slim.learning.create_train_op(
      loss,
      optimizer,
      summarize_gradients=True,
      clip_gradient_norm=FLAGS.clip_gradient_norm)
  return train_op
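# NOTE: create_optimizer(hparams) is referenced above but defined elsewhere in
# this file. The sketch below is a hypothetical minimal version; the field
# names `optimizer`, `learning_rate`, and `momentum` on hparams are assumed
# for illustration and may differ from the real helper.
def create_optimizer(hparams):
  """Hypothetical sketch of an optimizer factory keyed on hparams."""
  if hparams.optimizer == 'momentum':
    return tf.train.MomentumOptimizer(
        hparams.learning_rate, momentum=hparams.momentum)
  elif hparams.optimizer == 'adam':
    return tf.train.AdamOptimizer(hparams.learning_rate)
  # Fall back to plain SGD for any other value.
  return tf.train.GradientDescentOptimizer(hparams.learning_rate)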
def train(loss, init_fn, hparams):
  """Wraps slim.learning.train to run a training loop.

  Args:
    loss: a loss tensor.
    init_fn: A callable to be executed after all other initialization is done.
    hparams: model hyperparameters.
  """
  with tf.device("/cpu:0"):
    global_step = slim.get_or_create_global_step()
  with tf.device("/cpu:0"):
    optimizer = create_optimizer(hparams)

  if FLAGS.sync_replicas:
    replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
    optimizer = tf.LegacySyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        replica_id=replica_id,
        total_num_replicas=FLAGS.total_num_replicas)
    sync_optimizer = optimizer
    startup_delay_steps = 0
  else:
    startup_delay_steps = 0
    sync_optimizer = None

  # Built by hand instead of slim.learning.create_train_op(loss, optimizer,
  # summarize_gradients=True, clip_gradient_norm=...) so the gradient
  # computation and clipping can be customized.
  grad = optimizer.compute_gradients(loss)
  clipped_grad = tf.contrib.training.clip_gradient_norms(
      grad, FLAGS.clip_gradient_norm)
  update = optimizer.apply_gradients(clipped_grad, global_step=global_step)
  # Running train_op applies the gradients and returns the loss value.
  with tf.control_dependencies([update]):
    train_op = tf.identity(loss, name='train_op')

  session_config = tf.ConfigProto()
  # Allocate GPU memory on demand instead of grabbing it all up front.
  session_config.gpu_options.allow_growth = True
  with tf.device("/cpu:0"):
    slim.learning.train(
        train_op=train_op,
        logdir=FLAGS.train_log_dir,
        graph=loss.graph,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        number_of_steps=FLAGS.max_number_of_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        startup_delay_steps=startup_delay_steps,
        sync_optimizer=sync_optimizer,
        init_fn=init_fn,
        session_config=session_config)
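# NOTE: the init_fn passed to slim.learning.train above is typically built
# with slim.assign_from_checkpoint_fn. The sketch below is illustrative only;
# FLAGS.checkpoint is a hypothetical flag name, not one defined in this file.
def create_init_fn():
  """Sketch: builds an init_fn that restores variables from a checkpoint."""
  if not FLAGS.checkpoint:
    return None
  variables_to_restore = slim.get_variables_to_restore()
  # Returns a callable taking a session, which slim invokes after all other
  # initialization is done.
  return slim.assign_from_checkpoint_fn(FLAGS.checkpoint,
                                        variables_to_restore)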
def train(loss, init_fn, hparams):
  """Wraps slim.learning.train to run a training loop.

  Args:
    loss: a loss tensor.
    init_fn: A callable to be executed after all other initialization is done.
    hparams: model hyperparameters.
  """
  optimizer = create_optimizer(hparams)

  if FLAGS.sync_replicas:
    replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
    optimizer = tf.LegacySyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        replica_id=replica_id,
        total_num_replicas=FLAGS.total_num_replicas)
    sync_optimizer = optimizer
    startup_delay_steps = 0
  else:
    startup_delay_steps = 0
    sync_optimizer = None

  train_op = slim.learning.create_train_op(
      loss,
      optimizer,
      summarize_gradients=True,
      # 2 selects an experimental gradient aggregation method; see the note
      # after this function.
      aggregation_method=2,
      clip_gradient_norm=FLAGS.clip_gradient_norm)

  gpu_config = tf.ConfigProto()
  # Allocate GPU memory on demand instead of grabbing it all up front.
  gpu_config.gpu_options.allow_growth = True

  slim.learning.train(
      train_op=train_op,
      logdir=FLAGS.train_log_dir,
      graph=loss.graph,
      master=FLAGS.master,
      is_chief=(FLAGS.task == 0),
      number_of_steps=FLAGS.max_number_of_steps,
      save_summaries_secs=FLAGS.save_summaries_secs,
      save_interval_secs=FLAGS.save_interval_secs,
      startup_delay_steps=startup_delay_steps,
      sync_optimizer=sync_optimizer,
      init_fn=init_fn,
      session_config=gpu_config)
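# NOTE: in TF 1.x, aggregation_method=2 corresponds (to the best of my
# knowledge) to tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N. Spelling out
# the constant avoids the magic number; this is an equivalent form of the
# call above, not a change in behavior.
train_op = slim.learning.create_train_op(
    loss,
    optimizer,
    summarize_gradients=True,
    aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N,
    clip_gradient_norm=FLAGS.clip_gradient_norm)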
def train(loss, init_fn, CharacterAccuracy, SequenceAccuracy, hparams):
  """Wraps slim.learning.train to run a training loop.

  Args:
    loss: a loss tensor.
    init_fn: A callable to be executed after all other initialization is done.
    CharacterAccuracy: a tensor holding the per-character accuracy metric.
    SequenceAccuracy: a tensor holding the whole-sequence accuracy metric.
    hparams: model hyperparameters.
  """
  optimizer = create_optimizer(hparams)

  if FLAGS.sync_replicas:
    replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
    optimizer = tf.LegacySyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        replica_id=replica_id,
        total_num_replicas=FLAGS.total_num_replicas)
    sync_optimizer = optimizer
    startup_delay_steps = 0
  else:
    startup_delay_steps = 0
    sync_optimizer = None

  train_op = slim.learning.create_train_op(
      loss,
      optimizer,
      summarize_gradients=True,
      clip_gradient_norm=FLAGS.clip_gradient_norm)

  # The metric tensors are passed through train_step_kwargs so the custom
  # train_step (sketched below) can fetch them alongside the loss.
  slim.learning.train(
      train_op=train_op,
      train_step_fn=train_step,
      train_step_kwargs={
          'CharacterAccuracy': CharacterAccuracy,
          'SequenceAccuracy': SequenceAccuracy,
      },
      logdir=FLAGS.train_log_dir,
      graph=loss.graph,
      master=FLAGS.master,
      is_chief=(FLAGS.task == 0),
      number_of_steps=FLAGS.max_number_of_steps,
      save_summaries_secs=FLAGS.save_summaries_secs,
      save_interval_secs=FLAGS.save_interval_secs,
      startup_delay_steps=startup_delay_steps,
      sync_optimizer=sync_optimizer,
      init_fn=init_fn)
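# NOTE: train_step is referenced above but defined elsewhere in this file.
# slim.learning.train calls train_step_fn(sess, train_op, global_step,
# train_step_kwargs) and expects a (total_loss, should_stop) pair back, so a
# minimal sketch of the expected shape, assuming it just logs the two
# accuracy tensors alongside the loss, looks like this:
def train_step(sess, train_op, global_step, train_step_kwargs):
  """Sketch of a custom slim train step that also reports accuracy metrics."""
  total_loss, char_acc, seq_acc, step = sess.run([
      train_op,
      train_step_kwargs['CharacterAccuracy'],
      train_step_kwargs['SequenceAccuracy'],
      global_step,
  ])
  tf.logging.info('step %d: loss=%f char_acc=%f seq_acc=%f',
                  step, total_loss, char_acc, seq_acc)
  # Returning should_stop=False lets number_of_steps control termination.
  return total_loss, False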
def train_multigpu(losses, init_fn, hparams):
  """Wraps slim.learning.train to run a multi-GPU training loop.

  Args:
    losses: a list of (loss, clone_index) pairs, one per GPU clone.
    init_fn: A callable to be executed after all other initialization is done.
    hparams: model hyperparameters.
  """
  with tf.device("/cpu:0"):
    global_step = slim.create_global_step()
  with tf.device("/cpu:0"):
    optimizer = create_optimizer(hparams)

  if FLAGS.sync_replicas:
    replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
    optimizer = tf.LegacySyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        replica_id=replica_id,
        total_num_replicas=FLAGS.total_num_replicas)
    sync_optimizer = optimizer
    startup_delay_steps = 0
  else:
    startup_delay_steps = 0
    sync_optimizer = None

  # Gradients are computed per clone and summed below instead of using
  # slim.learning.create_train_op.

  # Gather update_ops from the first clone. These contain, for example,
  # the updates for the batch_norm variables created by network_fn.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, "clone_0")

  grads = []
  total_loss = []
  for loss, i in losses:
    with tf.device("/gpu:{0}".format(i)):
      # Scale each clone's loss so the sum matches a single-tower loss.
      scaled_loss = tf.div(loss, 1.0 * FLAGS.num_clones)
      if i == 0:
        # Regularization losses are shared, so add them on the first clone only.
        regularization_loss = tf.add_n(
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        scaled_loss = scaled_loss + regularization_loss
      total_loss.append(scaled_loss)
      grad = optimizer.compute_gradients(scaled_loss)
      grads.append(grad)

  total_loss = tf.add_n(total_loss)
  with tf.device("/cpu:0"):
    tf.summary.scalar('Total_Loss', total_loss)
    sum_grad = _sum_clones_gradients(grads)
    clipped_grad = tf.contrib.training.clip_gradient_norms(
        sum_grad, FLAGS.clip_gradient_norm)
    update = optimizer.apply_gradients(clipped_grad, global_step=global_step)
    update_ops.append(update)
  with tf.control_dependencies([tf.group(*update_ops)]):
    train_op = tf.identity(total_loss, name='train_op')

  session_config = tf.ConfigProto()
  session_config.gpu_options.allow_growth = True
  # session_config.log_device_placement = True  # uncomment to debug placement
  with tf.device("/cpu:0"):
    slim.learning.train(
        train_op=train_op,
        logdir=FLAGS.train_log_dir,
        graph=total_loss.graph,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        number_of_steps=FLAGS.max_number_of_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        trace_every_n_steps=1000,
        save_interval_secs=FLAGS.save_interval_secs,
        startup_delay_steps=startup_delay_steps,
        sync_optimizer=sync_optimizer,
        init_fn=init_fn,
        session_config=session_config)
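# NOTE: _sum_clones_gradients is referenced above but not shown here. A
# minimal sketch of the usual implementation (mirroring the helper of the
# same name in slim's model_deploy), which sums each variable's gradient
# across clones:
def _sum_clones_gradients(clone_grads):
  """Sketch: sums each variable's gradients across clones.

  Args:
    clone_grads: one (gradient, variable) list per clone.

  Returns:
    A single (gradient, variable) list with gradients summed over clones.
  """
  sum_grads = []
  for grad_and_vars in zip(*clone_grads):
    # Each grad_and_vars pairs the same variable with its gradient from
    # every clone: ((grad_clone0, v), (grad_clone1, v), ...).
    grads = [g for g, _ in grad_and_vars if g is not None]
    var = grad_and_vars[0][1]
    if grads:
      summed = tf.add_n(grads) if len(grads) > 1 else grads[0]
      sum_grads.append((summed, var))
  return sum_grads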
def train(hparams):
  """Runs the training loop."""
  data_iterator, clause_metadata = load_data(random_start=True)
  with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
    # The following three lines prevent hangs during distributed training.
    vs = tf.get_variable_scope()
    if vs.caching_device is None:
      vs.set_caching_device(lambda op: op.device)

    # Build the graph.
    global_step = slim.variables.get_or_create_global_step()
    if FLAGS.model_type == 'tree':
      m = cnf_model.CNFTreeModel(data_iterator, hparams, clause_metadata)
    else:
      m = cnf_model.CNFSequenceModel(data_iterator, hparams, clause_metadata)
    variables = tf.trainable_variables()

    learning_rate = tf.train.exponential_decay(
        hparams.learning_rate,
        global_step,
        hparams.decay_steps,
        hparams.learning_rate_decay_factor,
        staircase=True)

    if hparams.optimizer == 'sgd':
      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    elif hparams.optimizer == 'adam':
      optimizer = tf.train.AdamOptimizer(learning_rate)
    elif hparams.optimizer == 'rmsprop':
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate=learning_rate, decay=0.9, momentum=0.9, epsilon=1e-5)
    else:
      raise RuntimeError('Unknown optimizer %s' % hparams.optimizer)

    if FLAGS.master not in ('', 'local') and FLAGS.sync_replicas:
      replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
      optimizer = tf.LegacySyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          replica_id=replica_id,
          total_num_replicas=FLAGS.worker_replicas)

    tf.contrib.deprecated.scalar_summary('lr', learning_rate)
    tf.contrib.deprecated.scalar_summary('loss', m.loss)
    for metric_name, metric_value in m.metrics.items():
      tf.contrib.deprecated.scalar_summary('metric/' + metric_name,
                                           metric_value)

    grads_and_vars = optimizer.compute_gradients(m.loss, variables)
    if hparams.grad_max_norm > 0:
      # Clip by the global norm across all gradients, not per-gradient.
      g, v = zip(*grads_and_vars)
      g, global_norm = tf.clip_by_global_norm(g, hparams.grad_max_norm)
      tf.contrib.deprecated.scalar_summary('global_norm', global_norm)
      grads_and_vars = zip(g, v)

    train_op = optimizer.apply_gradients(grads_and_vars, global_step)
    summary_op = tf.summary.merge_all()

    if FLAGS.master not in ('', 'local') and FLAGS.sync_replicas:
      init_token_op = optimizer.get_init_tokens_op()
      chief_queue_runner = optimizer.get_chief_queue_runner()

    saver = tf.train.Saver(keep_checkpoint_every_n_hours=1.0)
    supervisor = tf.train.Supervisor(
        is_chief=(FLAGS.task == 0),
        logdir=FLAGS.tf_log_dir,
        global_step=global_step,
        saver=saver,
        # We are going to compute summaries ourselves.
        summary_op=None,
        save_model_secs=FLAGS.save_model_secs,
        # But we set this so that this computes global_step/sec.
        save_summaries_secs=FLAGS.save_summaries_secs)
    sess = supervisor.prepare_or_wait_for_session(FLAGS.master)

    # TODO(ricshin):
    # Rewrite this to use supervisor.managed_session().
    # Look at how slim/learning.py handles SyncReplicas, in particular
    # init_token_op. Use normal text summaries once they exist.
    # Use supervisor.should_stop().
    if FLAGS.task == 0:
      if FLAGS.master not in ('', 'local') and FLAGS.sync_replicas:
        supervisor.start_queue_runners(sess, [chief_queue_runner])
        sess.run(init_token_op)

      sampling_temps = [float(x) for x in FLAGS.sampling_temps.split(',')]

      def summarize():
        try:
          summary_strs, global_step_val = sess.run([summary_op, global_step])
          summaries = tf.Summary.FromString(summary_strs)
          # Attach sampled formulas at each temperature as text summaries.
          for i, temp in itertools.product(
              xrange(FLAGS.num_summary_samples), sampling_temps):
            cnf = textwrap.wrap(cnf_utils.unparse_cnf(m.sample(sess)))
            summaries.value.add(
                tag='formula_temp%g_%d' % (temp, i),
                tensor=make_tensor_proto('\n'.join(cnf)))
          supervisor.summary_writer.add_summary(summaries.SerializeToString(),
                                                global_step_val)
          status_str = ', '.join('%s=%f' % (value.tag, value.simple_value)
                                 for value in summaries.value
                                 if value.HasField('simple_value'))
          tf.logging.info('step=%d: %s', global_step_val, status_str)
        except:
          # The supervisor eats the backtrace, so print it here.
          traceback.print_exc()
          raise

      supervisor.loop(FLAGS.save_summaries_secs, summarize)

    # Run the trainer.
    for unused_i in xrange(hparams.max_steps):
      sess.run(train_op)
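# NOTE: for reference, the staircase exponential decay used above lowers the
# learning rate in discrete steps rather than continuously. A minimal Python
# sketch of the schedule tf.train.exponential_decay computes when
# staircase=True:
def decayed_learning_rate(initial_rate, decay_factor, decay_steps, step):
  """Sketch of tf.train.exponential_decay semantics with staircase=True."""
  # The rate is multiplied by decay_factor once every decay_steps steps:
  # initial_rate * decay_factor ** floor(step / decay_steps).
  return initial_rate * decay_factor ** (step // decay_steps)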