def accumulate_privacy_spending(self, eps_delta, unused_sigma, num_examples):
    """Accumulate the privacy spending of one noisy query on a sampled batch.

    Only approximate (eps, delta) privacy is supported. The query is assumed
    to use Gaussian noise on a randomly sampled batch, which gives a better
    composition:
      1. the per-batch privacy is computed with the privacy amplification
         via sampling bound;
      2. the composition is done using the composition with Gaussian noise.
    TODO(liqzhang) Add a link to a document that describes the bounds used.

    Args:
      eps_delta: EpsDelta pair which can be tensors.
      unused_sigma: the noise sigma. Unused for this accountant.
      num_examples: the number of examples involved.

    Returns:
      a TensorFlow operation for updating the privacy spending.
    """
    eps, delta = eps_delta
    delta_is_positive = tf.Assert(
        tf.greater(delta, 0), ["delta needs to be greater than 0"])
    with tf.control_dependencies([delta_is_positive]):
        # Fraction of the total pool that took part in this query.
        sampling_ratio = (
            tf.cast(num_examples, tf.float32) * 1.0 / self._total_examples)
        # Use privacy amplification via sampling bound.
        # See Lemma 2.2 in http://arxiv.org/pdf/1405.7085v2.pdf
        # TODO(liqzhang) Add a link to a document with formal statement
        # and proof.
        amplified_eps = tf.log(1.0 + sampling_ratio * (tf.exp(eps) - 1.0))
        amortize_eps = tf.reshape(amplified_eps, [1])
        amortize_delta = tf.reshape(sampling_ratio * delta, [1])
        eps_update = tf.assign_add(
            self._eps_squared_sum, tf.square(amortize_eps))
        delta_update = tf.assign_add(self._delta_sum, amortize_delta)
        return tf.group(eps_update, delta_update)
def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.):
    """Build the ops that fold new statistics into their stats variables.

    Args:
      statsUpdates: mapping from a stats variable to the newly computed
        value that should be merged into it.
      accumulate: if True, add `accumulateCoeff * new` to the variable
        (simple superbatch averaging); otherwise apply an exponential
        running average with decay `self._stats_decay`.
      accumulateCoeff: weight of the new value when `accumulate` is True.

    Returns:
      A one-element list holding the op that increments `self.stats_step`
      after all stats updates have run.
    """
    updateOps = []
    for stats_var, stats_new in statsUpdates.items():
        if accumulate:
            # simple superbatch averaging
            update_op = tf.assign_add(
                stats_var, accumulateCoeff * stats_new, use_locking=True)
        else:
            # exponential running averaging: decay the old value first, then
            # add the weighted new value on top of the decayed variable.
            decayed = tf.assign(
                stats_var, stats_var * self._stats_decay, use_locking=True)
            update_op = tf.assign_add(
                decayed, (1. - self._stats_decay) * stats_new,
                use_locking=True)
        updateOps.append(update_op)

    # The step counter only advances once every stats update has been applied.
    with tf.control_dependencies(updateOps):
        stats_step_op = tf.assign_add(self.stats_step, 1)

    if KFAC_DEBUG:
        debug_fields = [
            tf.convert_to_tensor('step:'), self.global_step,
            tf.convert_to_tensor('fac step:'), self.factor_step,
            tf.convert_to_tensor('sgd step:'), self.sgd_step,
            tf.convert_to_tensor('Accum:'), tf.convert_to_tensor(accumulate),
            tf.convert_to_tensor('Accum coeff:'),
            tf.convert_to_tensor(accumulateCoeff),
            tf.convert_to_tensor('stat step:'), self.stats_step,
        ]
        # Show at most the first two update ops; slicing avoids an IndexError
        # when fewer than two stats variables are being updated.
        debug_fields.extend(updateOps[:2])
        stats_step_op = tf.Print(stats_step_op, debug_fields)
    return [stats_step_op, ]
def evaluate_precision_recall(
        input_layer, labels, threshold=0.5, per_example_weights=None,
        name=PROVIDED, phase=Phase.train):
    """Computes the precision and recall of the prediction vs the labels.

    Args:
      input_layer: A Pretty Tensor object.
      labels: The target labels to learn as a float tensor.
      threshold: The threshold to use to decide if the prediction is true.
      per_example_weights: A Tensor with a weight per example.
      name: An optional name.
      phase: The phase of this model; non training phases compute a total
        across all examples.

    Returns:
      Precision and Recall. Either one is zero when its denominator is zero.
    """
    _ = name  # Eliminate warning, name used for namescoping by PT.
    selected, sum_retrieved, sum_relevant = _compute_precision_recall(
        input_layer, labels, threshold, per_example_weights)

    if phase != Phase.train:
        def _running_total(variable_name):
            # Scalar float32 accumulator kept in the test-variables collection.
            return tf.get_variable(
                variable_name,
                [],
                tf.float32,
                tf.zeros_initializer,
                collections=[bookkeeper.GraphKeys.TEST_VARIABLES],
                trainable=False)

        # Create the variables in all cases so that the load logic is easier.
        relevant_count = _running_total("relevant_count")
        retrieved_count = _running_total("retrieved_count")
        selected_count = _running_total("selected_count")

        # Outside training, report running totals instead of per-batch values.
        with input_layer.g.device(selected_count.device):
            selected = tf.assign_add(selected_count, selected)
        with input_layer.g.device(retrieved_count.device):
            sum_retrieved = tf.assign_add(retrieved_count, sum_retrieved)
        with input_layer.g.device(relevant_count.device):
            sum_relevant = tf.assign_add(relevant_count, sum_relevant)

    def _ratio_or_zero(denominator):
        # Guard against 0/0 by substituting zero where the denominator is zero.
        return tf.select(
            tf.equal(denominator, 0),
            tf.zeros_like(selected),
            selected / denominator)

    precision = _ratio_or_zero(sum_retrieved)
    recall = _ratio_or_zero(sum_relevant)
    return precision, recall
def test_summary_saver(self):
    """SummarySaverHook(save_steps=8) records summaries at steps 1, 9, 17, 25."""
    with tf.Graph().as_default() as graph, tf.Session() as sess:
        log_dir = 'log/dir'
        summary_writer = testing.FakeSummaryWriter(log_dir, graph)

        # The summarized tensor is an assign_add, so each recorded value
        # counts how many times the summary op has been evaluated.
        counter = tf.Variable(0.0)
        bumped = tf.assign_add(counter, 1.0)
        summary_op = tf.scalar_summary('my_summary', bumped)

        global_step = tf.contrib.framework.get_or_create_global_step()
        train_op = tf.assign_add(global_step, 1)

        hook = tf.train.SummarySaverHook(
            summary_op=summary_op,
            save_steps=8,
            summary_writer=summary_writer)
        hook.begin()
        sess.run(tf.initialize_all_variables())

        mon_sess = monitored_session._HookedSession(sess, [hook])
        for _ in range(30):
            mon_sess.run(train_op)
        hook.end(sess)

        expected_summaries = {
            1: {'my_summary': 1.0},
            9: {'my_summary': 2.0},
            17: {'my_summary': 3.0},
            25: {'my_summary': 4.0},
        }
        summary_writer.assert_summaries(
            test_case=self,
            expected_logdir=log_dir,
            expected_graph=graph,
            expected_summaries=expected_summaries)
def apply(self, var_list):
    """Applies the running average to a list of variables.

    For every variable a shadow variable (initialized to the variable's
    initial value) and an update counter are created under the
    'running_average' variable scope. Both are remembered in
    `self.shadow_vars[var]` as a `(shadow, count)` pair.

    Args:
      var_list: the variables to keep a running average of.

    Returns:
      A list with one update op per variable, in the order of `var_list`.
      Each op increments the counter and moves the shadow by
      `(var - shadow) / count`. The ops are not grouped; callers group or
      run them as needed.
    """
    update_ops = []
    with tf.variable_scope('running_average'):
        for var in var_list:
            # add a shadow var that gets initialized to the same value
            # and a count to keep track of how many times it's been updated
            name = var.op.name
            count = tf.get_variable(
                name + '_count',
                dtype=tf.float32,
                initializer=tf.constant_initializer(0.0),
                shape=[],
                trainable=False)
            shadow = tf.get_variable(
                name + '_shadow',
                dtype=var.dtype,
                initializer=var.initialized_value(),
                collections=[tf.GraphKeys.MOVING_AVERAGE_VARIABLES,
                             tf.GraphKeys.VARIABLES],
                trainable=False)

            # now make the update ops
            # increase the count
            new_count = tf.assign_add(count, 1.0)
            with tf.control_dependencies([new_count]):
                # Divide by the value produced by the increment rather than
                # by a separate read of `count`: a plain read of the variable
                # is not guaranteed to observe the increment, and the counter
                # starts at zero, so the first update could divide by zero.
                difference = (var - shadow) / new_count
                update = tf.assign_add(shadow, difference)
            update_ops.append(update)
            self.shadow_vars[var] = (shadow, count)
    return update_ops
def test_train_skip_train_if_max_step_already_saved(self):
    """A second run with the same max_steps leaves the saved step at 10."""
    # The first pass trains up to max_steps=10; the second pass repeats the
    # identical call in a fresh graph against the same output directory and
    # must still find global step 10 in the checkpoint.
    for _ in range(2):
        with tf.Graph().as_default() as g, self.test_session(g):
            with tf.control_dependencies(self._build_inference_graph()):
                train_op = tf.assign_add(
                    tf.contrib.framework.get_global_step(), 1)
            learn.graph_actions._monitored_train(  # pylint: disable=protected-access
                g,
                output_dir=self._output_dir,
                train_op=train_op,
                loss_op=tf.constant(2.0),
                max_steps=10)
            global_step_name = tf.contrib.framework.get_global_step().name
            step = checkpoints.load_variable(
                self._output_dir, global_step_name)
            self.assertEqual(10, step)
def _eval_metric(input_, topk, correct_predictions, examples, phase):
    """Creates the standard tracking variables if in test and returns accuracy.

    In the test and infer phases two running totals ("count_<topk>" and
    "correct_<topk>") are created and the returned tensors become the
    accumulated totals. In any other phase the inputs are returned unchanged
    with an empty parameter dict.

    Returns:
      A `(correct_predictions, examples, my_parameters)` tuple.
    """
    if phase not in (Phase.test, Phase.infer):
        return correct_predictions, examples, {}

    def _test_total(variable_name):
        # Create the variables using tf.Variable because we don't want to
        # share.
        return tf.Variable(
            tf.constant(0, dtype=tf.float32),
            name=variable_name,
            collections=[bookkeeper.GraphKeys.TEST_VARIABLES],
            trainable=False)

    count = _test_total("count_%d" % topk)
    correct = _test_total("correct_%d" % topk)
    my_parameters = {"count": count, "correct": correct}

    # Place each accumulation on the device of the variable it updates.
    with input_.g.device(count.device):
        examples = tf.assign_add(count, examples)
    with input_.g.device(correct.device):
        correct_predictions = tf.assign_add(correct, correct_predictions)
    return correct_predictions, examples, my_parameters
def running_mean(cost, tag_name, batch_size=1):
    """Build ops that track the running mean of `cost` and summarize it.

    Two local, non-trainable variables are kept: a float64 sum of the costs
    and an int32 counter that grows by `batch_size` on every update.

    Args:
      cost: value added to the running sum on every update (cast to float64).
      tag_name: used for the scopes and as the summary tag.
      batch_size: amount added to the counter on every update.

    Returns:
      A `(reset_op, update_op, summary)` tuple: an op that zeroes both
      variables, an op that adds one more cost, and a scalar summary of
      sum / counter.
    """
    with tf.name_scope("running_mean_" + tag_name):
        with tf.variable_scope(tag_name):
            local_collection = [tf.GraphKeys.LOCAL_VARIABLES]
            cost_sum = tf.get_variable(
                "cost_sum",
                initializer=tf.zeros_initializer,
                dtype=tf.float64,
                shape=(),
                collections=local_collection,
                trainable=False)
            batches = tf.get_variable(
                "cost_num_batches",
                initializer=tf.zeros_initializer,
                dtype=tf.int32,
                shape=(),
                collections=local_collection,
                trainable=False)

        # Update: add the new cost and advance the counter together.
        add_cost = tf.assign_add(cost_sum, tf.cast(cost, dtype=tf.float64))
        add_batches = tf.assign_add(batches, batch_size)
        update_cost_mean = tf.group(add_cost, add_batches)

        # Reset: put both accumulators back to zero.
        zero_batches = tf.assign(batches, 0)
        zero_cost_sum = tf.assign(cost_sum, 0.0)
        reset_cost_mean = tf.group(zero_batches, zero_cost_sum)

        denominator = tf.cast(batches, dtype=tf.float64)
        mean_cost = tf.divide(cost_sum, denominator)
        train_loss_summary = tf.summary.scalar(tag_name, mean_cost)
    return reset_cost_mean, update_cost_mean, train_loss_summary
def loop_body(i):
    """Increment var_a, then add var_a to var_b, then return i + 1.

    Control dependencies chain the three steps so they run in that order.
    """
    bump_a = tf.assign_add(var_a, 1, name="a_add")
    with tf.control_dependencies([bump_a]):
        bump_b = tf.assign_add(var_b, var_a, name="b_add")
        with tf.control_dependencies([bump_b]):
            next_i = tf.add(i, 1, name="i_add")
    return next_i
def __init__(self, epsilon=1e-2, shape=()):
    """Create running sum / sum-of-squares / count variables and derived stats.

    Args:
      epsilon: initial value of the sum-of-squares and count variables.
      shape: shape of the tracked quantity (the count is always a scalar).

    Sets `self.mean` and `self.std` (float tensors derived from the
    variables; the variance under the square root is floored at 1e-2) and
    `self.incfiltparams`, a function that adds given increments to the sum,
    the sum of squares and the count.
    """
    def _float64_variable(name, var_shape, initial_value):
        # All accumulators are non-trainable float64 variables.
        return tf.get_variable(
            dtype=tf.float64,
            shape=var_shape,
            initializer=tf.constant_initializer(initial_value),
            name=name,
            trainable=False)

    self._sum = _float64_variable("runningsum", shape, 0.0)
    self._sumsq = _float64_variable("runningsumsq", shape, epsilon)
    self._count = _float64_variable("count", (), epsilon)
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    mean_of_squares = tf.to_float(self._sumsq / self._count)
    variance = mean_of_squares - tf.square(self.mean)
    self.std = tf.sqrt(tf.maximum(variance, 1e-2))

    # Placeholders carrying the increments fed to `incfiltparams`.
    newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    increments = [
        tf.assign_add(self._sum, newsum),
        tf.assign_add(self._sumsq, newsumsq),
        tf.assign_add(self._count, newcount),
    ]
    self.incfiltparams = U.function(
        [newsum, newsumsq, newcount], [], updates=increments)
def test_capture_variable(self):
    """CaptureVariable(every_n=8, first_n=2) records the expected steps."""
    monitor = learn.monitors.CaptureVariable(
        var_name="my_assign_add:0", every_n=8, first_n=2)
    with tf.Graph().as_default() as graph, self.test_session(graph):
        counter = tf.Variable(0.0, name="my_var")
        counter.initializer.run()
        # The monitor captures the output of this op by name, so every
        # capture also increments the variable.
        tf.assign_add(counter, 1.0, name="my_assign_add")
        self._run_monitor(monitor, num_epochs=3, num_steps_per_epoch=10)
        expected_values = {
            0: 1.0,
            1: 2.0,
            2: 3.0,
            10: 4.0,
            18: 5.0,
            26: 6.0,
            29: 7.0,
        }
        self.assertEqual(expected_values, monitor.values)
def loss(loss_value):
    """Calculates aggregated mean loss.

    Returns:
      A pair `(update_ops, mean_op)`: the two ops that add `loss_value` to
      the running total and bump the counter, and a tensor equal to
      total / count.
    """
    # Non-trainable accumulators: running total and number of updates.
    running_total = tf.Variable(0.0, trainable=False)
    num_updates = tf.Variable(0, trainable=False)

    add_to_total = tf.assign_add(running_total, loss_value)
    bump_count = tf.assign_add(num_updates, 1)

    mean_loss = running_total / tf.cast(num_updates, tf.float32)
    return [add_to_total, bump_count], mean_loss
def train_one_epoch(generator, discriminator, generator_optimizer,
                    discriminator_optimizer, dataset, log_interval,
                    noise_dim):
    """Trains `generator` and `discriminator` models on `dataset`.

    Args:
      generator: Generator model.
      discriminator: Discriminator model.
      generator_optimizer: Optimizer to use for generator.
      discriminator_optimizer: Optimizer to use for discriminator.
      dataset: Dataset of images to train on.
      log_interval: How many global steps to wait between logging and
        collecting summaries.
      noise_dim: Dimension of noise vector to use.
    """
    total_generator_loss = 0.0
    total_discriminator_loss = 0.0
    for (batch_index, images) in enumerate(tfe.Iterator(dataset)):
        with tf.device('/cpu:0'):
            tf.assign_add(tf.train.get_global_step(), 1)

        with tf.contrib.summary.record_summaries_every_n_global_steps(
                log_interval):
            current_batch_size = images.shape[0]
            noise = tf.random_uniform(
                shape=[current_batch_size, noise_dim],
                minval=-1.,
                maxval=1.,
                seed=batch_index)

            # The tape is persistent because two separate gradients (one per
            # model) are taken from it below.
            with tfe.GradientTape(persistent=True) as g:
                generated_images = generator(noise)
                tf.contrib.summary.image(
                    'generated_images',
                    tf.reshape(generated_images, [-1, 28, 28, 1]),
                    max_images=10)

                discriminator_gen_outputs = discriminator(generated_images)
                discriminator_real_outputs = discriminator(images)
                discriminator_loss_val = discriminator_loss(
                    discriminator_real_outputs, discriminator_gen_outputs)
                total_discriminator_loss += discriminator_loss_val

                generator_loss_val = generator_loss(discriminator_gen_outputs)
                total_generator_loss += generator_loss_val

            generator_grad = g.gradient(
                generator_loss_val, generator.variables)
            discriminator_grad = g.gradient(
                discriminator_loss_val, discriminator.variables)

            with tf.variable_scope('generator'):
                generator_optimizer.apply_gradients(
                    zip(generator_grad, generator.variables))
            with tf.variable_scope('discriminator'):
                discriminator_optimizer.apply_gradients(
                    zip(discriminator_grad, discriminator.variables))

            if (log_interval and batch_index > 0
                    and batch_index % log_interval == 0):
                # batch_index is zero-based, so the totals already contain
                # batch_index + 1 losses; divide by that to get a true
                # average.
                batches_seen = batch_index + 1
                print('Batch #%d\tAverage Generator Loss: %.6f\t'
                      'Average Discriminator Loss: %.6f' % (
                          batch_index,
                          total_generator_loss / batches_seen,
                          total_discriminator_loss / batches_seen))
def setUp(self):
    """Create the fake summary writer, summary op and train op for the tests."""
    tf.test.TestCase.setUp(self)

    self.log_dir = 'log/dir'
    self.summary_writer = testing.FakeSummaryWriter(self.log_dir)

    # The summarized tensor increments a variable each time it is evaluated.
    counter = tf.Variable(0.0)
    bumped = tf.assign_add(counter, 1.0)
    self.summary_op = tf.summary.scalar('my_summary', bumped)

    step = tf.contrib.framework.get_or_create_global_step()
    self.train_op = tf.assign_add(step, 1)
def accuracy(logits, labels):
    """Calculates aggregated accuracy.

    Returns:
      A pair `(update_ops, accuracy_op)`: the ops that add this batch's
      correct / incorrect top-1 counts to two running counters, and a tensor
      equal to correct / (correct + incorrect) over the counters.
    """
    hits = tf.nn.in_top_k(logits, labels, 1)
    num_hits = tf.reduce_sum(tf.cast(hits, tf.int32))
    num_misses = tf.reduce_sum(tf.cast(tf.logical_not(hits), tf.int32))

    # Non-trainable running counters.
    correct_count = tf.Variable(0, trainable=False)
    incorrect_count = tf.Variable(0, trainable=False)
    add_hits = tf.assign_add(correct_count, num_hits)
    add_misses = tf.assign_add(incorrect_count, num_misses)

    numerator = tf.cast(correct_count, tf.float32)
    denominator = tf.cast(correct_count + incorrect_count, tf.float32)
    accuracy_op = numerator / denominator
    return [add_hits, add_misses], accuracy_op
def advance_counters(self, total):
    """Returns ops to advance the per-component step and total counters.

    Args:
      total: Total number of actions to increment counters by.

    Returns:
      tf.Group op incrementing 'step' by 1 and 'total' by total.
    """
    increments = (
        tf.assign_add(self._total, total, use_locking=True),
        tf.assign_add(self._step, 1, use_locking=True),
    )
    return tf.group(*increments)
def test_train_loss(self):
    """Six steps of a loss that drops by 1 from 10 report a final loss of 4."""
    with tf.Graph().as_default() as g, self.test_session(g):
        tf.contrib.framework.create_global_step()
        loss_var = tf.contrib.framework.local_variable(10.0)

        # One training step bumps the global step and lowers the loss by 1.
        bump_step = tf.assign_add(tf.contrib.framework.get_global_step(), 1)
        lower_loss = tf.assign_add(loss_var, -1.0)
        train_op = tf.group(bump_step, lower_loss)

        self._assert_summaries(self._output_dir)
        final_loss = learn.graph_actions.train(
            g,
            output_dir=self._output_dir,
            train_op=train_op,
            loss_op=loss_var.value(),
            steps=6)
        self.assertEqual(4.0, final_loss)
        self._assert_summaries(self._output_dir, expected_graphs=[g])
def session_run_job():
    """Run the grouped increment once through the TensorBoard debug wrapper.

    The result of the run is appended to `session_run_results`.
    """
    with tf.Session() as raw_sess:
        a = tf.Variable(10, dtype=tf.int32, name='a')
        b = tf.Variable(20, dtype=tf.int32, name='b')
        d = tf.constant(1, dtype=tf.int32, name='d')
        inc_a = tf.assign_add(a, d, name='inc_a')
        inc_b = tf.assign_add(b, d, name='inc_b')
        inc_ab = tf.group([inc_a, inc_b], name="inc_ab")

        # Variables are initialized on the plain session; only the increment
        # goes through the debug wrapper.
        raw_sess.run(tf.global_variables_initializer())
        debug_sess = tf_debug.TensorBoardDebugWrapperSession(
            raw_sess, self._debugger_url)
        result = debug_sess.run(inc_ab)
        session_run_results.append(result)
def testPlateauOpHook(self):
    """PlateauOpHook runs its op only once the watched metric stops rising."""
    global_step = tf.train.create_global_step()
    counter = tf.get_variable("count", initializer=0, dtype=tf.int32)
    indicator = tf.get_variable("indicator", initializer=0, dtype=tf.int32)
    tf.summary.scalar("count", counter)
    incr_global_step = tf.assign_add(global_step, 1)
    incr_counter = tf.assign_add(counter, 1)
    incr_indicator = tf.assign_add(indicator, 1)
    step_and_count = (incr_global_step, incr_counter)

    # Stop if the global step has not gone up by more than 1 in 20 steps.
    ckpt_dir = self.ckpt_dir("plateauop")
    stop_hook = metrics_hook.PlateauOpHook(
        ckpt_dir,
        "count_1",
        incr_indicator,
        num_plateau_steps=20,
        plateau_delta=1.,
        plateau_decrease=False,
        every_n_steps=10)

    with self.sess(stop_hook, ckpt_dir) as sess:
        for _ in range(20):
            sess.run(step_and_count)

        # Summary files should now have 2 values in them
        self.flush()

        # Run for more steps so that the hook gets triggered and we verify
        # that we don't stop.
        for _ in range(30):
            sess.run(step_and_count)
        self.flush()

        # Run without incrementing the counter
        for _ in range(30):
            sess.run(incr_global_step)
        self.flush()

        self.assertLess(sess.run(indicator), 1)

        # Metrics should be written such that now the counter has gone >20
        # steps without being incremented.
        # Check that we run the incr_indicator op several times
        for _ in range(3):
            for _ in range(10):
                sess.run(incr_global_step)
            self.flush()

        self.assertGreater(sess.run(indicator), 1)
def setUp(self):
    """Create a temp model dir and a graph holding scaffold, step and train op."""
    self.model_dir = tempfile.mkdtemp()

    graph = tf.Graph()
    self.graph = graph
    with graph.as_default():
        self.scaffold = monitored_session.Scaffold()
        step = tf.contrib.framework.get_or_create_global_step()
        self.global_step = step
        # A "training" step for these tests just bumps the global step.
        self.train_op = tf.assign_add(step, 1)
def test_recover_and_retry_on_aborted_error(self):
    """The session silently restores and retries after an AbortedError."""
    # Tests that we silently retry and recover on abort. This test uses
    # a CheckpointSaver to have something to recover from.
    logdir = self._test_dir('test_recover_and_retry_on_aborted_error')
    with tf.Graph().as_default():
        gstep = tf.contrib.framework.get_or_create_global_step()
        do_step = tf.assign_add(gstep, 1)
        scaffold = supervised_session.Scaffold()
        abort_error = tf.errors.AbortedError(None, None, 'Abort')
        abort_monitor = RaiseOnceAtStepN(3, abort_error)
        # Save after each step.
        ckpt_monitor = tf.contrib.learn.monitors.CheckpointSaver(
            1, scaffold.saver, logdir)
        with supervised_session.SupervisedSession(
                '',
                scaffold=scaffold,
                checkpoint_dir=logdir,
                monitors=[abort_monitor, ckpt_monitor]) as session:
            self.assertEqual(0, session.run(gstep))
            self.assertEqual(1, session.run(do_step))
            self.assertEqual(2, session.run(do_step))
            self.assertFalse(session.should_stop())

            # Here at step 3, the monitor triggers and raises AbortedError.
            # The SupervisedSession automatically restores and retries.
            self.assertEqual(3, session.run(do_step))
            self.assertTrue(abort_monitor.raised)
            self.assertFalse(session.should_stop())

            self.assertEqual(4, session.run(do_step))
            self.assertFalse(session.should_stop())
def accumulate_privacy_spending(self, unused_eps_delta, sigma, num_examples):
    """Accumulate privacy spending.

    In particular, accounts for privacy spending when we assume there
    are num_examples, and we are releasing the vector
    (sum_{i=1}^{num_examples} x_i) + Normal(0, stddev=l2norm_bound*sigma)
    where l2norm_bound is the maximum l2_norm of each example x_i, and
    the num_examples have been randomly selected out of a pool of
    self.total_examples.

    Args:
      unused_eps_delta: EpsDelta pair which can be tensors. Unused
        in this accountant.
      sigma: the noise sigma, in the multiples of the sensitivity (that is,
        if the l2norm sensitivity is k, then the caller must have added
        Gaussian noise with stddev=k*sigma to the result of the query).
      num_examples: the number of examples involved.

    Returns:
      a TensorFlow operation for updating the privacy spending.
    """
    # Sampling ratio: share of the total pool used by this query.
    q = tf.cast(num_examples, tf.float64) * 1.0 / self._total_examples

    # One accumulation op per tracked moment order.
    moments_accum_ops = [
        tf.assign_add(
            log_moment,
            self._compute_log_moment(sigma, q, self._moment_orders[index]))
        for index, log_moment in enumerate(self._log_moments)
    ]
    return tf.group(*moments_accum_ops)
def testPeriodicTargetUpdate(self, use_locking, update_period):
    """Tests that the simple success case works as expected.

    This is an integration test. The periodically and update parts are
    unit-tested in the preceding.

    Args:
      use_locking: value for `periodic_target_update`'s `use_locking`
        argument.
      update_period: how often an update should happen.
    """
    target_variables = [tf.Variable(tf.zeros([1, 2]))]
    source_variables = [tf.Variable(tf.random_normal([1, 2]))]
    # The source moves by one on every step, so the target can only match it
    # on steps where an update actually happened.
    update_source_op = tf.assign_add(source_variables[0], tf.ones([1, 2]))
    updated = target_update_ops.periodic_target_update(
        target_variables,
        source_variables,
        update_period=update_period,
        use_locking=use_locking)

    num_steps = 3 * update_period
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(num_steps):
            sess.run(update_source_op)
            sess.run(updated)
            targets, sources = sess.run([target_variables, source_variables])
            if step % update_period:
                self.assertNotAllClose(targets, sources)
            else:
                self.assertAllClose(targets, sources)
def __init__(self, train_time, time_limit=None):
    """Store the train-time variable and build the op that increments it.

    Args:
      train_time: variable that the increment op adds to.
      time_limit: optional limit, stored as given.
    """
    super(TrainTimeHook, self).__init__()
    self._train_time = train_time
    self._time_limit = time_limit
    self._last_run_duration = None

    # The amount to add is fed at run time through this placeholder.
    increment_amount = tf.placeholder(tf.float32, None)
    self._increment_amount = increment_amount
    self._increment_op = tf.assign_add(train_time, increment_amount)
def test_num_steps(self):
    """StopAtStep(num_steps=N) stops N steps after the session's start step."""
    logdir = self._test_dir('test_num_steps')
    with tf.Graph().as_default():
        gstep = tf.contrib.framework.get_or_create_global_step()
        do_step = tf.assign_add(gstep, 1)
        stop_at_step = tf.contrib.learn.monitors.StopAtStep

        # Do 3 steps and save.
        scaffold = supervised_session.Scaffold()
        monitors = [stop_at_step(num_steps=3)]
        with supervised_session.SupervisedSession(
                '', scaffold=scaffold, monitors=monitors) as session:
            for _ in range(2):
                session.run(do_step)
                self.assertFalse(session.should_stop())
            session.run(do_step)
            self.assertTrue(session.should_stop())
            save_path = scaffold.saver.save(
                session.session, os.path.join(logdir, 'step-3'))

        # Restore and do 4 steps.
        def load_ckpt(scaffold, sess):
            scaffold.saver.restore(sess, save_path)

        scaffold = supervised_session.Scaffold(init_fn=load_ckpt)
        monitors = [stop_at_step(num_steps=4)]
        with supervised_session.SupervisedSession(
                '', scaffold=scaffold, monitors=monitors) as session:
            # The restored checkpoint starts the session at step 3.
            self.assertEqual(3, session.run(gstep))
            for _ in range(3):
                session.run(do_step)
                self.assertFalse(session.should_stop())
            session.run(do_step)
            self.assertTrue(session.should_stop())
def testEvaluateWithEvalFeedDict(self):
    """evaluate_repeatedly feeds `feed_dict` into every eval_ops run."""
    # Create a checkpoint.
    checkpoint_dir = os.path.join(
        self.get_temp_dir(), 'evaluate_with_eval_feed_dict')
    self._train_model(checkpoint_dir, num_steps=1)

    # We need a variable that the saver will try to restore.
    tf.contrib.framework.get_or_create_global_step()

    # Create a variable and an eval op that increments it with a placeholder.
    my_var = tf.contrib.framework.local_variable(0.0, name='my_var')
    increment = tf.placeholder(dtype=tf.float32)
    eval_ops = tf.assign_add(my_var, increment)

    increment_value = 3
    num_evals = 5
    expected_value = increment_value * num_evals

    final_ops = {'my_var': tf.identity(my_var)}
    stop_hook = tf.contrib.training.StopAfterNEvalsHook(num_evals)
    final_values = tf.contrib.training.evaluate_repeatedly(
        checkpoint_dir=checkpoint_dir,
        eval_ops=eval_ops,
        feed_dict={increment: increment_value},
        final_ops=final_ops,
        hooks=[stop_hook],
        max_number_of_evaluations=1)
    self.assertEqual(final_values['my_var'], expected_value)
def testCallsMonitorsWithLastStep(self):
    """Monitors are called with the step value read before each run."""
    with tf.Graph().as_default(), tf.Session() as sess:
        global_step_tensor = tf.contrib.framework.create_global_step()
        monitors = [FakeMonitor(), FakeMonitor()]
        mon_sess = monitored_session.MonitoredSession(
            sess=sess,
            monitors=monitors,
            global_step_tensor=global_step_tensor)
        inc_5 = tf.assign_add(global_step_tensor, 5)
        # Initialize global_step_tensor to '0':
        sess.run(tf.initialize_all_variables())

        # Each run adds 5 to the global step; the monitors are expected to
        # report steps 1, 6 and 11 for the three runs.
        for expected_step in (1, 6, 11):
            mon_sess.run(fetches=[inc_5])
            for mon in monitors:
                self.assertEqual(mon.last_begin_step, expected_step)
                self.assertEqual(mon.last_end_step, expected_step)
                self.assertEqual(mon.last_post_step, expected_step)
def testStop(self):
    """The hook collects metrics and eventually requests a stop."""
    global_step = tf.train.create_global_step()
    tf.summary.scalar("global_step", global_step)
    incr_global_step = tf.assign_add(global_step, 1)

    ckpt_dir = self.ckpt_dir("stop")
    dummy = DummyHook(ckpt_dir, every_n_steps=10)
    with self.sess(dummy, ckpt_dir) as sess:
        for _ in range(20):
            sess.run(incr_global_step)

        # Summary files should now have 2 global step values in them
        self.flush()

        # Run for 10 more so that the hook gets triggered again
        for _ in range(10):
            sess.run(incr_global_step)

        # Check that the metrics have actually been collected.
        self.assertIn("", dummy.test_metrics)
        metrics = dummy.test_metrics[""]
        self.assertIn("global_step_1", metrics)
        steps, vals = metrics["global_step_1"]
        self.assertEqual(len(steps), len(vals))
        self.assertGreaterEqual(len(steps), 2)

        # Run for 10 more so that the hook triggers stoppage
        for _ in range(10):
            sess.run(incr_global_step)

        with self.assertRaisesRegexp(RuntimeError,
                                     "after should_stop requested"):
            sess.run(incr_global_step)
def testEvalOpAndFinalOp(self):
    """evaluate_once runs eval_ops num_evals times, then evaluates final_ops."""
    checkpoint_dir = os.path.join(
        self.get_temp_dir(), 'eval_ops_and_final_ops')

    # Train a model for a single step to get a checkpoint.
    self._train_model(checkpoint_dir, num_steps=1)
    checkpoint_path = tf.contrib.training.wait_for_new_checkpoint(
        checkpoint_dir)

    # Create the model so we have something to restore.
    inputs = tf.constant(self._inputs, dtype=tf.float32)
    logistic_classifier(inputs)

    num_evals = 5
    final_increment = 9.0

    # Each eval bumps the variable by one; the final op adds the increment.
    my_var = tf.contrib.framework.local_variable(0.0, name='MyVar')
    eval_ops = tf.assign_add(my_var, 1.0)
    final_ops = tf.identity(my_var) + final_increment

    stop_hook = tf.contrib.training.StopAfterNEvalsHook(num_evals)
    final_ops_values = tf.contrib.training.evaluate_once(
        checkpoint_path=checkpoint_path,
        eval_ops=eval_ops,
        final_ops={'value': final_ops},
        hooks=[stop_hook])
    self.assertEqual(final_ops_values['value'], num_evals + final_increment)
def _model_fn(features, labels, mode):
    """Linear-regression model function for an Estimator.

    Predicts `W * features['x'] + b`, uses the sum of squared differences to
    `labels` as the loss, and trains with gradient descent (learning rate
    0.01). The arguments and the training op are printed for inspection.

    Returns:
      A `tf.estimator.EstimatorSpec` carrying predictions, loss and train op.
    """
    print("\t_model_fn:features=", features)
    print("\t_model_fn:labels=", labels)
    print("\t_model_fn:mode=", mode)

    # Build a linear model and predict values
    W = tf.get_variable("W", [1], dtype=tf.float64)
    b = tf.get_variable("b", [1], dtype=tf.float64)
    y = W * features['x'] + b

    # Loss sub-graph
    # Clouds: what is "labels"? "labels" is the standard answer? where "y" is
    # the predict answer?
    squared_error = tf.square(y - labels)
    loss = tf.reduce_sum(squared_error)

    # Training sub-graph
    global_step = tf.train.get_global_step()
    optimizer = tf.train.GradientDescentOptimizer(0.01)
    # Clouds: what is tf.group???
    # The group bundles the optimizer step with the global-step increment so
    # both run when the train op runs.
    minimize_op = optimizer.minimize(loss)
    bump_step = tf.assign_add(global_step, 1)
    train = tf.group(minimize_op, bump_step)
    print("--------\n\t_model_fn:train group=", train)
    print("--------\n\n")

    # EstimatorSpec connects subgraphs we built to the
    # appropriate functionality.
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=y,
        loss=loss,
        train_op=train)
def train_adv(model=None):
    """Run the three-stage adversarial training loop and report test accuracy.

    Per step, the stage is chosen from the incremented global step:
      1. discriminator warm up (step < FLAGS.dis_warm_up_step): train the
         discriminator on the original sentences;
      2. generator warm up (the next FLAGS.gene_warm_up_step steps): generate
         new sentences, score them with the discriminator's reward op and
         train the generator on the baseline-subtracted reward;
      3. adversarial training: with FLAGS.every chances out of
         FLAGS.every + 1, train both models on generated sentences;
         otherwise train the discriminator on the original sentence.

    After the warm-up stages, every FLAGS.test_steps steps the dev set is
    evaluated; on a new best dev accuracy the test set is evaluated and a
    checkpoint is saved.

    Args:
      model: object providing the graph-building methods called below.
        NOTE(review): the default of None cannot work, because `model` is
        dereferenced unconditionally - confirm that callers always pass one.
    """
    assert FLAGS.train_dir, 'train_dir must be given'
    print('train dir is %s' % FLAGS.train_dir)
    # global_step = tf.train.get_or_create_global_step()
    global_step = tf.Variable(0, trainable=False)
    add_global = tf.assign_add(global_step, 1)
    action_prob_op, gene_loss_op, train_gene_op = model.train_generator(
        global_step)
    dis_loss_op, train_dis_op, reward_op = model.train_discriminator()
    train_sentence_op, train_sentence_len_op, train_label_op = model.get_generator_data(
    )
    original_prob_op = model.get_original_prob()
    dev_acc_op, dev_num_op, dev_init_op = model.build_dev_graph()
    test_acc_op, test_num_op, test_init_op = model.build_test_graph()
    train_ckpt_dir = FLAGS.train_dir + '/train_ckpt'
    os.makedirs(train_ckpt_dir, exist_ok=True)
    sum_writer = tf.summary.FileWriter(str(train_ckpt_dir),
                                       graph=tf.get_default_graph())
    best_dev_acc = 0.0
    final_acc = 0.0
    # Running reward statistics; the running mean is the generator baseline.
    average_reward = 0
    all_reward = 0
    all_sent_num = 0
    # Only the most recent checkpoint is kept.
    saver = tf.train.Saver(max_to_keep=1)
    init = tf.global_variables_initializer()
    with tf.Session(config=utils.get_config()) as sess:
        # NOTE(review): the TF seed is set after the ops above were created;
        # confirm this gives the intended reproducibility.
        tf.set_random_seed(FLAGS.random_seed)
        np.random.seed(FLAGS.random_seed)
        sess.run(init)
        for _ in itertools.count(1):
            # The step counter lives in the graph; the first value seen is 1.
            this_global_step = sess.run(add_global)
            if this_global_step >= FLAGS.max_steps + 1:
                break
            sentence, sentence_len, label = sess.run(
                [train_sentence_op, train_sentence_len_op, train_label_op])
            # Copy of the batch, fed to the generator's training step below.
            raw_sentence = sentence.copy()
            if this_global_step < FLAGS.dis_warm_up_step:
                # discriminator warm up
                dis_loss, _, = sess.run(
                    [dis_loss_op, train_dis_op],
                    feed_dict={
                        'discriminator/sentence:0': sentence,
                        'discriminator/sentence_len:0': sentence_len,
                        'discriminator/train_label:0': label,
                    })
                gene_loss = 0.0
            elif this_global_step < FLAGS.gene_warm_up_step + FLAGS.dis_warm_up_step:
                # generator warm up
                original_prob = sess.run(original_prob_op,
                                         feed_dict={
                                             'sentence_original:0': sentence,
                                             'sentence_len_original:0': sentence_len,
                                             'sentence_label_original:0': label
                                         })
                action = sess.run(action_prob_op,
                                  feed_dict={
                                      'generator/train_sentence:0': sentence,
                                      'generator/train_sentence_len:0': sentence_len
                                  })
                sentence_new, action_idx = generate_new_sentence_with_action(
                    model.vocab, action, sentence, sentence_len)
                # Only the reward is computed here; the discriminator is not
                # trained during generator warm up.
                reward = sess.run(reward_op,
                                  feed_dict={
                                      'discriminator/sentence:0': sentence_new,
                                      'discriminator/sentence_len:0': sentence_len,
                                      'discriminator/train_label:0': label,
                                      'discriminator/original_prob:0': original_prob
                                  })
                # Subtract the running mean reward before the generator update.
                all_sent_num += len(reward)
                all_reward += np.sum(reward)
                average_reward = all_reward / all_sent_num
                reward -= average_reward
                gene_loss, _ = sess.run(
                    [gene_loss_op, train_gene_op],
                    feed_dict={
                        'generator/train_sentence:0': raw_sentence,
                        'generator/train_sentence_len:0': sentence_len,
                        'generator/reward_score:0': reward,
                        'generator/action_idx:0': action_idx
                    })
                dis_loss = 0
            else:
                # adversarial train
                # Draws 0 with probability 1 / (FLAGS.every + 1).
                rand_num = random.choice([1] * FLAGS.every + [0])
                if rand_num != 0:
                    # train with generated sentences
                    original_prob = sess.run(original_prob_op,
                                             feed_dict={
                                                 'sentence_original:0': sentence,
                                                 'sentence_len_original:0': sentence_len,
                                                 'sentence_label_original:0': label
                                             })
                    action = sess.run(action_prob_op,
                                      feed_dict={
                                          'generator/train_sentence:0': sentence,
                                          'generator/train_sentence_len:0': sentence_len
                                      })
                    sentence_new, action_idx = generate_new_sentence_with_action(
                        model.vocab, action, sentence, sentence_len)
                    # One run both trains the discriminator on the generated
                    # sentences and returns the reward for the generator.
                    dis_loss, _, reward = sess.run(
                        [dis_loss_op, train_dis_op, reward_op],
                        feed_dict={
                            'discriminator/sentence:0': sentence_new,
                            'discriminator/sentence_len:0': sentence_len,
                            'discriminator/train_label:0': label,
                            'discriminator/original_prob:0': original_prob
                        })
                    all_sent_num += len(reward)
                    all_reward += np.sum(reward)
                    average_reward = all_reward / all_sent_num
                    reward -= average_reward
                    gene_loss, _ = sess.run(
                        [gene_loss_op, train_gene_op],
                        feed_dict={
                            'generator/train_sentence:0': raw_sentence,
                            'generator/train_sentence_len:0': sentence_len,
                            'generator/reward_score:0': reward,
                            'generator/action_idx:0': action_idx
                        })
                else:
                    # train with original sentence
                    dis_loss, _, = sess.run(
                        [dis_loss_op, train_dis_op],
                        feed_dict={
                            'discriminator/sentence:0': sentence,
                            'discriminator/sentence_len:0': sentence_len,
                            'discriminator/train_label:0': label,
                        })
                    gene_loss = 0.0
            # Periodic evaluation, only once both warm-up stages are over.
            if this_global_step != 0 and this_global_step % FLAGS.test_steps == 0 and this_global_step > FLAGS.dis_warm_up_step + FLAGS.gene_warm_up_step:
                number = 0
                accuracy = 0.0
                # Drain the dev data; OutOfRangeError marks its end.
                while True:
                    try:
                        acc, num = sess.run([dev_acc_op, dev_num_op])
                        number += num
                        accuracy += acc * num
                    except tf.errors.OutOfRangeError:
                        break
                # Example-weighted mean accuracy over the dev batches.
                # NOTE(review): divides by zero if no dev batch was read.
                accuracy /= number
                print('At step %d. dev num=%d acc=%f.' %
                      (this_global_step, number, accuracy))
                if accuracy > best_dev_acc:
                    best_dev_acc = accuracy
                    print("best acc=%f At step %d." %
                          (best_dev_acc, this_global_step))
                    test_accuracy = 0.
                    test_number = 0
                    while True:
                        try:
                            test_acc, test_num = sess.run(
                                [test_acc_op, test_num_op])
                            test_number += test_num
                            test_accuracy += test_acc * test_num
                        except tf.errors.OutOfRangeError:
                            break
                    test_accuracy /= test_number
                    print('test num=%d acc=%f.' % (test_number, test_accuracy))
                    final_acc = test_accuracy
                    # Re-initialize the test data for the next evaluation.
                    sess.run(test_init_op)
                    save_checkpoint(saver, sess, FLAGS.train_dir,
                                    this_global_step)
                summary = tf.Summary()
                # NOTE(review): the 'test_acc' tag is given the dev accuracy -
                # confirm this is intended.
                summary.value.add(tag='test_acc', simple_value=accuracy)
                summary.value.add(tag='best_dev_acc',
                                  simple_value=best_dev_acc)
                sum_writer.add_summary(summary, this_global_step)
                # Re-initialize the dev data for the next evaluation.
                sess.run(dev_init_op)
        sum_writer.close()
        print('Accuracy of test set is %f .' % final_acc)
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` that builds the MNIST model on a mesh.

    The Mesh-TensorFlow graph is created first, gradient/update operations
    are added in TRAIN mode, and the graph is then lowered to regular
    TensorFlow ops from which the mode-specific ``EstimatorSpec`` is built.

    Args:
        features: Input features handed to ``mnist_model``.
        labels: Labels handed to ``mnist_model`` and to the accuracy metric.
        mode: One of ``tf.estimator.ModeKeys``.
        params: Estimator params; only logged here.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for TRAIN, PREDICT or EVAL, or
        ``None`` when ``mode`` is none of these.
    """
    tf.logging.info("features = %s labels = %s mode = %s params=%s",
                    features, labels, mode, params)

    mode_keys = tf.estimator.ModeKeys
    is_train = mode == mode_keys.TRAIN
    is_predict = mode == mode_keys.PREDICT

    global_step = tf.train.get_global_step()

    # Mesh-TensorFlow side of the model.
    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "my_mesh")
    logits, loss = mnist_model(features, labels, mesh)

    mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
    layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)
    # One empty device string per mesh element.
    mesh_devices = [""] * mesh_shape.size
    mesh_impl = placement_mesh_impl.PlacementMeshImpl(
        mesh_shape, layout_rules, mesh_devices)

    if is_train:
        # Update ops have to be added to the mtf graph before it is lowered.
        var_grads = mtf.gradients(
            [loss], [v.outputs[0] for v in graph.trainable_variables])
        optimizer = mtf_optimize.AdafactorOptimizer()
        update_ops = []
        for grad, var in zip(var_grads, graph.trainable_variables):
            update_ops += optimizer.apply_grad(grad, var)

    lowering = mtf.Lowering(graph, {mesh: mesh_impl})
    restore_hook = mtf.MtfRestoreHook(lowering)

    tf_logits = lowering.export_to_tf_tensor(logits)
    if not is_predict:
        tf_loss = lowering.export_to_tf_tensor(loss)
        tf.summary.scalar("loss", tf_loss)

    if is_train:
        tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
        tf_update_ops.append(tf.assign_add(global_step, 1))
        train_op = tf.group(tf_update_ops)

        saver = tf.train.Saver(
            tf.global_variables(),
            sharded=True,
            max_to_keep=10,
            keep_checkpoint_every_n_hours=2,
            defer_build=False,
            save_relative_paths=True)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
        saver_hook = tf.train.CheckpointSaverHook(
            FLAGS.model_dir,
            save_steps=1000,
            saver=saver,
            listeners=[mtf.MtfCheckpointSaverListener(lowering)])

        accuracy = tf.metrics.accuracy(
            labels=labels, predictions=tf.argmax(tf_logits, axis=1))
        train_accuracy = accuracy[1]

        # Name tensors to be logged with LoggingTensorHook.
        tf.identity(tf_loss, "cross_entropy")
        tf.identity(train_accuracy, name="train_accuracy")
        # Save accuracy scalar to Tensorboard output.
        tf.summary.scalar("train_accuracy", train_accuracy)

        # restore_hook must come before saver_hook
        return tf.estimator.EstimatorSpec(
            mode_keys.TRAIN,
            loss=tf_loss,
            train_op=train_op,
            training_chief_hooks=[restore_hook, saver_hook])

    if is_predict:
        predictions = {
            "classes": tf.argmax(tf_logits, axis=1),
            "probabilities": tf.nn.softmax(tf_logits),
        }
        return tf.estimator.EstimatorSpec(
            mode=mode_keys.PREDICT,
            predictions=predictions,
            prediction_hooks=[restore_hook],
            export_outputs={
                "classify": tf.estimator.export.PredictOutput(predictions)
            })

    if mode != mode_keys.EVAL:
        return None

    eval_accuracy = tf.metrics.accuracy(
        labels=labels, predictions=tf.argmax(tf_logits, axis=1))
    return tf.estimator.EstimatorSpec(
        mode=mode_keys.EVAL,
        loss=tf_loss,
        evaluation_hooks=[restore_hook],
        eval_metric_ops={"accuracy": eval_accuracy})
def __init__(self, sess, input_norm, config):
    """Create the online and target expert networks and their training ops.

    Args:
        sess: Session forwarded to the base-class constructor.
        input_norm: Input normaliser, stored on the instance.
        config: Configuration object; this constructor reads
            ``expert_lr``, ``random_seed``, ``l1_dim``, ``l2_dim`` and
            ``use_better_q_gd`` from it.

    Raises:
        NotImplementedError: if ``self.norm_type`` is ``'batch'``.
    """
    super(AE_Expert_Network, self).__init__(sess, config, config.expert_lr)

    self.rng = np.random.RandomState(config.random_seed)
    self.expert_layer1_dim = config.l1_dim
    self.expert_layer2_dim = config.l2_dim
    self.input_norm = input_norm

    # The flag arrives as the string "True"; the gradient-descent settings
    # are only defined when it is switched on.
    self.use_better_q_gd = config.use_better_q_gd == "True"
    if self.use_better_q_gd:
        self.better_q_gd_alpha = 1e-2  # config.better_q_gd_alpha
        self.better_q_gd_max_steps = 10  # config.better_q_gd_max_steps
        self.better_q_gd_stop = 1e-3  # config.better_q_gd_stop

    # Online network.
    online = self.build_network(scope_name='ae_expert')
    self.inputs, self.phase, self.action, self.q_prediction = online
    self.net_params = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='ae_expert')

    # Target network.
    target = self.build_network(scope_name='target_ae_expert')
    (self.target_inputs, self.target_phase, self.target_action,
     self.target_q_prediction) = target
    self.target_net_params = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_ae_expert')

    # Soft update: move every target weight a fraction tau towards the
    # matching online weight.
    self.update_target_net_params = [
        tf.assign_add(target_var,
                      self.tau * (self.net_params[i] - target_var))
        for i, target_var in enumerate(self.target_net_params)
    ]

    # Hard copy of the online weights into the target network.
    self.init_target_net_params = [
        tf.assign(target_var, self.net_params[i])
        for i, target_var in enumerate(self.target_net_params)
    ]

    # TODO: Currently doesn't support batchnorm
    if self.norm_type == 'batch':
        raise NotImplementedError
    assert self.norm_type in ('none', 'layer', 'input_norm')
    self.batchnorm_ops = [tf.no_op()]
    self.update_target_batchnorm_params = tf.no_op()

    # Regression target for the Q prediction.
    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # Optimization op for the expert update.
    with tf.control_dependencies(self.batchnorm_ops):
        squared_error = tf.squared_difference(self.predicted_q_value,
                                              self.q_prediction)
        self.expert_loss = tf.reduce_mean(squared_error)
        adam = tf.train.AdamOptimizer(self.learning_rate)
        self.expert_optimize = adam.minimize(self.expert_loss)

    # Gradient of the expert's Q prediction w.r.t. the action input.
    self.action_grads = tf.gradients(self.q_prediction, self.action)
def main(args=None):
    """Build the A->B generator / discriminator graph and train or test it.

    The network name is taken from the script path in ``args[0]`` (directory
    and extension stripped, text after ``main_`` kept) and determines the log
    directory.  ``flags.mode`` selects the branch that runs in the session:

    * ``'train'``: alternates one generator step and one discriminator step
      until ``flags.training_iter`` global steps, logging summaries, debug
      images and checkpoints at the configured frequencies.
    * ``'test'``: runs the generator over the training-A dataset once and
      writes one PNG mask per image into ``logs_image_val_dir``.

    Args:
        args: Argument list whose first element is the script path.  Falls
            back to ``sys.argv`` when ``None``.
    """
    import sys

    if args is None:
        # Previously ``args[0]`` raised TypeError for the default value.
        args = sys.argv
    print(args)
    tf.reset_default_graph()

    # ---- Read dataset parser ----
    flags.network_name = args[0].split('/')[-1].split('.')[0].split(
        'main_')[-1]
    flags.logs_dir = './logs_' + flags.network_name
    dataset_parser = GANParser(flags=flags)

    # ---- Transform data to TFRecord format (only do once) ----
    # Manual switch: flip to True for a single run to create the records.
    if False:
        dataset_parser.load_paths(is_jpg=True, load_val=True)
        dataset_parser.data2record(
            name='{}_train.tfrecords'.format(dataset_parser.dataset_name),
            set_type='train', test_num=None)
        dataset_parser.data2record(
            name='{}_val.tfrecords'.format(dataset_parser.dataset_name),
            set_type='val', test_num=None)
        return

    # ---- Build graph ----
    with tf.Graph().as_default():
        # ---- Input (TFRecord) ----
        with tf.name_scope('TFRecord'):
            # Dataset A
            training_a_dataset = dataset_parser.tfrecord_get_dataset(
                name='{}_trainA.tfrecords'.format(
                    dataset_parser.dataset_name),
                batch_size=flags.batch_size, shuffle_size=None)
            val_a_dataset = dataset_parser.tfrecord_get_dataset(
                name='{}_valA.tfrecords'.format(dataset_parser.dataset_name),
                batch_size=flags.batch_size,
                need_flip=(flags.mode == 'train'))
            # Dataset B
            training_b_dataset = dataset_parser.tfrecord_get_dataset(
                name='{}_trainB.tfrecords'.format(
                    dataset_parser.dataset_name),
                batch_size=flags.batch_size, shuffle_size=None)
            val_b_dataset = dataset_parser.tfrecord_get_dataset(
                name='{}_valB.tfrecords'.format(dataset_parser.dataset_name),
                batch_size=flags.batch_size,
                need_flip=(flags.mode == 'train'))

            # Feed-able iterators: the string handle fed at run time picks
            # the concrete iterator that supplies the batch.
            with tf.name_scope('RealA'):
                handle_a = tf.placeholder(tf.string, shape=[])
                iterator_a = tf.contrib.data.Iterator.from_string_handle(
                    handle_a, training_a_dataset.output_types,
                    training_a_dataset.output_shapes)
                real_a, real_a_name, real_a_shape = iterator_a.get_next()
            with tf.name_scope('RealB'):
                handle_b = tf.placeholder(tf.string, shape=[])
                iterator_b = tf.contrib.data.Iterator.from_string_handle(
                    handle_b, training_b_dataset.output_types,
                    training_b_dataset.output_shapes)
                real_b, real_b_name, real_b_shape = iterator_b.get_next()
            with tf.name_scope('InitialA_op'):
                training_a_iterator = \
                    training_a_dataset.make_initializable_iterator()
                validation_a_iterator = \
                    val_a_dataset.make_initializable_iterator()
            with tf.name_scope('InitialB_op'):
                training_b_iterator = \
                    training_b_dataset.make_initializable_iterator()
                validation_b_iterator = \
                    val_b_dataset.make_initializable_iterator()

        # ---- Network (computes predictions from the inference model) ----
        with tf.name_scope('Network'):
            # Input
            global_step = tf.Variable(0, trainable=False, name='global_step',
                                      dtype=tf.int32)
            global_step_update_op = tf.assign_add(
                global_step, 1, name='global_step_update_op')
            fake_b_pool = tf.placeholder(
                tf.float32,
                shape=[None, flags.image_height, flags.image_width,
                       flags.c_in_dim],
                name='fake_B_pool')
            image_linear_shape = tf.constant(
                flags.image_height * flags.image_width * flags.c_in_dim,
                dtype=tf.int32, name='image_linear_shape')

            # A -> B
            adjusted_a = gaussian_blur(real_a, name='adjusted_a')
            logits_a = generator_resnet(real_a, flags, False,
                                        name="Generator_A2B")
            segment_a = tf.nn.tanh(logits_a, name='segment_a')
            # Resize the logits back to image A's own original size.  The
            # width used to be read from real_b_shape, which produced masks
            # with B's width and made this tensor depend on dataset B.
            logits_a_ori = tf.image.resize_bilinear(
                logits_a, (real_a_shape[0][0], real_a_shape[0][1]),
                name='logits_a_ori')
            segment_a_ori = tf.nn.tanh(logits_a_ori, name='segment_a_ori')

            # Blend: original pixels where the mask is on, the adjusted
            # (blurred) image elsewhere.
            with tf.variable_scope('Fake_B'):
                foreground = tf.multiply(real_a, segment_a,
                                         name='foreground')
                background = tf.multiply(adjusted_a, (1 - segment_a),
                                         name='background')
                fake_b_logits = tf.add(foreground, background,
                                       name='fake_b_logits')
                fake_b = tf.clip_by_value(fake_b_logits, 0, 255,
                                          name='fake_b')

            # The discriminator consumes flattened images.
            fake_b_f = tf.reshape(fake_b, [-1, image_linear_shape],
                                  name='fake_b_f')
            fake_b_pool_f = tf.reshape(fake_b_pool, [-1, image_linear_shape],
                                       name='fake_b_pool_f')
            real_b_f = tf.reshape(real_b, [-1, image_linear_shape],
                                  name='real_b_f')
            dis_fake_b = discriminator_se_wgangp(
                fake_b_f, flags, reuse=False, name="Discriminator_B")
            dis_fake_b_pool = discriminator_se_wgangp(
                fake_b_pool_f, flags, reuse=True, name="Discriminator_B")
            dis_real_b = discriminator_se_wgangp(
                real_b_f, flags, reuse=True, name="Discriminator_B")

            # WGAN loss
            with tf.name_scope('loss_gen_a2b'):
                loss_gen_a2b = -tf.reduce_mean(dis_fake_b)
            with tf.name_scope('loss_dis_b'):
                loss_dis_b_adv_real = -tf.reduce_mean(dis_real_b)
                loss_dis_b_adv_fake = tf.reduce_mean(dis_fake_b_pool)
                loss_dis_b = tf.reduce_mean(dis_fake_b_pool) - \
                    tf.reduce_mean(dis_real_b)
            with tf.name_scope('wgan-gp'):
                # Gradient penalty evaluated on random interpolations
                # between real and pooled fake samples.
                alpha = tf.random_uniform(shape=[flags.batch_size, 1],
                                          minval=0., maxval=1.)
                differences = fake_b_pool_f - real_b_f
                interpolates = real_b_f + (alpha * differences)
                gradients = tf.gradients(
                    discriminator_se_wgangp(interpolates, flags, reuse=True,
                                            name="Discriminator_B"),
                    [interpolates])[0]
                slopes = tf.sqrt(
                    tf.reduce_sum(tf.square(gradients),
                                  reduction_indices=[1]))
                gradient_penalty = tf.reduce_mean((slopes - 1.)**2)
                loss_dis_b += flags.lambda_gp * gradient_penalty

            # Optimizer
            trainable_var_gen_a2b = tf.get_collection(
                key=tf.GraphKeys.TRAINABLE_VARIABLES, scope='Generator_A2B')
            trainable_var_dis_b = tf.get_collection(
                key=tf.GraphKeys.TRAINABLE_VARIABLES,
                scope='Discriminator_B')
            with tf.name_scope('learning_rate_decay'):
                # Linear decay that reaches zero at flags.training_iter.
                decay = tf.maximum(
                    0.,
                    1. - (tf.cast(global_step, tf.float32) /
                          flags.training_iter),
                    name='decay')
                learning_rate = tf.multiply(flags.learning_rate, decay,
                                            name='learning_rate')
            train_op_gen_a2b = train_op(loss_gen_a2b, learning_rate, flags,
                                        trainable_var_gen_a2b,
                                        name='gen_a2b')
            train_op_dis_b = train_op(loss_dis_b, learning_rate, flags,
                                      trainable_var_dis_b, name='dis_b')
            saver = tf.train.Saver(max_to_keep=2)

            # Graph logs
            with tf.name_scope('GEN_a2b'):
                tf.summary.scalar("loss/gen_a2b/all", loss_gen_a2b)
            with tf.name_scope('DIS_b'):
                tf.summary.scalar("loss/dis_b/all", loss_dis_b)
                tf.summary.scalar("loss/dis_b/adv_real", loss_dis_b_adv_real)
                tf.summary.scalar("loss/dis_b/adv_fake", loss_dis_b_adv_fake)
            summary_op = tf.summary.merge_all()

        # ---- Session ----
        tfconfig = tf.ConfigProto(allow_soft_placement=True)
        tfconfig.gpu_options.allow_growth = True
        with tf.Session(config=tfconfig) as sess:
            with tf.name_scope('Initial'):
                ckpt = tf.train.get_checkpoint_state(
                    dataset_parser.checkpoint_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    print("Model restored: {}".format(
                        ckpt.model_checkpoint_path))
                    saver.restore(sess, ckpt.model_checkpoint_path)
                else:
                    print("No Model found.")
                    init_op = tf.group(tf.global_variables_initializer(),
                                       tf.local_variables_initializer())
                    sess.run(init_op)
                summary_writer = tf.summary.FileWriter(
                    dataset_parser.logs_dir, sess.graph)

            # The writer buffers events; close it on every exit path so
            # pending summaries reach disk.
            try:
                if flags.mode == 'train':
                    # ---- Training mode ----
                    print('Training mode! Batch size:{:d}'.format(
                        flags.batch_size))
                    with tf.variable_scope('Input_port'):
                        training_a_handle = sess.run(
                            training_a_iterator.string_handle())
                        training_b_handle = sess.run(
                            training_b_iterator.string_handle())
                    image_pool_b = ImagePool(flags.pool_size)

                    print('Start Training!')
                    start_time = time.time()
                    sess.run([training_a_iterator.initializer,
                              training_b_iterator.initializer])
                    feed_dict_train = {
                        handle_a: training_a_handle,
                        handle_b: training_b_handle
                    }
                    global_step_sess = sess.run(global_step)
                    while global_step_sess < flags.training_iter:
                        try:
                            # Update generator A2B
                            _, fake_b_sess = sess.run(
                                [train_op_gen_a2b, fake_b],
                                feed_dict=feed_dict_train)
                            # Update discriminator B on pooled fakes
                            fake_b_pool_query = image_pool_b.query(
                                fake_b_sess)
                            sess.run(train_op_dis_b,
                                     feed_dict={
                                         fake_b_pool: fake_b_pool_query,
                                         handle_b: training_b_handle
                                     })
                            sess.run(global_step_update_op)
                            global_step_sess, learning_rate_sess = sess.run(
                                [global_step, learning_rate])
                            print(
                                'global step:[{:d}/{:d}], learning rate:{:f}, time:{:4.4f}'
                                .format(global_step_sess,
                                        flags.training_iter,
                                        learning_rate_sess,
                                        time.time() - start_time))

                            # Logging the events
                            if global_step_sess % flags.log_freq == 1:
                                print('Logging the events')
                                summary_op_sess = sess.run(
                                    summary_op,
                                    feed_dict={
                                        handle_a: training_a_handle,
                                        handle_b: training_b_handle,
                                        fake_b_pool: fake_b_pool_query
                                    })
                                summary_writer.add_summary(
                                    summary_op_sess, global_step_sess)

                            # Observe training situation (for debugging)
                            if (flags.debug and
                                    global_step_sess % flags.observe_freq
                                    == 1):
                                (real_a_sess, real_b_sess, adjusted_a_sess,
                                 segment_a_sess, fake_b_sess,
                                 real_a_name_sess,
                                 real_b_name_sess) = sess.run(
                                     [real_a, real_b, adjusted_a, segment_a,
                                      fake_b, real_a_name, real_b_name],
                                     feed_dict={
                                         handle_a: training_a_handle,
                                         handle_b: training_b_handle
                                     })
                                print('Logging training images.')
                                dataset_parser.visualize_data(
                                    real_a=real_a_sess,
                                    real_b=real_b_sess,
                                    adjusted_a=adjusted_a_sess,
                                    segment_a=segment_a_sess,
                                    fake_b=fake_b_sess,
                                    shape=(1, 1),
                                    global_step=global_step_sess,
                                    logs_dir=dataset_parser.
                                    logs_image_train_dir,
                                    real_a_name=real_a_name_sess[0].decode(),
                                    real_b_name=real_b_name_sess[0].decode())

                            # Saving the checkpoint
                            if global_step_sess % flags.save_freq == 0:
                                print('Saving model...')
                                saver.save(
                                    sess,
                                    dataset_parser.checkpoint_dir +
                                    '/model.ckpt',
                                    global_step=global_step_sess)
                        except tf.errors.OutOfRangeError:
                            # A dataset ran dry: restart both iterators and
                            # continue with the next epoch.
                            print(
                                '----------------One epochs finished!----------------'
                            )
                            sess.run([training_a_iterator.initializer,
                                      training_b_iterator.initializer])

                elif flags.mode == 'test':
                    # ---- Testing mode ----
                    from PIL import Image
                    import numpy as np
                    print('Start Testing!')
                    # Testing currently iterates the training datasets.
                    with tf.variable_scope('Input_port'):
                        val_a_handle = sess.run(
                            training_a_iterator.string_handle())
                        val_b_handle = sess.run(
                            training_b_iterator.string_handle())
                    sess.run([training_a_iterator.initializer,
                              training_b_iterator.initializer])
                    feed_dict_test = {
                        handle_a: val_a_handle,
                        handle_b: val_b_handle
                    }
                    image_idx = 0
                    while True:
                        try:
                            segment_a_ori_sess, real_a_name_sess = sess.run(
                                [segment_a_ori, real_a_name],
                                feed_dict=feed_dict_test)
                        except tf.errors.OutOfRangeError:
                            print(
                                '----------------One epochs finished!----------------'
                            )
                            break
                        # tanh output lies in [-1, 1]; clip before the uint8
                        # cast so negative values become 0 instead of an
                        # undefined conversion result.
                        mask = np.clip(
                            np.squeeze(segment_a_ori_sess) * 255, 0, 255)
                        x_png = Image.fromarray(mask.astype(np.uint8))
                        x_png.save('{}/{}.png'.format(
                            dataset_parser.logs_image_val_dir,
                            real_a_name_sess[0].decode()),
                                   format='PNG')
                        print(image_idx)
                        image_idx += 1
            finally:
                summary_writer.close()
def batch_inputs(self, dataset, train):
    """Construct batches of training or evaluation examples.

    Serialized examples are read from the dataset's files through TF
    queues.  When ``train`` is set and ``self.ensure_posneg_balance`` is
    enabled, examples are split by whether their mask sums to more than
    zero ("positive") or not ("negative") into two shuffle queues and
    recombined 50/50; otherwise a single example is parsed directly.

    Pipeline settings are read from the instance: ``batch_size``,
    ``num_readers``, ``num_preprocess_threads``,
    ``input_queue_memory_factor``, ``ensure_posneg_balance``,
    ``use_random_rotation``, ``input_size`` and ``mask_size``.

    Args:
        dataset: object providing ``data_files()`` and ``reader()``.
        train: boolean, selects shuffled reading and enables the
            positive/negative balancing path.

    Returns:
        images: 4-D float32 Tensor of images.
        masks: 4-D float32 Tensor of masks.
        img_names: string Tensor with the image names.

    Raises:
        ValueError: if ``dataset.data_files()`` returns None.
        Exception: in the balanced path, if ``batch_size`` is >= 2 and odd.
    """
    with tf.name_scope('batch_processing'):
        data_files = dataset.data_files()
        if data_files is None:
            raise ValueError('No data files found for this input_data')

        # Create filename_queue
        if train:
            filename_queue = tf.train.string_input_producer(
                data_files, shuffle=True, capacity=16)
        else:
            filename_queue = tf.train.string_input_producer(
                data_files, shuffle=False, capacity=1)

        # Approximate number of examples per shard.
        examples_per_shard = 1024
        # Size the random shuffle queue to balance between good global
        # mixing (more examples) and memory use (fewer examples).
        # 1 image uses 299*299*3*4 bytes = 1MB
        # The default input_queue_memory_factor is 16 implying a shuffling
        # queue size: examples_per_shard * 16 * 1MB = 17.6GB
        min_queue_examples = (examples_per_shard *
                              self.input_queue_memory_factor)
        if train:
            examples_queue = tf.RandomShuffleQueue(
                capacity=min_queue_examples + 3 * self.batch_size,
                min_after_dequeue=min_queue_examples,
                dtypes=[tf.string])
        else:
            examples_queue = tf.FIFOQueue(
                capacity=examples_per_shard + 3 * self.batch_size,
                dtypes=[tf.string])

        # Create multiple readers to populate the queue of examples.
        if self.num_readers > 1:
            enqueue_ops = []
            for _ in range(self.num_readers):
                reader = dataset.reader()
                _, value = reader.read(filename_queue)
                enqueue_ops.append(examples_queue.enqueue([value]))
            tf.train.queue_runner.add_queue_runner(
                tf.train.queue_runner.QueueRunner(examples_queue,
                                                  enqueue_ops))
            example_serialized = examples_queue.dequeue()
        else:
            reader = dataset.reader()
            _, example_serialized = reader.read(filename_queue)

        # For batch_size < 2 the queues are created without static shapes;
        # otherwise they are created lazily below once the per-example
        # shapes are known.
        pos_queue = None
        neg_queue = None
        if self.batch_size < 2:
            pos_queue = tf.RandomShuffleQueue(
                name="pos-queue", capacity=10, min_after_dequeue=5,
                dtypes=[tf.float32, tf.float32, tf.string])
            neg_queue = tf.RandomShuffleQueue(
                name="neg-queue", capacity=10, min_after_dequeue=5,
                dtypes=[tf.float32, tf.float32, tf.string])

        pos_queue_enq = []
        neg_queue_enq = []

        with tf.name_scope('split-merge'):
            if train and self.ensure_posneg_balance:
                for _ in range(self.num_preprocess_threads):
                    # Parse a serialized Example proto to extract the image
                    # and metadata.
                    image_buffer, mask_buffer, img_name_ = \
                        self.parse_example_proto(example_serialized)
                    image_ = self.image_preprocessing(
                        image_buffer,
                        img_size=(self.input_size[0], self.input_size[1]),
                        num_channels=self.input_size[2])
                    mask_ = self.image_preprocessing(
                        mask_buffer,
                        img_size=(self.mask_size[0], self.mask_size[1]),
                        num_channels=self.mask_size[2])

                    # Add a leading axis so enqueue_many can treat each
                    # tensor as a batch of (zero or) one element.
                    image_ = tf.expand_dims(image_, 0)
                    mask_ = tf.expand_dims(mask_, 0)
                    img_name_ = tf.expand_dims(img_name_, 0)

                    img_shape = tf.TensorShape(
                        [image_.shape[1], image_.shape[2], image_.shape[3]])
                    mask_shape = tf.TensorShape(
                        [mask_.shape[1], mask_.shape[2], mask_.shape[3]])
                    img_name_shape = tf.TensorShape([])

                    # initialize pos/neg queues with proper shape size on
                    # first iteration
                    if pos_queue is None or neg_queue is None:
                        pos_queue = tf.RandomShuffleQueue(
                            name="pos-queue", capacity=10,
                            min_after_dequeue=5,
                            dtypes=[tf.float32, tf.float32, tf.string],
                            shapes=[img_shape, mask_shape, img_name_shape])
                        neg_queue = tf.RandomShuffleQueue(
                            name="neg-queue", capacity=10,
                            min_after_dequeue=5,
                            dtypes=[tf.float32, tf.float32, tf.string],
                            shapes=[img_shape, mask_shape, img_name_shape])

                    # An example is negative when its mask sums to <= 0.
                    is_pos = tf.squeeze(
                        tf.reduce_sum(mask_, [1, 2], keep_dims=False))
                    neg_mask = tf.less_equal(is_pos, 0)
                    pos_idx = tf.reshape(
                        tf.where([tf.logical_not(neg_mask)]), [-1])
                    neg_idx = tf.reshape(tf.where([neg_mask]), [-1])

                    # Gathering with an empty index yields an empty batch,
                    # so each example ends up in exactly one queue.
                    pos_data = [
                        tf.gather(image_, pos_idx),
                        tf.gather(mask_, pos_idx),
                        tf.gather(img_name_, pos_idx)
                    ]
                    neg_data = [
                        tf.gather(image_, neg_idx),
                        tf.gather(mask_, neg_idx),
                        tf.gather(img_name_, neg_idx)
                    ]
                    pos_queue_enq.append(pos_queue.enqueue_many(pos_data))
                    neg_queue_enq.append(neg_queue.enqueue_many(neg_data))

                tf.train.queue_runner.add_queue_runner(
                    tf.train.queue_runner.QueueRunner(pos_queue,
                                                      pos_queue_enq))
                tf.train.queue_runner.add_queue_runner(
                    tf.train.queue_runner.QueueRunner(neg_queue,
                                                      neg_queue_enq))

                if self.batch_size >= 2:
                    if self.batch_size % 2 != 0:
                        raise Exception(
                            "'batch_size' mod 2 != 0 ! only even batch sizes supported at the moment"
                        )
                    # Half of the batch from each queue.
                    num_deque = int(self.batch_size / 2)
                    pos_data = pos_queue.dequeue_many(num_deque)
                    neg_data = neg_queue.dequeue_many(num_deque)
                    concat_data = [
                        tf.concat([pos_data[0], neg_data[0]], axis=0,
                                  name='Concat-img'),
                        tf.concat([pos_data[1], neg_data[1]], axis=0,
                                  name='Concat-mask'),
                        tf.concat([pos_data[2], neg_data[2]], axis=0,
                                  name='Concat-img-name')
                    ]
                    # randomly permute within batch size (is this even
                    # necessary ??)
                    idx = tf.Variable(range(0, self.batch_size),
                                      trainable=False, dtype=tf.int32)
                    idx = tf.random_shuffle(idx)
                    images = tf.gather(concat_data[0], idx)
                    masks = tf.gather(concat_data[1], idx)
                    img_names = tf.gather(concat_data[2], idx)
                else:
                    # mix 50/50: a counter incremented on every evaluation
                    # alternates between the positive and negative queue.
                    counter = tf.Variable(initial_value=0, trainable=False,
                                          dtype=tf.int32)
                    counter = tf.assign_add(counter, 1)
                    condition_term = tf.equal(tf.mod(counter, 2),
                                              tf.constant(0))
                    images, masks, img_names = tf.cond(
                        condition_term,
                        lambda: pos_queue.dequeue(),
                        lambda: neg_queue.dequeue())

                    if self.use_random_rotation:
                        images.set_shape(
                            tensor_shape.as_shape([None, None, 1]))
                        masks.set_shape(
                            tensor_shape.as_shape([None, None, 1]))
                        # randomly rotate image by 90 degrees
                        rot_factor = tf.random_uniform(
                            [1], minval=0, maxval=3, dtype=tf.int32)
                        rot_factor = tf.gather(rot_factor, 0)
                        images = tf.image.rot90(images, k=rot_factor)
                        masks = tf.image.rot90(masks, k=rot_factor)

                    images = tf.expand_dims(images, axis=0)
                    masks = tf.expand_dims(masks, axis=0)
                    img_names = tf.expand_dims(img_names, axis=0)
            else:
                # Parse a serialized Example proto to extract the image and
                # metadata.
                image_buffer, mask_buffer, img_names = \
                    self.parse_example_proto(example_serialized)
                images = self.image_preprocessing(
                    image_buffer,
                    img_size=(self.input_size[0], self.input_size[1]),
                    num_channels=self.input_size[2])
                # Channel count taken from mask_size (was hard-coded to 1)
                # so it agrees with the balanced branch and with the static
                # shape declared below.
                masks = self.image_preprocessing(
                    mask_buffer,
                    img_size=(self.mask_size[0], self.mask_size[1]),
                    num_channels=self.mask_size[2])
                images = tf.expand_dims(images, axis=0)
                masks = tf.expand_dims(masks, axis=0)
                img_names = tf.expand_dims(img_names, axis=0)

        # Reshape images into these desired dimensions.
        images = tf.cast(images, tf.float32)
        masks = tf.cast(masks, tf.float32)
        images.set_shape(
            tensor_shape.as_shape(
                [self.batch_size, None, None, self.input_size[2]]))
        # Masks are preprocessed to mask_size, so declare that size here;
        # the spatial dims were previously taken from input_size.
        masks.set_shape(
            tensor_shape.as_shape([
                self.batch_size, self.mask_size[0], self.mask_size[1],
                self.mask_size[2]
            ]))

        # Display the training images in the visualizer.
        tf.summary.image('images', images)
        tf.summary.image('masks', masks)
        return images, masks, img_names
def __init__(self, name, make_model, devices=get_available_gpus(),
             master_device=None, TrainerClass=SampleBasedTrainer, sess=None,
             *args, verbose=False, **kwargs):
    """Set up one trainer per device for batch-parallel training.

    A model is created on every device.  The master device gets a real
    ``TrainerClass`` instance; every other device gets a worker whose
    optimizer has learning rate 0.  Ops are then built to copy the master
    weights to the workers, to add worker gradients / batch counts /
    counters into the master, and to reset the workers afterwards.

    Args:
        name: Variable-scope name for all workers.
        make_model: Zero-argument callable returning a new model.
        devices: Devices to place workers on.
        master_device: Device holding the master trainer; defaults to the
            first entry of ``devices``.
        TrainerClass: Trainer class used for the master and subclassed for
            the workers.
        sess: Session; defaults to the default session, or a new
            ``tf.InteractiveSession``.
        *args: Extra positional arguments for every trainer.
        verbose: If true, print a line per created worker.
        **kwargs: Extra keyword arguments for every trainer.
    """
    self.name = name
    sess = sess or tf.get_default_session() or tf.InteractiveSession()
    self.sess = sess
    master_device = master_device or next(iter(devices))
    self.master_device = master_device
    assert master_device in devices
    self.verbose = verbose

    class Worker(TrainerClass):
        def get_optimizer(self, *args, **kwargs):
            """ Worker does not update weights by itself. use sgd to avoid wasting memory """
            return tf.train.GradientDescentOptimizer(learning_rate=0)

    with tf.variable_scope(name):
        self.workers_by_device = {}
        for index, device in enumerate(devices):
            with tf.device(device), tf.variable_scope('worker_%i' % index):
                model = make_model()
                trainer_cls = (TrainerClass
                               if device == master_device else Worker)
                worker = trainer_cls(model, *args, **kwargs)
            self.workers_by_device[device] = worker
            if verbose:
                print("Created model {} weights and worker on device {}"
                      "".format(model.name, device))

    self.master_model = self.workers_by_device[master_device].model
    self.master_worker = self.workers_by_device[self.master_device]
    assert isinstance(self.master_worker, TrainerClass)
    master = self.master_worker

    # step 1: send main model's weights to all worker replicas
    self.scatter_weights = []
    for worker in self.workers_by_device.values():
        if worker == master:
            continue
        for replica_var, master_var in zip(worker.optimized_variables,
                                           master.optimized_variables):
            self.scatter_weights.append(tf.assign(replica_var, master_var))

    # step 2: compute grads and counters at all workers
    self.gather_grads, self.gather_counters = [], []
    for worker in self.workers_by_device.values():
        if worker == master:
            continue
        for master_grad, worker_grad in zip(master.accumulated_grads,
                                            worker.accumulated_grads):
            self.gather_grads.append(
                tf.assign_add(master_grad, worker_grad))
        self.gather_grads.append(
            tf.assign_add(master.accumulated_num_batches,
                          worker.accumulated_num_batches))

        # Pair counters by name, in the sorted order of the master's keys.
        counter_names = sorted(master.accumulated_counters.keys())
        for counter_name in counter_names:
            self.gather_counters.append(
                tf.assign_add(master.accumulated_counters[counter_name],
                              worker.accumulated_counters[counter_name]))

    # step 3: perform gradient step and reset all accumulated values
    self.reset_slave_grads = []
    for worker in self.workers_by_device.values():
        if worker != master:
            self.reset_slave_grads.append(worker.reset_gradients)
    self.reset_slave_counters = []
    for worker in self.workers_by_device.values():
        if worker != master:
            self.reset_slave_counters.append(worker.reset_counters)
def model(features, labels, mode):
    """Estimator model function combining a first-half and a second-half net.

    A "1H" graph predicts the first half.  The "2H" graph is conditioned on
    the actual half-time goals in TRAIN mode and on the predicted first-half
    result otherwise.  Both are merged by ``combine1H2H``.  In PREDICT mode
    an alternative prediction based on ``predictions1H["alt_pred"]`` is
    added under keys prefixed with ``Alt_``.

    Args:
        features: Dict of feature tensors; this function reads ``Where``,
            ``T1_GHT``, ``T2_GHT``, ``T1_GFT`` and ``T2_GFT``.
        labels: Unused here.
        mode: One of ``tf.estimator.ModeKeys``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the given mode.
    """
    tc = constant_tensors()
    (_, tc_home_points_i, tc_away_points_i, calc_poisson_prob, _, _,
     p_fulltime_index_matrix) = tc
    mode_keys = tf.estimator.ModeKeys

    def combine_halves(second_half_predictions):
        # Merge the first-half predictions with the given second-half ones.
        return combine1H2H(predictions1H, second_half_predictions,
                           t_is_home_bool, tc_home_points_i,
                           tc_away_points_i, p_fulltime_index_matrix,
                           calc_poisson_prob)

    with tf.variable_scope("Model"):
        logits1H = buildGraph("1H", features, columns, mode)
        t_is_home_bool = tf.equal(features["Where"], "Home")
        predictions1H = create_predictions(logits1H, t_is_home_bool, tc)

        if mode == mode_keys.TRAIN:
            # Train the second half on the true half-time score.
            half_time_score = tf.stack(
                [features["T1_GHT"], features["T2_GHT"]], axis=1)
        else:
            half_time_score = predictions1H["pred"]
        logits2H = buildGraph("2H", features, columns, mode,
                              half_time_score)
        predictions2H = create_predictions(logits2H, t_is_home_bool, tc)
        predictions = combine_halves(predictions2H)

        logits2H_alt = buildGraph("2H", features, columns, mode,
                                  predictions1H["alt_pred"])
        if mode == mode_keys.PREDICT:
            # Build alternative prediction with 2nd-most likely outcome of 1H
            predictions2H_alt = create_predictions(logits2H_alt,
                                                   t_is_home_bool, tc)
            predictions_alt = combine_halves(predictions2H_alt)
            for key, value in predictions_alt.items():
                predictions['Alt_' + key] = value

    export_outputs = {
        "predictions":
        tf.estimator.export.RegressionOutput(predictions["p_marg_1"])
    }
    if mode == mode_keys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    t1_full = features["T1_GFT"]
    t2_full = features["T2_GFT"]
    t1_half = features["T1_GHT"]
    t2_half = features["T2_GHT"]

    with tf.variable_scope("Evaluation"):
        # Home loss: listed as home and scored less, or listed as away and
        # scored more.
        home_and_fewer = t_is_home_bool & tf.less(t1_full, t2_full)
        away_and_more = (tf.logical_not(t_is_home_bool) &
                         tf.greater(t1_full, t2_full))
        t_is_home_loss_bool = home_and_fewer | away_and_more

        # Home win: the mirrored condition.
        home_and_more = t_is_home_bool & tf.greater(t1_full, t2_full)
        away_and_fewer = (tf.logical_not(t_is_home_bool) &
                          tf.less(t1_full, t2_full))
        t_is_home_win_bool = home_and_more | away_and_fewer

        eval_metric_ops_1H, loss_1H = create_losses_and_metrics_HH(
            "1H", predictions1H, t1_half, t2_half, t_is_home_bool, mode, tc,
            t_is_home_win_bool, t_is_home_loss_bool)
        # Second-half targets are full-time minus half-time goals.
        eval_metric_ops_2H, loss_2H = create_losses_and_metrics_HH(
            "2H", predictions2H, t1_full - t1_half, t2_full - t2_half,
            t_is_home_bool, mode, tc, t_is_home_win_bool,
            t_is_home_loss_bool)

        eval_metric_ops = eval_metric_ops_1H
        eval_metric_ops.update(eval_metric_ops_2H)
        loss = loss_1H + loss_2H

        # softpoints: goals are capped at 6, giving a 7x7 score grid.
        gs = tf.minimum(t1_full, 6)
        gc = tf.minimum(t2_full, 6)
        achievable_points_mask = tf.where(
            t_is_home_bool,
            tf.gather(tc_home_points_i, gs * 7 + gc),
            tf.gather(tc_away_points_i, gs * 7 + gc))
        pt_softpoints = tf.reduce_sum(
            predictions["p_pred_12"] * achievable_points_mask, axis=1)
        eval_metric_ops["pt_softpoints"] = tf.metrics.mean(pt_softpoints)
        loss = loss - tf.reduce_mean(pt_softpoints)

        predicted_scores = predictions["pred"]
        result_metrics = create_result_metrics(predicted_scores[:, 0],
                                               predicted_scores[:, 1],
                                               t1_full, t2_full,
                                               t_is_home_bool)
        eval_metric_ops.update(result_metrics)

    # Every metric's update op is also exported as a scalar summary.
    for key, value in eval_metric_ops.items():
        tf.summary.scalar(key, value[1])

    if mode == mode_keys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=predictions,
                                          loss=loss,
                                          eval_metric_ops=eval_metric_ops)

    global_step = tf.train.get_global_step()
    learning_rate = 1e-2
    print("Learning rate = {}".format(learning_rate))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    minimize_op = optimizer.minimize(loss)
    increment_step_op = tf.assign_add(global_step, 1)
    train = tf.group(minimize_op, increment_step_op)

    summary_op = tf.summary.merge_all()
    summary_hook = tf.train.SummarySaverHook(save_steps=100,
                                             output_dir=model_dir + "/train",
                                             scaffold=None,
                                             summary_op=summary_op)
    return tf.estimator.EstimatorSpec(mode=mode,
                                      predictions=predictions,
                                      loss=loss,
                                      train_op=train,
                                      eval_metric_ops=eval_metric_ops,
                                      training_hooks=[summary_hook])
def train(name, hparams, multi_gpu=False, n_models=1, train_completeness_threshold=0.01,
          seed=None, logdir='data/logs', max_epoch=100, patience=2, train_sampling=1.0,
          eval_sampling=1.0, eval_memsize=5, gpu=0, gpu_allow_growth=False,
          save_best_model=False, forward_split=False, write_summaries=False,
          verbose=False, asgd_decay=None, tqdm=True, side_split=True, max_steps=None,
          save_from_step=None, do_eval=True, predict_window=63, back_offset=0):
    """Build the graph for one or more models and run the training loop.

    Args:
        name: Run name; used for the summary directory ``{logdir}/{name}_{index}``
            and the checkpoint directory ``data/cpt/{name}``.
        hparams: Hyper-parameter object. This function reads ``batch_size``,
            ``train_window``, ``rnn_depth``, ``encoder_rnn_layers`` and
            ``train_skip_first`` from it and forwards it to ``Model``.
        multi_gpu: With ``n_models > 1``, place model ``i`` on ``/gpu:{i}``
            instead of ``/gpu:{gpu}``.
        n_models: Number of models to build and train together.
        train_completeness_threshold: Forwarded to the training ``InputPipe`` as
            both the train and the predict completeness threshold.
        seed: Base random seed. ``None`` leaves everything unseeded; otherwise
            model ``i`` (when ``n_models > 1``) is seeded with ``seed + i``.
        logdir: Root directory for summaries (only used if ``write_summaries``).
        max_epoch: Maximum number of epochs.
        patience: Forwarded to ``ModelTrainerV2``.
        train_sampling: Forwarded to ``Splitter`` (side split only).
        eval_sampling: Forwarded to the splitter as ``test_sampling``.
        eval_memsize: Scales the evaluation batch size.
        gpu: GPU index used when ``multi_gpu`` is false.
        gpu_allow_growth: Forwarded to ``tf.GPUOptions(allow_growth=...)``.
        save_best_model: Save a checkpoint whenever the forward-eval SMAPE
            improves (from the second epoch on).
        forward_split: Build and evaluate the forward evaluation stage.
        write_summaries: Create a ``tf.summary.FileWriter`` per model (any
            existing summary directory is deleted first).
        verbose: Forwarded to every ``InputPipe``.
        asgd_decay: If not ``None``, forwarded to ``Model`` and enables the
            additional EMA evaluation stages.
        tqdm: Show a progress bar for each epoch.
        side_split: Build and evaluate the side evaluation stage.
        max_steps: Stop once ``step`` exceeds this value (ignored if falsy).
        save_from_step: Save a checkpoint at every evaluation point once
            ``step`` reaches this value (ignored if falsy).
        do_eval: Run evaluation stages during training.
        predict_window: Prediction window length forwarded to the pipes.
        back_offset: Forwarded to the training ``InputPipe``.

    Returns:
        ``np.float64`` mean of the list holding the best epoch-average
        forward-eval SMAPE followed by the values of the epochs after it.
    """
    eval_k = int(round(2621 * eval_memsize / n_models))
    # 128 -> 1024, 256->512, 512->256
    eval_batch_size = int(eval_k / (hparams.rnn_depth * hparams.encoder_rnn_layers))
    eval_pct = 0.2
    batch_size = hparams.batch_size
    train_window = hparams.train_window
    tf.reset_default_graph()
    # `is not None` so that an explicit seed of 0 is honoured too.
    if seed is not None:
        tf.set_random_seed(seed)

    with tf.device("/cpu:0"):
        inp = VarFeeder.read_vars("data/vars")
        if side_split:
            splitter = Splitter(ucdoc_features(inp), inp.page_map, 3,
                                train_sampling=train_sampling,
                                test_sampling=eval_sampling, seed=seed)
        else:
            splitter = FakeSplitter(ucdoc_features(inp), 3, seed=seed,
                                    test_sampling=eval_sampling)

    real_train_pages = splitter.splits[0].train_size
    real_eval_pages = splitter.splits[0].test_size

    items_per_eval = real_eval_pages * eval_pct
    eval_batches = int(np.ceil(items_per_eval / eval_batch_size))
    steps_per_epoch = real_train_pages // batch_size
    # Clamp to 1: with few steps per epoch the rounded value would be 0 and
    # `step % eval_every_step` below would raise ZeroDivisionError.
    eval_every_step = max(1, int(round(steps_per_epoch * eval_pct)))

    global_step = tf.train.get_or_create_global_step()
    inc_step = tf.assign_add(global_step, 1)

    all_models: List[ModelTrainerV2] = []

    def create_model(scope, index, prefix, seed):
        """Build input pipes, train/eval models and the trainer for one split."""
        with tf.variable_scope('input') as inp_scope:
            with tf.device("/cpu:0"):
                split = splitter.splits[index]
                pipe = InputPipe(
                    inp, features=split.train_set, n_pages=split.train_size,
                    mode=ModelMode.TRAIN_SKIP_PREDICT, batch_size=batch_size,
                    n_epoch=None, verbose=verbose,
                    train_completeness_threshold=train_completeness_threshold,
                    predict_completeness_threshold=train_completeness_threshold,
                    train_window=train_window, predict_window=predict_window,
                    rand_seed=seed, train_skip_first=hparams.train_skip_first,
                    back_offset=back_offset)
                inp_scope.reuse_variables()
                if side_split:
                    side_eval_pipe = InputPipe(
                        inp, features=split.test_set, n_pages=split.test_size,
                        mode=ModelMode.EVAL, batch_size=eval_batch_size,
                        n_epoch=None, verbose=verbose, predict_window=predict_window,
                        train_completeness_threshold=0.01,
                        predict_completeness_threshold=0,
                        train_window=train_window, rand_seed=seed,
                        runs_in_burst=eval_batches,
                        back_offset=predict_window * (2 if forward_split else 1))
                else:
                    side_eval_pipe = None
                if forward_split:
                    forward_eval_pipe = InputPipe(
                        inp, features=split.test_set, n_pages=split.test_size,
                        mode=ModelMode.EVAL, batch_size=eval_batch_size,
                        n_epoch=None, verbose=verbose, predict_window=predict_window,
                        train_completeness_threshold=0.01,
                        predict_completeness_threshold=0,
                        train_window=train_window, rand_seed=seed,
                        runs_in_burst=eval_batches,
                        back_offset=predict_window)
                else:
                    forward_eval_pipe = None
        avg_sgd = asgd_decay is not None
        train_model = Model(pipe, hparams, is_train=True, graph_prefix=prefix,
                            asgd_decay=asgd_decay, seed=seed)
        # Evaluation models below share the variables created by train_model.
        scope.reuse_variables()

        eval_stages = []
        if side_split:
            side_eval_model = Model(side_eval_pipe, hparams, is_train=False, seed=seed)
            eval_stages.append((Stage.EVAL_SIDE, side_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_SIDE_EMA, side_eval_model))
        if forward_split:
            forward_eval_model = Model(forward_eval_pipe, hparams, is_train=False,
                                       seed=seed)
            eval_stages.append((Stage.EVAL_FRWD, forward_eval_model))
            if avg_sgd:
                eval_stages.append((Stage.EVAL_FRWD_EMA, forward_eval_model))

        if write_summaries:
            summ_path = f"{logdir}/{name}_{index}"
            # Start from a clean summary directory on every run.
            if os.path.exists(summ_path):
                shutil.rmtree(summ_path)
            summ_writer = tf.summary.FileWriter(summ_path)
        else:
            summ_writer = None
        if do_eval and forward_split:
            stop_metric = lambda metrics: metrics[Stage.EVAL_FRWD]['SMAPE'].avg_epoch
        else:
            stop_metric = None
        return ModelTrainerV2(train_model, eval_stages, index, patience=patience,
                              stop_metric=stop_metric, summary_writer=summ_writer)

    if n_models == 1:
        with tf.device(f"/gpu:{gpu}"):
            scope = tf.get_variable_scope()
            all_models = [create_model(scope, 0, None, seed=seed)]
    else:
        for i in range(n_models):
            device = f"/gpu:{i}" if multi_gpu else f"/gpu:{gpu}"
            # seed defaults to None; `seed + i` would raise TypeError then.
            model_seed = None if seed is None else seed + i
            with tf.device(device):
                prefix = f"m_{i}"
                with tf.variable_scope(prefix) as scope:
                    all_models.append(
                        create_model(scope, i, prefix=prefix, seed=model_seed))
    trainer = MultiModelTrainer(all_models, inc_step)

    if save_best_model or save_from_step:
        saver_path = f'data/cpt/{name}'
        if os.path.exists(saver_path):
            shutil.rmtree(saver_path)
        os.makedirs(saver_path)
        saver = tf.train.Saver(max_to_keep=10, name='train_saver')
    else:
        saver = None

    avg_sgd = asgd_decay is not None
    if avg_sgd:
        from itertools import chain

        def ema_vars(model):
            """Map each EMA shadow-variable name to its source variable."""
            ema = model.train_model.ema
            return {ema.average_name(v): v for v in ema._averages}

        ema_names = dict(chain(*[ema_vars(model).items() for model in all_models]))
        # ema_loader reads the saved averages into the regular variables;
        # ema_saver snapshots/restores the full variable set around it.
        ema_loader = tf.train.Saver(var_list=ema_names, max_to_keep=1,
                                    name='ema_loader')
        ema_saver = tf.train.Saver(max_to_keep=1, name='ema_saver')
    else:
        ema_loader = None

    init = tf.global_variables_initializer()

    if forward_split and do_eval:
        eval_smape = trainer.metric(Stage.EVAL_FRWD, 'SMAPE')
        eval_mae = trainer.metric(Stage.EVAL_FRWD, 'MAE')
    else:
        eval_smape = DummyMetric()
        eval_mae = DummyMetric()

    if side_split and do_eval:
        eval_mae_side = trainer.metric(Stage.EVAL_SIDE, 'MAE')
        eval_smape_side = trainer.metric(Stage.EVAL_SIDE, 'SMAPE')
    else:
        eval_mae_side = DummyMetric()
        eval_smape_side = DummyMetric()

    train_smape = trainer.metric(Stage.TRAIN, 'SMAPE')
    train_mae = trainer.metric(Stage.TRAIN, 'MAE')
    grad_norm = trainer.metric(Stage.TRAIN, 'GrNorm')
    eval_stages = []
    ema_eval_stages = []
    if forward_split and do_eval:
        eval_stages.append(Stage.EVAL_FRWD)
        ema_eval_stages.append(Stage.EVAL_FRWD_EMA)
    if side_split and do_eval:
        eval_stages.append(Stage.EVAL_SIDE)
        ema_eval_stages.append(Stage.EVAL_SIDE_EMA)

    # Progress bars are written to the first root-logger handler's stream when
    # there is one; None makes tqdm fall back to its default (stderr).
    log_stream = None
    if tqdm and logging.root.handlers:
        log_stream = getattr(logging.root.handlers[0], 'stream', None)

    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=gpu_allow_growth))
    with tf.Session(config=session_config) as sess:
        sess.run(init)
        inp.restore(sess)
        for model in all_models:
            model.init(sess)
        step = 0
        prev_top = np.inf
        best_smape = np.inf
        # Reported in the per-epoch status line; stays NaN until a step ran.
        last_error = float('nan')
        # Contains best value (first item) and subsequent values
        best_epoch_smape = []

        for epoch in range(max_epoch):
            if tqdm:
                tqr = trange(steps_per_epoch, desc="%2d" % (epoch + 1), leave=False,
                             file=log_stream)
            else:
                tqr = range(steps_per_epoch)

            for _ in tqr:
                try:
                    step = trainer.train_step(sess, epoch)
                    # Extra run on the first model to report a relative error.
                    # The fetch list is kept exactly as before so the session
                    # call itself is unchanged.
                    # NOTE(review): this run presumably pulls another batch
                    # from the input pipe — confirm that is intended.
                    first_model = trainer.trainers[0].train_model
                    fetched = sess.run([
                        first_model.predictions,
                        first_model.inp.time_y,
                        first_model.inp.true_y,
                        first_model.inp.true_x,
                        first_model.inp.time_x,
                        first_model.inp.page_ix,
                        first_model.inp.norm_mean,
                        first_model.inp.norm_std,
                        first_model.inp.lagged_x,
                    ])
                    pred, true_y = fetched[0], fetched[2]
                    pred_exp = np.round(np.expm1(pred))
                    true_exp = np.expm1(true_y)
                    # NOTE(review): divides by true_exp, which is 0 wherever
                    # true_y is 0 — the result can be inf/NaN.
                    last_error = np.mean(np.abs(true_exp - pred_exp) / (true_exp))
                except tf.errors.OutOfRangeError:
                    break

                if step % eval_every_step == 0:
                    if eval_stages:
                        trainer.eval_step(sess, epoch, step, eval_batches,
                                          stages=eval_stages)

                    if save_best_model and epoch > 0 and eval_smape.last < best_smape:
                        best_smape = eval_smape.last
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)
                    if save_from_step and step >= save_from_step:
                        saver.save(sess, f'data/cpt/{name}/cpt', global_step=step)

                    if avg_sgd and ema_eval_stages:
                        ema_saver.save(sess, 'data/cpt_tmp/ema', write_meta_graph=False)
                        # restore ema-backed vars
                        ema_loader.restore(sess, 'data/cpt_tmp/ema')
                        trainer.eval_step(sess, epoch, step, eval_batches,
                                          stages=ema_eval_stages)
                        # restore normal vars
                        ema_saver.restore(sess, 'data/cpt_tmp/ema')

                MAE = "%.3f/%.3f/%.3f" % (eval_mae.last, eval_mae_side.last,
                                          train_mae.last)
                improvement = '↑' if eval_smape.improved else ' '
                SMAPE = "%s%.3f/%.3f/%.3f" % (improvement, eval_smape.last,
                                              eval_smape_side.last, train_smape.last)
                if tqdm:
                    tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE)
                if not trainer.has_active() or (max_steps and step > max_steps):
                    break

            if tqdm:
                tqr.close()
            trainer.end_epoch()
            if not best_epoch_smape or eval_smape.avg_epoch < best_epoch_smape[0]:
                # New best epoch: restart the list with it as the first item.
                best_epoch_smape = [eval_smape.avg_epoch]
            else:
                best_epoch_smape.append(eval_smape.avg_epoch)

            current_top = eval_smape.top
            if prev_top > current_top:
                prev_top = current_top
                has_best_indicator = '↑'
            else:
                has_best_indicator = ' '
            status = "%2d: Best top SMAPE=%.3f%s (%s)" % (
                epoch + 1, current_top, has_best_indicator,
                ",".join(["%.3f" % m.top for m in eval_smape.metrics]))

            if trainer.has_active():
                status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d am ,Error=%3f " % \
                          (eval_mae.best_epoch, eval_mae_side.best_epoch,
                           eval_smape.best_epoch, eval_smape_side.best_epoch,
                           eval_mae.avg_epoch, eval_mae_side.avg_epoch,
                           eval_smape.avg_epoch, eval_smape_side.avg_epoch,
                           trainer.has_active(), last_error)
                log.info(status)
            else:
                log.info(status)
                log.info("Early stopping!")
                break
            if max_steps and step > max_steps:
                log.info("Max steps calculated")
                break
            sys.stderr.flush()

        return np.mean(best_epoch_smape, dtype=np.float64)
def train():
    """Train the tree model, driven entirely by ``FLAGS``.

    Copies the tree index file from the previous model directory to the output
    directory, builds the input pipes, model and trainer, restores the latest
    checkpoint from ``FLAGS.input_previous_model_path`` if one exists, then runs
    training until the input is exhausted or the trainer signals early stop.
    Summaries are written every ``FLAGS.log_frequency`` steps; evaluation and
    checkpointing happen every ``FLAGS.checkpoint_frequency`` steps. Unless the
    trainer reports early stop, a final ``tree_model_final`` checkpoint is saved.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        tf.gfile.Copy(FLAGS.input_previous_model_path + "/" + FLAGS.tree_index_file,
                      FLAGS.output_model_path + "/" + FLAGS.tree_index_file,
                      overwrite=True)
        global_step = tf.train.get_or_create_global_step()
        inc_step = tf.assign_add(global_step, 1)

        # Training setting
        train_files = [
            FLAGS.input_training_data_path + "/" + i
            for i in tf.gfile.ListDirectory(FLAGS.input_training_data_path)
        ]
        train_input_pipe = InputPipe(train_files, FLAGS.batch_size,
                                     FLAGS.num_epochs, 5, "", True)
        auc_eval_pipe = InputPipe(FLAGS.input_validation_data_path,
                                  FLAGS.eval_batch_size, 1, 3, "",
                                  True) if FLAGS.auc_evaluation else None
        model = TreeModel()
        trainer = SingleboxTrainer(model, inc_step, train_input_pipe,
                                   auc_eval_pipe, None)
        summary_op = tf.summary.merge_all()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        saver = tf.train.Saver(max_to_keep=FLAGS.max_model_to_keep,
                               name='model_saver')

        with tf.Session(config=config) as session:
            summ_writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)
            try:
                # Load Pretrain
                session.run(tf.local_variables_initializer())
                session.run(tf.global_variables_initializer())
                session.run(tf.tables_initializer())
                session.run(train_input_pipe.iterator.initializer)
                ckpt = tf.train.get_checkpoint_state(
                    FLAGS.input_previous_model_path)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(session, ckpt.model_checkpoint_path)
                    print("Load Model From ", ckpt.model_checkpoint_path)
                else:
                    print("No Initial Model Found.")

                # Seed `step` from the (possibly restored) global step so the
                # final save below has a value even if the very first training
                # run raises OutOfRangeError.
                # NOTE(review): assumes the `step` returned by train_ops() is
                # the global step — confirm against SingleboxTrainer.
                step = session.run(global_step)

                trainer.start_time = time.time()
                while True:
                    try:
                        _, avg_loss, total_weight, step, summary = session.run(
                            trainer.train_ops() + [summary_op])
                        if step % FLAGS.log_frequency == 1:
                            summ_writer.add_summary(summary, step)
                            trainer.print_log(total_weight, step, avg_loss)
                        if step % FLAGS.checkpoint_frequency == 1:
                            if FLAGS.auc_evaluation:
                                trainer.eval(step, session, 'auc')
                            if FLAGS.bleu_evaluation:
                                trainer.eval(step, session, 'bleu')
                            if trainer.improved():
                                saver.save(session,
                                           FLAGS.output_model_path + "/tree_model",
                                           global_step=step)
                            elif trainer.early_stop():
                                print("\nEarly stop")
                                break
                    except tf.errors.OutOfRangeError:
                        print("End of training.")
                        break
                if not trainer.early_stop():
                    saver.save(session,
                               FLAGS.output_model_path + "/" + "tree_model_final",
                               global_step=step)
            finally:
                # Flush and release the event file even when training fails.
                summ_writer.close()
# Sample an initial h0 from h0_prob and use it as the start of the chain.
# NOTE(review): this looks like a contrastive-divergence update for an RBM
# layer (h* hidden, v* visible) — confirm against the surrounding script.
h0 = sample_prob(h0_prob)
h1 = h0
# Alternate v1 <- sample(sigmoid(h1 . w1^T + vb1^T)) and
# h1 <- sample(sigmoid(v1 . w1 + hb1^T)) for gibbs_sampling_steps rounds.
for step in range(gibbs_sampling_steps):
    v1_prob = tf.nn.sigmoid(
        tf.matmul(h1, tf.transpose(w1)) + tf.transpose(vb1))
    v1 = sample_prob(v1_prob)
    h1_prob = tf.nn.sigmoid(tf.matmul(v1, w1) + tf.transpose(hb1))
    h1 = sample_prob(h1_prob)
# Weight update: difference between the X1/h0_prob product and the
# v1_prob/h1_prob product, divided by the first dimension of X1.
w1_positive_grad = tf.matmul(tf.transpose(X1), h0_prob)
w1_negative_grad = tf.matmul(tf.transpose(v1_prob), h1_prob)
dw1 = (w1_positive_grad - w1_negative_grad) / tf.to_float(tf.shape(X1)[0])
# In-place parameter updates scaled by alpha; the bias updates use the mean
# over axis 0 of (X1 - v1) and (h0 - h1) respectively.
update_w1 = tf.assign_add(w1, alpha * dw1)
update_vb1 = tf.assign_add(vb1, alpha * tf.reduce_mean(X1 - v1, 0))
update_hb1 = tf.assign_add(hb1, alpha * tf.reduce_mean(h0 - h1, 0))
# Grouped so all three updates can be fetched together.
out1 = (update_w1, update_vb1, update_hb1)
# One more h1 -> v1 step, then the mean squared difference between X1 and
# v1_prob as the reported error.
v1_prob = tf.nn.sigmoid(
    tf.matmul(h1, tf.transpose(w1)) + tf.transpose(vb1))
v1 = sample_prob(v1_prob)
err1 = X1 - v1_prob
err_sum1 = tf.reduce_mean(err1 * err1)
initialize1 = tf.global_variables_initializer()
batch_size = 100
def custom_loss(self, y_true, y_pred):
    """Compute the detection loss (coordinates + confidence + class).

    The last axis of both tensors is read as ``[x, y, w, h, conf, classes...]``:
    indices ``0:2`` are box centres, ``2:4`` box sizes, ``4`` the objectness
    flag/score and ``5:`` the class one-hot (``y_true``) or logits (``y_pred``).
    Ground-truth boxes for the "best IoU" test come from ``self.true_boxes``.

    While fewer than ``self.warmup_batches + 1`` batches have been seen, cells
    without a ground-truth box get a target at the cell centre with the anchor
    size, the coordinate mask is all ones, and a constant 10 is added to the
    loss.

    Args:
        y_true: Ground-truth tensor.
            # assumes shape (batch, grid_h, grid_w, nb_box, 5 + n_classes) — TODO confirm
        y_pred: Raw network output with the same layout as ``y_true``.

    Returns:
        Scalar loss tensor. With ``self.debug`` set, the tensor is wrapped in
        ``tf.Print`` ops that log each loss term and the recall statistics.
    """

    def _iou(pred_xy, pred_wh, true_xy, true_wh):
        """IoU of centre/size boxes; arguments broadcast against each other."""
        pred_half = pred_wh / 2.
        true_half = true_wh / 2.
        intersect_mins = tf.maximum(pred_xy - pred_half, true_xy - true_half)
        intersect_maxes = tf.minimum(pred_xy + pred_half, true_xy + true_half)
        intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]
        pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
        true_areas = true_wh[..., 0] * true_wh[..., 1]
        union_areas = pred_areas + true_areas - intersect_areas
        return tf.truediv(intersect_areas, union_areas)

    mask_shape = tf.shape(y_true)[:4]

    # Per-cell (x, y) index grid, tiled over batch and anchor boxes.
    cell_x = tf.to_float(
        tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]),
                   (1, self.grid_h, self.grid_w, 1, 1)))
    cell_y = tf.transpose(cell_x, (0, 2, 1, 3, 4))
    cell_grid = tf.tile(tf.concat([cell_x, cell_y], -1),
                        [self.batch_size, 1, 1, self.nb_box, 1])

    conf_mask = tf.zeros(mask_shape)

    # Counters kept across batches: batches seen and accumulated recall.
    seen = tf.Variable(0.)
    total_recall = tf.Variable(0.)

    # ---- Adjust prediction ----
    # x and y: sigmoid offset inside the cell plus the cell index
    pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid
    # w and h: exponential scaling of the anchors
    pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(
        self.anchors, [1, 1, 1, self.nb_box, 2])
    # confidence
    pred_box_conf = tf.sigmoid(y_pred[..., 4])
    # class logits (softmax is applied inside the cross-entropy below)
    pred_box_class = y_pred[..., 5:]

    # ---- Adjust ground truth ----
    true_box_xy = y_true[..., 0:2]
    true_box_wh = y_true[..., 2:4]
    # confidence target: IoU between prediction and its own ground-truth slot
    iou_scores = _iou(pred_box_xy, pred_box_wh, true_box_xy, true_box_wh)
    true_box_conf = iou_scores * y_true[..., 4]
    # class index
    true_box_class = tf.argmax(y_true[..., 5:], -1)

    # ---- Determine the masks ----
    # coordinate mask: positions of the ground-truth boxes
    coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale

    # confidence mask: penalise predictors, plus boxes whose best IoU with
    # any box in self.true_boxes is below 0.6
    true_xy = self.true_boxes[..., 0:2]
    true_wh = self.true_boxes[..., 2:4]
    pred_xy = tf.expand_dims(pred_box_xy, 4)
    pred_wh = tf.expand_dims(pred_box_wh, 4)
    best_ious = tf.reduce_max(_iou(pred_xy, pred_wh, true_xy, true_wh), axis=4)
    conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (
        1 - y_true[..., 4]) * self.no_object_scale
    # boxes responsible for a ground-truth box
    conf_mask = conf_mask + y_true[..., 4] * self.object_scale

    # class mask: positions of the ground-truth boxes, weighted per class
    class_mask = y_true[..., 4] * tf.gather(
        self.class_wt, true_box_class) * self.class_scale

    # ---- Warm-up training ----
    no_boxes_mask = tf.to_float(coord_mask < self.coord_scale / 2.)
    seen = tf.assign_add(seen, 1.)

    # tf.cond invokes both lambdas while building the op, i.e. before the
    # names on the left are rebound, so they see the pre-warm-up tensors.
    true_box_xy, true_box_wh, coord_mask = tf.cond(
        tf.less(seen, self.warmup_batches + 1),
        lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                 true_box_wh + tf.ones_like(true_box_wh) *
                 np.reshape(self.anchors, [1, 1, 1, self.nb_box, 2]) *
                 no_boxes_mask,
                 tf.ones_like(coord_mask)],
        lambda: [true_box_xy, true_box_wh, coord_mask])

    # ---- Finalize the loss ----
    nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
    nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))
    nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))

    # 1e-6 guards the normalisation when a mask has no active entries.
    loss_xy = tf.reduce_sum(
        tf.square(true_box_xy - pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.
    loss_wh = tf.reduce_sum(
        tf.square(true_box_wh - pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.
    loss_conf = tf.reduce_sum(
        tf.square(true_box_conf - pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.
    loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=true_box_class, logits=pred_box_class)
    loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)

    loss = tf.cond(tf.less(seen, self.warmup_batches + 1),
                   lambda: loss_xy + loss_wh + loss_conf + loss_class + 10,
                   lambda: loss_xy + loss_wh + loss_conf + loss_class)

    if self.debug:
        nb_true_box = tf.reduce_sum(y_true[..., 4])
        nb_pred_box = tf.reduce_sum(
            tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))

        current_recall = nb_pred_box / (nb_true_box + 1e-6)
        total_recall = tf.assign_add(total_recall, current_recall)

        loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000)
        loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
        loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
        loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
        loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
        loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)
        loss = tf.Print(loss, [total_recall / seen], message='Average Recall \t', summarize=1000)

    return loss
def batch_norm_log_diff(input_, dim, name, train=True, epsilon=1e-8, decay=.1,
                        axes=None, reuse=None, bn_lag=DEFAULT_BN_LAG):
    """Return the batch-norm mean and variance for ``input_``.

    Unlike ``batch_norm`` this does not normalise ``input_``; it returns the
    statistics that would be used, so the caller can apply them itself.

    Args:
        input_: Tensor whose moments are taken over ``axes``.
        dim: Size of the ``var`` and ``mean`` variables.
        name: Variable scope (and name-scope prefix) for the variables.
        train: If true, use the current batch moments (optionally blended with
            the stored ones, see ``bn_lag``) and update the stored statistics;
            otherwise use the stored ``mean``/``var`` unchanged.
        epsilon: Constant added to the returned variance.
        decay: Step size of the moving-average update of ``mean`` and ``var``.
        axes: Axes passed to ``tf.nn.moments``/``stable_var``. Defaults to
            ``[0]`` (``None`` is a sentinel that avoids a mutable default).
        reuse: Whether to reuse existing variables; defaults to ``not train``.
        bn_lag: If > 0 during training, the batch moments are blended with the
            stored ones (gradients stopped) and divided by
            ``1 - bn_lag ** (step + 1)``.

    Returns:
        Tuple ``(used_mean, used_var)``; ``used_var`` already includes
        ``epsilon``.
    """
    if axes is None:
        axes = [0]
    if reuse is None:
        reuse = not train
    # create variables
    with tf.variable_scope(name) as scope:
        if reuse:
            scope.reuse_variables()
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu(
            "step", [], tf.constant_initializer(0.), trainable=False)
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_var = stable_var(input_=input_, mean=used_mean, axes=axes)
            cur_var = used_var
            used_mean -= (1 - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(decay * (mean - cur_mean),
                                      "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
        # Zero-valued term that depends on the three assign ops, so that
        # evaluating the returned variance also runs the updates.
        used_var += 0. * new_mean * new_var * new_step
    used_var += epsilon

    return used_mean, used_var
N = settings["NumOptions"] offset = 0 v_select = [] for sample in range(N): v_select.append(v_g[:, sample + offset]) if sample + offset >= dim: continue if np.iscomplex(w_g[sample + offset]): offset += 1 from networks.network import Network #Creating High level policy with tf.device(args.processor): global_step = tf.Variable(0, trainable=False, name='global_step') global_step_next = tf.assign_add(global_step, 1) network = Network(settings["NetworkConfig"], N, netConfigOverride) Method = GetFunction(settings["Method"]) net = Method(network, sess, scope="net", stateShape=dFeatures, actionSize=N, HPs=settings["NetworkHPs"], nTrajs=nTrajs) #Creating Auxilary Functions for logging and saving. writer = tf.summary.FileWriter(LOG_PATH, graph=sess.graph) saver = tf.train.Saver(max_to_keep=3, var_list=net.getVars + [global_step]) net.InitializeVariablesFromFile(saver, MODEL_PATH_) InitializeVariables(
# Mini-batch size; also used below to derive the number of batches.
batch_size = 50
batch_label = np.ones([batch_size, n, 1])
# Graph inputs: x has a free leading dimension and n_his + 1 entries on axis 1.
x = tf.placeholder(tf.float32, [None, n_his + 1, n, 1])
keep_prob = tf.placeholder(tf.float32)
is_training = tf.placeholder(tf.bool)
# Build the model; STGCN returns the training loss tensor.
train_loss = STGCN(x, n, n_his, Ks, Kt, keep_prob, is_training)
# Sum of everything registered in the 'copy_loss' graph collection.
copy_loss = tf.add_n(tf.get_collection('copy_loss'))
global_step = tf.Variable(0, trainable=False)
n_sample = np.shape(xtr)[0]
n_batch = int(n_sample / float(batch_size))
# Learning rate starts at 1e-2 and is multiplied by 0.7 every 5 * n_batch
# steps (staircase, so it changes in discrete jumps).
lr = tf.train.exponential_decay(
    1e-2, global_step, decay_steps=5 * n_batch, decay_rate=0.7, staircase=True)
# global_step is incremented explicitly; the control dependency makes the
# increment run before the ops created inside the block.
step_op = tf.assign_add(global_step, 1)
with tf.control_dependencies([step_op]):
    train_op = tf.train.RMSPropOptimizer(lr).minimize(train_loss)
saver = tf.train.Saver()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Starting values for the best-so-far metric trackers.
# NOTE(review): the 3/6/9 suffixes and the va_ prefix presumably denote
# prediction horizons and validation metrics — confirm against their use.
min_va_mape9 = min_va_mape6 = min_va_mape3 = 0.4
min_va_mse9 = min_va_mse6 = min_va_mse3 = 1e5
min_va_mae9 = min_va_mae6 = min_va_mae3 = 1e5
min_mape9 = min_mape6 = min_mape3 = 0.4
min_mse9 = min_mse6 = min_mse3 = 1e5
min_mae9 = min_mae6 = min_mae3 = 1e5
flag3 = flag6 = flag9 = False
def batch_norm(input_, dim, name, scale=True, train=True, epsilon=1e-8, decay=.1,
               axes=None, bn_lag=DEFAULT_BN_LAG):
    """Batch normalization.

    Args:
        input_: Tensor to normalise; moments are taken over ``axes``.
        dim: Size of the ``var``, ``mean``, ``gamma`` and ``beta`` variables.
        name: Variable scope (and name-scope prefix) for the variables.
        scale: If true, create ``gamma`` and multiply the normalised output
            by it.
        train: If true, use the current batch moments (optionally blended with
            the stored ones, see ``bn_lag``) and update the stored statistics;
            otherwise use the stored ``mean``/``var`` unchanged.
        epsilon: Constant added to the variance before the square root.
        decay: Step size of the moving-average update of ``mean`` and ``var``.
        axes: Axes passed to ``tf.nn.moments``. Defaults to ``[0]`` (``None``
            is a sentinel that avoids a mutable default).
        bn_lag: If > 0 during training, the batch moments are blended with the
            stored ones (gradients stopped) and divided by
            ``1 - bn_lag ** (step + 1)``.

    Returns:
        The normalised (and shifted, optionally scaled) tensor.
    """
    if axes is None:
        axes = [0]
    # create variables
    with tf.variable_scope(name):
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu(
            "step", [], tf.constant_initializer(0.), trainable=False)
        if scale:
            gamma = variable_on_cpu("gamma", [dim], tf.constant_initializer(1.))
        beta = variable_on_cpu("beta", [dim], tf.constant_initializer(0.))
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_mean -= (1. - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # normalize
    res = (input_ - used_mean) / tf.sqrt(used_var + epsilon)
    # de-normalize
    if scale:
        res *= gamma
    res += beta

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(decay * (mean - cur_mean),
                                      "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
        # Zero-valued term that depends on the three assign ops, so that
        # evaluating the result also runs the updates.
        res += 0. * new_mean * new_var * new_step

    return res
#try finally import tensorflow as tf array_to_outof = tf.constant([1, 2, 3, 4, 5, 6, 7, 8]) counter = tf.Variable(0) x = tf.Variable(1.) w = tf.Variable(2.) op = tf.multiply(w, x) count_op = tf.assign_add(counter, 1) coord = tf.train.Coordinator() sess = tf.Session() with sess.as_default(): sess.run(tf.global_variables_initializer()) """Start Training""" threads = tf.train.start_queue_runners(coord=coord) try: while not coord.should_stop(): print('in while-loop:') op_result_ = sess.run([op]) print(op_result_) counter_ = sess.run(count_op) print('counter_:', counter_) array_to_outof[counter_] # except tf.errors.OutOfRangeError: # print('except') except ValueError:
def __init__(self):
    """Read the training configuration and build the full training graph.

    Creates the session, input placeholders, model and losses, the learning
    rate schedule, two alternative train ops (output-layer variables only,
    and all variables), the savers and the summary writer.

    Side effect: the directory ``./data/log/`` is deleted (if present) and
    re-created.
    """
    # --- configuration ---
    self.anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE
    self.classes = utils.read_class_names(cfg.YOLO.CLASSES)
    self.num_classes = len(self.classes)
    self.learn_rate_init = cfg.TRAIN.LEARN_RATE_INIT
    self.learn_rate_end = cfg.TRAIN.LEARN_RATE_END
    # "FISRT" is the spelling of the config key itself.
    self.first_stage_epochs = cfg.TRAIN.FISRT_STAGE_EPOCHS
    self.second_stage_epochs = cfg.TRAIN.SECOND_STAGE_EPOCHS
    self.warmup_periods = cfg.TRAIN.WARMUP_EPOCHS
    self.initial_weight = cfg.TRAIN.INITIAL_WEIGHT
    self.time = time.strftime('%Y-%m-%d-%H-%M-%S',
                              time.localtime(time.time()))
    self.moving_ave_decay = cfg.YOLO.MOVING_AVE_DECAY
    self.max_bbox_per_scale = 150
    self.train_logdir = "./data/log/train"
    self.trainset = Dataset('train')
    self.testset = Dataset('test')
    # len(trainset) is used as the number of steps in one period (epoch).
    self.steps_per_period = len(self.trainset)
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    # --- placeholders (no static shapes are declared) ---
    with tf.name_scope('define_input'):
        self.input_data = tf.placeholder(dtype=tf.float32, name='input_data')
        self.label_sbbox = tf.placeholder(dtype=tf.float32, name='label_sbbox')
        self.label_mbbox = tf.placeholder(dtype=tf.float32, name='label_mbbox')
        self.label_lbbox = tf.placeholder(dtype=tf.float32, name='label_lbbox')
        self.true_sbboxes = tf.placeholder(dtype=tf.float32, name='sbboxes')
        self.true_mbboxes = tf.placeholder(dtype=tf.float32, name='mbboxes')
        self.true_lbboxes = tf.placeholder(dtype=tf.float32, name='lbboxes')
        self.trainable = tf.placeholder(dtype=tf.bool, name='training')

    # --- model and loss ---
    with tf.name_scope("define_loss"):
        self.model = YOLOV3(self.input_data, self.trainable)
        # Snapshot of the global variables right after model construction,
        # i.e. before the step counter, EMA and optimizer variables exist.
        # Used by self.loader below.
        self.net_var = tf.global_variables()
        self.giou_loss, self.conf_loss, self.prob_loss = self.model.compute_loss(
            self.label_sbbox, self.label_mbbox, self.label_lbbox,
            self.true_sbboxes, self.true_mbboxes, self.true_lbboxes)
        self.loss = self.giou_loss + self.conf_loss + self.prob_loss

    # --- learning rate: linear warm-up, then cosine decay ---
    with tf.name_scope('learn_rate'):
        # Float step counter starting at 1.0.
        self.global_step = tf.Variable(1.0, dtype=tf.float64, trainable=False,
                                       name='global_step')
        warmup_steps = tf.constant(self.warmup_periods * self.steps_per_period,
                                   dtype=tf.float64, name='warmup_steps')
        train_steps = tf.constant(
            (self.first_stage_epochs + self.second_stage_epochs) *
            self.steps_per_period,
            dtype=tf.float64, name='train_steps')
        # Before warmup_steps: global_step / warmup_steps * learn_rate_init.
        # Afterwards: cosine interpolation from learn_rate_init down to
        # learn_rate_end over the remaining (train_steps - warmup_steps).
        self.learn_rate = tf.cond(
            pred=self.global_step < warmup_steps,
            true_fn=lambda: self.global_step / warmup_steps * self.learn_rate_init,
            false_fn=lambda: self.learn_rate_end + 0.5 *
            (self.learn_rate_init - self.learn_rate_end) * (1 + tf.cos(
                (self.global_step - warmup_steps) /
                (train_steps - warmup_steps) * np.pi)))
        global_step_update = tf.assign_add(self.global_step, 1.0)

    # Despite the scope name, this builds an exponential moving average of
    # the trainable variables.
    with tf.name_scope("define_weight_decay"):
        moving_ave = tf.train.ExponentialMovingAverage(
            self.moving_ave_decay).apply(tf.trainable_variables())

    # --- stage 1: optimise only variables under the three output scopes ---
    with tf.name_scope("define_first_stage_train"):
        self.first_stage_trainable_var_list = []
        for var in tf.trainable_variables():
            var_name = var.op.name
            var_name_mess = str(var_name).split('/')
            # Select by the first component of the variable's op name.
            if var_name_mess[0] in ['conv_sbbox', 'conv_mbbox', 'conv_lbbox']:
                self.first_stage_trainable_var_list.append(var)

        first_stage_optimizer = tf.train.AdamOptimizer(
            self.learn_rate).minimize(
                self.loss, var_list=self.first_stage_trainable_var_list)
        # Nested control dependencies: the no_op depends on the UPDATE_OPS
        # collection, the optimizer step, the step increment and the EMA
        # update, so running it triggers all of them.
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            with tf.control_dependencies(
                    [first_stage_optimizer, global_step_update]):
                with tf.control_dependencies([moving_ave]):
                    self.train_op_with_frozen_variables = tf.no_op()

    # --- stage 2: optimise every trainable variable ---
    with tf.name_scope("define_second_stage_train"):
        second_stage_trainable_var_list = tf.trainable_variables()
        second_stage_optimizer = tf.train.AdamOptimizer(
            self.learn_rate).minimize(
                self.loss, var_list=second_stage_trainable_var_list)
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            with tf.control_dependencies(
                    [second_stage_optimizer, global_step_update]):
                with tf.control_dependencies([moving_ave]):
                    self.train_op_with_all_variables = tf.no_op()

    with tf.name_scope('loader_and_saver'):
        # loader covers only the variables captured in self.net_var;
        # saver covers all global variables existing at this point.
        self.loader = tf.train.Saver(self.net_var)
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)

    with tf.name_scope('summary'):
        tf.summary.scalar("learn_rate", self.learn_rate)
        tf.summary.scalar("giou_loss", self.giou_loss)
        tf.summary.scalar("conf_loss", self.conf_loss)
        tf.summary.scalar("prob_loss", self.prob_loss)
        tf.summary.scalar("total_loss", self.loss)

        # Any previous log directory is wiped before the writer is created.
        # NOTE(review): os.mkdir requires the parent ./data to exist.
        logdir = "./data/log/"
        if os.path.exists(logdir):
            shutil.rmtree(logdir)
        os.mkdir(logdir)
        self.write_op = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(logdir,
                                                    graph=self.sess.graph)
def call(self, x):
    """Build the detection loss for one output scale.

    Args:
        x: sequence ``(input_image, y_pred, y_true, true_boxes)``.
            ``y_pred`` is reshaped below to
            ``[batch, grid_h, grid_w, 3, 4+1+nb_class]``; ``y_true`` is
            indexed with the same last-axis layout (xy, wh, objectness,
            classes). ``true_boxes`` is presumably the list of all
            ground-truth boxes per image, shaped to broadcast against the
            predictions -- TODO confirm against the caller.

    Returns:
        The summed squared-error loss (reduced over axes 1..4), multiplied
        by ``self.scale``.
    """
    input_image, y_pred, y_true, true_boxes = x
    # adjust the shape of the y_predict [batch, grid_h, grid_w, 3, 4+1+nb_class]
    y_pred = tf.reshape(
        y_pred,
        tf.concat([tf.shape(y_pred)[:3],
                   tf.constant([3, -1])], axis=0))
    # initialize the masks: 1 where a ground-truth object is assigned
    # (channel 4 of y_true), kept as a trailing axis for broadcasting
    object_mask = tf.expand_dims(y_true[..., 4], 4)
    # the variable to keep track of number of batches processed
    # NOTE: a fresh variable is created on every invocation of call()
    batch_seen = tf.Variable(0.)
    # compute grid factor and net factor (both stored as [w, h])
    grid_h = tf.shape(y_true)[1]
    grid_w = tf.shape(y_true)[2]
    grid_factor = tf.reshape(tf.cast([grid_w, grid_h], tf.float32),
                             [1, 1, 1, 1, 2])
    net_h = tf.shape(input_image)[1]
    net_w = tf.shape(input_image)[2]
    net_factor = tf.reshape(tf.cast([net_w, net_h], tf.float32),
                            [1, 1, 1, 1, 2])
    """ Adjust prediction """
    pred_box_xy = (self.cell_grid[:, :grid_h, :grid_w, :, :] +
                   tf.sigmoid(y_pred[..., :2]))  # sigma(t_xy) + c_xy
    pred_box_wh = y_pred[..., 2:4]  # t_wh
    pred_box_conf = tf.expand_dims(tf.sigmoid(y_pred[..., 4]),
                                   4)  # adjust confidence
    pred_box_class = tf.sigmoid(y_pred[..., 5:])  # adjust class probabilities
    """ Adjust ground truth """
    true_box_xy = y_true[..., 0:2]  # (sigma(t_xy) + c_xy)
    true_box_wh = y_true[..., 2:4]  # t_wh
    true_box_conf = tf.expand_dims(y_true[..., 4], 4)
    true_box_class = y_true[..., 5:]
    """ Compare each predicted box to all true boxes """
    # initially, drag all objectness of all boxes to 0
    conf_delta = pred_box_conf - 0
    # then, ignore the boxes which have good overlap with some true box
    # (xy is normalised by the grid size, wh by the network input size)
    true_xy = true_boxes[..., 0:2] / grid_factor
    true_wh = true_boxes[..., 2:4] / net_factor
    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half
    # extra axis 4 so each prediction is compared against every true box
    pred_xy = tf.expand_dims(pred_box_xy / grid_factor, 4)
    pred_wh = tf.expand_dims(
        tf.exp(pred_box_wh) * self.anchors / net_factor, 4)
    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half
    intersect_mins = tf.maximum(pred_mins, true_mins)
    intersect_maxes = tf.minimum(pred_maxes, true_maxes)
    intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = tf.truediv(intersect_areas, union_areas)
    # best IoU over axis 4; predictions at or above ignore_thresh get a
    # zero no-object confidence delta
    best_ious = tf.reduce_max(iou_scores, axis=4)
    conf_delta *= tf.expand_dims(
        tf.to_float(best_ious < self.ignore_thresh), 4)
    """ Compute some online statistics """
    # The names below deliberately shadow the ones used above; this time
    # the IoU is between each prediction and the y_true box of its own cell.
    # The resulting statistics are only referenced by the commented-out
    # tf.Print calls at the end of this method.
    true_xy = true_box_xy / grid_factor
    true_wh = tf.exp(true_box_wh) * self.anchors / net_factor
    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half
    pred_xy = pred_box_xy / grid_factor
    pred_wh = tf.exp(pred_box_wh) * self.anchors / net_factor
    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half
    intersect_mins = tf.maximum(pred_mins, true_mins)
    intersect_maxes = tf.minimum(pred_maxes, true_maxes)
    intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]
    true_areas = true_wh[..., 0] * true_wh[..., 1]
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = tf.truediv(intersect_areas, union_areas)
    iou_scores = object_mask * tf.expand_dims(iou_scores, 4)
    count = tf.reduce_sum(object_mask)
    count_noobj = tf.reduce_sum(1 - object_mask)
    detect_mask = tf.to_float(pred_box_conf * object_mask >= 0.5)
    class_mask = tf.expand_dims(
        tf.to_float(
            tf.equal(tf.argmax(pred_box_class, -1),
                     tf.argmax(true_box_class, -1))), 4)
    recall50 = tf.to_float(iou_scores >= 0.5) * detect_mask
    recall75 = tf.to_float(iou_scores >= 0.75) * detect_mask
    # the 1e-3 terms keep the averages finite when a count is zero
    recall50_c = tf.reduce_sum(recall50 * class_mask) / (count + 1e-3)
    recall75_c = tf.reduce_sum(recall75 * class_mask) / (count + 1e-3)
    recall50 = tf.reduce_sum(recall50) / (count + 1e-3)
    recall75 = tf.reduce_sum(recall75) / (count + 1e-3)
    avg_iou = tf.reduce_sum(iou_scores) / (count + 1e-3)
    avg_obj = tf.reduce_sum(pred_box_conf * object_mask) / (count + 1e-3)
    avg_noobj = tf.reduce_sum(pred_box_conf *
                              (1 - object_mask)) / (count_noobj + 1e-3)
    avg_cat = tf.reduce_sum(
        pred_box_class * true_box_class) / (count + 1e-3)
    """ Warm-up training """
    batch_seen = tf.assign_add(batch_seen, 1.)
    # While batch_seen <= warmup_batches, cells without an object are given
    # a target at the cell centre (cell_grid + 0.5) with unchanged wh, and
    # the xy/wh loss is applied to every cell (mask of ones).
    true_box_xy, true_box_wh, xywh_mask = tf.cond(
        tf.less(batch_seen, self.warmup_batches + 1), lambda: [
            true_box_xy +
            (0.5 + self.cell_grid[:, :grid_h, :grid_w, :, :]) *
            (1 - object_mask), true_box_wh + tf.zeros_like(true_box_wh) *
            (1 - object_mask),
            tf.ones_like(object_mask)
        ], lambda: [true_box_xy, true_box_wh, object_mask])
    """ Compare each true box to all anchor boxes """
    xywh_scale = tf.exp(true_box_wh) * self.anchors / net_factor
    xywh_scale = tf.expand_dims(
        2 - xywh_scale[..., 0] * xywh_scale[..., 1],
        axis=4)  # the smaller the box, the bigger the scale
    xy_delta = xywh_mask * (pred_box_xy - true_box_xy) * xywh_scale
    wh_delta = xywh_mask * (pred_box_wh - true_box_wh) * xywh_scale
    # objectness error is weighted by 5 on object cells; elsewhere the
    # (possibly zeroed) no-object delta computed above is used
    conf_delta = object_mask * (pred_box_conf - true_box_conf) * 5 + (
        1 - object_mask) * conf_delta
    class_delta = object_mask * (pred_box_class - true_box_class)
    # sum of squared errors over every axis except the batch axis
    loss = tf.reduce_sum(tf.square(xy_delta), list(range(1,5))) + \
           tf.reduce_sum(tf.square(wh_delta), list(range(1,5))) + \
           tf.reduce_sum(tf.square(conf_delta), list(range(1,5))) + \
           tf.reduce_sum(tf.square(class_delta), list(range(1,5)))
    loss = tf.cond(
        tf.less(batch_seen, self.warmup_batches + 1),
        # add 10 to the loss if this is the warmup stage
        lambda: loss + 10,
        lambda: loss)
    # loss = tf.Print(loss, [grid_h, avg_obj], message='avg_obj \t\t', summarize=1000)
    # loss = tf.Print(loss, [grid_h, avg_noobj], message='avg_noobj \t\t', summarize=1000)
    # loss = tf.Print(loss, [grid_h, avg_iou], message='avg_iou \t\t', summarize=1000)
    # loss = tf.Print(loss, [grid_h, avg_cat], message='avg_cat \t\t', summarize=1000)
    # loss = tf.Print(loss, [grid_h, recall50], message='recall50 \t', summarize=1000)
    # loss = tf.Print(loss, [grid_h, recall75], message='recall75 \t', summarize=1000)
    # loss = tf.Print(loss, [grid_h, recall50_c], message='recall50_cat \t', summarize=1000)
    # loss = tf.Print(loss, [grid_h, recall75_c], message='recall75_Cat \t', summarize=1000)
    # loss = tf.Print(loss, [grid_h, count], message='count \t', summarize=1000)
    # loss = tf.Print(loss, [grid_h, tf.reduce_sum(loss)], message='loss: \t', summarize=1000)
    return loss * self.scale
# add tensor to tensorboard ns_image_tb = tf.summary.image(name='ns_image', tensor=tf.reshape(ns, shape=[1, 520, 600, 1])) ns_mean_tb = tf.summary.scalar(name='ns_mean', tensor=tf.reduce_mean(tf.reduce_mean(ns))) ns_hist_tb = tf.summary.histogram(name='ns_hist', values=ns) zs_squre = tf.multiply(zs, zs) zs_add = tf.add(zs_squre, xs) zs_abs = tf.abs(zs_add) zs_less = tf.math.less(zs_abs, 4) zs_cast = tf.cast(zs_less, tf.float32) # step = tf.group(tf.assign(zs, zs_add), tf.assign_add(ns, zs_cast), name='step') # sess = tf.Session() sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() tbs = tf.summary.merge_all() writer = tf.summary.FileWriter(logdir='./tensorboard') writer.add_graph(tf.get_default_graph()) for i in range(200): _, tbs_, cast_ = sess.run([step, tbs, zs_cast]) writer.add_summary(tbs_, global_step=i) saver.save(sess, save_path='./model/mandelbrot')
train_step = tf.train.AdagradOptimizer( learning_rate=opt.hyper.learning_rate / 10).minimize( total_loss, var_list=all_var) elif opt.optimizer == 4: train_step = tf.train.ProximalAdagradOptimizer(learning_rate=opt.hyper.learning_rate/10). \ minimize(total_loss, var_list=all_var) elif opt.optimizer == 5: train_step = tf.train.RMSPropOptimizer( learning_rate=opt.hyper.learning_rate / 10).minimize( total_loss, var_list=all_var) elif opt.optimizer == 6: train_step = tf.train.FtrlOptimizer( learning_rate=opt.hyper.learning_rate / 10).minimize( total_loss, var_list=all_var) inc_global_step = tf.assign_add(global_step, 1, name='increment') raw_grads = tf.gradients(total_loss, all_var) grads = list(zip(raw_grads, tf.trainable_variables())) for g, v in grads: summary.gradient_summaries(g, v, opt) ################################################################################################ ################################################################################################ # Set up checkpoints and data ################################################################################################ saver = tf.compat.v1.train.Saver(max_to_keep=opt.max_to_keep_checkpoints) # Automatic restore model, or force train from scratch
def _finish(self, update_ops, name_scope):
    """Return the op that bumps the step counter once all updates have run.

    Args:
        update_ops: ops that must execute before the counter is incremented.
        name_scope: accepted for interface compatibility; not used here.

    Returns:
        The ``tf.assign_add`` op adding 1 to ``self._step_count``.
    """
    after_updates = tf.control_dependencies(update_ops)
    with after_updates:
        step_increment = tf.assign_add(self._step_count, 1)
    return step_increment
def train(train_imgs,train_labels,val_imgs,val_labels):
    """Train (or fine-tune) the classifier and save a checkpoint.

    If a checkpoint exists under ``../model`` the run resumes from it with
    the fine-tuning hyper-parameters (fewer epochs, small learning rate with
    plain exponential decay); otherwise training starts from scratch with a
    warm-up / linear-transition / exponential-decay learning-rate schedule.

    Args:
        train_imgs: paths of the training JPEG images.
        train_labels: integer class labels matching ``train_imgs``.
        val_imgs: paths of the validation JPEG images.
        val_labels: integer class labels matching ``val_imgs``.

    Side effects:
        Writes TensorBoard summaries to ``../model/tensorboard`` and saves
        the final weights to ``../model/model_1.ckpt``.
    """
    TB_LOG_DIR=os.path.join('..','model')
    ckpt=tf.train.get_checkpoint_state(TB_LOG_DIR)
    # get_checkpoint_state() returns None when there is no checkpoint, so the
    # path may only be inspected after the None check succeeded.
    has_train=bool(ckpt and ckpt.model_checkpoint_path)

    # settings shared by both modes
    #dataset param
    SHUFFLE_SZ=1000
    BATCH_SZ=200
    #model param
    OUTPUT_CNS=[24,48,96,192,1024]
    CLASS_NUM=200
    WEIGHT_DECAY=4e-5
    #training param
    MOMENTUM=0.9
    #validation
    VAL_SZ=10000
    if not has_train:
        # training from scratch
        EPOCHS=150
        WARM_UP_LR=0.002
        LEARNING_RATE=0.5
        LEARNING_RATE_DECAY=0.95
    else:
        # fine-tuning a restored model
        EPOCHS=50
        WARM_UP_LR=0.0005
        LEARNING_RATE=0.0005
        LEARNING_RATE_DECAY=0.9
    TOTAL_STEPS=EPOCHS*100000//BATCH_SZ
    LEARNING_RATE_STEPS=TOTAL_STEPS//100
    #display
    DISPLAY_STEP=TOTAL_STEPS//100

    imgpaths=tf.convert_to_tensor(train_imgs)
    labels=tf.convert_to_tensor(train_labels)
    valimgpaths=tf.convert_to_tensor(val_imgs)
    vallabels=tf.convert_to_tensor(val_labels)

    def _parse_function(imgpath,label):
        # read and decode one 64x64 RGB JPEG as float32
        img=tf.read_file(imgpath)
        img_decoded=tf.image.decode_jpeg(img,3)
        img_decoded.set_shape([64,64,3])
        img_decoded=tf.cast(img_decoded,dtype=tf.float32)
        return img_decoded,label

    dataset=tf.data.Dataset.from_tensor_slices((imgpaths,labels)).map(_parse_function)
    dataset=dataset.shuffle(buffer_size=SHUFFLE_SZ)
    dataset=dataset.repeat(EPOCHS)
    dataset=dataset.batch(BATCH_SZ)
    iterator=dataset.make_initializable_iterator()
    batch_imgs,batch_labels=iterator.get_next()

    valset=tf.data.Dataset.from_tensor_slices((valimgpaths,vallabels)).map(_parse_function)
    valset=valset.batch(VAL_SZ)
    # The validation iterator must come from `valset`; building it from
    # `dataset` would report training batches as validation accuracy.
    valiterator=valset.make_initializable_iterator()
    valbatch_imgs,valbatch_labels=valiterator.get_next()

    initial=tf.variance_scaling_initializer()
    regular=tf.contrib.layers.l2_regularizer(1.0)
    logits=model(batch_imgs,OUTPUT_CNS,CLASS_NUM,True,regular,initial)
    with tf.name_scope('loss'):
        loss=tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=batch_labels))
        reg=tf.losses.get_regularization_loss()
        loss+=WEIGHT_DECAY*reg
    with tf.name_scope('train'):
        global_step=tf.get_variable('step',shape=[],trainable=False,
                                    initializer=tf.zeros_initializer(dtype=tf.int64))

        def get_lr(global_step,total_step,base_lr,warm_up_lr):
            # Three phases: constant warm-up rate for the first 1/20 of the
            # steps, a linear ramp to base_lr over the next 1/20, then
            # exponential decay for the remainder.
            warm_up_total_step=total_step//20
            transition_total_step=warm_up_total_step
            remain_total_step=total_step-warm_up_total_step-transition_total_step
            transition_dlrt=tf.convert_to_tensor((1.0*base_lr-warm_up_lr)/transition_total_step,dtype=tf.float32)
            base_lrt=tf.convert_to_tensor(base_lr,dtype=tf.float32)
            warm_up_lrt=tf.convert_to_tensor(warm_up_lr,dtype=tf.float32)
            warm_up_total_step=tf.convert_to_tensor(warm_up_total_step,dtype=tf.float32)
            transition_total_step=tf.convert_to_tensor(transition_total_step,dtype=tf.float32)
            remain_total_step=tf.convert_to_tensor(remain_total_step,dtype=tf.float32)
            transition_lr=(tf.cast(global_step,tf.float32)-warm_up_total_step)*transition_dlrt+warm_up_lrt
            remain_lr=tf.train.exponential_decay(base_lrt,tf.cast(global_step,tf.float32)-warm_up_total_step-transition_total_step,
                                                 remain_total_step//120 ,LEARNING_RATE_DECAY)
            lr=tf.case({tf.less(global_step,warm_up_total_step): lambda:warm_up_lrt,
                        tf.greater(global_step,transition_total_step+warm_up_total_step): lambda:remain_lr},
                       default=lambda:transition_lr,exclusive=True)
            return lr

        if not has_train:
            learning_rate=get_lr(global_step,TOTAL_STEPS,LEARNING_RATE,WARM_UP_LR)
        else:
            learning_rate=tf.train.exponential_decay(LEARNING_RATE,global_step,LEARNING_RATE_STEPS,LEARNING_RATE_DECAY)
        # run the collected update ops together with every optimizer step
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op=tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                momentum=MOMENTUM).minimize(loss)
        # running this op performs one optimizer step, then bumps the counter
        with tf.control_dependencies([train_op]):
            global_step_update=tf.assign_add(global_step,1)
    if not has_train:
        init=tf.global_variables_initializer()
    with tf.name_scope('batch_train_accuracy'):
        logits_train=model(batch_imgs,OUTPUT_CNS,CLASS_NUM,False,regular,initial)
        correct_pred_train=tf.equal(tf.cast(tf.argmax(logits_train,1),dtype=tf.int32),batch_labels)
        accuracy_train=tf.reduce_mean(tf.cast(correct_pred_train,tf.float32))
    with tf.name_scope('val_accuracy'):
        logits_val=model(valbatch_imgs,OUTPUT_CNS,CLASS_NUM,False,regular,initial)
        correct_pred_val=tf.equal(tf.cast(tf.argmax(logits_val,1),dtype=tf.int32),valbatch_labels)
        accuracy_val=tf.reduce_mean(tf.cast(correct_pred_val,tf.float32))

    sess=tf.Session()
    if not has_train:
        sess.run(init)
    else:
        saver=tf.train.Saver()
        saver.restore(sess,ckpt.model_checkpoint_path)
    sess.run(iterator.initializer)

    tf.summary.scalar('loss',loss)
    tf.summary.scalar('batch_train_accuracy',accuracy_train)
    tf.summary.scalar('val_accuracy',accuracy_val)
    tf.summary.scalar('learning_rate',learning_rate)
    tb_merge_summary_op=tf.summary.merge_all()
    summary_writer=tf.summary.FileWriter(os.path.join(TB_LOG_DIR,'tensorboard'),graph=sess.graph)

    saver=tf.train.Saver()
    # the step counter restarts from zero, also after restoring a checkpoint
    sess.run(tf.assign(global_step,0.0))
    for step in range(1,TOTAL_STEPS+1):
        try:
            #_,print_step=sess.run(train_op)
            sess.run(global_step_update)
        except tf.errors.OutOfRangeError:
            # training data exhausted (all EPOCHS consumed)
            break
        if step%DISPLAY_STEP==0 or step==1:
            # rewind the validation set before every evaluation
            sess.run(valiterator.initializer)
            l,acct,accv,lr,summary_str=sess.run([loss,accuracy_train,accuracy_val,learning_rate,tb_merge_summary_op])
            summary_writer.add_summary(summary_str,step)
            print("epoch {:d} steps {:d}: loss={:.4f}, accuracy_batch_train={:.4f}, accuracy_val={:.4f}, learning_rate={:.5f}".format(
                step//(TOTAL_STEPS//EPOCHS),step,l,acct,accv,lr))
    summary_writer.close()
    saver.save(sess,os.path.join(TB_LOG_DIR,'model_1.ckpt'))
def _assign_moving_average(orig_val, new_val, momentum, name):
    """Move ``orig_val`` towards ``new_val`` by a factor of ``1 - momentum``.

    Args:
        orig_val: variable holding the running average; updated in place.
        new_val: the newly observed value.
        momentum: weight kept on the old value.
        name: name scope under which the ops are created.

    Returns:
        The ``tf.assign_add`` op that applies the update.
    """
    with tf.name_scope(name):
        update_weight = 1 - momentum
        gap = new_val - orig_val
        increment = update_weight * gap
        return tf.assign_add(orig_val, increment)
def __init__(self, debug=False, **kwargs):
    """Build the learner's training graph.

    Depending on the ``train_model`` / ``train_policy`` / ``train_critic`` /
    ``predict`` / ``explore_scale`` settings this creates the predictive
    model, the policy and the critic, defines the experience placeholders,
    sums the enabled losses into one Adam train op with per-gradient norm
    clipping, and assembles the miscellaneous ops (running reward average,
    evolution mutators, global step increment, summaries).

    Args:
        debug: not referenced anywhere in this constructor.
        **kwargs: forwarded to the parent constructor, ``_init_model``,
            ``_init_policy`` and ``Critic``.
    """
    super(Learner, self).__init__(**kwargs)
    with self.graph.as_default(), tf.device(self.device):
        # initialize predictive model, if either:
        # * you want to use the predictive model to "undo delay"
        # * you want a predictive model to help you explore
        # note: self.predict is perhaps a misnomer.
        if self.predict or (self.train_model or self.explore_scale):
            self._init_model(**kwargs)
        if self.train_policy:
            self._init_policy(**kwargs)
        # build computation graph
        # to train the the policy, you have to train the critic. (self.train_policy and
        # self.train_critic might both be false, if we're only training the predictive
        # model)
        if self.train_policy or self.train_critic:
            print("Creating critic.")
            self.critic = Critic(self.core.output_size, **kwargs)
        # experience = trajectory. usually a list of SimpleStateAction's.
        self.experience = ct.inputCType(
            ssbm.SimpleStateAction, [None, self.config.experience_length],
            "experience")
        # instantaneous rewards for all but the last state
        self.experience['reward'] = tf.placeholder(
            tf.float32, [None, self.config.experience_length - 1],
            name='experience/reward')
        # manipulating time along the first axis is much more efficient
        experience = util.deepMap(tf.transpose, self.experience)
        # initial state for recurrent networks
        # (added after the transpose above, so it is shared untransposed)
        self.experience['initial'] = tuple(
            tf.placeholder(tf.float32, [None, size],
                           name='experience/initial/%d' % i)
            for i, size in enumerate(self.core.hidden_size))
        experience['initial'] = self.experience['initial']
        states = self.embedGame(experience['state'])
        prev_actions = self.embedAction(experience['prev_action'])
        combined = tf.concat(axis=2, values=[states, prev_actions])
        actions = self.embedAction(experience['action'])
        memory = self.config.memory
        delay = self.config.delay
        length = self.config.experience_length - memory
        # memory + 1 time-shifted windows of the input, concatenated on the
        # feature axis so each step also sees the `memory` preceding frames
        history = [combined[i:i + length] for i in range(memory + 1)]
        inputs = tf.concat(axis=-1, values=history)
        if self.core.recurrent:

            def f(prev, current_input):
                # scan step: carry only the hidden state forward
                _, prev_state = prev
                return self.core(current_input, prev_state)

            batch_size = tf.shape(self.experience['reward'])[0]
            # placeholder output so the scan accumulator has the right structure
            dummy_output = tf.zeros(
                tf.stack([batch_size,
                          tf.constant(self.core.output_size)]))
            scan_fn = tf.scan if self.dynamic else tfl.scan
            core_outputs, hidden_states = scan_fn(
                f, inputs, (dummy_output, experience['initial']))
        else:
            core_outputs, hidden_states = self.core(
                inputs, experience['initial'])
        # drop the first `memory` steps, which have no full history window
        actions = actions[memory:]
        rewards = experience['reward'][memory:]
        print("Creating train ops")
        train_ops = []
        losses = []
        # NOTE(review): loss_vars is filled below but never read in this
        # constructor; compute_gradients() is called without a var_list.
        loss_vars = []
        if self.train_model or self.predict:
            model_loss, predicted_core_outputs = self.model.train(
                history, core_outputs, hidden_states, actions,
                experience['state'])
        if self.train_model:
            #train_ops.append(train_model)
            losses.append(model_loss)
            loss_vars.extend(self.model.getVariables())
        if self.train_policy:
            if self.predict:
                predict_steps = self.model.predict_steps
                actor_inputs = predicted_core_outputs
            else:
                predict_steps = 0
                actor_inputs = core_outputs
            delay_length = length - delay
            actor_inputs = actor_inputs[:delay_length]
            # delayed_actions is a D+1-P length list of shape [T-M-D, B] tensors
            # The valid state indices are [M+P, T+P-D)
            # Element i corresponds to the i'th queued up action: 0 is the action about to be taken, D-P was the action chosen on this frame.
            delayed_actions = []
            for i in range(predict_steps, delay + 1):
                delayed_actions.append(actions[i:i + delay_length])
            train_probs, train_log_probs, entropy = self.policy.train_probs(
                actor_inputs, delayed_actions)
            behavior_probs = experience['prob'][
                memory +
                delay:]  # these are the actions we can compute probabilities for
            # probability ratios capped at 1
            prob_ratios = tf.minimum(train_probs / behavior_probs, 1.)
            self.kls = -tf.reduce_mean(tf.log(prob_ratios), 0)
            self.kls = tf.check_numerics(self.kls, 'kl')
            kl = tf.reduce_mean(self.kls)
            tf.summary.scalar('kl', kl)
        else:
            # NOTE(review): tf.ones_like() is called without an argument, so
            # this branch looks unfinished (see the original todo) -- confirm.
            prob_ratios = tf.ones_like()  # todo
        if self.explore_scale:
            if self.evolve_explore_scale:
                self.explore_scale = tf.Variable(self.explore_scale,
                                                 trainable=False,
                                                 name='explore_scale')
                self.evo_variables.append(
                    ('explore_scale', self.explore_scale, relative(1.5)))
            distances, _ = self.model.distances(history,
                                                core_outputs,
                                                hidden_states,
                                                actions,
                                                experience['state'],
                                                predict_steps=1)
            distances = tf.add_n(list(util.deepValues(
                distances)))  # sum over different state components
            explore_rewards = self.explore_scale * distances[0]
            # the exploration bonus is a reward, not something to differentiate
            explore_rewards = tf.stop_gradient(explore_rewards)
            tfl.stats(explore_rewards, 'explore_rewards')
            rewards += explore_rewards
        # build the critic (which you'll also need to train the policy)
        if self.train_policy or self.train_critic:
            # NOTE(review): delay_length is only assigned inside the
            # train_policy branch above; with train_critic alone and
            # unshift_critic set it would be undefined here -- confirm.
            shifted_core_outputs = core_outputs[:
                                                delay_length] if self.unshift_critic else core_outputs[
                                                    delay:]
            critic_loss, targets, advantages = self.critic(
                shifted_core_outputs, rewards[delay:], prob_ratios[:-1])
        if self.train_critic:
            losses.append(critic_loss)
            loss_vars.extend(self.critic.variables)
        if self.train_policy:
            policy_loss = self.policy.train(train_log_probs[:-1], advantages,
                                            entropy[:-1])
            losses.append(policy_loss)
            loss_vars.extend(self.policy.getVariables())
        if self.evolve_learning_rate:
            self.learning_rate = tf.Variable(self.learning_rate,
                                             trainable=False,
                                             name='learning_rate')
            self.evo_variables.append(
                ('learning_rate', self.learning_rate, relative(1.5)))
        total_loss = tf.add_n(losses)
        with tf.variable_scope('train'):
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            gvs = optimizer.compute_gradients(total_loss)
            # gvs = [(tf.check_numerics(g, v.name), v) for g, v in gvs]
            gs, vs = zip(*gvs)
            norms = tf.stack([tf.norm(g) for g in gs])
            max_norm = tf.reduce_max(norms)
            tf.summary.scalar('max_grad_norm', max_norm)
            # each gradient is clipped on its own norm (not a global norm)
            capped_gs = [
                tf.clip_by_norm(g, self.clip_max_grad) for g in gs
            ]
            train_op = optimizer.apply_gradients(zip(capped_gs, vs))
            train_ops.append(train_op)
        print("Created train op(s)")
        avg_reward, _ = tfl.stats(experience['reward'], 'reward')
        misc_ops = []
        if not self.dynamic:
            misc_ops.append(tf.add_check_numerics_ops())
        if self.pop_id >= 0:
            # exponential running average of the batch reward
            self.reward = tf.Variable(0., trainable=False, name='avg_reward')
            tf.summary.scalar('avg_reward', self.reward)
            new_reward = (1. - self.reward_decay
                          ) * self.reward + self.reward_decay * avg_reward
            misc_ops.append(tf.assign(self.reward, new_reward))
        # one assign op per evolvable variable, applying its mutator
        self.mutators = []
        for name, evo_variable, mutator in self.evo_variables:
            tf.summary.scalar(name, evo_variable, family='evolution')
            self.mutators.append(
                tf.assign(evo_variable, mutator(evo_variable)))
        self.summarize = tf.summary.merge_all()
        misc_ops.append(tf.assign_add(self.global_step, 1))
        self.misc = tf.group(*misc_ops)
        self.train_ops = tf.group(*train_ops)
        print("Creating summary writer at logs/%s." % self.name)
        #self.writer = tf.summary.FileWriter('logs/' + self.name)#, self.graph)
        self.writer = tf.summary.FileWriter(self.path)
        self._finalize_setup()
import tensorflow as tf

# A FIFO queue of floats fed by a background thread that keeps incrementing
# a counter and enqueueing its current value.
q = tf.FIFOQueue(1000, "float")
counter = tf.Variable(0.0)
increment_op = tf.assign_add(counter, tf.constant(1.0))
enqueue_op = q.enqueue([counter])
qr = tf.train.QueueRunner(q, enqueue_ops=[increment_op, enqueue_op] * 1)
# Build the dequeue op once; calling q.dequeue() inside the loop would add a
# new op to the graph on every iteration.
dequeue_op = q.dequeue()

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# The coordinator acts like a semaphore used to synchronise the threads.
coord = tf.train.Coordinator()
enqueue_threads = qr.create_threads(sess, coord=coord, start=True)

# Ask the other threads to stop.
# NOTE(review): the stop is requested before the main thread reads anything,
# so the number of values printed below depends on timing; the except clause
# handles the queue being closed early. Presumably intentional for the demo.
coord.request_stop()

# Main thread: read up to ten values from the queue.
for i in range(0, 10):
    try:
        print(sess.run(dequeue_op))
    except tf.errors.OutOfRangeError:
        break

# Do not exit until the other threads have finished.
coord.join(enqueue_threads)
sess.close()
def __init__(
        self,
        training_pipeline: InputPipeline,
        cv_pipeline: InputPipeline,
        network_factory: NetworkFactory,
        objective_factory: ObjectiveFactory,
        training_options: TrainingOptions,
        learning_rate: float,
        beta1: float = 0.9,
        beta2: float = 0.999):
    """
    Create a new network trainer

    Builds two graphs that share weights: a "training" graph (networks,
    losses, one Adam optimizer each for generator and discriminator, step
    counter, saver, summaries) and a "cv" graph that re-creates the networks
    with ``reuse=True`` on the cross-validation inputs for evaluation
    summaries only.

    :param training_pipeline: Input pipeline used for training
    :param cv_pipeline: Input pipeline used for cross-validation
    :param network_factory: Factory to create training and evaluation networks
    :param objective_factory: Factory to create generator and discriminator losses
    :param training_options: Options controlling the training process
    :param learning_rate: Learning rate to use in the Adam optimizer
    :param beta1: Beta1 to use in the Adam optimizer
    :param beta2: Beta2 to use in the Adam optimizer
    """
    self._training_options = training_options
    # not set here; presumably filled in when a checkpoint is restored
    self._restored_iteration = None

    # Create input pipelines
    with use_cpu():
        self._training_pipeline = training_pipeline
        self._train_x, self._train_y, _ = training_pipeline.create_pipeline()

        self._cv_pipeline = cv_pipeline
        self._cv_x, self._cv_y, _ = self._cv_pipeline.create_pipeline()

    # Create training graph
    with tf.name_scope("training"):
        # Create networks
        self._generator = network_factory.create_generator(self._train_x, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)
        # The discriminator is instantiated twice: once on the generated
        # output and once (reuse=True, same weights) on the ground truth.
        self._discriminator_generated = network_factory.create_discriminator(
            self._train_x, self._generator, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)
        self._discriminator_real = network_factory.create_discriminator(
            self._train_x, self._train_y, reuse=True, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)

        # Create losses
        self._generator_loss, generator_summary = objective_factory.create_generator_loss(
            self._train_x, self._train_y, self._generator, self._discriminator_generated, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)
        self._discriminator_loss, discriminator_summary = objective_factory.create_discriminator_loss(
            self._train_x, self._train_y, self._generator, self._discriminator_generated, self._discriminator_real, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)

        with tf.device(select_device(self._training_options.use_gpu)):
            # Create optimizers
            # Variables are split by name prefix so that each optimizer only
            # updates its own network.
            trainable_variables = tf.trainable_variables()
            variables_discriminator = [var for var in trainable_variables if var.name.startswith("discriminator")]
            variables_generator = [var for var in trainable_variables if var.name.startswith("generator")]

            self._optimizer_generator = tf.train.AdamOptimizer(learning_rate, beta1, beta2, name="adam_generator")
            self._optimizer_discriminator = tf.train.AdamOptimizer(learning_rate, beta1, beta2, name="adam_discriminator")

            self._op_generator = self._optimizer_generator.minimize(self._generator_loss, var_list=variables_generator)
            self._op_discriminator = self._optimizer_discriminator.minimize(self._discriminator_loss, var_list=variables_discriminator)

        with use_cpu():
            # Iteration counter
            # (incremented only through the explicit step op, not by minimize())
            self._global_step = tf.Variable(0, trainable=False, name="global_step", dtype=tf.int64)
            self._step_op = tf.assign_add(self._global_step, 1)

            # Create summary operation
            accuracy, precision, recall, f1_score, specificity, jaccard_similarity = _create_summaries(self._generator, self._train_y)
            summary_operations = [
                tf.summary.scalar("accuracy", accuracy),
                tf.summary.scalar("precision", precision),
                tf.summary.scalar("recall", recall),
                tf.summary.scalar("f1_score", f1_score),
                tf.summary.scalar("specificity", specificity),
                tf.summary.scalar("jaccard_similarity", jaccard_similarity)
            ]

            self._train_saver = tf.train.Saver(keep_checkpoint_every_n_hours=1)

            # Merge summaries
            # (the loss factories return lists of summaries, concatenated here)
            self._train_summary = tf.summary.merge(summary_operations + generator_summary + discriminator_summary)
            self._train_summary_writer = tf.summary.FileWriter(
                os.path.join(self._training_options.summary_directory, "training"), graph=tf.get_default_graph())

    # Create CV graph
    with tf.name_scope("cv"):
        # Create networks
        # (all with reuse=True, i.e. sharing the training weights)
        generator = network_factory.create_generator(
            self._cv_x, reuse=True, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)
        discriminator_generated = network_factory.create_discriminator(
            self._cv_x, generator, reuse=True, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)
        discriminator_real = network_factory.create_discriminator(
            self._cv_x, self._cv_y, reuse=True, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)

        # Create losses
        # Only the summaries are kept; the loss tensors are discarded.
        _, generator_summary = objective_factory.create_generator_loss(
            self._cv_x, self._cv_y, generator, discriminator_generated, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)
        _, discriminator_summary = objective_factory.create_discriminator_loss(
            self._cv_x, self._cv_y, generator, discriminator_generated, discriminator_real, use_gpu=self._training_options.use_gpu, data_format=self._training_options.data_format)

        # Create other summary options
        accuracy, precision, recall, f1_score, specificity, jaccard_similarity = _create_summaries(generator, self._cv_y)

        # Create summary operation
        summary_operations = [
            tf.summary.scalar("accuracy", accuracy),
            tf.summary.scalar("precision", precision),
            tf.summary.scalar("recall", recall),
            tf.summary.scalar("f1_score", f1_score),
            tf.summary.scalar("specificity", specificity),
            tf.summary.scalar("jaccard_similarity", jaccard_similarity)
        ]

        with use_cpu():
            # Concatenated images
            self._concatenated_images_op = _create_concatenated_images(
                self._cv_x,
                self._cv_y,
                generator,
                self._cv_pipeline.color_converter,
                self._training_options.data_format
            )

            # Merge summaries
            self._cv_summary = tf.summary.merge(summary_operations + generator_summary + discriminator_summary)
            self._cv_summary_writer = tf.summary.FileWriter(
                os.path.join(self._training_options.summary_directory, "cv"))
def testAccumulator(self):
    """Accumulator(SGD) over two steps must equal averaged plain SGD.

    The first session applies two SGD updates, each with its gradient
    scaled by 1/2, for two independently computed losses. The second session
    feeds the same two inputs, one per step, through an Accumulator wrapping
    SGD with accum_steps=2. Losses, gradients, accumulator contents and the
    resulting variables are then compared.
    """
    lr = 1e-1

    def _build_proj_layer():
        # Creates an identically configured and seeded projection layer in
        # whichever graph is current.
        tf.set_random_seed(123456)
        p = layers.ProjectionLayer.Params()
        p.name = 'proj'
        p.dtype = tf.float64
        p.input_dim = 3
        p.output_dim = 2
        p.params_init = py_utils.WeightInit.Gaussian(0.01, 123456)
        p.is_eval = False
        p.batch_norm = False
        return layers.ProjectionLayer(p)

    def _read_accumulator(session):
        # Current value of the first gradient-accumulator variable.
        accum_vars = [
            v for v in tf.global_variables() if 'grad_accumulator' in v.name
        ]
        return session.run(accum_vars)[0]

    np.random.seed(12345)
    np_input1 = np.random.normal(0.1, 0.5, [2, 4, 3])
    np.random.seed(12346)
    np_input2 = np.random.normal(0.1, 0.5, [2, 4, 3])

    # Reference: explicit averaging of two independently computed gradients.
    with self.session(use_gpu=True, graph=tf.Graph()) as sess:
        proj_layer = _build_proj_layer()
        inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
        in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
        inputs2 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
        in_padding2 = tf.zeros([2, 4, 1], dtype=tf.float64)
        output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
        output2 = proj_layer.FPropDefaultTheta(inputs2, in_padding2)
        loss1 = tf.reduce_sum(output1)
        loss2 = tf.reduce_sum(output2)
        var_grads1 = py_utils.ComputeGradients(loss1, proj_layer.vars)
        var_grads2 = py_utils.ComputeGradients(loss2, proj_layer.vars)
        op = optimizer.SGD.Params()
        opt = op.Instantiate()
        # Both losses are evaluated before the first update, and the second
        # update only runs after the first one.
        with tf.control_dependencies([loss1, loss2]):
            halved_grads1 = py_utils.ApplyGradMultiplier(var_grads1, 1. / 2.)
            var_update_op1 = opt.Apply(lr, halved_grads1)
            with tf.control_dependencies([var_update_op1]):
                halved_grads2 = py_utils.ApplyGradMultiplier(
                    var_grads2, 1. / 2.)
                var_update_op2 = opt.Apply(lr, halved_grads2)

        sess.run(tf.global_variables_initializer())
        vars1 = sess.run(proj_layer.vars.Flatten())
        loss1_1, grads1_1, loss1_2, grads1_2 = sess.run(
            [loss1, var_grads1, loss2, var_grads2],
            feed_dict={
                inputs1: np_input1,
                inputs2: np_input2,
            })
        sess.run([var_update_op2],
                 feed_dict={
                     inputs1: np_input1,
                     inputs2: np_input2,
                 })
        vars1_1 = sess.run(proj_layer.vars.Flatten())

    # Under test: the same two inputs fed one per step to Accumulator(SGD).
    with self.session(use_gpu=True, graph=tf.Graph()) as sess:
        proj_layer = _build_proj_layer()
        in_padding1 = tf.zeros([2, 4, 1], dtype=tf.float64)
        inputs1 = tf.placeholder(shape=[2, 4, 3], dtype=tf.float64)
        output1 = proj_layer.FPropDefaultTheta(inputs1, in_padding1)
        loss = tf.reduce_sum(output1)
        var_grads = py_utils.ComputeGradients(loss, proj_layer.vars)
        op = optimizer.Accumulator.Params().Set(
            accum_steps=2,
            dtype=tf.float64,
            optimizer_tpl=optimizer.SGD.Params())
        opt = op.Instantiate()
        var_update_op = opt.Apply(lr, var_grads)
        increment_global_step_op = tf.assign_add(
            py_utils.GetOrCreateGlobalStepVar(), 1)

        sess.run(tf.global_variables_initializer())
        vars2 = sess.run(proj_layer.vars.Flatten())
        loss2_1, grads2_1 = sess.run([loss, var_grads],
                                     feed_dict={inputs1: np_input1})
        loss2_2, grads2_2 = sess.run([loss, var_grads],
                                     feed_dict={inputs1: np_input2})
        acc_0 = _read_accumulator(sess)

        # Step one: the gradient is accumulated, variables stay put.
        sess.run([var_update_op], feed_dict={inputs1: np_input1})
        acc_1 = _read_accumulator(sess)
        vars2_intermediate = sess.run(proj_layer.vars.Flatten())

        # Step two: after advancing the global step the update is applied.
        sess.run(increment_global_step_op)
        sess.run([var_update_op], feed_dict={inputs1: np_input2})
        acc_2 = _read_accumulator(sess)
        vars2_1 = sess.run(proj_layer.vars.Flatten())

    self.assertAllClose(vars1, vars2)
    self.assertAllClose(acc_0, np.zeros_like(acc_0))
    self.assertAllClose(acc_1, grads2_1['w'][1])
    self.assertAllClose(acc_2, np.zeros_like(acc_0))
    self.assertAllClose(loss1_1, loss2_1)
    self.assertAllClose(loss1_2, loss2_2)
    self.assertAllClose(grads1_1, grads2_1)
    self.assertAllClose(grads1_2, grads2_2)
    self.assertAllClose(vars1, vars2_intermediate)
    self.assertAllClose(vars2[0], grads2_1['w'][0])
    self.assertAllClose(vars2[0], grads2_2['w'][0])
    self.assertAllClose(
        vars1[0] - 0.5 * lr * (grads1_1['w'][1] + grads1_2['w'][1]),
        vars1_1[0])
    self.assertAllClose(
        vars2[0] - 0.5 * lr * (grads2_1['w'][1] + grads2_2['w'][1]),
        vars2_1[0])
    self.assertAllClose(vars2, vars2_intermediate)
    self.assertAllClose(vars1_1, vars2_1)
def _weighted_accuracy(sess, acc_op, num_op):
    """Drain an evaluation set and return ``(example_count, mean_accuracy)``.

    Runs ``acc_op`` and ``num_op`` together until TensorFlow raises
    ``OutOfRangeError``. Each batch accuracy is weighted by the batch size
    reported by ``num_op``.

    Raises:
        ZeroDivisionError: if the very first run already signals
            ``OutOfRangeError`` (no batch was produced at all).
    """
    example_count = 0
    weighted_sum = 0.0
    while True:
        try:
            batch_acc, batch_num = sess.run([acc_op, num_op])
        except tf.errors.OutOfRangeError:
            break
        example_count += batch_num
        weighted_sum += batch_acc * batch_num
    return example_count, weighted_sum / example_count


def train(model=None):
    """Train ``model`` and periodically evaluate it on the dev and test sets.

    Every ``FLAGS.test_steps`` steps the dev set is evaluated. When the dev
    accuracy beats the best value seen so far, the test set is evaluated
    too and a checkpoint is written through ``save_checkpoint``. Training
    stops once the global step exceeds ``FLAGS.max_steps``. The test
    accuracy measured at the best dev step is printed at the end.

    Args:
        model: object providing ``build_train_graph(global_step)`` returning
            ``(loss_op, train_op, acc_op)``, and ``build_dev_graph()`` /
            ``build_test_graph()`` each returning
            ``(acc_op, num_op, init_op)``. The ``None`` default is not
            usable: the first method call on it raises ``AttributeError``.

    Raises:
        AssertionError: if ``FLAGS.train_dir`` is empty.
    """
    assert FLAGS.train_dir, 'train_dir must be given'

    # The graph-level seed only influences ops created *after* it is set, so
    # it has to be in place before any part of the model graph is built.
    tf.set_random_seed(FLAGS.random_seed)

    global_step = tf.Variable(0, trainable=False)
    add_global = tf.assign_add(global_step, 1)
    loss_op, train_op, acc_op = model.build_train_graph(global_step)
    dev_acc_op, dev_num_op, dev_init_op = model.build_dev_graph()
    test_acc_op, test_num_op, test_init_op = model.build_test_graph()

    train_ckpt_dir = FLAGS.train_dir + '/train_ckpt'
    os.makedirs(train_ckpt_dir, exist_ok=True)
    sum_writer = tf.summary.FileWriter(str(train_ckpt_dir),
                                       graph=tf.get_default_graph())
    best_dev_acc = 0.0
    final_acc = 0.0
    try:
        saver = tf.train.Saver(max_to_keep=1)
        init = tf.global_variables_initializer()
        with tf.Session(config=utils.get_config()) as sess:
            # NumPy's global seed takes effect immediately, so it is set
            # right before the training loop starts.
            np.random.seed(FLAGS.random_seed)
            sess.run(init)
            while True:
                # The step counter is advanced first, so steps start at 1.
                this_global_step = sess.run(add_global)
                if this_global_step >= FLAGS.max_steps + 1:
                    break
                sess.run([train_op, loss_op, acc_op])
                if this_global_step == 0 or this_global_step % FLAGS.test_steps != 0:
                    continue

                number, accuracy = _weighted_accuracy(
                    sess, dev_acc_op, dev_num_op)
                print('At step %d. dev num=%d acc=%f.' %
                      (this_global_step, number, accuracy))
                if accuracy > best_dev_acc:
                    best_dev_acc = accuracy
                    print("best acc=%f At step %d." %
                          (best_dev_acc, this_global_step))
                    test_number, test_accuracy = _weighted_accuracy(
                        sess, test_acc_op, test_num_op)
                    print('test num=%d acc=%f.' % (test_number, test_accuracy))
                    final_acc = test_accuracy
                    # Presumably rewinds the test iterator for the next
                    # evaluation -- confirm against build_test_graph().
                    sess.run(test_init_op)
                    save_checkpoint(saver, sess, FLAGS.train_dir,
                                    this_global_step)
                summary = tf.Summary()
                # NOTE(review): the 'test_acc' tag records the *dev* accuracy
                # computed above, not the test accuracy. The tag is kept
                # unchanged so existing dashboards keep working -- confirm
                # whether this is intended.
                summary.value.add(tag='test_acc', simple_value=accuracy)
                summary.value.add(tag='best_dev_acc',
                                  simple_value=best_dev_acc)
                sum_writer.add_summary(summary, this_global_step)
                # Presumably rewinds the dev iterator, which was drained
                # above -- confirm against build_dev_graph().
                sess.run(dev_init_op)
    finally:
        # Close the event file even when training is interrupted by an error.
        sum_writer.close()
    print('Accuracy of test set is %f .' % final_acc)
def pathint_stabilization(self, adam_optimizer):
    """Build the ops for synaptic stabilization via the Zenke method.

    For every trainable variable two non-trainable, zero-initialised
    accumulators are created: a "small omega" and its divisor. The method
    then stores on ``self``:

    * ``update_big_omega`` -- grouped ops adding the ratio of the small
      omega terms to ``self.big_omega_var``;
    * ``reset_small_omega`` -- grouped ops setting both accumulators back
      to zero;
    * ``delta_grads`` and ``gradients`` -- the optimizer's delta-grads and
      the gradients of ``self.pol_loss``;
    * ``update_small_omega`` -- grouped ops accumulating into the small
      omega terms.

    When ``par['training_method']`` is ``'RL'`` it additionally creates the
    ``previous_reward`` / ``current_reward`` variables and their update ops.

    Args:
        adam_optimizer: object whose ``return_delta_grads()`` result is
            indexed by variable op name.
    """
    # Plain SGD with unit learning rate, used only to compute gradients.
    sgd = tf.train.GradientDescentOptimizer(learning_rate=1.0)

    omegas = {}
    omega_divs = {}
    reset_ops = []
    small_omega_ops = []
    big_omega_ops = []

    method = par['training_method']

    # Reward bookkeeping is only needed for reinforcement learning.
    if method == 'RL':
        self.previous_reward = tf.Variable(-tf.ones([]), trainable=False)
        self.current_reward = tf.Variable(-tf.ones([]), trainable=False)
        summed_reward = tf.reduce_sum(tf.stack(self.reward, axis=0), axis=0)
        mean_reward = tf.reduce_mean(summed_reward)
        self.update_current_reward = tf.assign(self.current_reward,
                                               mean_reward)
        self.update_previous_reward = tf.assign(self.previous_reward,
                                                self.current_reward)

    # One pair of accumulators per trainable variable, keyed by op name.
    for var in tf.trainable_variables():
        name = var.op.name
        omega = tf.Variable(tf.zeros(var.get_shape()), trainable=False)
        omega_div = tf.Variable(tf.zeros(var.get_shape()), trainable=False)
        omegas[name] = omega
        omega_divs[name] = omega_div

        # Resetting multiplies each accumulator by zero.
        for accumulator in (omega, omega_div):
            reset_ops.append(tf.assign(accumulator, accumulator * 0.0))

        # The big omega increment depends on the training method.
        if method in ('RL', 'SL'):
            big_omega = self.big_omega_var[name]
            if method == 'RL':
                ratio = tf.div(tf.abs(omega), par['omega_xi'] + omega_div)
            else:
                ratio = tf.div(tf.nn.relu(omega),
                               par['omega_xi'] + omega_div ** 2)
            big_omega_ops.append(tf.assign_add(big_omega, ratio))

    # After each task is complete, call update_big_omega and reset_small_omega.
    self.update_big_omega = tf.group(*big_omega_ops)
    self.reset_small_omega = tf.group(*reset_ops)

    # This is called every batch.
    self.delta_grads = adam_optimizer.return_delta_grads()
    self.gradients = sgd.compute_gradients(self.pol_loss)

    # Accumulate into the small omegas using the gradients.
    for grad, var in self.gradients:
        name = var.op.name
        delta = self.delta_grads[name]
        if method == 'RL':
            delta_reward = self.current_reward - self.previous_reward
            small_omega_ops.append(
                tf.assign_add(omegas[name], delta * delta_reward))
            small_omega_ops.append(
                tf.assign_add(omega_divs[name], tf.abs(delta * delta_reward)))
        elif method == 'SL':
            small_omega_ops.append(
                tf.assign_add(omegas[name], -delta * grad))
            small_omega_ops.append(
                tf.assign_add(omega_divs[name], delta))

    # 1) update small_omega after each train!
    self.update_small_omega = tf.group(*small_omega_ops)
def accu_global_ms(self):
    """Build ops that add each squared gradient to its running accumulator.

    ``self._grads`` and ``self._global_ms`` are paired position by position;
    one ``tf.assign_add`` op is created per pair. A further op increments
    ``self._batches_num`` by one.

    Returns:
        A two-element list ``[accumulate_ops, increment_op]``, where
        ``accumulate_ops`` is itself a list of assign-add ops.
    """
    squared_sum_ops = [
        tf.assign_add(accumulator, tf.square(gradient))
        for gradient, accumulator in zip(self._grads, self._global_ms)
    ]
    batch_counter_op = tf.assign_add(self._batches_num, tf.ones([]))
    return [squared_sum_ops, batch_counter_op]