Exemplo n.º 1
0
  def after_apply(self):
    """Build the op that refreshes the gradient statistics after a step.

    Creates a new ExponentialMovingAverage (stored on
    `self._moving_averager`), computes the per-variable squared gradients and
    squared gradient norms, tracks a moving average of those norms, and then
    collects the ops produced by `curvature_range()`, `grad_variance()` and
    `dist_to_opt()`.

    Returns:
      A single grouped op that runs the moving-average update together with
      all collected statistic ops.
    """
    self._moving_averager = tf.train.ExponentialMovingAverage(
        decay=self._beta, zero_debias=self._zero_debias)
    assert self._grads is not None and len(self._grads) > 0
    after_apply_ops = []

    # Per-variable g**2, each colocated with the variable it belongs to.
    self._grad_squared = []
    for v, g in zip(self._tvars, self._grads):
      with ops.colocate_with(v):
        self._grad_squared.append(tf.square(g))
    # Per-variable norm**2.
    self._grad_norm_squared = [
        tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared]

    # The following running average on the squared norm of the gradient is
    # shared by grad_var and dist_to_opt.
    avg_op = self._moving_averager.apply(self._grad_norm_squared)
    with tf.control_dependencies([avg_op]):
      self._grad_norm_squared_avg = [
          self._moving_averager.average(val)
          for val in self._grad_norm_squared]
      # Collapse the per-variable lists into single tensors by summing over
      # all variables.
      self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
      self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
    after_apply_ops.append(avg_op)

    # Every statistic below is built so that it runs after the moving average
    # has been refreshed. Each helper is expected to return a list of ops.
    with tf.control_dependencies([avg_op]):
      curv_range_ops = self.curvature_range()
      after_apply_ops += curv_range_ops
      grad_var_ops = self.grad_variance()
      after_apply_ops += grad_var_ops
      dist_to_opt_ops = self.dist_to_opt()
      after_apply_ops += dist_to_opt_ops

    return tf.group(*after_apply_ops)
Exemplo n.º 2
0
  def testTensorArrayReadTwice(self):
    """A second read of an index fails unless clear_after_read is False."""
    with self.test_session(use_gpu=self._use_gpu):
      data = tf.constant([[1.0, -1.0], [10.0, -10.0]])

      # Default TensorArray: the second read of index 0 must raise.
      once_ta = tensor_array_ops.TensorArray(
          dtype=tf.float32, tensor_array_name="foo", size=2)
      once_written = once_ta.unpack(data)
      once_first = once_written.read(0)
      with tf.control_dependencies([once_first]):
        once_second = once_written.read(0)

      expected_error = (
          r"Could not read index 0 twice because it was cleared after a "
          r"previous read \(perhaps try setting clear_after_read = false\?\)")
      with self.assertRaisesOpError(expected_error):
        once_second.eval()

      # With clear_after_read=False the same index can be read again.
      twice_ta = tensor_array_ops.TensorArray(
          dtype=tf.float32, tensor_array_name="foo", size=2,
          clear_after_read=False)
      twice_written = twice_ta.unpack(data)
      twice_first = twice_written.read(0)
      with tf.control_dependencies([twice_first]):
        twice_second = twice_written.read(0)

      self.assertAllEqual([1.0, -1.0], twice_second.eval())
Exemplo n.º 3
0
def moving_average(value, window):
    """Return a windowed running mean of `value`.

    State is kept in a FIFO queue of capacity `window` plus two non-trainable
    variables: the running total of the queued values and the number of
    updates performed so far.

    Args:
        value: Tensor to average; it is cast with `tf.to_float`.
        window: Queue capacity, i.e. how many recent values contribute to the
            total. NOTE(review): presumably a Python int - confirm.

    Returns:
        A tensor holding the averaged value. Evaluating it also enqueues
        `value` and writes back the updated total and counter.
    """
    value = tf.to_float(value)
    shape = value.get_shape()

    # Initial contents: `window` zero entries for the queue, a zero total and
    # a zero update counter.
    queue_init = tf.zeros(tf.TensorShape(window).concatenate(shape))
    total_init = tf.zeros(shape)
    num_init = tf.constant(0, dtype=tf.float32)

    queue = tf.FIFOQueue(window, [tf.float32], shapes=[shape])
    total = tf.Variable(total_init, trainable=False)
    num = tf.Variable(num_init, trainable=False)

    # Lazily (re)initialize whenever the queue is empty: fill it with zeros
    # and reset both variables; otherwise do nothing.
    init = tf.cond(
        tf.equal(queue.size(), 0),
        lambda: tf.group(
            queue.enqueue_many(queue_init),
            total.assign(total_init),
            num.assign(num_init)),
        lambda: tf.no_op())

    with tf.control_dependencies([init]):
        # Slide the window: add the new value and drop the oldest entry.
        total_ = total + value - queue.dequeue()
        num_ = num + 1
        # Divide by the number of real samples seen, capped at `window`.
        # EPSILON is defined outside this function and is added to the
        # denominator.
        value_averaged = total_ / (tf.minimum(num_, window) + EPSILON)

        # The returned tensor forces the state updates to run as well.
        with tf.control_dependencies([queue.enqueue([value]), total.assign(total_), num.assign(num_)]):
            return tf.identity(value_averaged)
    def update_parameters(self, loss):
        """Build the training op for `loss` and store it in `self.step`.

        Optionally adds a regularization term (the sum of the per-variable L2
        norms of all trainable variables, scaled by
        `self.regularization_constant`), clips every gradient element to
        `[-self.grad_clip, self.grad_clip]`, and applies the gradients after
        the ops in the `UPDATE_OPS` collection. When
        `self.enable_parameter_averaging` is set, `self.step` additionally
        updates `self.ema` over the trainable variables.

        Also logs all variables, the trainable variables and the trainable
        parameter count.

        Args:
            loss: Scalar loss tensor to minimize.
        """
        if self.regularization_constant != 0:
            l2_norm = tf.reduce_sum([tf.sqrt(tf.reduce_sum(tf.square(param))) for param in tf.trainable_variables()])
            loss = loss + self.regularization_constant*l2_norm

        optimizer = self.get_optimizer(self.learning_rate_var, self.beta1_decay_var)
        grads = optimizer.compute_gradients(loss)
        # Variables that do not influence the loss come back with a None
        # gradient; pass those through untouched (apply_gradients skips them)
        # instead of crashing inside tf.clip_by_value.
        clipped = [
            (g if g is None else tf.clip_by_value(g, -self.grad_clip, self.grad_clip), v_)
            for g, v_ in grads
        ]

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            step = optimizer.apply_gradients(clipped, global_step=self.global_step)

        if self.enable_parameter_averaging:
            # The averaging op must be created inside the control-dependency
            # scope so that it reads the variables after the optimizer step
            # has been applied, not concurrently with it.
            with tf.control_dependencies([step]):
                maintain_averages_op = self.ema.apply(tf.trainable_variables())
                self.step = tf.group(maintain_averages_op)
        else:
            self.step = step

        logging.info('all parameters:')
        logging.info(pp.pformat([(var.name, shape(var)) for var in tf.global_variables()]))

        logging.info('trainable parameters:')
        logging.info(pp.pformat([(var.name, shape(var)) for var in tf.trainable_variables()]))

        logging.info('trainable parameter count:')
        # Built-in sum: calling np.sum on a generator is deprecated.
        logging.info(str(sum(np.prod(shape(var)) for var in tf.trainable_variables())))
Exemplo n.º 5
0
def get_run_op():
  """Build a chain of matmul layers spread over the available GPUs.

  A non-trainable `data` variable of shape `[slice_size, hidden_size]` is fed
  `FLAGS.num_cuts` times through `FLAGS.num_layers` square weight matrices.
  Within a layer, the matmul of each cut is ordered after the previous cut
  through control dependencies. Layer `i` is placed on GPU
  `i * num_gpus // num_layers`.

  Returns:
    Tuple `(init_op, train_op)`: the variable initializer and a no-op that
    depends on every output of the last layer.
  """
  # Create an optimizer that performs gradient descent.
  #opt = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  # Floor division keeps the slice size an integer, so it stays usable as a
  # tensor dimension whether or not true division is in effect.
  slice_size = FLAGS.batch_size // FLAGS.num_cuts
  print('Slice size:{}'.format(slice_size))
  data = None
  label = None
  # Index 0 is a placeholder no-op so that the cuts are addressed as k + 1.
  last_fc = [tf.no_op()]
  with tf.device('/gpu:0'):
    data = tf.get_variable(
        name = 'data',
        shape=[slice_size, FLAGS.hidden_size],
        trainable=False)
    '''
    label = tf.get_variable(
        name = 'label',
        shape = [slice_size, FLAGS.hidden_size],
        trainable=False))
    with tf.variable_scope('fc_in'):
      weight_in = tf.zeros([1000, FLAGS.hidden_size])
      for k in xrange(FLAGS.num_cuts):
        with tf.control_dependencies([last_fc[-1]]):
            last_fc.append(tf.matmul(data[k+1], weight_in))
    '''
  # Every cut uses the same data variable as its input.
  for _ in xrange(FLAGS.num_cuts):
    last_fc.append(data)
  for i in xrange(FLAGS.num_layers):
    # Integer GPU index for this layer.
    dev = '/gpu:%d' % (i * FLAGS.num_gpus // FLAGS.num_layers)
    with tf.device(dev), scopes.arg_scope([variables.variable], device=dev):
      tmp_fc = [tf.no_op()]
      with tf.variable_scope('fc%d' % i):
        w = tf.get_variable(
            name='w',
            shape=[FLAGS.hidden_size, FLAGS.hidden_size],
            trainable=True)
        for k in xrange(FLAGS.num_cuts):
          # Serialize the cuts inside a layer: cut k waits for cut k - 1.
          with tf.control_dependencies([tmp_fc[-1]]):
            tmp_fc.append(tf.matmul(last_fc[k+1], w))
      last_fc = tmp_fc
      if i == FLAGS.num_layers - 1:
        # The "training" op simply forces all final-layer outputs to run.
        with tf.control_dependencies(last_fc):
          train_op = tf.no_op()
  '''
  with tf.device('/gpu:%d' % (FLAGS.num_gpus - 1)):
    tmp_fc = [tf.no_op()]
    with tf.variable_scope('fc_out'):
      weight_out = tf.zeros([FLAGS.hidden_size, 1000])
      for k in xrange(FLAGS.num_cuts):
        with tf.control_dependencies([tmp_fc[-1]]):
          tmp_fc.append(tf.matmul(last_fc[k+1], weight_out))
    last_fc = tmp_fc
  loss = tf.nn_softmax_cross_entropy_with_logits(last_fc, labels, name='xentropy')
  grads = opt.compute_gradients(loss)
  apply_gradient_op = opt.apply_gradients(grads)

  train_op = tf.group(apply_gradient_op)
  '''
  init_op = tf.initialize_all_variables()

  return init_op, train_op
def train(total_loss, global_step):
    """Build the training op: decayed-LR gradient descent plus summaries.

    Args:
        total_loss: Loss tensor to minimize.
        global_step: Variable counting training steps; it drives the learning
            rate decay and is incremented by the gradient application.

    Returns:
        A no-op named "train" that runs after the gradients are applied.
    """
    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Staircase exponential decay: the rate drops once every `decay_steps`.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE, global_step, decay_steps, LEARNING_RATE_DECAY_FACTOR, staircase=True)
    tf.scalar_summary("learning_rate", lr)

    loss_averages_op = _add_loss_summaries(total_loss)

    # Gradients are computed only after the loss summaries were updated.
    with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.GradientDescentOptimizer(lr)
        grads = opt.compute_gradients(total_loss)

    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)

    for grad, var in grads:
        # Compare against None explicitly: a Tensor must not be used as a
        # Python bool, and variables without a gradient are reported as None.
        if grad is not None:
            tf.histogram_summary(var.op.name + "/gradients", grad)

    #variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    #variables_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op]):
        train_op = tf.no_op(name="train")

    return train_op
Exemplo n.º 7
0
  def _outputs_with_release(self, handle, inputs, outputs):
    """Releases the ComputeSession before any output can be consumed.

    Every entry of `outputs` is replaced in place by an identity of itself
    that depends on the release op, so fetching an output implies the session
    was released first.

    Args:
      handle: Handle to the ComputeSession on which all computation so far
          has depended. It is released, and the release is exposed as 'run'.
      inputs: dict of nodes that are passed through without any dependencies.
      outputs: dict of nodes whose access should guarantee that the
          ComputeSession has been safely released.

    Returns:
      A dictionary holding the input nodes, the wrapped output nodes and a
      'run' entry.
    """
    with tf.control_dependencies(outputs.values()):
      with tf.name_scope('ComputeSession'):
        release_op = dragnn_ops.release_session(handle)
      run_op = tf.group(release_op, name='run')
      # Re-export each output behind the release op under its own key name.
      with tf.control_dependencies([release_op]):
        for key in outputs:
          outputs[key] = tf.identity(outputs[key], name=key)

    all_nodes = inputs.copy()
    all_nodes.update(outputs)
    # Alias for simply running without collecting outputs, which is common,
    # for instance, with training.
    all_nodes['run'] = run_op
    return all_nodes
Exemplo n.º 8
0
  def backward_grads(self, y, dy, training=True):
    """Manually compute backward gradients given input and output grads.

    The block's inputs are reconstructed from its outputs as
    `x2 = y2 - g(y1)` and `x1 = y1 - f(x2)`, while the gradients are
    propagated through `g` and `f` with gradient tapes.

    Args:
      y: Pair `(y1, y2)` of block outputs.
      dy: Pair `(dy1, dy2)` of gradients with respect to the outputs.
      training: Forwarded to `self.f` and `self.g`.

    Returns:
      Tuple `(x, dx, grads)`: the reconstructed input pair, the pair of
      gradients with respect to the inputs, and the list of gradients for the
      trainable variables of `f` followed by those of `g`.
    """
    dy1, dy2 = dy
    y1, y2 = y

    # Differentiate g(y1) with respect to y1 and g's variables, weighted by
    # the incoming gradient dy2.
    with tf.GradientTape() as gtape:
      gtape.watch(y1)
      gy1 = self.g(y1, training=training)
    grads_combined = gtape.gradient(
        gy1, [y1] + self.g.trainable_variables, output_gradients=dy2)
    dg = grads_combined[1:]
    dx1 = dy1 + grads_combined[0]
    # This doesn't affect eager execution, but improves memory efficiency with
    # graphs
    with tf.control_dependencies(dg + [dx1]):
      x2 = y2 - gy1

    # Same procedure for f, evaluated at the reconstructed x2 and weighted by
    # dx1.
    with tf.GradientTape() as ftape:
      ftape.watch(x2)
      fx2 = self.f(x2, training=training)
    grads_combined = ftape.gradient(
        fx2, [x2] + self.f.trainable_variables, output_gradients=dx1)
    df = grads_combined[1:]
    dx2 = dy2 + grads_combined[0]
    # Same behavior as above
    with tf.control_dependencies(df + [dx2]):
      x1 = y1 - fx2

    x = x1, x2
    dx = dx1, dx2
    grads = df + dg

    return x, dx, grads
 def loop_body(i):
   """Increment `var_a`, then add it to `var_b`, then return `i + 1`."""
   bump_a = tf.assign_add(var_a, 1, name="a_add")
   # Each stage waits for the previous one: a_add -> b_add -> i_add.
   with tf.control_dependencies([bump_a]):
     bump_b = tf.assign_add(var_b, var_a, name="b_add")
   with tf.control_dependencies([bump_b]):
     next_i = tf.add(i, 1, name="i_add")
   return next_i
Exemplo n.º 10
0
def batch_norm(value, is_train=True, name='batch_norm', epsilon=1e-5, momentum=0.9):
    """Apply batch normalization along the last dimension of `value`.

    In training mode the batch moments are used for normalization, an
    exponential moving average of them is updated, and that average is copied
    into the `moving_mean` / `moving_variance` variables. In inference mode
    those stored variables are used instead.

    Args:
        value: Input tensor. Moments are taken over axes [0, 1, 2].
            NOTE(review): this assumes a 4-D input - confirm with callers.
        is_train: Python bool selecting the training or inference branch.
        name: Variable scope name (opened with `tf.AUTO_REUSE`).
        epsilon: Variance epsilon passed to `tf.nn.batch_normalization`.
        momentum: Decay of the exponential moving average.

    Returns:
        The normalized tensor.
    """
    #return value
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):

        ema = tf.train.ExponentialMovingAverage(decay=momentum)
        shape = value.get_shape().as_list()[-1]
        beta = bias('beta', [shape], bias_start=0.0)
        gamma = bias('gamma', [shape], bias_start=1.0)

        if is_train:

            batch_mean, batch_variance = tf.nn.moments(value, [0, 1, 2], name='moments')

            # NOTE(review): the 4th positional argument of `bias` presumably
            # marks these as non-trainable - confirm against the helper.
            moving_mean = bias('moving_mean', [shape], 0.0, False)
            moving_variance = bias('moving_variance', [shape], 1.0, False)

            ema_apply_op = ema.apply([batch_mean, batch_variance])

            # Copy the averages only after they were refreshed; building the
            # assigns outside this scope left them unordered with respect to
            # the EMA update.
            with tf.control_dependencies([ema_apply_op]):
                assign_mean = moving_mean.assign(ema.average(batch_mean))
                assign_variance = \
                    moving_variance.assign(ema.average(batch_variance))
                mean, variance = \
                    tf.identity(batch_mean), tf.identity(batch_variance)

            # Use the caller's epsilon here as well, so that training and
            # inference normalize identically.
            with tf.control_dependencies([assign_mean, assign_variance]):
                return tf.nn.batch_normalization(value, mean, variance, beta, gamma, epsilon)

        else:
            mean = bias('moving_mean', [shape], 0.0, False)
            variance = bias('moving_variance', [shape], 1.0, False)

            return tf.nn.batch_normalization(value, mean, variance, beta, gamma, epsilon)
Exemplo n.º 11
0
    def train(self, total_loss):
        """Build the Adam training op with loss and variable averaging.

        Args:
            total_loss: Loss tensor to minimize.

        Returns:
            A no-op named 'train' that runs after the gradients were applied
            and the moving averages of the trainable variables were updated.
        """
        # Moving average over every entry of the 'losses' collection and the
        # total loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        losses = tf.get_collection('losses')
        loss_averages_op = loss_averages.apply(losses + [total_loss])

        for loss_tensor in losses + [total_loss]:
            tf.scalar_summary(loss_tensor.op.name + ' (raw)', loss_tensor)

        # Apply gradients, and add histograms
        with tf.control_dependencies([loss_averages_op]):
            opt = tf.train.AdamOptimizer()
            grads = opt.compute_gradients(total_loss)
        apply_gradient_op = opt.apply_gradients(grads)
        for var in tf.trainable_variables():
            tf.histogram_summary(var.op.name, var)
        for grad, var in grads:
            if grad is not None:
                tf.histogram_summary(var.op.name + '/gradients', grad)

        # Track the moving averages of all trainable variables. The update is
        # created under a dependency on the gradient application so that it
        # averages the freshly updated values rather than racing with them.
        variable_averages = tf.train.ExponentialMovingAverage(Recognizer.MOVING_AVERAGE_DECAY)
        with tf.control_dependencies([apply_gradient_op]):
            variables_averages_op = variable_averages.apply(tf.trainable_variables())

        with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
            train_op = tf.no_op(name='train')
        return train_op
Exemplo n.º 12
0
    def build_eval_graph(self):
        """Create the ops that accumulate and report evaluation metrics.

        Sets `total_loss`, `total_correct` and `example_count` (accumulator
        variables), `mean_loss` and `accuracy` (derived means), `eval_reset`
        (re-initializes the accumulators; run before evaluating a data set),
        `eval_step` (adds one batch; run once per evaluation batch) and
        `summaries` (merged scalar summaries of the two means).
        """
        def new_accumulator():
            # Float state starting at zero; not trainable and registered in
            # no graph collection.
            return tf.Variable(0.0, trainable=False, collections=[])

        self.total_loss = new_accumulator()
        self.total_correct = new_accumulator()
        self.example_count = new_accumulator()

        # Means over everything accumulated since the last reset.
        self.mean_loss = self.total_loss / self.example_count
        self.accuracy = self.total_correct / self.example_count

        # Per-batch increments of the three accumulators.
        accumulate_ops = [
            self.total_loss.assign_add(self.model.total_loss),
            self.total_correct.assign_add(
                tf.reduce_sum(tf.cast(self.model.correct_predictions, "float"))),
            self.example_count.assign_add(self.model.batch_size),
        ]

        # Running the initializers again resets the accumulators.
        reset_ops = [
            self.total_loss.initializer,
            self.total_correct.initializer,
            self.example_count.initializer,
        ]
        with tf.control_dependencies(reset_ops):
            self.eval_reset = tf.no_op()

        with tf.control_dependencies(accumulate_ops):
            self.eval_step = tf.no_op()

        # Summaries
        self.summaries = tf.merge_summary([
            tf.scalar_summary("mean_loss", self.mean_loss),
            tf.scalar_summary("accuracy", self.accuracy),
        ])
Exemplo n.º 13
0
  def _apply(self, grad, var, indices=None):
    """Build the update op for one variable.

    Maintains two slots: "m", a moving average of the gradient, and "v", a
    moving average of the squared difference between the gradient and "m".
    The step is `lr * grad * min(v / (variance + epsilon), 1)`.

    Args:
      grad: Gradient values for `var`.
      var: Variable to update.
      indices: Forwarded to the `_assign_add`, `_gather` and `_assign_sub`
        helpers. NOTE(review): presumably the row indices of a sparse
        gradient, with None meaning a dense update - confirm.

    Returns:
      An op grouping the variable update and the final "m" update.
    """
    # Cast all hyperparameters to the variable's base dtype.
    lr = tf.cast(self._learning_rate_tensor, var.dtype.base_dtype)
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_t = tf.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = tf.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = tf.cast(self._epsilon_t, var.dtype.base_dtype)

    # m_t = beta1 * m + (1 - beta1) * g_t
    # Done in two steps: scale the whole slot first, then add the new term.
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = tf.assign(m, m * beta1_t, use_locking=self._use_locking)
    with tf.control_dependencies([m_t]):
      m_t = self._assign_add(m, updates=m_scaled_g_values, indices=indices)
    m_gathered = self._gather(m_t, indices=indices)

    # Also see tf.nn.moments.
    variance = tf.squared_difference(grad, m_gathered)

    # v_t = beta2 * v + (1 - beta2) * variance
    v_scaled_new_values = variance * (1 - beta2_t)
    v_t = tf.assign(v, v * beta2_t, use_locking=self._use_locking)
    with tf.control_dependencies([v_t]):
      v_t = self._assign_add(v, updates=v_scaled_new_values, indices=indices)
    v_gathered = self._gather(v_t, indices=indices)

    # Scale the step by the ratio of averaged to current variance, capped at 1.
    factor = v_gathered / (variance + epsilon_t)
    update = lr * grad * tf.minimum(factor, 1.0)
    var_update = self._assign_sub(ref=var, updates=update, indices=indices)
    # The "v" update is reached through var_update's data dependency on
    # v_gathered.
    return tf.group(*[var_update, m_t])
Exemplo n.º 14
0
  def testAssertIntegerForm(self):
    """assert_integer_form accepts integer-valued input and rejects others."""
    # Integer-valued although written with a float: must pass the assertion.
    accepted = [1., 5, 10, 15, 20]
    # Each of these has a non-integer first component and must be rejected.
    rejected = [
        [1.1, 5, 10, 15, 20],
        # Off from an integer by 1e-4.
        [1.0001, 5, 10, 15, 20],
        # Off from an integer by only 1e-8; still expected to be rejected.
        [1e-8, 5, 10, 15, 20],
    ]
    with self.test_session():
      with tf.control_dependencies(
          [distribution_util.assert_integer_form(accepted)]):
        tf.identity(accepted).eval()

      for values in rejected:
        with self.assertRaisesOpError("x has non-integer components"):
          with tf.control_dependencies(
              [distribution_util.assert_integer_form(values)]):
            tf.identity(values).eval()
Exemplo n.º 15
0
  def _define_step(self, done, score, summary):
    """Combine operations of a phase.

    Keeps track of the mean score and when to report it.

    Args:
      done: Tensor indicating whether current score can be used.
      score: Tensor holding the current, possibly intermediate, score.
      summary: Tensor holding summary string to write if not an empty string.

    Returns:
      Tuple of summary tensor, mean score, new global step, and the number of
      steps made in this call. The mean score is zero for non reporting
      steps.
    """
    # Promote scalars to length-one vectors so gather/where work uniformly.
    if done.shape.ndims == 0:
      done = done[None]
    if score.shape.ndims == 0:
      score = score[None]
    score_mean = streaming_mean.StreamingMean((), tf.float32)
    with tf.control_dependencies([done, score, summary]):
      # Only the scores of finished entries are submitted to the running mean.
      done_score = tf.gather(score, tf.where(done)[:, 0])
      submit_score = tf.cond(tf.reduce_any(done), lambda: score_mean.submit(done_score), tf.no_op)
    with tf.control_dependencies([submit_score]):
      # `float` called without arguments yields 0.0: the value used for
      # non-reporting steps.
      mean_score = tf.cond(self._report, score_mean.clear, float)
      # The step counter advances by the length of `score`.
      steps_made = tf.shape(score)[0]
      next_step = self._step.assign_add(steps_made)
    with tf.control_dependencies([mean_score, next_step]):
      return tf.identity(summary), mean_score, next_step, steps_made
Exemplo n.º 16
0
  def _training(self):
    """Perform multiple training iterations of both policy and value baseline.

    Training on the episodes collected in the memory. Reset the memory
    afterwards. Always returns a summary string.

    The stages are chained with control dependencies so they run in order:
    update steps, penalty adjustment, memory reset, weight summaries.

    Returns:
      Summary tensor.
    """
    with tf.name_scope('training'):
      # Training requires exactly `update_every` episodes in memory.
      assert_full = tf.assert_equal(self._memory_index, self._config.update_every)
      with tf.control_dependencies([assert_full]):
        data = self._memory.data()
      (observ, action, old_mean, old_logstd, reward), length = data
      # Guard against empty episodes before using the lengths.
      with tf.control_dependencies([tf.assert_greater(length, 0)]):
        length = tf.identity(length)
      # Pass observations and rewards through their filters.
      observ = self._observ_filter.transform(observ)
      reward = self._reward_filter.transform(reward)
      update_summary = self._perform_update_steps(observ, action, old_mean, old_logstd, reward,
                                                  length)
      with tf.control_dependencies([update_summary]):
        penalty_summary = self._adjust_penalty(observ, old_mean, old_logstd, length)
      with tf.control_dependencies([penalty_summary]):
        # Empty the memory and reset its write index once training is done.
        clear_memory = tf.group(self._memory.clear(), self._memory_index.assign(0))
      with tf.control_dependencies([clear_memory]):
        weight_summary = utility.variable_summaries(tf.trainable_variables(),
                                                    self._config.weight_summaries)
        return tf.summary.merge([update_summary, penalty_summary, weight_summary])
Exemplo n.º 17
0
def train(total_loss, global_step):
    """Build the Adam training op with a fixed learning rate.

    Args:
        total_loss: Loss tensor to minimize.
        global_step: Variable counting training steps; incremented by the
            gradient application and used by the variable moving average.

    Returns:
        A no-op named 'train' that runs after the gradients were applied and
        the moving averages of the trainable variables were updated.
    """
    # Fixed learning rate (no decay schedule).
    lr = INITIAL_LEARNING_RATE
    loss_averages_op = _add_loss_summaries(total_loss)

    # Compute gradients.
    with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.AdamOptimizer(lr)
        grads = opt.compute_gradients(total_loss)
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)

    # Add histograms for gradients.
    for grad, var in grads:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)

    # Track the moving averages of all trainable variables. The update is
    # created under a dependency on the gradient application so that it
    # averages the freshly updated values rather than racing with them.
    variable_averages = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    with tf.control_dependencies([apply_gradient_op]):
        variables_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')

    return train_op
    def optimize(self, learning_rate, train_layers, global_step, source_centroid, target_centroid):
        """Build the training op and the moving-centroid updates.

        Weights/gammas and biases/betas of the conv1, conv2, fc1 and fc2
        scopes are trained by two momentum optimizers (the bias optimizer
        uses twice the learning rate). Once both optimizers have run, the
        source and target moving centroids are overwritten with the supplied
        centroids.

        Sets `self.Gregloss` (weight regularization) and `self.F_loss` (the
        combined loss being minimized).

        Args:
            learning_rate: Base learning rate.
            train_layers: Only printed for diagnostics.
            global_step: Multiplier applied to `self.Semanticloss` and
                `self.G_loss` inside the combined loss.
            source_centroid: New value for `self.source_moving_centroid`.
            target_centroid: New value for `self.target_moving_centroid`.

        Returns:
            An op grouping both centroid assignments; running it also runs
            the two optimizer steps they depend on.
        """
        # Single-argument print(...) behaves the same under Python 2 and 3.
        print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        print(train_layers)
        # Variables are selected by the second component of their name.
        var_list = [v for v in tf.trainable_variables() if v.name.split('/')[1] in ['conv1','conv2','fc1','fc2']]
        self.Gregloss = 5e-4*tf.reduce_mean([tf.nn.l2_loss(x) for x in var_list if 'weights' in x.name])

        new_weights = [v for v in var_list if 'weights' in v.name or 'gamma' in v.name]
        new_biases = [v for v in var_list if 'biases' in v.name or 'beta' in v.name]

        print('==============new_weights=======================')
        print(new_weights)
        print('==============new_biases=======================')
        print(new_biases)

        self.F_loss = self.loss+self.Gregloss+global_step*self.Semanticloss+global_step*self.G_loss
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        print('+++++++++++++++ batch norm update ops +++++++++++++++++')
        print(update_ops)
        # Optimizer steps run only after the collected update ops.
        with tf.control_dependencies(update_ops):
            train_op3 = tf.train.MomentumOptimizer(learning_rate*1.0,0.9).minimize(self.F_loss, var_list=new_weights)
            train_op4 = tf.train.MomentumOptimizer(learning_rate*2.0,0.9).minimize(self.F_loss, var_list=new_biases)

        # The centroid updates wait for both optimizer steps.
        with tf.control_dependencies([train_op3,train_op4]):
            update_sc = self.source_moving_centroid.assign(source_centroid)
            update_tc = self.target_moving_centroid.assign(target_centroid)
        return tf.group(update_sc,update_tc)
Exemplo n.º 19
0
 def _define_experience(self, agent_indices, observ, action, reward):
   """Implement the branch of experience() entered during training.

   Updates the observation and reward filters, appends the transition batch
   to the episode memory, and returns a summary that is only populated when
   `self._should_log` is true.
   """
   update_filters = tf.summary.merge(
       [self._observ_filter.update(observ),
        self._reward_filter.update(reward)])
   with tf.control_dependencies([update_filters]):
     if self._config.train_on_agent_action:
       # NOTE: Doesn't seem to change much.
       action = self._last_action
     # Stored transition: observation, action, the last mean and logstd
     # gathered for the given agents, and the reward.
     batch = (observ, action, tf.gather(self._last_mean,
                                        agent_indices), tf.gather(self._last_logstd,
                                                                  agent_indices), reward)
     append = self._episodes.append(batch, agent_indices)
   with tf.control_dependencies([append]):
     norm_observ = self._observ_filter.transform(observ)
     norm_reward = tf.reduce_mean(self._reward_filter.transform(reward))
     # `str` called without arguments yields '', i.e. an empty summary when
     # logging is disabled.
     # pylint: disable=g-long-lambda
     summary = tf.cond(
         self._should_log, lambda: tf.summary.merge([
             update_filters,
             self._observ_filter.summary(),
             self._reward_filter.summary(),
             tf.summary.scalar('memory_size', self._memory_index),
             tf.summary.histogram('normalized_observ', norm_observ),
             tf.summary.histogram('action', self._last_action),
             tf.summary.scalar('normalized_reward', norm_reward)
         ]), str)
     return summary
Exemplo n.º 20
0
def train(total_loss, global_step, learning_rate=INITIAL_LEARNING_RATE):
  """Build the momentum-SGD training op with a staircase-decayed rate.

  Args:
    total_loss: Loss tensor to minimize.
    global_step: Variable counting training steps; it drives the decay and is
      incremented when the gradients are applied.
    learning_rate: Initial learning rate before decay.

  Returns:
    A no-op named "train" that runs after the gradients were applied.
  """
  # DECAY_STEPS is the number of steps required for one decay to happen.
  decayed_lr = tf.train.exponential_decay(
      learning_rate,
      global_step,
      DECAY_STEPS,
      LEARNING_RATE_DECAY_FACTOR,
      staircase=True)
  tf.scalar_summary('learning_rate', decayed_lr)

  # Gradient computation waits for the loss itself.
  with tf.control_dependencies([total_loss]):
    optimizer = tf.train.MomentumOptimizer(decayed_lr, momentum=0.95)
    grads_and_vars = optimizer.compute_gradients(total_loss)

  # Gradient clipping, if ever wanted, would be inserted at this point.
  apply_gradient_op = optimizer.apply_gradients(
      grads_and_vars, global_step=global_step)

  for grad, var in grads_and_vars:
    if grad is None:
      continue
    print("Found gradients for: ", var.op.name)
    tf.histogram_summary(var.op.name + "/gradients", grad)

  with tf.control_dependencies([apply_gradient_op]):
    return tf.no_op(name="train")
Exemplo n.º 21
0
  def update(self, value):
    """Update the mean and variance estimates.

    The element count is incremented first; the new mean and the accumulated
    sum of squared deviations are then computed and written back to
    `self._mean` and `self._var_sum`.

    Args:
      value: Batch or single value tensor.

    Returns:
      Summary tensor.
    """
    with tf.name_scope(self._name + '/update'):
      if value.shape.ndims == self._mean.shape.ndims:
        # Add a batch dimension if necessary.
        value = value[None, ...]
      count = tf.shape(value)[0]
      # Everything below is built after the counter has been incremented.
      with tf.control_dependencies([self._count.assign_add(count)]):
        step = tf.cast(self._count, tf.float32)
        mean_delta = tf.reduce_sum(value - self._mean[None, ...], 0)
        new_mean = self._mean + mean_delta / step
        # While the count is not above one, use the first value as the mean.
        new_mean = tf.cond(self._count > 1, lambda: new_mean, lambda: value[0])
        # Product of the deviations from the old and from the new mean.
        var_delta = (value - self._mean[None, ...]) * (value - new_mean[None, ...])
        new_var_sum = self._var_sum + tf.reduce_sum(var_delta, 0)
      # Both new values are computed before either variable is overwritten.
      with tf.control_dependencies([new_mean, new_var_sum]):
        update = self._mean.assign(new_mean), self._var_sum.assign(new_var_sum)
      with tf.control_dependencies(update):
        if value.shape.ndims == 1:
          value = tf.reduce_mean(value)
        return self._summary('value', tf.reduce_mean(value))
Exemplo n.º 22
0
def train(total_loss, global_step):
    """
    Build the op that performs one training step.

    The returned op applies Adam updates for `total_loss` and refreshes the
    exponential moving averages of all trainable variables. The gradients
    are computed only after the moving averages of the losses were updated.
    Args:
        total_loss: Total loss from loss().
        global_step: Integer Variable counting the number of training steps processed.
    Returns:
        train_op: op for training.
    """
    # Moving averages over every entry of the 'losses' collection plus the total.
    ema_losses = tf.train.ExponentialMovingAverage(0.9, name='avg')
    tracked_losses = tf.get_collection('losses') + [total_loss]
    loss_averages_op = ema_losses.apply(tracked_losses)

    with tf.control_dependencies([loss_averages_op]):
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        gradients = optimizer.compute_gradients(total_loss)

    apply_gradient_op = optimizer.apply_gradients(
        gradients, global_step=global_step)

    # Moving averages over all trainable variables.
    ema_variables = tf.train.ExponentialMovingAverage(0.9999, global_step)
    variables_averages_op = ema_variables.apply(tf.trainable_variables())

    # A no-op that simply waits for both updates to finish.
    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        return tf.no_op(name='train')
Exemplo n.º 23
0
  def append(self, transitions, rows=None):
    """Append a batch of transitions to rows of the memory.

    Each transition is written at the current length of its row, after which
    the length of every addressed row is increased by one.

    Args:
      transitions: Tuple of transition quantities with batch dimension.
      rows: Episodes to append to, defaults to all.

    Returns:
      Operation.
    """
    rows = tf.range(self._capacity) if rows is None else rows
    assert rows.shape.ndims == 1
    assert_capacity = tf.assert_less(
        rows, self._capacity,
        message='capacity exceeded')
    with tf.control_dependencies([assert_capacity]):
      assert_max_length = tf.assert_less(
          tf.gather(self._length, rows), self._max_length,
          message='max length exceeded')
    with tf.control_dependencies([assert_max_length]):
      # The write position is the same for every buffer, so build the
      # (row, timestep) indices once instead of once per buffer.
      timestep = tf.gather(self._length, rows)
      indices = tf.stack([rows, timestep], 1)
      append_ops = [
          tf.scatter_nd_update(buffer_, indices, elements)
          for buffer_, elements in zip(self._buffers, transitions)]
    # Only advance the lengths once all buffers have been written.
    with tf.control_dependencies(append_ops):
      episode_mask = tf.reduce_sum(tf.one_hot(
          rows, self._capacity, dtype=tf.int32), 0)
      return self._length.assign_add(episode_mask)
Exemplo n.º 24
0
  def testCaching(self):
    """Confirm caching of control output is recalculated between calls."""
    a = tf.constant(1)
    b = tf.constant(2)
    with tf.control_dependencies([a]):
      c = tf.constant(42)

    # Counts how often the side-effect function saw each value.
    shared = {}

    def sub(t):
      shared[t] = shared.get(t, 0) + 1
      return t

    a = subscribe.subscribe(a, lambda t: tf.py_func(sub, [t], [t.dtype]))

    with tf.control_dependencies([b]):
      d = tf.constant(11)

    # If it was using outdated cached control_outputs then
    # evaling would not trigger the new subscription.
    b = subscribe.subscribe(b, lambda t: tf.py_func(sub, [t], [t.dtype]))

    with self.test_session() as sess:
      c_out = sess.run([c])
      d_out = sess.run([d])

    # assertEqual rather than the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(c_out, [42])
    self.assertEqual(d_out, [11])
    self.assertEqual(shared, {2: 1, 1: 1})
Exemplo n.º 25
0
    def body(i, xs_copy, logprob_prev, grads_prev):
        """Build one iteration: leapfrog proposal, accept/reject, variable update.

        Reads `xs`, `epsilon`, `lmin`, `lmax` and `logprob_grads_fn` from the
        enclosing scope.
        NOTE(review): presumably used as the body of a `tf.while_loop` -- confirm
        at the call site, which is not visible here.

        Args:
            i: Iteration counter; returned incremented by one.
            xs_copy: Copies of the current values of `xs` (assumed to be a list
                of tensors -- TODO confirm against `_copy_variables`).
            logprob_prev: Log-probability carried over from the previous iteration.
            grads_prev: Gradients carried over from the previous iteration.

        Returns:
            Tuple `(i + 1, xs_out_copy, logprob_out, grads_out)`.
        """
        ps_init = _init_ps(xs_copy)
        ps = _update_ps(ps_init, grads_prev, epsilon, coeff=+0.5)
        # Random number of leapfrog iterations, drawn from [lmin, lmax).
        max_iters = tf.random_uniform((), minval=lmin, maxval=lmax, dtype=tf.int32)

        # Force the sampled step count and momenta to exist before the leapfrog.
        dep_list = _flat([max_iters], ps, ps_init)
        with tf.control_dependencies(dep_list):
            leapfrog_result = _leapfrog_step(xs, ps, epsilon, max_iters, logprob_grads_fn)
            proceed, xs_new, ps_new, logprob_new, grads_new = leapfrog_result
            # `dep_list` is rebound here; the two closures below are only called
            # afterwards (by tf.cond), so they see this second value.
            dep_list = _flat([proceed], [logprob_new], xs_new, ps_new, grads_new)

            def standard_proposal():
                # Branch taken when `proceed` is true.
                with tf.control_dependencies(dep_list):
                    return _reject_accept_proposal(
                        xs_new, xs_copy, ps_new, ps_init,
                        logprob_new, logprob_prev,
                        grads_new, grads_prev, epsilon)

            def premature_reject():
                # Branch taken when `proceed` is false: only the previous
                # state is passed on.
                with tf.control_dependencies(dep_list):
                    return _premature_reject(
                        xs_copy, logprob_prev, grads_prev)

            xs_out, logprob_out, grads_out = tf.cond(proceed,
                                                     standard_proposal,
                                                     premature_reject,
                                                     strict=True)

            # Write the chosen values back into `xs`, then copy the results so
            # the returned tensors are produced after the assignment.
            xs_assign = _assign_variables(xs, xs_out)
            with tf.control_dependencies(xs_assign):
                xs_out_copy = _copy_variables(xs_assign)
                with tf.control_dependencies(xs_copy):
                    return i + 1, xs_out_copy, logprob_out, grads_out
Exemplo n.º 26
0
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply the gradients and tune the hyperparameters with YellowFin.

    Pairs whose gradient is None are dropped. The remaining gradients are
    clipped by global norm when a clipping threshold is set, and are then
    handed to the wrapped momentum optimizer. The YellowFin preparation,
    tuning and step-increment ops are chained behind that update.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name:  Optional name for the returned operation. Default to the
        name passed to the Optimizer constructor.

    Returns:
        (A group of operations)
        Variable Update with Momentum ops,
        YellowFin ops(Curvature, Variance, Distance) ops,
        SingleStep and lr_mu tuning ops,
        Step increment ops.
    """
    usable_pairs = [
        (grad, var) for grad, var in grads_and_vars if grad is not None]
    self._grad, self._vars = zip(*usable_pairs)

    # Variable update with momentum, optionally preceded by clipping.
    with tf.variable_scope("apply_updates"):
      if self._clip_thresh_var is not None:
        self._grad, _ = tf.clip_by_global_norm(
            self._grad, self._clip_thresh_var)
      apply_grad_op = self._momentum_optimizer.apply_gradients(
          zip(self._grad, self._vars),
          global_step=global_step,
          name=name)

    # Learning rate and momentum tuning starts here.
    with tf.variable_scope("prepare_yellowFin_variables"):
      # Ideally this would only wait for the (clipped) gradients, but
      # control_dependencies does not support the IndexedSlices used for
      # sparse gradients. Waiting for the whole update instead may be
      # slightly slower because less can run in parallel.
      with tf.control_dependencies([apply_grad_op,]):
        prepare_variables_op = self._prepare_variables()

    with tf.variable_scope("yellowfin"):
      with tf.control_dependencies([prepare_variables_op]):
        yellowfin_op = self._yellowfin()

    # The YellowFin step counter advances last.
    with tf.control_dependencies([yellowfin_op]):
      self._increment_step_op = tf.assign_add(self._step, 1).op

    all_ops = (apply_grad_op, prepare_variables_op, yellowfin_op,
               self._increment_step_op)
    return tf.group(*all_ops)
Exemplo n.º 27
0
  def test_train_skip_train_if_max_step_already_saved(self):
    """Train to max_steps twice; the saved global step must be 10 both times."""

    def train_and_check_saved_step():
      # Fresh graph and session for every run; the output directory is shared.
      with tf.Graph().as_default() as g, self.test_session(g):
        with tf.control_dependencies(self._build_inference_graph()):
          train_op = tf.assign_add(tf.contrib.framework.get_global_step(), 1)
        learn.graph_actions._monitored_train(  # pylint: disable=protected-access
            g,
            output_dir=self._output_dir,
            train_op=train_op,
            loss_op=tf.constant(2.0),
            max_steps=10)
        saved_step = checkpoints.load_variable(
            self._output_dir, tf.contrib.framework.get_global_step().name)
        self.assertEqual(10, saved_step)

    # First run reaches step 10 and saves it.
    train_and_check_saved_step()
    # Second run starts from the same output directory and must still
    # report step 10.
    train_and_check_saved_step()
Exemplo n.º 28
0
  def replace(self, episodes, length, rows=None):
    """Overwrite whole episodes and their stored lengths.

    Args:
      episodes: Tuple of transition quantities with batch and time dimensions.
      length: Batch of sequence lengths.
      rows: Episodes to replace, defaults to all.

    Returns:
      Operation.
    """
    if rows is None:
      rows = tf.range(self._capacity)
    assert rows.shape.ndims == 1
    capacity_check = tf.assert_less(
        rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([capacity_check]):
      length_check = tf.assert_less_equal(
          length, self._max_length, message='max length exceeded')
    # Buffers are only written once both assertions have passed.
    with tf.control_dependencies([length_check]):
      buffer_updates = [
          tf.scatter_update(buffer_, rows, elements)
          for buffer_, elements in zip(self._buffers, episodes)]
    # The lengths are updated after all buffers were replaced.
    with tf.control_dependencies(buffer_updates):
      return tf.scatter_update(self._length, rows, length)
Exemplo n.º 29
0
  def _dist_to_opt(self):
    """Build the ops estimating the distance to the optimum.

    Stores the intermediate tensors on the instance (`_grad_norm`,
    `_grad_norm_avg`, `_d_t`, `_dist_to_opt_avg`).

    Returns:
      D_t ops
    """
    # Moving average of the gradient norm.
    self._grad_norm = tf.sqrt(self._grad_norm_squared)
    norm_avg_op = self._moving_averager.apply([self._grad_norm,])
    with tf.control_dependencies([norm_avg_op]):
      self._grad_norm_avg = self._moving_averager.average(self._grad_norm)
      # Distance estimate for this single iteration: averaged norm divided
      # by the averaged squared norm.
      self._d_t = self._grad_norm_avg / self._grad_norm_squared_avg

    # Moving average of the per-iteration distance.
    dist_avg_op = self._moving_averager.apply([self._d_t])
    with tf.control_dependencies([dist_avg_op]):
      self._dist_to_opt_avg = tf.identity(
          self._moving_averager.average(self._d_t))
      if self._sparsity_debias:
        self._dist_to_opt_avg = (
            self._dist_to_opt_avg / tf.sqrt(self._sparsity_avg))

    return [norm_avg_op, dist_avg_op]  # D_t
Exemplo n.º 30
0
  def get_best(self, n):
    """Return the indices and values of the n highest scores in the TopN."""

    def refresh_shortlist():
      """Rebuild the shortlist from the highest scores in id_to_score."""
      top_scores, top_ids = tf.nn.top_k(self.id_to_score, self.shortlist_size)
      lowest_score = tf.reduce_min(top_scores)
      # Count the entries whose score is above the float32 minimum.
      is_filled = tf.greater(top_scores, tf.float32.min)
      filled_count = tf.reduce_sum(tf.to_int32(is_filled))
      # Slot 0 of each shortlist tensor carries metadata: the length for
      # the ids and the smallest score for the scores.
      ids_update = self.sl_ids.assign(
          tf.to_int64(tf.concat_v2([[filled_count], top_ids], 0)))
      scores_update = self.sl_scores.assign(
          tf.concat_v2([[lowest_score], top_scores], 0))
      self.last_ops = [ids_update, scores_update]
      return tf.group(ids_update, scores_update)

    # The shortlist only has to be refreshed when n exceeds its current
    # size, which is kept in sl_ids[0].
    with tf.control_dependencies(self.last_ops):
      maybe_refresh = tf.cond(
          n > self.sl_ids[0], refresh_shortlist, tf.no_op)
      with tf.control_dependencies([maybe_refresh]):
        shortlist_length = tf.to_int32(self.sl_ids[0])
        topk_values, topk_indices = tf.nn.top_k(
            self.sl_scores, tf.minimum(n, shortlist_length))
        # top_k yields positions inside the shortlist; map them back to
        # the indices into id_to_score before returning.
        return tf.gather(self.sl_ids, topk_indices), topk_values
Exemplo n.º 31
0
def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5):
    """Performs box voting as described in Gidaris and Komodakis, ICCV 2015.

    Follows 'Object detection via a multi-region & semantic segmentation-aware
    CNN model'. For each box 'B' in selected_boxes, the set 'S' is made of the
    boxes in pool_boxes whose iou overlap with B is greater than iou_thresh.
    The location of B becomes the score-weighted average location of the
    boxes in S, and the score of B becomes the average score of the boxes
    in S.

    Args:
        selected_boxes: BoxList containing a subset of boxes in pool_boxes.
            These boxes are usually selected from pool_boxes using non max
            suppression.
        pool_boxes: BoxList containing a set of (possibly redundant) boxes.
        iou_thresh: (float scalar) iou threshold for matching boxes in
            selected_boxes and pool_boxes.

    Returns:
        BoxList containing averaged locations and scores for each box in
        selected_boxes.

    Raises:
        ValueError: if
            a) iou_thresh is not in [0, 1].
            b) selected_boxes or pool_boxes is not a BoxList.
            c) pool_boxes does not have a scores field.
    """
    # Argument validation; the order decides which error is reported first.
    threshold_in_range = 0.0 <= iou_thresh <= 1.0
    if not threshold_in_range:
        raise ValueError('iou_thresh must be between 0 and 1')
    if not isinstance(selected_boxes, box_list.BoxList):
        raise ValueError('selected_boxes must be a BoxList')
    if not isinstance(pool_boxes, box_list.BoxList):
        raise ValueError('pool_boxes must be a BoxList')
    if not pool_boxes.has_field('scores'):
        raise ValueError('pool_boxes must have a \'scores\' field')

    # match_indicator[i, j] is 1.0 when pool box j votes for selected box i.
    overlaps = iou(selected_boxes, pool_boxes)
    match_indicator = tf.cast(
        tf.greater(overlaps, iou_thresh), dtype=tf.float32)
    num_matches = tf.reduce_sum(match_indicator, 1)
    # TODO(kbanoop): Handle the case where some boxes in selected_boxes do not
    # match to any boxes in pool_boxes. For such boxes without any matches, we
    # should return the original boxes without voting.
    every_box_matched = tf.reduce_all(tf.greater(num_matches, 0))
    match_assert = tf.Assert(every_box_matched, [
        'Each box in selected_boxes must match with at least one box '
        'in pool_boxes.'
    ])

    scores = tf.expand_dims(pool_boxes.get_field('scores'), 1)
    scores_non_negative = tf.reduce_all(tf.greater_equal(scores, 0))
    scores_assert = tf.Assert(scores_non_negative,
                              ['Scores must be non negative.'])

    # Both runtime checks must pass before the scores are summed.
    with tf.control_dependencies([scores_assert, match_assert]):
        sum_scores = tf.matmul(match_indicator, scores)
    averaged_scores = tf.reshape(sum_scores, [-1]) / num_matches

    weighted_locations = tf.matmul(match_indicator, pool_boxes.get() * scores)
    box_locations = weighted_locations / sum_scores

    averaged_boxes = box_list.BoxList(box_locations)
    _copy_extra_fields(averaged_boxes, selected_boxes)
    averaged_boxes.add_field('scores', averaged_scores)
    return averaged_boxes
Exemplo n.º 32
0
def sac_n_step(env_name, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
               steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
               polyak=0.995, lr=1e-3, alpha=0.2,
               n_step=5, batch_size=100, start_steps=10000,
               without_delay_train=False,
               max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """
    Train a soft actor-critic agent using n-step returns for the Q target.

    Args:
        env_name : Name of the environment, passed to ``gym.make`` to create
            both the training and the test environment.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC. The dict is copied before the
            ``action_space`` entry is added, so the caller's dict is not
            modified.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        n_step (int): Number of reward steps summed in the Q backup before
            bootstrapping from the target value network.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        without_delay_train (bool): If True, run one update after every
            environment step (once ``t > batch_size``). If False, run
            ``ep_len`` updates at the end of each trajectory.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function. Currently unused because
            model saving is commented out below.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture.
    # Work on a copy so neither the caller's dict nor the shared default
    # argument is mutated.
    ac_kwargs = dict(ac_kwargs)
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph = core.placeholders(obs_dim, act_dim, obs_dim)
    # Rewards and done flags are fed as 2-D batches; the second dimension is
    # left unspecified in the placeholder shape.
    r_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    d_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    n_step_ph = tf.placeholder(dtype=tf.float32, shape=())

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts)


    # Targets for Q and V regression.
    # The 1-step form would be:
    #   q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ)
    # Here the rewards are discounted and summed along axis 1, masked by the
    # first n_step columns of d_ph, and the bootstrap term uses column n_step
    # of d_ph.
    q_backup = tf.stop_gradient(
        tf.reduce_sum(tf.multiply(tf.pow(gamma, tf.range(0, n_step_ph))
                                  * (1 - tf.slice(d_ph, [0, 0], [batch_size, n_step])), r_ph), axis=1)
        + gamma ** n_step_ph * (1 - tf.reshape(tf.slice(d_ph, [0, n_step], [batch_size, 1]), [-1])) * v_targ)
    v_backup = tf.stop_gradient(q1_pi - alpha * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v) ** 2)
    value_loss = q1_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, v_loss, q1, v, logp_pi,
                train_pi_op, train_value_op, target_update]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'v': v})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    def test_agent(n=10):
        # Runs n evaluation episodes with the deterministic policy and logs
        # their return and length.
        for _ in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def run_update():
        # One gradient step on a freshly sampled n-step batch, with logging.
        batch = replay_buffer.sample_batch_n_step(batch_size, n_step=n_step)
        feed_dict = {x_ph: batch['obs1'],
                     x2_ph: batch['obs2'],
                     a_ph: batch['acts'],
                     r_ph: batch['rews'],
                     d_ph: batch['done'],
                     n_step_ph: n_step
                     }
        outs = sess.run(step_ops, feed_dict)
        logger.store(LossPi=outs[0], LossQ1=outs[1],
                     LossV=outs[2], Q1Vals=outs[3],
                     VVals=outs[4], LogPi=outs[5])

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # Per-step training mode: one update after every environment step.
        if t > batch_size and without_delay_train:
            run_update()

        if d or (ep_len == max_ep_len):
            # Delayed training mode: perform all SAC updates at the end of
            # the trajectory. This is a slight difference from the SAC
            # specified in the original paper.
            if not without_delay_train:
                for _ in range(ep_len):
                    run_update()

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # # Save model
            # if (epoch % save_freq == 0) or (epoch == epochs - 1):
            #     logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Exemplo n.º 33
0
def train():
    """Build the training graph and run the training loop.

    The graph is placed on the GPU selected by FLAGS.GPU. The loss is the
    network's 'DICE' total loss plus the L2 regularization term, and the
    training op is built behind the UPDATE_OPS collection. Every
    `print_interval` steps the losses are printed and the summaries are
    written; every `checkpoint_interval` steps a checkpoint named after the
    current epoch is saved under FLAGS.train_dir + FLAGS.RunInfo.
    """

    # Makes this the default graph where all ops will be added
    with tf.Graph().as_default(), tf.device('/gpu:' + str(FLAGS.GPU)):

        # Load the images and labels.
        data, _ = network.inputs(skip=True)

        # Define phase of training
        phase_train = tf.placeholder(tf.bool)

        # Perform the forward pass:
        logits, l2loss = network.forward_pass_res(data['image_data'],
                                                  phase_train=phase_train)

        # Calculate loss
        SCE_loss = network.total_loss(logits,
                                      data['label_data'],
                                      loss_type='DICE')

        # Add the L2 regularization loss
        loss = tf.add(SCE_loss, l2loss, name='TotalLoss')

        # Update the moving average batch norm ops
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # Retrieve the training operation with the applied gradients
        with tf.control_dependencies(extra_update_ops):
            train_op = network.backward_pass(loss)

        # -------------------  Housekeeping functions  ----------------------

        # Merge the summaries
        all_summaries = tf.summary.merge_all()

        # Initialize variables operation
        var_init = tf.group(tf.global_variables_initializer(),
                            tf.local_variables_initializer())

        # Restore moving average of the variables
        var_ema = tf.train.ExponentialMovingAverage(FLAGS.moving_avg_decay)

        # Define variables to restore
        var_restore = var_ema.variables_to_restore()

        # Initialize the saver
        saver = tf.train.Saver(var_restore, max_to_keep=4)

        # -------------------  Session Initializer  ----------------------

        # Set the intervals. Both intervals are used as modulo divisors
        # below, so they are clamped to at least 1: a value truncated to 0
        # would otherwise raise ZeroDivisionError on the first step.
        max_steps = int(
            (FLAGS.epoch_size / FLAGS.batch_size) * FLAGS.num_epochs)
        print_interval = max(1, int(
            (FLAGS.epoch_size / FLAGS.batch_size) * FLAGS.print_interval))
        checkpoint_interval = max(1, int(
            (FLAGS.epoch_size / FLAGS.batch_size) * FLAGS.checkpoint_interval))
        print('Max Steps: %s, Print Interval: %s, Checkpoint: %s' %
              (max_steps, print_interval, checkpoint_interval))

        # Allow memory placement growth
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as mon_sess:

            # Initialize the variables
            mon_sess.run(var_init)

            # Initialize the handle to the summary writer in our training directory
            summary_writer = tf.summary.FileWriter(
                FLAGS.train_dir + FLAGS.RunInfo, mon_sess.graph)

            # Accumulated training time since the last console print
            timer = 0

            try:
                # Use slim to handle queues:
                with slim.queues.QueueRunners(mon_sess):
                    for i in range(max_steps):

                        # Run and time an iteration
                        start = time.time()
                        mon_sess.run(train_op, feed_dict={phase_train: True})
                        timer += (time.time() - start)

                        # Calculate current epoch
                        epoch = int((i * FLAGS.batch_size) / FLAGS.epoch_size)

                        # Console and Tensorboard print interval
                        if i % print_interval == 0:

                            # First retrieve the loss values, scaled to
                            # parts per million for printing
                            l2, sce, tot = mon_sess.run(
                                [l2loss, SCE_loss, loss],
                                feed_dict={phase_train: True})
                            tot *= 1e6
                            l2 *= 1e6
                            sce *= 1e6

                            # Get timing stats
                            elapsed = timer / print_interval
                            timer = 0

                            # Now print the loss values
                            print('-' * 70)
                            print(
                                'Epoch: %s, Time: %.1f sec, L2 Loss (ppm): %.4f, Prediction Loss (ppm): %.4f, Total Loss (ppm): %.4f, Eg/s: %.4f, Seconds Per: %.4f'
                                % (epoch, elapsed, l2, sce, tot, FLAGS.batch_size /
                                   elapsed, elapsed / FLAGS.batch_size))

                            # Run a session to retrieve our summaries
                            summary = mon_sess.run(all_summaries,
                                                   feed_dict={phase_train: True})

                            # Add the summaries to the protobuf for Tensorboard
                            summary_writer.add_summary(summary, i)

                        if i % checkpoint_interval == 0:

                            print(
                                '-' * 70, '\nSaving... GPU: %s, File:%s' %
                                (FLAGS.GPU, FLAGS.RunInfo[:-1]))

                            # Define the checkpoint file inside the run directory
                            checkpoint_name = ('Epoch_%s' % epoch)
                            checkpoint_file = os.path.join(
                                FLAGS.train_dir + FLAGS.RunInfo, checkpoint_name)

                            # Save the checkpoint
                            saver.save(mon_sess, checkpoint_file)
            finally:
                # Flush pending summaries and release the event file, even
                # if training stops with an error.
                summary_writer.close()
Exemplo n.º 34
0
def main(_):
    """Train a model on audio fingerprints, then validate and test it.

    Builds the graph through ``models.create_model``, trains it with plain
    gradient descent using the staged learning-rate schedule described by
    ``FLAGS.how_many_training_steps`` / ``FLAGS.learning_rate``, evaluates on
    the 'validation' partition every ``FLAGS.eval_step_interval`` steps,
    saves a checkpoint every ``FLAGS.save_step_interval`` steps, and finally
    logs the accuracy and confusion matrix for the 'testing' partition.

    Args:
        _: Unused positional argument.

    Raises:
        Exception: If ``--how_many_training_steps`` and ``--learning_rate``
            do not contain the same number of comma-separated entries.
    """
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)
    fingerprint_size = model_settings['fingerprint_size']
    # Open question from the original author: what exactly is fingerprint_size?
    # It is one of the values produced by prepare_model_settings above; here it
    # is used as the width of the model's input placeholder.
    label_count = model_settings['label_count']

    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
    # Range to randomly shift the training audio by in time.
    # The flag is given in milliseconds and converted to a sample count here.
    # Original author's note: this can be thought of as a preprocessing /
    # augmentation step, loosely comparable to pitch shifting.

    # Figure out the learning rates for each training phase. Since it's often
    # effective to have high learning rates at the start of training, followed by
    # lower levels towards the end, the number of steps and learning rates can be
    # specified as comma-separated lists to define the rate at each stage. For
    # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
    # will run 13,000 training loops in total, with a rate of 0.001 for the first
    # 10,000, and 0.0001 for the final 3,000.
    training_steps_list = list(
        map(int, FLAGS.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' %
            (len(training_steps_list), len(learning_rates_list)))

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits, dropout_prob = models.create_model(fingerprint_input,
                                               model_settings,
                                               FLAGS.model_architecture,
                                               is_training=True)
    # create_model receives the input tensor, the model settings and the name
    # of the architecture to build (conv, ...).

    # Define loss and optimizer
    ground_truth_input = tf.placeholder(tf.int64, [None],
                                        name='groundtruth_input')

    # Optionally we can add runtime checks to spot when NaNs or other symptoms of
    # numerical errors start occurring during training.
    control_dependencies = []
    if FLAGS.check_nans:
        checks = tf.add_check_numerics_ops()
        control_dependencies = [checks]

    # Create the back propagation and training evaluation machinery in the graph.
    with tf.name_scope('cross_entropy'):
        cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
            labels=ground_truth_input, logits=logits)
    tf.summary.scalar('cross_entropy', cross_entropy_mean)
    with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
        # The learning rate is a placeholder so that the staged schedule can
        # feed a different value at each step.
        learning_rate_input = tf.placeholder(tf.float32, [],
                                             name='learning_rate_input')
        train_step = tf.train.GradientDescentOptimizer(
            learning_rate_input).minimize(cross_entropy_mean)
    predicted_indices = tf.argmax(logits, 1)
    correct_prediction = tf.equal(predicted_indices, ground_truth_input)
    confusion_matrix = tf.confusion_matrix(ground_truth_input,
                                           predicted_indices,
                                           num_classes=label_count)
    # Accuracy of the fed batch: mean of the per-example correctness flags.
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    # The global step is incremented by an explicit assign op that is run
    # alongside train_step in the training loop below.
    global_step = tf.train.get_or_create_global_step()
    increment_global_step = tf.assign(global_step, global_step + 1)

    saver = tf.train.Saver(tf.global_variables())

    # Merge all the summaries and write them out under FLAGS.summaries_dir
    # ('/train' and '/validation' sub-directories).
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                              '/validation')
    tf.global_variables_initializer().run()
    start_step = 1

    # When resuming, continue counting from the restored global step.
    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
        start_step = global_step.eval(session=sess)

    tf.logging.info('Training from step: %d ', start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         FLAGS.model_architecture + '.pbtxt')

    # Save list of words.
    with gfile.GFile(
            os.path.join(FLAGS.train_dir,
                         FLAGS.model_architecture + '_labels.txt'), 'w') as f:
        f.write('\n'.join(audio_processor.words_list))

    # The main roles of the tf.gfile module are:
    # 1. To provide an API that is close to Python's file objects.
    # 2. To provide an implementation based on TensorFlow's C++ FileSystem API.

    # The C++ FileSystem API supports multiple file system implementations,
    # including local files, Google Cloud Storage, and HDFS. TensorFlow uses
    # these implementations for saving and loading checkpoints, writing
    # TensorBoard logs, and accessing training data.
    # However, if all of your files are local, you can use the regular Python
    # file API without any problem: gfile is only needed for less-conventional
    # filesystems.

    # Training loop.
    training_steps_max = np.sum(training_steps_list)
    for training_step in xrange(start_step, training_steps_max + 1):
        # Note on xrange (presumably the Python 2 builtin or an alias such as
        # six.moves.xrange; the import is not visible here): unlike a list
        # produced by Python 2's range(), it does not load every value into
        # memory at once but yields each value as it is accessed. List
        # convenience methods are unavailable, but for sequential or
        # index-based access over a large range it is far more memory-efficient.

        # Figure out what the current learning rate is: walk the cumulative
        # step counts and pick the rate of the first stage that contains
        # training_step.
        training_steps_sum = 0
        for i in range(len(training_steps_list)):
            training_steps_sum += training_steps_list[i]
            if training_step <= training_steps_sum:
                learning_rate_value = learning_rates_list[i]
                break
        # Pull the audio samples we'll use for training.
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
            FLAGS.background_volume, time_shift_samples, 'training', sess)
        # Run the graph with this batch of training data.
        # NOTE(review): dropout_prob is fed 0.5 here and 1.0 during evaluation,
        # which suggests it is a keep-probability; confirm in models.create_model.
        train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
            [
                merged_summaries, evaluation_step, cross_entropy_mean,
                train_step, increment_global_step
            ],
            feed_dict={
                fingerprint_input: train_fingerprints,
                ground_truth_input: train_ground_truth,
                learning_rate_input: learning_rate_value,
                dropout_prob: 0.5
            })
        train_writer.add_summary(train_summary, training_step)
        tf.logging.info(
            'Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
            (training_step, learning_rate_value, train_accuracy * 100,
             cross_entropy_value))
        is_last_step = (training_step == training_steps_max)
        # Validate every eval_step_interval steps and always on the last step.
        if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
            set_size = audio_processor.set_size('validation')
            total_accuracy = 0
            total_conf_matrix = None
            for i in xrange(0, set_size, FLAGS.batch_size):
                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(FLAGS.batch_size, i,
                                             model_settings, 0.0, 0.0, 0,
                                             'validation', sess))
                # Run a validation step and capture the summaries for
                # TensorBoard with the `merged_summaries` op.
                validation_summary, validation_accuracy, conf_matrix = sess.run(
                    [merged_summaries, evaluation_step, confusion_matrix],
                    feed_dict={
                        fingerprint_input: validation_fingerprints,
                        ground_truth_input: validation_ground_truth,
                        dropout_prob: 1.0
                    })
                validation_writer.add_summary(validation_summary,
                                              training_step)
                # The last batch may be smaller than FLAGS.batch_size, so each
                # batch accuracy is weighted by its share of the whole set.
                batch_size = min(FLAGS.batch_size, set_size - i)
                total_accuracy += (validation_accuracy * batch_size) / set_size
                if total_conf_matrix is None:
                    total_conf_matrix = conf_matrix
                else:
                    total_conf_matrix += conf_matrix
            tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
            tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                            (training_step, total_accuracy * 100, set_size))

        # Save the model checkpoint periodically.
        if (training_step % FLAGS.save_step_interval == 0
                or training_step == training_steps_max):
            checkpoint_path = os.path.join(FLAGS.train_dir,
                                           FLAGS.model_architecture + '.ckpt')
            tf.logging.info('Saving to "%s-%d"', checkpoint_path,
                            training_step)
            saver.save(sess, checkpoint_path, global_step=training_step)

    # Final evaluation on the 'testing' partition, accumulated the same way as
    # the validation pass above (batch-size-weighted accuracy, summed
    # confusion matrices) but without writing summaries.
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in xrange(0, set_size, FLAGS.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
                dropout_prob: 1.0
            })
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
Exemplo n.º 35
0
    def train(self):
        """Train the GAN, then validate and checkpoint after every epoch.

        For each of ``F.epoch`` epochs this method:

        1. Runs one discriminator update and one generator update per
           training batch (plus an encoder update when ``F.badGAN`` is set),
           printing the per-batch losses.
        2. Saves the current model to ``F.checkpoint_dir``.
        3. Predicts every validation patch, stitches the patches back into
           144 x 192 x 256 volumes and prints the per-class F1 score.
        4. Saves the model to ``F.best_checkpoint_dir`` whenever the sum of
           the class-2 and class-3 scores (printed as GM and WM) improves.
        5. Appends the epoch's average losses to ``Val_loss_GAN.txt`` and
           the ``Train_loss_*.txt`` files in the current working directory.

        Returns:
            None.
        """
        # Instantiate the dataset class
        data = dataset_badGAN(
            num_classes=F.num_classes,
            extraction_step=self.extraction_step,
            number_images_training=F.number_train_images,
            batch_size=F.batch_size,
            patch_shape=self.patch_shape,
            number_unlab_images_training=F.number_train_unlab_images,
            data_directory=F.data_directory)

        # Optimizer operations. They are created under a control dependency on
        # the UPDATE_OPS collection so those ops run with every training step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            d_optim = tf.train.AdamOptimizer(
                F.learning_rate_D, beta1=F.beta1D).minimize(
                    self.d_loss, var_list=self.d_vars)
            g_optim = tf.train.AdamOptimizer(
                F.learning_rate_G, beta1=F.beta1G).minimize(
                    self.g_loss, var_list=self.g_vars)
            if F.badGAN:
                # NOTE(review): the encoder step also minimizes g_loss (over
                # e_vars); confirm that this is the intended objective.
                e_optim = tf.train.AdamOptimizer(
                    F.learning_rate_E, beta1=F.beta1E).minimize(
                        self.g_loss, var_list=self.e_vars)

        tf.global_variables_initializer().run()

        # Load checkpoints if required
        if F.load_chkpt:
            try:
                load_model(F.checkpoint_dir, self.sess, self.saver)
                print("\n [*] Checkpoint loaded succesfully!")
            except Exception as err:
                # Loading stays best-effort (training continues from the fresh
                # initialization), but KeyboardInterrupt/SystemExit are no
                # longer swallowed and the failure reason is reported.
                print("\n [!] Checkpoint loading failed!")
                print(" [!] Reason: %r" % (err,))
        else:
            print("\n [*] Checkpoint load not required.")

        # Load the validation data
        patches_val, labels_val_patch, labels_val = preprocess_dynamic_lab(
            F.data_directory,
            F.num_classes,
            self.extraction_step,
            self.patch_shape,
            F.number_train_images,
            validating=F.training,
            testing=F.testing,
            num_images_testing=F.number_test_images)

        # Buffer for the predicted label of every validation patch; it is
        # overwritten in place on every epoch.
        predictions_val = np.zeros((patches_val.shape[0], self.patch_shape[0],
                                    self.patch_shape[1], self.patch_shape[2]),
                                   dtype="uint8")
        # Best GM + WM score seen so far; decides when to save the best model.
        max_par = 0.0
        for epoch in xrange(int(F.epoch)):
            idx = 0
            batch_iter_train = data.batch_train()
            total_val_loss = 0
            total_train_loss_CE = 0
            total_train_loss_UL = 0
            total_train_loss_FK = 0
            total_gen_FMloss = 0

            for patches_lab, patches_unlab, labels in batch_iter_train:
                # Network update
                sample_z_gen = np.random.uniform(
                    -1, 1, [F.batch_size, F.noise_dim]).astype(np.float32)

                # Full feed: used for the discriminator step and again below
                # when the losses are evaluated for reporting.
                train_feed = {
                    self.patches_lab: patches_lab,
                    self.patches_unlab: patches_unlab,
                    self.z_gen: sample_z_gen,
                    self.labels: labels,
                    self.phase: True
                }
                # The generator/encoder step is fed only the unlabeled patches
                # and the noise sample.
                gen_feed = {
                    self.patches_unlab: patches_unlab,
                    self.z_gen: sample_z_gen,
                    self.phase: True
                }

                self.sess.run(d_optim, feed_dict=train_feed)

                if F.badGAN:
                    self.sess.run([e_optim, g_optim], feed_dict=gen_feed)
                else:
                    self.sess.run(g_optim, feed_dict=gen_feed)

                # Evaluate losses for plotting/printing purposes
                d_loss_lab = self.d_loss_lab.eval(train_feed)
                d_loss_unlab_true = self.true_loss.eval(train_feed)
                d_loss_unlab_fake = self.fake_loss.eval(train_feed)
                g_loss_fm = self.g_loss_fm.eval(train_feed)

                total_train_loss_CE += d_loss_lab
                total_train_loss_UL += d_loss_unlab_true
                total_train_loss_FK += d_loss_unlab_fake
                total_gen_FMloss += g_loss_fm

                idx += 1
                if F.badGAN:
                    vi_loss = self.vi_loss.eval(train_feed)
                    print((
                        "Epoch:[%2d] [%4d/%4d] Labeled loss:%.2e Unlabeled loss:%.2e Fake loss:%.2e Generator FM loss:%.8f Generator VI loss:%.8f\n"
                    ) % (epoch, idx, data.num_batches, d_loss_lab,
                         d_loss_unlab_true, d_loss_unlab_fake, g_loss_fm,
                         vi_loss))
                else:
                    print((
                        "Epoch:[%2d] [%4d/%4d] Labeled loss:%.2e Unlabeled loss:%.2e Fake loss:%.2e Generator loss:%.8f \n"
                    ) % (epoch, idx, data.num_batches, d_loss_lab,
                         d_loss_unlab_true, d_loss_unlab_fake, g_loss_fm))

            # Save the current model
            save_model(F.checkpoint_dir, self.sess, self.saver)

            # Per-batch averages for this epoch (idx counts the batches seen).
            num_train_batches = float(idx)
            avg_train_loss_CE = total_train_loss_CE / num_train_batches
            avg_train_loss_UL = total_train_loss_UL / num_train_batches
            avg_train_loss_FK = total_train_loss_FK / num_train_batches
            avg_gen_FMloss = total_gen_FMloss / num_train_batches

            print('\n\n')

            # Only whole batches are evaluated; trailing patches that do not
            # fill a batch keep whatever predictions_val already holds.
            total_batches = int(patches_val.shape[0] / F.batch_size)
            print("Total number of batches for validation: ", total_batches)

            # Prediction of validation patches
            for batch in range(total_batches):
                lo = batch * F.batch_size
                hi = (batch + 1) * F.batch_size
                val_feed = {
                    self.patches_lab: patches_val[lo:hi, :, :, :, :],
                    self.labels: labels_val_patch[lo:hi, :, :, :],
                    self.phase: False
                }
                preds = self.Val_output.eval(val_feed)
                val_loss = self.d_loss_lab.eval(val_feed)

                predictions_val[lo:hi, :, :, :] = preds
                print(("Validated Patch:[%8d/%8d]") % (batch, total_batches))
                total_val_loss = total_val_loss + val_loss

            # To compute average patchwise validation loss (cross entropy loss)
            avg_val_loss = total_val_loss / float(total_batches)

            print("All validation patches Predicted")

            print("Shape of predictions_val, min and max:",
                  predictions_val.shape, np.min(predictions_val),
                  np.max(predictions_val))

            # To stitch back the patches into an entire image
            val_image_pred = recompose3D_overlap(predictions_val, 144, 192,
                                                 256, self.extraction_step[0],
                                                 self.extraction_step[1],
                                                 self.extraction_step[2])
            val_image_pred = val_image_pred.astype('uint8')

            print("Shape of Predicted Output Groundtruth Images:",
                  val_image_pred.shape, np.unique(val_image_pred),
                  np.unique(labels_val), np.mean(val_image_pred),
                  np.mean(labels_val))

            # Flatten predictions and ground truth to 1-D label vectors.
            pred2d = np.reshape(val_image_pred,
                                (val_image_pred.shape[0] * 144 * 192 * 256))
            lab2d = np.reshape(labels_val,
                               (labels_val.shape[0] * 144 * 192 * 256))

            # For printing the validation results
            # NOTE(review): if f1_score is sklearn.metrics.f1_score, recent
            # releases only accept `labels` as a keyword argument; confirm
            # the import and the pinned version before upgrading.
            F1_score = f1_score(lab2d, pred2d, [0, 1, 2, 3], average=None)
            print("Validation Dice Coefficient.... ")
            print("Background:", F1_score[0])
            print("CSF:", F1_score[1])
            print("GM:", F1_score[2])
            print("WM:", F1_score[3])

            # To Save the best model
            gm_wm_score = F1_score[2] + F1_score[3]
            if max_par < gm_wm_score:
                max_par = gm_wm_score
                save_model(F.best_checkpoint_dir, self.sess, self.saver)
                print("Best checkpoint updated from validation results.")

            # To save the losses for plotting: one appended line per epoch in
            # each log file.
            print("Average Validation Loss:", avg_val_loss)
            loss_logs = (
                ('Val_loss_GAN.txt', avg_val_loss),
                ('Train_loss_CE.txt', avg_train_loss_CE),
                ('Train_loss_UL.txt', avg_train_loss_UL),
                ('Train_loss_FK.txt', avg_train_loss_FK),
                ('Train_loss_FM.txt', avg_gen_FMloss),
            )
            for log_name, loss_value in loss_logs:
                with open(log_name, 'a') as f:
                    f.write('%.2e \n' % loss_value)
Exemplo n.º 36
0
    def __init__(self, config, name):
        assert name in ('validation', 'training', 'test')
        self.name = name
        logging.debug('{} - model - initialize'.format(self.name))
        self.is_training = True if self.name == 'training' else False
        self.config = config

        if not self.is_training:
            self.reinitializable_iter_for_dataset = None
        self.batch = self._gen_batch_fn()  # generate mini-batch

        with tf.name_scope(self.name):
            with tf.variable_scope('full_conv', reuse=tf.AUTO_REUSE):
                logits_stereo = self._nn_model_fn()

            logits_stereo_flattened = flatten_maybe_padded_sequences(
                maybe_padded_sequences=logits_stereo,
                lengths=tf.tile(input=self.batch['num_frames'], multiples=[2]))
            logits_left_flattened, logits_right_flattened = tf.split(
                value=logits_stereo_flattened, num_or_size_splits=2, axis=0)
            logits_minor_flattened = tf.minimum(logits_left_flattened, logits_right_flattened)
            logits_larger_flattened = tf.maximum(logits_left_flattened, logits_right_flattened)
            labels_bool_flattened = flatten_maybe_padded_sequences(
                maybe_padded_sequences=self.batch['label'], lengths=self.batch['num_frames'])
            negated_labels_bool_flattened = tf.logical_not(labels_bool_flattened)
            labels_float_flattened = tf.cast(x=labels_bool_flattened, dtype=tf.float32)
            logits_mono_flattened = tf.where(
                tf.equal(labels_bool_flattened, True), logits_minor_flattened, logits_larger_flattened)
            loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_float_flattened,
                                                           logits=logits_mono_flattened)
            loss = tf.reduce_mean(loss)

            if self.is_training:
                _update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                if _update_ops:
                    with tf.control_dependencies(_update_ops):
                        training_op = tf.train.AdamOptimizer(self.config.learning_rate).minimize(loss)
                else:
                    training_op = tf.train.AdamOptimizer(self.config.learning_rate).minimize(loss)

            pred_labels_flattened = tf.greater(logits_left_flattened + logits_right_flattened, 0.)
            negated_pred_labels_flattened = tf.logical_not(pred_labels_flattened)

            # individual and ensemble statistics for test and validation
            if not self.is_training:
                with tf.name_scope('individual_and_ensemble_stats'):
                    with tf.variable_scope(
                            '{}_local_vars'.format(self.name), reuse=tf.AUTO_REUSE):
                        individual_tps_fps_tns_fns_var = tf.get_variable(
                            name='individual_tps_fps_tns_fns',
                            shape=[len(self.config.file_names[self.name]), 4],
                            dtype=tf.int32,
                            initializer=tf.zeros_initializer,
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES]
                        )

                        acc_loss_var = tf.get_variable(
                            name='acc_loss',
                            shape=[],
                            dtype=tf.float32,
                            initializer=tf.zeros_initializer,
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES]
                        )

                        batch_counter_var = tf.get_variable(
                            name='batch_counter',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.zeros_initializer,
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES]
                        )

                    loop_var_proto = collections.namedtuple(
                        'loop_var_proto',
                        ['sample_idx', 'batch_size', 'preds', 'negated_preds',
                         'labels', 'negated_labels', 'lengths', 'me_ids'])

                    def cond_fn(loop_var):
                        return tf.less(loop_var.sample_idx, loop_var.batch_size)

                    def body_fn(loop_var):
                        start_pos = tf.reduce_sum(loop_var.lengths[:loop_var.sample_idx])
                        end_pos = start_pos + loop_var.lengths[loop_var.sample_idx]
                        cur_preds = loop_var.preds
                        negated_cur_preds = loop_var.negated_preds
                        cur_labels = loop_var.labels
                        negated_cur_labels = loop_var.negated_labels
                        cur_preds, negated_cur_preds, cur_labels, negated_cur_labels = \
                            [value[start_pos:end_pos]
                             for value in [cur_preds, negated_cur_preds, cur_labels, negated_cur_labels]]
                        tps = tf.logical_and(cur_preds, cur_labels)
                        fps = tf.logical_and(cur_preds, negated_cur_labels)
                        tns = tf.logical_and(negated_cur_preds, negated_cur_labels)
                        fns = tf.logical_and(negated_cur_preds, cur_labels)
                        tps, fps, tns, fns = \
                            [tf.reduce_sum(tf.cast(value, tf.int32)) for value in [tps, fps, tns, fns]]
                        me_id = loop_var.me_ids[loop_var.sample_idx]
                        stats_var = individual_tps_fps_tns_fns_var
                        _new_value = stats_var[me_id] + tf.convert_to_tensor([tps, fps, tns, fns])
                        _update_stats = tf.scatter_update(
                            stats_var, me_id, _new_value, use_locking=True)
                        with tf.control_dependencies([_update_stats]):
                            sample_idx = loop_var.sample_idx + 1
                        loop_var = loop_var_proto(
                            sample_idx=sample_idx,
                            batch_size=loop_var.batch_size,
                            preds=loop_var.preds,
                            negated_preds=loop_var.negated_preds,
                            labels=loop_var.labels,
                            negated_labels=loop_var.negated_labels,
                            lengths=loop_var.lengths,
                            me_ids=loop_var.me_ids
                        )

                        return [loop_var]

                    sample_idx = tf.constant(0, dtype=tf.int32)
                    cur_batch_size = tf.shape(self.batch['num_frames'])[0]
                    loop_var = loop_var_proto(
                        sample_idx=sample_idx,
                        batch_size=cur_batch_size,
                        preds=pred_labels_flattened,
                        negated_preds=negated_pred_labels_flattened,
                        labels=labels_bool_flattened,
                        negated_labels=negated_labels_bool_flattened,
                        lengths=self.batch['num_frames'],
                        me_ids=self.batch['me_id']
                    )
                    final_sample_idx = tf.while_loop(
                        cond=cond_fn,
                        body=body_fn,
                        loop_vars=[loop_var],
                        parallel_iterations=self.config.batch_size,
                        back_prop=False,
                        return_same_structure=True
                    )[0].sample_idx

                    individual_tps_fps_tns_fns_float = tf.cast(individual_tps_fps_tns_fns_var, tf.float32)
                    tps, fps, _, fns = tf.unstack(individual_tps_fps_tns_fns_float, axis=1)
                    me_wise_precisions = tps / (tps + fps + 1e-7)
                    me_wise_recalls = tps / (tps + fns + 1e-7)
                    me_wise_f1s = 2. * me_wise_precisions * me_wise_recalls / \
                                  (me_wise_precisions + me_wise_recalls + 1e-7)
                    me_wise_prfs = tf.stack([me_wise_precisions, me_wise_recalls, me_wise_f1s], axis=1)
                    assert me_wise_prfs.shape.as_list() == [len(self.config.file_names[self.name]), 3]
                    average_me_wise_prf = tf.reduce_mean(me_wise_prfs, axis=0)
                    assert average_me_wise_prf.shape.as_list() == [3]

                    # ensemble stats
                    ensemble_tps_fps_tns_fns = tf.reduce_sum(individual_tps_fps_tns_fns_var, axis=0)
                    tps, fps, _, fns = tf.unstack(tf.cast(ensemble_tps_fps_tns_fns, tf.float32))
                    en_precision = tps / (tps + fps + 1e-7)
                    en_recall = tps / (tps + fns + 1e-7)
                    en_f1 = 2. * en_precision * en_recall / (en_precision + en_recall + 1e-7)
                    batch_counter_update_op = tf.assign_add(batch_counter_var, 1)
                    acc_loss_update_op = tf.assign_add(acc_loss_var, loss)
                    ensemble_prf_and_loss = tf.convert_to_tensor(
                        [en_precision, en_recall, en_f1, acc_loss_var / tf.cast(batch_counter_var, tf.float32)])

                    update_op_after_each_batch = tf.group(
                        final_sample_idx, batch_counter_update_op, acc_loss_update_op,
                        name='grouped update ops to be run after each batch'.replace(' ', '_'))
                    stats_after_each_epoch = dict(
                        individual_tps_fps_tns_fns=individual_tps_fps_tns_fns_var,
                        individual_prfs=me_wise_prfs,
                        ensemble_tps_fps_tns_fns=ensemble_tps_fps_tns_fns,
                        ensemble_prf_and_loss=ensemble_prf_and_loss,
                        average_prf=average_me_wise_prf
                    )

            # ensemble stats for training
            if self.is_training:
                with tf.name_scope('ensemble_stats'):
                    with tf.variable_scope(
                            '{}_local_vars'.format(self.name), reuse=tf.AUTO_REUSE):
                        ensemble_tps_fps_tns_fns_var = tf.get_variable(
                            name='ensemble_tps_fps_tns_fns',
                            shape=[4],
                            dtype=tf.int32,
                            initializer=tf.zeros_initializer,
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES]
                        )
                        acc_loss_var = tf.get_variable(
                            name='acc_loss',
                            shape=[],
                            dtype=tf.float32,
                            initializer=tf.zeros_initializer,
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES]
                        )
                        batch_counter_var = tf.get_variable(
                            name='batch_counter',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.zeros_initializer,
                            trainable=False,
                            collections=[tf.GraphKeys.LOCAL_VARIABLES]
                        )

                    tps = tf.logical_and(pred_labels_flattened, labels_bool_flattened)
                    fps = tf.logical_and(pred_labels_flattened, negated_labels_bool_flattened)
                    tns = tf.logical_and(negated_pred_labels_flattened, negated_labels_bool_flattened)
                    fns = tf.logical_and(negated_pred_labels_flattened, labels_bool_flattened)
                    tps, fps, tns, fns = [tf.reduce_sum(tf.cast(value, tf.int32)) for value in [tps, fps, tns, fns]]
                    ensemble_tps_fps_tns_fns_update_op = tf.assign_add(
                        ensemble_tps_fps_tns_fns_var, tf.convert_to_tensor([tps, fps, tns, fns]))
                    acc_loss_update_op = tf.assign_add(acc_loss_var, loss)
                    batch_counter_update_op = tf.assign_add(batch_counter_var, 1)
                    ensemble_tps_fps_tns_fns_float = tf.cast(ensemble_tps_fps_tns_fns_var, tf.float32)
                    tps, fps, _, fns = tf.unstack(ensemble_tps_fps_tns_fns_float)
                    ensemble_precision = tps / (tps + fps + 1e-7)
                    ensemble_recall = tps / (tps + fns + 1e-7)
                    ensemble_f1 = 2. * ensemble_precision * ensemble_recall / \
                                  (ensemble_precision + ensemble_recall + 1e-7)
                    ensemble_loss = acc_loss_var / tf.cast(batch_counter_var, tf.float32)
                    ensemble_prf_and_loss = tf.convert_to_tensor(
                        [ensemble_precision, ensemble_recall, ensemble_f1, ensemble_loss])

                    update_op_after_each_batch = tf.group(
                        batch_counter_update_op, ensemble_tps_fps_tns_fns_update_op, acc_loss_update_op)
                    stats_after_each_epoch = dict(
                        ensemble_tps_fps_tns_fns=ensemble_tps_fps_tns_fns_var,
                        ensemble_prf_and_loss=ensemble_prf_and_loss
                    )

            # define tensorboard summaries
            with tf.name_scope('tensorboard_summary'):
                with tf.name_scope('statistics'):
                    if not self.is_training:
                        list_of_summaries = []
                        with tf.name_scope('ensemble'):
                            p, r, f, lo = tf.unstack(stats_after_each_epoch['ensemble_prf_and_loss'])
                            items_for_summary = dict(precision=p, recall=r, f1=f, average_loss=lo)
                            for item_name, item_value in items_for_summary.iteritems():
                                tmp = tf.summary.scalar(item_name, item_value)
                                list_of_summaries.append(tmp)
                        with tf.name_scope('individual'):
                            p, r, f = tf.unstack(stats_after_each_epoch['average_prf'])
                            items_for_summary = dict(precision=p, recall=r, f1=f)
                            for item_name, item_value in items_for_summary.iteritems():
                                tmp = tf.summary.scalar(item_name, item_value)
                                list_of_summaries.append(tmp)
                    else:
                        list_of_summaries = []
                        with tf.name_scope('ensemble'):
                            p, r, f, lo = tf.unstack(stats_after_each_epoch['ensemble_prf_and_loss'])
                            items_for_summary = dict(precision=p, recall=r, f1=f, average_loss=lo)
                            for item_name, item_value in items_for_summary.iteritems():
                                tmp = tf.summary.scalar(item_name, item_value)
                                list_of_summaries.append(tmp)

                    statistical_summary = tf.summary.merge(list_of_summaries)

                with tf.name_scope('images'):
                    image_summary_length = int(6 * 16000 // 512)
                    labels_uint8 = self.batch['label'][:, :image_summary_length, :]
                    labels_uint8 = tf.cast(labels_uint8, tf.uint8) * 255
                    assert labels_uint8.dtype == tf.uint8
                    labels_uint8 = labels_uint8[..., None]

                    _logits_left = tf.split(value=logits_stereo, num_or_size_splits=2, axis=0)[0]
                    logits_prob_uint8 = tf.sigmoid(_logits_left[:, :image_summary_length, :])
                    logits_prob_uint8 = tf.cast(logits_prob_uint8 * 255., tf.uint8)
                    logits_prob_uint8 = logits_prob_uint8[..., None]

                    images = tf.concat([labels_uint8, logits_prob_uint8, tf.zeros_like(labels_uint8)], axis=-1)
                    images = tf.transpose(images, [0, 2, 1, 3])
                    images.set_shape([None, 88, image_summary_length, 3])
                    image_summary = tf.summary.image('images', images)

                if self.is_training:
                    with tf.name_scope('params'):
                        var_summary_dict = dict()
                        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                            var_summary_dict[var.op.name] = tf.summary.histogram(var.op.name, var)
                        param_summary = tf.summary.merge(var_summary_dict.values())

        if self.is_training:
            op_dict = dict(
                training_op=training_op,
                tb_summary=dict(statistics=statistical_summary, image=image_summary, parameter=param_summary),
                update_op_after_each_batch=update_op_after_each_batch,
                statistics_after_each_epoch=stats_after_each_epoch
            )
        else:
            op_dict = dict(
                tb_summary=dict(statistics=statistical_summary, image=image_summary),
                update_op_after_each_batch=update_op_after_each_batch,
                statistics_after_each_epoch=stats_after_each_epoch
            )

        self.op_dict = op_dict
Exemplo n.º 37
0
    def build_trainer(self, child_model):
        """Create the loss and training op driven by the child's validation accuracy.

        The reward is ``child_model.valid_shuffle_acc / child_model.batch_size``.
        With ``self.use_critic`` a linear critic on ``self.all_h`` supplies the
        advantage and gets its own Adam training op; otherwise a moving
        baseline variable is updated with decay ``self.bl_dec`` and subtracted
        from the reward.

        Sets ``valid_acc``, ``reward``, ``baseline``, ``loss``, ``train_step``,
        ``train_op``, ``lr``, ``grad_norm`` and ``optimizer`` on ``self``.

        Args:
            child_model: object providing ``build_valid_rl()``,
                ``valid_shuffle_acc`` and ``batch_size``.
        """
        # Reward signal: fraction of correct predictions on the validation batch.
        child_model.build_valid_rl()
        num_correct = tf.to_float(child_model.valid_shuffle_acc)
        num_examples = tf.to_float(child_model.batch_size)
        self.valid_acc = num_correct / num_examples
        self.reward = self.valid_acc

        if not self.use_critic:
            # Moving baseline: baseline -= (1 - bl_dec) * (baseline - reward).
            self.sample_log_probs = tf.reduce_sum(self.sample_log_probs)
            self.baseline = tf.Variable(0.0, dtype=tf.float32, trainable=False)
            keep_fraction = 1 - self.bl_dec
            baseline_error = self.baseline - self.reward
            update_baseline = tf.assign_sub(self.baseline,
                                            keep_fraction * baseline_error)
            # Reading the reward forces the baseline update to run first.
            with tf.control_dependencies([update_baseline]):
                self.reward = tf.identity(self.reward)
            centered_reward = self.reward - self.baseline
            self.loss = self.sample_log_probs * centered_reward
        else:
            # Learned critic: linear value estimate on the stacked hidden states.
            stacked_h = tf.concat(self.all_h, axis=0)
            value_estimate = tf.matmul(stacked_h, self.w_critic)
            advantage = value_estimate - self.reward
            critic_loss = tf.reduce_sum(advantage**2)
            self.baseline = tf.reduce_mean(value_estimate)
            self.loss = -tf.reduce_mean(self.sample_log_probs * advantage)

            critic_step = tf.Variable(0,
                                      dtype=tf.int32,
                                      trainable=False,
                                      name="critic_train_step")
            critic_train_op, _lr, _norm, _opt = get_train_ops(
                critic_loss,
                [self.w_critic],
                critic_step,
                clip_mode=None,
                lr_init=1e-3,
                lr_dec_start=0,
                lr_dec_every=int(1e9),
                optim_algo="adam",
                sync_replicas=False)

        self.train_step = tf.Variable(0,
                                      dtype=tf.int32,
                                      trainable=False,
                                      name="train_step")

        # Train only this object's own variables; the critic weights are
        # handled by the separate critic op above.
        policy_vars = []
        for candidate in tf.trainable_variables():
            if not candidate.name.startswith(self.name):
                continue
            if "w_critic" in candidate.name:
                continue
            policy_vars.append(candidate)

        print("-" * 80)
        for policy_var in policy_vars:
            print(policy_var)

        trainer_kwargs = dict(
            clip_mode=self.clip_mode,
            grad_bound=self.grad_bound,
            l2_reg=self.l2_reg,
            lr_init=self.lr_init,
            lr_dec_start=self.lr_dec_start,
            lr_dec_every=self.lr_dec_every,
            lr_dec_rate=self.lr_dec_rate,
            optim_algo=self.optim_algo,
            sync_replicas=self.sync_replicas,
            num_aggregate=self.num_aggregate,
            num_replicas=self.num_replicas,
        )
        self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops(
            self.loss, policy_vars, self.train_step, **trainer_kwargs)

        if self.use_critic:
            # One step of train_op then also advances the critic.
            self.train_op = tf.group(self.train_op, critic_train_op)
def main(argv=None):
    """Train the aggregation model on EAST feature maps and PWC-Net flow.

    The function
      1. prepares the configs and the option dicts for PWC-Net and EAST,
      2. builds the multi-GPU training graph (placeholders, one tower per
         GPU through ``model_gru_agg.tower_loss``, averaged gradients, Adam),
      3. creates a session and either restores ``FLAGS.prev_checkpoint_path``
         or initialises all variables,
      4. instantiates the EAST and PWC-Net models in ``'test'`` mode, and
      5. for ``FLAGS.max_steps`` steps: fetches a batch, extracts feature
         maps with EAST and flow maps with PWC-Net, runs one training step,
         and periodically logs, checkpoints and writes summaries.

    Args:
        argv: Unused; accepted so the function can serve as an app entry
            point.
    """
    # The return value is never used; the call is kept in case it has side
    # effects.
    m_cfg = sys_cfg()
    config = get_config(FLAGS)
    config.batch_size = FLAGS.batch_size_per_gpu * FLAGS.num_gpus
    config.num_layers = 3
    config.num_steps = 5
    # Not referenced again in this function.
    eval_config = get_config(FLAGS)
    eval_config.batch_size = 2
    eval_config.num_layers = 3
    eval_config.num_steps = 5

    # ============================ I. Model options ========================== #
    # Options for the PWC-Net flow network.
    nn_opts = deepcopy(_DEFAULT_PWCNET_VAL_OPTIONS)
    # Compare by value: `is` tests object identity, which is almost never
    # true for a flag string and made the small configuration unreachable.
    if FLAGS.flownet_type == 'small':
        nn_opts['use_dense_cx'] = False
        nn_opts['use_res_cx'] = False
        nn_opts['pyr_lvls'] = 6
        nn_opts['flow_pred_lvl'] = 2
        nn_opts['ckpt_path'] = '/work/cascades/lxiaol9/ARC/PWC/checkpoints/pwcnet-sm-6-2-multisteps-chairsthingsmix/pwcnet.ckpt-592000'
    else:
        nn_opts['use_dense_cx'] = True
        nn_opts['use_res_cx'] = True
        nn_opts['pyr_lvls'] = 6
        nn_opts['flow_pred_lvl'] = 2
        nn_opts['ckpt_path'] = '/work/cascades/lxiaol9/ARC/PWC/checkpoints/pwcnet-lg-6-2-multisteps-chairsthingsmix/pwcnet.ckpt-595000'

    nn_opts['verbose'] = True
    nn_opts['batch_size'] = 32  # batch size per GPU
    nn_opts['use_tf_data'] = False  # don't use the tf.data reader here
    nn_opts['gpu_devices'] = ['/device:GPU:0', '/device:GPU:1']
    nn_opts['controller'] = '/device:CPU:0'
    nn_opts['adapt_info'] = (1, 436, 1024, 2)
    nn_opts['x_shape'] = [2, 512, 512, 3]  # image pairs [2, H, W, 3]
    nn_opts['y_shape'] = [512, 512, 2]  # u, v flow output [H, W, 2]

    # Options for the EAST network.
    east_opts = {
        'verbose': True,
        'ckpt_path': FLAGS.pretrained_model_path,
        'batch_size': 40,
        'batch_size_per_gpu': 20,
        'gpu_devices': ['/device:GPU:0', '/device:GPU:1'],
        # Device that holds the model's variables.
        'controller': '/device:CPU:0',
        'x_dtype': tf.float32,  # input image dtype
        'x_shape': [512, 512, 3],  # input image shape [H, W, 3]
        'y_score_shape': [128, 128, 1],
        'y_geometry_shape': [128, 128, 5],
        'x_mask_shape': [128, 128, 1]
    }
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list

    # Start from an empty checkpoint directory unless we are restoring.
    if not tf.gfile.Exists(FLAGS.checkpoint_path):
        tf.gfile.MkDir(FLAGS.checkpoint_path)
    else:
        if not FLAGS.restore:
            tf.gfile.DeleteRecursively(FLAGS.checkpoint_path)
            tf.gfile.MkDir(FLAGS.checkpoint_path)

    # =================== II. Building the graph for east + agg ============== #
    # 1.1 Input placeholders
    batch_size = FLAGS.batch_size_per_gpu * FLAGS.num_gpus
    len_seq = FLAGS.num_steps
    input_feat_maps = tf.placeholder(tf.float32,
                                     shape=[batch_size, len_seq, 128, 128, 32],
                                     name='input_feature_maps')
    input_flow_maps = tf.placeholder(
        tf.float32,
        shape=[batch_size, len_seq - 1, 128, 128, 2],
        name='input_flow_maps')
    input_score_maps = tf.placeholder(tf.float32,
                                      shape=[batch_size, len_seq, 128, 128, 1],
                                      name='input_score_maps')
    # 5 geometry channels for 'RBOX', 8 for any other geometry setting.
    geo_channels = 5 if FLAGS.geometry == 'RBOX' else 8
    input_geo_maps = tf.placeholder(
        tf.float32,
        shape=[batch_size, len_seq, 128, 128, geo_channels],
        name='input_geo_maps')
    input_training_masks = tf.placeholder(
        tf.float32,
        shape=[batch_size, len_seq, 128, 128, 1],
        name='input_training_masks')

    # 1.2 Learning rate and optimizer
    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    learning_rate = tf.train.exponential_decay(FLAGS.learning_rate,
                                               global_step,
                                               decay_steps=10000,
                                               decay_rate=0.8,
                                               staircase=True)
    opt = tf.train.AdamOptimizer(learning_rate)

    # 1.3 Summaries
    tf.summary.scalar('learning_rate', learning_rate)

    # 1.4 One tower per GPU, each fed its slice of the batch
    input_feature_split = tf.split(input_feat_maps, FLAGS.num_gpus)
    input_score_maps_split = tf.split(input_score_maps, FLAGS.num_gpus)
    input_geo_maps_split = tf.split(input_geo_maps, FLAGS.num_gpus)
    input_training_masks_split = tf.split(input_training_masks, FLAGS.num_gpus)
    input_flow_maps_split = tf.split(input_flow_maps, FLAGS.num_gpus)
    tower_grads = []
    reuse_variables = None
    # NOTE(review): the tower count comes from FLAGS.gpu_list while the
    # splits above use FLAGS.num_gpus; the two presumably must agree.
    gpus = list(range(len(FLAGS.gpu_list.split(','))))
    for i, gpu_id in enumerate(gpus):
        with tf.device('/gpu:%d' % gpu_id):
            with tf.name_scope('model_%d' % gpu_id) as scope:
                iis = input_feature_split[i]
                ifms = input_flow_maps_split[i]
                isms = input_score_maps_split[i]
                igms = input_geo_maps_split[i]
                itms = input_training_masks_split[i]
                total_loss, model_loss = model_gru_agg.tower_loss(
                    iis,
                    ifms,
                    isms,
                    igms,
                    itms,
                    gpu_id=gpu_id,
                    config=config,
                    reuse_variables=reuse_variables)
                # Rebound on every iteration: after the loop these three
                # names refer to the last tower only.
                batch_norm_updates_op = tf.group(
                    *tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope))
                # Every tower after the first shares the variables.
                reuse_variables = True
                grads = opt.compute_gradients(total_loss)
                tower_grads.append(grads)

    # 1.5 Average the gradients across towers
    grads = average_gradients(tower_grads)
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    # 1.6 Training op: gradient step plus batch-norm updates
    summary_op = tf.summary.merge_all()
    with tf.control_dependencies([apply_gradient_op, batch_norm_updates_op]):
        train_op = tf.no_op(name='train_op')

    # 1.7 Saver, session and restore
    saver = tf.train.Saver(tf.global_variables())
    summary_writer = tf.summary.FileWriter(FLAGS.checkpoint_path,
                                           tf.get_default_graph())
    init = tf.global_variables_initializer()
    g = tf.get_default_graph()
    with g.as_default():
        config1 = tf.ConfigProto()
        config1.gpu_options.allow_growth = True
        config1.allow_soft_placement = True
        sess1 = tf.Session(config=config1)
        if FLAGS.restore:
            print('continue training from previous checkpoint')
            ckpt = FLAGS.prev_checkpoint_path
            saver.restore(sess1, ckpt)
        else:
            sess1.run(init)

    # ============ III. Other necessary components before training =========== #
    print("Step 1: AGG model has been reconstructed")
    GPUtil.showUtilization()
    # EAST model
    east_net = model_flow_east.EAST(mode='test', options=east_opts)
    print("Step 2: EAST model has been reconstructed")
    GPUtil.showUtilization()
    # PWC-Net model
    nn = ModelPWCNet(mode='test', options=nn_opts)
    print("Step 3: PWC model has been reconstructed")
    GPUtil.showUtilization()
    train_data_generator = icdar_light.get_batch_seq(
        num_workers=FLAGS.num_readers, config=config, is_training=True)
    start = time.time()

    # ========================= IV. Training over steps ====================== #
    print("Now we're starting training!!!")
    for step in range(FLAGS.max_steps):
        # Data: all-ones dummy arrays in debug mode, otherwise the next batch.
        if FLAGS.mode == "debug":
            data = []
            data.append(
                np.ones((config.batch_size, FLAGS.num_steps, 512, 512, 3),
                        dtype=np.float32))
            data.append(
                np.ones((batch_size, len_seq, 128, 128, 1), dtype=np.float32))
            data.append(
                np.ones((batch_size, len_seq, 128, 128, 5), dtype=np.float32))
            data.append(
                np.ones((batch_size, len_seq, 128, 128, 1), dtype=np.float32))
        else:
            data = next(train_data_generator)
        if step < 3:
            print("Data ready!!!")

        # Flatten the sequences into single frames for EAST and build
        # (source, target) pairs of consecutive frames for PWC-Net.
        # NOTE(review): the 0:4 / 1:5 slices assume 5 frames per sequence.
        east_feed = np.reshape(data[0], [-1, 512, 512, 3])
        target_frame = np.reshape(
            np.array(data[0])[:, 0:4, :, :, :], [-1, 512, 512, 3])
        source_frame = np.reshape(
            np.array(data[0])[:, 1:5, :, :, :], [-1, 512, 512, 3])
        flow_feed = np.concatenate((source_frame[:, np.newaxis, :, :, :],
                                    target_frame[:, np.newaxis, :, :, :]),
                                   axis=1)

        # Feature extraction with EAST, one mini-batch at a time.
        rounds = int(east_feed.shape[0] / east_opts['batch_size'])
        feature_stack = []
        flow_maps_stack = []
        for r in range(rounds):
            feature_stack.append(
                east_net.sess.run(
                    [east_net.y_hat_test_tnsr],
                    feed_dict={
                        east_net.x_tnsr:
                        east_feed[r * east_opts['batch_size']:(r + 1) *
                                  east_opts['batch_size'], :, :, :]
                    })[0][0])
        feature_maps = np.concatenate(feature_stack, axis=0)
        feature_maps_reshape = np.reshape(feature_maps,
                                          [-1, config.num_steps, 128, 128, 32])

        # Flow estimation with PWC-Net.
        x_adapt, x_adapt_info = nn.adapt_x(flow_feed)
        if x_adapt_info is not None:
            y_adapt_info = (x_adapt_info[0], x_adapt_info[2], x_adapt_info[3],
                            2)
        else:
            y_adapt_info = None
        mini_batch = nn_opts['batch_size'] * nn.num_gpus
        rounds = int(flow_feed.shape[0] / mini_batch)
        for r in range(rounds):
            feed_dict = {
                nn.x_tnsr:
                x_adapt[r * mini_batch:(r + 1) * mini_batch, :, :, :, :]
            }
            y_hat = nn.sess.run(nn.y_hat_test_tnsr, feed_dict=feed_dict)
            if FLAGS.mode == "debug":
                print(
                    "Step 5: now finish running one round of PWCnet for flow estimation"
                )
                GPUtil.showUtilization()
            y_hats, _ = nn.postproc_y_hat_test(y_hat, y_adapt_info)
            # Keep every 4th pixel and divide the flow values by 4 to match
            # the 128x128 maps fed to the training graph.
            flow_maps_stack.append(y_hats[:, 1::4, 1::4, :] / 4)
        flow_maps = np.concatenate(flow_maps_stack, axis=0)
        print("flow maps has shape ", flow_maps.shape[:])
        flow_maps = np.reshape(flow_maps,
                               [-1, FLAGS.num_steps - 1, 128, 128, 2])

        # Run the training session.
        with g.as_default():
            train_feed = {
                input_feat_maps: feature_maps_reshape,
                input_score_maps: data[1],
                input_geo_maps: data[2],
                input_training_masks: data[3],
                input_flow_maps: flow_maps
            }
            ml, tl, _ = sess1.run([model_loss, total_loss, train_op],
                                  feed_dict=train_feed)
            if FLAGS.mode == "debug":
                print("Step 6: running one round on training!!!")
                GPUtil.showUtilization()
            if np.isnan(tl):
                print('Loss diverged, stop training')
                break
            if step % 10 == 0:
                avg_time_per_step = (time.time() - start) / 10
                avg_examples_per_second = (10 * FLAGS.batch_size_per_gpu *
                                           len(gpus)) / (time.time() - start)
                start = time.time()
                print(
                    'Step {:06d}, model loss {:.4f}, total loss {:.4f}, {:.2f} seconds/step, {:.2f} examples/second'
                    .format(step, ml, tl, avg_time_per_step,
                            avg_examples_per_second))

            if step % FLAGS.save_checkpoint_steps == 0:
                saver.save(sess1,
                           FLAGS.checkpoint_path + 'model.ckpt',
                           global_step=global_step)

            if step % FLAGS.save_summary_steps == 0:
                # NOTE: this fetches train_op again, so a summary step
                # applies a second update on the same batch.
                _, tl, summary_str = sess1.run(
                    [train_op, total_loss, summary_op],
                    feed_dict=train_feed)
                summary_writer.add_summary(summary_str, global_step=step)
Exemplo n.º 39
0
def k_m_tf(defect_tensor,
           clusters,
           max_iters,
           summaries_dir,
           stage_str,
           name_str,
           go_to_max=False):
    """Cluster the rows of ``defect_tensor`` with k-means and log summaries.

    The graph is built in a fresh ``tf.InteractiveSession``. ``points`` is a
    variable that is overridden with ``defect_tensor`` through ``feed_dict``
    on every run; the centroids start as a random crop of the *initial*
    (random uniform) value of ``points``, not of the data.

    Args:
        defect_tensor: 2-D array indexed as ``[row, column]``; each row is
            one point to cluster.
        clusters: Number of centroids.
        max_iters: Upper bound on the number of update iterations.
        summaries_dir: Root directory for the summary writer.
        stage_str: Sub-directory below ``summaries_dir``.
        name_str: Run name; summaries go to
            ``<summaries_dir>/<stage_str>/kmeans/<name_str>``.
        go_to_max: When truthy, keep iterating until ``max_iters`` even if
            the assignments stop changing.

    Returns:
        ``(centers, assignments)``: the final values of the centroid and
        cluster-assignment variables.
    """
    length = len(defect_tensor[:, 0])
    num_clus = clusters
    MAX_ITERS = max_iters
    tiles = len(defect_tensor[0, :])
    start = time.time()

    sess = tf.InteractiveSession()
    with tf.name_scope('input'):
        points = tf.Variable(tf.random_uniform([length, tiles]),
                             dtype=tf.float32)
    with tf.name_scope('cluster_assigns'):
        cluster_assignments = tf.Variable(tf.zeros([length], dtype=tf.float32))

    with tf.name_scope('cents'):
        centroids = tf.Variable(tf.random_crop(points.initialized_value(),
                                               [num_clus, tiles]),
                                dtype=tf.float32)

    # Replicate to N copies of each centroid and K copies of each
    # point, then subtract and compute the sum of squared distances.
    with tf.name_scope('Replicate'):
        rep_centroids = tf.reshape(tf.tile(centroids, [length, 1]),
                                   [length, num_clus, tiles])
        rep_points = tf.reshape(tf.tile(points, [1, num_clus]),
                                [length, num_clus, tiles])

    with tf.name_scope('Sum_squares'):
        # Square the differences exactly once; squaring twice would rank
        # centroids by a fourth-power distance instead.
        sum_squares = tf.reduce_sum(tf.square(rep_points - rep_centroids),
                                    reduction_indices=2)
        squares_1d = tf.scalar_summary('sum_squares',
                                       tf.reduce_mean(sum_squares))

    # Use argmin to select the lowest-distance centroid for each point.
    with tf.name_scope('argmin'):
        best_centroids = tf.argmin(sum_squares, 1)
    did_assignments_change = tf.reduce_any(
        tf.not_equal(tf.cast(best_centroids, tf.float32), cluster_assignments))

    # Count how many points fall into each cluster (for the summaries).
    with tf.name_scope('counting'):
        found_1d = {}
        scalar_1d = {}
        # Shapes follow the number of input rows rather than a fixed size.
        assigned_column = tf.reshape(best_centroids, [length, 1])
        for i in range(num_clus):
            cluster_id = tf.constant(i, shape=[length, 1], dtype=tf.int64)
            is_member = tf.equal(assigned_column, cluster_id)
            member_count = tf.reduce_sum(tf.cast(is_member, tf.int32))
            found_1d[i] = tf.expand_dims(member_count, -1)
            scalar_1d[i] = tf.scalar_summary(str(i), tf.squeeze(found_1d[i]))

        found_tensor = tf.concat(0, [found_1d[i] for i in range(num_clus)])
        distro = tf.histogram_summary('Distribution', found_tensor)

    # Calculate the means at the indices of best_centroids.
    with tf.name_scope('means'):
        total = tf.unsorted_segment_sum(points, best_centroids, num_clus)
        count = tf.unsorted_segment_sum(tf.ones_like(points), best_centroids,
                                        num_clus)
        means = total / count
        # An empty cluster yields 0/0 = NaN; place its centroid at zero.
        means = tf.select(tf.is_nan(means), tf.zeros_like(means), means)
        means_1d = tf.scalar_summary('means', tf.reduce_mean(means))

    # Do not write to the assigned clusters variable until after
    # computing whether the assignments have changed.
    with tf.name_scope('Do_updates'):
        with tf.control_dependencies([did_assignments_change]):
            do_updates = tf.group(
                centroids.assign(means),
                cluster_assignments.assign(tf.cast(best_centroids,
                                                   tf.float32)))

    changed = True
    iters = 0

    # Merge summaries
    scalar_summary = tf.merge_summary(
        [scalar_1d[i] for i in range(num_clus)])
    other_summary = tf.merge_summary([means_1d, squares_1d])
    histogram_summary = tf.merge_summary([distro])

    writer = tf.train.SummaryWriter(
        summaries_dir + '/' + stage_str + '/kmeans/' + name_str, sess.graph)
    init = tf.initialize_all_variables()

    sess.run(init)
    # Re-assign points and update the means until the assignments stop
    # changing or the iteration limit is reached.
    while changed and iters < MAX_ITERS:
        iters += 1
        # NOTE: the metadata object is not passed to sess.run, so the
        # record written below carries no trace information.
        run_metadata = tf.RunMetadata()
        [changed, _, histogram_sum_run, scalar_sum_run,
         other_sum_run] = sess.run(
             [
                 did_assignments_change, do_updates, histogram_summary,
                 scalar_summary, other_summary
             ],
             feed_dict={points: defect_tensor})
        writer.add_run_metadata(run_metadata, 'step%03d' % iters)
        writer.add_summary(histogram_sum_run, iters)
        writer.add_summary(scalar_sum_run, iters)
        writer.add_summary(other_sum_run, iters)

        if go_to_max:
            changed = True
    writer.close()
    [centers, assignments] = sess.run([centroids, cluster_assignments])

    end = time.time()

    print("Found in %.2f seconds" % (end - start), iters, "iterations")
    print('Distribution:',
          sess.run(found_tensor, feed_dict={points: defect_tensor}))

    # Close the session before resetting the graph it is bound to.
    sess.close()
    tf.reset_default_graph()
    return centers, assignments
Exemplo n.º 40
0
def train(submit_config: dnnlib.SubmitConfig, iteration_count: int,
          eval_interval: int, minibatch_size: int, learning_rate: float,
          ramp_down_perc: float, noise: dict, validation_config: dict,
          train_tfrecords: str, noise2noise: bool):
    """Build the multi-GPU denoising graph and run the training loop.

    Every `eval_interval` iterations the loop saves example images, evaluates
    the validation set, writes summaries and stores a network snapshot.

    NOTE(review): `config` is read from an enclosing (module) scope that is
    not visible in this function.

    Args:
        submit_config: run configuration; `num_gpus`, `run_dir` and `run_id`
            are read from it.
        iteration_count: number of training iterations to run.
        eval_interval: number of iterations between status dumps.
        minibatch_size: forwarded to `create_dataset`.
        learning_rate: base learning rate handed to
            `compute_ramped_down_lrate`.
        ramp_down_perc: forwarded to `compute_ramped_down_lrate`.
        noise: keyword arguments for `dnnlib.util.call_func_by_name`; the
            result is used as the noise augmenter.
        validation_config: keyword arguments for `ValidationSet.load`.
        train_tfrecords: forwarded to `create_dataset`.
        noise2noise: if true, the loss is the difference of two
            `fourier_ring_correlation` terms built from the noisy pair;
            otherwise it is the mean squared error against the clean target.
    """
    noise_augmenter = dnnlib.util.call_func_by_name(**noise)
    validation_set = ValidationSet(submit_config)
    validation_set.load(**validation_config)

    # Create a run context (hides low level details, exposes simple API to manage the run)
    ctx = dnnlib.RunContext(submit_config, config)

    # Initialize TensorFlow graph and session using good default settings
    tfutil.init_tf(config.tf_config)

    dataset_iter = create_dataset(train_tfrecords, minibatch_size,
                                  noise_augmenter.add_train_noise_tf)
    # Construct the network using the Network helper class and a function defined in config.net_config
    with tf.device("/gpu:0"):
        net = tflib.Network(**config.net_config)

    # Optionally print layer information
    net.print_layers()

    print('Building TensorFlow graph...')
    with tf.name_scope('Inputs'), tf.device("/cpu:0"):
        lrate_in = tf.compat.v1.placeholder(tf.float32,
                                            name='lrate_in',
                                            shape=[])

        # Split each batch tensor along axis 0 into one slice per GPU.
        noisy_input, noisy_target, clean_target = dataset_iter.get_next()
        noisy_input_split = tf.split(noisy_input, submit_config.num_gpus)
        noisy_target_split = tf.split(noisy_target, submit_config.num_gpus)
        print(len(noisy_input_split), noisy_input_split)
        clean_target_split = tf.split(clean_target, submit_config.num_gpus)

    # Define the loss function using the Optimizer helper class, this will take care of multi GPU
    opt = tflib.Optimizer(learning_rate=lrate_in, **config.optimizer_config)

    # One radial mask per radius 0..127.
    radii = np.arange(128).reshape(128, 1)  #image size 256, binning = 3
    radial_masks = np.apply_along_axis(radial_mask, 1, radii, 128, 128,
                                       np.arange(0, 256), np.arange(0, 256),
                                       20)
    print("RN SHAPE!!!!!!!!!!:", radial_masks.shape)
    radial_masks = np.expand_dims(radial_masks, 1)  # (128, 1, 256, 256)
    radial_masks = radial_masks.astype(np.complex64)
    radial_masks = tf.expand_dims(radial_masks, 1)

    # The masks are split along axis 1 so that every GPU gets its own slice.
    rn = tf.compat.v1.placeholder_with_default(radial_masks,
                                               [128, None, 1, 256, 256])
    rn_split = tf.split(rn, submit_config.num_gpus, axis=1)
    freq_nyq = int(np.floor(int(256) / 2.0))

    # Radii scaled by freq_nyq, then by their maximum (largest value is 1).
    spatial_freq = radii.astype(np.float32) / freq_nyq
    spatial_freq = spatial_freq / max(spatial_freq)

    for gpu in range(submit_config.num_gpus):
        with tf.device("/gpu:%d" % gpu):
            net_gpu = net if gpu == 0 else net.clone()

            denoised_1 = net_gpu.get_output_for(noisy_input_split[gpu])
            denoised_2 = net_gpu.get_output_for(noisy_target_split[gpu])
            print(noisy_input_split[gpu].get_shape(),
                  rn_split[gpu].get_shape())
            if noise2noise:
                meansq_error = fourier_ring_correlation(
                    noisy_target_split[gpu], denoised_1, rn_split[gpu],
                    spatial_freq) - fourier_ring_correlation(
                        noisy_target_split[gpu] - denoised_2,
                        noisy_input_split[gpu] - denoised_1, rn_split[gpu],
                        spatial_freq)
            else:
                # Supervised loss: compare the network output for the noisy
                # input (`denoised_1`) with the clean target. The name
                # `denoised` is only bound later, in the training loop, so it
                # must not be used here.
                meansq_error = tf.reduce_mean(
                    tf.square(clean_target_split[gpu] - denoised_1))
            # Create an autosummary that will average over all GPUs
            with tf.control_dependencies([autosummary("Loss", meansq_error)]):
                opt.register_gradients(meansq_error, net_gpu.trainables)

    train_step = opt.apply_updates()

    # Create a log file for Tensorboard
    summary_log = tf.compat.v1.summary.FileWriter(submit_config.run_dir)
    summary_log.add_graph(tf.compat.v1.get_default_graph())

    print('Training...')
    time_maintenance = ctx.get_time_since_last_update()
    ctx.update(loss='run %d' % submit_config.run_id,
               cur_epoch=0,
               max_epoch=iteration_count)

    # The actual training loop
    for i in range(iteration_count):
        # Whether to stop the training or not should be asked from the context
        if ctx.should_stop():
            break
        # Dump training status
        if i % eval_interval == 0:

            time_train = ctx.get_time_since_last_update()
            time_total = ctx.get_time_since_start()
            print("DEBUG TRAIN!", noisy_input.dtype, noisy_input[0][0].dtype)
            # Draw one batch and save input / target / prediction examples.
            [source_mb, target_mb] = tfutil.run([noisy_input, clean_target])
            denoised = net.run(source_mb)
            save_image(submit_config, denoised[0],
                       "img_{0}_y_pred.tif".format(i))
            save_image(submit_config, target_mb[0], "img_{0}_y.tif".format(i))
            save_image(submit_config, source_mb[0],
                       "img_{0}_x_aug.tif".format(i))

            validation_set.evaluate(net, i,
                                    noise_augmenter.add_validation_noise_np)

            print(
                'iter %-10d time %-12s sec/eval %-7.1f sec/iter %-7.2f maintenance %-6.1f'
                % (autosummary('Timing/iter', i),
                   dnnlib.util.format_time(
                       autosummary('Timing/total_sec', time_total)),
                   autosummary('Timing/sec_per_eval', time_train),
                   autosummary('Timing/sec_per_iter',
                               time_train / eval_interval),
                   autosummary('Timing/maintenance_sec', time_maintenance)))

            dnnlib.tflib.autosummary.save_summaries(summary_log, i)
            ctx.update(loss='run %d' % submit_config.run_id,
                       cur_epoch=i,
                       max_epoch=iteration_count)
            time_maintenance = ctx.get_last_update_interval() - time_train

            save_snapshot(submit_config, net, str(i))
        lrate = compute_ramped_down_lrate(i, iteration_count, ramp_down_perc,
                                          learning_rate)
        tfutil.run([train_step], {lrate_in: lrate})

    print("Elapsed time: {0}".format(
        util.format_time(ctx.get_time_since_start())))
    save_snapshot(submit_config, net, 'final')

    # Summary log and context should be closed at the end
    summary_log.close()
    ctx.close()
Exemplo n.º 41
0
# Mode flags (presumably values for the `model_mode` input -- TODO confirm).
train_state = True
val_state = False

# Build the network on the module-level input tensor.
network = build_nukev8(
    inputs=net_input,
    num_classes=num_classes,
    is_training=model_mode,
    dropout_p=dp,
)

# Objective: mean softmax cross-entropy plus the summed regularization losses.
reg_loss = tf.add_n(tf.losses.get_regularization_losses())
net_output = tf.cast(net_output, tf.float64)
loss = reg_loss.__radd__(
    tf.reduce_mean(
        tf.losses.softmax_cross_entropy(
            logits=network,
            onehot_labels=net_output,
        )
    )
) if False else tf.reduce_mean(
    tf.losses.softmax_cross_entropy(
        logits=network,
        onehot_labels=net_output,
    )
) + reg_loss

# Ops from the UPDATE_OPS collection must run before each optimizer step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
print('finish update ops')
with tf.control_dependencies(update_ops):
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=0.995)
    optimizer = optimizer.minimize(
        loss,
        var_list=list(tf.trainable_variables()),
    )

saver = tf.train.Saver(max_to_keep=1000)
sess.run(tf.global_variables_initializer())

# Any checkpoint restore has to happen after the initializer above has run,
# otherwise the restored values would be overwritten.
model_checkpoint_name = "./latest_model.ckpt"
if args.continue_training:
    print('Loaded latest model checkpoint')
    saver.restore(sess, model_checkpoint_name)
def _random_crop(image_list, crop_height, crop_width):
  """Crops the given list of images.

  The function applies the same crop to each image in the list. This can be
  effectively applied when there are multiple image inputs of the same
  dimension such as:

    image, depths, normals = _random_crop([image, depths, normals], 120, 150)

  Args:
    image_list: a list of image tensors of the same dimension but possibly
      varying channel.
    crop_height: the new height.
    crop_width: the new width.

  Returns:
    the image_list with cropped images.

  Raises:
    ValueError: if there are multiple image inputs provided with different size
      or the images are smaller than the crop dimensions.
  """
  if not image_list:
    raise ValueError('Empty image_list.')

  # Compute the rank assertions.
  rank_assertions = []
  for i in range(len(image_list)):
    image_rank = tf.rank(image_list[i])
    rank_assert = tf.Assert(
        tf.equal(image_rank, 3),
        ['Wrong rank for tensor  %s [expected] [actual]',
         image_list[i].name, 3, image_rank])
    rank_assertions.append(rank_assert)

  with tf.control_dependencies([rank_assertions[0]]):
    image_shape = tf.shape(image_list[0])
  image_height = image_shape[0]
  image_width = image_shape[1]
  crop_size_assert = tf.Assert(
      tf.logical_and(
          tf.greater_equal(image_height, crop_height),
          tf.greater_equal(image_width, crop_width)),
      ['Crop size greater than the image size.'])

  asserts = [rank_assertions[0], crop_size_assert]

  for i in range(1, len(image_list)):
    image = image_list[i]
    asserts.append(rank_assertions[i])
    with tf.control_dependencies([rank_assertions[i]]):
      shape = tf.shape(image)
    height = shape[0]
    width = shape[1]

    height_assert = tf.Assert(
        tf.equal(height, image_height),
        ['Wrong height for tensor %s [expected][actual]',
         image.name, height, image_height])
    width_assert = tf.Assert(
        tf.equal(width, image_width),
        ['Wrong width for tensor %s [expected][actual]',
         image.name, width, image_width])
    asserts.extend([height_assert, width_assert])

  # Create a random bounding box.
  #
  # Use tf.random_uniform and not numpy.random.rand as doing the former would
  # generate random numbers at graph eval time, unlike the latter which
  # generates random numbers at graph definition time.
  with tf.control_dependencies(asserts):
    max_offset_height = tf.reshape(image_height - crop_height + 1, [])
  with tf.control_dependencies(asserts):
    max_offset_width = tf.reshape(image_width - crop_width + 1, [])
  offset_height = tf.random_uniform(
      [], maxval=max_offset_height, dtype=tf.int32)
  offset_width = tf.random_uniform(
      [], maxval=max_offset_width, dtype=tf.int32)

  return [_crop(image, offset_height, offset_width,
                crop_height, crop_width) for image in image_list]
Exemplo n.º 43
0
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Optionally adds token-type embeddings and learned position embeddings to
    the input, then applies `layer_norm_and_dropout`.

    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length,
            embedding_size].
        use_token_type: bool. Whether to add embeddings for `token_type_ids`.
        token_type_ids: (optional) int32 Tensor of shape [batch_size,
            seq_length]. Must be specified if `use_token_type` is True.
        token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
        token_type_embedding_name: string. The name of the embedding table
            variable for token type ids.
        use_position_embeddings: bool. Whether to add position embeddings for
            the position of each token in the sequence.
        position_embedding_name: string. The name of the embedding table
            variable for positional embeddings.
        initializer_range: float. Range of the weight initialization.
        max_position_embeddings: int. Maximum sequence length that might ever
            be used with this model. This can be longer than the sequence
            length of input_tensor, but cannot be shorter.
        dropout_prob: float. Dropout probability applied to the final output
            tensor.

    Returns:
        float tensor with same shape as `input_tensor`.

    Raises:
        ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            # The trailing space matters: adjacent string literals are joined
            # without any separator.
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is
        # always faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # The position table is a learned variable created for the
            # longest supported sequence (`max_position_embeddings`). The
            # current sequence only uses positions [0, seq_length - 1], so a
            # slice of the leading rows is all that is needed.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and
            # `width`), so we broadcast among the first dimensions, which is
            # typically just the batch size.
            position_broadcast_shape = (
                [1] * (num_dims - 2) + [seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
Exemplo n.º 44
0
def train(model,
          data,
          batch_size=128,
          learning_rate=FLAGS.learning_rate,
          log_dir='./log',
          checkpoint_dir='./checkpoint',
          num_epochs=-1):
    """Build the training graph for `model` and run the epoch loop.

    After every epoch the averaged metrics are fetched, a checkpoint is
    written, `evaluate` is run, and the results are added to the summary log.

    Args:
        model: callable invoked as `model(x, is_training=True)`; its result is
            used as the logits.
        data: dataset object. When `FLAGS.dataset == "imagenet"` it is passed
            to `image_processing.distorted_inputs`, otherwise its
            `generate_batches` method is called. `data.size[0]` is used as the
            number of examples consumed per epoch.
        batch_size: number of examples per training step.
        learning_rate: learning rate passed to `optimize_loss`.
        log_dir: directory for the summary `FileWriter`.
        checkpoint_dir: directory checkpoints are saved to and, when
            `FLAGS.resume` is set, restored from.
        num_epochs: the loop stops once the epoch counter equals this value.
            The counter never goes below zero, so the default of -1 trains
            until the process is interrupted.

    Returns:
        None. Returns early (without training) when `FLAGS.resume` is set but
        no checkpoint is found.
    """
    # Input pipeline and global step are placed on the CPU.
    with tf.device('/cpu:0'):
        with tf.name_scope('data'):
            if FLAGS.dataset == "imagenet":
                x, yt = image_processing.distorted_inputs(
                    data,
                    batch_size=batch_size,
                    num_preprocess_threads=FLAGS.num_threads)
            else:
                x, yt = data.generate_batches(batch_size,
                                              num_threads=FLAGS.num_threads)
        global_step = tf.get_variable('global_step',
                                      shape=[],
                                      dtype=tf.int64,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
    if FLAGS.gpu:
        device_str = '/gpu:' + str(FLAGS.device)
    else:
        device_str = '/cpu:0'
    with tf.device(device_str):
        y = model(x, is_training=True)
        # Define loss and optimizer
        with tf.name_scope('objective'):
            loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(labels=yt,
                                                               logits=y))
        # Rewrite the graph for quantization-aware training before the
        # optimizer ops are created.
        tf.contrib.quantize.create_training_graph()
        with tf.name_scope('objective'):
            accuracy = tf.reduce_mean(
                tf.cast(tf.nn.in_top_k(y, yt, 1), tf.float32))

        opt = tf.contrib.layers.optimize_loss(
            loss,
            global_step,
            learning_rate,
            'Adam',
            gradient_noise_scale=None,
            gradient_multipliers=None,
            clip_gradients=None,
            learning_rate_decay_fn=learning_rate_decay_fn
            if FLAGS.using_learning_rate_decay_fn else None,
            update_ops=None,
            variables=None,
            name=None)

    # Exponential moving averages of loss, accuracy and trainable variables.
    ema = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
                                            global_step,
                                            name='average')
    ema_op = ema.apply([loss, accuracy] + tf.trainable_variables())
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, ema_op)

    loss_avg = ema.average(loss)
    tf.summary.scalar('loss/training', loss_avg)
    accuracy_avg = ema.average(accuracy)
    tf.summary.scalar('accuracy/training', accuracy_avg)

    # Fail the step if the loss becomes NaN/Inf.
    check_loss = tf.check_numerics(loss, 'model diverged: loss->nan')
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, check_loss)
    updates_collection = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    # The grouped update ops are created with a control dependency on the
    # optimizer step.
    with tf.control_dependencies([opt]):
        train_op = tf.group(*updates_collection)

    if FLAGS.summary:
        add_summaries(scalar_list=[accuracy, accuracy_avg, loss, loss_avg],
                      activation_list=tf.get_collection(
                          tf.GraphKeys.ACTIVATIONS),
                      var_list=tf.trainable_variables())

    summary_op = tf.summary.merge_all()
    # Configure options for session
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=False,
        allow_soft_placement=True,
        gpu_options=gpu_options,
    ))
    if FLAGS.resume:
        logging.info('resuming from ' + checkpoint_dir)
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir + '/')
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found')
            return
    else:
        saver = tf.train.Saver(max_to_keep=5)
        sess.run(tf.global_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    # Floor division keeps the batch and epoch counters integral on Python 3.
    # A float epoch restored from a checkpoint could never compare equal to
    # `num_epochs`.
    num_batches = data.size[0] // batch_size
    summary_writer = tf.summary.FileWriter(log_dir, graph=sess.graph)
    if FLAGS.resume:
        epoch = int(global_step.eval()) // max(num_batches, 1)
    else:
        epoch = 0
    logging.info('num of trainable paramaters: %d' %
                 count_params(tf.trainable_variables()))
    while epoch != num_epochs:

        curr_step = 0

        logging.info('Started epoch %d' % epoch)
        bar = Bar('Training',
                  max=num_batches,
                  suffix='%(percent)d%% eta: %(eta)ds')
        while curr_step < data.size[0]:
            _, loss_val, step = sess.run([train_op, loss, global_step])
            # Advance by the batch size the input pipeline was built with,
            # not by the (possibly different) command-line flag.
            curr_step += batch_size
            bar.next()

        bar.finish()
        step, acc_value, loss_value, summary = sess.run(
            [global_step, accuracy_avg, loss_avg, summary_op])
        saver.save(sess,
                   save_path=checkpoint_dir + '/model.ckpt',
                   global_step=global_step)
        test_top1, test_top5, test_loss = evaluate(
            model,
            FLAGS.dataset,
            batch_size=batch_size,
            checkpoint_dir=checkpoint_dir)
        logging.info('Test loss %.3f Test top1 %.3f Test top5 %.3f' %
                     (test_loss, test_top1, test_top5))
        # Append the test metrics to the training summary before writing it.
        summary_out = tf.Summary()
        summary_out.ParseFromString(summary)
        summary_out.value.add(tag='accuracy/test_top1', simple_value=test_top1)
        summary_out.value.add(tag='accuracy/test_top5', simple_value=test_top5)
        summary_out.value.add(tag='loss/test', simple_value=test_loss)
        summary_writer.add_summary(summary_out, step)
        summary_writer.flush()
        logging.info("Finished epoch %d " % epoch)
        epoch += 1

    # When done, ask the threads to stop.
    coord.request_stop()
    coord.join(threads)
    coord.clear_stop()
    summary_writer.close()
    def run(
        self,
        *in_arrays,
        return_as_list=False,
        print_progress=True,
        minibatch_size=None,
        num_gpus=1,
        out_mul=1.0,
        out_add=0.0,
        out_shrink=1,
        out_dtype=None,
        **dynamic_kwargs
    ):
        """Evaluate the network on NumPy inputs and return NumPy outputs.

        The output expressions are built once per distinct combination of
        `dynamic_kwargs`, `num_gpus` and the `out_*` options and cached in
        `self._run_cache`; the inputs are then fed through the default session
        one minibatch at a time.

        Args:
            *in_arrays: input arrays, fed to `self.input_templates` in order.
                The first axis of the first array gives the number of items.
            return_as_list: True = return a list of NumPy arrays, False =
                return a single NumPy array, or a tuple if there are multiple
                outputs.
            print_progress: print progress to the console.
            minibatch_size: maximum minibatch size, None = process all items
                in one batch.
            num_gpus: number of GPUs to split each batch across.
            out_mul: multiplicative constant applied to the output(s).
            out_add: additive constant applied to the output(s).
            out_shrink: average-pool the outputs by this factor (applied with
                NCHW layout).
            out_dtype: convert the outputs to this data type; integer types
                are rounded first.
            **dynamic_kwargs: extra keyword arguments forwarded to
                `get_output_for`.

        Returns:
            The output arrays, packaged according to `return_as_list`.
        """
        item_count = in_arrays[0].shape[0]
        if minibatch_size is None:
            minibatch_size = item_count
        cache_key = str([
            sorted(dynamic_kwargs.items()), num_gpus, out_mul, out_add,
            out_shrink, out_dtype
        ])

        # Build the output expressions on first use of this configuration.
        if cache_key not in self._run_cache:
            with absolute_name_scope(self.scope + '/Run'), \
                    tf.control_dependencies(None):
                per_input_splits = [
                    tf.split(template, num_gpus)
                    for template in self.input_templates
                ]
                per_gpu_inputs = list(zip(*per_input_splits))
                per_gpu_outputs = []
                for gpu_index in range(num_gpus):
                    with tf.device('/gpu:%d' % gpu_index):
                        outputs = self.get_output_for(
                            *per_gpu_inputs[gpu_index],
                            return_as_list=True,
                            **dynamic_kwargs)
                        if out_mul != 1.0:
                            outputs = [t * out_mul for t in outputs]
                        if out_add != 0.0:
                            outputs = [t + out_add for t in outputs]
                        if out_shrink > 1:
                            window = [1, 1, out_shrink, out_shrink]
                            outputs = [
                                tf.nn.avg_pool(t,
                                               ksize=window,
                                               strides=window,
                                               padding='VALID',
                                               data_format='NCHW')
                                for t in outputs
                            ]
                        if out_dtype is not None:
                            if tf.as_dtype(out_dtype).is_integer:
                                outputs = [tf.round(t) for t in outputs]
                            outputs = [
                                tf.saturate_cast(t, out_dtype)
                                for t in outputs
                            ]
                        per_gpu_outputs.append(outputs)
                # Stitch the per-GPU pieces of each output back together.
                self._run_cache[cache_key] = [
                    tf.concat(pieces, axis=0)
                    for pieces in zip(*per_gpu_outputs)
                ]

        # Run the minibatches, writing each result into preallocated arrays.
        out_expr = self._run_cache[cache_key]
        results = []
        for expr in out_expr:
            result_shape = [item_count] + shape_to_list(expr.shape)[1:]
            results.append(np.empty(result_shape, expr.dtype.name))

        for start in range(0, item_count, minibatch_size):
            if print_progress:
                print('\r%d / %d' % (start, item_count), end='')
            stop = min(start + minibatch_size, item_count)
            batch_inputs = [array[start:stop] for array in in_arrays]
            feed = dict(zip(self.input_templates, batch_inputs))
            batch_outputs = tf.get_default_session().run(out_expr, feed)
            for target, values in zip(results, batch_outputs):
                target[start:stop] = values

        # Done.
        if print_progress:
            print('\r%d / %d' % (item_count, item_count))
        if return_as_list:
            return results
        if len(results) == 1:
            return results[0]
        return tuple(results)
def train(train_record_file,
          train_log_step,
          train_param,
          val_record_file,
          val_log_step,
          labels_nums,
          data_shape,
          snapshot,
          snapshot_prefix):
    '''
    Build the Inception-v3 training graph and start the training loop.

    NOTE(review): `input_images`, `input_labels`, `keep_prob` and
    `is_training` are not defined in this function; they are expected to
    exist at module scope.

    :param train_record_file: tfrecord file with the training data
    :param train_log_step: interval at which training log info is shown
    :param train_param: training parameters, unpacked as [base_lr, max_steps]
    :param val_record_file: tfrecord file with the validation data
    :param val_log_step: interval at which validation log info is shown
    :param labels_nums: number of labels
    :param data_shape: input data shape, unpacked as
        [batch_size, resize_height, resize_width, depths]
    :param snapshot: interval at which the model is saved
    :param snapshot_prefix: file name prefix for saved models
    :return: None
    '''
    base_lr, max_steps = train_param
    batch_size, resize_height, resize_width, depths = data_shape

    # Number of training and validation examples.
    train_nums = get_example_nums(train_record_file)
    val_nums = get_example_nums(val_record_file)
    print('train nums:%d,val nums:%d' % (train_nums, val_nums))

    # Read images and labels from the record files.
    # Training batches are shuffled (shuffle=True).
    train_images, train_labels = read_records(
        train_record_file, resize_height, resize_width, type='normalization')
    train_images_batch, train_labels_batch = get_batch_images(
        train_images,
        train_labels,
        batch_size=batch_size,
        labels_nums=labels_nums,
        one_hot=True,
        shuffle=True)
    # Validation batches keep their order (shuffle=False).
    val_images, val_labels = read_records(
        val_record_file, resize_height, resize_width, type='normalization')
    val_images_batch, val_labels_batch = get_batch_images(
        val_images,
        val_labels,
        batch_size=batch_size,
        labels_nums=labels_nums,
        one_hot=True,
        shuffle=False)

    # Define the model.
    with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
        out, end_points = inception_v3.inception_v3(
            inputs=input_images,
            num_classes=labels_nums,
            dropout_keep_prob=keep_prob,
            is_training=is_training)

    # Losses created through tf.losses are collected automatically, so the
    # cross-entropy term needs no explicit add_loss() call.
    tf.losses.softmax_cross_entropy(onehot_labels=input_labels, logits=out)
    # Total loss = collected losses plus regularization losses.
    loss = tf.losses.get_total_loss(add_regularization_losses=True)

    # Specify the optimization scheme.
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=base_lr)

    # batch_norm layers keep running statistics that are refreshed by ops in
    # the UPDATE_OPS collection, outside the ordinary gradient step. Creating
    # the train op under a control dependency on them makes those updates run
    # before training.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = slim.learning.create_train_op(total_loss=loss,
                                                 optimizer=optimizer)

    # Fraction of examples whose arg-max prediction matches the one-hot label.
    predicted_class = tf.argmax(out, 1)
    true_class = tf.argmax(input_labels, 1)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(predicted_class, true_class), tf.float32))

    # Run the iterative training process.
    step_train(train_op, loss, accuracy,
               train_images_batch, train_labels_batch, train_nums,
               train_log_step,
               val_images_batch, val_labels_batch, val_nums, val_log_step,
               snapshot_prefix, snapshot)
Exemplo n.º 47
0
    def _build_ops(self, lm_graph):
        """Create the ops that expose the LM layers without BOS/EOS positions.

        All ops are created under a control dependency on
        `lm_graph.update_state_op`.

        Args:
            lm_graph: language-model graph object; `update_state_op`,
                `embedding`, `lstm_outputs`, `sequence_lengths` and `mask`
                are read from it.

        Returns:
            A dict with the keys 'lm_embeddings' (all layers combined along a
            new axis 1), 'lengths' (sequence lengths minus 2),
            'token_embeddings' and 'mask' (boolean, BOS/EOS removed).
        """
        with tf.control_dependencies([lm_graph.update_state_op]):
            # Layer 0 is the token embedding concatenated with itself;
            # presumably so that its width matches the forward+backward LSTM
            # layers -- TODO confirm.
            token_embeddings = lm_graph.embedding
            layers = [tf.concat([token_embeddings, token_embeddings], axis=2)]

            lstm_outputs = lm_graph.lstm_outputs
            layers.extend(
                tf.concat([
                    lstm_outputs['forward'][depth],
                    lstm_outputs['backward'][depth]
                ],
                          axis=-1)
                for depth in range(len(lstm_outputs['forward'])))

            # The layers include the BOS/EOS tokens. Remove them: drop the
            # first step, reverse each sequence, drop the first step again
            # (the former last one), then reverse back.
            sequence_length_wo_bos_eos = lm_graph.sequence_lengths - 2
            layers_without_bos_eos = []
            for layer in layers:
                trimmed = tf.reverse_sequence(
                    layer[:, 1:, :],
                    lm_graph.sequence_lengths - 1,
                    seq_axis=1,
                    batch_axis=0,
                )
                trimmed = tf.reverse_sequence(
                    trimmed[:, 1:, :],
                    sequence_length_wo_bos_eos,
                    seq_axis=1,
                    batch_axis=0,
                )
                layers_without_bos_eos.append(trimmed)

            # Combine the layers along a new axis 1.
            expanded_layers = [
                tf.expand_dims(trimmed_layer, axis=1)
                for trimmed_layer in layers_without_bos_eos
            ]
            lm_embeddings = tf.concat(expanded_layers, axis=1)

            # Apply the same trimming to the mask. Boolean tensors cannot be
            # reversed, so go through int32 and cast back afterwards.
            int_mask = tf.cast(lm_graph.mask[:, 1:], 'int32')
            int_mask = tf.reverse_sequence(
                int_mask,
                lm_graph.sequence_lengths - 1,
                seq_axis=1,
                batch_axis=0,
            )
            int_mask = tf.reverse_sequence(
                int_mask[:, 1:],
                sequence_length_wo_bos_eos,
                seq_axis=1,
                batch_axis=0,
            )
            mask_wo_bos_eos = tf.cast(int_mask, 'bool')

        result = {}
        result['lm_embeddings'] = lm_embeddings
        result['lengths'] = sequence_length_wo_bos_eos
        result['token_embeddings'] = lm_graph.embedding
        result['mask'] = mask_wo_bos_eos
        return result
Exemplo n.º 48
0
def train_cnn():
    """Train the TextCNN model, evaluating and checkpointing periodically.

    Loads and pads the training and validation data named by ``FLAGS``,
    builds a ``TextCNN`` graph trained with Adam on globally-clipped
    gradients, and then either restores the latest checkpoint of an earlier
    run (``FLAGS.train_or_restore == 'R'``, run id read from stdin) or
    initializes fresh variables. While iterating over the training batches it

    * evaluates on the validation set every ``FLAGS.evaluate_every`` steps and
      hands the AUPRC to the best-checkpoint saver,
    * saves a regular checkpoint every ``FLAGS.checkpoint_every`` steps.

    The session is used as a context manager and both summary writers are
    closed once the training loop ends (normally or through an exception), so
    buffered summary events are flushed to disk and the session's resources
    are released.
    """

    # Load sentences, labels, and training parameters
    logger.info("✔︎ Loading data...")

    logger.info("✔︎ Training data processing...")
    train_data = dh.load_data_and_labels(FLAGS.training_data_file, FLAGS.num_classes,
                                         FLAGS.embedding_dim, data_aug_flag=False)

    logger.info("✔︎ Validation data processing...")
    val_data = dh.load_data_and_labels(FLAGS.validation_data_file, FLAGS.num_classes,
                                       FLAGS.embedding_dim, data_aug_flag=False)

    logger.info("Recommended padding Sequence length is: {0}".format(FLAGS.pad_seq_len))

    logger.info("✔︎ Training data padding...")
    x_train, y_train = dh.pad_data(train_data, FLAGS.pad_seq_len)

    logger.info("✔︎ Validation data padding...")
    x_val, y_val = dh.pad_data(val_data, FLAGS.pad_seq_len)

    # Build vocabulary
    vocab_size, pretrained_word2vec_matrix = dh.load_word2vec_matrix(FLAGS.embedding_dim)

    # Build a graph and cnn object
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = FLAGS.gpu_options_allow_growth
        # Entering the session installs it as the default session and closes
        # it on exit; `sess.as_default()` alone would leave it open.
        with tf.Session(config=session_conf) as sess:
            cnn = TextCNN(
                sequence_length=FLAGS.pad_seq_len,
                num_classes=FLAGS.num_classes,
                vocab_size=vocab_size,
                fc_hidden_size=FLAGS.fc_hidden_size,
                embedding_size=FLAGS.embedding_dim,
                embedding_type=FLAGS.embedding_type,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(','))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                pretrained_embedding=pretrained_word2vec_matrix)

            # Define training procedure
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                learning_rate = tf.train.exponential_decay(learning_rate=FLAGS.learning_rate,
                                                           global_step=cnn.global_step, decay_steps=FLAGS.decay_steps,
                                                           decay_rate=FLAGS.decay_rate, staircase=True)
                optimizer = tf.train.AdamOptimizer(learning_rate)
                # `variables` rather than `vars`, which would shadow the builtin.
                grads, variables = zip(*optimizer.compute_gradients(cnn.loss))
                grads, _ = tf.clip_by_global_norm(grads, clip_norm=FLAGS.norm_ratio)
                train_op = optimizer.apply_gradients(zip(grads, variables), global_step=cnn.global_step,
                                                     name="train_op")

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in zip(grads, variables):
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{0}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{0}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            if FLAGS.train_or_restore == 'R':
                model_id = input("☛ Please input the checkpoints model you want to restore, "
                                 "it should be like(1490175368): ")  # The model you want to restore

                # Run directories are named after a 10-digit unix timestamp.
                while not (model_id.isdigit() and len(model_id) == 10):
                    model_id = input("✘ The format of your input is illegal, please re-input: ")
                logger.info("✔︎ The format of your input is legal, now loading to next step...")
                out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", model_id))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))
            else:
                timestamp = str(int(time.time()))
                out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
                logger.info("✔︎ Writing to {0}\n".format(out_dir))

            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            best_checkpoint_dir = os.path.abspath(os.path.join(out_dir, "bestcheckpoints"))

            # Summaries for loss
            loss_summary = tf.summary.scalar("loss", cnn.loss)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Validation summaries
            validation_summary_op = tf.summary.merge([loss_summary])
            validation_summary_dir = os.path.join(out_dir, "summaries", "validation")
            validation_summary_writer = tf.summary.FileWriter(validation_summary_dir, sess.graph)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
            best_saver = cm.BestCheckpointSaver(save_dir=best_checkpoint_dir, num_to_keep=3, maximize=True)

            if FLAGS.train_or_restore == 'R':
                # Load cnn model
                logger.info("✔︎ Loading model...")
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                logger.info(checkpoint_file)

                # Load the saved meta graph and restore variables
                # NOTE(review): import_meta_graph adds the saved graph's nodes
                # to the current default graph, which already contains the
                # freshly built model — confirm this duplication is intended.
                saver = tf.train.import_meta_graph("{0}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)
            else:
                if not os.path.exists(checkpoint_dir):
                    os.makedirs(checkpoint_dir)
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())

                # Embedding visualization config
                config = projector.ProjectorConfig()
                embedding_conf = config.embeddings.add()
                embedding_conf.tensor_name = "embedding"
                embedding_conf.metadata_path = FLAGS.metadata_file

                projector.visualize_embeddings(train_summary_writer, config)
                projector.visualize_embeddings(validation_summary_writer, config)

                # Save the embedding visualization
                saver.save(sess, os.path.join(out_dir, "embedding", "embedding.ckpt"))

            current_step = sess.run(cnn.global_step)

            def train_step(x_batch, y_batch):
                """Run one optimizer step on a batch and log its loss and summaries."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    cnn.is_training: True
                }
                _, step, summaries, loss = sess.run(
                    [train_op, cnn.global_step, train_summary_op, cnn.loss], feed_dict)
                logger.info("step {0}: loss {1:g}".format(step, loss))
                train_summary_writer.add_summary(summaries, step)

            def validation_step(x_val, y_val, writer=None):
                """Evaluate the model on a validation set.

                Runs the whole set once in batches of ``FLAGS.batch_size`` and
                returns ``(loss, auc, auprc, recall_ts, precision_ts, f_ts,
                recall_tk, precision_tk, f_tk)``: the mean batch loss, the
                micro-averaged AUC / average precision, the micro-averaged
                threshold-based metrics, and lists of top-k metrics for
                k = 1..``FLAGS.top_num``. Per-batch summaries are written to
                ``writer`` when one is given.
                """
                batches_validation = dh.batch_iter(list(zip(x_val, y_val)), FLAGS.batch_size, 1)

                # Predict classes by threshold or topk ('ts': threshold; 'tk': topk)
                eval_counter, eval_loss = 0, 0.0

                eval_pre_tk = [0.0] * FLAGS.top_num
                eval_rec_tk = [0.0] * FLAGS.top_num
                eval_F_tk = [0.0] * FLAGS.top_num

                true_onehot_labels = []
                predicted_onehot_scores = []
                predicted_onehot_labels_ts = []
                predicted_onehot_labels_tk = [[] for _ in range(FLAGS.top_num)]

                for batch_validation in batches_validation:
                    x_batch_val, y_batch_val = zip(*batch_validation)
                    feed_dict = {
                        cnn.input_x: x_batch_val,
                        cnn.input_y: y_batch_val,
                        cnn.dropout_keep_prob: 1.0,
                        cnn.is_training: False
                    }
                    step, summaries, scores, cur_loss = sess.run(
                        [cnn.global_step, validation_summary_op, cnn.scores, cnn.loss], feed_dict)

                    # Prepare for calculating metrics
                    true_onehot_labels.extend(y_batch_val)
                    predicted_onehot_scores.extend(scores)

                    # Predict by threshold
                    predicted_onehot_labels_ts.extend(
                        dh.get_onehot_label_threshold(scores=scores, threshold=FLAGS.threshold))

                    # Predict by topK
                    for top_num in range(FLAGS.top_num):
                        predicted_onehot_labels_tk[top_num].extend(
                            dh.get_onehot_label_topk(scores=scores, top_num=top_num+1))

                    eval_loss = eval_loss + cur_loss
                    eval_counter = eval_counter + 1

                    if writer:
                        writer.add_summary(summaries, step)

                eval_loss = float(eval_loss / eval_counter)

                # Convert the accumulated lists once instead of once per metric.
                y_true = np.array(true_onehot_labels)
                y_score = np.array(predicted_onehot_scores)
                y_pred_ts = np.array(predicted_onehot_labels_ts)

                # Calculate Precision & Recall & F1 (threshold & topK)
                eval_pre_ts = precision_score(y_true=y_true, y_pred=y_pred_ts, average='micro')
                eval_rec_ts = recall_score(y_true=y_true, y_pred=y_pred_ts, average='micro')
                eval_F_ts = f1_score(y_true=y_true, y_pred=y_pred_ts, average='micro')

                for top_num in range(FLAGS.top_num):
                    y_pred_tk = np.array(predicted_onehot_labels_tk[top_num])
                    eval_pre_tk[top_num] = precision_score(y_true=y_true, y_pred=y_pred_tk, average='micro')
                    eval_rec_tk[top_num] = recall_score(y_true=y_true, y_pred=y_pred_tk, average='micro')
                    eval_F_tk[top_num] = f1_score(y_true=y_true, y_pred=y_pred_tk, average='micro')

                # Calculate the average AUC
                eval_auc = roc_auc_score(y_true=y_true, y_score=y_score, average='micro')
                # Calculate the average PR
                eval_prc = average_precision_score(y_true=y_true, y_score=y_score, average='micro')

                return eval_loss, eval_auc, eval_prc, eval_rec_ts, eval_pre_ts, eval_F_ts, \
                       eval_rec_tk, eval_pre_tk, eval_F_tk

            # Generate batches
            batches_train = dh.batch_iter(
                list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

            num_batches_per_epoch = int((len(x_train) - 1) / FLAGS.batch_size) + 1

            try:
                # Training loop. For each batch...
                for batch_train in batches_train:
                    x_batch_train, y_batch_train = zip(*batch_train)
                    train_step(x_batch_train, y_batch_train)
                    current_step = tf.train.global_step(sess, cnn.global_step)

                    if current_step % FLAGS.evaluate_every == 0:
                        logger.info("\nEvaluation:")
                        eval_loss, eval_auc, eval_prc, \
                        eval_rec_ts, eval_pre_ts, eval_F_ts, eval_rec_tk, eval_pre_tk, eval_F_tk = \
                            validation_step(x_val, y_val, writer=validation_summary_writer)

                        logger.info("All Validation set: Loss {0:g} | AUC {1:g} | AUPRC {2:g}"
                                    .format(eval_loss, eval_auc, eval_prc))

                        # Predict by threshold
                        logger.info("☛ Predict by threshold: Precision {0:g}, Recall {1:g}, F {2:g}"
                                    .format(eval_pre_ts, eval_rec_ts, eval_F_ts))

                        # Predict by topK
                        logger.info("☛ Predict by topK:")
                        for top_num in range(FLAGS.top_num):
                            logger.info("Top{0}: Precision {1:g}, Recall {2:g}, F {3:g}"
                                        .format(top_num+1, eval_pre_tk[top_num], eval_rec_tk[top_num],
                                                eval_F_tk[top_num]))
                        best_saver.handle(eval_prc, sess, current_step)
                    if current_step % FLAGS.checkpoint_every == 0:
                        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logger.info("✔︎ Saved model checkpoint to {0}\n".format(path))
                    if current_step % num_batches_per_epoch == 0:
                        current_epoch = current_step // num_batches_per_epoch
                        logger.info("✔︎ Epoch {0} has finished!".format(current_epoch))
            finally:
                # FileWriter buffers events; closing flushes them so the last
                # summaries are not lost when the process exits.
                train_summary_writer.close()
                validation_summary_writer.close()

    logger.info("✔︎ Done.")
Exemplo n.º 49
0
def train_optimizer(logdir,
                    optimizer_spec,
                    problems_and_data,
                    num_problems,
                    num_meta_iterations,
                    num_unroll_func,
                    num_partial_unroll_itrs_func,
                    learning_rate=1e-4,
                    gradient_clip=5.,
                    is_chief=False,
                    select_random_problems=True,
                    callbacks=None,
                    obj_train_max_multiplier=-1,
                    out=sys.stdout):
    """Trains the meta-parameters of this optimizer.

    For every sampled problem a fresh graph is built, the optimizer is
    unrolled on the problem for ``num_meta_iterations`` meta-iterations, the
    meta-parameters are updated with RMSProp on clipped gradients, and (on the
    chief) a checkpoint is written before moving to the next problem.

    Args:
      logdir: a directory filepath for storing model checkpoints (must exist)
      optimizer_spec: specification for an Optimizer (see utils.Spec)
      problems_and_data: a list of tuples containing three elements: a problem
        specification (see utils.Spec), a dataset (see datasets.Dataset), and
        a batch_size (int) for generating a problem and corresponding dataset.
        If the problem doesn't have data, set dataset to None. The list itself
        is not modified.
      num_problems: the number of problems to sample during meta-training
      num_meta_iterations: the number of iterations (steps) to run the
        meta-optimizer for on each subproblem.
      num_unroll_func: called once per meta iteration and returns the number of
        unrolls to do for that meta iteration.
      num_partial_unroll_itrs_func: called once per unroll and returns the
        number of iterations to do for that unroll.
      learning_rate: learning rate of the RMSProp meta-optimizer
        (Default: 1e-4)
      gradient_clip: value to clip gradients at (Default: 5.0)
      is_chief: whether this is the chief task (Default: False)
      select_random_problems: whether to select training problems randomly
        (Default: True)
      callbacks: a list of callback functions that is run after every random
        problem draw
      obj_train_max_multiplier: the maximum increase in the objective value
        over a single training run. Ignored if < 0.
      out: where to write output to, e.g. a file handle (Default: sys.stdout)

    Raises:
      ValueError: If one of the subproblems has a negative objective value.
    """

    if select_random_problems:
        # iterate over random draws of problem / dataset pairs
        sampler = (random.choice(problems_and_data) for _ in range(num_problems))
    else:
        # iterate over a random shuffle of problems, looping if necessary.
        # Floor division keeps the repeat count an int on Python 3 (true
        # division would make `list * float` raise TypeError), and shuffling a
        # copy leaves the caller's list in its original order.
        num_repeats = (num_problems // len(problems_and_data)) + 1
        shuffled_problems = list(problems_and_data)
        random.shuffle(shuffled_problems)
        sampler = (shuffled_problems * num_repeats)[:num_problems]

    for problem_itr, (problem_spec, dataset, batch_size) in enumerate(sampler):

        # timer used to time how long it takes to initialize a problem
        problem_start_time = time.time()

        # if dataset is None, use the EMPTY_DATASET
        if dataset is None:
            dataset = datasets.EMPTY_DATASET
            batch_size = dataset.size

        # build a new graph for this problem
        graph = tf.Graph()
        real_device_setter = tf.train.replica_device_setter(FLAGS.ps_tasks)

        def custom_device_setter(op):
            # Places the local variables onto the workers.
            if trainable_optimizer.is_local_state_variable(op):
                return "/job:worker"
            else:
                return real_device_setter(op)

        if real_device_setter:
            device_setter = custom_device_setter
        else:
            device_setter = None

        with graph.as_default(), graph.device(device_setter):

            # initialize a problem
            problem = problem_spec.build()

            # build the optimizer
            opt = optimizer_spec.build()

            # get the meta-objective for training the optimizer
            train_output = opt.train(problem, dataset)

            state_keys = opt.state_keys
            for key, val in zip(state_keys, train_output.output_state[0]):
                finite_val = utils.make_finite(val, replacement=tf.zeros_like(val))
                tf.summary.histogram("State/{}".format(key), finite_val,
                                     collections=[OPT_SUM_COLLECTION])

            tf.summary.scalar("MetaObjective", train_output.metaobj,
                              collections=[OPT_SUM_COLLECTION])

            # Per-problem meta-objective
            tf.summary.scalar(problem_spec.callable.__name__ + "_MetaObjective",
                              train_output.metaobj,
                              collections=[OPT_SUM_COLLECTION])

            # create the meta-train_op
            global_step = tf.Variable(0, name="global_step", trainable=False)
            meta_parameters = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope=OPTIMIZER_SCOPE)
            # parameter regularization
            reg_l2 = FLAGS.l2_reg * sum([tf.reduce_sum(param ** 2)
                                         for param in meta_parameters])

            # compute the meta-gradients
            meta_opt = tf.train.RMSPropOptimizer(learning_rate, decay=FLAGS.rms_decay,
                                                 use_locking=True,
                                                 epsilon=FLAGS.rms_epsilon)
            grads_and_vars = meta_opt.compute_gradients(train_output.metaobj + reg_l2,
                                                        meta_parameters)

            # clip the gradients
            clipped_grads_and_vars = []
            for grad, var in grads_and_vars:
                clipped_grad = tf.clip_by_value(
                    utils.make_finite(grad, replacement=tf.zeros_like(var)),
                    -gradient_clip, gradient_clip)
                clipped_grads_and_vars.append((clipped_grad, var))

            # histogram summary of grads and vars
            for grad, var in grads_and_vars:
                tf.summary.histogram(
                    var.name + "_rawgrad",
                    utils.make_finite(
                        grad, replacement=tf.zeros_like(grad)),
                    collections=[OPT_SUM_COLLECTION])
            for grad, var in clipped_grads_and_vars:
                tf.summary.histogram(var.name + "_var", var,
                                     collections=[OPT_SUM_COLLECTION])
                tf.summary.histogram(var.name + "_grad", grad,
                                     collections=[OPT_SUM_COLLECTION])

            # builds the train and summary operations
            train_op = meta_opt.apply_gradients(clipped_grads_and_vars,
                                                global_step=global_step)

            # only grab summaries defined for LOL, not inside the problem
            summary_op = tf.summary.merge_all(key=OPT_SUM_COLLECTION)

            # make sure the state gets propagated after the gradients and summaries
            # were computed.
            with tf.control_dependencies([train_op, summary_op]):
                propagate_loop_state_ops = []
                for dest, src in zip(
                        train_output.init_loop_vars, train_output.output_loop_vars):
                    propagate_loop_state_ops.append(dest.assign(src))
                propagate_loop_state_op = tf.group(*propagate_loop_state_ops)

            # create the supervisor
            sv = tf.train.Supervisor(
                graph=graph,
                is_chief=is_chief,
                logdir=logdir,
                summary_op=None,
                save_model_secs=0,  # we save checkpoints manually
                global_step=global_step,
            )

            with sv.managed_session() as sess:

                init_time = time.time() - problem_start_time
                out.write("--------- Problem #{} ---------\n".format(problem_itr))
                out.write("{callable.__name__}{args}{kwargs}\n".format(
                    **problem_spec.__dict__))
                out.write("Took {} seconds to initialize.\n".format(init_time))
                out.flush()

                # For profiling summaries
                if FLAGS.set_profiling:
                    summary_writer = tf.summary.FileWriter(logdir, graph=sess.graph)

                # used to store information during training
                metadata = defaultdict(list)

                for k in range(num_meta_iterations):

                    if sv.should_stop():
                        break

                    problem.init_fn(sess)

                    # set run options (for profiling)
                    full_trace_opt = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_options = full_trace_opt if FLAGS.set_profiling else None
                    run_metadata = tf.RunMetadata() if FLAGS.set_profiling else None

                    num_unrolls = num_unroll_func()
                    partial_unroll_iters = [
                        num_partial_unroll_itrs_func() for _ in range(num_unrolls)
                    ]
                    total_num_iter = sum(partial_unroll_iters)

                    # each unroll weights its iterations uniformly
                    objective_weights = [np.ones(num) / float(num)
                                         for num in partial_unroll_iters]
                    # split the batch indices into one contiguous slice per unroll
                    db = dataset.batch_indices(total_num_iter, batch_size)
                    dataset_batches = []
                    last_index = 0
                    for num in partial_unroll_iters:
                        dataset_batches.append(db[last_index:last_index + num])
                        last_index += num

                    train_start_time = time.time()

                    unroll_itr = 0
                    additional_log_info = ""

                    for unroll_itr in range(num_unrolls):
                        first_unroll = unroll_itr == 0
                        if FLAGS.reset_rnn_params:
                            reset_state = first_unroll and k == 0
                        else:
                            reset_state = first_unroll

                        feed = {
                            train_output.obj_weights: objective_weights[unroll_itr],
                            train_output.batches: dataset_batches[unroll_itr],
                            train_output.first_unroll: first_unroll,
                            train_output.reset_state: reset_state,
                        }

                        # run the train and summary ops
                        # when a "save_diagnostics" flag is turned on
                        fetches_list = [
                            train_output.metaobj,
                            train_output.problem_objectives,
                            train_output.initial_obj,
                            summary_op,
                            clipped_grads_and_vars,
                            train_op
                        ]
                        # the loop state only needs carrying over when another
                        # unroll follows this one
                        if unroll_itr + 1 < num_unrolls:
                            fetches_list += [propagate_loop_state_op]

                        fetched = sess.run(fetches_list, feed_dict=feed,
                                           options=run_options, run_metadata=run_metadata)
                        meta_obj = fetched[0]
                        sub_obj = fetched[1]
                        init_obj = fetched[2]
                        summ = fetched[3]
                        meta_grads_and_params = fetched[4]

                        # assert that the subproblem objectives are non-negative
                        # (this is so that we can rescale the objective by the initial value
                        # and not worry about rescaling by a negative value)
                        if np.any(sub_obj < 0):
                            raise ValueError(
                                "Training problem objectives must be nonnegative.")
                        # If the objective has increased more than we want, exit this
                        # training run and start over on another meta iteration.
                        if obj_train_max_multiplier > 0 and (
                                sub_obj[-1] > (init_obj +
                                               abs(init_obj) * (obj_train_max_multiplier - 1))):
                            msg = " Broke early at {} out of {} unrolls. ".format(
                                unroll_itr + 1, num_unrolls)
                            additional_log_info += msg
                            break

                        # only the chief task is allowed to write the summary
                        if is_chief:
                            sv.summary_computed(sess, summ)

                        metadata["subproblem_objs"].append(sub_obj)
                        # store training metadata to pass to the callback
                        metadata["meta_objs"].append(meta_obj)
                        metadata["meta_grads_and_params"].append(meta_grads_and_params)

                    optimization_time = time.time() - train_start_time

                    if FLAGS.set_profiling:
                        summary_name = "%02d_iter%04d_%02d" % (FLAGS.task, problem_itr, k)
                        summary_writer.add_run_metadata(run_metadata, summary_name)

                    metadata["global_step"].append(sess.run(global_step))
                    metadata["runtimes"].append(optimization_time)

                    # write a diagnostic message to the output
                    args = (k, meta_obj, optimization_time,
                            sum(partial_unroll_iters[:unroll_itr + 1]))
                    out.write("  [{:02}] {}, {} seconds, {} iters ".format(*args))
                    out.write("(unrolled {} steps)".format(
                        ", ".join([str(s) for s in partial_unroll_iters[:unroll_itr + 1]])))
                    out.write("{}\n".format(additional_log_info))
                    out.flush()

                if FLAGS.set_profiling:
                    summary_writer.close()

                # force a checkpoint save before we load a new problem
                # only the chief task has the save_path and can write the checkpoint
                if is_chief:
                    sv.saver.save(sess, sv.save_path, global_step=global_step)

        # run the callbacks on the chief
        if is_chief and callbacks is not None:
            for callback in callbacks:
                if hasattr(callback, "__call__"):
                    problem_name = problem_spec.callable.__name__
                    callback(problem_name, problem_itr, logdir, metadata)
Exemplo n.º 50
0
    def _build_lstms(self):
        """Build the stacked forward and backward LSTM layers.

        Reads the LSTM hyper-parameters from ``self.options['lstm']``, derives
        the input mask and per-example sequence lengths from
        ``self.ids_placeholder``, and runs ``n_layers`` LSTM layers per
        direction over ``self.embedding`` with ``tf.nn.dynamic_rnn``. The
        backward direction consumes the input reversed within each example's
        length, and its outputs are reversed back before being stored.

        Sets ``self.lstm_outputs``, ``self.lstm_state_sizes``,
        ``self.lstm_init_states`` and ``self.lstm_final_states`` (dicts keyed
        by ``'forward'`` / ``'backward'`` holding one entry per layer), plus
        ``self.mask``, ``self.sequence_lengths`` and ``self.update_state_op``.
        """
        # parse the options
        lstm_options = self.options['lstm']
        lstm_dim = lstm_options['dim']
        projection_dim = lstm_options['projection_dim']
        n_lstm_layers = lstm_options.get('n_layers', 1)
        cell_clip = lstm_options.get('cell_clip')
        proj_clip = lstm_options.get('proj_clip')
        use_skip_connections = lstm_options['use_skip_connections']
        print("USING SKIP CONNECTIONS" if use_skip_connections
              else "NOT USING SKIP CONNECTIONS")

        # Keyword arguments shared by every LSTM cell; the output is only
        # projected down when the projection is smaller than the cell size.
        cell_kwargs = {'cell_clip': cell_clip, 'proj_clip': proj_clip}
        if projection_dim < lstm_dim:
            cell_kwargs['num_proj'] = projection_dim

        # the sequence lengths from input mask
        nonzero_ids = self.ids_placeholder > 0
        if self.use_character_inputs:
            mask = tf.reduce_any(nonzero_ids, axis=2)
        else:
            mask = nonzero_ids
        sequence_lengths = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
        batch_size = tf.shape(sequence_lengths)[0]

        # for each direction, we'll store tensors for each layer
        directions = ('forward', 'backward')
        self.lstm_outputs = {name: [] for name in directions}
        self.lstm_state_sizes = {name: [] for name in directions}
        self.lstm_init_states = {name: [] for name in directions}
        self.lstm_final_states = {name: [] for name in directions}

        update_ops = []
        for i_direction, direction in enumerate(directions):
            is_backward = direction == 'backward'

            # the backward LSTM reads each sequence from its last token
            if is_backward:
                layer_input = tf.reverse_sequence(self.embedding,
                                                  sequence_lengths,
                                                  seq_axis=1,
                                                  batch_axis=0)
            else:
                layer_input = self.embedding

            for layer_idx in range(n_lstm_layers):
                lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim, **cell_kwargs)

                # ResidualWrapper adds inputs to outputs; the first layer is
                # left unwrapped so the token embedding is not added to its
                # output.
                if use_skip_connections and layer_idx > 0:
                    lstm_cell = tf.nn.rnn_cell.ResidualWrapper(lstm_cell)

                # the LSTMs are stateful.  To support multiple batch sizes,
                # we'll allocate size for states up to max_batch_size,
                # then use the first batch_size entries for each batch
                init_states = [
                    tf.Variable(tf.zeros([self._max_batch_size, dim]),
                                trainable=False)
                    for dim in lstm_cell.state_size
                ]
                batch_init_states = [
                    state[:batch_size, :] for state in init_states
                ]

                # run the dynamic rnn from the stored state, collect the output
                scope_name = 'RNN_{0}/RNN/MultiRNNCell/Cell{1}'.format(
                    i_direction, layer_idx)
                with tf.variable_scope(scope_name):
                    layer_output, final_state = tf.nn.dynamic_rnn(
                        lstm_cell,
                        layer_input,
                        sequence_length=sequence_lengths,
                        initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                            *batch_init_states),
                    )

                self.lstm_state_sizes[direction].append(lstm_cell.state_size)
                self.lstm_init_states[direction].append(init_states)
                self.lstm_final_states[direction].append(final_state)

                # store outputs in the original token order for both directions
                if is_backward:
                    stored_output = tf.reverse_sequence(layer_output,
                                                        sequence_lengths,
                                                        seq_axis=1,
                                                        batch_axis=0)
                else:
                    stored_output = layer_output
                self.lstm_outputs[direction].append(stored_output)

                with tf.control_dependencies([layer_output]):
                    # update the initial states: the first batch_size rows
                    # take the final state, the remaining rows are kept
                    for state_var, final_part in zip(init_states, final_state):
                        refreshed_state = tf.concat(
                            [final_part[:batch_size, :],
                             state_var[batch_size:, :]],
                            axis=0)
                        update_ops.append(tf.assign(state_var, refreshed_state))

                # the next layer consumes this layer's (un-reversed) output
                layer_input = layer_output

        self.mask = mask
        self.sequence_lengths = sequence_lengths
        self.update_state_op = tf.group(*update_ops)
		def mean_var_with_update():
			"""Return identities of the batch mean and variance.

			Both identity ops are created under a control dependency on
			``ema_apply_op``.
			"""
			with tf.control_dependencies([ema_apply_op]):
				mean_out = tf.identity(batch_mean)
				var_out = tf.identity(batch_var)
			return mean_out, var_out
Exemplo n.º 52
0
def _model_fn(features, labels, mode, params, variable_filter_fn=None):
    """Model definition for the Mask-RCNN model based on ResNet.

    Args:
      features: the input image tensor and auxiliary information, such as
        `image_info` and `source_ids`. The image tensor has a shape of
        [batch_size, height, width, 3]. The height and width are fixed and
        equal.
      labels: the input labels in a dictionary. The labels include score
        targets and box targets which are dense label maps. The labels are
        generated from get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      variable_filter_fn: the filter function that takes trainable_variables
        and returns the variable list after applying the filter rule.

    Returns:
      A `TPUEstimatorSpec` when `params['use_tpu']` is truthy, otherwise a
      plain `EstimatorSpec`, to run training, evaluation, or prediction.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    is_predict = mode == tf.estimator.ModeKeys.PREDICT

    if is_predict:
        if (params['include_groundtruth_in_features']
                and 'labels' in features):
            # Groundtruth is bundled into the features so it can be used for
            # eval.
            labels = features['labels']
        else:
            labels = None
        if 'features' in features:
            features = features['features']
            # Otherwise, it is in export mode and the features are passed in
            # directly.

    if params['use_bfloat16']:
        with tf.contrib.tpu.bfloat16_scope():
            model_outputs = build_model_graph(features, labels, is_training,
                                              params)
            model_outputs.update({
                'source_id': features['source_ids'],
                'image_info': features['image_info'],
            })

            def cast_outputs_to_float(d):
                """Casts every leaf of the (nested) dict `d` to float32."""
                for k, v in sorted(six.iteritems(d)):
                    if isinstance(v, dict):
                        cast_outputs_to_float(v)
                    else:
                        d[k] = tf.cast(v, tf.float32)

            # Note: this also casts the `source_id` / `image_info` entries
            # that were just added above.
            cast_outputs_to_float(model_outputs)
    else:
        model_outputs = build_model_graph(features, labels, is_training,
                                          params)
        model_outputs.update({
            'source_id': features['source_ids'],
            'image_info': features['image_info'],
        })

    # First check if it is in PREDICT mode.
    if is_predict:
        predictions = {}
        if labels and params['include_groundtruth_in_features']:
            # Labels can only be embedded in predictions. The prediction
            # cannot output a dictionary as a value.
            predictions.update(labels)
        model_outputs.pop('fpn_features', None)
        predictions.update(model_outputs)

        if params['use_tpu']:
            return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                                   predictions=predictions)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
        global_step, params['init_learning_rate'],
        params['warmup_learning_rate'], params['warmup_steps'],
        params['learning_rate_levels'], params['learning_rate_steps'])
    # score_loss and box_loss are for logging. only total_loss is optimized.
    total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
        model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
        labels, params)

    (total_fast_rcnn_loss, fast_rcnn_class_loss,
     fast_rcnn_box_loss) = losses.fast_rcnn_loss(
         model_outputs['class_outputs'], model_outputs['box_outputs'],
         model_outputs['class_targets'], model_outputs['box_targets'], params)
    # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py  # pylint: disable=line-too-long
    if is_training and params['include_mask']:
        mask_loss = losses.mask_rcnn_loss(
            model_outputs['mask_outputs'], model_outputs['mask_targets'],
            model_outputs['selected_class_targets'], params)
    else:
        mask_loss = 0.
    if variable_filter_fn and ('resnet' in params['backbone']):
        var_list = variable_filter_fn(tf.trainable_variables(),
                                      params['backbone'] + '/')
    else:
        var_list = tf.trainable_variables()
    # Weight decay skips batch-norm parameters and biases (matched by name).
    l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v) for v in var_list
        if 'batch_normalization' not in v.name and 'bias' not in v.name
    ])
    total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss +
                  l2_regularization_loss)

    host_call = None
    train_op = None
    scaffold_fn = None
    if is_training:
        optimizer = create_optimizer(learning_rate, params)
        if params['use_tpu']:
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        if params['warm_start_path']:

            def warm_start_scaffold_fn():
                """Initializes variables from `params['warm_start_path']`."""
                tf.logging.info('model_fn warm start from: %s',
                                params['warm_start_path'])
                assignment_map = _build_assigment_map(
                    optimizer,
                    prefix=None,
                    skip_variables_regex=params['skip_checkpoint_variables'])
                tf.train.init_from_checkpoint(params['warm_start_path'],
                                              assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = warm_start_scaffold_fn

        elif params['checkpoint']:

            def backbone_scaffold_fn():
                """Loads pretrained model through scaffold function."""
                # Exclude all variable of optimizer.
                vars_to_load = _build_assigment_map(
                    optimizer,
                    prefix=params['backbone'] + '/',
                    skip_variables_regex=params['skip_checkpoint_variables'])
                tf.train.init_from_checkpoint(params['checkpoint'],
                                              vars_to_load)
                if not vars_to_load:
                    raise ValueError('Variables to load is empty.')
                return tf.train.Scaffold()

            scaffold_fn = backbone_scaffold_fn

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
        if params['global_gradient_clip_ratio'] > 0:
            # Clips the gradients for training stability.
            # Refer: https://arxiv.org/abs/1211.5063
            with tf.name_scope('clipping'):
                old_grads, variables = zip(*grads_and_vars)
                num_weights = sum(g.shape.num_elements() for g in old_grads
                                  if g is not None)
                clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
                    num_weights)
                tf.logging.info(
                    'Global clip norm set to %g for %d variables with %d elements.'
                    % (clip_norm, sum(
                        1 for g in old_grads if g is not None), num_weights))
                gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
        else:
            gradients, variables = zip(*grads_and_vars)
        # Special treatment for biases (beta is named as bias in reference model)
        # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113  # pylint: disable=line-too-long
        grads_and_vars = []
        for grad, var in zip(gradients, variables):
            if grad is not None and ('beta' in var.name or 'bias' in var.name):
                grad = 2.0 * grad
            grads_and_vars.append((grad, var))

        # tf.control_dependencies only attaches to ops *created* inside the
        # context, so the apply op has to be built here; merely rebinding an
        # already-built op inside the `with` would leave update_ops unrun.
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

        if params['use_host_call']:

            def host_call_fn(global_step, total_loss, total_rpn_loss,
                             rpn_score_loss, rpn_box_loss,
                             total_fast_rcnn_loss, fast_rcnn_class_loss,
                             fast_rcnn_box_loss, mask_loss, learning_rate):
                """Training host call. Creates scalar summaries for metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide as part of
                the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                  global_step: `Tensor` with shape `[batch, ]` for the
                    global_step.
                  total_loss: `Tensor` with shape `[batch, ]` for the training
                    loss.
                  total_rpn_loss: `Tensor` with shape `[batch, ]` for the
                    training RPN loss.
                  rpn_score_loss: `Tensor` with shape `[batch, ]` for the
                    training RPN score loss.
                  rpn_box_loss: `Tensor` with shape `[batch, ]` for the
                    training RPN box loss.
                  total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for
                    the training Mask-RCNN loss.
                  fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for
                    the training Mask-RCNN class loss.
                  fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
                    training Mask-RCNN box loss.
                  mask_loss: `Tensor` with shape `[batch, ]` for the training
                    Mask-RCNN mask loss.
                  learning_rate: `Tensor` with shape `[batch, ]` for the
                    learning_rate.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                # Outfeed supports int32 but global_step is expected to be int64.
                global_step = tf.reduce_mean(global_step)
                # (summary tag, batched tensor) pairs, in emission order.
                scalars = [
                    ('total_loss', total_loss),
                    ('total_rpn_loss', total_rpn_loss),
                    ('rpn_score_loss', rpn_score_loss),
                    ('rpn_box_loss', rpn_box_loss),
                    ('total_fast_rcnn_loss', total_fast_rcnn_loss),
                    ('fast_rcnn_class_loss', fast_rcnn_class_loss),
                    ('fast_rcnn_box_loss', fast_rcnn_box_loss),
                ]
                if params['include_mask']:
                    scalars.append(('mask_loss', mask_loss))
                scalars.append(('learning_rate', learning_rate))
                # Host call fns are executed FLAGS.iterations_per_loop times
                # after one TPU loop is finished, setting max_queue value to
                # the same as number of iterations will make the summary
                # writer only flush the data to storage once per loop.
                with (tf.contrib.summary.create_file_writer(
                        params['model_dir'],
                        max_queue=params['iterations_per_loop']).as_default()):
                    with tf.contrib.summary.always_record_summaries():
                        for tag, value in scalars:
                            tf.contrib.summary.scalar(tag,
                                                      tf.reduce_mean(value),
                                                      step=global_step)

                        return tf.contrib.summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for
            # Tensorboard, the summary op needs to be run on the host CPU via
            # host_call. host_call expects [batch_size, ...] Tensors, thus
            # reshape to introduce a batch dimension. These Tensors are
            # implicitly concatenated to [params['batch_size']].
            # Order must match the host_call_fn signature.
            host_call_inputs = (
                global_step, total_loss, total_rpn_loss, rpn_score_loss,
                rpn_box_loss, total_fast_rcnn_loss, fast_rcnn_class_loss,
                fast_rcnn_box_loss, mask_loss, learning_rate)
            host_call = (host_call_fn,
                         [tf.reshape(t, [1]) for t in host_call_inputs])

    if params['use_tpu']:
        return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                               loss=total_loss,
                                               train_op=train_op,
                                               host_call=host_call,
                                               scaffold_fn=scaffold_fn)
    # NOTE(review): scaffold_fn and host_call are not forwarded on this
    # non-TPU path, so checkpoint warm start / host summaries only take
    # effect when params['use_tpu'] is set -- confirm this is intended.
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=total_loss,
                                      train_op=train_op)
Exemplo n.º 53
0
def train():
    """Builds the Inception training graph and runs the training loop.

    Reads the pickled training list from `train_set_list.pickle`, builds a
    single-tower graph fed through placeholders, restores weights (from
    `FLAGS.ckpt_restore_dir` when fine-tuning, otherwise from
    `FLAGS.pretrained_model_checkpoint_path`), then trains for
    `FLAGS.max_steps` steps while periodically printing progress, writing
    summaries and saving checkpoints to `FLAGS.ckpt_save_dir`.

    Raises:
      EnvironmentError: if `train_set_list.pickle` cannot be opened.
    """
    # load train set list and transform it to queue.
    try:
        # Pickle streams are binary; text mode fails on Python 3.
        with open('train_set_list.pickle', 'rb') as f:
            train_set_list = pickle.load(f)
    except (IOError, OSError):
        # Only a missing/unreadable file maps to this message; other errors
        # (e.g. a corrupt pickle) propagate unchanged instead of being
        # mislabelled.
        raise EnvironmentError(
            'Data list not existed. Please run generate_data_list.py first.')
    random.shuffle(train_set_list)
    train_set_queue = deque(train_set_list)
    train_set_size = len(train_set_list)
    del train_set_list
    print('Training set built. Size: ' + str(train_set_size))

    # build the tensorflow graph.
    with tf.Graph().as_default() as g:

        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Floor division keeps this an int on Python 3 too; clamp to 1 so the
        # per-epoch modulo in the loop below never divides by zero when the
        # training set is smaller than one batch.
        num_batches_per_epoch = max(1, train_set_size // BATCH_SIZE)
        decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True)
        tf.summary.scalar('learning_rate', lr)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.RMSPropOptimizer(lr,
                                        RMSPROP_DECAY,
                                        momentum=RMSPROP_MOMENTUM,
                                        epsilon=RMSPROP_EPSILON)

        images = tf.placeholder(tf.float32,
                                shape=[BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3])

        labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE])

        logits = inception.inference(images,
                                     NUM_CLASSES,
                                     for_training=True,
                                     restore_logits=FLAGS.fine_tune,
                                     scope=None)

        # Called for its side effect; the losses are read back from the
        # collection just below.
        inception.loss(logits, labels, batch_size=BATCH_SIZE)

        # Assemble all of the losses for the current tower only.
        losses = tf.get_collection(slim.losses.LOSSES_COLLECTION, scope=None)

        # Calculate the total loss for the current tower.
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n(losses + regularization_losses,
                              name='total_loss')

        # Compute the moving average of all individual losses and the total
        # loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])

        # same for the averaged version of the losses.
        for l in losses + [total_loss]:
            # Name each loss as '(raw)' and name the moving average version of
            # the loss as the original loss name.
            tf.summary.scalar(l.op.name + ' (raw)', l)
            tf.summary.scalar(l.op.name, loss_averages.average(l))

        # Re-create total_loss under the dependency so that evaluating it
        # also updates the loss moving averages.
        with tf.control_dependencies([loss_averages_op]):
            total_loss = tf.identity(total_loss)

        batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                              scope=None)

        # Calculate the gradients for the batch of data on this ImageNet
        # tower.
        grads = opt.compute_gradients(total_loss)

        # Apply gradients.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            inception.MOVING_AVERAGE_DECAY, global_step)

        variables_to_average = (tf.trainable_variables() +
                                tf.moving_average_variables())
        variables_averages_op = variable_averages.apply(variables_to_average)

        # Group all updates to into a single train op.
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchnorm_updates_op)

        # Create a saver. (tf.global_variables replaces the deprecated
        # tf.all_variables.)
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge_all()

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # open session and initialize
        sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
        summary_writer = None
        try:
            sess.run(init)

            # restore old checkpoint
            if FLAGS.fine_tune:
                checkpoint = tf.train.get_checkpoint_state(
                    FLAGS.ckpt_restore_dir)
                if checkpoint and checkpoint.model_checkpoint_path:
                    saver.restore(sess, checkpoint.model_checkpoint_path)
                    print("Successfully loaded:", checkpoint.model_checkpoint_path)
                else:
                    print("Could not find old network weights")
            else:
                variables_to_restore = tf.get_collection(
                    slim.variables.VARIABLES_TO_RESTORE)
                restorer = tf.train.Saver(variables_to_restore)
                restorer.restore(sess, FLAGS.pretrained_model_checkpoint_path)
                print('%s: Pre-trained model restored from %s' %
                      (datetime.now(), FLAGS.pretrained_model_checkpoint_path))

            summary_writer = tf.summary.FileWriter(
                FLAGS.ckpt_save_dir,
                graph_def=sess.graph.as_graph_def(add_shapes=True))

            step = 1
            while step <= FLAGS.max_steps:
                start_time = time.time()
                # construct image batch and label batch for one step train.
                # Each element taken from the right end is pushed back on the
                # left, so the queue cycles through the set indefinitely.
                minibatch = []
                for _ in xrange(0, BATCH_SIZE):
                    element = train_set_queue.pop()
                    minibatch.append(element)
                    train_set_queue.appendleft(element)

                image_list = [load_image(d[0]) for d in minibatch]
                label_list = [d[1] for d in minibatch]

                image_batch = np.reshape(
                    np.array(image_list),
                    [BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, 3])
                label_batch = np.reshape(np.array(label_list), [BATCH_SIZE])

                batch_feed = {images: image_batch, labels: label_batch}

                _, loss_value = sess.run([train_op, total_loss],
                                         feed_dict=batch_feed)

                duration = time.time() - start_time

                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                if step == 1 or step % 10 == 0:
                    num_examples_per_step = BATCH_SIZE
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')

                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))

                # reshuffle the image list once per epoch
                if step % num_batches_per_epoch == 0:
                    random.shuffle(train_set_queue)

                # write summary periodically
                if step == 1 or step % 100 == 0:
                    summary_str = sess.run(summary_op, feed_dict=batch_feed)
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 1000 == 0:
                    checkpoint_path = os.path.join(FLAGS.ckpt_save_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

                step += 1
        finally:
            # Flush buffered summary events and release session resources
            # even when training stops on an error.
            if summary_writer is not None:
                summary_writer.close()
            sess.close()
Exemplo n.º 54
0
def main(_):
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    optimizer_name = FLAGS.optimizer
    optimizer_params = {}
    clip_gradients = 0.0
    if FLAGS.optimizer.find(":") > 0:
        optimizer = _maybe_load_yaml(FLAGS.optimizer)
        optimizer_name = optimizer["name"]
        if "params" in optimizer:
            optimizer_params = optimizer["params"]
        if "clip_gradients" in optimizer:
            clip_gradients = optimizer["clip_gradients"]

    tf.logging.info("optimizer_name = {} optimizer_params = {}".format(
        optimizer_name, optimizer_params))

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count,
        FLAGS.feature_type)

    fingerprint_size = model_settings['fingerprint_size']
    label_count = model_settings['label_count']
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
    # Figure out the learning rates for each training phase. Since it's often
    # effective to have high learning rates at the start of training, followed by
    # lower levels towards the end, the number of steps and learning rates can be
    # specified as comma-separated lists to define the rate at each stage. For
    # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
    # will run 13,000 training loops in total, with a rate of 0.001 for the first
    # 10,000, and 0.0001 for the final 3,000.
    training_steps_list = list(
        map(int, FLAGS.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' %
            (len(training_steps_list), len(learning_rates_list)))

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits, dropout_prob = models.create_model(fingerprint_input,
                                               model_settings,
                                               FLAGS.model_architecture,
                                               hparam_string=FLAGS.hparams,
                                               is_training=True)

    # Define loss and optimizer
    ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                        name='groundtruth_input')

    # Optionally we can add runtime checks to spot when NaNs or other symptoms of
    # numerical errors start occurring during training.
    control_dependencies = []
    if FLAGS.check_nans:
        checks = tf.add_check_numerics_ops()
        control_dependencies = [checks]

    # Create the back propagation and training evaluation machinery in the graph.
    with tf.name_scope('cross_entropy'):
        cross_entropy_mean = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=ground_truth_input,
                                                    logits=logits))
    tf.summary.scalar('cross_entropy', cross_entropy_mean)

    with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
        learning_rate_input = tf.placeholder(tf.float32, [],
                                             name='learning_rate_input')
        if optimizer_name == 'Momentum':
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate_input, **optimizer_params)
        elif optimizer_name.lower() == 'nadam':
            optimizer = tf.contrib.opt.NadamOptimizer(
                learning_rate=learning_rate_input, **optimizer_params)
        else:
            optimizer = tf.contrib.layers.OPTIMIZER_CLS_NAMES[optimizer_name](
                learning_rate=learning_rate_input, **optimizer_params)

        def _clip_gradients(grads_and_vars, value):
            """Clips gradients by global norm."""
            gradients, variables = zip(*grads_and_vars)
            gradients, _ = tf.clip_by_global_norm(gradients, value)

            return list(zip(gradients, variables))

        reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        if len(reg_loss):
            tf.logging.info("add regularization loss")
            grad_vars = optimizer.compute_gradients(cross_entropy_mean +
                                                    tf.reduce_mean(reg_loss))
        else:
            grad_vars = optimizer.compute_gradients(cross_entropy_mean)
        if clip_gradients > 0.0:
            grad_vars = _clip_gradients(grad_vars, clip_gradients)

        # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
        # https://github.com/tensorflow/tensorflow/issues/1122
        with tf.control_dependencies(tf.get_collection(
                tf.GraphKeys.UPDATE_OPS)):
            train_step = optimizer.apply_gradients(grad_vars)

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices,
                                           predicted_indices,
                                           num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    global_step = tf.contrib.framework.get_or_create_global_step()
    increment_global_step = tf.assign(global_step, global_step + 1)

    saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

    # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                              '/validation')

    tf.global_variables_initializer().run()

    start_step = 1

    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
        start_step = global_step.eval(session=sess)
    else:
        checkpoint_path = tf.train.latest_checkpoint(FLAGS.train_dir)
        if checkpoint_path:
            models.load_variables_from_checkpoint(sess, checkpoint_path)
            start_step = global_step.eval(session=sess)

    tf.logging.info('Training from step: %d ', start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         FLAGS.model_architecture + '.pbtxt')

    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url,
        FLAGS.data_dir,
        FLAGS.silence_percentage,
        FLAGS.unknown_percentage,
        FLAGS.wanted_words.split(','),
        FLAGS.validation_percentage,
        FLAGS.testing_percentage,
        model_settings,
        feature_scaling=FLAGS.feature_scaling)

    # Save list of words.
    with gfile.GFile(
            os.path.join(FLAGS.train_dir,
                         FLAGS.model_architecture + '_labels.txt'), 'w') as f:
        f.write('\n'.join(audio_processor.words_list))

    # Training loop.
    training_steps_max = np.sum(training_steps_list)

    data_offset = 0
    audio_processor.shuffle_data('training')

    for training_step in xrange(start_step, training_steps_max + 1):
        # Figure out what the current learning rate is.
        training_steps_sum = 0
        for i in range(len(training_steps_list)):
            training_steps_sum += training_steps_list[i]
            if training_step <= training_steps_sum:
                learning_rate_value = learning_rates_list[i]
                break

        if data_offset > audio_processor.set_size(
                'training') - FLAGS.batch_size:
            data_offset = 0
            audio_processor.shuffle_data('training')

        # Pull the audio samples we'll use for training.
        data_start = time.time()
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, data_offset, model_settings,
            FLAGS.background_frequency, FLAGS.background_volume,
            time_shift_samples, 'training', sess)
        tf.logging.info("---- get_data %s seconds ----" %
                        str(time.time() - data_start)[:5])

        data_offset += FLAGS.batch_size
        # Run the graph with this batch of training data.
        train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
            [
                merged_summaries, evaluation_step, cross_entropy_mean,
                train_step, increment_global_step
            ],
            feed_dict={
                fingerprint_input: train_fingerprints,
                ground_truth_input: train_ground_truth,
                learning_rate_input: learning_rate_value,
                dropout_prob: FLAGS.dropout_prob
            })
        train_writer.add_summary(train_summary, training_step)
        if training_step % 10 == 1:
            tf.logging.info(
                'Time: %s, Epoch #%d: step #%d: rate %f, accuracy %.1f%%, cross entropy %f'
                % (time.asctime(),
                   int((training_step * FLAGS.batch_size) /
                       audio_processor.set_size('training')), training_step,
                   learning_rate_value, train_accuracy * 100,
                   cross_entropy_value))
        is_last_step = (training_step == training_steps_max)
        if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
            set_size = audio_processor.set_size('validation')
            total_accuracy = 0
            total_conf_matrix = None
            for i in xrange(0, set_size, FLAGS.batch_size):
                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(FLAGS.batch_size, i,
                                             model_settings, 0.0, 0.0, 0,
                                             'validation', sess))
                # Run a validation step and capture training summaries for TensorBoard
                # with the `merged` op.
                validation_summary, validation_accuracy, conf_matrix = sess.run(
                    [merged_summaries, evaluation_step, confusion_matrix],
                    feed_dict={
                        fingerprint_input: validation_fingerprints,
                        ground_truth_input: validation_ground_truth,
                        dropout_prob: 1.0
                    })
                validation_writer.add_summary(validation_summary,
                                              training_step)
                batch_size = min(FLAGS.batch_size, set_size - i)
                total_accuracy += (validation_accuracy * batch_size) / set_size
                if total_conf_matrix is None:
                    total_conf_matrix = conf_matrix
                else:
                    total_conf_matrix += conf_matrix
            tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
            tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                            (training_step, total_accuracy * 100, set_size))

        # Save the model checkpoint periodically.
        if (training_step % FLAGS.save_step_interval == 0
                or training_step == training_steps_max):
            checkpoint_path = os.path.join(FLAGS.train_dir,
                                           FLAGS.model_architecture + '.ckpt')
            tf.logging.info('Saving to "%s-%d"', checkpoint_path,
                            training_step)
            saver.save(sess, checkpoint_path, global_step=training_step)

    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in xrange(0, set_size, FLAGS.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
                dropout_prob: 1.0
            })
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
Exemplo n.º 55
0
def main(unused_args):
  """Builds the training and test graphs, then runs the training loop.

  The dataset is chosen by `FLAGS.dataset` ('mnist' or 'cifar10'). Training
  runs for `FLAGS.steps` iterations; whenever the fetched global step is a
  multiple of `FLAGS.test_every_n`, the test metrics are evaluated, written as
  summaries under `<FLAGS.output_dir>/test` and passed to `show_metrics`.
  Afterwards the variables of the base predictor are saved under
  `<FLAGS.output_dir>/model`.

  Args:
    unused_args: Ignored.

  Raises:
    AssertionError: If `FLAGS.dataset` is neither 'mnist' nor 'cifar10', or if
      the number of test examples is not a multiple of `FLAGS.batch_size`.
  """
  logging.info('Training IBP on %s...', FLAGS.dataset.upper())
  step = tf.train.get_or_create_global_step()

  # Learning rate.
  learning_rate = ibp.parse_learning_rate(step, FLAGS.learning_rate)

  # Dataset.
  input_bounds = (0., 1.)
  num_classes = 10
  if FLAGS.dataset == 'mnist':
    data_train, data_test = tf.keras.datasets.mnist.load_data()
  else:
    assert FLAGS.dataset == 'cifar10', (
        'Unknown dataset "{}"'.format(FLAGS.dataset))
    data_train, data_test = tf.keras.datasets.cifar10.load_data()
    # Flatten the label arrays of both splits.
    data_train = (data_train[0], data_train[1].flatten())
    data_test = (data_test[0], data_test[1].flatten())
  data = ibp.build_dataset(data_train, batch_size=FLAGS.batch_size,
                           sequential=False)
  if FLAGS.dataset == 'cifar10':
    # Randomized augmentation is applied to the CIFAR-10 training images only.
    data = data._replace(image=ibp.randomize(
        data.image, (32, 32, 3), expand_shape=(40, 40, 3),
        crop_shape=(32, 32, 3), vertical_flip=True))

  # Base predictor network.
  original_predictor = ibp.DNN(num_classes, layers(FLAGS.model))
  predictor = original_predictor
  if FLAGS.dataset == 'cifar10':
    mean = (0.4914, 0.4822, 0.4465)
    std = (0.2023, 0.1994, 0.2010)
    predictor = ibp.add_image_normalization(original_predictor, mean, std)
  predictor = ibp.VerifiableModelWrapper(predictor)

  # Training.
  train_losses, train_loss, _ = ibp.create_classification_losses(
      step,
      data.image,
      data.label,
      predictor,
      FLAGS.epsilon_train,
      loss_weights={
          'nominal': {'init': FLAGS.nominal_xent_init,
                      'final': FLAGS.nominal_xent_final},
          'attack': {'init': FLAGS.attack_xent_init,
                     'final': FLAGS.attack_xent_final},
          'verified': {'init': FLAGS.verified_xent_init,
                       'final': FLAGS.verified_xent_final},
      },
      warmup_steps=FLAGS.warmup_steps,
      rampup_steps=FLAGS.rampup_steps,
      input_bounds=input_bounds)
  # Only the variables of the un-wrapped base predictor are checkpointed.
  saver = tf.train.Saver(original_predictor.get_variables())
  optimizer = tf.train.AdamOptimizer(learning_rate)
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(train_loss, step)

  # Test using while loop.
  def get_test_metrics(batch_size, attack_builder=ibp.UntargetedPGDAttack):
    """Returns the test metrics averaged over all test batches."""
    num_test_batches = len(data_test[0]) // batch_size
    assert len(data_test[0]) % batch_size == 0, (
        'Test data is not a multiple of batch size.')

    def cond(i, *unused_args):
      return i < num_test_batches

    def body(i, metrics):
      """Compute the sum of all metrics."""
      test_data = ibp.build_dataset(data_test, batch_size=batch_size,
                                    sequential=True)
      predictor(test_data.image, is_training=False)
      # Perturbation interval around each image, clipped to the input bounds.
      input_interval_bounds = ibp.IntervalBounds(
          tf.maximum(test_data.image - FLAGS.epsilon, input_bounds[0]),
          tf.minimum(test_data.image + FLAGS.epsilon, input_bounds[1]))
      predictor.propagate_bounds(input_interval_bounds)
      test_specification = ibp.ClassificationSpecification(
          test_data.label, num_classes)
      test_attack = attack_builder(predictor, test_specification, FLAGS.epsilon,
                                   input_bounds=input_bounds,
                                   optimizer_builder=ibp.UnrolledAdam)
      test_losses = ibp.Losses(predictor, test_specification, test_attack)
      test_losses(test_data.label)
      new_metrics = [
          m + n for m, n in zip(metrics, test_losses.scalar_metrics)]
      return i + 1, new_metrics

    total_count = tf.constant(0, dtype=tf.int32)
    total_metrics = [tf.constant(0, dtype=tf.float32)
                     for _ in range(len(ibp.ScalarMetrics._fields))]
    total_count, total_metrics = tf.while_loop(
        cond,
        body,
        loop_vars=[total_count, total_metrics],
        back_prop=False,
        parallel_iterations=1)
    # Turn the accumulated sums into per-batch averages.
    total_count = tf.cast(total_count, tf.float32)
    test_metrics = [m / total_count for m in total_metrics]
    return ibp.ScalarMetrics(*test_metrics)

  test_metrics = get_test_metrics(
      FLAGS.batch_size, ibp.UntargetedPGDAttack)
  summaries = [
      tf.summary.scalar(f, getattr(test_metrics, f))
      for f in test_metrics._fields]
  test_summaries = tf.summary.merge(summaries)
  test_writer = tf.summary.FileWriter(os.path.join(FLAGS.output_dir, 'test'))

  # Run everything.
  tf_config = tf.ConfigProto()
  tf_config.gpu_options.allow_growth = True
  try:
    with tf.train.SingularMonitoredSession(config=tf_config) as sess:
      for _ in range(FLAGS.steps):
        iteration, loss_value, _ = sess.run(
            [step, train_losses.scalar_losses.nominal_cross_entropy, train_op])
        if iteration % FLAGS.test_every_n == 0:
          metric_values, summary = sess.run([test_metrics, test_summaries])
          test_writer.add_summary(summary, iteration)
          show_metrics(iteration, metric_values, loss_value=loss_value)
      saver.save(sess._tf_sess(),  # pylint: disable=protected-access
                 os.path.join(FLAGS.output_dir, 'model'),
                 global_step=FLAGS.steps - 1)
  finally:
    # Flush and close the event file so buffered test summaries are not lost,
    # even when training is interrupted by an exception.
    test_writer.close()
Exemplo n.º 56
0
    def build_model(self):
        """Assemble the complete TensorFlow graph for this model.

        In order, this creates: the data/placeholders/tracking variables, the
        generator and autoencoder outputs, the autoencoder and coverage
        losses, the three optimizer ops, the `k_d`/`k_g` update ops and the
        merged summary op. It finishes by running the global variable
        initializer via `.run()` (presumably relying on a default session
        being active -- confirm against the caller).
        """
        float_dtype = tf.float64

        def run_generator(latent, reuse):
            # Generator call with the shared architecture settings.
            return generator(
                latent, self.g_layers_width, self.g_layers_depth,
                self.g_activations, self.g_out_dim, reuse=reuse)

        def autoencode(points, reuse):
            # Encoder followed by decoder; both use the same `reuse` flag.
            code = encoder(
                points, self.d_layers_width, self.d_layers_depth,
                self.d_activations, self.d_encoded_dim, reuse=reuse)
            return decoder(
                code, self.d_layers_width, self.d_layers_depth,
                self.d_activations, self.d_out_dim, reuse=reuse)

        def abs_diff(lhs, rhs):
            # Element-wise absolute difference.
            return tf.abs(lhs - rhs)

        def first_two_moments(sample):
            # Per-column mean, mean of squares, and their implied variance.
            m1 = tf.reduce_mean(tf.pow(sample, 1), axis=0)
            m2 = tf.reduce_mean(tf.pow(sample, 2), axis=0)
            return m1, m2, m2 - tf.square(m1)

        def global_vars_tagged(tag):
            # Global variables whose name contains `tag`.
            return [var for var in tf.global_variables() if tag in var.name]

        # --- Data, placeholders, and tracking variables. ---
        self.real_points = load_2d_data(
            self.dataset, self.real_n, self.real_dim)
        self.real_sample = tf.placeholder(
            float_dtype, [None, self.real_dim], name='real_sample')
        self.real_sample_opt_trans = tf.placeholder(
            float_dtype, [None, self.real_dim], name='real_sample_opt_trans')
        self.z = tf.placeholder(float_dtype, [None, self.z_dim], name='z')
        self.z_opt_trans = tf.placeholder(
            float_dtype, [None, self.z_dim], name='z_opt_trans')
        z_preimage_init = tf.random_normal(
            [self.real_n, self.z_dim], stddev=0.1, dtype=float_dtype)
        self.z_preimage = tf.Variable(z_preimage_init, name='z_preimage')
        self.k_d = tf.Variable(
            0., dtype=float_dtype, trainable=False, name='k_d')
        self.k_g = tf.Variable(
            0., dtype=float_dtype, trainable=False, name='k_g')

        # --- Generator and autoencoder outputs. ---
        # The first call of each network passes reuse=False; later calls
        # pass reuse=True.
        self.gen_z = run_generator(self.z, reuse=False)
        self.gen_z_opt_trans = run_generator(self.z_opt_trans, reuse=True)
        self.gen_z_preimage = run_generator(self.z_preimage, reuse=True)
        self.ae_real_sample = autoencode(self.real_sample, reuse=False)
        self.ae_gen_z = autoencode(self.gen_z, reuse=True)
        self.ae_grid = autoencode(tf.convert_to_tensor(self.grid), reuse=True)

        # --- Autoencoder reconstruction losses. ---
        self.ae_loss_real = tf.reduce_mean(
            abs_diff(self.ae_real_sample, self.real_sample))
        self.ae_loss_gen = tf.reduce_mean(abs_diff(self.ae_gen_z, self.gen_z))

        # Per-row reconstruction errors (summed over axis 1).
        self.ae_loss_real_vals = tf.reduce_sum(
            abs_diff(self.ae_real_sample, self.real_sample), 1)
        self.ae_loss_gen_vals = tf.reduce_sum(
            abs_diff(self.ae_gen_z, self.gen_z), 1)
        self.ae_loss_grid_vals = tf.reduce_sum(
            abs_diff(self.ae_grid, self.grid), 1)

        # --- Discriminator, normality and preimage losses. ---
        self.d_loss = self.ae_loss_real - self.k_d * self.ae_loss_gen
        self.normality_loss = tf.py_func(
            self.normality_dist, [self.z_preimage], float_dtype)
        preimage_fit = tf.reduce_mean(
            abs_diff(self.real_points, self.gen_z_preimage))
        normality_penalty = self.lambda_normality_loss * self.normality_loss
        self.gen_z_preimage_loss = preimage_fit + normality_penalty

        # --- Coverage losses: optimal-transport pairing and moments. ---
        self.coverage_loss = tf.reduce_mean(
            abs_diff(self.gen_z_opt_trans, self.real_sample_opt_trans))
        self.gen_z_m1, self.gen_z_m2, self.gen_z_var = first_two_moments(
            self.gen_z)
        self.real_m1, self.real_m2, self.real_var = first_two_moments(
            self.real_sample)
        self.cvg_loss_m1 = tf.norm(self.gen_z_m1 - self.real_m1)
        self.cvg_loss_m2 = tf.norm(self.gen_z_m2 - self.real_m2)
        self.cvg_loss_var = tf.norm(self.gen_z_var - self.real_var)
        self.coverage_loss_moments = (
            self.cvg_loss_m1 + self.cvg_loss_m2 + self.cvg_loss_var)

        # The coverage term of the generator loss depends on the z mode.
        if self.training_z in ('preimage', 'mix'):
            coverage_term = (
                self.coverage_loss_moments + self.gen_z_preimage_loss)
        else:
            coverage_term = self.coverage_loss
        self.g_loss = self.ae_loss_gen + self.k_g * coverage_term

        # --- Optimization ops. ---
        self.g_vars = global_vars_tagged('generator')
        self.d_vars = global_vars_tagged('autoencoder')
        self.preimage_vars = global_vars_tagged('preimage')

        optimizer = (
            tf.train.AdagradOptimizer if self.optimizer == 'adagrad'
            else tf.train.AdamOptimizer)

        self.d_optim = optimizer(self.d_lr).minimize(
            self.d_loss, var_list=self.d_vars)
        self.g_optim = optimizer(self.g_lr).minimize(
            self.g_loss, var_list=self.g_vars)
        self.z_optim = optimizer(self.c_lr).minimize(
            self.gen_z_preimage_loss, var_list=self.preimage_vars)

        # --- Balance diagnostics. ---
        self.emp_gamma_d = self.ae_loss_gen / self.ae_loss_real
        self.emp_gamma_g = self.ae_loss_gen / self.coverage_loss
        self.balance_d = self.gamma_d * self.ae_loss_real - self.ae_loss_gen
        self.balance_g = self.gamma_g * self.coverage_loss - self.ae_loss_gen
        self.measure = self.ae_loss_real + tf.abs(self.balance_d)

        # --- k_d / k_g updates, each gated on its own optimizer step. ---
        with tf.control_dependencies([self.d_optim]):
            next_k_d = tf.clip_by_value(
                self.k_d + self.lambda_k_d * self.balance_d, 0, 1)
            self.k_d_update = tf.assign(self.k_d, next_k_d)
        with tf.control_dependencies([self.g_optim]):
            next_k_g = tf.clip_by_value(
                self.k_g + self.lambda_k_g * self.balance_g, 0, 1)
            self.k_g_update = tf.assign(self.k_g, next_k_g)

        # --- Summaries. ---
        scalar_summaries = (
            ('loss/d_loss', self.d_loss),
            ('loss/ae_loss_real', self.ae_loss_real),
            ('loss/ae_loss_gen', self.ae_loss_gen),
            ('loss/g_loss', self.g_loss),
            ('loss/coverage_loss', self.coverage_loss),
            ('loss/normality_loss', self.normality_loss),
            ('balance/emp_gamma_d', self.emp_gamma_d),
            ('balance/emp_gamma_g', self.emp_gamma_g),
            ('balance/measure', self.measure),
            ('balance/k_d', self.k_d),
            ('balance/k_g', self.k_g),
            ('training/d_lr', self.d_lr),
            ('training/g_lr', self.g_lr),
        )
        self.summary_op = tf.summary.merge(
            [tf.summary.scalar(tag, value) for tag, value in scalar_summaries])

        tf.global_variables_initializer().run()
Exemplo n.º 57
0
     return output_gate * tf.tanh(state), state
 # Input data: num_unrollings + 1 placeholders, so that inputs and labels can
 # be taken from the same list offset by one position.
 train_data = list()
 for _ in range(num_unrollings + 1):
     train_data.append( tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
 train_inputs = train_data[:num_unrollings]
 train_labels = train_data[1:]  # labels are inputs shifted by one time step.
 # Unrolled LSTM loop: feed each input through lstm_cell, threading the
 # output/state from one step to the next and collecting every output.
 outputs = list()
 output = saved_output
 state = saved_state
 for i in train_inputs:
     output, state = lstm_cell(i, output, state)
     outputs.append(output)
 # State saving across unrollings: the control dependency makes the ops
 # created below wait for the final output/state to be written back into
 # saved_output / saved_state.
 with tf.control_dependencies([saved_output.assign(output),saved_state.assign(state)]):
     # Classifier over all unrolled outputs concatenated together.
     # NOTE(review): tf.concat is called with the axis as the first argument
     # and softmax_cross_entropy_with_logits with positional (logits, labels);
     # this matches the pre-1.0 TensorFlow signatures -- confirm the TF version
     # before reusing this code.
     logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
     loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( logits, tf.concat(0, train_labels)))
 # Optimizer: gradient descent with a learning rate that starts at 10.0 and is
 # decayed by a factor of 0.1 every 5000 steps (staircase), with gradients
 # clipped to a global norm of 1.25.
 global_step = tf.Variable(0)
 learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)
 optimizer = tf.train.GradientDescentOptimizer(learning_rate)
 gradients, v = zip(*optimizer.compute_gradients(loss))
 gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
 # Note: `optimizer` is rebound here from the optimizer object to the op
 # returned by apply_gradients.
 optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
 # Predictions.
 train_prediction = tf.nn.softmax(logits)
 # Sampling and validation eval: batch 1, no unrolling.
 sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
 saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
Exemplo n.º 58
0
    # restore_vars = tf.contrib.framework.get_variables_to_restore(include=RESTORE_PART)
    # update_vars = tf.contrib.framework.get_variables_to_restore(include=UPDATE_PART)
    # saver_restore = tf.train.Saver(var_list=restore_vars)

    # Restore all vars (if continue training)
    # saver_restore = tf.train.Saver()
    
    # Exponential moving average (decay 0.99) over all trainable variables;
    # global_step is passed as num_updates.
    ema = tf.train.ExponentialMovingAverage(decay=0.99, num_updates=global_step)
    ema_op = ema.apply(tf.trainable_variables())

    # Collect the ops registered under UPDATE_OPS (the original comment says
    # these are the batch-norm updates) so they run with every training step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # train_step = optimizer.minimize(loss, global_step=global_step, var_list=update_vars+restore_vars)
    train_step = optimizer.minimize(loss, global_step=global_step)
    # train_op is a no-op that depends on update_ops, train_step and ema_op, so
    # running it triggers all three.
    # NOTE(review): nothing orders ema_op after train_step, so the averages may
    # be computed from pre-update variable values -- confirm this is intended.
    with tf.control_dependencies(update_ops):
        with tf.control_dependencies([train_step, ema_op]):
            train_op = tf.no_op(name='train')

    # Session config: allow soft device placement and cap this process at 80%
    # of GPU memory.
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.per_process_gpu_memory_fraction = 0.80

    with tf.Session(config=config) as sess:
        # Initialize global and local variables in a single run call.
        sess.run((tf.global_variables_initializer(), tf.local_variables_initializer()))
        # saver_restore.restore(sess, RESTORE_PATH)
        saver = tf.train.Saver()

        # Merged summary op plus separate writers for training and validation
        # logs; only the training writer is given the graph.
        write_op = tf.summary.merge_all()
        write_train = tf.summary.FileWriter('./log/train', sess.graph)
        write_val = tf.summary.FileWriter('./log/val')
Exemplo n.º 59
0
 def mean_var_with_update():
     """Return `mean` and `variance` gated on the moving-statistics updates.

     The returned tensors are identity copies created under a control
     dependency on `update_moving_mean` and `update_moving_variance`, so
     evaluating either one also runs both update ops.
     """
     moving_stat_updates = [update_moving_mean, update_moving_variance]
     with tf.control_dependencies(moving_stat_updates):
         gated_mean = tf.identity(mean)
         gated_variance = tf.identity(variance)
     return gated_mean, gated_variance
Exemplo n.º 60
0
def train(total_loss, global_step, optimizer, learning_rate, moving_average_decay, update_gradient_vars,
          warm_flag,log_histograms=True):
    """Build the op that performs one training step.

    Args:
        total_loss: Loss tensor to minimize.
        global_step: Step variable incremented when gradients are applied and
            passed to the variable moving average.
        optimizer: One of 'ADAGRAD', 'ADADELTA', 'ADAM', 'RMSPROP' or 'MOM'.
        learning_rate: Base learning rate.
        moving_average_decay: Decay for the moving average of the trainable
            variables.
        update_gradient_vars: Variables for which gradients are computed.
        warm_flag: When truthy, the learning rate is multiplied by
            `hvd.size()` before the optimizer is constructed.
        log_histograms: When truthy, add histogram summaries for every
            trainable variable and every non-None gradient.

    Returns:
        A no-op named 'train' that depends on the gradient-application op and
        on the variable moving-average update op.

    Raises:
        ValueError: If `optimizer` is not one of the supported names.
    """
    # Presumably builds moving averages of the losses and their summaries
    # (helper defined elsewhere in this file).
    loss_averages_op = _add_loss_summaries(total_loss)

    # Optimizer name -> constructor taking the (possibly scaled) learning rate.
    optimizer_factories = {
        'ADAGRAD': lambda lr: tf.train.AdagradOptimizer(lr),
        'ADADELTA': lambda lr: tf.train.AdadeltaOptimizer(lr, rho=0.9, epsilon=1e-6),
        'ADAM': lambda lr: tf.train.AdamOptimizer(lr, beta1=0.9, beta2=0.999, epsilon=0.1),
        'RMSPROP': lambda lr: tf.train.RMSPropOptimizer(lr, decay=0.9, momentum=0.9, epsilon=1.0),
        'MOM': lambda lr: tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True),
    }

    # Gradients are computed only after the loss averages have been updated.
    with tf.control_dependencies([loss_averages_op]):
        make_optimizer = optimizer_factories.get(optimizer)
        if make_optimizer is None:
            raise ValueError('Invalid optimization algorithm')
        effective_lr = learning_rate * hvd.size() if warm_flag else learning_rate
        # Wrap the base optimizer in the Horovod distributed optimizer.
        opt = hvd.DistributedOptimizer(make_optimizer(effective_lr))
        grads = opt.compute_gradients(total_loss, update_gradient_vars)

    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    if log_histograms:
        # Histograms of the trainable variables, then of their gradients.
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        for grad, var in grads:
            if grad is None:
                continue
            tf.summary.histogram(var.op.name + '/gradients', grad)

    # Moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(
        moving_average_decay, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')
    return train_op