def _tower_loss(self, hr, lr, tower_index, reuse_variables, non_local):
    """Calculate the total loss on a single tower running the model
    (with batch splitting across towers).

    Args:
        hr: this tower's batch of high-resolution (ground-truth) images
        lr: this tower's batch of low-resolution input images
        tower_index: integer index of the tower, e.g. 0 for 'tower_0'
        reuse_variables: whether to reuse the model variables built by an earlier tower
        non_local: whether to enable the non-local block

    Returns:
        Tensor of shape [] containing the total loss for a batch of data.
    """
    # build the inference graph, sharing variables across towers
    with tf.variable_scope(tf.get_variable_scope()):
        net = Net(hr, lr, non_local, wl=self.weight_decay,
                  tower=tower_index, reuse=reuse_variables)
        net.build_net()

    # return the total loss for the current tower
    return net.total_loss
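# _tower_loss only relies on a small surface of the model class: a constructor
# taking the HR/LR batches plus weight-decay, tower-index and variable-sharing
# arguments, a build_net() call that assembles the graph, and a total_loss
# tensor used as this tower's training objective. The stub below is an
# illustrative sketch of that assumed interface (hypothetical name
# _NetInterfaceSketch); the real Net class is defined elsewhere in this repo.
class _NetInterfaceSketch(object):
    def __init__(self, hr, lr, non_local, wl=0.0, tower=0, reuse=False):
        self.hr, self.lr = hr, lr      # ground-truth and input batches for one tower
        self.non_local = non_local     # whether to insert the non-local block
        self.wl = wl                   # weight-decay coefficient
        self.tower = tower             # tower index, used for naming/scoping
        self.reuse = reuse             # share variables built by an earlier tower
        self.total_loss = None         # expected to be set by build_net()

    def build_net(self):
        # build the forward graph and define self.total_loss here
        raise NotImplementedError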
def __init__(self):
    # training paths
    self.train_data = conf.train_data
    self.models_dir = conf.models_dir
    self.logFilename = conf.log_name
    self.num_examples_per_epoch_for_train = conf.num_train_exps

    # validation data
    self.valid_data = conf.valid_data
    self.num_examples_per_epoch_for_valid = conf.num_valid_exps

    # create the models directory if it does not exist
    if not path.exists(self.models_dir):
        makedirs(self.models_dir)

    # soft constraint on the total number of epochs
    self.num_epoch = conf.num_epoch

    # device settings
    self.device_id = conf.device_id
    self.num_gpus = conf.num_gpus

    # hyper parameters
    self.batch_size = conf.batch_size
    self.valid_bs = conf.valid_bs
    self.weight_decay = conf.weight_decay

    # learning rate
    self.lr = tf.placeholder(tf.float32)
    self.base_lr = conf.base_lr
    self.power = conf.power
    self.end_lr = conf.end_lr

    # several multipliers
    self.loss_weight = conf.loss_weight
    self.lr_mp = conf.lr_mp
    self.decay_fraction = conf.decay_fraction

    # warm-up settings
    self.warmup_epoch = conf.warmup_epoch
    self.warmup_from0 = conf.warmup_from0

    # resuming and fine-tuning
    self.resume = conf.resume
    self.finetune = conf.finetune
    self.meta_data = conf.meta_data

    # whether to enable the non-local block
    self.non_local = conf.non_local

    self.iters = conf.iters
    if self.iters is None:
        if self.resume or self.finetune:
            raise ValueError(
                'iters must be specified when resuming or finetuning')
    self.finetune_models_dir = conf.finetune_models_dir

    # create an Adam optimizer driven by the learning-rate placeholder
    opt = tf.train.AdamOptimizer(self.lr)

    # get the training and validation datasets, split across towers, on the CPU
    with tf.device('/cpu:0'):
        t_hr_splits, t_lr_splits = self._get_data(mode='train')
        v_hr_splits, v_lr_splits = self._get_data(mode='valid')

    # calculate the gradients for each model tower
    reuse_variables = False
    tower_grads = []
    self.losses = []

    # multi-GPU training
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(self.device_id, self.device_id + self.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('tower_%d' % i) as scope:
                    # construct the entire model, sharing the variables across all towers
                    loss = self._tower_loss(t_hr_splits[i], t_lr_splits[i],
                                            i, reuse_variables, self.non_local)

                    # collect the total losses from each tower
                    self.losses += [loss]

                    # reuse variables for the next tower
                    reuse_variables = True
                    tf.get_variable_scope().reuse_variables()

                    # calculate the gradients for the batch of data on this tower
                    grads = opt.compute_gradients(loss)

                    # keep track of the gradients across all towers
                    tower_grads.append(grads)

    # calculate the mean of each gradient
    # note: this is the synchronization point across all towers
    if self.num_gpus > 1:
        grads = self._average_gradients(tower_grads)
    else:
        grads = tower_grads[0]

    # apply the gradients to adjust the shared variables
    self.train_op = opt.apply_gradients(grads)

    # multi-GPU validation
    v_loss = 0.0
    for i in range(self.device_id, self.device_id + self.num_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('vtower_%d' % i) as scope:
                net = Net(v_hr_splits[i], v_lr_splits[i], self.non_local,
                          wl=self.weight_decay, tower=i, reuse=True)
                net.build_net()
                v_loss += net.total_loss
    self.v_loss = v_loss / self.num_gpus
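# The "synchronization point" mentioned above is self._average_gradients, whose
# body is not shown in this section. The function below is a minimal sketch
# (hypothetical name _average_gradients_sketch) of what such a helper usually
# does, following the standard TensorFlow 1.x multi-GPU recipe: for every
# variable, stack the per-tower gradients, average them, and keep the variable
# handle from the first tower, since the towers share parameters.
def _average_gradients_sketch(tower_grads):
    """Average gradients across towers.

    Args:
        tower_grads: list over towers, each a list of (gradient, variable)
            pairs as returned by opt.compute_gradients().

    Returns:
        A single list of (gradient, variable) pairs with each gradient
        averaged over all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars looks like ((grad_gpu0, var), ..., (grad_gpuN, var))
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        # variables are shared, so the first tower's handle is enough
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads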