def __init__(self, models, config):
    self.config = config
    self.models = models
    self.global_step = models[0].global_step

    # fixed learning rate, used unless a decay schedule is configured
    learning_rate = config.init_lr
    if config.use_lr_decay:
        if config.use_cosine_and_warm_up:
            # linearly increase from 0.33 * init_lr to init_lr over warm_up_steps
            warm_up_start = config.init_lr * 0.33
            warm_up_lr = tf.train.polynomial_decay(
                warm_up_start,
                self.global_step,
                config.warm_up_steps,
                config.init_lr,
                power=1.0,
            )

            max_steps = int(config.train_num_examples /
                            config.im_batch_size * config.num_epochs)
            cosine_lr = tf.train.cosine_decay(
                config.init_lr,
                self.global_step - config.warm_up_steps,
                max_steps - config.warm_up_steps,
                alpha=0.0)

            # before reaching warm_up_steps, use the warm-up learning rate
            boundaries = [config.warm_up_steps]
            values = [warm_up_lr, cosine_lr]
            learning_rate = tf.train.piecewise_constant(
                self.global_step, boundaries, values)

            print("learning rate warm up from %s to %s in %s steps, then "
                  "cosine learning rate decay till %s steps" % (
                      warm_up_start, config.init_lr, config.warm_up_steps,
                      max_steps))
        else:
            decay_steps = int(config.train_num_examples /
                              config.im_batch_size * config.num_epoch_per_decay)
            learning_rate = tf.train.exponential_decay(
                config.init_lr,
                self.global_step,
                decay_steps,
                config.learning_rate_decay,
                staircase=True)
            print("learning rate exponential_decay: every %s steps then "
                  "lr * %s" % (decay_steps, config.learning_rate_decay))
        self.learning_rate = learning_rate
    else:
        self.learning_rate = None  # the constant config.init_lr is used

    # the classification head is trained with a 10x learning rate
    last_layer_lr_mul = 10.0
    if config.optimizer == "adadelta":
        self.opt_cnn = tf.train.AdadeltaOptimizer(learning_rate)
        self.opt_fc = tf.train.AdadeltaOptimizer(
            learning_rate * last_layer_lr_mul)
    elif config.optimizer == "adam":
        self.opt_cnn = tf.train.AdamOptimizer(learning_rate)
        self.opt_fc = tf.train.AdamOptimizer(learning_rate * last_layer_lr_mul)
    elif config.optimizer == "sgd":
        self.opt_cnn = tf.train.GradientDescentOptimizer(learning_rate)
        self.opt_fc = tf.train.GradientDescentOptimizer(
            learning_rate * last_layer_lr_mul)
    elif config.optimizer == "momentum":
        self.opt_cnn = tf.train.MomentumOptimizer(
            learning_rate, momentum=config.momentum)
        self.opt_fc = tf.train.MomentumOptimizer(
            learning_rate * last_layer_lr_mul, momentum=config.momentum)
    else:
        print("optimizer not implemented")
        sys.exit()

    self.box_label_losses = [model.box_label_loss for model in models]
    if config.wd is not None:
        self.wd = [model.wd for model in models]

    self.losses = []
    self.grads_cnn = []
    self.grads_fc = []
    for model in self.models:
        gpuid = model.gpuid
        # compute gradients on each GPU device
        with tf.device(
                assign_to_device("/gpu:%s" % (gpuid), config.controller)):
            self.losses.append(model.loss)

            var_cnn = [var for var in tf.trainable_variables()
                       if not var.name.startswith("dcr_classification")]
            var_fc = [var for var in tf.trainable_variables()
                      if var.name.startswith("dcr_classification")]

            grads = tf.gradients(model.loss, var_cnn + var_fc)
            # ResNet is frozen, so some of the gradients will be None
            not_valid_idxs = [i for i in range(len(grads))
                              if grads[i] is None]
            grads = [grads[i] for i in range(len(grads))
                     if i not in not_valid_idxs]
            var_cnn = [var_cnn[i] for i in range(len(var_cnn))
                       if i not in not_valid_idxs]
            # we assume none of the fc variables are frozen

            # whether to clip the gradients
            if config.clip_gradient_norm is not None:
                grads = [tf.clip_by_value(g,
                                          -1 * config.clip_gradient_norm,
                                          config.clip_gradient_norm)
                         for g in grads]

            grads_cnn = grads[:len(var_cnn)]
            grads_fc = grads[len(var_cnn):]
            self.grads_cnn.append(grads_cnn)
            self.grads_fc.append(grads_fc)
            #print("valid var cnn %s, var fc %s, total valid grads %s" % (
            #    len(var_cnn), len(var_fc), len(grads)))

    # apply the gradients on the controlling device
    with tf.device(config.controller):
        avg_loss = tf.reduce_mean(self.losses)
        avg_grads_cnn = average_gradients(self.grads_cnn, sum_grads=True)
        avg_grads_fc = average_gradients(self.grads_fc, sum_grads=True)
        cnn_train_op = self.opt_cnn.apply_gradients(
            list(zip(avg_grads_cnn, var_cnn)), global_step=self.global_step)
        fc_train_op = self.opt_fc.apply_gradients(
            list(zip(avg_grads_fc, var_fc)))
        self.train_op = tf.group(cnn_train_op, fc_train_op)
        self.loss = avg_loss
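
# ---------------------------------------------------------------------------
# Both trainers in this file call two helpers that are defined elsewhere in
# the repo: assign_to_device and average_gradients. The sketch below shows
# what assign_to_device typically looks like in the TF 1.x multi-tower
# pattern: it returns a device function for tf.device that pins stateful ops
# (variables) to the controlling device, so all GPUs share one copy of the
# weights, while every other op runs on the given GPU. This is an
# illustrative sketch only; the PS_OPS list and the ps_device default are
# assumptions, not the repo's actual implementation.
# ---------------------------------------------------------------------------
import tensorflow as tf

# op types that hold state and should stay on the controller
PS_OPS = ["Variable", "VariableV2", "AutoReloadVariable",
          "MutableHashTable", "MutableHashTableOfTensors",
          "MutableDenseHashTable"]

def assign_to_device(device, ps_device="/cpu:0"):
    """Device function: variables go to ps_device, compute goes to device."""
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op in PS_OPS:
            return ps_device
        return device
    return _assign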
def __init__(self, models, config):
    self.config = config
    self.models = models
    self.global_step = models[0].global_step

    # fixed learning rate, used unless a decay schedule is configured
    learning_rate = tf.constant(config.init_lr, dtype="float")
    if config.use_lr_decay:
        # warm-up is always built; set warm_up_steps to zero to disable it
        # linearly increase from 0.33 * init_lr to init_lr over warm_up_steps
        warm_up_start = config.init_lr * 0.33
        warm_up_lr = tf.train.polynomial_decay(
            warm_up_start,
            self.global_step,
            config.warm_up_steps,
            config.init_lr,
            power=1.0,
        )

        if config.use_cosine_schedule:
            max_steps = int(config.train_num_examples /
                            config.im_batch_size * config.num_epochs)
            schedule_lr = tf.train.cosine_decay(
                config.init_lr,
                self.global_step - config.warm_up_steps - config.same_lr_steps,
                max_steps - config.warm_up_steps - config.same_lr_steps,
                alpha=0.0)
        else:
            decay_steps = int(config.train_num_examples /
                              config.im_batch_size * config.num_epoch_per_decay)
            schedule_lr = tf.train.exponential_decay(
                config.init_lr,
                self.global_step,
                decay_steps,
                config.learning_rate_decay,
                staircase=True)

        # before reaching warm_up_steps, use the warm-up learning rate;
        # then keep init_lr for same_lr_steps before the decay schedule starts
        boundaries = [config.warm_up_steps,
                      config.warm_up_steps + config.same_lr_steps]
        values = [warm_up_lr, config.init_lr, schedule_lr]
        learning_rate = tf.train.piecewise_constant(
            self.global_step, boundaries, values)

        print("learning rate warm up from %s to %s in %s steps, then keep "
              "for %s steps, then schedule learning rate decay" % (
                  warm_up_start, config.init_lr, config.warm_up_steps,
                  config.same_lr_steps))
    self.learning_rate = learning_rate

    if config.optimizer == "adadelta":
        self.opt = tf.train.AdadeltaOptimizer(learning_rate)
    elif config.optimizer == "adam":
        self.opt = tf.train.AdamOptimizer(learning_rate)
    elif config.optimizer == "sgd":
        self.opt = tf.train.GradientDescentOptimizer(learning_rate)
    elif config.optimizer == "momentum":
        self.opt = tf.train.MomentumOptimizer(
            learning_rate, momentum=config.momentum)
    else:
        print("optimizer not implemented")
        sys.exit()

    self.rpn_label_losses = [model.rpn_label_loss for model in models]
    self.rpn_box_losses = [model.rpn_box_loss for model in models]
    self.fastrcnn_label_losses = [
        model.fastrcnn_label_loss for model in models]
    self.fastrcnn_box_losses = [model.fastrcnn_box_loss for model in models]

    if config.wd is not None:
        self.wd = [model.wd for model in models]
    if config.use_small_object_head:
        self.so_label_losses = [model.so_label_loss for model in models]
    if config.add_act:
        self.act_losses = [model.act_losses for model in self.models]

    self.losses = []
    self.grads = []
    for model in self.models:
        gpuid = model.gpuid
        # compute gradients on each GPU device
        with tf.device(assign_to_device("/GPU:%s" % (gpuid),
                                        config.controller)):
            self.losses.append(model.loss)
            grad = self.opt.compute_gradients(model.loss)
            # ResNet is frozen, so some of the gradients will be None
            grad = [(g, var) for g, var in grad if g is not None]
            # whether to clip the gradients
            if config.clip_gradient_norm is not None:
                grad = [(tf.clip_by_value(g,
                                          -1 * config.clip_gradient_norm,
                                          config.clip_gradient_norm), var)
                        for g, var in grad]
            self.grads.append(grad)

    # apply the gradients on the controlling device
    with tf.device(config.controller):
        avg_loss = tf.reduce_mean(self.losses)
        avg_grads = average_gradients(self.grads, sum_grads=True)
        self.train_op = self.opt.apply_gradients(
            avg_grads, global_step=self.global_step)
        self.loss = avg_loss
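
# ---------------------------------------------------------------------------
# average_gradients(..., sum_grads=True) merges the per-GPU gradients before
# the single apply_gradients call on the controller. Below is a minimal
# sketch matching the second trainer's call pattern: a list, one entry per
# GPU, of lists of (grad, var) pairs from compute_gradients. The function
# name comes from the repo, but this body and the sum_grads semantics are
# assumptions inferred from the call sites, not the actual implementation.
# The first trainer passes bare gradient tensors instead of pairs; the same
# zip/add_n reduction applies there, just without the variable bookkeeping.
# ---------------------------------------------------------------------------
def average_gradients(tower_grads, sum_grads=False):
    """Combine per-tower gradients into one list of (grad, var) pairs."""
    combined = []
    # zip groups the same variable's gradient across towers:
    # grad_and_vars == ((g_gpu0, v), (g_gpu1, v), ...)
    for grad_and_vars in zip(*tower_grads):
        grad = tf.add_n([g for g, _ in grad_and_vars])
        if not sum_grads:
            # true average across towers
            grad = grad / float(len(grad_and_vars))
        # every tower shares the variable, so take it from the first entry
        combined.append((grad, grad_and_vars[0][1]))
    return combined

# Note on the design: summing (sum_grads=True) matches tf.reduce_mean over
# the tower losses only when each tower's loss is already normalized by its
# per-GPU batch size; otherwise dividing by the tower count (sum_grads=False)
# gives the usual synchronous-SGD average.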