Example #1
    def __init__(self, models, config):
        self.config = config
        self.models = models
        self.global_step = models[0].global_step  # shared global step across all towers

        learning_rate = config.init_lr

        if config.use_lr_decay:
            if config.use_cosine_and_warm_up:
                warm_up_start = config.init_lr * 0.33
                # linearly increase from 0.33*lr to lr over warm_up_steps
                warm_up_lr = tf.train.polynomial_decay(
                    warm_up_start,
                    self.global_step,
                    config.warm_up_steps,
                    config.init_lr,
                    power=1.0,
                )

                max_steps = int(config.train_num_examples /
                                config.im_batch_size * config.num_epochs)
                cosine_lr = tf.train.cosine_decay(
                    config.init_lr,
                    self.global_step - config.warm_up_steps,
                    max_steps - config.warm_up_steps,
                    alpha=0.0)

                boundaries = [
                    config.warm_up_steps
                ]  # before reaching warm_up steps, use the warm up learning rate.
                values = [warm_up_lr, cosine_lr]
                learning_rate = tf.train.piecewise_constant(
                    self.global_step, boundaries, values)
                print "learning rate warm up lr from %s to %s in %s steps, then cosine learning rate decay till %s steps" % (
                    warm_up_start, config.init_lr, config.warm_up_steps,
                    max_steps)
            else:
                decay_steps = int(config.train_num_examples /
                                  config.im_batch_size *
                                  config.num_epoch_per_decay)
                learning_rate = tf.train.exponential_decay(
                    config.init_lr,
                    self.global_step,
                    decay_steps,
                    config.learning_rate_decay,
                    staircase=True)
                print "learning rate exponential_decay: every %s steps then lr*%s" % (
                    decay_steps, config.learning_rate_decay)

            self.learning_rate = learning_rate
        else:
            self.learning_rate = None

        last_layer_lr_mul = 10.0
        if config.optimizer == 'adadelta':
            self.opt_cnn = tf.train.AdadeltaOptimizer(learning_rate)
            self.opt_fc = tf.train.AdadeltaOptimizer(learning_rate *
                                                     last_layer_lr_mul)
        elif config.optimizer == "adam":
            self.opt_cnn = tf.train.AdamOptimizer(learning_rate)
            self.opt_fc = tf.train.AdamOptimizer(learning_rate *
                                                 last_layer_lr_mul)
        elif config.optimizer == "sgd":
            self.opt_cnn = tf.train.GradientDescentOptimizer(learning_rate)
            self.opt_fc = tf.train.GradientDescentOptimizer(learning_rate *
                                                            last_layer_lr_mul)
        elif config.optimizer == "momentum":
            self.opt_cnn = tf.train.MomentumOptimizer(learning_rate,
                                                      momentum=config.momentum)
            self.opt_fc = tf.train.MomentumOptimizer(learning_rate *
                                                     last_layer_lr_mul,
                                                     momentum=config.momentum)
        else:
            print "optimizer not implemented"
            sys.exit()

        self.box_label_losses = [model.box_label_loss for model in models]
        if config.wd is not None:
            self.wd = [model.wd for model in models]

        self.losses = []
        self.grads_cnn = []
        self.grads_fc = []
        for model in self.models:
            gpuid = model.gpuid
            # compute gradients on each gpu devices
            with tf.device(
                    assign_to_device("/gpu:%s" % (gpuid), config.controller)):

                self.losses.append(model.loss)

                var_cnn = [
                    var for var in tf.trainable_variables()
                    if not var.name.startswith("dcr_classification")
                ]
                var_fc = [
                    var for var in tf.trainable_variables()
                    if var.name.startswith("dcr_classification")
                ]
                grads = tf.gradients(model.loss, var_cnn + var_fc)

                # we freeze the ResNet backbone, so some gradients are None;
                # drop them (we assume none of the fc variables are frozen)
                not_valid_idxs = [
                    i for i in range(len(grads)) if grads[i] is None
                ]
                grads = [
                    grads[i] for i in range(len(grads))
                    if i not in not_valid_idxs
                ]
                var_cnn = [
                    var_cnn[i] for i in range(len(var_cnn))
                    if i not in not_valid_idxs
                ]

                # whether to clip gradients (clips element-wise by value, despite the config name)
                if config.clip_gradient_norm is not None:
                    grads = [
                        tf.clip_by_value(g, -1 * config.clip_gradient_norm,
                                         config.clip_gradient_norm)
                        for g in grads
                    ]
                grads_cnn = grads[:len(var_cnn)]
                grads_fc = grads[len(var_cnn):]

                self.grads_cnn.append(grads_cnn)
                self.grads_fc.append(grads_fc)

                #print "valid var cnn %s, var fc %s, total valid grads %s"%(len(var_cnn), len(var_fc), len(grads))

        # apply gradient on the controlling device
        with tf.device(config.controller):
            avg_loss = tf.reduce_mean(self.losses)
            avg_grads_cnn = average_gradients(self.grads_cnn, sum_grads=True)
            avg_grads_fc = average_gradients(self.grads_fc, sum_grads=True)

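            # note: var_cnn / var_fc below are the lists built in the last loop
            # iteration above; this assumes the trainable variables are shared
            # across the GPU towers, so the lists are the same for every tower.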
            cnn_train_op = self.opt_cnn.apply_gradients(
                zip(avg_grads_cnn, var_cnn), global_step=self.global_step)
            fc_train_op = self.opt_fc.apply_gradients(zip(
                avg_grads_fc, var_fc))
            self.train_op = tf.group(cnn_train_op, fc_train_op)
            self.loss = avg_loss
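Both examples call two repo-local helpers, assign_to_device and average_gradients, whose definitions are not shown here. The sketch below is a plausible reconstruction based only on how they are called above; the exact signatures, the op-type list, and the handling of bare gradients versus (grad, var) pairs are assumptions, not the repository's actual implementation.

import tensorflow as tf

def assign_to_device(device, ps_device):
    # Device function: keep variable ops on the controller (ps_device) and
    # place every other op on the given GPU device.
    ps_ops = ("Variable", "VariableV2", "VarHandleOp")
    def _assign(op):
        return ps_device if op.type in ps_ops else device
    return _assign

def average_gradients(tower_grads, sum_grads=False):
    # tower_grads holds one list per GPU tower: either bare gradient tensors
    # (Example #1) or (grad, var) pairs (Example #2). Combine them element-wise
    # across towers, summing or averaging.
    reduced = []
    for per_tower in zip(*tower_grads):
        if isinstance(per_tower[0], tuple):
            grads = [g for g, _ in per_tower]
            var = per_tower[0][1]
        else:
            grads, var = list(per_tower), None
        combined = tf.add_n(grads)
        if not sum_grads:
            combined = combined / float(len(grads))
        reduced.append(combined if var is None else (combined, var))
    return reduced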
Example #2
	def __init__(self,models,config):
		self.config = config
		self.models = models
		self.global_step = models[0].global_step  # shared global step across all towers

		learning_rate = config.init_lr

		if config.use_lr_decay:
			# always use warm-up; set warm_up_steps to zero to disable it
			warm_up_start = config.init_lr * 0.33
			# linearly increase from 0.33*lr to lr over warm_up_steps
			warm_up_lr = tf.train.polynomial_decay(
				warm_up_start,
				self.global_step,
				config.warm_up_steps,
				config.init_lr,
				power=1.0,
			)

			if config.use_cosine_schedule:
				max_steps = int(config.train_num_examples / config.im_batch_size * config.num_epochs)
				schedule_lr = tf.train.cosine_decay(
					config.init_lr,
					self.global_step - config.warm_up_steps - config.same_lr_steps,
					max_steps - config.warm_up_steps - config.same_lr_steps,
					alpha=0.0
				)
			else:
				decay_steps = int(config.train_num_examples / config.im_batch_size * config.num_epoch_per_decay)
				schedule_lr = tf.train.exponential_decay(
					config.init_lr,
					self.global_step,
					decay_steps,
					config.learning_rate_decay,
					staircase=True
				)

			# warm-up lr until warm_up_steps, then constant init_lr for same_lr_steps, then the decay schedule
			boundaries = [config.warm_up_steps, config.warm_up_steps + config.same_lr_steps]
			values = [warm_up_lr, config.init_lr, schedule_lr]
			learning_rate = tf.train.piecewise_constant(self.global_step, boundaries, values)
			print("learning rate warm up lr from %s to %s in %s steps, then keep for %s steps, then schedule learning rate decay" % (warm_up_start, config.init_lr, config.warm_up_steps, config.same_lr_steps))

			self.learning_rate = learning_rate
		else:
			self.learning_rate = tf.constant(config.init_lr, dtype="float")

		if config.optimizer == 'adadelta':
			self.opt = tf.train.AdadeltaOptimizer(learning_rate)
		elif config.optimizer == "adam":
			self.opt = tf.train.AdamOptimizer(learning_rate)
		elif config.optimizer == "sgd":
			self.opt = tf.train.GradientDescentOptimizer(learning_rate)
		elif config.optimizer == "momentum":
			self.opt = tf.train.MomentumOptimizer(learning_rate, momentum=config.momentum)
		else:
			print("optimizer not implemented")
			sys.exit()

		self.rpn_label_losses = [model.rpn_label_loss for model in models]
		self.rpn_box_losses = [model.rpn_box_loss for model in models]
		self.fastrcnn_label_losses = [model.fastrcnn_label_loss for model in models]
		self.fastrcnn_box_losses = [model.fastrcnn_box_loss for model in models]


		if config.wd is not None:
			self.wd = [model.wd for model in models]
		if config.use_small_object_head:
			self.so_label_losses = [model.so_label_loss for model in models]

		if config.add_act:
			self.act_losses = [model.act_losses for model in self.models]

		self.losses = []
		self.grads = []
		for model in self.models:
			gpuid = model.gpuid
			# compute gradients on each gpu devices
			with tf.device(assign_to_device("/GPU:%s"%(gpuid), config.controller)):
				self.losses.append(model.loss)
				grad = self.opt.compute_gradients(model.loss)

				grad = [(g, var) for g, var in grad if g is not None]  # drop None gradients (the ResNet backbone is frozen)

				# whether to clip gradients (clips element-wise by value, despite the config name)
				if config.clip_gradient_norm is not None:
					grad = [(tf.clip_by_value(g, -1*config.clip_gradient_norm, config.clip_gradient_norm), var) for g, var in grad]
				self.grads.append(grad)

		# apply gradient on the controlling device
		with tf.device(config.controller):
			avg_loss = tf.reduce_mean(self.losses)
			avg_grads = average_gradients(self.grads, sum_grads=True)

			self.train_op = self.opt.apply_gradients(avg_grads, global_step=self.global_step)
			self.loss = avg_loss
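A hypothetical driver loop for either trainer is sketched below. The Trainer class name, build_models, data_iterator, and the model.get_feed_dict helper are placeholders assumed for illustration; they are not part of the snippets above.

import tensorflow as tf

models = build_models(config)      # one model per GPU, each exposing .loss, .gpuid, .global_step
trainer = Trainer(models, config)  # the class whose __init__ is shown above

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())
    for batches in data_iterator():  # one batch per GPU tower
        feed_dict = {}
        for model, batch in zip(models, batches):
            feed_dict.update(model.get_feed_dict(batch))  # assumed model helper
        _, loss = sess.run([trainer.train_op, trainer.loss], feed_dict=feed_dict)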