import logging
import math

import paddle.fluid as fluid
from paddle.fluid.layers import control_flow
from paddle.fluid.layers import learning_rate_scheduler as lr_scheduler

# Stand-in logger; the original module presumably imports the project's own
# logging utility.
logger = logging.getLogger(__name__)


def linear_warmup_decay(init_lr, num_train_steps, num_warmup_steps,
                        main_program):
    """Linearly warm up the learning rate, then decay it linearly to 0."""
    with main_program._lr_schedule_guard():
        global_step = lr_scheduler._decay_step_counter()
        lr = fluid.layers.create_global_var(
            shape=[1],
            value=init_lr,
            dtype='float32',
            persistable=True,
            name="learning_rate")

        with control_flow.Switch() as switch:
            # Warmup phase: scale the learning rate linearly from 0 to init_lr.
            with switch.case(global_step < num_warmup_steps):
                decayed_lr = init_lr * global_step * 1.0 / num_warmup_steps
                fluid.layers.assign(decayed_lr, lr)
            # Decay phase: polynomial decay with power=1.0 is a linear decay to 0.
            with switch.default():
                decayed_lr = lr_scheduler.polynomial_decay(
                    learning_rate=init_lr,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.assign(decayed_lr, lr)

        return lr
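
# --- Illustrative usage sketch (not part of the original module) -------------
# A minimal example of how linear_warmup_decay could be wired into a fluid
# training program with an Adam optimizer. The step counts, learning rate and
# the build_model() callback are assumptions made purely for illustration.
def _example_linear_warmup_usage(build_model):
    train_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(train_program, startup_program):
        loss = build_model()  # hypothetical: returns a scalar loss Variable
        scheduled_lr = linear_warmup_decay(
            init_lr=5e-5,
            num_train_steps=10000,
            num_warmup_steps=1000,
            main_program=train_program)
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
        optimizer.minimize(loss)
    return train_program, startup_program, scheduled_lr
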
# Method of CombinedStrategy; the rest of the class is omitted in this excerpt.
def scheduler_handler(self, max_train_steps):
    scheduled_lr = fluid.layers.create_global_var(
        shape=[1],
        value=self.learning_rate,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    if not self.scheduler["slanted_triangle"]["cut_fraction"]:
        # Warmup plus (optional) Noam decay or linear decay.
        warmup_steps = int(max_train_steps * self.scheduler["warmup"])
        linear_decay_start = int(
            max_train_steps * self.scheduler["linear_decay"]["start_point"])
        if linear_decay_start < warmup_steps:
            logger.warning(
                "linear decay can not start during the warmup process, "
                "it will start after warmup ends!")
            linear_decay_start = warmup_steps

        if self.scheduler["noam_decay"]:
            if warmup_steps > 0:
                scheduled_lr = fluid.layers.learning_rate_scheduler.noam_decay(
                    1 / (warmup_steps * (self.learning_rate**2)), warmup_steps)
            else:
                logger.warning(
                    "Noam decay learning rate scheduler should have positive "
                    "warmup steps, using constant learning rate instead!")

        if not self.scheduler["noam_decay"] and (
                warmup_steps > 0
                or self.scheduler["linear_decay"]["start_point"] < 1):
            with self.main_program._lr_schedule_guard():
                global_step = lr_scheduler._decay_step_counter()
                with control_flow.Switch() as switch:
                    if warmup_steps > 0:
                        # Linear warmup from 0 to the base learning rate.
                        with switch.case(global_step < warmup_steps):
                            decayed_lr = (self.learning_rate * global_step *
                                          1.0 / warmup_steps)
                            fluid.layers.assign(decayed_lr, scheduled_lr)
                    if self.scheduler["linear_decay"]["start_point"] < 1:
                        # Linear decay (polynomial decay with power=1.0).
                        with switch.case(global_step >= linear_decay_start):
                            decayed_lr = lr_scheduler.polynomial_decay(
                                learning_rate=self.learning_rate,
                                decay_steps=max_train_steps,
                                end_learning_rate=self.scheduler[
                                    "linear_decay"]["end_learning_rate"],
                                power=1.0,
                                cycle=False)
                            fluid.layers.assign(decayed_lr, scheduled_lr)
    else:
        # Slanted triangular learning rate (ULMFiT-style): linear increase up
        # to cut_step, then linear decrease for the rest of training.
        if self.scheduler["warmup"] or self.scheduler["noam_decay"] or \
                self.scheduler["linear_decay"]["start_point"] < 1:
            logger.warning(
                "You are using the slanted_triangle learning rate, which "
                "disables warmup, noam_decay and linear_decay.")
        cut_step = int(max_train_steps *
                       self.scheduler["slanted_triangle"]["cut_fraction"])
        ratio = self.scheduler["slanted_triangle"]["ratio"]
        global_step = lr_scheduler._decay_step_counter()
        with control_flow.Switch() as switch:
            with switch.case(global_step <= cut_step):
                pct = global_step / cut_step
                decayed_lr = self.learning_rate * (1 + pct *
                                                   (ratio - 1)) / ratio
                fluid.layers.assign(decayed_lr, scheduled_lr)
            with switch.default():
                pct = 1 - (global_step - cut_step) / (
                    max_train_steps - cut_step)
                decayed_lr = self.learning_rate * (1 + pct *
                                                   (ratio - 1)) / ratio
                fluid.layers.assign(decayed_lr, scheduled_lr)

    super(CombinedStrategy, self).__init__(
        optimizer_name=self._optimizer_name, learning_rate=scheduled_lr)

    if self.scheduler["discriminative"]["blocks"]:
        # Discriminative fine-tuning: group layers into blocks by depth and
        # shrink the learning rate of each deeper block by a fixed factor.
        _block_layers = math.ceil(
            len(self.sorted_depth) /
            self.scheduler["discriminative"]["blocks"])
        power = 0
        for cnt, depth in enumerate(self.sorted_depth):
            for index, param in enumerate(self.depth_params_dict[depth]):
                param.optimize_attr["learning_rate"] *= pow(
                    1.0 / self.scheduler["discriminative"]["factor"], power)
            if cnt and cnt % _block_layers == 0:
                power += 1
    return scheduled_lr
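
# --- Configuration shape assumed by scheduler_handler (illustration only) ----
# The keys below are inferred from the dictionary accesses in the method above;
# the concrete values are placeholder assumptions, not project defaults.
def _example_scheduler_config():
    return {
        "warmup": 0.1,  # fraction of max_train_steps used for linear warmup
        "noam_decay": False,  # if True (and warmup > 0), use Noam decay instead
        "linear_decay": {
            "start_point": 0.1,  # fraction of training at which linear decay starts
            "end_learning_rate": 0.0,
        },
        "slanted_triangle": {
            "cut_fraction": 0.0,  # non-zero enables the slanted triangular schedule
            "ratio": 32,  # peak-to-minimum learning-rate ratio of the triangle
        },
        "discriminative": {
            "blocks": 0,  # number of depth blocks for discriminative fine-tuning
            "factor": 2.6,  # per-block learning-rate decay factor
        },
    }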