def check_result(self, fn, place, dtype):
    shape = [9, 10]
    x_data = np.random.random(size=shape).astype(dtype)
    y_data = np.random.random(size=shape).astype(dtype)
    python_out = fn(x_data, y_data)

    x_var = layers.create_global_var(name='x',
                                     shape=shape,
                                     value=0.0,
                                     dtype=dtype,
                                     persistable=True)
    y_var = layers.create_global_var(name='y',
                                     shape=shape,
                                     value=0.0,
                                     dtype=dtype,
                                     persistable=True)
    out = fn(x_var, y_var)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    fluid_out = exe.run(fluid.default_main_program(),
                        feed={'x': x_data, 'y': y_data},
                        fetch_list=[out])
    np.testing.assert_array_equal(python_out, fluid_out[0])
def _get_gm_cond_var(main_program, k_steps):
    main_block = main_program.global_block()
    # Add const var
    k_step_var = layers.create_global_var(name="gradient_merge_k",
                                          shape=[1],
                                          value=int(k_steps),
                                          dtype='int32',
                                          persistable=True,
                                          force_cpu=True)
    zero_var = layers.create_global_var(name="gradient_merge_zero",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)

    # Add step var & cond var
    step_var = layers.create_global_var(name="gradient_merge_step",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)
    cond_var = layers.create_global_var(name="gradient_merge_cond",
                                        shape=[1],
                                        value=bool(0),
                                        dtype='bool',
                                        persistable=False,
                                        force_cpu=True)

    with device_guard("cpu"):
        # step_var = (step_var + 1) % k_step
        layers.increment(x=step_var, value=1.0, in_place=True)
        main_block.append_op(type='elementwise_mod',
                             inputs={'X': step_var,
                                     'Y': k_step_var},
                             outputs={'Out': step_var},
                             attrs={'axis': -1,
                                    'use_mkldnn': False})

        # cond_var = (step_var == 0)
        main_block.append_op(type='equal',
                             inputs={'X': step_var,
                                     'Y': zero_var},
                             outputs={'Out': cond_var})

    return cond_var
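# Illustrative usage sketch only (not part of the snippet above): the boolean
# cond var returned by _get_gm_cond_var is typically fed to a conditional
# block so the wrapped work runs once every k steps. `apply_merged_gradients`
# is a hypothetical callable standing in for whatever should happen on the
# k-th step.
def _run_every_k_steps(cond_var, apply_merged_gradients):
    import paddle.fluid.layers as layers
    # The true branch executes only when cond_var holds True,
    # i.e. when (step % k_steps) == 0 after the ops appended above.
    layers.cond(cond_var, apply_merged_gradients)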
def check_switch(self, value):
    x = layers.fill_constant(shape=[1], dtype='float32', value=value)

    zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
    one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
    two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
    three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)

    result = layers.create_global_var(shape=[1],
                                      value=-1.0,
                                      dtype='float32',
                                      persistable=True)

    with layers.Switch() as switch:
        with switch.case(layers.less_than(x, zero_var)):
            layers.assign(zero_var, result)
        with switch.case(layers.less_than(x, one_var)):
            layers.assign(one_var, result)
        with switch.case(layers.less_than(x, two_var)):
            layers.assign(two_var, result)
        with switch.default():
            layers.assign(three_var, result)

    cpu = core.CPUPlace()
    exe = Executor(cpu)
    exe.run(default_startup_program())

    out = exe.run(feed={}, fetch_list=[result])[0][0]
    return out
def linear_warmup_and_cosine_decay(learning_rate, end_lr, warmup_steps,
                                   max_training_steps):
    """Applies linear warmup and cosine decay to the learning rate."""
    dtype = "float32"
    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(shape=[1],
                                      value=0.0,
                                      dtype=dtype,
                                      persistable=True,
                                      name="learning_rate")

        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            with switch.case(global_step < max_training_steps):
                frac = 0.5 * (ops.cos(
                    (global_step - warmup_steps) * math.pi /
                    (max_training_steps - warmup_steps)) + 1)
                decayed_lr = end_lr + (learning_rate - end_lr) * frac
                layers.assign(decayed_lr, lr)
            with switch.default():
                learning_rate = layers.fill_constant(shape=[1],
                                                     dtype=dtype,
                                                     value=end_lr)
                layers.assign(learning_rate, lr)

    return lr
def test_error(self):
    main_program = framework.Program()
    startup_program = framework.Program()
    with framework.program_guard(main_program, startup_program):
        cond = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
        result = layers.create_global_var(shape=[1],
                                          value=-1.0,
                                          dtype='float32',
                                          persistable=True)

        # 1. The type of 'condition' in case must be Variable.
        def test_condition_type():
            with layers.Switch() as switch:
                with switch.case(1):
                    layers.assign(zero_var, result)

        self.assertRaises(TypeError, test_condition_type)

        # 2. The dtype of 'condition' in case must be 'bool'.
        def test_condition_dtype():
            with layers.Switch() as switch:
                with switch.case(cond):
                    layers.assign(zero_var, result)

        self.assertRaises(TypeError, test_condition_dtype)
def _create_scale_from_constant(self, value):
    name = unique_name.generate('global_scale')
    return layers.create_global_var(name=name,
                                    shape=[1],
                                    dtype='float32',
                                    value=float(value),
                                    persistable=True)
def _append_optimize_op(self, block, param_and_grad):
    one_var = paddle.ones(shape=[1], dtype='int32', name='lookahead_ones')
    zero_var = paddle.zeros(shape=[1], dtype='int32', name='lookahead_zeros')
    k_var = layers.create_global_var(
        name=unique_name.generate("lookahead_k"),
        shape=[1],
        value=self.k,
        dtype='int32',
        persistable=True)

    mod = paddle.remainder(self._global_step_var, k_var)

    cond_1 = paddle.equal(self._global_step_var, one_var)
    cond_1 = paddle.cast(cond_1, dtype='float32')

    cond_2 = paddle.equal(mod, zero_var)
    cond_2 = paddle.cast(cond_2, dtype='float32')

    slow_var = self._get_accumulator(self._slow_str, param_and_grad[0])

    tmp_var = cond_1 * param_and_grad[0] + (1 - cond_1) * slow_var
    paddle.assign(tmp_var, slow_var)

    tmp_var = self.alpha * param_and_grad[0] + (1.0 - self.alpha) * slow_var
    tmp_var_1 = cond_2 * tmp_var + (1 - cond_2) * param_and_grad[0]
    paddle.assign(tmp_var_1, param_and_grad[0])

    tmp_var_1 = cond_2 * tmp_var + (1 - cond_2) * slow_var
    paddle.assign(tmp_var_1, slow_var)
def optimize(self, metrics):
    """Optimize the model by metrics (mainly `metrics["loss"]`)."""
    # TODO: support dygraph
    if self.warmup_steps > 0:
        scheduled_lr = layers.learning_rate_scheduler.noam_decay(
            1 / (self.warmup_steps * (self.learning_rate**2)),
            self.warmup_steps)
    else:
        scheduled_lr = layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=self.learning_rate,
            dtype="float32",
            persistable=True)
    grad_clip = fluid.clip.GradientClipByGlobalNorm(self.max_grad_norm)

    self.optimizer = AdamW(learning_rate=scheduled_lr,
                           grad_clip=grad_clip,
                           weight_decay=self.weight_decay)

    if self.is_distributed:
        self.optimizer = fleet.distributed_optimizer(
            self.optimizer, strategy=self.dist_strategy)

    self.optimizer.minimize(metrics["loss"])
    return scheduled_lr
def __init__(self, init_loss_scale=1.):
    super(StaticLossScale, self).__init__()
    self.scale = layers.create_global_var(
        name=unique_name.generate("loss_scale"),
        shape=[1],
        value=init_loss_scale,
        dtype='float32',
        persistable=True)
def __init__(self, init_loss_scale=2**15, increment_every=2000, factor=2.):
    super(DynamicLossScale, self).__init__()
    self.scale = layers.create_global_var(
        name=unique_name.generate("loss_scale"),
        shape=[1],
        value=init_loss_scale,
        dtype='float32',
        persistable=True)
    self.good_steps = layers.create_global_var(
        name=unique_name.generate("good_steps"),
        shape=[1],
        value=0,
        dtype='int32',
        persistable=True)
    self.increment_every = layers.fill_constant(shape=[1],
                                                dtype='int32',
                                                value=increment_every)
    self.factor = factor
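# Illustrative sketch only (plain Python, not the class's actual graph code):
# the usual dynamic loss-scaling policy that the fields above support. On
# overflow the scale shrinks by `factor`; after `increment_every` consecutive
# good steps it grows by `factor`. All names here are hypothetical.
def dynamic_scale_policy(scale, good_steps, increment_every, factor,
                         found_overflow):
    if found_overflow:
        return scale / factor, 0      # shrink the scale, reset the counter
    good_steps += 1
    if good_steps >= increment_every:
        return scale * factor, 0      # grow the scale, reset the counter
    return scale, good_steps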
def _create_ema_vars(self, param):
    param_ema = layers.create_global_var(
        name=unique_name.generate(self._name + param.name + '_ema'),
        shape=param.shape,
        value=0.0,
        dtype=param.dtype,
        persistable=True)
    return param_ema
def _init_amp_var(self):
    # Ensure the data type of learning rate vars is float32 (same as the
    # master parameter dtype)
    if isinstance(self._optimizer._learning_rate, float):
        self._optimizer._learning_rate_map[default_main_program()] = \
            layers.create_global_var(
                name=unique_name.generate("learning_rate"),
                shape=[1],
                value=float(self._optimizer._learning_rate),
                dtype='float32',
                persistable=True)
def test_eq(self):
    """test queue_generator op, enqueue op and dequeue op."""
    main_program = fluid.Program()
    startup_program = fluid.Program()
    value = np.random.rand(1)
    with fluid.program_guard(main_program, startup_program):
        data_in = layers.create_global_var(shape=[2, 3],
                                           value=value,
                                           dtype="float32",
                                           persistable=True,
                                           name='var_in')
        data_out = layers.create_global_var(shape=[2, 3],
                                            value=value - 1.0,
                                            dtype="float32",
                                            persistable=True,
                                            name='var_out')
    startup_block = startup_program.block(0)
    queue_name = 'blocking_queue'
    startup_block.create_var(name=queue_name,
                             persistable=True,
                             type=core.VarDesc.VarType.RAW)
    startup_block.append_op(type="queue_generator",
                            attrs={'names': [queue_name]})

    block = main_program.block(0)
    block.append_op(type='enqueue',
                    inputs={'X': data_in},
                    attrs={'queue_name': queue_name})
    block.append_op(type='dequeue',
                    outputs={'Out': [data_out]},
                    attrs={'queue_name': queue_name})

    place = fluid.CUDAPlace(
        0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_program)
    ret = exe.run(main_program, fetch_list=[data_out.name])
    self.assertTrue(
        np.allclose(np.asarray(ret), np.full((2, 3), value, np.float32)))
def optimize(self, metrics):
    """Optimize the model by loss.

    Args:
        metrics: A dict mapping metric names to corresponding metrics, which
            must include loss.
    """
    # TODO: support dygraph

    # lr scheduler
    if self.lr_scheduler == "noam" and self.warmup_steps <= 0:
        print("[WARN] Using a constant learning rate because `warmup_steps` "
              "is not positive while using NoamScheduler.")
    if self.lr_scheduler == "noam" and self.warmup_steps > 0:
        scheduled_lr = layers.learning_rate_scheduler.noam_decay(
            1 / (self.warmup_steps * (self.learning_rate**2)),
            self.warmup_steps)
    elif self.lr_scheduler == "linear":
        scheduled_lr = lr_scheduler.linear_warmup_and_linear_decay(
            self.learning_rate, self.min_learning_rate, self.warmup_steps,
            self.max_training_steps)
    elif self.lr_scheduler == "cosine":
        scheduled_lr = lr_scheduler.linear_warmup_and_cosine_decay(
            self.learning_rate, self.min_learning_rate, self.warmup_steps,
            self.max_training_steps)
    else:  # constant
        scheduled_lr = layers.create_global_var(
            name=fluid.unique_name.generate("learning_rate"),
            shape=[1],
            value=self.learning_rate,
            dtype="float32",
            persistable=True)

    # grad norm
    if self.max_grad_norm > 0:
        grad_clip = fluid.clip.GradientClipByGlobalNorm(self.max_grad_norm)
    else:
        grad_clip = None

    # optimizer
    optimizer_cls = getattr(knover.optim, self.optimizer)
    optimizer = optimizer_cls(learning_rate=scheduled_lr,
                              grad_clip=grad_clip,
                              weight_decay=self.weight_decay,
                              beta1=self.beta1,
                              beta2=self.beta2)

    # distributed optimizer
    if self.is_distributed:
        optimizer = fleet.distributed_optimizer(
            optimizer, strategy=self.dist_strategy)

    optimizer.minimize(metrics["loss"])
    return scheduled_lr
def _increment_global_var(self):
    if self._global_step_var is None:
        self._global_step_var = layers.create_global_var(
            name=unique_name.generate("lookahead_step"),
            shape=[1],
            value=0,
            dtype='int32',
            persistable=True)

    self.helper.append_op(type='increment',
                          inputs={'X': [self._global_step_var]},
                          outputs={'Out': [self._global_step_var]},
                          attrs={'step': 1.0})
def create_coalesce_program(grad_dict):
    coalesce_program = fluid.Program()
    in_vars = []
    out_vars = []
    with fluid.program_guard(coalesce_program):
        grad_out_dict = {}
        for name in grad_dict:
            grad = grad_dict[name]
            grad_in = layers.fill_constant(shape=grad.shape,
                                           dtype='float32',
                                           value=1)
            grad_out = layers.create_global_var(name='output_' + grad.name,
                                                shape=grad.shape,
                                                value=0,
                                                dtype='float32',
                                                persistable=True)
            in_vars.append(grad_in)
            out_vars.append(grad_out)
            grad_out_dict[name] = grad_out
        grad_fused = layers.create_global_var(name='fused_output',
                                              shape=[1],
                                              value=0,
                                              dtype='float32',
                                              persistable=True)
        coalesce_program.global_block().append_op(
            type='coalesce_tensor',
            inputs={'Input': in_vars},
            outputs={'Output': out_vars,
                     'FusedOutput': grad_fused},
            attrs={'copy_data': False,
                   'dtype': core.VarDesc.VarType.FP32})
        fused_shape = layers.shape(grad_fused)
    return coalesce_program, grad_out_dict, grad_fused, fused_shape
def __init__(self,
             d_model,
             warmup_steps,
             learning_rate=0.001,
             current_steps=0,
             name="learning_rate"):
    self.current_steps = current_steps
    self.warmup_steps = warmup_steps
    self.d_model = d_model
    self.static_lr = learning_rate
    self.learning_rate = layers.create_global_var(name=name,
                                                  shape=[1],
                                                  value=float(learning_rate),
                                                  dtype="float32",
                                                  persistable=True)
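# Illustrative sketch only: the standard Noam schedule that the fields stored
# above (static_lr, d_model, warmup_steps, current_steps) are usually combined
# into. The function name is hypothetical, not a method of the original class.
def noam_lr_value(static_lr, d_model, warmup_steps, step):
    step = max(step, 1)  # guard against step == 0 on the first call
    return static_lr * (d_model ** -0.5) * min(step ** -0.5,
                                               step * (warmup_steps ** -1.5))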
def _create_master_weight(self, param):
    assert isinstance(self.helper, LayerHelper)

    var_name = param.name + "_fp32_master"
    var_name = unique_name.generate(var_name)
    var = layers.create_global_var(name=var_name,
                                   shape=param.shape,
                                   value=0,
                                   dtype='float32',
                                   persistable=True)
    block = self.helper.startup_program.global_block()
    block.append_op(type="cast",
                    inputs={"X": [param]},
                    outputs={"Out": [var]},
                    attrs={"in_dtype": param.dtype,
                           "out_dtype": core.VarDesc.VarType.FP32})
    self._master_weights[param.name] = var
    return var
def linear_warmup_and_invsqrt_decay(learning_rate, warmup_steps, decay_steps):
    """Applies linear warmup and invsqrt decay to the learning rate."""
    dtype = "float32"
    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(shape=[1],
                                      value=0.0,
                                      dtype=dtype,
                                      persistable=True,
                                      name="learning_rate")

        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            with switch.default():
                decayed_lr = lr * ops.sqrt(
                    decay_steps / (global_step - warmup_steps + decay_steps))
                layers.assign(decayed_lr, lr)

    return lr
def minimize_impl(self,
                  loss,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
    minimized = self.inner_opt.minimize(loss,
                                        startup_program=startup_program)

    k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps']
    begin_step_value = self.user_defined_strategy.localsgd_configs[
        'begin_step']

    if startup_program is None:
        startup_program = default_startup_program()
    main_block = loss.block

    self.nrings = 2
    collective_helper = CollectiveHelper(self.role_maker, self.nrings)
    collective_helper.update_startup_program(startup_program)
    p2s = self.create_snapshot_vars(startup_program)
    self.init_snapshot_vars(startup_program, p2s)

    p2s = self.create_snapshot_vars(main_block.program)
    with program_guard(main_block.program, startup_program):
        step = layers.autoincreased_step_counter(begin=1)
        k_steps = layers.create_global_var(name="k_steps",
                                           shape=[1],
                                           value=k_steps_value,
                                           dtype='int64',
                                           persistable=True)
        begin_step = layers.create_global_var(name="begin_step",
                                              shape=[1],
                                              value=begin_step_value,
                                              dtype='int64',
                                              persistable=True)
        last_step = layers.create_global_var(name="last_step",
                                             shape=[1],
                                             value=begin_step_value,
                                             dtype='int64',
                                             persistable=True)

        def communicate():
            sub_block = default_main_program().current_block()
            ring_id = -1
            for param, snapshot in p2s:
                sub_block.append_op(type='elementwise_sub',
                                    inputs={'X': [snapshot],
                                            'Y': [param]},
                                    outputs={'Out': [param]},
                                    attrs={OP_ROLE_KEY: OpRole.Optimize})
                sub_block.append_op(type='c_sync_calc_stream',
                                    inputs={'X': param},
                                    outputs={'Out': param},
                                    attrs={OP_ROLE_KEY: OpRole.Optimize})
                ring_id = (ring_id + 1) % self.nrings
                sub_block.append_op(type='c_allreduce_sum',
                                    inputs={'X': [param]},
                                    outputs={'Out': [param]},
                                    attrs={'ring_id': ring_id,
                                           OP_ROLE_KEY: OpRole.Optimize})

            for ring_id in range(self.nrings):
                sub_block.append_op(type='c_sync_comm_stream',
                                    inputs={'X': param},
                                    outputs={'Out': param},
                                    attrs={'ring_id': ring_id,
                                           OP_ROLE_KEY: OpRole.Optimize})

            for param, snapshot in p2s:
                sub_block.append_op(type='scale',
                                    inputs={'X': [param]},
                                    outputs={'Out': [param]},
                                    attrs={'scale': 1.0 /
                                           self.role_maker._worker_num(),
                                           OP_ROLE_KEY: OpRole.Optimize})
                sub_block.append_op(type='elementwise_sub',
                                    inputs={'X': [snapshot],
                                            'Y': [param]},
                                    outputs={'Out': [param]},
                                    attrs={OP_ROLE_KEY: OpRole.Optimize})
                sub_block.append_op(type='assign',
                                    inputs={'X': [param]},
                                    outputs={'Out': [snapshot]},
                                    attrs={OP_ROLE_KEY: OpRole.Optimize})
            layers.assign(step, last_step)

        def begin_localsgd():
            layers.cond(step - last_step == k_steps, communicate)

        layers.cond(step > begin_step, begin_localsgd, communicate)
    return minimized
def minimize_impl(self,
                  loss,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
    minimized = self.inner_opt.minimize(loss,
                                        startup_program=startup_program)

    init_k_steps = self.user_defined_strategy.adaptive_localsgd_configs[
        'init_k_steps']
    begin_step_value = self.user_defined_strategy.adaptive_localsgd_configs[
        'begin_step']

    if startup_program is None:
        startup_program = default_startup_program()
    main_block = loss.block

    self.nrings = 2
    collective_helper = CollectiveHelper(self.role_maker, self.nrings)
    collective_helper.update_startup_program(startup_program)
    p2s = self.create_snapshot_vars(startup_program)
    self.init_snapshot_vars(startup_program, p2s)

    p2s = self.create_snapshot_vars(main_block.program)
    with program_guard(main_block.program, startup_program):
        step = layers.autoincreased_step_counter(begin=1)

        k_steps = layers.create_global_var(name="k_steps",
                                           shape=[1],
                                           value=int(init_k_steps),
                                           dtype='int64',
                                           persistable=True)
        begin_step = layers.create_global_var(name="begin_step",
                                              shape=[1],
                                              value=int(begin_step_value),
                                              dtype='int64',
                                              persistable=True)
        last_step = layers.create_global_var(name="last_step",
                                             shape=[1],
                                             value=int(0),
                                             dtype='int64',
                                             persistable=True)
        avg_loss = layers.create_global_var(name="avg_loss",
                                            shape=[1],
                                            value=float(0),
                                            dtype=loss.dtype,
                                            persistable=True)
        lr_0 = layers.create_global_var(name="lr_0",
                                        shape=[1],
                                        value=float(0),
                                        dtype='float32',
                                        persistable=True)
        loss_0 = layers.create_global_var(name="loss_0",
                                          shape=[1],
                                          value=float(0),
                                          dtype='float32',
                                          persistable=True)

        global_lr = self.inner_opt._global_learning_rate()

        def initialize():
            self._generate_avg_loss(main_block, loss, avg_loss)
            layers.assign(avg_loss, loss_0)
            layers.assign(global_lr, lr_0)

        layers.cond(step == 1, initialize)

        def communicate():
            sub_block = default_main_program().current_block()
            ring_id = -1
            for param, snapshot in p2s:
                sub_block.append_op(type='elementwise_sub',
                                    inputs={'X': [snapshot],
                                            'Y': [param]},
                                    outputs={'Out': [param]},
                                    attrs={OP_ROLE_KEY: OpRole.Optimize})
                sub_block.append_op(type='c_sync_calc_stream',
                                    inputs={'X': param},
                                    outputs={'Out': param},
                                    attrs={OP_ROLE_KEY: OpRole.Optimize})
                ring_id = (ring_id + 1) % self.nrings
                sub_block.append_op(type='c_allreduce_sum',
                                    inputs={'X': [param]},
                                    outputs={'Out': [param]},
                                    attrs={'ring_id': ring_id,
                                           OP_ROLE_KEY: OpRole.Optimize})

            for ring_id in range(self.nrings):
                sub_block.append_op(type='c_sync_comm_stream',
                                    inputs={'X': param},
                                    outputs={'Out': param},
                                    attrs={'ring_id': ring_id,
                                           OP_ROLE_KEY: OpRole.Optimize})

            for param, snapshot in p2s:
                sub_block.append_op(type='scale',
                                    inputs={'X': [param]},
                                    outputs={'Out': [param]},
                                    attrs={'scale': 1.0 /
                                           self.role_maker._worker_num(),
                                           OP_ROLE_KEY: OpRole.Optimize})
                sub_block.append_op(type='elementwise_sub',
                                    inputs={'X': [snapshot],
                                            'Y': [param]},
                                    outputs={'Out': [param]},
                                    attrs={OP_ROLE_KEY: OpRole.Optimize})
                sub_block.append_op(type='assign',
                                    inputs={'X': [param]},
                                    outputs={'Out': [snapshot]},
                                    attrs={OP_ROLE_KEY: OpRole.Optimize})
            layers.assign(step, last_step)

        def communicate_avg_loss():
            communicate()
            self._generate_avg_loss(main_block, loss, avg_loss)
            next_local_steps = layers.cast(
                layers.ceil(
                    layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
                                float(init_k_steps))),
                dtype='int64')
            max_local_steps = layers.fill_constant(shape=[1],
                                                   dtype='int64',
                                                   value=16)
            min_local_steps = layers.fill_constant(shape=[1],
                                                   dtype='int64',
                                                   value=1)
            next_local_steps = layers.elementwise_min(next_local_steps,
                                                      max_local_steps)
            next_local_steps = layers.elementwise_max(next_local_steps,
                                                      min_local_steps)
            layers.assign(next_local_steps, k_steps)

        def begin_localsgd():
            layers.cond(step - last_step == k_steps, communicate_avg_loss)

        layers.cond(step > begin_step, begin_localsgd, communicate)
    return minimized
def network(batch_size, items_num, hidden_size, step, rate):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = layers.data(
        name="items",
        shape=[batch_size, -1, 1],
        dtype="int64",
        append_batch_size=False)  # [bs, uniq_max, 1]
    seq_index = layers.data(
        name="seq_index",
        shape=[batch_size, -1],
        dtype="int64",
        append_batch_size=False)  # [-1(seq_max)*batch_size, 1]
    last_index = layers.data(
        name="last_index",
        shape=[batch_size],
        dtype="int64",
        append_batch_size=False)  # [batch_size, 1]
    adj_in = layers.data(
        name="adj_in",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    adj_out = layers.data(
        name="adj_out",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    mask = layers.data(
        name="mask",
        shape=[batch_size, -1, 1],
        dtype="float32",
        append_batch_size=False)
    label = layers.data(
        name="label",
        shape=[batch_size, 1],
        dtype="int64",
        append_batch_size=False)

    items_emb = layers.embedding(
        input=items,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [batch_size, uniq_max, h]

    data_feed = [items, seq_index, last_index, adj_in, adj_out, mask, label]

    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(
            x=pre_state, shape=[batch_size, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in, state_in)  # [batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out, state_out)  # [batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(
            input=gru_input,
            name="gru_fc",
            size=3 * hidden_size,
            bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(
            input=gru_fc,
            hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]),
            size=3 * hidden_size)

    final_state = pre_state
    seq_index = layers.reshape(seq_index, shape=[-1])
    seq = layers.gather(final_state, seq_index)  # [batch_size*-1(seq_max), h]
    last = layers.gather(final_state, last_index)  # [batch_size, h]

    seq = layers.reshape(
        seq, shape=[batch_size, -1, hidden_size])  # [batch_size, -1(seq_max), h]
    last = layers.reshape(
        last, shape=[batch_size, hidden_size])  # [batch_size, h]

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, -1(seq_max), h]
    last_fc = layers.fc(
        input=last,
        name="last_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    seq_fc_t = layers.transpose(
        seq_fc, perm=[1, 0, 2])  # [-1(seq_max), batch_size, h]
    add = layers.elementwise_add(seq_fc_t, last_fc)  # [-1(seq_max), batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
    add = layers.elementwise_add(add, b)  # [-1(seq_max), batch_size, h]

    add_sigmoid = layers.sigmoid(add)  # [-1(seq_max), batch_size, h]
    add_sigmoid = layers.transpose(
        add_sigmoid, perm=[1, 0, 2])  # [batch_size, -1(seq_max), h]

    weight = layers.fc(
        input=add_sigmoid,
        name="weight_fc",
        size=1,
        act=None,
        num_flatten_dims=2,
        bias_attr=False,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, -1, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight, axis=0)
    global_attention = layers.reduce_sum(weight_mask, dim=1)

    final_attention = layers.concat(
        [global_attention, last], axis=1)  # [batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="fina_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    all_vocab = layers.create_global_var(
        shape=[items_num - 1, 1],
        value=0,
        dtype="int64",
        persistable=True,
        name="all_vocab")
    all_emb = layers.embedding(
        input=all_vocab,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [all_vocab, h]

    logits = layers.matmul(
        x=final_attention_fc, y=all_emb,
        transpose_y=True)  # [batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(
        logits=logits, label=label)  # [batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    # fluid.layers.Print(loss)
    acc = layers.accuracy(input=logits, label=label, k=20)
    return loss, acc, data_feed, [items_emb, all_emb]
def _get_gm_cond_var(main_program, k_steps, dist_context):
    main_block = main_program.global_block()
    # Add const var
    k_step_var = layers.create_global_var(name="gradient_merge_k",
                                          shape=[1],
                                          value=int(k_steps),
                                          dtype='int32',
                                          persistable=True,
                                          force_cpu=True)
    set_var_dist_attr(dist_context, k_step_var, [-1],
                      world_process_group.ranks)

    zero_var = layers.create_global_var(name="gradient_merge_zero",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)
    set_var_dist_attr(dist_context, zero_var, [-1],
                      world_process_group.ranks)

    # Add step var & cond var
    step_var = layers.create_global_var(name="gradient_merge_step",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)
    set_var_dist_attr(dist_context, step_var, [-1],
                      world_process_group.ranks)

    cond_var = main_block.create_var(name="gradient_merge_cond",
                                     shape=[1],
                                     dtype='bool')
    set_var_dist_attr(dist_context, cond_var, [-1],
                      world_process_group.ranks)

    with device_guard("cpu"):
        # step_var = (step_var + 1) % k_step
        layers.increment(x=step_var, value=1.0, in_place=True)
        elementwise_mod_op = main_block.append_op(
            type='elementwise_mod',
            inputs={'X': step_var,
                    'Y': k_step_var},
            outputs={'Out': step_var},
            attrs={'axis': -1,
                   'use_mkldnn': False})
        naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
            elementwise_mod_op, world_process_group.ranks, [-1], dist_context)

        # cond_var = (step_var == 0)
        equal_op = main_block.append_op(type='equal',
                                        inputs={'X': step_var,
                                                'Y': zero_var},
                                        outputs={'Out': cond_var})
        naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
            equal_op, world_process_group.ranks, [-1], dist_context)

    return cond_var
def optimization(loss,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 train_program,
                 startup_prog,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 use_fp16=False,
                 init_loss_scaling=128,
                 incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2,
                 incr_ratio=2.0,
                 decr_ratio=0.8):
    """Do backward for static graph training."""

    def exclude_from_weight_decay(param):
        # strip the ".master" suffix added for fp16 master weights
        name = param.name
        if name.endswith('.master'):
            name = name[:-len('.master')]
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    if warmup_steps > 0:
        if scheduler == 'noam_decay':
            scheduled_lr = L.learning_rate_scheduler.noam_decay(
                1 / (warmup_steps * (learning_rate ** 2)), warmup_steps)
        elif scheduler == 'linear_warmup_decay':
            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
                                               num_train_steps)
        else:
            raise ValueError("Unknown learning rate scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        log.debug('using Adam')
        optimizer = F.optimizer.Adam(learning_rate=scheduled_lr)
    else:
        scheduled_lr = L.create_global_var(
            name=F.unique_name.generate("learning_rate"),
            shape=[1],
            value=learning_rate,
            dtype='float32',
            persistable=True)
        log.debug('using Adam')
        optimizer = F.optimizer.Adam(learning_rate=scheduled_lr)
        optimizer._learning_rate_map[F.default_main_program()] = scheduled_lr

    if use_fp16:
        log.info('AMP activated')
        optimizer = F.contrib.mixed_precision.decorate(
            optimizer,
            amp_lists=F.contrib.mixed_precision.AutoMixedPrecisionLists(
                custom_black_varnames={"loss"},
                custom_black_list={'layer_norm', 'arg_max', 'argmax'}),
            init_loss_scaling=init_loss_scaling,
            use_dynamic_loss_scaling=True)
        loss_scaling = optimizer.get_loss_scaling()
    else:
        loss_scaling = None

    F.clip.set_gradient_clip(
        clip=F.clip.GradientClipByGlobalNorm(clip_norm=1.0))

    param_list = {}
    for param in train_program.global_block().all_parameters():
        param_list[param.name] = param * 1.0
        param_list[param.name].stop_gradient = True

    _, param_grads = optimizer.minimize(loss)

    if weight_decay > 0:
        for param, grad in param_grads:
            if exclude_from_weight_decay(param):
                continue
            with param.block.program._optimized_guard(
                    [param, grad]), F.framework.name_scope("weight_decay"):
                updated_param = param - param_list[
                    param.name] * weight_decay * scheduled_lr
                L.assign(output=param, input=updated_param)

    return scheduled_lr, loss_scaling
def network(items_num, hidden_size, step, bs):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = fluid.data(name="items",
                       shape=[bs, -1],
                       dtype="int64")  # [batch_size, uniq_max]
    seq_index = fluid.data(name="seq_index",
                           shape=[bs, -1, 2],
                           dtype="int32")  # [batch_size, seq_max, 2]
    last_index = fluid.data(name="last_index",
                            shape=[bs, 2],
                            dtype="int32")  # [batch_size, 2]
    adj_in = fluid.data(name="adj_in",
                        shape=[bs, -1, -1],
                        dtype="float32")  # [batch_size, seq_max, seq_max]
    adj_out = fluid.data(name="adj_out",
                         shape=[bs, -1, -1],
                         dtype="float32")  # [batch_size, seq_max, seq_max]
    mask = fluid.data(name="mask",
                      shape=[bs, -1, 1],
                      dtype="float32")  # [batch_size, seq_max, 1]
    label = fluid.data(name="label",
                       shape=[bs, 1],
                       dtype="int64")  # [batch_size, 1]

    datas = [items, seq_index, last_index, adj_in, adj_out, mask, label]
    py_reader = fluid.io.DataLoader.from_generator(capacity=256,
                                                   feed_list=datas,
                                                   iterable=False)
    feed_datas = datas

    items_emb = fluid.embedding(
        input=items,
        param_attr=fluid.ParamAttr(
            name="emb",
            initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [batch_size, uniq_max, h]

    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in, state_in)  # [batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out, state_out)  # [batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(input=gru_input,
                           name="gru_fc",
                           size=3 * hidden_size,
                           bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(
            input=gru_fc,
            hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]),
            size=3 * hidden_size)

    final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
    seq = layers.gather_nd(final_state, seq_index)
    last = layers.gather_nd(final_state, last_index)

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
    last_fc = layers.fc(
        input=last,
        name="last_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    seq_fc_t = layers.transpose(seq_fc, perm=[1, 0, 2])  # [seq_max, batch_size, h]
    add = layers.elementwise_add(seq_fc_t, last_fc)  # [seq_max, batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
    add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]

    add_sigmoid = layers.sigmoid(add)  # [seq_max, batch_size, h]
    add_sigmoid = layers.transpose(add_sigmoid,
                                   perm=[1, 0, 2])  # [batch_size, seq_max, h]

    weight = layers.fc(
        input=add_sigmoid,
        name="weight_fc",
        size=1,
        act=None,
        num_flatten_dims=2,
        bias_attr=False,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight, axis=0)  # [batch_size, seq_max, h]
    global_attention = layers.reduce_sum(weight_mask, dim=1)  # [batch_size, h]

    final_attention = layers.concat([global_attention, last],
                                    axis=1)  # [batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="final_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    all_vocab = layers.create_global_var(shape=[items_num - 1],
                                         value=0,
                                         dtype="int64",
                                         persistable=True,
                                         name="all_vocab")
    all_emb = fluid.embedding(
        input=all_vocab,
        param_attr=fluid.ParamAttr(
            name="emb",
            initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [all_vocab, h]

    logits = layers.matmul(x=final_attention_fc,
                           y=all_emb,
                           transpose_y=True)  # [batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(logits=logits,
                                                label=label)  # [batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    acc = layers.accuracy(input=logits, label=label, k=50)
    return loss, acc, py_reader, feed_datas, logits
def constant(name, value, dtype, hide_batch_size=True):
    """Create a constant variable with given data.

    This function helps to create constant variables from given
    numpy.ndarray data.

    Args:
        name: variable name
        value: numpy.ndarray, the value of the constant
        dtype: the dtype of the constant
        hide_batch_size: If True, set the first dimension as unknown; an
            explicit batch size may cause errors in paddle. For example,
            when the value has a shape of (batch_size, dim1, dim2), it will
            return a variable with shape (-1, dim1, dim2).

    Returns:
        A tuple containing the constant variable and the constant variable's
        initialize function.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            constant_var, constant_var_init = constant(
                name="constant",
                value=np.array([5.0], dtype="float32"),
                dtype="float32")
            exe.run(fluid.default_startup_program())
            # Run after the default startup program
            constant_var_init(place)
    """
    if not isinstance(value, np.ndarray):
        raise TypeError("value should be a numpy array.")

    value = value.astype(dtype)
    data = L.create_global_var(shape=value.shape,
                               value=0,
                               dtype=value.dtype,
                               name=name,
                               persistable=True)
    data.stop_gradient = True

    if hide_batch_size:
        shape = list(value.shape)
        shape[0] = -1
        data.desc.set_shape(shape)

    def initializer(place):
        if isinstance(place, fluid.CUDAPlace):
            pass
        elif isinstance(place, fluid.CUDAPinnedPlace):
            pass
        elif isinstance(place, fluid.CPUPlace):
            pass
        else:
            raise TypeError(
                "The input of initializer is not in"
                " [fluid.CUDAPlace, fluid.CPUPlace, fluid.CUDAPinnedPlace]")
        var = fluid.global_scope().var(data.name).get_tensor()
        var.set(value, place)

    return data, initializer
def _create_gm_cond(self, main_block):
    # Add const var
    acc_step_var = layers.create_global_var(
        name="gradient_merge_acc_step",
        shape=[1],
        value=int(self._gradient_merge_acc_step),
        dtype='int32',
        persistable=True,
        force_cpu=True)
    zero_var = layers.create_global_var(name="gradient_merge_zero",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)

    # Add step var & cond var
    current_step_var = layers.create_global_var(
        name="gradient_merge_current_step",
        shape=[1],
        value=int(0),
        dtype='int32',
        persistable=True,
        force_cpu=True)
    cond_var = layers.create_global_var(name="gradient_merge_cond",
                                        shape=[1],
                                        value=bool(0),
                                        dtype='bool',
                                        persistable=False,
                                        force_cpu=True)

    with device_guard("cpu"):
        # step_var = (step_var + 1) % k_step
        main_block.append_op(type='increment',
                             inputs={'X': [current_step_var]},
                             outputs={'Out': [current_step_var]},
                             attrs={'step': float(1),
                                    OP_ROLE_KEY: OpRole.Optimize})
        main_block.append_op(type='elementwise_mod',
                             inputs={'X': current_step_var,
                                     'Y': acc_step_var},
                             outputs={'Out': current_step_var},
                             attrs={'axis': -1,
                                    OP_ROLE_KEY: OpRole.Optimize,
                                    'use_mkldnn': False})

        # cond_var = (step_var == 0)
        main_block.append_op(type='equal',
                             inputs={'X': current_step_var,
                                     'Y': zero_var},
                             outputs={'Out': cond_var},
                             attrs={OP_ROLE_KEY: OpRole.Optimize})
        # paddle.static.Print(current_step_var, message="in FWBW last conditional")

    return cond_var
op_maker = core.op_proto_and_checker_maker
op_role_key = op_maker.kOpRoleAttrName()  # "op_role"
op_role_var_key = op_maker.kOpRoleVarAttrName()  # "op_role_var"

param2avg = []
for idx, op in list(enumerate(block.ops)):
    if _is_backward_op(op, op_role_key) and op_role_var_key in op.attr_names:
        op_role_var = op.all_attrs()[op_role_var_key]
        if len(op_role_var) == 0:
            continue
        assert len(op_role_var) % 2 == 0
        for i in range(0, len(op_role_var), 2):
            param = block.vars[op_role_var[i]]
            avg_var = layers.create_global_var(name=param.name + "@avg",
                                               shape=param.shape,
                                               value=1.0,
                                               dtype='float32',
                                               persistable=True)
            avgw_list.append(avg_var)
            tmp0 = layers.elementwise_mul(avg_var, decay_var)
            tmp1 = layers.elementwise_mul(param, rev_decay_var)
            block.append_op(type='elementwise_add',
                            inputs={'X': tmp0,
                                    'Y': tmp1},
                            outputs={'Out': avg_var},
                            stop_gradient=True)

# Declare the executor
def init_fp16_params(self, loss_type, fp16_user_dict):
    # set default values for fp16_params_dict
    fp16_params_dict = dict()
    fp16_params_dict['init_loss_scaling'] = 1.0
    fp16_params_dict['incr_every_n_steps'] = 1000
    fp16_params_dict['decr_every_n_nan_or_inf'] = 2
    fp16_params_dict['incr_ratio'] = 2.0
    fp16_params_dict['decr_ratio'] = 0.5
    fp16_params_dict['use_dynamic_loss_scaling'] = True
    fp16_params_dict['amp_lists'] = None
    if fp16_user_dict is not None:
        # update fp16_params_dict with the user-provided settings
        for key in fp16_user_dict:
            if key in fp16_params_dict:
                fp16_params_dict[key] = fp16_user_dict[key]
            else:
                logging.warning(
                    "Can't find name '%s' in our fp16_params_dict. "
                    "Please check your dict key. You can set fp16 params only "
                    "in [init_loss_scaling, incr_every_n_steps, "
                    "decr_every_n_nan_or_inf, incr_ratio, decr_ratio, "
                    "use_dynamic_loss_scaling, amp_lists]" % (key))

    self._amp_lists = fp16_params_dict['amp_lists']
    if self._amp_lists is None:
        self._amp_lists = AutoMixedPrecisionLists()

    self._loss_type = loss_type
    self._loss_scaling = layers.create_global_var(
        name=unique_name.generate("loss_scaling"),
        shape=[1],
        value=fp16_params_dict['init_loss_scaling'],
        dtype='float32',
        persistable=True)
    self._use_dynamic_loss_scaling = fp16_params_dict[
        'use_dynamic_loss_scaling']
    if self._use_dynamic_loss_scaling:
        self._incr_every_n_steps = layers.fill_constant(
            shape=[1],
            dtype='int32',
            value=fp16_params_dict['incr_every_n_steps'])
        self._decr_every_n_nan_or_inf = layers.fill_constant(
            shape=[1],
            dtype='int32',
            value=fp16_params_dict['decr_every_n_nan_or_inf'])
        self._incr_ratio = fp16_params_dict['incr_ratio']
        self._decr_ratio = fp16_params_dict['decr_ratio']
        self._num_good_steps = layers.create_global_var(
            name=unique_name.generate("num_good_steps"),
            shape=[1],
            value=0,
            dtype='int32',
            persistable=True)
        self._num_bad_steps = layers.create_global_var(
            name=unique_name.generate("num_bad_steps"),
            shape=[1],
            value=0,
            dtype='int32',
            persistable=True)

    # Ensure the data type of learning rate vars is float32 (same as the
    # master parameter dtype)
    if isinstance(self._optimizer._learning_rate, float):
        self._optimizer._learning_rate_map[fluid.default_main_program()] = \
            layers.create_global_var(
                name=unique_name.generate("learning_rate"),
                shape=[1],
                value=float(self._optimizer._learning_rate),
                dtype='float32',
                persistable=True)