def check_switch(self, value):
    x = layers.fill_constant(shape=[1], dtype='float32', value=value)

    zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
    one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
    two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
    three_var = layers.fill_constant(shape=[1], dtype='float32', value=3.0)

    result = layers.create_global_var(
        shape=[1], value=-1.0, dtype='float32', persistable=True)

    with layers.Switch() as switch:
        with switch.case(layers.less_than(x, zero_var)):
            layers.assign(zero_var, result)
        with switch.case(layers.less_than(x, one_var)):
            layers.assign(one_var, result)
        with switch.case(layers.less_than(x, two_var)):
            layers.assign(two_var, result)
        with switch.default():
            layers.assign(three_var, result)

    cpu = core.CPUPlace()
    exe = Executor(cpu)
    exe.run(default_startup_program())

    out = exe.run(feed={}, fetch_list=[result])[0][0]
    return out
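# A minimal, self-contained sketch (assuming the paddle.fluid 1.x static-graph
# API used above) showing the same Switch semantics outside the test class:
# the first case whose condition holds executes, otherwise the default runs.
# The program names `main` and `startup` are illustrative. With x = 1.5, the
# first condition (1.5 < 1.0) is false and the second (1.5 < 2.0) is true, so
# `result` ends up as 2.0.
import paddle.fluid as fluid
import paddle.fluid.layers as layers

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    x = layers.fill_constant(shape=[1], dtype='float32', value=1.5)
    one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
    two_var = layers.fill_constant(shape=[1], dtype='float32', value=2.0)
    result = layers.create_global_var(
        shape=[1], value=-1.0, dtype='float32', persistable=True)

    with layers.Switch() as switch:
        with switch.case(layers.less_than(x, one_var)):
            layers.assign(one_var, result)
        with switch.case(layers.less_than(x, two_var)):
            layers.assign(two_var, result)
        with switch.default():
            layers.assign(x, result)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)
out = exe.run(main, fetch_list=[result])[0]
print(out)  # expected: [2.]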
def increment(self):
    # Grow the loss scale once `increment_every` consecutive steps have had
    # finite gradients; otherwise just count another good step.
    enough_steps = layers.less_than(self.increment_every,
                                    self.good_steps + 1)
    with layers.Switch() as switch:
        with switch.case(enough_steps):
            new_scale = self.scale * self.factor
            scale_valid = layers.isfinite(new_scale)
            with layers.Switch() as switch2:
                with switch2.case(scale_valid):
                    layers.assign(new_scale, self.scale)
                    layers.assign(layers.zeros_like(self.good_steps),
                                  self.good_steps)
                with switch2.default():
                    layers.increment(self.good_steps)
        with switch.default():
            layers.increment(self.good_steps)
def decrement(self):
    # Shrink the loss scale, clamping it at 1.0, and reset the good-step
    # counter.
    new_scale = self.scale / self.factor
    one = layers.fill_constant(shape=[1], dtype='float32', value=1.0)

    less_than_one = layers.less_than(new_scale, one)
    with layers.Switch() as switch:
        with switch.case(less_than_one):
            layers.assign(one, self.scale)
        with switch.default():
            layers.assign(new_scale, self.scale)

    layers.assign(layers.zeros_like(self.good_steps), self.good_steps)
def update_loss_scale(grads):
    state = mixed_precision_global_state()
    if state is None or not state.dynamic_scaling:
        return

    # A single reduce_sum per gradient is enough to detect inf/nan anywhere
    # in that gradient.
    per_grad_check = layers.stack([layers.reduce_sum(g) for g in grads])
    grad_valid = layers.isfinite(per_grad_check)

    with layers.Switch() as switch:
        with switch.case(grad_valid):
            state.increment()
        with switch.default():
            state.decrement()

    return grad_valid
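# A hypothetical setup for the dynamic-loss-scaling state that increment(),
# decrement() and update_loss_scale() above read and write. The attribute
# names (scale, factor, good_steps, increment_every, dynamic_scaling) are
# taken from those snippets; the class name, initial values and the way the
# variables are created here are assumptions, not the original implementation.
# In the snippets above, mixed_precision_global_state() is assumed to return
# such a state object.
import paddle.fluid.layers as layers


class LossScalingState(object):
    def __init__(self, init_scale=2.**15, factor=2., increment_every=1000):
        self.dynamic_scaling = True
        # current loss scale, grown by increment() and shrunk by decrement()
        self.scale = layers.create_global_var(
            name='loss_scale', shape=[1], value=init_scale, dtype='float32',
            persistable=True)
        # multiplicative step used in both directions
        self.factor = factor
        # number of consecutive steps with finite gradients
        self.good_steps = layers.create_global_var(
            name='good_steps', shape=[1], value=0, dtype='int32',
            persistable=True)
        # grow the scale after this many consecutive good steps
        self.increment_every = layers.fill_constant(
            shape=[1], dtype='int32', value=increment_every)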
def backward(self, loss, **kwargs):
    state = mixed_precision_global_state()
    callbacks = kwargs.get('callbacks') or None
    if callbacks is None:
        from paddle.fluid.clip import error_clip_callback
        callbacks = [error_clip_callback]  # XXX what if gradient is zero?
    if state is not None:
        kwargs['callbacks'] = [scale_gradient] + callbacks
    else:
        kwargs['callbacks'] = callbacks

    param_grads = self._backward(loss, **kwargs)

    if state is not None:
        grad_valid = update_loss_scale(v for k, v in param_grads)
        if state.dynamic_scaling:
            # zero out all gradients when any of them is inf/nan, so the
            # parameter update becomes a no-op for this step
            with layers.Switch() as switch:
                with switch.case(grad_valid):
                    pass
                with switch.default():
                    for _, g in param_grads:
                        layers.assign(layers.zeros_like(g), g)

    return param_grads
def test_condition_dtype():
    # `cond`, `zero_var` and `result` come from the enclosing test case
    # (a plausible version is sketched below); `cond` is a non-bool variable,
    # so switch.case() is expected to reject it.
    with layers.Switch() as switch:
        with switch.case(cond):
            layers.assign(zero_var, result)
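# A plausible enclosing test for the fragment above, written as an assumption
# rather than the original file: the free variables are created under a
# program guard and the float32 condition is expected to make switch.case()
# raise a TypeError.
import unittest

import paddle.fluid as fluid
import paddle.fluid.layers as layers


class TestSwitchCaseError(unittest.TestCase):
    def test_error(self):
        main_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(main_program, startup_program):
            cond = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
            zero_var = layers.fill_constant(
                shape=[1], dtype='float32', value=0.0)
            result = layers.create_global_var(
                shape=[1], value=-1.0, dtype='float32', persistable=True)

            def test_condition_dtype():
                with layers.Switch() as switch:
                    with switch.case(cond):
                        layers.assign(zero_var, result)

            # a non-bool condition should be rejected
            self.assertRaises(TypeError, test_condition_dtype)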
def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
                        num_bad_steps, incr_every_n_steps,
                        decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
    """
    Update loss scaling according to the overall gradients. If all gradients
    are finite for incr_every_n_steps consecutive steps, the loss scaling is
    increased by incr_ratio. Otherwise, it is decreased by decr_ratio after
    decr_every_n_nan_or_inf accumulated steps in which some gradients are
    infinite.

    Args:
        is_overall_finite (Variable): A boolean variable indicating whether
                                      all gradients are finite.
        prev_loss_scaling (Variable): Previous loss scaling.
        num_good_steps (Variable): A variable accumulating consecutive good
                                   steps in which all gradients are finite.
        num_bad_steps (Variable): A variable accumulating steps in which some
                                  gradients are infinite.
        incr_every_n_steps (Variable): A variable representing increasing loss
                                       scaling every n consecutive steps with
                                       finite gradients.
        decr_every_n_nan_or_inf (Variable): A variable representing decreasing
                                            loss scaling every n accumulated
                                            steps with nan or inf gradients.
        incr_ratio (float): The multiplier to use when increasing the loss
                            scaling.
        decr_ratio (float): The less-than-one multiplier to use when
                            decreasing the loss scaling.
    """
    zero_steps = layers.fill_constant(shape=[1], dtype='int32', value=0)
    with layers.Switch() as switch:
        with switch.case(is_overall_finite):
            should_incr_loss_scaling = layers.less_than(incr_every_n_steps,
                                                        num_good_steps + 1)
            with layers.Switch() as switch1:
                with switch1.case(should_incr_loss_scaling):
                    new_loss_scaling = prev_loss_scaling * incr_ratio
                    loss_scaling_is_finite = layers.isfinite(new_loss_scaling)
                    with layers.Switch() as switch2:
                        with switch2.case(loss_scaling_is_finite):
                            layers.assign(new_loss_scaling, prev_loss_scaling)
                        with switch2.default():
                            pass
                    layers.assign(zero_steps, num_good_steps)
                    layers.assign(zero_steps, num_bad_steps)
                with switch1.default():
                    layers.increment(num_good_steps)
                    layers.assign(zero_steps, num_bad_steps)

        with switch.default():
            should_decr_loss_scaling = layers.less_than(
                decr_every_n_nan_or_inf, num_bad_steps + 1)
            with layers.Switch() as switch3:
                with switch3.case(should_decr_loss_scaling):
                    new_loss_scaling = prev_loss_scaling * decr_ratio
                    static_loss_scaling = layers.fill_constant(
                        shape=[1], dtype='float32', value=1.0)
                    less_than_one = layers.less_than(new_loss_scaling,
                                                     static_loss_scaling)
                    with layers.Switch() as switch4:
                        with switch4.case(less_than_one):
                            layers.assign(static_loss_scaling,
                                          prev_loss_scaling)
                        with switch4.default():
                            layers.assign(new_loss_scaling, prev_loss_scaling)
                    layers.assign(zero_steps, num_good_steps)
                    layers.assign(zero_steps, num_bad_steps)
                with switch3.default():
                    layers.assign(zero_steps, num_good_steps)
                    layers.increment(num_bad_steps)
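# A minimal sketch (assuming the same paddle.fluid 1.x API and that
# update_loss_scaling() above is in scope) of how its state variables could be
# created and the function wired into a program. The variable names, initial
# values and ratios are illustrative assumptions; minimize() below shows an
# actual call site, where is_overall_finite comes from isfinite() over the
# summed scaled gradients.
import paddle.fluid.layers as layers

loss_scaling = layers.create_global_var(
    name='loss_scaling', shape=[1], value=2.**15, dtype='float32',
    persistable=True)
num_good_steps = layers.create_global_var(
    name='num_good_steps', shape=[1], value=0, dtype='int32',
    persistable=True)
num_bad_steps = layers.create_global_var(
    name='num_bad_steps', shape=[1], value=0, dtype='int32',
    persistable=True)
incr_every_n_steps = layers.fill_constant(shape=[1], dtype='int32', value=1000)
decr_every_n_nan_or_inf = layers.fill_constant(shape=[1], dtype='int32', value=2)

# stand-in for the reduce_sum over all scaled gradients
all_grads_sum = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
is_overall_finite = layers.isfinite(all_grads_sum)

update_loss_scaling(is_overall_finite, loss_scaling, num_good_steps,
                    num_bad_steps, incr_every_n_steps,
                    decr_every_n_nan_or_inf, incr_ratio=2.0, decr_ratio=0.5)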
def decode(context):
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)
    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the lod of pre_state to be the same with pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse,
            param_attr=fluid.ParamAttr(name='vemb'))

        # use rnn unit to update rnn
        current_state = pd.fc(
            input=[pre_state_expanded, pre_ids_emb],
            size=decoder_size,
            act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)

        # use score to do beam search
        current_score = pd.fc(
            input=current_state_with_lod, size=target_dict_dim, act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)

        # calculate accumulated scores after topk to reduce computation cost
        accu_scores = pd.elementwise_add(
            x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0)
        selected_ids, selected_scores = pd.beam_search(
            pre_ids,
            pre_score,
            topk_indices,
            accu_scores,
            beam_size,
            end_id=10,
            level=0)

        with pd.Switch() as switch:
            with switch.case(pd.is_empty(selected_ids)):
                pd.fill_constant(
                    shape=[1], value=0, dtype='bool', force_cpu=True, out=cond)
            with switch.default():
                pd.increment(x=counter, value=1, in_place=True)

                # update the memories
                pd.array_write(current_state, array=state_array, i=counter)
                pd.array_write(selected_ids, array=ids_array, i=counter)
                pd.array_write(selected_scores, array=scores_array, i=counter)

                # update the break condition: up to the max length or all
                # candidates of source sentences have ended.
                length_cond = pd.less_than(x=counter, y=array_len)
                finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
                pd.logical_and(x=length_cond, y=finish_cond, out=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)

    return translation_ids, translation_scores
def minimize(self,
             loss,
             startup_program=None,
             parameter_list=None,
             no_grad_set=None,
             callbacks=None):
    assert loss._get_info('shard_logit')

    shard_logit = loss._get_info('shard_logit')
    shard_prob = loss._get_info('shard_prob')
    shard_label = loss._get_info('shard_label')
    shard_dim = loss._get_info('shard_dim')

    op_maker = fluid.core.op_proto_and_checker_maker
    op_role_key = op_maker.kOpRoleAttrName()
    op_role_var_key = op_maker.kOpRoleVarAttrName()
    backward_role = int(op_maker.OpRole.Backward)
    loss_backward_role = int(op_maker.OpRole.Loss) | int(
        op_maker.OpRole.Backward)

    # minimize a scalar of reduce_sum to generate the backward network
    scalar = fluid.layers.reduce_sum(shard_logit)
    block = loss.block

    if not self._use_fp16:
        ret = self._optimizer.minimize(scalar)

        # remove the unnecessary ops
        index = 0
        for i, op in enumerate(block.ops):
            if op.all_attrs()[op_role_key] == loss_backward_role:
                index = i
                break

        assert block.ops[index - 1].type == 'reduce_sum'
        assert block.ops[index].type == 'fill_constant'
        assert block.ops[index + 1].type == 'reduce_sum_grad'
        block._remove_op(index + 1)
        block._remove_op(index)
        block._remove_op(index - 1)

        self.insert_commom_backward_op(block, index, shard_logit, shard_prob,
                                       shard_label, shard_dim, op_role_key,
                                       backward_role, loss_backward_role)
        return ret
    else:
        scaled_params_grads = self.fp16_backward(block, scalar,
                                                 startup_program,
                                                 parameter_list, no_grad_set,
                                                 callbacks)
        index = 0
        for i, op in enumerate(block.ops):
            if op.all_attrs()[op_role_key] == loss_backward_role:
                index = i
                break

        if self._loss_type == 'dist_arcface':
            assert block.ops[index - 2].type == 'fill_constant'
            assert block.ops[index - 1].type == 'reduce_sum'
            assert block.ops[index].type == 'fill_constant'
            assert block.ops[index + 1].type == 'reduce_sum_grad'
            assert block.ops[index + 2].type == 'scale'
            assert block.ops[index + 3].type == 'elementwise_add_grad'

            block._remove_op(index + 2)
            block._remove_op(index + 1)
            block._remove_op(index)
            block._remove_op(index - 1)

            self.insert_dist_arcface_backward_op(block, index, shard_logit,
                                                 shard_prob, shard_label,
                                                 shard_dim, op_role_key,
                                                 backward_role,
                                                 loss_backward_role)

        elif self._loss_type == 'dist_softmax':
            assert block.ops[index - 1].type == 'reduce_sum'
            assert block.ops[index].type == 'fill_constant'
            assert block.ops[index + 1].type == 'reduce_sum_grad'
            assert block.ops[index + 2].type == 'cast'
            assert block.ops[index + 3].type == 'elementwise_add_grad'

            block._remove_op(index + 1)
            block._remove_op(index)
            block._remove_op(index - 1)

            self.insert_dist_softmax_backward_op(block, index, shard_logit,
                                                 shard_prob, shard_label,
                                                 shard_dim, op_role_key,
                                                 backward_role,
                                                 loss_backward_role)

        if self._use_dynamic_loss_scaling:
            grads = [layers.reduce_sum(g) for [_, g] in scaled_params_grads]
            all_grads = layers.concat(grads)
            all_grads_sum = layers.reduce_sum(all_grads)
            is_overall_finite = layers.isfinite(all_grads_sum)

            update_loss_scaling(is_overall_finite, self._loss_scaling,
                                self._num_good_steps, self._num_bad_steps,
                                self._incr_every_n_steps,
                                self._decr_every_n_nan_or_inf,
                                self._incr_ratio, self._decr_ratio)

            with layers.Switch() as switch:
                with switch.case(is_overall_finite):
                    pass
                with switch.default():
                    for _, g in scaled_params_grads:
                        layers.assign(layers.zeros_like(g), g)

        optimize_ops = self._optimizer.apply_gradients(scaled_params_grads)
        ret = optimize_ops, scaled_params_grads
        return ret