def _get_gm_cond_var(main_program, k_steps):
    main_block = main_program.global_block()
    # Add const var
    k_step_var = layers.create_global_var(name="gradient_merge_k",
                                          shape=[1],
                                          value=int(k_steps),
                                          dtype='int32',
                                          persistable=True,
                                          force_cpu=True)

    zero_var = layers.create_global_var(name="gradient_merge_zero",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)

    # Add step var & cond var
    step_var = layers.create_global_var(name="gradient_merge_step",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)

    cond_var = layers.create_global_var(name="gradient_merge_cond",
                                        shape=[1],
                                        value=bool(0),
                                        dtype='bool',
                                        persistable=False,
                                        force_cpu=True)

    with device_guard("cpu"):
        # step_var = (step_var + 1) % k_steps
        layers.increment(x=step_var, value=1.0, in_place=True)

        main_block.append_op(type='elementwise_mod',
                             inputs={
                                 'X': step_var,
                                 'Y': k_step_var
                             },
                             outputs={'Out': step_var},
                             attrs={
                                 'axis': -1,
                                 'use_mkldnn': False
                             })

        # cond_var = (step_var == 0)
        main_block.append_op(type='equal',
                             inputs={
                                 'X': step_var,
                                 'Y': zero_var
                             },
                             outputs={'Out': cond_var})

    return cond_var
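# Usage sketch (an assumption, not part of the source above): the boolean
# cond_var returned by _get_gm_cond_var is typically used to gate the real
# parameter update so it fires only once every k_steps micro-batches. The
# callback name below is hypothetical; paddle.static.nn.cond is the standard
# static-graph conditional API. The sketch relies on the same module-level
# imports (layers, device_guard) as the function above.
import paddle

paddle.enable_static()
main_program = paddle.static.default_main_program()

# build the CPU-side step counter and the "is this the k-th step?" flag
cond_var = _get_gm_cond_var(main_program, k_steps=4)


def _hypothetical_apply_gradients():
    # Runs only on every k-th step: scale the merged gradients, append the
    # optimizer ops, then zero the gradient accumulators (omitted here).
    pass


# cond_var has shape [1] and dtype bool, so it can drive a conditional block;
# with no false_fn, intermediate steps simply skip the update.
paddle.static.nn.cond(cond_var, true_fn=_hypothetical_apply_gradients)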
def _add_accumulator(self,
                     name,
                     param,
                     dtype=None,
                     fill_value=0.0,
                     shape=None,
                     type=None,
                     device=None):
    """Utility function to add an accumulator for a parameter

    Args:
        name: name of the accumulator
        param: parameter tensor for which the accumulator is to be added
        dtype: data type of the accumulator tensor
        fill_value: value to initialize the accumulator tensor
        shape: shape of the accumulator tensor; defaults to param.shape
        type: variable type of the accumulator tensor; defaults to param.type
        device: device on which the accumulator is placed; defaults to the
            device resolved for the parameter
    """
    if self._name is not None:
        name = self._name + "_" + name
    if (name in self._accumulators
            and param.name in self._accumulators[name]):
        if framework.in_dygraph_mode():
            return self._accumulators[name][param.name]
        raise Exception("Accumulator {} already exists for parameter {}".
                        format(name, param.name))
    if shape is None:
        shape = param.shape
    assert isinstance(self.helper, LayerHelper)

    var_name = param.name + "_" + name
    var_name = unique_name.generate(var_name)
    self._opti_name_list.append(var_name)

    var = self.helper.create_global_variable(
        name=var_name,
        persistable=True,
        dtype=dtype or param.dtype,
        type=param.type if type is None else type,
        shape=shape,
        belong_to_optimizer=True)
    if device is None:
        device = self._get_device_for_param(param.name)
    with device_guard(device):
        self.helper.set_variable_initializer(
            var, initializer=Constant(value=float(fill_value)))

    if framework.in_dygraph_mode():
        if len(self._accumulators_holder) > 0:
            assert var_name in self._accumulators_holder, \
                "Optimizer set error, {} should be in state dict".format(var_name)
            var.set_value(self._accumulators_holder[var_name])

    self._accumulators[name][param.name] = var
    return var
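# Sketch of the intended call pattern (the subclass below is hypothetical and
# only illustrates the accumulator API): an optimizer registers one accumulator
# per parameter inside _create_accumulators, then fetches it again with
# _get_accumulator when appending its update op.
from paddle.optimizer import Optimizer


class _VelocityDemoOptimizer(Optimizer):
    _velocity_acc_str = "velocity"

    def _create_accumulators(self, block, parameters):
        for p in parameters:
            # persistable tensor shaped like p, initialized to 0.0
            self._add_accumulator(self._velocity_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        # fetch the accumulator that was registered above for this parameter
        velocity = self._get_accumulator(self._velocity_acc_str,
                                         param_and_grad[0])
        # the actual update op is out of scope for this sketch
        raise NotImplementedError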
def _get_gm_cond_var(main_program, k_steps, dist_context):
    main_block = main_program.global_block()
    # Add const var
    k_step_var = layers.create_global_var(name="gradient_merge_k",
                                          shape=[1],
                                          value=int(k_steps),
                                          dtype='int32',
                                          persistable=True,
                                          force_cpu=True)
    set_var_dist_attr(dist_context, k_step_var, [-1], world_process_group.ranks)

    zero_var = layers.create_global_var(name="gradient_merge_zero",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)
    set_var_dist_attr(dist_context, zero_var, [-1], world_process_group.ranks)

    # Add step var & cond var
    step_var = layers.create_global_var(name="gradient_merge_step",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)
    set_var_dist_attr(dist_context, step_var, [-1], world_process_group.ranks)

    cond_var = main_block.create_var(name="gradient_merge_cond",
                                     shape=[1],
                                     dtype='bool')
    set_var_dist_attr(dist_context, cond_var, [-1], world_process_group.ranks)

    with device_guard("cpu"):
        # step_var = (step_var + 1) % k_steps
        layers.increment(x=step_var, value=1.0, in_place=True)

        elementwise_mod_op = main_block.append_op(type='elementwise_mod',
                                                  inputs={
                                                      'X': step_var,
                                                      'Y': k_step_var
                                                  },
                                                  outputs={'Out': step_var},
                                                  attrs={
                                                      'axis': -1,
                                                      'use_mkldnn': False
                                                  })
        naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
            elementwise_mod_op, world_process_group.ranks, [-1], dist_context)

        # cond_var = (step_var == 0)
        equal_op = main_block.append_op(type='equal',
                                        inputs={
                                            'X': step_var,
                                            'Y': zero_var
                                        },
                                        outputs={'Out': cond_var})
        naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
            equal_op, world_process_group.ranks, [-1], dist_context)

    return cond_var
def _create_optimization_pass(self, parameters_and_grads):
    """Add optimization operators to update gradients to tensors.

    Args:
        parameters_and_grads(list(tuple(Tensor, Tensor))):
            a list of (tensor, gradient) pairs to update.

    Returns:
        return_op_list: a list of operators that will complete one step of
            optimization. This will include parameter update ops, global step
            update ops and any other custom ops required by subclasses to
            manage their internal state.
    """
    # This is a default implementation of create_optimization_pass that
    # can be shared by most optimizers. This implementation assumes that
    # the subclass will implement the _append_optimize_op method and the
    # _initialize_tensors method. The subclass can extend the
    # _create_accumulators method if it needs to create accumulators
    # for parameters and extend _finish_update method to add custom ops.

    # Always called under program_guard, so use the global block as the loss
    # block. But if the current block is in control flow, append the optimize
    # ops in the grad block of the current block.
    global_block = framework.default_main_program().global_block()
    target_block = global_block
    current_block = framework.default_main_program().current_block()
    if current_block.idx != global_block.idx:
        assert current_block.backward_block_idx != -1, \
            "current block is not global_block, but it doesn't have backward block."
        target_block = framework.default_main_program().blocks[
            current_block.backward_block_idx]

    start = len(target_block.ops)

    self.helper = LayerHelper(self.__class__.__name__)
    self._update_param_device_map(parameters_and_grads, target_block)
    self._create_accumulators(
        target_block,
        [p[0] for p in parameters_and_grads if p[0].trainable])
    self._create_global_learning_rate()

    if framework.in_dygraph_mode():
        for param_and_grad in parameters_and_grads:
            if param_and_grad[1] is None:
                continue
            if param_and_grad[0].trainable is True:
                self._append_optimize_op(target_block, param_and_grad)
    else:
        for param_and_grad in parameters_and_grads:
            if param_and_grad[1] is None:
                continue
            with param_and_grad[0].block.program._optimized_guard(
                    param_and_grad), name_scope("optimizer"):
                if param_and_grad[0].trainable is True:
                    device = self._get_device_for_param(
                        param_and_grad[0].name)
                    with device_guard(device):
                        optimize_op = self._append_optimize_op(
                            target_block, param_and_grad)

    # Get custom finish ops for subclasses
    # FIXME: Need to fix this once we figure out how to handle dependencies
    self._finish_update(target_block, parameters_and_grads)

    end = len(target_block.ops)
    return target_block._slice_ops(start, end)
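# Call-path sketch (the toy network is an assumption; the APIs are standard
# Paddle static-graph ones): minimize() appends the backward pass and then the
# optimize ops produced by _create_optimization_pass, all under program_guard
# as the comment above assumes.
import paddle

paddle.enable_static()
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name="x", shape=[None, 8], dtype="float32")
    y = paddle.static.data(name="y", shape=[None, 1], dtype="float32")
    pred = paddle.static.nn.fc(x, size=1)
    loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))
    opt = paddle.optimizer.SGD(learning_rate=0.01)
    # returns (optimize_ops, params_grads); optimize_ops is the slice of
    # operators produced by _create_optimization_pass
    optimize_ops, params_grads = opt.minimize(loss)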
def _create_gm_cond(self, main_block):
    # Add const var
    acc_step_var = layers.create_global_var(
        name="gradient_merge_acc_step",
        shape=[1],
        value=int(self._gradient_merge_acc_step),
        dtype='int32',
        persistable=True,
        force_cpu=True)

    zero_var = layers.create_global_var(name="gradient_merge_zero",
                                        shape=[1],
                                        value=int(0),
                                        dtype='int32',
                                        persistable=True,
                                        force_cpu=True)

    # Add step var & cond var
    current_step_var = layers.create_global_var(
        name="gradient_merge_current_step",
        shape=[1],
        value=int(0),
        dtype='int32',
        persistable=True,
        force_cpu=True)

    cond_var = layers.create_global_var(name="gradient_merge_cond",
                                        shape=[1],
                                        value=bool(0),
                                        dtype='bool',
                                        persistable=False,
                                        force_cpu=True)

    with device_guard("cpu"):
        # current_step_var = (current_step_var + 1) % acc_step
        main_block.append_op(type='increment',
                             inputs={'X': [current_step_var]},
                             outputs={'Out': [current_step_var]},
                             attrs={
                                 'step': float(1),
                                 OP_ROLE_KEY: OpRole.Optimize
                             })

        main_block.append_op(type='elementwise_mod',
                             inputs={
                                 'X': current_step_var,
                                 'Y': acc_step_var
                             },
                             outputs={'Out': current_step_var},
                             attrs={
                                 'axis': -1,
                                 OP_ROLE_KEY: OpRole.Optimize,
                                 'use_mkldnn': False
                             })

        # cond_var = (current_step_var == 0)
        main_block.append_op(type='equal',
                             inputs={
                                 'X': current_step_var,
                                 'Y': zero_var
                             },
                             outputs={'Out': cond_var},
                             attrs={OP_ROLE_KEY: OpRole.Optimize})
        # paddle.static.Print(current_step_var, message="in FWBW last conditional")

    return cond_var
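# Configuration sketch (assuming the standard Fleet DistributedStrategy API;
# the concrete k_steps value is only an example): the accumulation step count
# read above as self._gradient_merge_acc_step is normally supplied through the
# distributed strategy before fleet.distributed_optimizer wraps the user
# optimizer.
import paddle
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.gradient_merge = True
strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}

# optimizer = fleet.distributed_optimizer(inner_optimizer, strategy=strategy)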