def _create_master_weight(self, param):
    assert isinstance(self.helper, LayerHelper)

    var_name = param.name + "_fp32_master"
    var_name = unique_name.generate(var_name)
    var = layers.create_global_var(name=var_name,
                                   shape=param.shape,
                                   value=0,
                                   dtype='float32',
                                   persistable=True)
    block = self.helper.startup_program.global_block()
    block.append_op(type="cast",
                    inputs={"X": [param]},
                    outputs={"Out": [var]},
                    attrs={
                        "in_dtype": param.dtype,
                        "out_dtype": core.VarDesc.VarType.FP32
                    })
    self._master_weights[param.name] = var
    return var
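# A minimal sketch (not part of the snippet above) of why the master-weight name goes
# through unique_name.generate: repeated calls with the same key return distinct,
# monotonically suffixed names, so two parameters with similar names cannot collide.
# The exact suffixes printed below are illustrative and assume a fresh namespace.
import paddle

with paddle.utils.unique_name.guard():
    print(paddle.utils.unique_name.generate('fc_w_fp32_master'))  # e.g. fc_w_fp32_master_0
    print(paddle.utils.unique_name.generate('fc_w_fp32_master'))  # e.g. fc_w_fp32_master_1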
def __init__(self,
             name_scope,
             hidden_size,
             param_attr=None,
             bias_attr=None,
             gate_activation=None,
             activation=None,
             dtype='float32'):
    super(BasicGRUUnit, self).__init__(name_scope, dtype)
    # reserve old school _full_name and _helper for static graph save load
    self._full_name = unique_name.generate(name_scope + "/" +
                                           self.__class__.__name__)
    self._helper = LayerObjectHelper(self._full_name)

    self._name = name_scope
    self._hiden_size = hidden_size
    self._param_attr = param_attr
    self._bias_attr = bias_attr
    self._gate_activation = gate_activation or layers.sigmoid
    self._activation = activation or layers.tanh
    self._dtype = dtype
def __init__(self,
             name=None,
             quant_bits=8,
             dtype='float32',
             quant_on_weight=False):
    super(FakeQuantAbsMax, self).__init__()
    self._quant_bits = quant_bits
    self._name = name
    scale_prefix = "{}.scale".format(
        name) if name else 'quant_dequant.scale'
    self._scale_name = unique_name.generate(scale_prefix)
    if quant_on_weight:
        scale_attr = ParamAttr(name=self._scale_name,
                               initializer=Constant(0.0),
                               trainable=False)
        self._scale = self.create_parameter(shape=[1],
                                            attr=scale_attr,
                                            dtype=self._dtype)
        self._scale.stop_gradient = True
    else:
        self._scale = None
def visit_Continue(self, node): loop_node_index = self._find_ancestor_loop_index(node) assert loop_node_index != -1, "SyntaxError: 'continue' outside loop" loop_node = self.ancestor_nodes[loop_node_index] # 1. Map the 'break/continue' stmt with an unique boolean variable V. variable_name = unique_name.generate(CONTINUE_NAME_PREFIX) # 2. Find the first ancestor block containing this 'break/continue', a # block can be a node containing stmt list. We should remove all stmts # after the 'break/continue' and set the V to True here. first_block_index = self._remove_stmts_after_break_continue( node, variable_name, loop_node_index) # 3. Add 'if V' for stmts in ancestor blocks between the first one # (exclusive) and the ancestor loop (inclusive) self._replace_if_stmt(loop_node_index, first_block_index, variable_name) # 4. For 'continue', set continue to False at the beginning of each loop assign_false_node = create_fill_constant_node(variable_name, False) loop_node.body.insert(0, assign_false_node)
def _create_node(nodes, api_type): assert len( nodes ) > 1, "The length of BoolOp should be at least 2, but received {}.".format( len(nodes)) if len(nodes) > 2: # Creates logic_and/logic_or node recursively. pre_assign_node = _create_node(nodes[:2], api_type) nodes = [pre_assign_node] + nodes[2:] args = [ast_to_source_code(child) for child in nodes] new_node_str = "fluid.layers.logical_{}(x={}, y={})".format( api_type, args[0], args[1]) # gast.parse return Module(body=[expr(value=...)]) new_node = gast.parse(new_node_str).body[0].value logic_tensor_name = unique_name.generate( LOGIC_AND_PREFIX if 'and' in api_type else LOGIC_OR_PREFIX) assign_name, assign_node = create_assign_node( logic_tensor_name, new_node) self._new_assign_nodes.append(assign_node) return assign_name
def to_static_inputs(self, main_program): inputs = [] block = main_program.global_block() for input_var in self.args: if isinstance(input_var, np.ndarray): feed_layer = block.create_var( name=unique_name.generate('feed'), shape=list(input_var.shape), dtype=input_var.dtype, is_data=True, need_check_feed=False) elif isinstance(input_var, core.VarBase): feed_layer = block.create_var( name=input_var.name, shape=list(input_var.shape), dtype=input_var.dtype, stop_gradient=input_var.stop_gradient, need_check_feed=False) else: feed_layer = input_var inputs.append(feed_layer) return inputs
def visit_Break(self, node): loop_node_index = self._find_ancestor_loop_index(node) assert loop_node_index != -1, "SyntaxError: 'break' outside loop" loop_node = self.ancestor_nodes[loop_node_index] # 1. Map the 'break/continue' stmt with an unique boolean variable V. variable_name = unique_name.generate(BREAK_NAME_PREFIX) # 2. Find the first ancestor block containing this 'break/continue', a # block can be a node containing stmt list. We should remove all stmts # after the 'break/continue' and set the V to True here. first_block_index = self._remove_stmts_after_break_continue( node, variable_name, loop_node_index) # 3. Add 'if V' for stmts in ancestor blocks between the first one # (exclusive) and the ancestor loop (inclusive) self._replace_if_stmt(loop_node_index, first_block_index, variable_name) # 4. For 'break' add break into condition of the loop. assign_false_node = create_fill_constant_node(variable_name, False) self._add_stmt_before_cur_node(loop_node_index, assign_false_node) cond_var_node = gast.UnaryOp( op=gast.Not(), operand=gast.Name( id=variable_name, ctx=gast.Load(), annotation=None, type_comment=None)) if isinstance(loop_node, gast.While): loop_node.test = gast.BoolOp( op=gast.And(), values=[loop_node.test, cond_var_node]) elif isinstance(loop_node, gast.For): parent_node = self.ancestor_nodes[loop_node_index - 1] for_to_while = ForToWhileTransformer(parent_node, loop_node, cond_var_node) for_to_while.transform()
def _create_persistable_tensor(self, name, type, dtype):
    return framework.default_main_program().current_block().create_var(
        name=unique_name.generate(name),
        type=type,
        dtype=dtype,
        persistable=True)
def visit_Return(self, node): cur_func_node = self.function_def[-1] return_name = unique_name.generate(RETURN_PREFIX) self.return_name[cur_func_node].append(return_name) max_return_length = self.pre_analysis.get_func_max_return_length( cur_func_node) parent_node_of_return = self.ancestor_nodes[-2] for ancestor_index in reversed(range(len(self.ancestor_nodes) - 1)): ancestor = self.ancestor_nodes[ancestor_index] cur_node = self.ancestor_nodes[ancestor_index + 1] if hasattr( ancestor, "body") and index_in_list(ancestor.body, cur_node) != -1: if cur_node == node: self._replace_return_in_stmt_list(ancestor.body, cur_node, return_name, max_return_length, parent_node_of_return) self._replace_after_node_to_if_in_stmt_list( ancestor.body, cur_node, return_name, parent_node_of_return) elif hasattr(ancestor, "orelse") and index_in_list( ancestor.orelse, cur_node) != -1: if cur_node == node: self._replace_return_in_stmt_list(ancestor.orelse, cur_node, return_name, max_return_length, parent_node_of_return) self._replace_after_node_to_if_in_stmt_list( ancestor.orelse, cur_node, return_name, parent_node_of_return) # If return node in while loop, add `not return_name` in gast.While.test if isinstance(ancestor, gast.While): cond_var_node = gast.UnaryOp(op=gast.Not(), operand=gast.Name( id=return_name, ctx=gast.Load(), annotation=None, type_comment=None)) ancestor.test = gast.BoolOp( op=gast.And(), values=[ancestor.test, cond_var_node]) continue # If return node in for loop, add `not return_name` in gast.While.test if isinstance(ancestor, gast.For): cond_var_node = gast.UnaryOp(op=gast.Not(), operand=gast.Name( id=return_name, ctx=gast.Load(), annotation=None, type_comment=None)) parent_node = self.ancestor_nodes[ancestor_index - 1] for_to_while = ForToWhileTransformer(parent_node, ancestor, cond_var_node) new_stmts = for_to_while.transform() while_node = new_stmts[-1] self.ancestor_nodes[ancestor_index] = while_node if ancestor == cur_func_node: break
def create_static_variable_gast_node(name):
    func_code = "{} = paddle.jit.dy2static\
        .data_layer_not_check(name='{}', shape=[-1], dtype='float32')".format(
        name, unique_name.generate(name))
    return gast.parse(func_code).body[0]
def save_vars(executor,
              dirname,
              main_program=None,
              vars=None,
              predicate=None,
              filename=None):
    """
    This API saves specific variables in the `Program` to files.

    There are two ways to specify the variables to be saved: set variables in
    a list and assign it to the `vars`, or use the `predicate` function to select
    variables that make `predicate(variable) == True`. The first way has a higher priority.

    The `dirname` is used to specify the folder where to save variables.
    If you prefer to save variables in separate files in the `dirname` folder,
    do not set `filename`. If you prefer to save all variables in a single file,
    use `filename` to specify it.

    Args:
        executor(Executor): The executor to run for saving variables.
        dirname(str, optional): The folder where to save variables.
            When you need to save the parameters to memory, set it to None.
        main_program(Program, optional): The program whose variables will be saved.
            If it is None, the default main program will be used automatically.
            Default: None
        vars(list[Variable], optional): The list contains all variables to be saved.
            Default: None
        predicate(function, optional): The function selects the variables that make
            `predicate(variable) == True`.
            Default: None
        filename(str, optional): If you prefer to save all variables in a single file,
            use `filename` to specify it. Otherwise, let `filename` be None.
            Default: None

    Returns:
        str: When saving parameters to a file, returns None. When saving parameters
            to memory, returns a binary string containing the parameters.

    Raises:
        TypeError: If `main_program` is not an instance of Program nor None.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid

            main_prog = fluid.Program()
            startup_prog = fluid.Program()
            with fluid.program_guard(main_prog, startup_prog):
                data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False)
                w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32', name='fc_w')
                b = fluid.layers.create_parameter(shape=[200], dtype='float32', name='fc_b')
                hidden_w = fluid.layers.matmul(x=data, y=w)
                hidden_b = fluid.layers.elementwise_add(hidden_w, b)
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            exe.run(startup_prog)

            # The first usage: use `vars` to set the saved variables.
            var_list = [w, b]
            path = "./my_paddle_vars"
            fluid.io.save_vars(executor=exe, dirname=path, vars=var_list,
                               filename="vars_file")
            # w and b will be saved in a file named "vars_file".

            # The second usage: use `predicate` to select the saved variables.
            def name_has_fc(var):
                res = "fc" in var.name
                return res

            param_path = "./my_paddle_model"
            fluid.io.save_vars(executor=exe, dirname=param_path, main_program=main_prog,
                               vars=None, predicate=name_has_fc)
            # all variables whose names contain "fc" are saved.
""" save_to_memory = False if dirname is None and filename is None: save_to_memory = True main_program = _get_valid_program(main_program) if vars is None: return save_vars( executor, main_program=main_program, dirname=dirname, vars=list(filter(predicate, main_program.list_vars())), filename=filename) else: params_var_name = unique_name.generate("saved_params") # give warning when there is no var in model if len(list(vars)) == 0: warnings.warn( "no variable in your model, please ensure there are any variables in your model to save" ) return None save_program = Program() save_block = save_program.global_block() save_var_map = {} for each_var in vars: # NOTE: don't save the variable which type is RAW if each_var.type == core.VarDesc.VarType.RAW: continue new_var = _clone_var_in_block_(save_block, each_var) if filename is None and save_to_memory is False: save_file_path = os.path.join( os.path.normpath(dirname), new_var.name) save_block.append_op( type='save', inputs={'X': [new_var]}, outputs={}, attrs={'file_path': os.path.normpath(save_file_path)}) else: save_var_map[new_var.name] = new_var if filename is not None or save_to_memory: save_var_list = [] for name in sorted(save_var_map.keys()): save_var_list.append(save_var_map[name]) save_path = str() if save_to_memory is False: save_path = os.path.join(os.path.normpath(dirname), filename) saved_params = save_block.create_var( type=core.VarDesc.VarType.RAW, name=params_var_name) saved_params.desc.set_persistable(True) save_block.append_op( type='save_combine', inputs={'X': save_var_list}, outputs={'Y': saved_params}, attrs={ 'file_path': save_path, 'save_to_memory': save_to_memory }) #NOTE(zhiqiu): save op will add variable kLookupTablePath in save_program.desc, # which leads to diff on save_program and its desc. Call _sync_with_cpp # to keep consistency. save_program._sync_with_cpp() executor.run(save_program) if save_to_memory: return global_scope().find_var(params_var_name).get_bytes()
def get_for_stmt_nodes(self, node): # TODO: consider for - else in python if not self.name_visitor.is_control_flow_loop(node): return [node] # TODO: support non-range case range_call_node = self.get_for_range_node(node) if range_call_node is None: return [node] if not isinstance(node.target, gast.Name): return [node] iter_var_name = node.target.id init_stmt, cond_stmt, change_stmt = self.get_for_args_stmts( iter_var_name, range_call_node.args) loop_var_names, create_var_names = self.name_visitor.get_loop_var_names( node) new_stmts = [] # Python can create variable in loop and use it out of loop, E.g. # # for x in range(10): # y += x # print(x) # x = 10 # # We need to create static variable for those variables for name in create_var_names: if "." not in name: new_stmts.append(create_static_variable_gast_node(name)) new_stmts.append(init_stmt) # for x in range(10) in dygraph should be convert into static tensor + 1 <= 10 for name in loop_var_names: new_stmts.append(to_static_variable_gast_node(name)) condition_func_node = gast.FunctionDef( name=unique_name.generate(FOR_CONDITION_PREFIX), args=gast.arguments(args=[ gast.Name(id=name, ctx=gast.Param(), annotation=None, type_comment=None) for name in loop_var_names ], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), body=[gast.Return(value=cond_stmt)], decorator_list=[], returns=None, type_comment=None) for name in loop_var_names: if "." in name: rename_transformer = RenameTransformer(condition_func_node) rename_transformer.rename( name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(condition_func_node) new_body = node.body new_body.append(change_stmt) new_body.append( gast.Return( value=generate_name_node(loop_var_names, ctx=gast.Load()))) body_func_node = gast.FunctionDef( name=unique_name.generate(FOR_BODY_PREFIX), args=gast.arguments(args=[ gast.Name(id=name, ctx=gast.Param(), annotation=None, type_comment=None) for name in loop_var_names ], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), body=new_body, decorator_list=[], returns=None, type_comment=None) for name in loop_var_names: if "." in name: rename_transformer = RenameTransformer(body_func_node) rename_transformer.rename( name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(body_func_node) while_loop_node = create_while_node(condition_func_node.name, body_func_node.name, loop_var_names) new_stmts.append(while_loop_node) return new_stmts
def get_for_stmt_nodes(self, node): # TODO: consider for - else in python # 1. get key statements for different cases # NOTE 1: three key statements: # 1). init_stmts: list[node], prepare nodes of for loop, may not only one # 2). cond_stmt: node, condition node to judge whether continue loop # 3). body_stmts: list[node], updated loop body, sometimes we should change # the original statement in body, not just append new statement # # NOTE 2: The following `for` statements will be transformed to `while` statements: # 1). for x in range(*) # 2). for x in iter_var # 3). for i, x in enumerate(*) current_for_node_parser = ForNodeVisitor(node) stmts_tuple = current_for_node_parser.parse() if stmts_tuple is None: return [node] init_stmts, cond_stmt, body_stmts = stmts_tuple # 2. get original loop vars loop_var_names, create_var_names = self.name_visitor.get_loop_var_names( node) # NOTE: in 'for x in var' or 'for i, x in enumerate(var)' cases, # we need append new loop var & remove useless loop var # 1. for x in var -> x is no need # 2. for i, x in enumerate(var) -> x is no need if current_for_node_parser.is_for_iter( ) or current_for_node_parser.is_for_enumerate_iter(): iter_var_name = current_for_node_parser.iter_var_name iter_idx_name = current_for_node_parser.iter_idx_name loop_var_names.add(iter_idx_name) if iter_var_name not in create_var_names: loop_var_names.remove(iter_var_name) # 3. prepare result statement list new_stmts = [] # Python can create variable in loop and use it out of loop, E.g. # # for x in range(10): # y += x # print(x) # x = 10 # # We need to create static variable for those variables for name in create_var_names: if "." not in name: new_stmts.append(create_static_variable_gast_node(name)) # 4. append init statements new_stmts.extend(init_stmts) # 5. create & append condition function node condition_func_node = gast.FunctionDef( name=unique_name.generate(FOR_CONDITION_PREFIX), args=gast.arguments( args=[ gast.Name( id=name, ctx=gast.Param(), annotation=None, type_comment=None) for name in loop_var_names ], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), body=[gast.Return(value=cond_stmt)], decorator_list=[], returns=None, type_comment=None) for name in loop_var_names: if "." in name: rename_transformer = RenameTransformer(condition_func_node) rename_transformer.rename( name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(condition_func_node) # 6. create & append loop body function node # append return values for loop body body_stmts.append( gast.Return(value=generate_name_node( loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(FOR_BODY_PREFIX), args=gast.arguments( args=[ gast.Name( id=name, ctx=gast.Param(), annotation=None, type_comment=None) for name in loop_var_names ], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), body=body_stmts, decorator_list=[], returns=None, type_comment=None) for name in loop_var_names: if "." in name: rename_transformer = RenameTransformer(body_func_node) rename_transformer.rename( name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(body_func_node) # 7. create & append while loop node while_loop_nodes = create_while_nodes( condition_func_node.name, body_func_node.name, loop_var_names) new_stmts.extend(while_loop_nodes) return new_stmts
def create_while_nodes(condition_name, body_name, loop_var_names):
    """
    Returns a list of gast.Node which represents the calling of Paddle
    controlflow while_loop.

    Usually, the list just contains one statement such as:

        [a, b, c] = paddle.jit.dy2static.convert_while_loop(
            condition_name, body_name, [a, b, c])

    where a, b, c are in loop_var_names.

    However, if loop_var_names contains a property such as foo.x, we cannot
    assign the property as output of convert_while_loop because a Python
    property is a kind of read-only attribute. To handle the case, we replace
    the attributes which are outputs of convert_while_loop with generated
    variables, then, if we know the attribute is not read-only at runtime, we
    assign the attribute. The created statements are like:

        [a, b, __attribute_variable_1] = paddle.jit.dy2static.convert_while_loop(
            condition_name, body_name, [a, b, foo.x])
        if not isinstance(getattr(type(foo), x, None), property): foo.x = __attribute_variable_1

    The number of statements above is not always one, which is why the return
    type is a list of gast.Node.
    """
    # NOTE(liym27):
    # It's better to parse the source code into an AST node than to customize an AST node
    # including child nodes, because it is easy to mistake the ast node type when customizing the node.
    #
    # For example: loop_var_names = [a, b, foo.x], the type of `a` or `b` is gast.Name,
    # but the type of `foo.x` is gast.Attribute.

    unique_name_to_origin = {}
    # We have to make loop_var_names and assign_loop_var_names with the same order.
    # A set doesn't have order, so we convert it to a list.
    loop_var_names = list(loop_var_names)
    assign_loop_var_names = []
    for name in loop_var_names:
        if "." in name:
            # name is an attribute variable such as foo.x
            tmp_attr_name = unique_name.generate(ATTRIBUTE_VARIABLE_PREFIX)
            unique_name_to_origin[tmp_attr_name] = name
            assign_loop_var_names.append(tmp_attr_name)
        else:
            assign_loop_var_names.append(name)

    while_func_name = "_jst.convert_while_loop"
    while_node_str = "[{}] = {}({}, {}, [{}])".format(
        ",".join(assign_loop_var_names), while_func_name, condition_name,
        body_name, ",".join(loop_var_names))
    while_node = gast.parse(while_node_str).body[0]

    ret = [while_node]
    for tmp_attr_name in unique_name_to_origin:
        origin_attr_var = unique_name_to_origin[tmp_attr_name]
        dot_pos = origin_attr_var.rindex(".")
        obj_name = origin_attr_var[0:dot_pos]
        attr_name = origin_attr_var[dot_pos + 1:]
        assign_if_not_prop_str = "if not isinstance(getattr(type({}), '{}', None), property): {} = {}".format(
            obj_name, attr_name, origin_attr_var, tmp_attr_name)
        assign_if_not_prop_node = gast.parse(assign_if_not_prop_str).body[0]
        ret.append(assign_if_not_prop_node)
    return ret
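# A standalone illustration (plain Python, not taken from the snippet above) of the
# read-only check that create_while_nodes emits: looking the attribute up on the *type*
# reveals whether it is a property, in which case the generated code skips the write-back.
class Foo:
    @property
    def x(self):          # read-only property, cannot be assigned
        return 1

class Bar:
    def __init__(self):
        self.x = 1        # plain instance attribute, assignable

print(isinstance(getattr(type(Foo()), 'x', None), property))  # True  -> skip assignment
print(isinstance(getattr(type(Bar()), 'x', None), property))  # False -> safe to assign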
def init_communicator(startup_program, main_program, current_endpoint, endpoints, ring_id): nranks = len(endpoints) other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) group_rank = endpoints.index(current_endpoint) assert group_rank >= 0 block = startup_program.global_block() nccl_id_var = block.create_var(name=unique_name.generate('nccl_id'), persistable=True, type=core.VarDesc.VarType.RAW) block.append_op(type='c_gen_nccl_id', inputs={}, outputs={'Out': nccl_id_var}, attrs={ 'rank': group_rank, 'endpoint': current_endpoint, 'other_endpoints': other_endpoints, OP_ROLE_KEY: OpRole.Forward, }) block.append_op(type='c_comm_init', inputs={'X': nccl_id_var}, outputs={}, attrs={ 'nranks': nranks, 'rank': group_rank, 'ring_id': ring_id, OP_ROLE_KEY: OpRole.Forward, }) # add input op for test fill_var_name = "tensor@Filled" fill_var = block.create_var(name=fill_var_name, shape=[10, 10], dtype='float32', persistable=False, stop_gradient=True) block.append_op(type="fill_constant", outputs={"Out": fill_var_name}, attrs={ "shape": [10, 10], "dtype": fill_var.dtype, "value": 1.0, "place_type": 1 }) with fluid.program_guard(main_program): op_type = "c_allreduce_sum" data = fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5) helper = LayerHelper(op_type, **locals()) helper.append_op(type=op_type, inputs={'X': [data]}, outputs={'Out': [data]}, attrs={ 'ring_id': ring_id, 'use_calc_stream': True }) print("startup program:", startup_program) print("main program:", main_program)
def _scale_loss(self): main_block = paddle.static.default_main_program().global_block() main_block._sync_with_cpp() OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() loss = self.get_attr("loss") assert loss is not None loss_op = loss.op loss_op_dist_attr = self.dist_context.get_op_dist_attr_for_program( loss_op) if loss.dtype != core.VarDesc.VarType.FP32: # cast loss here will change the effective loss tensor for the computation graph # and therefore will effect all following passes whose logic is based on the loss tensor(Recompute & Gradient Merge), # so we it is not allowed by now. fixed it in future. raise NotImplementedError( "Loss's generator op is not support in FP16 in Auto Parallel by now, please put that op into your black-list." ) tmp_name = unique_name.generate(loss.name + ".cast_fp32") cast_loss = main_block.create_var(name=tmp_name, dtype=dtype) loss_dist_attr = self.dist_context.get_tensor_dist_attr_for_program( loss) ref_mesh = loss_op_dist_attr.process_mesh self.dist_context.set_tensor_dist_attr_for_program( cast_loss, loss_dist_attr) loss_op_idx = find_op_index(main_block.desc, loss_op.desc) cast_op = main_block._insert_op( loss_op_idx + 1, type='cast', inputs={'X': [loss]}, outputs={'Out': [cast_loss]}, attrs={ "in_dtype": loss.dtype, "out_dtype": core.VarDesc.VarType.FP32, 'op_role': loss_op.all_attrs()[OP_ROLE_KEY], }) loss_op._set_attr(OP_ROLE_KEY, core.op_proto_and_checker_maker.OpRole.Forward) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( cast_op, ref_mesh, [-1], self.dist_context) loss = loss.astype('float32') if self.get_attr("use_dynamic_loss_scaling" ) or self.get_attr("init_loss_scaling") != 1.0: loss_op_idx = find_op_index(main_block.desc, loss_op.desc) # forward ref_mesh = loss_op_dist_attr.process_mesh self._scaled_loss = main_block.create_var( name=unique_name.generate("scaled_loss"), shape=loss.shape, dtype=loss.dtype, persistable=loss.persistable) set_var_dist_attr(self.dist_context, self._scaled_loss, [-1], ref_mesh) elementwise_mul_op = main_block._insert_op( loss_op_idx + 1, type='elementwise_mul', inputs={ 'X': [loss], 'Y': [self._loss_scaling] }, outputs={'Out': [self._scaled_loss]}, attrs={ 'op_role': loss_op.all_attrs()[OP_ROLE_KEY], }) loss_op._set_attr(OP_ROLE_KEY, core.op_proto_and_checker_maker.OpRole.Forward) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( elementwise_mul_op, ref_mesh, [-1], self.dist_context) # backward first_backward_op = main_block.ops[loss_op_idx + 2] assert first_backward_op.type == "fill_constant" and int( first_backward_op.all_attrs()[OP_ROLE_KEY]) == 257 self._scaled_loss_grad = main_block.create_var( name=unique_name.generate("scaled_loss") + "@GRAD", shape=loss.shape, dtype=loss.dtype, persistable=loss.persistable) set_var_dist_attr(self.dist_context, self._scaled_loss_grad, [-1], ref_mesh) pre_grad_name = first_backward_op.output_arg_names[0] first_backward_op._rename_output(pre_grad_name, self._scaled_loss_grad.name) # FIXME(JZ-LIANG) a trick to insert backward op main_block._sync_with_cpp() elementwise_mul_grad_op_desc = main_block.desc._insert_op( loss_op_idx + 3) elementwise_mul_grad_op_desc.set_type("elementwise_mul_grad") elementwise_mul_grad_op_desc.set_input( 'Out@GRAD', [self._scaled_loss_grad.name]) elementwise_mul_grad_op_desc.set_input('X', [loss.name]) elementwise_mul_grad_op_desc.set_input('Y', [self._loss_scaling.name]) elementwise_mul_grad_op_desc.set_output('X@GRAD', [pre_grad_name]) elementwise_mul_grad_op_desc.set_output('Y@GRAD', []) 
elementwise_mul_grad_op_desc._set_attr( OP_ROLE_KEY, core.op_proto_and_checker_maker.OpRole.Backward) elementwise_mul_grad_op_desc._set_attr('axis', -1) elementwise_mul_grad_op = paddle.fluid.framework.Operator( main_block, elementwise_mul_grad_op_desc) main_block.ops.insert(loss_op_idx + 3, elementwise_mul_grad_op) main_block._sync_with_cpp() elementwise_mul_grad_op = main_block.ops[loss_op_idx + 3] assert elementwise_mul_grad_op.type == "elementwise_mul_grad" naive_set_dist_op_attr_for_program_by_mesh_and_mapping( elementwise_mul_grad_op, ref_mesh, [-1], self.dist_context) else: self._scaled_loss = loss main_block._sync_with_cpp()
def __call__(self, var, block):
    """Add xavier initialization ops for a variable

    Args:
        var: Variable that needs to be initialized
        block: The block in which initialization ops should be added

    Returns:
        the initialization op
    """
    assert isinstance(block, framework.Block)
    check_variable_and_dtype(var, "Out", ["int64"], "xavier_init")
    if var.dtype != VarDesc.VarType.INT64:
        raise ValueError(
            "Only 'int64' dtype is supported in paddlefl's initializer.")

    f_in, f_out = self._compute_fans(var)

    # If fan_in and fan_out are passed, use them
    fan_in = f_in if self._fan_in is None else self._fan_in
    fan_out = f_out if self._fan_out is None else self._fan_out

    if self._seed == 0:
        self._seed = block.program.random_seed

    # create tmp vars:
    # out_var for the random number, shape = (1, ...)
    # out_expand_var for the encrypted random number, shape = (2, ...), the same as var's shape
    out_dtype = VarDesc.VarType.FP32
    shape_ = list(var.shape)
    shape_[0] = 1
    out_var = block.create_var(
        name=unique_name.generate(".".join(
            ['gaussian_random', var.name, 'tmp'])),
        shape=shape_,
        dtype=out_dtype,
        type=VarDesc.VarType.LOD_TENSOR,
        persistable=False)
    out_expand_var = block.create_var(
        name=unique_name.generate(".".join(
            ['gaussian_random_expand', var.name, 'tmp'])),
        shape=out_var.shape,
        dtype=out_dtype,
        type=VarDesc.VarType.LOD_TENSOR,
        persistable=False)

    if self._uniform:
        limit = np.sqrt(6.0 / float(fan_in + fan_out))
        op = block._prepend_op(
            type="uniform_random",
            inputs={},
            outputs={"Out": out_var},
            attrs={
                "shape": out_var.shape,
                "dtype": out_dtype,
                "min": -limit,
                "max": limit,
                "seed": self._seed
            },
            stop_gradient=True)
    else:
        std = np.sqrt(2.0 / float(fan_in + fan_out))
        op = block._prepend_op(
            type="gaussian_random",
            outputs={"Out": out_var},
            attrs={
                "shape": out_var.shape,
                "dtype": out_dtype,
                "mean": 0.0,
                "std": std,
                "seed": self._seed
            },
            stop_gradient=True)

    # convert plaintext into ciphertext
    block.append_op(
        type="scale",
        inputs={"X": out_var},
        outputs={"Out": out_var},
        attrs={"scale": float(mdu.mpc_one_share)})

    # extend one share to two shares
    block.append_op(
        type="concat",
        inputs={"X": [out_var, out_var]},
        outputs={"Out": [out_expand_var]},
        attrs={"axis": 0})

    # cast float into int64
    block.append_op(
        type="cast",
        inputs={"X": out_expand_var},
        outputs={"Out": var},
        attrs={"in_dtype": out_expand_var.dtype,
               "out_dtype": var.dtype})

    if not framework.in_dygraph_mode():
        var.op = op
    return op
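# A small side calculation (not part of the initializer above) of the Xavier bounds it
# prepends: the uniform branch samples in [-limit, limit] and the normal branch uses std,
# both derived from fan_in + fan_out. The fan values below are illustrative.
import numpy as np

fan_in, fan_out = 784, 200
limit = np.sqrt(6.0 / float(fan_in + fan_out))   # uniform_random: min=-limit, max=limit
std = np.sqrt(2.0 / float(fan_in + fan_out))     # gaussian_random: mean=0.0, std=std
print(round(limit, 4), round(std, 4))            # ~0.0781, ~0.0451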
def create_mpc_parameter(self, attr, shape, dtype, is_bias=False, default_initializer=None, stop_gradient=False, type=core.VarDesc.VarType.LOD_TENSOR): """ Create mpc parameters for this layers. Refer to LayerHelper.create_parameter in Paddle 1.7. :param attr: :param shape: :param dtype: :param is_bias: :param default_initializer: :param stop_gradient: :param type: :return: """ # Deepcopy the attr so that parameters can be shared in program attr = copy.deepcopy(attr) attr = ParamAttr._to_attr(attr) if not attr: return None assert isinstance(attr, ParamAttr) suffix = 'b' if is_bias else 'w' if attr.name is None: attr.name = unique_name.generate(".".join([self.name, suffix])) if default_initializer is None and attr.initializer is None: if isinstance(dtype, core.VarDesc.VarType): if dtype != core.VarDesc.VarType.INT64: raise TypeError( "Can not create mpc parameter with default initializer " "when dtype is not int64 type. Set default_initializer " "to fit the parameter dtype!") else: if not dtype == "int64": raise TypeError( "Can not create mpc parameter with default initializer when " "dtype is not int64 type. Set default_initializer to " "fit the parameter dtype!") if is_bias: attr._set_default_bias_initializer() else: attr._set_default_initializer(XavierInitializer(seed=65536)) else: attr._set_default_initializer(default_initializer) # TODO(xukun07): not support WeightNormParamAttr in this first version # Paddle1.7: If weight normalization is set, insert extra parameters and ops. # Refer to https://arxiv.org/pdf/1602.07868.pdf if isinstance(attr, WeightNormParamAttr): # param = self._create_weight_normalize(attr, shape, dtype) # WeightNormParamAttr.params_with_weight_norm.append(param) # return param raise NotImplementedError( "The WeightNormParamAttr for attr is not " "supported in this version") startup_program_global_block = self.startup_program.global_block() create_mpc_parameter( block=startup_program_global_block, dtype=dtype, shape=shape, type=type, **attr._to_kwargs(with_initializer=True)) main_program_global_block = self.main_program.global_block() return create_mpc_parameter( block=main_program_global_block, dtype=dtype, shape=shape, type=type, **attr._to_kwargs())
def get_while_stmt_nodes(self, node): loop_var_names, create_var_names = self.name_visitor.get_loop_var_names( node) new_stmts = [] # Python can create variable in loop and use it out of loop, E.g. # # while x < 10: # x += 1 # y = x # z = y # # We need to create static variable for those variables for name in create_var_names: if "." not in name: new_stmts.append(create_static_variable_gast_node(name)) condition_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_CONDITION_PREFIX), args=gast.arguments( args=[ gast.Name( id=name, ctx=gast.Param(), annotation=None, type_comment=None) for name in loop_var_names ], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), body=[gast.Return(value=node.test)], decorator_list=[], returns=None, type_comment=None) for name in loop_var_names: if "." in name: rename_transformer = RenameTransformer(condition_func_node) rename_transformer.rename( name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(condition_func_node) new_body = node.body new_body.append( gast.Return(value=generate_name_node( loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_BODY_PREFIX), args=gast.arguments( args=[ gast.Name( id=name, ctx=gast.Param(), annotation=None, type_comment=None) for name in loop_var_names ], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), body=new_body, decorator_list=[], returns=None, type_comment=None) for name in loop_var_names: if "." in name: rename_transformer = RenameTransformer(body_func_node) rename_transformer.rename( name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(body_func_node) while_loop_nodes = create_while_nodes( condition_func_node.name, body_func_node.name, loop_var_names) new_stmts.extend(while_loop_nodes) return new_stmts
def get_while_stmt_nodes(self, node): # TODO: consider while - else in python if not self.name_visitor.is_control_flow_loop(node): return [node] loop_var_names, create_var_names = self.name_visitor.get_loop_var_names( node) new_stmts = [] # Python can create variable in loop and use it out of loop, E.g. # # while x < 10: # x += 1 # y = x # z = y # # We need to create static variable for those variables for name in create_var_names: if "." not in name: new_stmts.append(create_static_variable_gast_node(name)) # while x < 10 in dygraph should be convert into static tensor < 10 for name in loop_var_names: new_stmts.append(to_static_variable_gast_node(name)) logical_op_transformer = LogicalOpTransformer(node.test) cond_value_node = logical_op_transformer.transform() condition_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_CONDITION_PREFIX), args=gast.arguments(args=[ gast.Name(id=name, ctx=gast.Param(), annotation=None, type_comment=None) for name in loop_var_names ], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), body=[gast.Return(value=cond_value_node)], decorator_list=[], returns=None, type_comment=None) for name in loop_var_names: if "." in name: rename_transformer = RenameTransformer(condition_func_node) rename_transformer.rename( name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(condition_func_node) new_body = node.body new_body.append( gast.Return( value=generate_name_node(loop_var_names, ctx=gast.Load()))) body_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_BODY_PREFIX), args=gast.arguments(args=[ gast.Name(id=name, ctx=gast.Param(), annotation=None, type_comment=None) for name in loop_var_names ], posonlyargs=[], vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), body=new_body, decorator_list=[], returns=None, type_comment=None) for name in loop_var_names: if "." in name: rename_transformer = RenameTransformer(body_func_node) rename_transformer.rename( name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(body_func_node) while_loop_node = create_while_node(condition_func_node.name, body_func_node.name, loop_var_names) new_stmts.append(while_loop_node) return new_stmts
def create_new_para_name(attr):
    if attr:
        assert attr.name, "attr should have a name already!"
        name_key = 'PARL_target_' + attr.name
        attr.name = unique_name.generate(name_key)
def fp16_compression(param_and_grads):
    """
    Compress fp32 gradients to fp16 during allreduce.
    """
    op_maker = core.op_proto_and_checker_maker

    new_param_and_grads = []  # param, grad, is_cast
    # cast grad from fp32->fp16 before allreduce
    for param, grad in param_and_grads:
        if grad is None or grad.dtype != core.VarDesc.VarType.FP32:
            new_param_and_grads.append((param, grad, False))
            continue

        op = grad.op
        block = grad.block
        var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
        if param.name not in var_attr:
            new_param_and_grads.append((param, grad, False))
            continue

        # remove (param, grad) from op_role_var
        var_attr.remove(param.name)
        var_attr.remove(grad.name)
        if len(var_attr) > 1:
            op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
        else:
            op._remove_attr(op_maker.kOpRoleVarAttrName())

        new_grad = block.create_var(
            name=unique_name.generate(grad.name + ".cast_fp16"),
            dtype=core.VarDesc.VarType.FP16,
            persistable=False,
            stop_gradient=True)

        with block.program._backward_role_guard():
            cast_op = block.append_op(type="cast",
                                      inputs={"X": grad},
                                      outputs={"Out": new_grad},
                                      attrs={
                                          "in_dtype": core.VarDesc.VarType.FP32,
                                          "out_dtype": core.VarDesc.VarType.FP16
                                      },
                                      stop_gradient=True)

            backward = op_maker.OpRole.Backward
            cast_op._set_attr(op_maker.kOpRoleAttrName(), backward)
            cast_op._set_attr(op_maker.kOpRoleVarAttrName(),
                              [param.name, new_grad.name])
            new_grad.op = cast_op

        new_param_and_grads.append((param, new_grad, True))

    ret_param_and_grads = []
    # cast grad from fp16->fp32 after allreduce.
    # NOTE: Now we split fp16 compression into two for loops;
    # if we do not separate them, fuse allreduce will be wrong.
    # This must be a problem of the fuse allreduce pass and needs
    # to be fixed in the future.
    for param, grad, cast in new_param_and_grads:
        if not cast:
            ret_param_and_grads.append((param, grad))
            continue

        block = grad.block
        new_grad = block.create_var(
            name=unique_name.generate(grad.name + ".cast_fp32"),
            dtype=core.VarDesc.VarType.FP32,
            persistable=False,
            stop_gradient=True)

        with block.program._optimized_guard(
            [param, grad]), framework.name_scope('fp16_allreduce'):
            cast_op = block.append_op(type="cast",
                                      inputs={"X": grad},
                                      outputs={"Out": new_grad},
                                      attrs={
                                          "in_dtype": core.VarDesc.VarType.FP16,
                                          "out_dtype": core.VarDesc.VarType.FP32
                                      },
                                      stop_gradient=True)
        ret_param_and_grads.append((param, new_grad))

    return ret_param_and_grads
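# A quick numpy illustration (independent of the pass above) of what the fp32->fp16 cast
# before allreduce trades away: fp16 keeps roughly three decimal digits of precision, so
# small gradient values are rounded, which is why the pass casts the result back to fp32
# for the subsequent optimizer update instead of keeping it in fp16.
import numpy as np

grad_fp32 = np.array([1e-3, 1.0001, 3.14159265], dtype=np.float32)
round_trip = grad_fp32.astype(np.float16).astype(np.float32)
print(round_trip)                      # values rounded to fp16 precision
print(np.abs(round_trip - grad_fp32))  # per-element rounding error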
def _update_name_to_var_shape(self, node):
    def replace_dot(name):
        # replace all '.' into '_'
        return name.replace('.', '_')

    assert isinstance(node, gast.Assign)
    target_node = node.targets[0]
    value_node = node.value

    update_static_shape_var_node = None
    if isinstance(target_node, gast.Tuple):
        update_static_shape_var_node = []
        for idx, element in enumerate(target_node.elts):
            target_id = ast_to_source_code(element).strip()

            if isinstance(value_node, gast.Name):
                if value_node.id in self.name_to_var_shape:
                    # TODO(zhhsplendid): is context a problem for the result node of gast.parse?
                    static_shape_var_name = unique_name.generate(
                        replace_dot(target_id) +
                        STATIC_CONVERT_VAR_SHAPE_SUFFIX)
                    static_shape_var_node = gast.parse(
                        static_shape_var_name).body[0].value

                    static_shape_value_name = self.name_to_var_shape[
                        value_node.id]

                    sub_node_str = "{}[{}]".format(static_shape_value_name,
                                                   idx)
                    sub_node = gast.parse(sub_node_str).body[0].value

                    update_static_shape_var_node.append(
                        gast.Assign(
                            targets=[static_shape_var_node],
                            value=sub_node))

                    self.name_to_var_shape[target_id] = static_shape_var_name
            if isinstance(value_node, gast.Attribute):
                if self._is_var_shape(value_node):  # eg: x.shape
                    static_shape_var_name = unique_name.generate(
                        replace_dot(target_id) +
                        STATIC_CONVERT_VAR_SHAPE_SUFFIX)
                    static_shape_var_node = gast.parse(
                        static_shape_var_name).body[0].value

                    static_shape_value_node = copy.deepcopy(value_node)
                    # x.shape becomes convert_var_shape_simple(x)
                    static_shape_value_node = ShapeAttributeTransformer(
                    ).visit(static_shape_value_node)

                    sub_node_str = "{}[{}]".format(
                        ast_to_source_code(static_shape_value_node).strip(),
                        idx)
                    sub_node = gast.parse(sub_node_str).body[0].value
                    # Note(Aurelius84): Because static_shape_var_name is used in
                    # eval_if_exist_else_none() as a plain string, it will not
                    # be parsed as an argument in convert_loop/ifelse. We declare it
                    # as a global var because it has a unique name.
                    update_static_shape_var_node.append(
                        gast.Global(names=[static_shape_var_name]))

                    update_static_shape_var_node.append(
                        gast.Assign(
                            targets=[static_shape_var_node],
                            value=sub_node))
                    self.name_to_var_shape[target_id] = static_shape_var_name
        return update_static_shape_var_node
    else:
        target_id = ast_to_source_code(target_node).strip()

        if isinstance(value_node, gast.Name):
            if value_node.id in self.name_to_var_shape:
                static_shape_var_name = unique_name.generate(
                    replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX)
                static_shape_var_node = gast.parse(
                    static_shape_var_name).body[0].value
                static_shape_value_name = self.name_to_var_shape[
                    value_node.id]
                static_shape_value_node = gast.parse(
                    static_shape_value_name).body[0].value

                update_static_shape_var_node = [
                    gast.Assign(
                        targets=[static_shape_var_node],
                        value=static_shape_value_node)
                ]
                self.name_to_var_shape[target_id] = static_shape_var_name
        elif self._is_var_shape(value_node):  # eg: x.shape or x.shape[0]
            static_shape_var_name = unique_name.generate(
                replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX)
            static_shape_var_node = gast.parse(
                static_shape_var_name).body[0].value
            static_shape_value_node = copy.deepcopy(value_node)
            # x.shape becomes convert_var_shape_simple(x)
            static_shape_value_node = ShapeAttributeTransformer().visit(
                static_shape_value_node)

            # Declare static_shape_var_name as a global var
            update_static_shape_var_node = [
                gast.Global(names=[static_shape_var_name])
            ]
            update_static_shape_var_node.append(
                gast.Assign(
                    targets=[static_shape_var_node],
                    value=static_shape_value_node))
            self.name_to_var_shape[target_id] = static_shape_var_name
    return update_static_shape_var_node
def _get_offload_var_name(self, name):
    return unique_name.generate(name + '@offload')
def _shard_parameter(self, main_block, startup_block):
    if self.stage < 3:
        return

    dp_ring_ids = [group.id for group in self.dp_groups]
    for sharding_info in self.sharding_infos:
        need_broadcast_vars, param_usage = sharding_info.get_broadcast_vars_and_param_usage(
            main_block)
        not_used_param_name = []
        for param_name in param_usage:
            if param_usage[param_name] == 0 and sharding_info.get_var_rank(
                    param_name) != sharding_info.local_rank:
                not_used_param_name.append(param_name)

        for idx, op in reversed(list(enumerate(main_block.ops))):
            if is_optimizer_op(op):
                continue
            for input_name in op.desc.input_arg_names():
                if op.type == "cast":
                    continue
                if input_name not in need_broadcast_vars:
                    continue
                root_rank = sharding_info.get_var_rank(input_name)
                if root_rank == sharding_info.local_rank:
                    broadcast_varname = input_name
                else:
                    broadcast_varname = unique_name.generate(input_name +
                                                             "@BroadCast")
                    input_var = main_block.var(input_name)
                    new_var = main_block.create_var(name=broadcast_varname,
                                                    shape=input_var.shape,
                                                    dtype=input_var.dtype,
                                                    persistable=False)
                    ref_dist_attr = self._dist_context.get_tensor_dist_attr_for_program(
                        input_var)
                    out_var_dist_attr = set_var_dist_attr(
                        self._dist_context, new_var,
                        ref_dist_attr.dims_mapping,
                        ref_dist_attr.process_mesh)
                    op._rename_input(input_name, broadcast_varname)

                _insert_init_and_broadcast_op(main_block, idx,
                                              broadcast_varname,
                                              sharding_info.local_rank,
                                              root_rank,
                                              sharding_info.group.id,
                                              op.attr('op_role'),
                                              self._dist_context)

        for idx, op in reversed(list(enumerate(main_block.ops))):
            if op.type != "cast":
                continue
            input_name = op.input_arg_names[0]
            output_name = op.output_arg_names[0]
            if input_name in not_used_param_name:
                main_block._remove_op(idx, sync=False)
                main_block._remove_var(output_name, sync=False)

        for idx, op in reversed(list(enumerate(startup_block.ops))):
            assert len(op.output_arg_names) == 1
            output_name = op.output_arg_names[0]
            if op.type == "c_broadcast" and op.attr("ring_id") in dp_ring_ids:
                if self.outer_dp_group and sharding_info.get_var_rank(
                        output_name) == sharding_info.local_rank:
                    op._set_attr("ring_id", self.outer_dp_group.id)
                else:
                    startup_block._remove_op(idx, sync=False)
                continue

            if op.type != "c_broadcast" and output_name in param_usage and sharding_info.get_var_rank(
                    output_name) != sharding_info.local_rank:
                startup_block._remove_op(idx, sync=False)

        for param_name in param_usage:
            if sharding_info.get_var_rank(
                    param_name) != sharding_info.local_rank:
                main_block._remove_var(param_name, sync=False)
                startup_block._remove_var(param_name, sync=False)

    main_block._sync_with_cpp()
    startup_block._sync_with_cpp()
def __init__(self, block, type=core.VarDesc.VarType.LOD_TENSOR, name=None, shape=None, dtype=None, lod_level=None, capacity=None, persistable=None, error_clip=None, stop_gradient=False, is_data=False, need_check_feed=False, belong_to_optimizer=False, **kwargs): self.block = block if name is None: name = unique_name.generate('_generated_var') if dtype is not None: if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) self.belong_to_optimizer = belong_to_optimizer self.error_clip = error_clip is_new_var = False name = cpt.to_text(name) self.desc = self.block.desc.find_var(cpt.to_bytes(name)) if self.desc is None: self.desc = self.block.desc.var(cpt.to_bytes(name)) is_new_var = True if is_new_var: self.desc.set_type(type) elif self.desc.type() != type: raise ValueError("MpcVariable {0} has been created before. The " "previous type is {1}; the new type is {2}. They" " are not matched".format(self.name, self.desc.type(), type)) if shape is not None: if is_new_var: # resize the shape for MpcVariable mpc_shape = list(shape) mpc_shape.insert(0, 2) self.desc.set_shape(mpc_shape) else: old_shape = self.shape shape = tuple(shape) if shape != old_shape: raise ValueError( "MpcVariable {0} has been created before. the previous " "shape is {1}; the new shape is {2}. They are not " "matched.".format(self.name, old_shape, shape)) if dtype is not None: if is_new_var: self.desc.set_dtype(dtype) else: old_dtype = self.dtype if dtype != old_dtype: raise ValueError( "MpcVariable {0} has been created before. " "The previous data type is {1}; the new " "data type is {2}. They are not " "matched.".format(self.name, old_dtype, dtype)) if lod_level is not None: if is_new_var: self.desc.set_lod_level(lod_level) else: if lod_level != self.lod_level: raise ValueError( "MpcVariable {0} has been created before. " "The previous lod_level is {1}; the new " "lod_level is {2}. They are not " "matched".format(self.name, self.lod_level, lod_level)) if persistable is not None: if is_new_var: self.desc.set_persistable(persistable) else: if persistable != self.persistable: raise ValueError( "MpcVariable {0} has been created before." "The previous persistable is {1}; the new " "persistable is {2}. They are not matched".format( self.name, self.persistable, persistable)) if need_check_feed and is_new_var: self.desc.set_need_check_feed(need_check_feed) if capacity is not None: if is_new_var: self.desc.set_capacity(capacity) else: # TODO(abhinavarora) by Paddle 1.7: Compare with set capacity once, # get_capacity is implemented pass self.block.vars[name] = self self.op = None self._stop_gradient = stop_gradient self.is_data = is_data
def visit_FunctionDef(self, node): self.function_def.append(node) self.return_value_name[node] = None self.return_name[node] = [] self.return_no_value_name[node] = [] self.pre_analysis = ReturnAnalysisVisitor(node) max_return_length = self.pre_analysis.get_func_max_return_length(node) while self.pre_analysis.get_func_return_count(node) > 1: self.generic_visit(node) self.pre_analysis = ReturnAnalysisVisitor(node) if max_return_length == 0: self.function_def.pop() return node # Prepend initialization of final return and append final return statement value_name = self.return_value_name[node] if value_name is not None: node.body.append( gast.Return(value=gast.Name(id=value_name, ctx=gast.Load(), annotation=None, type_comment=None))) init_names = [ unique_name.generate(RETURN_VALUE_INIT_NAME) for i in range(max_return_length) ] assign_zero_nodes = [ create_fill_constant_node(iname, 0.0) for iname in init_names ] if len(init_names) == 1: return_value_nodes = gast.Name(id=init_names[0], ctx=gast.Load(), annotation=None, type_comment=None) else: # We need to initialize return value as a tuple because control # flow requires some inputs or outputs have same structure return_value_nodes = gast.Tuple(elts=[ gast.Name(id=iname, ctx=gast.Load(), annotation=None, type_comment=None) for iname in init_names ], ctx=gast.Load()) assign_return_value_node = gast.Assign(targets=[ gast.Name(id=value_name, ctx=gast.Store(), annotation=None, type_comment=None) ], value=return_value_nodes) node.body.insert(0, assign_return_value_node) node.body[:0] = assign_zero_nodes # Prepend no value placeholders for name in self.return_no_value_name[node]: assign_no_value_node = create_fill_constant_node( name, RETURN_NO_VALUE_MAGIC_NUM) node.body.insert(0, assign_no_value_node) self.function_def.pop() return node
def modify_forward_desc_for_recompute(self, dist_context):
    """
    If the program's forward part has a 'dropout' op, this function will insert
    a seed op before it to guarantee that the two dropout ops have the same outputs.
    """
    op_types = [op.desc.type() for op in self._ops]
    if "dropout" not in op_types:
        return

    op_idx = 0
    while op_idx < len(self._ops):
        cur_op = self._ops[op_idx]
        if "grad" in cur_op.type:
            break
        if cur_op.type != "dropout":
            op_idx += 1
            continue
        if cur_op.input("Seed") is not None and len(cur_op.input("Seed")):
            op_idx += 1
            continue

        cur_op_dist_attr = dist_context.get_op_dist_attr_for_program(cur_op)
        # insert seed op to guarantee that the two dropout ops have the same outputs
        op_unique_name = unique_name.generate("seed")
        var_unique_name = unique_name.generate_with_ignorable_key(".".join(
            [op_unique_name, 'tmp']))
        seed_var = self._block.create_var(
            name=var_unique_name,
            dtype='int32',
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=False)

        # set new seed_var's dist_attr
        ref_dims_mapping = [-1]
        ref_process_mesh = cur_op_dist_attr.process_mesh
        seed_var_dist_attr = set_var_dist_attr(dist_context, seed_var,
                                               ref_dims_mapping,
                                               ref_process_mesh)

        seed = 0 if cur_op.attr("fix_seed") is False else int(
            cur_op.attr("seed"))
        seed_op = self._block._insert_op_without_sync(
            index=cur_op.idx,
            type="seed",
            inputs={},
            outputs={"Out": seed_var},
            attrs={
                "seed": seed,
                "force_cpu": True
            })
        # set new seed op's dist_attr
        naive_set_dist_op_attr_for_program_by_mesh_and_mapping(
            seed_op, ref_process_mesh, ref_dims_mapping, dist_context)

        # modify dropout op's desc
        self._ops.insert(op_idx, seed_op)
        cur_op.desc.set_input("Seed", [var_unique_name])
        cur_op.desc.remove_attr("fix_seed")
        cur_op.desc.remove_attr("seed")
        cur_op_dist_attr.set_input_dist_attr(seed_var.name,
                                             seed_var_dist_attr)
        self._block._sync_with_cpp()
        op_idx += 2
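# An illustrative sketch (plain numpy, not the pass above) of why a shared seed makes the
# recomputed dropout match the original: two masks drawn from generators seeded with the
# same value are identical, so the forward pass and the recomputed forward pass agree.
import numpy as np

def dropout_mask(seed, shape, keep_prob=0.9):
    rng = np.random.RandomState(seed)
    return (rng.uniform(size=shape) < keep_prob).astype(np.float32)

m1 = dropout_mask(seed=2023, shape=(2, 4))
m2 = dropout_mask(seed=2023, shape=(2, 4))   # recomputation reuses the same seed
print(np.array_equal(m1, m2))                # True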
def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name, max_return_length, parent_node_of_return): assert max_return_length >= 0, "Input illegal max_return_length" i = index_in_list(stmt_list, return_node) if i == -1: return False assign_nodes = [] # Here assume that the parent node of return is gast.If if isinstance(parent_node_of_return, gast.If): # Prepend control flow boolean nodes such as '__return@1 = True' node_str = "{} = _jst.create_bool_as_type({}, True)".format( return_name, ast_to_source_code(parent_node_of_return.test).strip()) assign_true_node = gast.parse(node_str).body[0] assign_nodes.append(assign_true_node) cur_func_node = self.function_def[-1] return_length = get_return_size(return_node) if return_length < max_return_length: # In this case we should append RETURN_NO_VALUE placeholder # # max_return_length must be >= 1 here because return_length will be # 0 at least. if self.return_value_name[cur_func_node] is None: self.return_value_name[cur_func_node] = unique_name.generate( RETURN_VALUE_PREFIX) no_value_names = [ unique_name.generate(RETURN_NO_VALUE_VAR_NAME) for j in range(max_return_length - return_length) ] self.return_no_value_name[cur_func_node].extend(no_value_names) # Handle tuple/non-tuple case if max_return_length == 1: assign_nodes.append( gast.Assign(targets=[ gast.Name(id=self.return_value_name[cur_func_node], ctx=gast.Store(), annotation=None, type_comment=None) ], value=gast.Name(id=no_value_names[0], ctx=gast.Load(), annotation=None, type_comment=None))) else: # max_return_length > 1 which means we should assign tuple fill_tuple = [ gast.Name(id=n, ctx=gast.Load(), annotation=None, type_comment=None) for n in no_value_names ] if return_node.value is not None: if isinstance(return_node.value, gast.Tuple): fill_tuple[:0] = return_node.value.elts else: fill_tuple.insert(0, return_node.value) assign_nodes.append( gast.Assign(targets=[ gast.Name(id=self.return_value_name[cur_func_node], ctx=gast.Store(), annotation=None, type_comment=None) ], value=gast.Tuple(elts=fill_tuple, ctx=gast.Load()))) else: # In this case we should NOT append RETURN_NO_VALUE placeholder if return_node.value is not None: cur_func_node = self.function_def[-1] if self.return_value_name[cur_func_node] is None: self.return_value_name[ cur_func_node] = unique_name.generate( RETURN_VALUE_PREFIX) assign_nodes.append( gast.Assign(targets=[ gast.Name(id=self.return_value_name[cur_func_node], ctx=gast.Store(), annotation=None, type_comment=None) ], value=return_node.value)) stmt_list[i:] = assign_nodes return True
def __init__(self,
             learning_rate=0.001,
             lamb_weight_decay=0.01,
             beta1=0.9,
             beta2=0.999,
             epsilon=1e-6,
             parameters=None,
             grad_clip=None,
             exclude_from_weight_decay_fn=None,
             clip_after_allreduce=True,
             is_grad_scaled_by_nranks=True,
             alignment=128,
             use_master_param_norm=True,
             gradient_accumulation_steps=1,
             name=None):
    assert not framework._non_static_mode(
    ), "DistributedFusedLamb does not support dygraph mode"
    super(DistributedFusedLamb, self).__init__(learning_rate=learning_rate,
                                               grad_clip=None,
                                               name=name)

    self._beta1 = beta1
    self._beta2 = beta2
    self._epsilon = epsilon
    self._weight_decay = lamb_weight_decay if lamb_weight_decay is not None else 0.0
    if grad_clip is not None:
        assert isinstance(
            grad_clip, ClipGradByGlobalNorm
        ), "Only ClipGradByGlobalNorm is supported in DistributedFusedLamb"
        max_global_grad_norm = grad_clip.clip_norm
    else:
        max_global_grad_norm = -1.0
    self._max_global_grad_norm = max_global_grad_norm
    self._alignment = alignment if alignment is not None else -1
    self._clip_after_allreduce = clip_after_allreduce
    self._is_grad_scaled_by_nranks = is_grad_scaled_by_nranks
    self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn
    self._scale = None
    self._ring_id = 0
    self._use_master_param_norm = use_master_param_norm
    self._gradient_accumulation_steps = gradient_accumulation_steps
    assert self._gradient_accumulation_steps >= 1

    self.helper = LayerHelper('distributed_fused_lamb')
    self._supports_check_nan_inf = True  # very important flag for AMP

    main_block = self.helper.main_program.global_block()
    self._found_inf = main_block.create_var(
        name=unique_name.generate('found_inf'),
        shape=[1],
        dtype=core.VarDesc.VarType.BOOL)
    self._step = None

    if self._gradient_accumulation_steps > 1:
        self._stop_update = main_block.create_var(
            name=unique_name.generate('stop_update'),
            shape=[1],
            dtype=core.VarDesc.VarType.BOOL)
    else:
        self._stop_update = None

    self._param_to_master_param = {}