def __init__(self,
             size,
             num_partitions=1,
             gather_out=True,
             param_attr=None,
             bias_attr=None,
             name=None):
    super().__init__()
    if in_dygraph_mode():
        rank = paddle.distributed.get_rank()
        nranks = paddle.distributed.get_world_size()
    else:
        assert fleet._role_maker, ("To use paddle.distributed.split, "
                                   "you must call fleet.init() first.")
        rank = fleet.worker_index()
        nranks = fleet.worker_num()

    # rank within a model parallel group
    inner_rank = rank % num_partitions

    self.gather_out = gather_out
    assert size[1] % num_partitions == 0, (
        "Number of columns of the weight for linear ({}) must be"
        " divisible by num_partitions ({})".format(size[1], num_partitions))
    self.per_part_size = size[1] // num_partitions
    linear_size = (size[0], self.per_part_size)

    num_rows, num_cols = linear_size

    if not name:
        name = "fc_by_col_rank_%d" % inner_rank
    else:
        name = name + "_by_col_rank_%d" % inner_rank
    self.linear = paddle.nn.Linear(num_rows,
                                   num_cols,
                                   weight_attr=param_attr,
                                   bias_attr=bias_attr,
                                   name=name)

    weight = self.linear.weight
    weight.is_distributed = True
    # alias for the weight tensor
    self.weight = self.linear.weight

    # also mark the parameter as distributed in the static-graph programs
    startup_block = paddle.static.default_startup_program().global_block()
    main_block = paddle.static.default_main_program().global_block()
    startup_block.vars[weight.name].is_distributed = True
    main_block.vars[weight.name].is_distributed = True

    # set is_distributed for the split bias:
    # if a linear layer is split by columns, its bias is split across ranks
    # in the same way as the weight
    if self.linear._bias_attr is not False:
        startup_block.vars[self.linear.bias.name].is_distributed = True
        main_block.vars[self.linear.bias.name].is_distributed = True
    self.bias = self.linear.bias
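# --- illustrative sketch, not part of the class above ---------------------------
# A minimal single-process sketch (hypothetical, NumPy only) of the column-parallel
# arithmetic this __init__ sets up: each of the num_partitions ranks holds a
# (size[0], size[1] // num_partitions) column slice of the weight, and concatenating
# the per-rank outputs along the last axis (the gather when gather_out=True)
# reproduces the full linear layer. The shapes below are made-up example values.
import numpy as np

size, num_partitions = (8, 12), 4        # full weight shape, number of model-parallel ranks
per_part = size[1] // num_partitions     # columns owned by each rank (3 here)

x = np.random.rand(2, size[0])           # a batch of inputs, replicated on every rank
w = np.random.rand(*size)                # the full weight (never materialized in practice)

# each rank computes x @ its column slice; gathering the slices restores x @ w
partial_outs = [x @ w[:, r * per_part:(r + 1) * per_part] for r in range(num_partitions)]
assert np.allclose(np.concatenate(partial_outs, axis=-1), x @ w)
# ---------------------------------------------------------------------------------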
def __init__(self,
             size,
             num_partitions=1,
             input_is_parallel=False,
             param_attr=None,
             bias_attr=None,
             name=None):
    super().__init__()
    if in_dygraph_mode():
        rank = paddle.distributed.get_rank()
        nranks = paddle.distributed.get_world_size()
    else:
        assert fleet._role_maker, ("To use paddle.distributed.split, "
                                   "you must call fleet.init() first.")
        rank = fleet.worker_index()
        nranks = fleet.worker_num()

    # rank within a model parallel group
    inner_rank = rank % num_partitions

    self.input_is_parallel = input_is_parallel
    assert size[0] % num_partitions == 0, (
        "Number of rows of the weight for linear ({}) must be"
        " divisible by num_partitions ({})".format(size[0], num_partitions))
    self.per_part_size = size[0] // num_partitions
    linear_size = (self.per_part_size, size[1])

    num_rows, num_cols = linear_size

    if not name:
        name = "fc_by_row_rank_%d" % inner_rank
    else:
        name = name + "_by_row_rank_%d" % inner_rank
    self.linear = paddle.nn.Linear(
        num_rows,
        num_cols,
        weight_attr=param_attr,
        # NOTE(wangxi): with a row split, the bias must be added after the allreduce
        bias_attr=False,
        name=name)

    weight = self.linear.weight
    weight.is_distributed = True
    # alias for the weight tensor
    self.weight = self.linear.weight

    # also mark the parameter as distributed in the static-graph programs
    startup_block = paddle.static.default_startup_program().global_block()
    main_block = paddle.static.default_main_program().global_block()
    startup_block.vars[weight.name].is_distributed = True
    main_block.vars[weight.name].is_distributed = True

    # the bias is not split: when a linear layer is split by rows,
    # each rank holds a complete copy of the bias
    if bias_attr is not False:
        self.bias = self.create_parameter(shape=[num_cols],
                                          attr=bias_attr,
                                          dtype=self._dtype,
                                          is_bias=True)
    else:
        self.bias = None
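# --- illustrative sketch, not part of the class above ---------------------------
# A minimal single-process sketch (hypothetical, NumPy only) of the row-parallel
# arithmetic: the input's last dimension is split across ranks, each rank multiplies
# its slice by its (size[0] // num_partitions, size[1]) row block of the weight,
# the partial results are summed (the allreduce), and the bias is added once after
# the reduction, which is why the inner Linear above is built with bias_attr=False.
# The shapes below are made-up example values.
import numpy as np

size, num_partitions = (12, 8), 4        # full weight shape, number of model-parallel ranks
per_part = size[0] // num_partitions     # rows owned by each rank (3 here)

x = np.random.rand(2, size[0])           # the full input (split along axis -1 across ranks)
w = np.random.rand(*size)                # the full weight (never materialized in practice)
b = np.random.rand(size[1])              # complete bias, replicated on every rank

partials = [x[:, r * per_part:(r + 1) * per_part] @ w[r * per_part:(r + 1) * per_part, :]
            for r in range(num_partitions)]
out = sum(partials) + b                  # the sum stands in for the allreduce; bias added once
assert np.allclose(out, x @ w + b)
# ---------------------------------------------------------------------------------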