def __init__(self,
             in_channels,
             out_channels,
             weight_init='normal',
             bias_init='zeros',
             damping=0.03,
             loss_scale=1,
             frequency=278,
             batch_size=32,
             has_bias=True,
             activation=None):
    super(Dense_Thor_GPU, self).__init__()
    self.in_channels = Validator.check_positive_int(in_channels)
    self.out_channels = Validator.check_positive_int(out_channels)
    self.has_bias = Validator.check_bool(has_bias)
    self.thor = True

    # validate user-supplied initializers before building the parameters
    if isinstance(weight_init, Tensor):
        if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \
                weight_init.shape[1] != in_channels:
            raise ValueError("weight_init shape error")
    self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight")

    if self.has_bias:
        if isinstance(bias_init, Tensor):
            if bias_init.dim() != 1 or bias_init.shape[0] != out_channels:
                raise ValueError("bias_init shape error")
        self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias")

    self.matmul = P.MatMul(transpose_b=True)
    self.bias_add = P.BiasAdd()
    self.activation = get_activation(activation)
    self.activation_flag = self.activation is not None

    # buffers for the block-wise inverses of the input (A) and output-gradient (G)
    # covariance factors used by THOR
    split_dim = 128
    matrix_A_shape, matrix_G_shape = caculate_matmul_shape(self.in_channels, self.out_channels, split_dim)
    self.matrix_A_inv = Parameter(Tensor(np.zeros(matrix_A_shape).astype(np.float32)),
                                  name='matrix_A_inv', requires_grad=False)
    self.matrix_G_inv = Parameter(Tensor(np.zeros(matrix_G_shape).astype(np.float32)),
                                  name="matrix_G_inv", requires_grad=False)
    self.broadcast_to = P.BroadcastTo(matrix_A_shape)
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)

    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.transpose = P.Transpose()
    self.mul = P.Mul()
    self.cube_matmul = P.MatMul(transpose_a=True)
    self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
    self.batch_size = Tensor(batch_size, mstype.float16)
    # gradient hook that captures the output gradients during backprop to update matrix_G_inv
    self.getG = P.InsertGradientOf(self.save_gradient)
    self.damping = Parameter(Tensor(damping), name="damping_value", requires_grad=False)
    self.dampingA = Tensor(np.identity(in_channels), mstype.float32)
    self.dampingG = Tensor(np.identity(out_channels), mstype.float32)
    self.cast = P.Cast()
    self.gather = P.GatherV2()
    self.freq = Tensor(frequency, mstype.int32)
    self.axis = 0
    self.add = P.TensorAdd()
    self.sqrt = P.Sqrt()
    self.cholesky = P.Cholesky(split_dim=split_dim)
    self.vector_matmul = P.BatchMatMul(transpose_a=True)
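# --- Illustration (not part of the layer above) ---------------------------------------------
# The `matrix_G_inv` buffer created in Dense_Thor_GPU.__init__ is meant to hold a damped,
# Cholesky-based inverse of the output-gradient covariance used by THOR/K-FAC-style
# preconditioning; `save_gradient` (registered through P.InsertGradientOf) fills it in but is
# not shown in this excerpt. The NumPy sketch below only demonstrates the underlying math with
# made-up example values (`dout`, `damping`); the real MindSpore kernels work block-wise on
# `split_dim`-sized slices and may differ in detail.
import numpy as np

batch, out_channels = 32, 64
rng = np.random.default_rng(0)
dout = rng.standard_normal((batch, out_channels))  # stand-in for the layer's output gradients
damping = 0.03

# damped second-moment estimate of the output gradients (the "G" factor)
G = dout.T @ dout / batch + damping * np.eye(out_channels)

# invert it through its Cholesky factor: G = L L^T  =>  G^{-1} = L^{-T} L^{-1}
L = np.linalg.cholesky(G)
L_inv = np.linalg.inv(L)
G_inv = L_inv.T @ L_inv

# sanity check: G @ G_inv is (numerically) the identity
assert np.allclose(G @ G_inv, np.eye(out_channels), atol=1e-8)
# ---------------------------------------------------------------------------------------------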
def __init__(self,
             in_channels,
             out_channels,
             kernel_size,
             stride=1,
             pad_mode='same',
             padding=0,
             dilation=1,
             group=1,
             data_format='NCHW',
             has_bias=False,
             weight_init='normal',
             damping=0.03,
             loss_scale=1,
             frequency=278,
             batch_size=32,
             bias_init='zeros'):
    self.thor = True
    self.hw = kernel_size * kernel_size
    kernel_size = twice(kernel_size)
    super(Conv2d_Thor_GPU, self).__init__(
        in_channels, out_channels, kernel_size, stride, pad_mode, padding,
        dilation, group, data_format, has_bias, weight_init, bias_init)
    self.conv2d = P.Conv2D(out_channel=self.out_channels,
                           kernel_size=self.kernel_size,
                           mode=1,
                           pad_mode=self.pad_mode,
                           pad=self.padding,
                           stride=self.stride,
                           dilation=self.dilation,
                           group=self.group)

    # A covers the unfolded (im2col) input patches, G covers the output gradients
    self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1]
    self.matrix_G_dim = self.out_channels
    split_dim = 128
    matrix_A_shape, matrix_G_shape = caculate_matmul_shape(self.matrix_A_dim, self.matrix_G_dim, split_dim)
    self.matrix_A_inv = Parameter(np.zeros(matrix_A_shape).astype(np.float32), requires_grad=False)
    self.matrix_G_inv = Parameter(np.zeros(matrix_G_shape).astype(np.float32), requires_grad=False)
    self.broadcast_to = P.BroadcastTo(matrix_A_shape)
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False)

    self.img2col = P.Im2Col(kernel_size=kernel_size, stride=stride, pad_mode="same")
    self.matmul = P.MatMul(transpose_b=True)
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    # gradient hook that captures the output gradients during backprop to update matrix_G_inv
    self.getG = P.InsertGradientOf(self.save_gradient)
    self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
    self.batch_size = Tensor(batch_size, mstype.float16)
    self.transpose = P.Transpose()
    self.cast = P.Cast()
    self.gather = P.Gather()
    self.freq = Tensor(frequency, mstype.int32)
    self.axis = 0
    self.sqrt = P.Sqrt()
    self.reduce_mean = P.ReduceMean(keep_dims=False)
    self.damping = Parameter(Tensor(damping), requires_grad=False)
    self.dampingA = Tensor(np.identity(self.matrix_A_dim), mstype.float32)
    self.dampingG = Tensor(np.identity(self.matrix_G_dim), mstype.float32)
    self.cholesky = P.CholeskyTrsm(split_dim=split_dim)
    self.vector_matmul = P.BatchMatMul(transpose_a=True)
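# --- Illustration (not part of the layers above) --------------------------------------------
# Both constructors call `caculate_matmul_shape` (sic), which is not defined in this excerpt.
# Judging from how its results are used (batched buffers fed to P.BatchMatMul and the split
# Cholesky kernels), it presumably turns a matrix dimension into a (num_blocks, block, block)
# shape with blocks of at most `split_dim`. The helper below is only a plausible sketch of
# that contract, not the library's actual implementation.
def caculate_matmul_shape(matrix_A_dim, matrix_G_dim, split_dim):
    """Return batched (num_blocks, block, block) shapes for the A and G inverse buffers."""
    def batched_shape(dim):
        if dim < split_dim:
            # small matrices fit into a single block of their own size
            return (1, dim, dim)
        # otherwise use ceil(dim / split_dim) blocks of split_dim x split_dim
        num_blocks = (dim + split_dim - 1) // split_dim
        return (num_blocks, split_dim, split_dim)

    return batched_shape(matrix_A_dim), batched_shape(matrix_G_dim)

# With this sketch, a 2048-in / 1001-out dense layer and split_dim=128 would allocate
# caculate_matmul_shape(2048, 1001, 128) == ((16, 128, 128), (8, 128, 128)).
# ---------------------------------------------------------------------------------------------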
def generate(model=None,
             config=None,
             input_ids: Optional[Tensor] = None,
             input_mask: Optional[Tensor] = None,
             max_length: Optional[int] = 1024,
             min_length: Optional[int] = 200,
             do_sample: Optional[bool] = False,
             early_stopping: Optional[bool] = False,
             num_beams: Optional[int] = 1,
             temperature: Optional[float] = 1.0,
             top_k: Optional[int] = 50,
             top_p: Optional[float] = 1.0,
             repetition_penalty: Optional[float] = 1.0,
             bos_token_id: Optional[int] = 50256,
             pad_token_id: Optional[int] = 50256,
             eos_token_id: Optional[int] = 50256,
             length_penalty: Optional[float] = 1.0,
             no_repeat_ngram_size: Optional[int] = 0,
             num_return_sequences: Optional[int] = 1,
             attention_mask: Optional[Tensor] = None,
             use_cache: Optional[bool] = True):
    r"""
    Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
    beam-search decoding, sampling with temperature, and top-k or nucleus (top-p) sampling.

    Args:
        config: the config of the GPT-2 model used for generation.
        input_ids (Tensor): shape (batch_size, seq_length).
        max_length (int): The maximum length of the sequence to be generated.
        min_length (int): The minimum length of the sequence to be generated.
        do_sample (bool): Whether or not to use sampling; use greedy decoding otherwise.
        early_stopping (bool): Whether to stop the beam search when at least ``num_beams`` sentences are
            finished per batch or not.
        num_beams (int): Number of beams for beam search. 1 means no beam search.
        temperature (float): The value used to modulate the next-token probabilities.
        top_k (int): The number of highest-probability vocabulary tokens to keep for top-k filtering.
        top_p (float): If set to a float < 1, only the most probable tokens whose probabilities add up to
            ``top_p`` or higher are kept for generation.
        repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty.
            See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details. Default: 1.0.
        bos_token_id (int): The id of the `beginning-of-sequence` token.
        pad_token_id (int): The id of the `padding` token.
        eos_token_id (int): The id of the `end-of-sequence` token.
        length_penalty (float): Exponential penalty to the length. 1.0 means no penalty. Default: 1.0.
        no_repeat_ngram_size (int): If set to an int > 0, all ngrams of that size can only occur once.
            Default: 0.
        num_return_sequences (int): The number of independently computed returned sequences for each element
            in the batch. Default: 1.
        attention_mask (Tensor): shape (batch_size, seq_length). Mask to avoid performing attention on padding
            token indices. Mask values are in ``[0, 1]``: 1 for tokens that are not masked, 0 for masked tokens.
        use_cache (bool): Whether or not the model should use the past key/value attentions (if applicable to
            the model) to speed up decoding. Default: True.

    Returns:
        List of shape (batch_size * num_return_sequences, sequence_length), the generated sequences.
        The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all batches
        finished early due to the :obj:`eos_token_id`.
    """
    if input_ids is not None:
        batch_size, seq_len = P.Shape()(input_ids)
    else:
        batch_size = 1

    assert model is not None, "model should not be a None object."
    assert config is not None, "config of gpt2_model is a required input parameter."
    assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
    assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a non-negative integer."
    assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
    assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
    assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
    assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
    assert temperature > 0, "`temperature` should be strictly positive."
    assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a non-negative integer."
    assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
    assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
    assert input_ids is not None or (isinstance(bos_token_id, int) and bos_token_id >= 0), \
        "If input_ids is not defined, `bos_token_id` should be a non-negative integer."
    assert pad_token_id is None or (isinstance(pad_token_id, int) and pad_token_id >= 0), \
        "`pad_token_id` should be a non-negative integer."
    assert eos_token_id is None or (isinstance(eos_token_id, int) and eos_token_id >= 0), \
        "`eos_token_id` should be a non-negative integer."
    assert length_penalty > 0, "`length_penalty` should be strictly positive."
    assert isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0, \
        "`no_repeat_ngram_size` should be a non-negative integer."
    assert isinstance(num_return_sequences, int) and num_return_sequences > 0, \
        "`num_return_sequences` should be a strictly positive integer."

    # duplicate outputs are not allowed when doing greedy decoding
    if do_sample is False:
        if num_beams == 1:
            # no_beam_search greedy generation conditions
            assert num_return_sequences == 1, \
                "Greedy decoding will always produce the same output for num_beams == 1 and " \
                "num_return_sequences > 1. Please set num_return_sequences = 1."
        else:
            # beam_search greedy generation conditions
            assert num_beams >= num_return_sequences, \
                "Greedy beam search decoding cannot return more sequences than it has beams. " \
                "Please set num_beams >= num_return_sequences."

    assert attention_mask is not None, "`attention_mask` should be provided."

    vocab_size = config.vocab_size

    # set effective batch size and effective batch multiplier according to do_sample
    if do_sample:
        effective_batch_size = batch_size * num_return_sequences
        effective_batch_mult = num_return_sequences
    else:
        effective_batch_size = batch_size
        effective_batch_mult = 1

    if num_return_sequences > 1 or num_beams > 1:
        expand_shape = (batch_size, effective_batch_mult * num_beams, seq_len)
        broadcast_to = P.BroadcastTo(expand_shape)
        input_ids = P.ExpandDims()(input_ids, 1)  # shape: (batch_size, 1, seq_len)
        input_ids = broadcast_to(input_ids)
        attention_mask = P.ExpandDims()(attention_mask, 1)
        attention_mask = broadcast_to(attention_mask)
        # shape: (batch_size * num_return_sequences * num_beams, cur_len)
        input_ids = P.Reshape()(input_ids, (effective_batch_size * num_beams, seq_len))
        attention_mask = P.Reshape()(attention_mask, (effective_batch_size * num_beams, seq_len))

    cur_len = seq_len
    assert cur_len < max_length, \
        f"The context has {cur_len} tokens, but `max_length` is only {max_length}. " \
        "Please make sure that `max_length` is larger than the number of context tokens, " \
        "by setting either `generate(max_length=..., ...)` or `config.max_length = ...`."
    if num_beams > 1:
        output = generate_beam_search(model=model,
                                      config=config,
                                      input_ids=input_ids,
                                      input_mask=input_mask,
                                      cur_len=cur_len,
                                      max_length=max_length,
                                      min_length=min_length,
                                      do_sample=do_sample,
                                      early_stopping=early_stopping,
                                      temperature=temperature,
                                      top_k=top_k,
                                      top_p=top_p,
                                      repetition_penalty=repetition_penalty,
                                      no_repeat_ngram_size=no_repeat_ngram_size,
                                      pad_token_id=pad_token_id,
                                      eos_token_id=eos_token_id,
                                      # batch_size=effective_batch_size,
                                      # num_return_sequences=num_return_sequences,
                                      length_penalty=length_penalty,
                                      num_beams=num_beams,
                                      # vocab_size=vocab_size,
                                      # attention_mask=attention_mask,
                                      use_cache=use_cache)
    else:
        '''
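# --- Usage sketch (not part of the function above) ------------------------------------------
# A hedged example of how `generate` might be called. The model/config construction and input
# preparation below are placeholders for whatever utilities the surrounding project provides
# (`GPT2LMHeadModel`, `GPT2Config`, `input_ids`, `attention_mask` are hypothetical names);
# only the keyword arguments of `generate` itself come from the signature documented above.
#
#   config = GPT2Config(vocab_size=50257)        # hypothetical config; must expose vocab_size
#   model = GPT2LMHeadModel(config)              # hypothetical language-modeling model
#   input_ids = ...                              # Tensor of shape (batch_size, seq_length)
#   attention_mask = ...                         # Tensor of shape (batch_size, seq_length)
#
#   sequences = generate(model=model,
#                        config=config,
#                        input_ids=input_ids,
#                        input_mask=attention_mask,
#                        attention_mask=attention_mask,
#                        max_length=128,
#                        do_sample=True,
#                        top_k=50,
#                        top_p=0.95,
#                        num_return_sequences=2)
# ---------------------------------------------------------------------------------------------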