def output_layer(self, features, **kwargs):
    """Project features to the vocabulary size."""
    features = copy_to_model_parallel_region(features)

    # project back to size of vocabulary
    if self.share_input_output_embed:
        x = F.linear(features, self.embed_tokens.weight)
    else:
        x = F.linear(features, self.embed_out)

    if getattr(self.args, 'criterion') != 'vocab_parallel_cross_entropy':
        x = gather_from_model_parallel_region(x).contiguous()
    return x
def output_layer(self, features, **kwargs):
    """Project features to the vocabulary size."""
    if not self.share_input_output_embed:
        raise NotImplementedError(
            "Model parallel training currently requires --share-decoder-input-output-embed"
        )

    features = copy_to_model_parallel_region(features)

    # project back to size of vocabulary
    x = self.output_projection(features)

    # with the vocab-parallel criterion the loss is computed directly on the
    # sharded logits, so the all-gather across model-parallel ranks is skipped
    if getattr(self.args, "criterion") != "vocab_parallel_cross_entropy":
        x = gather_from_model_parallel_region(x).contiguous()
    return x
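# The two output_layer variants above assume `import torch.nn.functional as F`
# and the Megatron-style mapping functions copy_to_model_parallel_region /
# gather_from_model_parallel_region. The sketch below is only a simplified
# illustration of their assumed semantics, not the actual mpu implementation:
# "copy" is an identity in forward and all-reduces the gradient in backward,
# while "gather" all-gathers the vocabulary-sharded logits along the last
# dimension and hands each rank back only its own slice of the gradient.
# For brevity it uses the default torch.distributed process group as the
# model-parallel group and assumes that group is already initialized.
import torch
import torch.distributed as dist


class _CopyToModelParallelRegion(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        # identity: every model-parallel rank sees the same activations
        return x

    @staticmethod
    def backward(ctx, grad_output):
        # gradients flowing back from the sharded projections are summed
        dist.all_reduce(grad_output)
        return grad_output


class _GatherFromModelParallelRegion(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        # concatenate the per-rank vocabulary shards into full logits
        parts = [torch.empty_like(x) for _ in range(dist.get_world_size())]
        dist.all_gather(parts, x)
        return torch.cat(parts, dim=-1)

    @staticmethod
    def backward(ctx, grad_output):
        # each rank keeps only the gradient slice for its own vocab shard
        chunks = grad_output.chunk(dist.get_world_size(), dim=-1)
        return chunks[dist.get_rank()].contiguous()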
def forward(self, features, masked_tokens=None, **kwargs):
    # Only project the masked tokens while training,
    # saves both memory and computation
    if masked_tokens is not None:
        features = features[masked_tokens, :]

    x = self.dense(features)
    x = self.activation_fn(x)
    x = self.layer_norm(x)

    x = copy_to_model_parallel_region(x)
    # project back to size of vocabulary with bias
    x = F.linear(x, self.weight)
    x = gather_from_model_parallel_region(x).contiguous()
    x = x + self.bias
    return x
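# A minimal, self-contained illustration (not from the repository) of the
# masked_tokens indexing used in the LM head above: a boolean [batch, seq_len]
# mask selects only the positions that need to be predicted, so the expensive
# vocabulary projection runs over num_masked rows instead of batch * seq_len.
import torch

features = torch.randn(2, 8, 16)                     # [batch, seq_len, hidden]
masked_tokens = torch.zeros(2, 8, dtype=torch.bool)  # True at masked positions
masked_tokens[0, 3] = True
masked_tokens[1, 5] = True

selected = features[masked_tokens, :]                # -> [num_masked, hidden]
assert selected.shape == (2, 16)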