Example #1
    def output_layer(self, features, **kwargs):
        """Project features to the vocabulary size."""
        features = copy_to_model_parallel_region(features)

        # project back to size of vocabulary
        if self.share_input_output_embed:
            x = F.linear(features, self.embed_tokens.weight)
        else:
            x = F.linear(features, self.embed_out)

        # the vocab-parallel cross-entropy criterion consumes the sharded logits directly,
        # so gathering across model-parallel ranks is only needed for other criteria
        if getattr(self.args, 'criterion') != 'vocab_parallel_cross_entropy':
            x = gather_from_model_parallel_region(x).contiguous()
        return x
Example #2
    def output_layer(self, features, **kwargs):
        """Project features to the vocabulary size."""
        if not self.share_input_output_embed:
            raise NotImplementedError(
                "Model parallel training currently requires --share-decoder-input-output-embed"
            )

        features = copy_to_model_parallel_region(features)

        # project back to size of vocabulary
        x = self.output_projection(features)

        if getattr(self.args, "criterion") != "vocab_parallel_cross_entropy":
            x = gather_from_model_parallel_region(x).contiguous()
        return x
Example #3
    def forward(self, features, masked_tokens=None, **kwargs):
        # Only project the masked tokens while training,
        # which saves both memory and computation
        if masked_tokens is not None:
            features = features[masked_tokens, :]

        # LM head transform before the output projection: dense -> activation -> layer norm
        x = self.dense(features)
        x = self.activation_fn(x)
        x = self.layer_norm(x)

        x = copy_to_model_parallel_region(x)
        # project back to the vocabulary using the vocab-sharded weight;
        # each model-parallel rank produces only its slice of the logits
        x = F.linear(x, self.weight)
        # gather the logit slices from all ranks, then add the full-size bias
        x = gather_from_model_parallel_region(x).contiguous()
        x = x + self.bias
        return x
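
All three examples follow the same Megatron-style pattern: copy the features into the model-parallel region, project them with a vocabulary-sharded weight so each rank produces only a slice of the logits, and gather the slices unless the criterion can consume them sharded. The sketch below is a minimal single-process illustration of that pattern only; sharded_output_layer, weight_shards, and the torch.cat stand-in for gather_from_model_parallel_region are hypothetical and not part of fairseq or Megatron.

import torch
import torch.nn.functional as F

def sharded_output_layer(features, weight_shards, bias):
    # each shard holds one slice of the vocabulary: [vocab_size // num_shards, embed_dim]
    partial_logits = [F.linear(features, w) for w in weight_shards]
    # stand-in for gather_from_model_parallel_region: concatenate the slices
    # along the vocabulary dimension to recover full-size logits
    logits = torch.cat(partial_logits, dim=-1).contiguous()
    # as in Example #3, the bias is kept full-size and added after the gather
    return logits + bias

# toy usage: embedding dim 4, vocabulary of 8 split across 2 "ranks"
features = torch.randn(3, 4)
weight_shards = [torch.randn(4, 4), torch.randn(4, 4)]
bias = torch.zeros(8)
print(sharded_output_layer(features, weight_shards, bias).shape)  # torch.Size([3, 8])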