Example #1
File: word2vec.py  Project: lei1993/HanLP
    def module(self, vocabs: VocabDict, **kwargs) -> Optional[nn.Module]:
        vocab = vocabs[self.field]
        num_tokens_in_trn = len(vocab)
        embed = build_word2vec_with_vocab(self.embed,
                                          vocab,
                                          self.extend_vocab,
                                          self.unk,
                                          self.lowercase,
                                          self.trainable,
                                          normalize=self.normalize)
        if self.word_dropout:
            assert vocab.unk_token, f'unk_token of vocab {self.field} has to be set in order to ' \
                                    f'make use of word_dropout'
            padding = []
            if vocab.pad_token:
                padding.append(vocab.pad_idx)
            word_dropout = WordDropout(self.word_dropout,
                                       vocab.unk_idx,
                                       exclude_tokens=padding)
        else:
            word_dropout = None
        return Word2VecEmbeddingModule(self.field,
                                       embed,
                                       word_dropout=word_dropout,
                                       cpu=self.cpu,
                                       second_channel=self.second_channel,
                                       num_tokens_in_trn=num_tokens_in_trn,
                                       unk_idx=vocab.unk_idx)
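
Both this snippet and the next one delegate the actual corruption of input ids to ``WordDropout``. Below is a minimal, self-contained sketch of that idea (an illustrative re-implementation, not HanLP's class): during training each token id is replaced by the OOV/UNK id with probability ``p``, while ids listed in ``exclude_tokens`` (e.g. padding) are never touched.

import torch
import torch.nn as nn


class WordDropoutSketch(nn.Module):
    """Illustrative word dropout: randomly replace token ids with an OOV id while training."""

    def __init__(self, p: float, oov_token: int, exclude_tokens=()):
        super().__init__()
        self.p = p
        self.oov_token = oov_token
        self.exclude_tokens = list(exclude_tokens)

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        if not self.training or self.p <= 0:
            return token_ids
        # Bernoulli mask: True where a token should be replaced by the OOV id.
        drop = torch.rand_like(token_ids, dtype=torch.float) < self.p
        for excluded in self.exclude_tokens:  # never corrupt padding or other special ids
            drop &= token_ids != excluded
        return token_ids.masked_fill(drop, self.oov_token)


word_dropout = WordDropoutSketch(p=0.1, oov_token=1, exclude_tokens=[0])  # assumed ids: 0 = pad, 1 = unk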
Example #2
    def __init__(self,
                 transformer: Union[PreTrainedModel, str],
                 transformer_tokenizer: PreTrainedTokenizer,
                 average_subwords=False,
                 scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
                 word_dropout=None,
                 max_sequence_length=None,
                 ret_raw_hidden_states=False,
                 transformer_args: Dict[str, Any] = None,
                 trainable: Union[bool, Tuple[int, int]] = True,
                 training=True) -> None:
        """A pre-trained transformer encoder.

        Args:
            transformer: A ``PreTrainedModel`` or an identifier of a ``PreTrainedModel``.
            transformer_tokenizer: A ``PreTrainedTokenizer``.
            average_subwords: ``True`` to average subword representations.
            scalar_mix: Layer attention, i.e., a learned scalar mixture over the transformer's hidden layers.
            word_dropout: The probability of randomly replacing a subword with the mask token, or a
                ``(rate, replacement)`` pair.
            max_sequence_length: The maximum sequence length. Sequences longer than this are handled with a sliding
                window. If ``None``, the ``max_position_embeddings`` of the transformer will be used.
            ret_raw_hidden_states: ``True`` to return hidden states of each layer.
            transformer_args: Extra arguments passed to the transformer.
            trainable: ``False`` to freeze the transformer and use static embeddings; a ``(start, end)`` tuple to
                keep only layers in that index range trainable.
            training: ``False`` to skip loading weights from pre-trained transformers.
        """
        super().__init__()
        self.ret_raw_hidden_states = ret_raw_hidden_states
        self.average_subwords = average_subwords
        if word_dropout:
            oov = transformer_tokenizer.mask_token_id
            if isinstance(word_dropout, Sequence):
                word_dropout, replacement = word_dropout
                if replacement == 'unk':
                    # Electra English has to use unk
                    oov = transformer_tokenizer.unk_token_id
                elif replacement == 'mask':
                    # UDify uses [MASK]
                    oov = transformer_tokenizer.mask_token_id
                else:
                    oov = replacement
            pad = transformer_tokenizer.pad_token_id
            cls = transformer_tokenizer.cls_token_id
            sep = transformer_tokenizer.sep_token_id
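            # Never corrupt padding or the [CLS]/[SEP] boundary tokens.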
            excludes = [pad, cls, sep]
            self.word_dropout = WordDropout(p=word_dropout, oov_token=oov, exclude_tokens=excludes)
        else:
            self.word_dropout = None
        if isinstance(transformer, str):
            output_hidden_states = scalar_mix is not None
            if transformer_args is None:
                transformer_args = dict()
            transformer_args['output_hidden_states'] = output_hidden_states
            transformer = AutoModel_.from_pretrained(transformer, training=training or not trainable,
                                                     **transformer_args)
            if max_sequence_length is None:
                max_sequence_length = transformer.config.max_position_embeddings
        self.max_sequence_length = max_sequence_length
        if hasattr(transformer, 'encoder') and hasattr(transformer, 'decoder'):
            # For seq2seq model, use its encoder
            transformer = transformer.encoder
        self.transformer = transformer
        if not trainable:
            transformer.requires_grad_(False)
        elif isinstance(trainable, tuple):
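            # A (start, end) tuple keeps only layers with index in [start, end) trainable,
            # counting the embedding layer (when present) as index 0.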
            layers = []
            if hasattr(transformer, 'embeddings'):
                layers.append(transformer.embeddings)
            layers.extend(transformer.encoder.layer)
            for i, layer in enumerate(layers):
                if i < trainable[0] or i >= trainable[1]:
                    layer.requires_grad_(False)

        if isinstance(scalar_mix, ScalarMixWithDropoutBuilder):
            self.scalar_mix: ScalarMixWithDropout = scalar_mix.build()
        else:
            self.scalar_mix = None
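
For orientation, a constructor with this signature could be driven roughly as follows, assuming this ``__init__`` belongs to the ``TransformerEncoder`` class used in Example #3. The argument names come from the snippet above; the checkpoint name, the use of HuggingFace's ``AutoTokenizer`` and the concrete values are illustrative assumptions.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # assumed checkpoint
encoder = TransformerEncoder(
    'bert-base-uncased',         # a string is resolved via AutoModel_.from_pretrained in __init__
    tokenizer,
    average_subwords=True,       # average subword pieces back into word-level vectors
    word_dropout=(0.1, 'mask'),  # replace 10% of subwords with the mask token during training
    trainable=(6, 12),           # keep only layers with index in [6, 12) trainable
)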
Example #3
    def __init__(
        self,
        config,
        pretrained_embed: torch.Tensor = None,
        transformer: PreTrainedModel = None,
        transformer_tokenizer: PreTrainedTokenizer = None,
    ):
        super(EncoderWithContextualLayer, self).__init__()

        self.secondary_encoder = config.get('secondary_encoder', None)
        self.config = config

        if not transformer:
            self.pad_index = config.pad_index
            self.unk_index = config.unk_index
            if config.word_dropout:
                oov = self.unk_index
                excludes = [self.pad_index]
                self.word_dropout = WordDropout(p=config.word_dropout,
                                                oov_token=oov,
                                                exclude_tokens=excludes)
            else:
                self.word_dropout = None
        if transformer:
            input_size = 0
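            # With a transformer learning rate set, the fine-tuned transformer's hidden states are used
            # directly; otherwise its outputs are fed into a BiLSTM, so hidden_size becomes 2 * n_lstm_hidden.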
            if self.config.transformer_lr:
                hidden_size = transformer.config.hidden_size
            else:
                input_size = transformer.config.hidden_size
                hidden_size = config.n_lstm_hidden * 2
            if config.feat == 'pos':
                self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
                                               embedding_dim=config.n_embed)
                self.embed_dropout = IndependentDropout(p=config.embed_dropout)
                if self.config.transformer_lr:
                    hidden_size += config.n_embed
                else:
                    input_size += config.n_embed
            if not self.config.transformer_lr:
                self.lstm = VariationalLSTM(input_size=input_size,
                                            hidden_size=config.n_lstm_hidden,
                                            num_layers=config.n_lstm_layers,
                                            dropout=config.hidden_dropout,
                                            bidirectional=True)
        else:
            # the embedding layer
            input_size = config.n_embed
            self.word_embed = nn.Embedding(num_embeddings=config.n_words,
                                           embedding_dim=config.n_embed)
            if pretrained_embed is not None:
                if not isinstance(pretrained_embed, torch.Tensor):
                    pretrained_embed = torch.Tensor(pretrained_embed)
                self.pretrained = nn.Embedding.from_pretrained(
                    pretrained_embed)
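                # Zero-init so the pretrained vectors (frozen by from_pretrained's default) dominate
                # at the start and word_embed only learns a correction on top of them.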
                nn.init.zeros_(self.word_embed.weight)
            if config.feat == 'pos':
                self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
                                               embedding_dim=config.n_embed)
                self.embed_dropout = IndependentDropout(p=config.embed_dropout)
                input_size += config.n_embed

            # the word-lstm layer
            hidden_size = config.n_lstm_hidden * 2
            self.lstm = VariationalLSTM(input_size=input_size,
                                        hidden_size=config.n_lstm_hidden,
                                        num_layers=config.n_lstm_layers,
                                        dropout=config.hidden_dropout,
                                        bidirectional=True)
        self.hidden_size = hidden_size
        self.hidden_dropout = SharedDropout(p=config.hidden_dropout)
        if transformer:
            transformer = TransformerEncoder(
                transformer,
                transformer_tokenizer,
                config.average_subwords,
                word_dropout=config.word_dropout,
                max_sequence_length=config.max_sequence_length)
        self.transformer = transformer
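
The static-embedding branch above reads quite a few fields from ``config``, which must behave both like a dict (``config.get``) and like an object with attributes. Below is a minimal sketch of driving that branch without a transformer, assuming the snippet's dependencies (``WordDropout``, ``VariationalLSTM``, ``SharedDropout``) are importable; the ``AttrDict`` helper and every value shown are hypothetical.

class AttrDict(dict):
    """Hypothetical stand-in for the config object: a dict that also supports attribute access."""

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as e:
            raise AttributeError(key) from e


config = AttrDict(
    secondary_encoder=None,
    pad_index=0,
    unk_index=1,
    word_dropout=0.2,      # probability of replacing a word id with unk_index
    n_words=10000,         # vocabulary size
    n_embed=100,           # word (and POS) embedding size
    feat=None,             # set to 'pos' to add a POS-tag embedding (then n_feats is required too)
    n_lstm_hidden=400,
    n_lstm_layers=3,
    embed_dropout=0.33,
    hidden_dropout=0.33,
)

encoder = EncoderWithContextualLayer(config)  # no transformer: word embedding + BiLSTM path
assert encoder.hidden_size == 2 * config.n_lstm_hidden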