def __init__(self,
             config,
             transformer: PreTrainedModel = None,
             transformer_tokenizer: PreTrainedTokenizer = None) -> None:
    super().__init__()
    self.encoder = TransformerEncoder(transformer,
                                      transformer_tokenizer,
                                      config.average_subwords,
                                      config.scalar_mix,
                                      None,  # No word_dropout since SA is predicting masked tokens
                                      config.transformer_hidden_dropout,
                                      config.layer_dropout,
                                      config.max_sequence_length)
    hidden_size = transformer.config.hidden_size
    self.sa = StructuralAttentionLayer(hidden_size,
                                       config.n_mlp_arc,
                                       config.n_mlp_rel,
                                       config.mlp_dropout,
                                       config.n_rels,
                                       config.projection)
    if config.projection:
        hidden_size = config.projection
    self.mlm = nn.Linear(hidden_size, transformer_tokenizer.vocab_size)
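
# Illustrative sketch (not part of the original source): how the `self.mlm` head built above
# could score masked subwords. The forward pass producing `hidden` and the tensor names
# (`hidden`, `masked_labels`) are hypothetical placeholders; only the projection to the
# tokenizer vocabulary and the masked cross-entropy follow from the code above.
import torch
import torch.nn as nn
import torch.nn.functional as F


def mlm_loss_sketch(mlm: nn.Linear, hidden: torch.Tensor, masked_labels: torch.Tensor) -> torch.Tensor:
    # hidden: [batch, seq_len, hidden_size] output of the (optionally projected) SA layer
    # masked_labels: [batch, seq_len] holding original subword ids at masked positions, -100 elsewhere
    logits = mlm(hidden)  # [batch, seq_len, vocab_size]
    return F.cross_entropy(logits.view(-1, logits.size(-1)), masked_labels.view(-1), ignore_index=-100)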
def build_transformer(self, training=True):
    transformer = TransformerEncoder(self.config.transformer,
                                     self.transformer_tokenizer,
                                     self.config.average_subwords,
                                     self.config.scalar_mix,
                                     self.config.word_dropout,
                                     self.config.max_seq_len,
                                     self.config.ret_raw_hidden_states,
                                     training=training)
    transformer_layers = self.config.get('transformer_layers', None)
    if transformer_layers:
        # Drop the top `transformer_layers` encoder layers of the backbone
        transformer.transformer.encoder.layer = transformer.transformer.encoder.layer[:-transformer_layers]
    return transformer
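
# Illustrative sketch (not part of the original source): the `transformer_layers` option above
# prunes the top-most encoder layers. On a vanilla Hugging Face BERT backbone (assumed here,
# since the actual backbone is configurable), the same pruning looks like this:
from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')
drop_top = 2  # hypothetical value of config.transformer_layers
bert.encoder.layer = bert.encoder.layer[:-drop_top]
bert.config.num_hidden_layers -= drop_top  # keep the config consistent with the pruned stack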
def __init__(self,
             config,
             pretrained_embed: torch.Tensor = None,
             transformer: PreTrainedModel = None,
             transformer_tokenizer: PreTrainedTokenizer = None):
    super(EncoderWithContextualLayer, self).__init__()
    self.secondary_encoder = config.get('secondary_encoder', None)
    self.config = config
    if not transformer:
        self.pad_index = config.pad_index
        self.unk_index = config.unk_index
        if config.word_dropout:
            oov = self.unk_index
            excludes = [self.pad_index]
            self.word_dropout = WordDropout(p=config.word_dropout, oov_token=oov, exclude_tokens=excludes)
        else:
            self.word_dropout = None
    if transformer:
        input_size = 0
        if self.config.transformer_lr:
            hidden_size = transformer.config.hidden_size
        else:
            input_size = transformer.config.hidden_size
            hidden_size = config.n_lstm_hidden * 2
        if config.feat == 'pos':
            self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
                                           embedding_dim=config.n_embed)
            self.embed_dropout = IndependentDropout(p=config.embed_dropout)
            if self.config.transformer_lr:
                hidden_size += config.n_embed
            else:
                input_size += config.n_embed
        if not self.config.transformer_lr:
            self.lstm = VariationalLSTM(input_size=input_size,
                                        hidden_size=config.n_lstm_hidden,
                                        num_layers=config.n_lstm_layers,
                                        dropout=config.hidden_dropout,
                                        bidirectional=True)
    else:
        # the embedding layer
        input_size = config.n_embed
        self.word_embed = nn.Embedding(num_embeddings=config.n_words,
                                       embedding_dim=config.n_embed)
        if pretrained_embed is not None:
            if not isinstance(pretrained_embed, torch.Tensor):
                pretrained_embed = torch.Tensor(pretrained_embed)
            self.pretrained = nn.Embedding.from_pretrained(pretrained_embed)
            nn.init.zeros_(self.word_embed.weight)
        if config.feat == 'pos':
            self.feat_embed = nn.Embedding(num_embeddings=config.n_feats,
                                           embedding_dim=config.n_embed)
            self.embed_dropout = IndependentDropout(p=config.embed_dropout)
            input_size += config.n_embed
        # the word-lstm layer
        hidden_size = config.n_lstm_hidden * 2
        self.lstm = VariationalLSTM(input_size=input_size,
                                    hidden_size=config.n_lstm_hidden,
                                    num_layers=config.n_lstm_layers,
                                    dropout=config.hidden_dropout,
                                    bidirectional=True)
    self.hidden_size = hidden_size
    self.hidden_dropout = SharedDropout(p=config.hidden_dropout)
    if transformer:
        transformer = TransformerEncoder(transformer,
                                         transformer_tokenizer,
                                         config.average_subwords,
                                         word_dropout=config.word_dropout,
                                         max_sequence_length=config.max_sequence_length)
    self.transformer = transformer
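
# Illustrative sketch (not part of the original source): the branching above determines the
# encoder output width. This helper only summarizes that bookkeeping for a few common
# configurations; the function and argument names are hypothetical.
def encoder_hidden_size(use_transformer: bool, finetune_transformer: bool,
                        transformer_hidden: int, n_lstm_hidden: int,
                        n_embed: int, use_pos_feat: bool) -> int:
    if use_transformer and finetune_transformer:
        # transformer_lr is set: the transformer output (plus an optional POS embedding) is used directly
        return transformer_hidden + (n_embed if use_pos_feat else 0)
    # otherwise a bidirectional VariationalLSTM runs on top, so the width is 2 * n_lstm_hidden
    return n_lstm_hidden * 2


assert encoder_hidden_size(True, True, 768, 400, 100, True) == 868
assert encoder_hidden_size(True, False, 768, 400, 100, False) == 800
assert encoder_hidden_size(False, False, 0, 400, 100, True) == 800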
def build_model(self, training=True, **kwargs) -> torch.nn.Module:
    transformer = TransformerEncoder.build_transformer(config=self.config, training=training)
    model = StructuralAttentionModel(self.config, transformer, self.transformer_tokenizer)
    return model
def __init__(self,
             field: str,
             transformer: str,
             average_subwords=False,
             scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
             word_dropout: Optional[Union[float, Tuple[float, str]]] = None,
             max_sequence_length=None,
             truncate_long_sequences=False,
             cls_is_bos=False,
             sep_is_eos=False,
             ret_token_span=True,
             ret_subtokens=False,
             ret_subtokens_group=False,
             ret_prefix_mask=False,
             ret_raw_hidden_states=False,
             transformer_args: Dict[str, Any] = None,
             use_fast=True,
             do_basic_tokenize=True,
             trainable=True) -> None:
    """A contextual word embedding builder which builds a
    :class:`~hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule` and a
    :class:`~hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer`.

    Args:
        field: The field to work on. Usually some token fields.
        transformer: An identifier of a ``PreTrainedModel``.
        average_subwords: ``True`` to average subword representations.
        scalar_mix: Layer attention.
        word_dropout: The probability of randomly replacing a subword with [MASK].
        max_sequence_length: The maximum sequence length. Sequences longer than this will be handled by a
            sliding window.
        truncate_long_sequences: ``True`` to truncate sequences longer than ``max_sequence_length``; ``False``
            to split them with a sliding window.
        cls_is_bos: ``True`` means the first token of the input is treated as [CLS] no matter what its surface
            form is. ``False`` (default) means the first token is not [CLS]; it will have its own embedding
            other than the embedding of [CLS].
        sep_is_eos: ``True`` means the last token of the input is [SEP]. ``False`` means it is not, but [SEP]
            will be appended. ``None`` means it depends on ``input[-1] == [EOS]``.
        ret_token_span: ``True`` to return the span of each token measured by subtoken offsets.
        ret_subtokens: ``True`` to return the list of subtokens belonging to each token.
        ret_subtokens_group: ``True`` to return the list of offsets of subtokens belonging to each token.
        ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
        ret_raw_hidden_states: ``True`` to return hidden states of each layer.
        transformer_args: Extra arguments passed to the transformer.
        use_fast: Whether or not to try to load the fast version of the tokenizer.
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        trainable: ``False`` to use static embeddings.
    """
    super().__init__()
    self.truncate_long_sequences = truncate_long_sequences
    self.transformer_args = transformer_args
    self.trainable = trainable
    self.ret_subtokens_group = ret_subtokens_group
    self.ret_subtokens = ret_subtokens
    self.ret_raw_hidden_states = ret_raw_hidden_states
    self.sep_is_eos = sep_is_eos
    self.cls_is_bos = cls_is_bos
    self.max_sequence_length = max_sequence_length
    self.word_dropout = word_dropout
    self.scalar_mix = scalar_mix
    self.average_subwords = average_subwords
    self.transformer = transformer
    self.field = field
    self._transformer_tokenizer = TransformerEncoder.build_transformer_tokenizer(
        self.transformer,
        use_fast=use_fast,
        do_basic_tokenize=do_basic_tokenize)
    self._tokenizer_transform = TransformerSequenceTokenizer(
        self._transformer_tokenizer,
        field,
        truncate_long_sequences=truncate_long_sequences,
        ret_prefix_mask=ret_prefix_mask,
        ret_token_span=ret_token_span,
        cls_is_bos=cls_is_bos,
        sep_is_eos=sep_is_eos,
        ret_subtokens=ret_subtokens,
        ret_subtokens_group=ret_subtokens_group,
        max_seq_length=self.max_sequence_length)
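
# Illustrative sketch (not part of the original source): how a builder with this constructor
# is typically instantiated. The class name `ContextualWordEmbedding` and its import path are
# assumptions inferred from the docstring above; the keyword arguments are the documented ones.
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding

embed = ContextualWordEmbedding(field='token',
                                transformer='bert-base-uncased',
                                average_subwords=True,
                                max_sequence_length=512,
                                word_dropout=0.1)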