Example #1
    def __init__(self,
                 bert_path: Path,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary,
                 hidden_dim: int = 100,
                 encoder_dropout: float = 0.0,
                 train_bert: bool = False) -> None:
        # We have to pass the vocabulary to the constructor.
        super().__init__(vocab)
        self.word_embeddings = bert_embeddings(pretrained_model=bert_path,
                                               training=train_bert)

        self.encoder_dropout: torch.nn.Module
        if encoder_dropout > 0:
            self.encoder_dropout = torch.nn.Dropout(p=encoder_dropout)
        else:
            self.encoder_dropout = torch.nn.Identity()

        self.pooler = BertPooler(pretrained_model=str(bert_path))
        self.dense1 = torch.nn.Linear(in_features=self.pooler.get_output_dim(),
                                      out_features=hidden_dim)
        self.encoder = encoder
        self.self_attn = LinearSelfAttention(
            input_dim=self.encoder.get_output_dim(), bias=True)
        self.dense2 = torch.nn.Linear(
            in_features=self.encoder.get_output_dim(), out_features=1)
Example #2
    def __init__(self, args, word_embedder):
        super(Pooler_for_mention, self).__init__()
        self.args = args
        self.huggingface_nameloader()
        self.bertpooler_sec2vec = BertPooler(pretrained_model=self.bert_weight_filepath)
        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(self.args.word_embedding_dropout)

        self.linear_for_mention_encoding = nn.Linear(self.bertpooler_sec2vec.get_output_dim(),
                                                     self.bertpooler_sec2vec.get_output_dim())
        self.linear_for_dimentionReduction = nn.Linear(self.bertpooler_sec2vec.get_output_dim(),
                                                       self.args.dimentionReductionToThisDim)
Example #3
    def __init__(
        self,
        vocab: Vocabulary,
        transformer_model: str = "roberta-large",
        override_weights_file: Optional[str] = None,
        override_weights_strip_prefix: Optional[str] = None,
        **kwargs
    ) -> None:
        super().__init__(vocab, **kwargs)

        self._text_field_embedder = PretrainedTransformerEmbedder(
            transformer_model,
            override_weights_file=override_weights_file,
            override_weights_strip_prefix=override_weights_strip_prefix,
        )
        self._text_field_embedder = BasicTextFieldEmbedder(
            {"tokens": self._text_field_embedder})
        self._pooler = BertPooler(
            transformer_model,
            override_weights_file=override_weights_file,
            override_weights_strip_prefix=override_weights_strip_prefix,
            dropout=0.1,
        )

        self._linear_layer = torch.nn.Linear(
            self._text_field_embedder.get_output_dim(), 1)
        self._linear_layer.weight.data.normal_(mean=0.0, std=0.02)
        self._linear_layer.bias.data.zero_()

        self._loss = torch.nn.CrossEntropyLoss()
        self._accuracy = CategoricalAccuracy()
Example #4
def build_model(vocab: Vocabulary, bert_model: str = None) -> Model:
    if bert_model:
        embedder = BasicTextFieldEmbedder({"bert": PretrainedTransformerEmbedder(model_name=bert_model,
                                                                                 train_parameters=True)})
        encoder = BertPooler(pretrained_model=bert_model, requires_grad=True)
    else:
        # (3) How to get vectors for each Token ID:
        # (3.1) embed each token
        token_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("token_vocab"))
        # pretrained_file='https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz'

        # (3.2) embed each character in each token
        character_embedding = Embedding(embedding_dim=3, num_embeddings=vocab.get_vocab_size("character_vocab"))
        cnn_encoder = CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3,])
        token_encoder = TokenCharactersEncoder(character_embedding, cnn_encoder)
        # (3.3) embed the POS of each token
        pos_tag_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("pos_tag_vocab"))

        # Each TokenEmbedder embeds its input, and the results are concatenated in an arbitrary (but consistent) order
        # cf: https://docs.allennlp.org/master/api/modules/text_field_embedders/basic_text_field_embedder/
        embedder = BasicTextFieldEmbedder(
            token_embedders={"tokens": token_embedding,
                             "token_characters": token_encoder,
                             "pos_tags": pos_tag_embedding}
        )  # emb_dim = 10 + 4 + 10 = 24 (see the dimension check after this function)
        encoder = BagOfEmbeddingsEncoder(embedding_dim=24, averaged=True)
        #                                                  ^
        # average the embeddings across time, rather than simply summing
        # (i.e., we divide the summed embeddings by the length of the sentence).
    return SimpleClassifier(vocab, embedder, encoder)
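
A quick way to see why the bag-of-embeddings encoder above uses embedding_dim=24 is to check the output dimension reported by BasicTextFieldEmbedder, which is the sum of its token embedders' dimensions. A minimal sketch (assuming allennlp is installed; the toy vocabulary contents are made up purely to size the embeddings):

from allennlp.data import Vocabulary
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, TokenCharactersEncoder

# Toy vocabulary with the three namespaces used above (contents are arbitrary).
vocab = Vocabulary()
vocab.add_tokens_to_namespace(["the", "cat", "sat"], "token_vocab")
vocab.add_tokens_to_namespace(list("thecats"), "character_vocab")
vocab.add_tokens_to_namespace(["DET", "NOUN", "VERB"], "pos_tag_vocab")

token_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("token_vocab"))
character_embedding = Embedding(embedding_dim=3, num_embeddings=vocab.get_vocab_size("character_vocab"))
token_encoder = TokenCharactersEncoder(
    character_embedding, CnnEncoder(embedding_dim=3, num_filters=4, ngram_filter_sizes=[3]))
pos_tag_embedding = Embedding(embedding_dim=10, num_embeddings=vocab.get_vocab_size("pos_tag_vocab"))

embedder = BasicTextFieldEmbedder({"tokens": token_embedding,
                                   "token_characters": token_encoder,
                                   "pos_tags": pos_tag_embedding})
print(embedder.get_output_dim())  # 10 + (4 filters * 1 ngram size) + 10 = 24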
Example #5
def build_adversarial_transformer_model(vocab: Vocabulary, transformer_model: str) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    embedding = PretrainedTransformerEmbedder(model_name=transformer_model)
    embedder = BasicTextFieldEmbedder(token_embedders={'bert_tokens': embedding})
    encoder = BertPooler(transformer_model)
    return SimpleClassifier(vocab, embedder, encoder)
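
The transformer branch used by these builders follows the same pattern throughout this page: a PretrainedTransformerEmbedder (wrapped in a BasicTextFieldEmbedder) produces one vector per wordpiece, and BertPooler collapses them into a single vector per sequence. A minimal end-to-end sketch (assuming allennlp is installed and the 'bert-base-uncased' weights can be downloaded; the field and indexer names here are arbitrary):

from allennlp.data import Batch, Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.modules.seq2vec_encoders import BertPooler
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

model_name = "bert-base-uncased"
tokenizer = PretrainedTransformerTokenizer(model_name)
indexers = {"bert_tokens": PretrainedTransformerIndexer(model_name)}

# Tokenize, index and tensorize a single sentence.
field = TextField(tokenizer.tokenize("BertPooler keeps only the [CLS] vector."), indexers)
batch = Batch([Instance({"text": field})])
batch.index_instances(Vocabulary())
tokens = batch.as_tensor_dict()["text"]

embedder = BasicTextFieldEmbedder({"bert_tokens": PretrainedTransformerEmbedder(model_name)})
encoder = BertPooler(model_name)

embedded = embedder(tokens)  # (1, num_wordpieces, 768)
pooled = encoder(embedded)   # (1, 768)
print(pooled.shape)

The pooled vector is what the downstream classifier layer sees, which is why the other examples size their linear layers with encoder.get_output_dim() (768 for BERT base).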
Example #6
    def __init__(self,
                 bert_path: Path,
                 vocab: Vocabulary,
                 train_bert: bool = False
                 ) -> None:
        # We have to pass the vocabulary to the constructor.
        super().__init__(vocab)
        self.word_embeddings = bert_embeddings(pretrained_model=bert_path,
                                               training=train_bert)

        self.pooler = BertPooler(pretrained_model=str(bert_path))

        hidden_dim = self.pooler.get_output_dim()
        self.hidden2logit = torch.nn.Linear(
            in_features=hidden_dim,
            out_features=1
        )
Example #7
class Pooler_for_title_and_desc(Seq2VecEncoder):
    def __init__(self, args, word_embedder):
        super(Pooler_for_title_and_desc, self).__init__()
        self.args = args
        self.huggingface_nameloader()
        self.bertpooler_sec2vec = BertPooler(
            pretrained_model=self.bert_weight_filepath)
        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(
            self.args.word_embedding_dropout)

        self.linear_for_entity_encoding = nn.Linear(
            self.bertpooler_sec2vec.get_output_dim(),
            self.bertpooler_sec2vec.get_output_dim())

        self.linear_for_dimentionReduction = nn.Linear(
            self.bertpooler_sec2vec.get_output_dim(),
            self.args.dimentionReductionToThisDim)

    def huggingface_nameloader(self):
        if self.args.bert_name == 'bert-base-uncased':
            self.bert_weight_filepath = 'bert-base-uncased'
        else:
            self.bert_weight_filepath = 'dummy'
            print('Currently not supported', self.args.bert_name)
            exit()

    def forward(self, title_and_desc_concatnated_text):
        mask_sent = get_text_field_mask(title_and_desc_concatnated_text)
        entity_emb = self.word_embedder(title_and_desc_concatnated_text)
        entity_emb = self.word_embedding_dropout(entity_emb)

        if self.args.entityPooling == "CLSLinear":
            entity_emb = entity_emb[:, 0, :]
            entity_emb = self.linear_for_entity_encoding(entity_emb)
        elif self.args.entityPooling == 'CLS':
            entity_emb = entity_emb[:, 0, :]
        else:
            assert self.args.entityPooling == "CLSLinearTanh"
            entity_emb = self.bertpooler_sec2vec(entity_emb, mask_sent)

        if self.args.dimentionReduction:
            return self.linear_for_dimentionReduction(entity_emb)
        else:
            return entity_emb
Example #8
    def __init__(self, args, word_embedder):
        super(Pooler_for_blink_mention, self).__init__()
        self.args = args
        self.huggingface_nameloader()
        self.bertpooler_sec2vec = BertPooler(
            pretrained_model=self.bert_weight_filepath)
        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(
            self.args.word_embedding_dropout)
Example #9
    def __init__(self, args, input_dim, word_embedder):
        super(Concatenate_Right_and_Left_MentionEncoder, self).__init__()
        self.config = args
        self.args = args
        self.input_dim = input_dim

        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(
            self.args.word_embedding_dropout)
        self.ff_seq2vecs = nn.Linear(input_dim * 4, input_dim)
        self.huggingface_nameloader()
        self.bertpooler_sec2vec = BertPooler(
            pretrained_model=self.bert_weight_filepath)
Example #10
    def __init__(
        self,
        word_embedding_dropout: float = 0.05,
        bert_model_name: str = 'japanese_bert',
        word_embedder: BasicTextFieldEmbedder = BasicTextFieldEmbedder({
            'tokens':
            PretrainedTransformerEmbedder(
                model_name='cl-tohoku/bert-base-japanese')
        })):
        super(BertPoolerForMention, self).__init__()
        self.bert_model_name = bert_model_name
        self.huggingface_nameloader()
        self.bertpooler_sec2vec = BertPooler(
            pretrained_model=self.bert_weight_filepath)
        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(word_embedding_dropout)
Example #11
    def test_encoder(self):
        encoder = BertPooler("bert-base-uncased")
        assert encoder.get_input_dim() == encoder.get_output_dim()
        embedding = torch.rand(8, 24, encoder.get_input_dim())

        pooled1 = encoder(embedding)
        assert pooled1.size() == (8, encoder.get_input_dim())

        # BertPooler only looks at the first ([CLS]) position, so zeroing out
        # every other position should not change the pooled output.
        embedding[:, 1:, :] = 0
        pooled2 = encoder(embedding)
        numpy.testing.assert_array_almost_equal(pooled1.detach().numpy(),
                                                pooled2.detach().numpy())
Example #12
    def __init__(self,
                 vocab,
                 pretrained_model: str = "bert-base-uncased",
                 requires_grad: bool = True):
        super(ChatClassification, self).__init__()
        self.vocab = vocab
        self.turn_pooler = BertPooler(pretrained_model,
                                      requires_grad,
                                      dropout=0.0)
        #self.turn_pooler =
        self.chat_encoder = StackedBidirectionalLstm(
            hidden_size=400,
            input_size=768,
            num_layers=1,
            recurrent_dropout_probability=0.3,
            use_highway=True)
        self.classif_layer = torch.nn.Linear(
            in_features=self.chat_encoder.hidden_size, out_features=2)
        self.accuracy = CategoricalAccuracy()
Example #13
def build_model_Transformer(vocab: Vocabulary, use_reg: bool = True) -> Model:
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    # Turn the tokens into contextual embeddings, then pool them into a single
    # encoding per sequence (e.g. BertPooler("bert-base-cased")).
    embedder = PretrainedTransformerEmbedder(BERT_MODEL_NAME)
    encoder = BertPooler(BERT_MODEL_NAME)

    # Construct the regularizer applicator, if requested.
    regularizer_applicator = None
    if use_reg:
        l2_reg = L2Regularizer()
        regexes = [("embedder", l2_reg), ("encoder", l2_reg),
                   ("classifier", l2_reg)]
        regularizer_applicator = RegularizerApplicator(regexes)

    return MortalityClassifier(vocab, embedder, encoder,
                               regularizer_applicator)
Example #14
def train_bert(train_dataset, validation_dataset, batch_size, pretrained_model, double_input=False, dense_vector=False,
               col_name=None, epochs=100, patience=None, learning_rate=3e-4, num_classes=2, use_gpu=False):
    """
    Trains BERT on train_dataset; with optional early stopping on validation_dataset.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    validation_dataset: List[Instance]
        Instances for validation set
    batch_size: int
        number of Instances to process in a batch
    pretrained_model: str
        pretrained BERT model to use
    double_input: bool
        True to run DoubleInput classifier | False (default) for SingleInput classifier
    dense_vector: bool
        True to concatenate dense feature vector before feeding to the FeedForward layer
    col_name: str
        'reply_text' or 'question' (for calculating dense feature vector) | Only applicable when dense_vector is True
    epochs: int
        total number of epochs to train for (default=100)
    patience: int or None
        early stopping: number of epochs to wait for the validation loss to improve. If None, early stopping is disabled and the model is trained on the combined train+validation set (default=None)
    learning_rate: float
        learning rate for the Adam optimizer (default=3e-4)
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    vocab = Vocabulary()

    if double_input: # need context_tokens as well
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens"),
                                                ("context_tokens", "num_tokens")])

    else: # only reply_tokens
        iterator = BucketIterator(batch_size=batch_size,
                                  sorting_keys=[("reply_tokens", "num_tokens")])

    iterator.index_with(vocab) # numericalize the data

    word_embeddings: TextFieldEmbedder = load_bert_embeddings(pretrained_model)
    encoder: Seq2VecEncoder = BertPooler(pretrained_model=pretrained_model,
                                         requires_grad=True)

    if double_input: # consider preceding 'comment_text'
        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = 2 * (encoder.get_output_dim() + DENSE_VECTOR_LEN)
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseDoubleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=encoder,
                                                 context_encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            classifier_feedforward: FeedForward = nn.Linear(2*encoder.get_output_dim(), num_classes)
            model = models.DoubleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 reply_encoder=encoder,
                                                 context_encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)
    else: # only 'reply_text' or 'question'
        if dense_vector: # add length of dense vector to input dimension of Feedforward
            ff_input_dim = encoder.get_output_dim() + DENSE_VECTOR_LEN
            classifier_feedforward: FeedForward = nn.Linear(ff_input_dim, num_classes)
            model = models.DenseSingleClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward,
                                                 col_name=col_name)

        else:
            # Feedforward:
            classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(), num_classes)

            model = models.SingleInputClassifier(vocab=vocab,
                                                 word_embeddings=word_embeddings,
                                                 encoder=encoder,
                                                 classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    if patience is None: # No early stopping: train on the combined train+validation dataset
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset + validation_dataset,
            cuda_device=0 if use_gpu else -1,
            num_epochs=epochs)

    else:
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_dataset,
            validation_dataset=validation_dataset,
            cuda_device=0 if use_gpu else -1,
            patience=patience, # stop if loss does not improve for 'patience' epochs
            num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
Example #15
class SimpleBertClassifier(BaseModel):
    """
    Model that encodes input using BERT, takes the embedding for the CLS
    token (using BertPooler) and puts the output through a FFN to get the
    probabilities.
    """

    def __init__(self,
                 bert_path: Path,
                 vocab: Vocabulary,
                 train_bert: bool = False
                 ) -> None:
        # We have to pass the vocabulary to the constructor.
        super().__init__(vocab)
        self.word_embeddings = bert_embeddings(pretrained_model=bert_path,
                                               training=train_bert)

        self.pooler = BertPooler(pretrained_model=str(bert_path))

        hidden_dim = self.pooler.get_output_dim()
        self.hidden2logit = torch.nn.Linear(
            in_features=hidden_dim,
            out_features=1
        )

    # This is the computation bit of the model. The arguments of this function
    # are the fields from the `Instance` we created, as that's what's going to
    # be passed to this. We also have the optional `label`, which is only
    # available at training time, used to calculate the loss.
    def forward(self,
                metadata: Dict[str, torch.Tensor],
                bert0: Dict[str, torch.Tensor],
                bert1: Dict[str, torch.Tensor],
                label: Optional[torch.Tensor] = None
                ) -> Dict[str, torch.Tensor]:
        # Every sample in a batch has to have the same size (as it's a tensor),
        # so smaller entries are padded. The mask is used to counteract this
        # padding.
        t0_masks = util.get_text_field_mask(bert0)
        t1_masks = util.get_text_field_mask(bert1)

        # We create the embeddings from the input text
        t0_embs = self.word_embeddings(bert0)
        t1_embs = self.word_embeddings(bert1)

        # Then we use those embeddings (along with the masks) as inputs for
        # our encoders
        enc0_outs = self.pooler(t0_embs, t0_masks)
        enc1_outs = self.pooler(t1_embs, t1_masks)

        # Finally, we pass each encoded output tensor to the feedforward layer
        # to produce logits corresponding to each class.
        logit0 = self.hidden2logit(enc0_outs).squeeze(-1)
        logit1 = self.hidden2logit(enc1_outs).squeeze(-1)
        logit0, _ = torch.max(logit0, dim=1)
        logit1, _ = torch.max(logit1, dim=1)
        logits = torch.stack((logit0, logit1), dim=-1)
        # We also compute the class with highest likelihood (our prediction)
        prob = torch.softmax(logits, dim=-1)
        output = {"logits": logits, "prob": prob}

        # Labels are optional. If they're present, we calculate the accuracy
        # and the loss function.
        if label is not None:
            self.accuracy(prob, label)
            output["loss"] = self.loss(logits, label)

        # The output is the dict we've been building, with the logits, loss
        # and the prediction.
        return output
Example #16
class AdvancedAttentionBertClassifier(BaseModel):
    """
    Model similar to the AttentiveClassifier with BERT, but without external
    features.

    SimpleTrian is this with the attention before the encoders.
    """
    def __init__(self,
                 bert_path: Path,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary,
                 hidden_dim: int = 100,
                 encoder_dropout: float = 0.0,
                 train_bert: bool = False) -> None:
        # We have to pass the vocabulary to the constructor.
        super().__init__(vocab)
        self.word_embeddings = bert_embeddings(pretrained_model=bert_path,
                                               training=train_bert)

        self.encoder_dropout: torch.nn.Module
        if encoder_dropout > 0:
            self.encoder_dropout = torch.nn.Dropout(p=encoder_dropout)
        else:
            self.encoder_dropout = torch.nn.Identity()

        self.pooler = BertPooler(pretrained_model=str(bert_path))
        self.dense1 = torch.nn.Linear(in_features=self.pooler.get_output_dim(),
                                      out_features=hidden_dim)
        self.encoder = encoder
        self.self_attn = LinearSelfAttention(
            input_dim=self.encoder.get_output_dim(), bias=True)
        self.dense2 = torch.nn.Linear(
            in_features=self.encoder.get_output_dim(), out_features=1)

    # This is the computation bit of the model. The arguments of this function
    # are the fields from the `Instance` we created, as that's what's going to
    # be passed to this. We also have the optional `label`, which is only
    # available at training time, used to calculate the loss.
    def forward(
            self,
            metadata: Dict[str, torch.Tensor],
            bert0: Dict[str, torch.Tensor],
            bert1: Dict[str, torch.Tensor],
            label: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # This model pools the BERT embeddings without a padding mask
        # (unlike SimpleBertClassifier above), so no masks are computed here.

        # We create the embeddings from the input text
        t0_embs = self.word_embeddings(bert0)
        t1_embs = self.word_embeddings(bert1)

        t0_pooled = self.pooler(t0_embs)
        t1_pooled = self.pooler(t1_embs)

        t0_transformed = self.dense1(t0_pooled)
        t1_transformed = self.dense1(t1_pooled)

        t0_enc_hiddens = self.encoder_dropout(
            self.encoder(t0_transformed, mask=None))
        t1_enc_hiddens = self.encoder_dropout(
            self.encoder(t1_transformed, mask=None))

        t0_enc_attn = self.self_attn(t0_enc_hiddens, t0_enc_hiddens)
        t1_enc_attn = self.self_attn(t1_enc_hiddens, t1_enc_hiddens)

        t0_enc_out = util.weighted_sum(t0_enc_hiddens, t0_enc_attn)
        t1_enc_out = util.weighted_sum(t1_enc_hiddens, t1_enc_attn)

        logit0 = self.dense2(t0_enc_out).squeeze(-1)
        logit1 = self.dense2(t1_enc_out).squeeze(-1)

        logits = torch.stack((logit0, logit1), dim=-1)

        # We also compute the class with highest likelihood (our prediction)
        prob = torch.softmax(logits, dim=-1)
        output = {"logits": logits, "prob": prob}

        # Labels are optional. If they're present, we calculate the accuracy
        # and the loss function.
        if label is not None:
            self.accuracy(prob, label)
            output["loss"] = self.loss(logits, label)

        # The output is the dict we've been building, with the logits, loss
        # and the prediction.
        return output