Example #1
 def test_pass_through_encoder_passes_through(self):
     encoder = PassThroughEncoder(input_dim=9)
     tensor = torch.randn([2, 3, 9])
     output = encoder(tensor)
     numpy.testing.assert_array_almost_equal(
         tensor.detach().cpu().numpy(), output.detach().cpu().numpy()
     )
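The test methods in Examples #1, #3, #6, #18, and #20 are shown without their surrounding test class or imports; they rely on roughly the following (module paths per AllenNLP 1.x/2.x; PercentSaturatedDropout in Examples #3 and #18 is project-specific code, not part of AllenNLP):

import numpy
import torch
from allennlp.modules.seq2seq_encoders import PassThroughEncoder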
Example #2
    def init_model(self) -> Model:
        """build the model

        Args:
            vocab (Vocabulary): the vocabulary of corpus

        Returns:
            Model: the final models
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
        tagger = SimpleTagger(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={
                    'tokens': bert_text_field_embedder
                }
            ),
            encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
            verbose_metrics=True,
            calculate_span_f1=True,
            label_encoding="BMES",
        )
        
        tagger.to(device=self.config.device)
        return tagger
Example #3
 def test_saturated_dropout_trivial_mask(self):
     encoder = PassThroughEncoder(input_dim=1)
     pruner = PercentSaturatedDropout(encoder, percent=0.25)
     mask = torch.ones(1, 1)
     dropped = pruner(INPUTS, mask)
     exp_dropped = torch.tensor([[[0.0, 0.0, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]]])
     torch.testing.assert_allclose(dropped, exp_dropped)
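The saturated-dropout tests (this one and Example #18) refer to a module-level INPUTS constant that is not shown. Judging from Example #18, where percent=0.0 leaves the input unchanged, it is presumably defined along these lines:

# Assumed fixture for the PercentSaturatedDropout tests: a [1, 1, 10] tensor of increasing values.
INPUTS = torch.tensor([[[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]]])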
Example #4
def get_encoder(input_dim, output_dim, encoder_type, args):
    if encoder_type == "pass":
        return PassThroughEncoder(input_dim)
    if encoder_type == "bilstm":
        return PytorchSeq2SeqWrapper(
            AllenNLPSequential(torch.nn.ModuleList(
                [get_encoder(input_dim, output_dim, "bilstm-unwrapped",
                             args)]),
                               input_dim,
                               output_dim,
                               bidirectional=True,
                               residual_connection=args.residual_connection,
                               dropout=args.dropout))
    if encoder_type == "bilstm-unwrapped":
        return torch.nn.LSTM(
            input_dim,
            output_dim,
            batch_first=True,
            bidirectional=True,
            dropout=args.dropout,
        )
    if encoder_type == "self_attention":
        return IntraSentenceAttentionEncoder(input_dim=input_dim,
                                             projection_dim=output_dim)
    if encoder_type == "stacked_self_attention":
        return StackedSelfAttentionEncoder(
            input_dim=input_dim,
            hidden_dim=output_dim,
            projection_dim=output_dim,
            feedforward_hidden_dim=output_dim,
            num_attention_heads=5,
            num_layers=3,
            dropout_prob=args.dropout,
        )
    raise RuntimeError(f"Unknown encoder type={encoder_type}")
Example #5
    def __init__(
        self, embedder: TokenEmbedder, encoder: Seq2SeqEncoder = None, feature_type: str = "entity_start",
    ):
        super().__init__()
        self.embedder = embedder
        self.encoder = encoder or PassThroughEncoder(input_dim=self.embedder.get_output_dim())

        self.feature_type = feature_type
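When no encoder is supplied, the constructor above falls back to a PassThroughEncoder sized to the embedder output, so the module's output width always matches the embedder. A minimal sketch of that behaviour (the class name EntityFeatureExtractor and the transformer name are assumptions, since the enclosing class is not shown):

# Sketch: with encoder=None, self.encoder defaults to a pass-through of the embedder width.
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder
embedder = PretrainedTransformerEmbedder(model_name="bert-base-uncased")
module = EntityFeatureExtractor(embedder=embedder)  # hypothetical name for the class above
assert module.encoder.get_output_dim() == embedder.get_output_dim()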
Example #6
    def test_pass_through_encoder_with_mask(self):
        encoder = PassThroughEncoder(input_dim=9)
        tensor = torch.randn([2, 3, 9])
        mask = torch.BoolTensor([[True, True, True], [True, False, False]])
        output = encoder(tensor, mask)

        target = tensor * mask.unsqueeze(dim=-1).float()
        numpy.testing.assert_array_almost_equal(output.detach().cpu().numpy(),
                                                target.detach().cpu().numpy())
Example #7
 def __init__(self,
              my_device=torch.device('cuda:2'),
              model_name='roberta.hdf5',
              model_path=current_directory_path +
              '/external_pretrained_models/'):
     self.answ = "UNKNOWN ERROR"
     self.model_name = model_name
     self.model_path = model_path
     self.first_object = ''
     self.second_object = ''
     self.predicates = ''
     self.aspects = ''
     cuda_device = my_device
     self.spans = [
     ]  # we can't use a set here: a span is a dict and dicts are unhashable, so add_span() is used to avoid duplicates
     try:
         print(self.model_path + self.model_name)
         print(model_path + "vocab_dir")
         vocab = Vocabulary.from_files(model_path + "vocab_dir")
         BERT_MODEL = 'google/electra-base-discriminator'
         embedder = PretrainedTransformerMismatchedEmbedder(
             model_name=BERT_MODEL)
         text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})
         seq2seq_encoder = PassThroughEncoder(
             input_dim=embedder.get_output_dim())
         print("encoder loaded")
         self.indexer = PretrainedTransformerMismatchedIndexer(
             model_name=BERT_MODEL)
         print("indexer loaded")
         self.model = SimpleTagger(
             text_field_embedder=text_field_embedder,
             vocab=vocab,
             encoder=seq2seq_encoder,
             calculate_span_f1=True,
             label_encoding='IOB1').cuda(device=cuda_device)
         self.model.load_state_dict(
             torch.load(self.model_path + self.model_name))
         print("model loaded")
         self.reader = Conll2003DatasetReader(
             token_indexers={'tokens': self.indexer})
         print("reader loaded")
     except Exception as e:
         print("exception while mapping to GPU in extractor:", e)
         raise RuntimeError(
             "Init extractor: can't map to gpu. Maybe it is OOM")
     try:
         self.predictor = SentenceTaggerPredictor(self.model, self.reader)
     except Exception as e:
         print("exception in creating predictor:", e)
         raise RuntimeError(
             "Init extractor: can't create the predictor")
Example #8
def _build_model(config,
                 vocab,
                 lemmatize_helper,
                 morpho_vectorizer,
                 bert_max_length=None):
    embedder = _load_embedder(config, vocab, bert_max_length)

    input_dim = embedder.get_output_dim()
    if config.embedder.use_pymorphy:
        input_dim += morpho_vectorizer.morpho_vector_dim

    pos_tag_embedding = None
    if config.task.task_type == 'single' and config.task.params['use_pos_tag']:
        pos_tag_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('grammar_value_tags'),
            embedding_dim=config.task.params['pos_embedding_dim'])
        input_dim += config.task.params['pos_embedding_dim']

    encoder = None
    if config.encoder.encoder_type != 'lstm':
        encoder = PassThroughEncoder(input_dim=input_dim)
    elif config.encoder.use_weight_drop:
        encoder = LstmWeightDropSeq2SeqEncoder(
            input_dim,
            config.encoder.hidden_dim,
            num_layers=config.encoder.num_layers,
            bidirectional=True,
            dropout=config.encoder.dropout,
            variational_dropout=config.encoder.variational_dropout)
    else:
        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(input_dim,
                          config.encoder.hidden_dim,
                          num_layers=config.encoder.num_layers,
                          dropout=config.encoder.dropout,
                          bidirectional=True,
                          batch_first=True))

    return DependencyParser(
        vocab=vocab,
        text_field_embedder=embedder,
        encoder=encoder,
        lemmatize_helper=lemmatize_helper,
        task_config=config.task,
        pos_tag_embedding=pos_tag_embedding,
        morpho_vector_dim=morpho_vectorizer.morpho_vector_dim
        if config.embedder.use_pymorphy else 0,
        tag_representation_dim=config.parser.tag_representation_dim,
        arc_representation_dim=config.parser.arc_representation_dim,
        dropout=config.parser.dropout,
        input_dropout=config.embedder.dropout,
        gram_val_representation_dim=config.parser.gram_val_representation_dim,
        lemma_representation_dim=config.parser.lemma_representation_dim)
Example #9
    def __init__(
        self,
        backbone: ModelBackbone,
        labels: List[str],
        tokens_pooler: Optional[Seq2VecEncoderConfiguration] = None,
        sentences_encoder: Optional[Seq2SeqEncoderConfiguration] = None,
        sentences_pooler: Seq2VecEncoderConfiguration = None,
        feedforward: Optional[FeedForwardConfiguration] = None,
        multilabel: bool = False,
    ) -> None:

        super(DocumentClassification, self).__init__(
            backbone, labels=labels, multilabel=multilabel
        )

        self.backbone.encoder = TimeDistributedEncoder(backbone.encoder)

        # layers
        self.tokens_pooler = TimeDistributedEncoder(
            BagOfEmbeddingsEncoder(embedding_dim=self.backbone.encoder.get_output_dim())
            if not tokens_pooler
            else tokens_pooler.input_dim(
                self.backbone.encoder.get_output_dim()
            ).compile()
        )
        self.sentences_encoder = (
            PassThroughEncoder(self.tokens_pooler.get_output_dim())
            if not sentences_encoder
            else sentences_encoder.input_dim(
                self.tokens_pooler.get_output_dim()
            ).compile()
        )
        self.sentences_pooler = (
            BagOfEmbeddingsEncoder(self.sentences_encoder.get_output_dim())
            if not sentences_pooler
            else sentences_pooler.input_dim(
                self.sentences_encoder.get_output_dim()
            ).compile()
        )
        self.feedforward = (
            None
            if not feedforward
            else feedforward.input_dim(self.sentences_pooler.get_output_dim()).compile()
        )

        self._classification_layer = torch.nn.Linear(
            (self.feedforward or self.sentences_pooler).get_output_dim(),
            self.num_labels,
        )
Example #10
    def __init__(
        self,
        vocab: Vocabulary,
        featurizer: InputFeaturizer,
        embedder: TextFieldEmbedder,
        encoder: Optional[Encoder] = None,
    ):
        super(ModelBackbone, self).__init__()

        self.vocab = vocab
        self.featurizer = featurizer
        self.embedder = embedder
        self.encoder = (
            encoder.input_dim(self.embedder.get_output_dim()).compile()
            if encoder
            else PassThroughEncoder(self.embedder.get_output_dim())
        )
Example #11
    def __init__(self,
                 pooler: Seq2VecEncoder,
                 knowledge_encoder: Seq2SeqEncoder = None):
        super().__init__()
        self.pooler = pooler
        pass_thru = PassThroughEncoder(pooler.get_input_dim())

        self.knowledge_encoder = TimeDistributed(
            knowledge_encoder or pass_thru)  # TimeDistributed(context_encoder)

        self.knowledge_attn = DotProductMatrixAttention(
        )  # CosineMatrixAttention()
        # self.attn = DotProductMatrixAttention()

        self.input_dim = pooler.get_input_dim()
        self.output_dim = pooler.get_output_dim()
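The same fallback appears here: if knowledge_encoder is omitted, a PassThroughEncoder sized to the pooler's input dimension is wrapped in TimeDistributed, so dimensions are preserved end to end. A small sketch under that assumption (the class name KnowledgeEnrichedPooler is hypothetical):

# Sketch: the knowledge branch degenerates to a time-distributed pass-through when no encoder is given.
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
pooler = BagOfEmbeddingsEncoder(embedding_dim=300)
module = KnowledgeEnrichedPooler(pooler=pooler)  # hypothetical name for the class above
assert module.input_dim == 300 and module.output_dim == 300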
Example #12
    def __init__(
        self,
        backbone: ModelBackbone,
        labels: List[str],
        token_pooler: Optional[Seq2VecEncoderConfiguration] = None,
        sentence_encoder: Optional[Seq2SeqEncoderConfiguration] = None,
        sentence_pooler: Seq2VecEncoderConfiguration = None,
        feedforward: Optional[FeedForwardConfiguration] = None,
        dropout: float = 0.0,
        multilabel: bool = False,
        label_weights: Optional[Union[List[float], Dict[str, float]]] = None,
    ) -> None:

        super().__init__(
            backbone,
            labels=labels,
            multilabel=multilabel,
            label_weights=label_weights,
        )

        self._empty_prediction = DocumentClassificationPrediction(
            labels=[], probabilities=[])

        self.backbone.encoder = TimeDistributedEncoder(backbone.encoder)

        # layers
        self.token_pooler = TimeDistributedEncoder(
            BagOfEmbeddingsEncoder(embedding_dim=self.backbone.encoder.get_output_dim())
            if not token_pooler
            else token_pooler.input_dim(self.backbone.encoder.get_output_dim()).compile()
        )
        self.sentence_encoder = (
            PassThroughEncoder(self.token_pooler.get_output_dim())
            if not sentence_encoder
            else sentence_encoder.input_dim(self.token_pooler.get_output_dim()).compile()
        )
        self.sentence_pooler = (
            BagOfEmbeddingsEncoder(self.sentence_encoder.get_output_dim())
            if not sentence_pooler
            else sentence_pooler.input_dim(self.sentence_encoder.get_output_dim()).compile()
        )
        self.feedforward = (
            None
            if not feedforward
            else feedforward.input_dim(self.sentence_pooler.get_output_dim()).compile()
        )
        self.dropout = torch.nn.Dropout(dropout)

        self._classification_layer = torch.nn.Linear(
            (self.feedforward or self.sentence_pooler).get_output_dim(),
            self.num_labels,
        )
Example #13
def _build_model(config,
                 vocab,
                 lemmatize_helper,
                 morpho_vectorizer,
                 bert_max_length=None):
    embedder = _load_embedder(config, bert_max_length)

    input_dim = embedder.get_output_dim()
    if config.embedder.use_pymorphy:
        input_dim += morpho_vectorizer.morpho_vector_dim

    encoder = None
    if config.encoder.encoder_type != 'lstm':
        encoder = PassThroughEncoder(input_dim=input_dim)
    elif config.encoder.use_weight_drop:
        encoder = LstmWeightDropSeq2SeqEncoder(
            input_dim,
            config.encoder.hidden_dim,
            num_layers=config.encoder.num_layers,
            bidirectional=True,
            dropout=config.encoder.dropout,
            variational_dropout=config.encoder.variational_dropout)
    else:
        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(input_dim,
                          config.encoder.hidden_dim,
                          num_layers=config.encoder.num_layers,
                          dropout=config.encoder.dropout,
                          bidirectional=True,
                          batch_first=True))

    return DependencyParser(
        vocab=vocab,
        text_field_embedder=embedder,
        encoder=encoder,
        lemmatize_helper=lemmatize_helper,
        morpho_vector_dim=morpho_vectorizer.morpho_vector_dim
        if config.embedder.use_pymorphy else 0,
        tag_representation_dim=config.parser.tag_representation_dim,
        arc_representation_dim=config.parser.arc_representation_dim,
        dropout=config.parser.dropout,
        input_dropout=config.embedder.dropout,
        gram_val_representation_dim=config.parser.gram_val_representation_dim,
        lemma_representation_dim=config.parser.lemma_representation_dim)
Example #14
    def __init__(self,
                 pooler: Seq2VecEncoder,
                 context_encoder: Seq2SeqEncoder = None,
                 kb_path: str = None,
                 kb_shape: Tuple[int, int] = None,
                 trainable_kb: bool = False,
                 projection_dim: int = None):
        super().__init__()

        kb = (torch.load(kb_path) if kb_path else torch.ones(kb_shape)).float()
        self.knowledge = nn.Parameter(kb, requires_grad=trainable_kb).float()
        self.projection_dim = projection_dim
        if projection_dim:
            self.kb_proj = nn.Linear(self.knowledge.size(0),
                                     self.projection_dim)

        self.context_encoder = context_encoder or PassThroughEncoder(
            pooler.get_input_dim())
        self.pooler = pooler
        self.output_dim = pooler.get_output_dim()
Example #15
def test_sequence_tagging_reader():
    model_name = 'bert-base-chinese'

    bert_token_indexers = PretrainedTransformerIndexer(model_name=model_name)
    reader = SequenceTaggingDatasetReader(
        token_indexers={"tokens": bert_token_indexers})

    train_file = './data/weibo/train.corpus'
    dev_file = './data/weibo/dev.corpus'
    test_file = './data/weibo/dev.corpus'
    train_instances = list(reader.read(train_file))
    dev_instances = list(reader.read(dev_file))
    test_instances = list(reader.read(test_file))

    vocab: Vocabulary = Vocabulary.from_instances(train_instances)
    assert vocab.get_namespaces() is not None

    bert_text_field_embedder = PretrainedTransformerEmbedder(
        model_name=model_name)
    tagger = SimpleTagger(
        vocab=vocab,
        text_field_embedder=BasicTextFieldEmbedder(
            token_embedders={'tokens': bert_text_field_embedder}),
        encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
        calculate_span_f1=True,
        label_encoding="BMES",
        # verbose_metrics=True
    )

    train_data_loader, dev_data_loader = build_data_loaders(
        train_instances, dev_instances)
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    trainer = build_trainer(model=tagger,
                            serialization_dir='./output',
                            train_loader=train_data_loader,
                            dev_loader=dev_data_loader)
    print("Starting training")
    trainer.train()
    print("Finished training")
Example #16
 def init_crf_model(self) -> Model:
     """init crf tagger model
     """
     # 1. import related modules
     from allennlp
     bert_text_field_embedder = PretrainedTransformerEmbedder(model_name=self.config.model_name)
     bert_text_field_embedder
     tagger = SimpleTagger(
         vocab=self.vocab,
         text_field_embedder=BasicTextFieldEmbedder(
             token_embedders={
                 'tokens': bert_text_field_embedder
             }
         ),
         encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
         verbose_metrics=True,
         calculate_span_f1=True,
         label_encoding="BMES",
     )
     
     tagger.to(device=self.config.device)
     return tagger
Example #17
        def model_ctor():
            # model = BertForTokenClassificationCustom.from_pretrained(self._bert_model_type,
            #                                                          cache_dir=self._cache_dir,
            #                                                          num_labels=len(self._tag2idx)).cuda()
            #
            # seq_tagger = SequenceTaggerBert(model, self._bert_tokenizer, idx2tag=self._idx2tag,
            #                                 tag2idx=self._tag2idx, pred_batch_size=self._ebs)

            embedder = PretrainedTransformerMismatchedEmbedder(
                model_name=self._bert_model_type)
            text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})

            seq2seq_encoder = PassThroughEncoder(
                input_dim=embedder.get_output_dim())

            tagger = SimpleTagger(text_field_embedder=text_field_embedder,
                                  vocab=self.vocab,
                                  encoder=seq2seq_encoder,
                                  calculate_span_f1=True,
                                  label_encoding='IOB1').cuda()

            return tagger
Example #18
 def test_saturated_dropout_zero(self):
     encoder = PassThroughEncoder(input_dim=1)
     pruner = PercentSaturatedDropout(encoder, percent=0.0)
     dropped = pruner(INPUTS)
     exp_dropped = torch.tensor([[[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]]])
     torch.testing.assert_allclose(dropped, exp_dropped)
Example #19
    def __init__(self,
                 vocab: Vocabulary,
                 token_representation_dim: int,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 decoder: Optional[Union[FeedForward, str]] = None,
                 contextualizer: Optional[Contextualizer] = None,
                 calculate_per_label_f1: bool = False,
                 loss_average: str = "batch",
                 pretrained_file: Optional[str] = None,
                 transfer_contextualizer_from_pretrained_file: bool = False,
                 transfer_encoder_from_pretrained_file: bool = False,
                 freeze_encoder: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SelectiveTagger, self).__init__(vocab, regularizer)

        self._num_classes = self.vocab.get_vocab_size("labels")
        self._token_representation_dim = token_representation_dim
        self._contextualizer = contextualizer
        if encoder is None:
            encoder = PassThroughEncoder(
                input_dim=self._token_representation_dim)
        self._encoder = encoder

        # Load the contextualizer and encoder weights from the
        # pretrained_file if applicable
        if pretrained_file:
            archive = None
            if self._contextualizer and transfer_contextualizer_from_pretrained_file:
                logger.info("Attempting to load contextualizer weights from "
                            "pretrained_file at {}".format(pretrained_file))
                archive = load_archive(cached_path(pretrained_file))
                contextualizer_state = archive.model._contextualizer.state_dict(
                )
                contextualizer_layer_num = self._contextualizer._layer_num
                self._contextualizer.load_state_dict(contextualizer_state)
                if contextualizer_layer_num is not None:
                    logger.info("Setting layer num to {}".format(
                        contextualizer_layer_num))
                    self._contextualizer.set_layer_num(
                        contextualizer_layer_num)
                else:
                    self._contextualizer.reset_layer_num()
                logger.info("Successfully loaded contextualizer weights!")
            if transfer_encoder_from_pretrained_file:
                logger.info("Attempting to load encoder weights from "
                            "pretrained_file at {}".format(pretrained_file))
                if archive is None:
                    archive = load_archive(cached_path(pretrained_file))
                encoder_state = archive.model._encoder.state_dict()
                self._encoder.load_state_dict(encoder_state)
                logger.info("Successfully loaded encoder weights!")

        self._freeze_encoder = freeze_encoder
        for parameter in self._encoder.parameters():
            # If freeze is true, requires_grad should be false and vice versa.
            parameter.requires_grad_(not self._freeze_encoder)

        if decoder is None or decoder == "linear":
            # Create the default decoder (logistic regression) if it is not provided.
            decoder = FeedForward.from_params(
                Params({
                    "input_dim": self._encoder.get_output_dim(),
                    "num_layers": 1,
                    "hidden_dims": self._num_classes,
                    "activations": "linear"
                }))
            logger.info("No decoder provided to model, using default "
                        "decoder: {}".format(decoder))
        elif decoder == "mlp":
            # Create the MLP decoder
            decoder = FeedForward.from_params(
                Params({
                    "input_dim": self._encoder.get_output_dim(),
                    "num_layers": 2,
                    "hidden_dims": [1024, self._num_classes],
                    "activations": ["relu", "linear"]
                }))
            logger.info("Using MLP decoder: {}".format(decoder))
        self._decoder = decoder

        check_dimensions_match(self._token_representation_dim,
                               self._encoder.get_input_dim(),
                               "token representation dim", "encoder input dim")
        check_dimensions_match(self._encoder.get_output_dim(),
                               self._decoder.get_input_dim(),
                               "encoder output dim", "decoder input dim")
        check_dimensions_match(self._decoder.get_output_dim(),
                               self._num_classes, "decoder output dim",
                               "number of classes")
        if loss_average not in {"batch", "token"}:
            raise ConfigurationError(
                "loss_average is {}, expected one of batch "
                "or token".format(loss_average))
        self.loss_average = loss_average
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.calculate_per_label_f1 = calculate_per_label_f1

        label_metric_name = "label_{}" if self.calculate_per_label_f1 else "_label_{}"
        for label_name, label_index in self.vocab._token_to_index[
                "labels"].items():
            self.metrics[label_metric_name.format(label_name)] = F1Measure(
                positive_label=label_index)

        # Whether to run in error analysis mode or not, see commands.error_analysis
        self.error_analysis = False
        logger.info("Applying initializer...")
        initializer(self)
Example #20
 def test_get_dimension_is_correct(self):
     encoder = PassThroughEncoder(input_dim=9)
     assert encoder.get_input_dim() == 9
     assert encoder.get_output_dim() == 9
Example #21
    def __init__(self,
                 vocab: Vocabulary,
                 token_representation_dim: int,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 decoder: Optional[Union[FeedForward, str]] = None,
                 use_crf: bool = False,
                 constrain_crf_decoding: bool = False,
                 include_start_end_transitions: bool = True,
                 label_encoding: Optional[str] = None,
                 contextualizer: Optional[Contextualizer] = None,
                 calculate_per_label_f1: bool = False,
                 calculate_span_f1: bool = False,
                 calculate_perplexity: bool = False,
                 loss_average: str = "batch",
                 pretrained_file: Optional[str] = None,
                 transfer_contextualizer_from_pretrained_file: bool = False,
                 transfer_encoder_from_pretrained_file: bool = False,
                 freeze_encoder: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(Tagger, self).__init__(vocab, regularizer)

        self._num_classes = self.vocab.get_vocab_size("labels")
        self._token_representation_dim = token_representation_dim
        self._contextualizer = contextualizer
        if encoder is None:
            encoder = PassThroughEncoder(input_dim=token_representation_dim)
        self._encoder = encoder

        # Load the contextualizer and encoder weights from the
        # pretrained_file if applicable
        if pretrained_file:
            archive = None
            if self._contextualizer and transfer_contextualizer_from_pretrained_file:
                logger.info("Attempting to load contextualizer weights from "
                            "pretrained_file at {}".format(pretrained_file))
                archive = load_archive(cached_path(pretrained_file))
                contextualizer_state = archive.model._contextualizer.state_dict()
                contextualizer_layer_num = self._contextualizer._layer_num
                logger.info("contextualizer_layer_num {}".format(contextualizer_layer_num))
                self._contextualizer.load_state_dict(contextualizer_state)
                if contextualizer_layer_num is not None:
                    logger.info("Setting layer num to {}".format(
                        contextualizer_layer_num))
                    self._contextualizer.set_layer_num(contextualizer_layer_num)
                else:
                    self._contextualizer.reset_layer_num()
                logger.info("Successfully loaded contextualizer weights!")
            if transfer_encoder_from_pretrained_file:
                logger.info("Attempting to load encoder weights from "
                            "pretrained_file at {}".format(pretrained_file))
                if archive is None:
                    archive = load_archive(cached_path(pretrained_file))
                encoder_state = archive.model._encoder.state_dict()
                self._encoder.load_state_dict(encoder_state)
                logger.info("Successfully loaded encoder weights!")

        self._freeze_encoder = freeze_encoder
        for parameter in self._encoder.parameters():
            # If freeze is true, requires_grad should be false and vice versa.
            parameter.requires_grad_(not self._freeze_encoder)

        if decoder is None or decoder == "linear":
            # Create the default decoder (logistic regression) if it is not provided.
            decoder = FeedForward.from_params(Params(
                {"input_dim": self._encoder.get_output_dim(),
                 "num_layers": 1,
                 "hidden_dims": self._num_classes,
                 "activations": "linear"}))
            logger.info("No decoder provided to model, using default "
                        "decoder: {}".format(decoder))
        elif decoder == "mlp":
            # Create the MLP decoder
            decoder = FeedForward.from_params(Params(
                {"input_dim": self._encoder.get_output_dim(),
                 "num_layers": 2,
                 "hidden_dims": [1024, self._num_classes],
                 "activations": ["relu", "linear"]}))
            logger.info("Using MLP decoder: {}".format(decoder))

        self._decoder = TimeDistributed(decoder)
        self._use_crf = use_crf
        self._constrain_crf_decoding = constrain_crf_decoding
        self._crf = None
        if use_crf:
            logger.info("Using CRF on top of decoder outputs")
            if constrain_crf_decoding:
                if label_encoding is None:
                    raise ConfigurationError(
                        "constrain_crf_decoding is True, but "
                        "label_encoding was not provided. label_encoding "
                        "must be provided.")
                logger.info("Constraining CRF decoding with label "
                            "encoding {}".format(label_encoding))
                labels = self.vocab.get_index_to_token_vocabulary("labels")
                constraints = allowed_transitions(label_encoding, labels)
            else:
                constraints = None
            self._crf = ConditionalRandomField(
                self._num_classes, constraints,
                include_start_end_transitions=include_start_end_transitions)

        check_dimensions_match(self._token_representation_dim, self._encoder.get_input_dim(),
                               "dimensionality of token representation", "encoder input dim")
        check_dimensions_match(self._encoder.get_output_dim(), self._decoder._module.get_input_dim(),
                               "encoder output dim", "decoder input dim")
        check_dimensions_match(self._decoder._module.get_output_dim(), self._num_classes,
                               "decoder output dim", "number of classes")
        if loss_average not in {"batch", "token"}:
            raise ConfigurationError("loss_average is {}, expected one of batch "
                                     "or token".format(loss_average))
        self.loss_average = loss_average
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }

        self.calculate_perplexity = calculate_perplexity
        if calculate_perplexity:
            self.metrics["perplexity"] = Perplexity()

        self.calculate_per_label_f1 = calculate_per_label_f1
        self.calculate_span_f1 = calculate_span_f1
        if label_encoding and label_encoding not in ["BIO", "BIOUL", "IOB1"]:
            raise ConfigurationError("If not None, label encoding must be one of BIO, BIOUL, "
                                     "or IOB1. Got {}".format(label_encoding))
        self.label_encoding = label_encoding

        label_metric_name = "label_{}" if self.calculate_per_label_f1 else "_label_{}"
        for label_name, label_index in self.vocab._token_to_index["labels"].items():
            self.metrics[label_metric_name.format(label_name)] = F1Measure(positive_label=label_index)

        if self.calculate_span_f1:
            if not self.label_encoding:
                raise ConfigurationError("label_encoding must be provided when "
                                         "calculating_span_f1 is true.")
            else:
                # Set up span-based F1 measure
                self.metrics["span_based_f1"] = SpanBasedF1Measure(self.vocab,
                                                                   tag_namespace="labels",
                                                                   label_encoding=self.label_encoding)

        # Whether to run in error analysis mode or not, see commands.error_analysis
        self.error_analysis = False
        logger.info("Applying initializer...")
        initializer(self)
Example #22
    def __init__(self, args, input_dim, hidden_dim, word_embedder):
        super(RelationAttendedDefinitionSentenceEncoder, self).__init__()
        self.config = args
        self.args = args
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.projection_dim = input_dim
        self.feedforward_hidden_dim = input_dim
        self.num_layers = self.args.num_layers_for_stackatt
        self.num_attention_heads = self.args.num_atthead_for_stackatt

        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(
            self.args.word_embedding_dropout)


        if self.args.definition_seq2seq == 'passthrough':
            self.seq2seq = PassThroughEncoder(input_dim=input_dim)
        elif self.args.definition_seq2seq == 'multiheadstackatt':
            self.seq2seq = StackedSelfAttentionEncoder(
                input_dim=input_dim,
                hidden_dim=input_dim,
                projection_dim=input_dim,
                feedforward_hidden_dim=input_dim,
                num_layers=2,
                num_attention_heads=2)
        elif self.args.definition_seq2seq == 'qanet':
            self.seq2seq = QaNetEncoder(input_dim=input_dim,
                                        hidden_dim=input_dim,
                                        attention_projection_dim=input_dim,
                                        feedforward_hidden_dim=input_dim,
                                        num_blocks=2,
                                        num_convs_per_block=2,
                                        conv_kernel_size=3,
                                        num_attention_heads=2)
        elif self.args.definition_seq2seq == 'intrasentenceatt':
            self.seq2seq = IntraSentenceAttentionEncoder(
                input_dim=input_dim,
                projection_dim=input_dim,
                output_dim=input_dim)
        elif self.args.definition_seq2seq == 'gatedcnn':
            self.seq2seq = GatedCnnEncoder(input_dim=512,
                                           layers=[[[4, 512]],
                                                   [[4, 512], [4, 512]],
                                                   [[4, 512], [4, 512]],
                                                   [[4, 512], [4, 512]]],
                                           dropout=0.05)
        elif self.args.definition_seq2seq == 'bilmtransformer':
            self.seq2seq = BidirectionalLanguageModelTransformer(
                input_dim=input_dim, hidden_dim=input_dim, num_layers=2)
        # elif self.args.definition_seq2seq == 'feedfoward':
        #     feedforward = FeedForward(input_dim=input_dim, num_layers=1, hidden_dims=input_dim, activations=self.args.activation_for_sentence_ff)
        #     self.seq2seq = FeedForwardEncoder(feedforward)

        # Available FeedForward activations: linear, relu, relu6, elu, prelu, leaky_relu,
        # threshold, hardtanh, sigmoid, tanh, log_sigmoid, softplus, softshrink, softsign,
        # tanhshrink (see https://pytorch.org/docs/master/nn.html).

        elif self.args.definition_seq2seq == 'multiheadselfatt':
            self.seq2seq = MultiHeadSelfAttention(
                num_heads=2,
                input_dim=input_dim,
                output_projection_dim=input_dim,
                attention_dim=input_dim,
                values_dim=input_dim)
        else:
            raise ValueError(
                'Encoder not defined: {}'.format(self.args.definition_seq2seq))
Example #23
train_dataset = reader.read("conll2003/eng.train")
validation_dataset = reader.read("conll2003/eng.testa")
test_dataset = reader.read("conll2003/eng.testb")

all_insts = train_dataset + validation_dataset + test_dataset


vocab = Vocabulary.from_instances(all_insts)

dataset = Batch(all_insts)
dataset.index_instances(vocab)

embedder = PretrainedTransformerMismatchedEmbedder(model_name, last_layer_only=True)
token_embedder = BasicTextFieldEmbedder({"bert": embedder})
embedding_dim = 768
encoder = PassThroughEncoder(input_dim=embedding_dim)

model = SimpleTagger(vocab=vocab,
                     text_field_embedder=token_embedder,
                     encoder=encoder,
                     calculate_span_f1=True,
                     label_encoding="IOB1")

optimizer = optim.Adam(model.parameters(), lr=3e-05)

if torch.cuda.is_available():
    print("Using GPU")
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1
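The script above ends after choosing the device; a minimal sketch of how training could continue with AllenNLP 2.x (the data-loader and trainer settings here are assumptions, not part of the original example):

from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.training import GradientDescentTrainer

# Batch the indexed instances and run a short training loop on the selected device.
train_loader = SimpleDataLoader(list(train_dataset), batch_size=8, shuffle=True)
train_loader.index_with(vocab)
trainer = GradientDescentTrainer(model=model,
                                 optimizer=optimizer,
                                 data_loader=train_loader,
                                 num_epochs=3,
                                 cuda_device=cuda_device)
trainer.train()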
Example #24
 def __init__(
     self, embedder: TokenEmbedder, encoder: Seq2SeqEncoder = None,
 ):
     super().__init__()
     self.embedder = embedder
     self.encoder = encoder or PassThroughEncoder(input_dim=self.embedder.get_output_dim())
Example #25
    def __init__(self,
                 vocab: Vocabulary,
                 token_representation_dim: int,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 decoder: Optional[Union[FeedForward, str]] = None,
                 contextualizer: Optional[Contextualizer] = None,
                 pretrained_file: Optional[str] = None,
                 transfer_contextualizer_from_pretrained_file: bool = False,
                 transfer_encoder_from_pretrained_file: bool = False,
                 freeze_encoder: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SelectiveRegressor, self).__init__(vocab, regularizer)

        self._token_representation_dim = token_representation_dim
        self._contextualizer = contextualizer
        if encoder is None:
            encoder = PassThroughEncoder(
                input_dim=self._token_representation_dim)
        self._encoder = encoder

        # Load the contextualizer and encoder weights from the
        # pretrained_file if applicable
        if pretrained_file:
            archive = None
            if self._contextualizer and transfer_contextualizer_from_pretrained_file:
                logger.info("Attempting to load contextualizer weights from "
                            "pretrained_file at {}".format(pretrained_file))
                archive = load_archive(cached_path(pretrained_file))
                contextualizer_state = archive.model._contextualizer.state_dict(
                )
                contextualizer_layer_num = self._contextualizer._layer_num
                self._contextualizer.load_state_dict(contextualizer_state)
                if contextualizer_layer_num is not None:
                    logger.info("Setting layer num to {}".format(
                        contextualizer_layer_num))
                    self._contextualizer.set_layer_num(
                        contextualizer_layer_num)
                else:
                    self._contextualizer.reset_layer_num()
                logger.info("Successfully loaded contextualizer weights!")
            if transfer_encoder_from_pretrained_file:
                logger.info("Attempting to load encoder weights from "
                            "pretrained_file at {}".format(pretrained_file))
                if archive is None:
                    archive = load_archive(cached_path(pretrained_file))
                encoder_state = archive.model._encoder.state_dict()
                self._encoder.load_state_dict(encoder_state)
                logger.info("Successfully loaded encoder weights!")

        self._freeze_encoder = freeze_encoder
        for parameter in self._encoder.parameters():
            # If freeze is true, requires_grad should be false and vice versa.
            parameter.requires_grad_(not self._freeze_encoder)

        if decoder is None or decoder == "linear":
            # Create the default decoder (logistic regression) if it is not provided.
            decoder = FeedForward.from_params(
                Params({
                    "input_dim": self._encoder.get_output_dim(),
                    "num_layers": 1,
                    "hidden_dims": 1,
                    "activations": "linear"
                }))
            logger.info("No decoder provided to model, using default "
                        "decoder: {}".format(decoder))
        elif decoder == "mlp":
            # Create the MLP decoder
            decoder = FeedForward.from_params(
                Params({
                    "input_dim": self._encoder.get_output_dim(),
                    "num_layers": 2,
                    "hidden_dims": [1024, 1],
                    "activations": ["relu", "linear"]
                }))
            logger.info("Using MLP decoder: {}".format(decoder))
        self._decoder = decoder

        check_dimensions_match(self._token_representation_dim,
                               self._encoder.get_input_dim(),
                               "token representation dim", "encoder input dim")
        check_dimensions_match(self._encoder.get_output_dim(),
                               self._decoder.get_input_dim(),
                               "encoder output dim", "decoder input dim")
        check_dimensions_match(self._decoder.get_output_dim(), 1,
                               "decoder output dim",
                               "1, since we're predicting a real value")
        # SmoothL1Loss as described in "Neural Models of Factuality" (NAACL 2018)
        self.loss = torch.nn.SmoothL1Loss(reduction="none")
        self.metrics = {
            "mae": MeanAbsoluteError(),
            "pearson_r": PearsonCorrelation()
        }

        # Whether to run in error analysis mode or not, see commands.error_analysis
        self.error_analysis = False
        logger.info("Applying initializer...")
        initializer(self)