def test_allowed_transitions(self):
        # pylint: disable=bad-whitespace,bad-continuation
        bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y']
        #              0     1      2      3      4
        allowed = allowed_transitions("BIO", dict(enumerate(bio_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
            (0, 0), (0, 1),         (0, 3),
            (1, 0), (1, 1), (1, 2), (1, 3),
            (2, 0), (2, 1), (2, 2), (2, 3),
            (3, 0), (3, 1),         (3, 3), (3, 4),
            (4, 0), (4, 1),         (4, 3), (4, 4)
        }

        bioul_labels = ['O', 'B-X', 'I-X', 'L-X', 'U-X', 'B-Y', 'I-Y', 'L-Y', 'U-Y']
        #                0     1      2      3      4      5      6      7      8
        allowed = allowed_transitions("BIOUL", dict(enumerate(bioul_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
            (0, 0), (0, 1),                 (0, 4), (0, 5),                 (0, 8),
                            (1, 2), (1, 3),
                            (2, 2), (2, 3),
            (3, 0), (3, 1),                 (3, 4), (3, 5),                 (3, 8),
            (4, 0), (4, 1),                 (4, 4), (4, 5),                 (4, 8),
                                                            (5, 6), (5, 7),
                                                            (6, 6), (6, 7),
            (7, 0), (7, 1),                 (7, 4), (7, 5),                 (7, 8),
            (8, 0), (8, 1),                 (8, 4), (8, 5),                 (8, 8)
        }

        with raises(ConfigurationError):
            allowed_transitions("allennlp", {})
    def test_allowed_transitions(self):
        # pylint: disable=bad-whitespace,bad-continuation
        bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y']
        #              0     1      2      3      4
        allowed = allowed_transitions("BIO", dict(enumerate(bio_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
            (0, 0), (0, 1),         (0, 3),
            (1, 0), (1, 1), (1, 2), (1, 3),
            (2, 0), (2, 1), (2, 2), (2, 3),
            (3, 0), (3, 1),         (3, 3), (3, 4),
            (4, 0), (4, 1),         (4, 3), (4, 4)
        }

        bioul_labels = ['O', 'B-X', 'I-X', 'L-X', 'U-X', 'B-Y', 'I-Y', 'L-Y', 'U-Y']
        #                0     1      2      3      4      5      6      7      8
        allowed = allowed_transitions("BIOUL", dict(enumerate(bioul_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
            (0, 0), (0, 1),                 (0, 4), (0, 5),                 (0, 8),
                            (1, 2), (1, 3),
                            (2, 2), (2, 3),
            (3, 0), (3, 1),                 (3, 4), (3, 5),                 (3, 8),
            (4, 0), (4, 1),                 (4, 4), (4, 5),                 (4, 8),
                                                            (5, 6), (5, 7),
                                                            (6, 6), (6, 7),
            (7, 0), (7, 1),                 (7, 4), (7, 5),                 (7, 8),
            (8, 0), (8, 1),                 (8, 4), (8, 5),                 (8, 8)
        }

        with raises(ConfigurationError):
            allowed_transitions("allennlp", {})
Example #3
    def test_allowed_transitions(self):
        # pylint: disable=bad-whitespace,bad-continuation
        bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y'] # start tag, end tag
        #              0     1      2      3      4         5          6
        allowed = allowed_transitions("BIO", dict(enumerate(bio_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                         # Extra column for end tag.
            (0, 0), (0, 1),         (0, 3),              (0, 6),
            (1, 0), (1, 1), (1, 2), (1, 3),              (1, 6),
            (2, 0), (2, 1), (2, 2), (2, 3),              (2, 6),
            (3, 0), (3, 1),         (3, 3), (3, 4),      (3, 6),
            (4, 0), (4, 1),         (4, 3), (4, 4),      (4, 6),
            (5, 0), (5, 1),         (5, 3)                      # Extra row for start tag
        }

        bioul_labels = ['O', 'B-X', 'I-X', 'L-X', 'U-X', 'B-Y', 'I-Y', 'L-Y', 'U-Y'] # start tag, end tag
        #                0     1      2      3      4      5      6      7      8          9        10
        allowed = allowed_transitions("BIOUL", dict(enumerate(bioul_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                                                   # Extra column for end tag.
            (0, 0), (0, 1),                 (0, 4), (0, 5),                 (0, 8),       (0, 10),
                            (1, 2), (1, 3),
                            (2, 2), (2, 3),
            (3, 0), (3, 1),                 (3, 4), (3, 5),                 (3, 8),       (3, 10),
            (4, 0), (4, 1),                 (4, 4), (4, 5),                 (4, 8),       (4, 10),
                                                            (5, 6), (5, 7),
                                                            (6, 6), (6, 7),
            (7, 0), (7, 1),                 (7, 4), (7, 5),                 (7, 8),       (7, 10),
            (8, 0), (8, 1),                 (8, 4), (8, 5),                 (8, 8),       (8, 10),
            # Extra row for start tag.
            (9, 0), (9, 1),                 (9, 4), (9, 5),                 (9, 8)
        }

        iob1_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y'] # start tag, end tag
        #              0     1      2      3      4         5          6
        allowed = allowed_transitions("IOB1", dict(enumerate(iob1_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                            # Extra column for end tag.
            (0, 0),         (0, 2),         (0, 4),         (0, 6),
            (1, 0), (1, 1), (1, 2),         (1, 4),         (1, 6),
            (2, 0), (2, 1), (2, 2),         (2, 4),         (2, 6),
            (3, 0),         (3, 2), (3, 3), (3, 4),         (3, 6),
            (4, 0),         (4, 2), (4, 3), (4, 4),         (4, 6),
            (5, 0),         (5, 2),         (5, 4),                # Extra row for start tag
        }
        with raises(ConfigurationError):
            allowed_transitions("allennlp", {})
Example #4
    def populate_dd_vars(self, vocab):  #NOTE

        # Set up allowed transitions
        all_labels = vocab.get_index_to_token_vocabulary(namespace="labels")
        num_labels = len(all_labels)
        constraints = allowed_transitions("BIOUL", all_labels)
        # One extra row/column for the start tag, plus another for the end
        # tag when EOS transitions are modelled explicitly.
        size = num_labels + 1 + int(self.include_eos)
        self.transition_matrix = torch.zeros([size, size])

        for c in constraints:
            # Without an explicit EOS, drop transitions involving the end tag.
            if self.include_eos or (c[0] <= num_labels and c[1] <= num_labels):
                self.transition_matrix[c[0], c[1]] = 1.0

        self.nconstraints = size

        aggregate_type = self.config.get("aggregate_type", "max")
        if aggregate_type.lower() not in ("sum", "max"):
            raise ValueError(f"Invalid transition aggregation type: {aggregate_type}")
        if aggregate_type.lower() == "sum":
            self.aggregate_mat = lambda x: x.sum(dim=3)
        elif aggregate_type.lower() == "max":
            self.aggregate_mat = lambda x: x.max(dim=3)[0]
Example #5
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 feedforward: Optional[FeedForward] = None,
                 label_encoding: Optional[str] = None,
                 include_start_end_transitions: bool = True,
                 constrain_crf_decoding: bool = None,
                 calculate_span_f1: bool = None,
                 dropout: Optional[float] = None,
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:

        super().__init__(vocab, text_field_embedder, encoder,
                         label_namespace, feedforward, label_encoding,
                         include_start_end_transitions, constrain_crf_decoding, calculate_span_f1,
                         dropout, verbose_metrics, initializer, regularizer)

        # Get the kwargs needed to initialize the WISER CRF. We skip some
        # configuration checks because they are already performed in the
        # super constructor.
        if constrain_crf_decoding:
            labels = self.vocab.get_index_to_token_vocabulary(
                self.label_namespace)
            constraints = allowed_transitions(self.label_encoding, labels)
        else:
            constraints = None

        # Replaces the CRF created by the super constructor with the WISER CRF
        self.crf = WiserConditionalRandomField(
            self.num_tags, constraints,
            include_start_end_transitions=include_start_end_transitions
        )
Example #6
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        label_namespace: str = "labels",
        feature_namespace: str = None,
        feature_encoder: Seq2VecEncoder = None,
        label_encoding: Optional[str] = None,
        include_start_end_transitions: bool = True,
        constrain_crf_decoding: bool = None,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes))

        if feature_namespace:
            self.feature_encoder = feature_encoder
            self.feat_classification_layer = Linear(
                self.feature_encoder.get_output_dim(),
                self.vocab.get_vocab_size(feature_namespace))
            # print("num_features:", self.vocab.get_vocab_size(feature_namespace))

        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
            self.num_classes,
            constraints,
            include_start_end_transitions=include_start_end_transitions)

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            encoder.get_input_dim(),
            "text field embedding dim",
            "encoder input dim",
        )

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3),
        }

        self._f1_metric = None

        initializer(self)
Example #7
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(self.num_tags, constraints)

        self.span_metric = SpanBasedF1Measure(vocab, tag_namespace=label_namespace)

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        initializer(self)
Example #8
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(self.num_tags, constraints)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type
                                              or "BIO")

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        initializer(self)
Example #9
def get_viterbi_pairwise_potentials(vocab, label_encoding):
    """
    Generate a matrix of pairwise transition potentials for the BIO labels.
    The only constraint implemented here is that I-XXX labels must be preceded
    by either an identical I-XXX tag or a B-XXX tag. In order to achieve this
    constraint, pairs of labels which do not satisfy this constraint have a
    pairwise potential of -inf.

    Returns
    -------
    transition_matrix : torch.Tensor
        A (num_labels, num_labels) matrix of pairwise potentials.
    """
    all_labels = vocab.get_index_to_token_vocabulary("labels")
    num_labels = len(all_labels)
    transition_matrix = torch.full([num_labels + 2, num_labels + 2],
                                   float("-inf"))

    constraints = allowed_transitions(label_encoding, all_labels)
    for c in constraints:
        transition_matrix[c[0], c[1]] = 0.0
    return transition_matrix
Example #10
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 dropout: Optional[float] = 0,
                 label_encoding: Optional[str] = 'BIO',
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:

        super(CharBertCrfModel, self).__init__(vocab, regularizer)
        self._text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size('labels')

        self._labels_predictor = Linear(
            self._text_field_embedder.get_output_dim(), self.num_tags)
        self.dropout = torch.nn.Dropout(dropout)
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace='labels',
                                             label_encoding=label_encoding)
        labels = self.vocab.get_index_to_token_vocabulary('labels')
        constraints = allowed_transitions(label_encoding, labels)
        self.label_to_index = self.vocab.get_token_to_index_vocabulary(
            'labels')
        self.crf = ConditionalRandomField(self.num_tags,
                                          constraints,
                                          include_start_end_transitions=False)
        # self.loss = torch.nn.CrossEntropyLoss()
        initializer(self)
Example #11
    def __init__(
        self,
        rnn2seqencoder: Lstm2SeqEncoder,
        encoding_dim: int,
        datasets_manager: DatasetsManager,
        device: torch.device = torch.device("cpu"),
        namespace_to_constraints: Dict[str, List[Tuple[int, int]]] = None,
        tagging_type=None,
        include_start_end_transitions: bool = True,
    ):
        """

        Parameters
        ----------
        rnn2seqencoder : Lstm2SeqEncoder
            Lstm2SeqEncoder that encodes a set of instances to a sequence of hidden states
        encoding_dim : int
            Hidden dimension of the lstm2seq encoder
        namespace_to_constraints: Dict[str, List[Tuple[int, int]]]
            A set of constraints that are valid transitions
        include_start_end_transitions: bool
            Whether to include start and end transitions
        """
        super(RnnSeqCrfTagger, self).__init__()
        self.rnn2seqencoder = rnn2seqencoder
        self.encoding_dim = encoding_dim
        self.datasets_manager = datasets_manager

        self.label_namespaces = datasets_manager.label_namespaces
        self.device = device
        self.tagging_type = tagging_type
        self.crfs = nn.ModuleDict()
        self.linear_clfs = nn.ModuleDict()
        self.include_start_end_transitions = include_start_end_transitions

        if namespace_to_constraints is None and self.tagging_type is not None:
            namespace_to_constraints = defaultdict(list)
            for namespace in self.label_namespaces:
                idx2label_mapping = self.datasets_manager.get_idx_label_mapping(
                    label_namespace=namespace
                )
                transitions_allowed = allowed_transitions(
                    constraint_type=self.tagging_type, labels=idx2label_mapping
                )
                namespace_to_constraints[namespace] = transitions_allowed
        elif namespace_to_constraints is None:
            # A bare ``else`` here would silently discard constraints passed
            # in by the caller; only default to empty lists when none given.
            namespace_to_constraints = defaultdict(list)

        self.namespace_to_constraints = namespace_to_constraints
        for namespace in self.label_namespaces:
            num_labels = self.datasets_manager.num_labels[namespace]
            crf = CRF(
                num_tags=num_labels,
                constraints=self.namespace_to_constraints.get(namespace),
                include_start_end_transitions=self.include_start_end_transitions,
            )  # we do not add start and end tags to our labels
            clf = nn.Linear(self.encoding_dim, num_labels)
            self.crfs.update({namespace: crf})
            self.linear_clfs.update({namespace: clf})
Example #12
    def __init__(self,
                 vocab,
                 text_field_embedder,
                 encoder,
                 label_namespace=u"labels",
                 constraint_type=None,
                 feedforward=None,
                 include_start_end_transitions=True,
                 dropout=None,
                 verbose_metrics=False,
                 initializer=InitializerApplicator(),
                 regularizer=None):
        super(CrfTagger, self).__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(
            Linear(output_dim, self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
            self.num_tags,
            constraints,
            include_start_end_transitions=include_start_end_transitions)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type
                                              or u"BIO")

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               encoder.get_input_dim(),
                               u"text field embedding dim",
                               u"encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(),
                                   feedforward.get_input_dim(),
                                   u"encoder output dim",
                                   u"feedforward input dim")
        initializer(self)
Example #13
    def __init__(self, 
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True) -> None:
        super().__init__(vocab)

        if bert_embedder:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens": Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn = Seq2SeqEncoder.from_params(Params({     
                "type": "lstm",
                "input_size": 1024,
                "hidden_size": 512,
                "bidirectional": True,
                "batch_first": True
            }))

        self.encoder = encoder

        if encoder:
            hidden2tag_in_dim = encoder.get_output_dim()
        else:
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        self.hidden2tag = TimeDistributed(torch.nn.Linear(
            in_features=hidden2tag_in_dim,
            out_features=vocab.get_vocab_size("labels")))
        
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        
        self.use_crf = use_crf
        if use_crf:
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels")
            )
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True
            )
        
        self.f1 = SpanBasedF1Measure(vocab, 
                                     tag_namespace="labels",
                                     ignore_classes=["news/type","negation",
                                                     "demonstrative_reference",
                                                     "timer/noun","timer/attributes"],
                                     label_encoding="BIO")
Example #14
    def __init__(
        self,
        task: str,
        vocab: Vocabulary,
        input_dim: int,
        loss_weight: float = 1.0,
        label_encoding: Optional[str] = 'BIO',
        include_start_end_transitions: bool = True,
        constrain_crf_decoding: bool = True,
        calculate_span_f1: bool = None,
        verbose_metrics: bool = False,
        metric: str = 'span_f1',
        top_k: int = 1,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)

        self.task = task
        self.input_dim = input_dim
        self.loss_weight = loss_weight
        self.num_tags = self.vocab.get_vocab_size(task)
        self.top_k = top_k
        self._verbose_metrics = verbose_metrics

        self.tag_projection_layer = TimeDistributed(
            Linear(input_dim, self.num_tags))

        # If constrain_crf_decoding and calculate_span_f1 are not provided
        # (i.e., they are None), default them to True when label_encoding
        # is given and to False otherwise.
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.label_encoding = label_encoding
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError(
                    "constrain_crf_decoding is True, but no label_encoding was specified."
                )
            labels = self.vocab.get_index_to_token_vocabulary(task)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
            self.num_tags,
            constraints,
            include_start_end_transitions=include_start_end_transitions)
        self.metrics = {
            "span_f1":
            SpanBasedF1Measure(self.vocab,
                               tag_namespace=self.task,
                               label_encoding="BIO")
        }
Example #15
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 feedforward: FeedForward = None,
                 include_start_end_transitions: bool = True,
                 dropout: float = None,
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type or "BIO")


        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                                   "encoder output dim", "feedforward input dim")
        initializer(self)
Example #16
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 binary_feature_dim: int,
                 embedding_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 label_smoothing: float = None,
                 label_namespace: str = "labels",
                 ignore_span_metric: bool = False,
                 label_encoding: Optional[str] = 'BIO',
                 include_start_end_transitions: bool = True,
                 constrain_crf_decoding: bool = True) -> None:
        super(OieLabelerCRF, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")

        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace="labels",
                                              ignore_classes=["V"])
        self.label_namespace = label_namespace
        self.encoder = encoder
        # There are exactly 2 binary features for the verb predicate embedding.
        self.binary_feature_embedding = Embedding(2, binary_feature_dim)
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes))
        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        self.include_start_end_transitions = include_start_end_transitions
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if constrain_crf_decoding:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            print(labels)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None
        self.crf = ConditionalRandomField(
            self.num_classes,
            constraints,
            include_start_end_transitions=include_start_end_transitions)

        check_dimensions_match(
            text_field_embedder.get_output_dim() + binary_feature_dim,
            encoder.get_input_dim(),
            "text embedding dim + verb indicator embedding dim",
            "encoder input dim")
        initializer(self)
Example #17
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 feedforward: Optional[FeedForward] = None,
                 dropout: Optional[float] = None,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:

        super().__init__(vocab, regularizer)
        label_namespace = 'labels'
        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder

        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None

        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        # Unlike CrfTagger, this model hard-codes the label encoding and
        # always constrains decoding.
        self.label_encoding = 'BIOUL'
        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(self.label_encoding, labels)

        self.include_start_end_transitions = True
        self.crf = ConditionalRandomField(
            self.num_tags, constraints,
            include_start_end_transitions=self.include_start_end_transitions
        )

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }

        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace=label_namespace,
                                             label_encoding=self.label_encoding)
Example #18
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        clauses_encoder: Seq2VecEncoder,
        outer_encoder: Seq2SeqEncoder,
        label_namespace: str = "labels",
        constraint_type: str = None,
        include_start_end_transitions: bool = True,
        dropout: float = None,
        loss_weights: Optional[List] = [],
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super(JCC, self).__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.clauses_encoder = clauses_encoder
        self.outer_encoder = outer_encoder
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self.label_projection_layer = TimeDistributed(
            Linear(outer_encoder.get_output_dim(), self.num_tags))

        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(constraint_type, labels)
        self.crf = ConditionalRandomField(
            self.num_tags,
            constraints,
            include_start_end_transitions=include_start_end_transitions,
        )
        self.metrics = {"accuracy": Accuracy()}

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            clauses_encoder.get_input_dim(),
            "text field embedding dim",
            "clauses encoder input dim",
        )
        check_dimensions_match(
            clauses_encoder.get_output_dim(),
            outer_encoder.get_input_dim(),
            "clauses encoder output dim",
            "outer encoder input dim",
        )
        initializer(self)
Example #19
    def __init__(self, model_path, vocab: Vocabulary):
        super().__init__(vocab)
        self.pretrained_tokenizer = BertForPreTraining.from_pretrained(
            model_path)
        config = BertConfig.from_pretrained(model_path)
        bert_model = BertForPreTraining(config)
        self.bert = bert_model.bert
        tags = vocab.get_index_to_token_vocabulary("tags")
        num_tags = len(tags)
        constraints = allowed_transitions(constraint_type="BMES", labels=tags)
        self.projection = torch.nn.Linear(768, num_tags)
        self.crf = ConditionalRandomField(num_tags=num_tags,
                                          constraints=constraints,
                                          include_start_end_transitions=False)
Example #20
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder

        self.last_layer = Linear(400, 64)

        self.bias_outside = torch.nn.Parameter(torch.zeros(1) - 1.,
                                               requires_grad=True)

        self.stds = torch.autograd.Variable(torch.ones(self.num_tags))
        self.sums = np.zeros(self.num_tags) + 10
        self.amount = np.zeros(self.num_tags) + 11
        self.loss = torch.nn.CrossEntropyLoss()

        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(self.num_tags, constraints)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace)

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        initializer(self)

        self.hash = 0
        self.new = True
Example #21
    def __init__(self, num_input_features: int, hidden_size: int, num_layers: int,
                 dropout_gru: float, bidirectional: bool, tags: dict, dropout_FCN: float):
        """
        num_input_features: number of input features
        hidden_size: number of hidden features (outputs also have hidden_size features)
        num_layers: number of stacked GRU layers
        dropout_gru: dropout between GRU layers
        bidirectional: if True, use a bidirectional GRU
        tags: index-to-tag mapping, e.g. {0: 'I', 1: 'B', 2: 'O', 3: '<PAD>'}
        dropout_FCN: dropout probability for the fully connected layers
        """
        super().__init__()
        self.gru = nn.GRU(input_size=num_input_features, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True,
                          dropout=dropout_gru, bidirectional=bidirectional)

        all_transition = allowed_transitions('BIO', tags)
        #self.crf = CRF(num_tags=len(tags), batch_first= True)
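        # NOTE: hidden_size * 2 below assumes bidirectional=True.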
        self.linear = nn.Linear(hidden_size * 2, hidden_size)
        self.BN = nn.BatchNorm1d(num_layers)
        self.linear2 = nn.Linear(hidden_size, len(tags))
        self.BN2 = nn.BatchNorm1d(num_layers)
        self.crf = ConditionalRandomField(len(tags), all_transition)
        self.dropout = nn.Dropout(dropout_FCN)
Example #22
    def __init__(
        self,
        backbone: ModelBackbone,
        labels: List[str],
        label_encoding: Optional[str] = "BIOUL",
        top_k: int = 1,
        dropout: Optional[float] = 0.0,
        feedforward: Optional[FeedForwardConfiguration] = None,
    ) -> None:
        super(TokenClassification, self).__init__(backbone)
        vocabulary.set_labels(self.backbone.vocab, labels)

        self.top_k = top_k
        self.dropout = torch.nn.Dropout(dropout)
        self._feedforward: FeedForward = (
            None if not feedforward else feedforward.input_dim(
                backbone.encoder.get_output_dim()).compile())
        # output layers
        self._classifier_input_dim = (self._feedforward.get_output_dim()
                                      if self._feedforward else
                                      backbone.encoder.get_output_dim())
        # we want this linear applied to each token in the sequence
        self._label_projection_layer = TimeDistributed(
            torch.nn.Linear(self._classifier_input_dim, self.num_labels))
        constraints = allowed_transitions(
            label_encoding,
            vocabulary.get_index_to_labels_dictionary(self.backbone.vocab),
        )
        self._crf = ConditionalRandomField(self.num_labels,
                                           constraints,
                                           include_start_end_transitions=True)

        self.metrics = {"accuracy": CategoricalAccuracy()}
        if self.top_k:
            self.metrics.update({
                f"accuracy_{self.top_k}":
                CategoricalAccuracy(top_k=self.top_k)
            })
        self.f1_metric = SpanBasedF1Measure(
            self.backbone.vocab,
            tag_namespace=vocabulary.LABELS_NAMESPACE,
            label_encoding=label_encoding,
        )

        self.__all_metrics = [self.f1_metric]
        self.__all_metrics.extend(self.metrics.values())
Example #23
    def __init__(self, hparams):
        """
        input:
            hparams: namespace with the following items:
                'data_dir' (str): Data Directory. default: './official/ebm_nlp_1_00'
                'bioelmo_dir' (str): BioELMo Directory. default: './models/bioelmo'
                'max_length' (int): Max Length. default: 1024
                'lr' (float): Learning Rate. default: 1e-2
                'fine_tune_bioelmo' (bool): Whether to Fine Tune BioELMo. default: False
                'lr_bioelmo' (float): Learning Rate in BioELMo Fine-tuning. default: 1e-4
        """
        super().__init__()
        self.hparams = hparams
        self.itol = ID_TO_LABEL
        self.ltoi = {v: k for k, v in self.itol.items()}

        # Load Pretrained BioELMo
        DIR_ELMo = Path(str(self.hparams.bioelmo_dir))
        self.bioelmo = Elmo(DIR_ELMo / 'biomed_elmo_options.json',
                            DIR_ELMo / 'biomed_elmo_weights.hdf5',
                            1,
                            requires_grad=bool(self.hparams.fine_tune_bioelmo),
                            dropout=0)
        self.bioelmo_output_dim = self.bioelmo.get_output_dim()

        # ELMo Padding token (In ELMo token with ID 0 is used for padding)
        VOCAB_FILE_PATH = DIR_ELMo / 'vocab.txt'
        command = shlex.split(f"head -n 1 {VOCAB_FILE_PATH}")
        res = subprocess.Popen(command, stdout=subprocess.PIPE)
        self.bioelmo_pad_token = res.communicate()[0].decode('utf-8').strip()

        # Initialize Intermediate Affine Layer
        self.hidden_to_tag = nn.Linear(int(self.bioelmo_output_dim),
                                       len(self.itol))

        # Initialize CRF
        TRANSITIONS = conditional_random_field.allowed_transitions(
            constraint_type='BIO', labels=self.itol)
        self.crf = conditional_random_field.ConditionalRandomField(
            # set to 7 because here "tags" means ['O', 'B-P', 'I-P', 'B-I', 'I-I', 'B-O', 'I-O']
            # no need to include 'BOS' and 'EOS' in "tags"
            num_tags=len(self.itol),
            constraints=TRANSITIONS,
            include_start_end_transitions=False)
        self.crf.reset_parameters()
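The shlex/subprocess dance above only reads the first line of vocab.txt; a pure-Python equivalent (a sketch, same file layout assumed) would be:

with open(VOCAB_FILE_PATH, encoding='utf-8') as f:
    self.bioelmo_pad_token = f.readline().strip()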
Example #24
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 lang_dim: int,
                 lang_map: Dict[str, int],
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 include_start_end_transitions: bool = True,
                 dropout: float = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        lang_num = len(lang_map)
        self.lang_embedding = Embedding(lang_num, lang_dim)

        self.encoder = encoder
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type or "BIO")

        check_dimensions_match(text_field_embedder.get_output_dim() + lang_dim, encoder.get_input_dim(),
                               "text field embedding dim + lang_dim", "encoder input dim")
        initializer(self)
Example #25
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'ConstrainedConditionalModule':
        hard_constraints = params.pop("hard_constraints", [])
        soft_constraints = params.pop("soft_constraints", {})
        label_namespace = params.pop("label_namespace", "labels")
        sentence_penalty_map_dict = params.pop("sentence_penalty_map", None)
        constrain_crf_decoding = params.pop("constrain_crf_decoding", False)
        label_encoding = params.pop("label_encoding", None)

        sentence_penalty_map = None
        if sentence_penalty_map_dict:
            assert len(sentence_penalty_map_dict) == 1, "multiple sentence constraints not supported"
            tag, penalty = list(sentence_penalty_map_dict.items())[0]
            tag_index = vocab.get_token_index(tag, label_namespace)
            sentence_penalty_map = (tag_index, penalty)

        hard_constraints_to_indices: Dict[str, List[int]] = {}
        for tag in hard_constraints:
            hard_constraints_to_indices[tag] = []
            for label, index in vocab.get_token_to_index_vocabulary(label_namespace).items():
                if re.match(rf"^.*-{tag}", label):
                    hard_constraints_to_indices[tag].append(index)
        soft_constraints = soft_constraints or {}
        soft_constraints_to_indices: Dict[str, Tuple[List[int], float]] = {}
        for tag, penalty in soft_constraints.items():
            indices = []
            for label, index in vocab.get_token_to_index_vocabulary(label_namespace).items():
                if re.match(rf"^.*-{tag}", label):
                    indices.append(index)
            soft_constraints_to_indices[tag] = (indices, penalty)
        num_tags = vocab.get_vocab_size(label_namespace)
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None
        params.assert_empty(cls.__name__)
        return ConstrainedConditionalModule(num_tags, constraints,
                                            hard_constraints_to_indices,
                                            soft_constraints_to_indices,
                                            sentence_penalty_map)
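A hypothetical Params blob exercising the keys popped by from_params above (the tag names "ADR" and "Drug" and the penalty value are made up for illustration):

params = Params({
    "hard_constraints": ["ADR"],
    "soft_constraints": {"Drug": -5.0},
    "label_namespace": "labels",
    "constrain_crf_decoding": True,
    "label_encoding": "BIO",
})
ccm = ConstrainedConditionalModule.from_params(vocab, params)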
Example #26
    def __init__(
        self,
        base_model,
        basic_tokenizer,
        subword_tokenizer,
        model_dir=DEFAULT_MODEL_PATH,
        normalizer=None,
    ):
        """初期化

        非推奨です

        Args:
            base_model (nn.Module): BertCrfモデル
            basic_tokenizer (callable): 単語分割用トークナイザ
            subword_tokenizer (callable): サブワード分割用トークナイザ
            label_vocab (dict): {label:label_idx, ...}
            model_dir (pathlib.Path or str): モデルフォルダのpath.labels.txtとfinal.model
            normalizer (callable): 単語正規化関数
        """
        if not isinstance(model_dir, pathlib.PurePath):
            model_dir = Path(model_dir)

        #self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = "cpu"

        label_vocab = create_label_vocab_from_file(
            str(model_dir / "labels.txt"))
        self.itol = {i: l for l, i in label_vocab.items()}
        constraints = allowed_transitions(
            "BIO", {i: w
                    for w, i in label_vocab.items()})
        self.model = BertCrf(base_model, len(label_vocab), constraints)
        self.model.load_state_dict(
            torch.load(str(model_dir / "final.model"),
                       map_location=self.device))
        self.model.to(self.device)
        self.model.eval()

        self.basic_tokenizer = basic_tokenizer
        self.subword_tokenizer = subword_tokenizer

        self.normalizer = normalizer
Example #27
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 dropout: Optional[float] = 0,
                 label_encoding: Optional[str] = 'BIO',
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        """
        :param vocab: ``Vocabulary``
        :param text_field_embedder: ``TextFieldEmbedder``
        Used to embed the ``question`` and ``passage`` ``TextFields`` we get as input to the model.
        :param dropout:
        :param label_encoding: BIO
        :param initializer:``InitializerApplicator``, optional (default=``InitializerApplicator()``)
        Used to initialize the model parameters.
        :param regularizer:``RegularizerApplicator``, optional (default=``None``)
        :param print_bad_case: 是否将出错的case打印出来
        If provided, will be used to calculate the regularization penalty during training.
        """
        super(BertCrfTaggerModel, self).__init__(vocab, regularizer)
        self._text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size('labels')

        self._labels_predictor = Linear(
            self._text_field_embedder.get_output_dim(), self.num_tags)
        self.dropout = torch.nn.Dropout(dropout)
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace='labels',
                                             label_encoding=label_encoding)
        labels = self.vocab.get_index_to_token_vocabulary('labels')
        constraints = allowed_transitions(label_encoding, labels)
        self.label_to_index = self.vocab.get_token_to_index_vocabulary(
            'labels')
        self.crf = ConditionalRandomField(self.num_tags,
                                          constraints,
                                          include_start_end_transitions=True)
        self.loss = torch.nn.CrossEntropyLoss()
        initializer(self)
Example #28
    def __init__(
        self,
        vocab: Vocabulary,
        mention_feedforward: FeedForward,
        label_namespace: str = "ner_type_labels",
        label_encoding: str = "BIOUL",
        exact_match: bool = False,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super(NERTagger, self).__init__(vocab, regularizer)

        self._vocab = vocab
        self.label_namespace = label_namespace
        self._n_labels = vocab.get_vocab_size(label_namespace)

        self.label_map = self.vocab.get_index_to_token_vocabulary(label_namespace)
        print(self.label_map)

        self._mention_feedforward = TimeDistributed(mention_feedforward)

        self._ner_scorer = TimeDistributed(
            torch.nn.Linear(mention_feedforward.get_output_dim(), self._n_labels)
        )
        constraints = allowed_transitions(
            label_encoding, self.vocab.get_index_to_token_vocabulary(label_namespace)
        )
        self._ner_crf = ConditionalRandomField(
            self._n_labels, constraints, include_start_end_transitions=False
        )

        if exact_match:
            self._ner_metrics = SpanBasedF1Measure(self.label_map, label_encoding=label_encoding)
        else:
            self._ner_metrics = SpanBasedF1MeasureAllennlp(
                vocabulary=vocab, tag_namespace=label_namespace, label_encoding=label_encoding
            )

        initializer(self)
Example #29
    def __init__(self, vocab: Vocabulary, embedding_dim=300, embedder_type=None, bert_trainable=True, **kwargs):
        super().__init__(vocab)
        for k in kwargs:
            self.__setattr__(k, kwargs[k])

        text_field_embedder = get_embeddings(embedder_type, self.vocab, embedding_dim, bert_trainable)
        embedding_dim = text_field_embedder.get_output_dim()

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(embedding_dim, self.num_rnn_units, batch_first=True, bidirectional=True, dropout=self.dropout_rate))

        self.label_namespace = label_namespace = 'ner_bio_labels'
        self.num_tags = self.vocab.get_vocab_size(label_namespace)

        self.text_field_embedder = text_field_embedder
        self.encoder = encoder
        self.dropout = torch.nn.Dropout(self.dropout_rate)

        output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        self.label_encoding = label_encoding = 'BIO'
        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(self.label_encoding, labels)

        self.include_start_end_transitions = True
        self.crf = ConditionalRandomField(
            self.num_tags, constraints,
            include_start_end_transitions=True
        )

        self._f1_metric = SpanBasedF1Measure(self.vocab,
                                             tag_namespace=label_namespace,
                                             label_encoding=label_encoding)
        self._verbose_metrics = False
Example #30
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        label_namespace: str = "labels",
        feedforward: Optional[FeedForward] = None,
        label_encoding: Optional[str] = None,
        include_start_end_transitions: bool = True,
        constrain_crf_decoding: bool = None,
        calculate_span_f1: bool = None,
        dropout: Optional[float] = None,
        verbose_metrics: bool = False,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
        top_k: int = 1,
    ) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self.top_k = top_k
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(
            Linear(output_dim, self.num_tags))

        # If constrain_crf_decoding and calculate_span_f1 are not provided
        # (i.e., they are None), default them to True when label_encoding
        # is given and to False otherwise.
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.label_encoding = label_encoding
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
            self.num_tags,
            constraints,
            include_start_end_transitions=include_start_end_transitions)

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3),
        }
        self.calculate_span_f1 = calculate_span_f1
        if calculate_span_f1:
            if not label_encoding:
                raise ConfigurationError("calculate_span_f1 is True, but "
                                         "no label_encoding was specified.")
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=label_encoding)

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            encoder.get_input_dim(),
            "text field embedding dim",
            "encoder input dim",
        )
        if feedforward is not None:
            check_dimensions_match(
                encoder.get_output_dim(),
                feedforward.get_input_dim(),
                "encoder output dim",
                "feedforward input dim",
            )
        initializer(self)
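
The defaulting logic above means passing only `label_encoding` is enough to opt into both constrained decoding and span F1. A standalone illustration of that rule (the helper name `resolve_defaults` is hypothetical, but the None-handling mirrors the constructor exactly):

def resolve_defaults(label_encoding=None, constrain_crf_decoding=None,
                     calculate_span_f1=None):
    # Both flags default to whether a label_encoding was supplied.
    if constrain_crf_decoding is None:
        constrain_crf_decoding = label_encoding is not None
    if calculate_span_f1 is None:
        calculate_span_f1 = label_encoding is not None
    return constrain_crf_decoding, calculate_span_f1

assert resolve_defaults("BIO") == (True, True)
assert resolve_defaults() == (False, False)
assert resolve_defaults("BIO", constrain_crf_decoding=False) == (False, True)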
Example #31
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 feedforward: Optional[FeedForward] = None,
                 label_encoding: Optional[str] = None,
                 constraint_type: Optional[str] = None,
                 include_start_end_transitions: bool = True,
                 constrain_crf_decoding: bool = None,
                 calculate_span_f1: bool = None,
                 dropout: Optional[float] = None,
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        if constraint_type is not None:
            warnings.warn("'constraint_type' was removed and replaced with "
                          "'label_encoding', 'constrain_crf_decoding', and "
                          "'calculate_span_f1' in version 0.6.1. It will be "
                          "removed in version 0.8.", DeprecationWarning)
            label_encoding = constraint_type

        # If constrain_crf_decoding and calculate_span_f1 are not provided
        # (i.e., they're None), default them to True when label_encoding is
        # provided and to False when it isn't.
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.label_encoding = label_encoding
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.metrics = {
                "accuracy": CategoricalAccuracy(),
                "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.calculate_span_f1 = calculate_span_f1
        if calculate_span_f1:
            if not label_encoding:
                raise ConfigurationError("calculate_span_f1 is True, but "
                                         "no label_encoding was specified.")
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=label_encoding)
        elif constraint_type is not None:
            # Maintain deprecated behavior if constraint_type is provided
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=constraint_type)

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                                   "encoder output dim", "feedforward input dim")
        initializer(self)
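
Per the deprecation warning above, the single `constraint_type` argument was split into three. A migration sketch (the helper name is hypothetical; the rewrite matches what the shim above does implicitly, since the None defaults then resolve both flags to True):

def migrate_crf_tagger_kwargs(kwargs: dict) -> dict:
    # Rewrite the deprecated 'constraint_type' argument into the 0.6.1+ trio.
    kwargs = dict(kwargs)
    constraint_type = kwargs.pop("constraint_type", None)
    if constraint_type is not None:
        kwargs.setdefault("label_encoding", constraint_type)
        kwargs.setdefault("constrain_crf_decoding", True)
        kwargs.setdefault("calculate_span_f1", True)
    return kwargs

assert migrate_crf_tagger_kwargs({"constraint_type": "BIOUL"}) == {
    "label_encoding": "BIOUL",
    "constrain_crf_decoding": True,
    "calculate_span_f1": True,
}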
Example #32
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 relation_scorer: RelationScorer,
                 ner_tag_namespace: str = 'tags',
                 evaluated_ner_labels: List[str] = None,
                 re_loss_weight: float = 1.0,
                 ner_tag_embedder: TokenEmbedder = None,
                 use_aux_ner_labels: bool = False,
                 aux_coarse_namespace: str = 'coarse_tags',
                 aux_modifier_namespace: str = 'modifier_tags',
                 aux_loss_weight: float = 1.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab=vocab, regularizer=regularizer)

        self.text_field_embedder = text_field_embedder
        self.encoder = encoder

        # NER subtask 2
        self._ner_label_encoding = 'BIO'
        self._ner_tag_namespace = ner_tag_namespace
        ner_input_dim = self.encoder.get_output_dim()
        num_ner_tags = self.vocab.get_vocab_size(ner_tag_namespace)
        self.tag_projection_layer = TimeDistributed(
            Linear(ner_input_dim, num_ner_tags))

        self._use_aux_ner_labels = use_aux_ner_labels
        if self._use_aux_ner_labels:
            self._coarse_tag_namespace = aux_coarse_namespace
            self._num_coarse_tags = self.vocab.get_vocab_size(
                self._coarse_tag_namespace)
            self._coarse_projection_layer = TimeDistributed(
                Linear(ner_input_dim, self._num_coarse_tags))
            self._modifier_tag_namespace = aux_modifier_namespace
            self._num_modifier_tags = self.vocab.get_vocab_size(
                self._modifier_tag_namespace)
            self._modifier_projection_layer = TimeDistributed(
                Linear(ner_input_dim, self._num_modifier_tags))
            self._coarse_acc = CategoricalAccuracy()
            self._modifier_acc = CategoricalAccuracy()
            self._aux_loss_weight = aux_loss_weight

        self.ner_accuracy = CategoricalAccuracy()
        if evaluated_ner_labels is None:
            ignored_classes = None
        else:
            assert self._ner_label_encoding == 'BIO', 'expected BIO encoding'
            all_ner_tags = self.vocab.get_token_to_index_vocabulary(
                ner_tag_namespace).keys()
            ner_tag_classes = {bio_tag[2:] for bio_tag in all_ner_tags
                               if len(bio_tag) > 2}
            ignored_classes = list(
                ner_tag_classes.difference(evaluated_ner_labels))
        self.ner_f1 = SpanBasedF1Measure(
            vocabulary=vocab,
            tag_namespace=ner_tag_namespace,
            label_encoding=self._ner_label_encoding,
            ignore_classes=ignored_classes)

        # Use constrained crf decoding with the BIO labeling scheme
        ner_labels = self.vocab.get_index_to_token_vocabulary(
            ner_tag_namespace)
        constraints = allowed_transitions(self._ner_label_encoding, ner_labels)

        self.crf = ConditionalRandomField(num_ner_tags,
                                          constraints,
                                          include_start_end_transitions=True)

        # RE subtask 3
        self.ner_tag_embedder = ner_tag_embedder
        self.relation_scorer = relation_scorer
        self._re_loss_weight = re_loss_weight

        initializer(self)
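
To make the `ignored_classes` computation above concrete, a minimal worked example with a hypothetical label set (only classes that survive the set difference are scored by SpanBasedF1Measure):

all_ner_tags = ['O', 'B-Task', 'I-Task', 'B-Material', 'I-Material']
evaluated_ner_labels = ['Task']

ner_tag_classes = {tag[2:] for tag in all_ner_tags if len(tag) > 2}
# -> {'Task', 'Material'}
ignored_classes = list(ner_tag_classes.difference(evaluated_ner_labels))
# -> ['Material']: Material spans are excluded from the span F1.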
Example #33
    def test_allowed_transitions(self):
        # pylint: disable=bad-whitespace,bad-continuation
        bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y'] # start tag, end tag
        #              0     1      2      3      4         5          6
        allowed = allowed_transitions("BIO", dict(enumerate(bio_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                         # Extra column for end tag.
            (0, 0), (0, 1),         (0, 3),              (0, 6),
            (1, 0), (1, 1), (1, 2), (1, 3),              (1, 6),
            (2, 0), (2, 1), (2, 2), (2, 3),              (2, 6),
            (3, 0), (3, 1),         (3, 3), (3, 4),      (3, 6),
            (4, 0), (4, 1),         (4, 3), (4, 4),      (4, 6),
            (5, 0), (5, 1),         (5, 3)                      # Extra row for start tag
        }

        bioul_labels = ['O', 'B-X', 'I-X', 'L-X', 'U-X', 'B-Y', 'I-Y', 'L-Y', 'U-Y'] # start tag, end tag
        #                0     1      2      3      4      5      6      7      8          9        10
        allowed = allowed_transitions("BIOUL", dict(enumerate(bioul_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                                                   # Extra column for end tag.
            (0, 0), (0, 1),                 (0, 4), (0, 5),                 (0, 8),       (0, 10),
                            (1, 2), (1, 3),
                            (2, 2), (2, 3),
            (3, 0), (3, 1),                 (3, 4), (3, 5),                 (3, 8),       (3, 10),
            (4, 0), (4, 1),                 (4, 4), (4, 5),                 (4, 8),       (4, 10),
                                                            (5, 6), (5, 7),
                                                            (6, 6), (6, 7),
            (7, 0), (7, 1),                 (7, 4), (7, 5),                 (7, 8),       (7, 10),
            (8, 0), (8, 1),                 (8, 4), (8, 5),                 (8, 8),       (8, 10),
            # Extra row for start tag.
            (9, 0), (9, 1),                 (9, 4), (9, 5),                 (9, 8)
        }

        iob1_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y'] # start tag, end tag
        #              0     1      2      3      4         5          6
        allowed = allowed_transitions("IOB1", dict(enumerate(iob1_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                            # Extra column for end tag.
            (0, 0),         (0, 2),         (0, 4),         (0, 6),
            (1, 0), (1, 1), (1, 2),         (1, 4),         (1, 6),
            (2, 0), (2, 1), (2, 2),         (2, 4),         (2, 6),
            (3, 0),         (3, 2), (3, 3), (3, 4),         (3, 6),
            (4, 0),         (4, 2), (4, 3), (4, 4),         (4, 6),
            (5, 0),         (5, 2),         (5, 4),                # Extra row for start tag
        }
        with raises(ConfigurationError):
            allowed_transitions("allennlp", {})

        bmes_labels = ['B-X', 'M-X', 'E-X', 'S-X', 'B-Y', 'M-Y', 'E-Y', 'S-Y'] # start tag, end tag
        #               0      1      2      3      4      5      6      7       8          9
        allowed = allowed_transitions("BMES", dict(enumerate(bmes_labels)))
        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
                    (0, 1), (0, 2),
                    (1, 1), (1, 2),                                         # Extra column for end tag.
            (2, 0),                 (2, 3), (2, 4),                 (2, 7), (2, 9),
            (3, 0),                 (3, 3), (3, 4),                 (3, 7), (3, 9),
                                                    (4, 5), (4, 6),
                                                    (5, 5), (5, 6),
            (6, 0),                 (6, 3), (6, 4),                 (6, 7), (6, 9),
            (7, 0),                 (7, 3), (7, 4),                 (7, 7), (7, 9),
            (8, 0),                 (8, 3), (8, 4),                 (8, 7),  # Extra row for start tag
        }
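
The "extra row/column" comments in these matrices refer to the start and end pseudo-tags, which `allowed_transitions` appends after the real labels. A quick standalone check of the indexing (assuming the semantics exercised by the tests above):

labels = {0: 'O', 1: 'B-X', 2: 'I-X'}
pairs = allowed_transitions("BIO", labels)

start_index, end_index = len(labels), len(labels) + 1  # here 3 and 4
assert (start_index, 1) in pairs      # START -> B-X is allowed
assert (start_index, 2) not in pairs  # START -> I-X is not (no dangling I-)
assert (2, end_index) in pairs        # in BIO, any tag may precede END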
Example #34
def default_crf() -> ConditionalRandomField:
    include_start_end_transitions = True
    constraints = allowed_transitions('BIO', {0: 'O', 1: 'B', 2: 'I'})
    return ConditionalRandomField(3, constraints,
                                  include_start_end_transitions)
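
A minimal usage sketch for the CRF returned above (tensor shapes and the mask dtype are assumptions; older AllenNLP versions accept a long mask, newer ones expect a bool mask):

import torch

crf = default_crf()

logits = torch.randn(2, 6, 3)              # (batch, seq_len, num_tags)
tags = torch.randint(0, 3, (2, 6))         # gold tag indices
mask = torch.ones(2, 6, dtype=torch.long)  # 1 = real token, 0 = padding

log_likelihood = crf(logits, tags, mask)   # training loss is -log_likelihood
best_paths = crf.viterbi_tags(logits, mask)
# best_paths is a list of (tag_sequence, viterbi_score) pairs, one per batch row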
Example #35
    }

    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items() if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items() if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items() if idx in range(16, 24)
    }

    task_constraints = allowed_transitions(constraint_type="BIOUL",
                                           labels=task_idx2classnames)
    process_constraints = allowed_transitions(constraint_type="BIOUL",
                                              labels=process_idx2classnames)
    material_constraints = allowed_transitions(constraint_type="BIOUL",
                                               labels=material_idx2classnames)

    embedder = VanillaEmbedder(embedding=embedding,
                               embedding_dim=EMBEDDING_DIMENSION)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(embedding=char_embedding,
                                        embedding_dim=CHAR_EMBEDDING_DIMENSION)
        char_encoder = CharLSTMEncoder(
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            char_embedder=char_embedder,
            bidirectional=True,