def test_allowed_transitions(self):
        # pylint: disable=bad-whitespace,bad-continuation
        bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y']
        #              0     1      2      3      4
        allowed = allowed_transitions("BIO", dict(enumerate(bio_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
            (0, 0), (0, 1),         (0, 3),
            (1, 0), (1, 1), (1, 2), (1, 3),
            (2, 0), (2, 1), (2, 2), (2, 3),
            (3, 0), (3, 1),         (3, 3), (3, 4),
            (4, 0), (4, 1),         (4, 3), (4, 4)
        }

        bioul_labels = ['O', 'B-X', 'I-X', 'L-X', 'U-X', 'B-Y', 'I-Y', 'L-Y', 'U-Y']
        #                0     1      2      3      4      5      6      7      8
        allowed = allowed_transitions("BIOUL", dict(enumerate(bioul_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
            (0, 0), (0, 1),                 (0, 4), (0, 5),                 (0, 8),
                            (1, 2), (1, 3),
                            (2, 2), (2, 3),
            (3, 0), (3, 1),                 (3, 4), (3, 5),                 (3, 8),
            (4, 0), (4, 1),                 (4, 4), (4, 5),                 (4, 8),
                                                            (5, 6), (5, 7),
                                                            (6, 6), (6, 7),
            (7, 0), (7, 1),                 (7, 4), (7, 5),                 (7, 8),
            (8, 0), (8, 1),                 (8, 4), (8, 5),                 (8, 8)
        }

        with raises(ConfigurationError):
            allowed_transitions("allennlp", {})
    def test_allowed_transitions(self):
        # pylint: disable=bad-whitespace,bad-continuation
        bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y']
        #              0     1      2      3      4
        allowed = allowed_transitions("BIO", dict(enumerate(bio_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
            (0, 0), (0, 1),         (0, 3),
            (1, 0), (1, 1), (1, 2), (1, 3),
            (2, 0), (2, 1), (2, 2), (2, 3),
            (3, 0), (3, 1),         (3, 3), (3, 4),
            (4, 0), (4, 1),         (4, 3), (4, 4)
        }

        bioul_labels = ['O', 'B-X', 'I-X', 'L-X', 'U-X', 'B-Y', 'I-Y', 'L-Y', 'U-Y']
        #                0     1      2      3      4      5      6      7      8
        allowed = allowed_transitions("BIOUL", dict(enumerate(bioul_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
            (0, 0), (0, 1),                 (0, 4), (0, 5),                 (0, 8),
                            (1, 2), (1, 3),
                            (2, 2), (2, 3),
            (3, 0), (3, 1),                 (3, 4), (3, 5),                 (3, 8),
            (4, 0), (4, 1),                 (4, 4), (4, 5),                 (4, 8),
                                                            (5, 6), (5, 7),
                                                            (6, 6), (6, 7),
            (7, 0), (7, 1),                 (7, 4), (7, 5),                 (7, 8),
            (8, 0), (8, 1),                 (8, 4), (8, 5),                 (8, 8)
        }

        with raises(ConfigurationError):
            allowed_transitions("allennlp", {})
Example #3
    def test_allowed_transitions(self):
        # pylint: disable=bad-whitespace,bad-continuation
        bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y'] # start tag, end tag
        #              0     1      2      3      4         5          6
        allowed = allowed_transitions("BIO", dict(enumerate(bio_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                         # Extra column for end tag.
            (0, 0), (0, 1),         (0, 3),              (0, 6),
            (1, 0), (1, 1), (1, 2), (1, 3),              (1, 6),
            (2, 0), (2, 1), (2, 2), (2, 3),              (2, 6),
            (3, 0), (3, 1),         (3, 3), (3, 4),      (3, 6),
            (4, 0), (4, 1),         (4, 3), (4, 4),      (4, 6),
            (5, 0), (5, 1),         (5, 3)                      # Extra row for start tag
        }

        bioul_labels = ['O', 'B-X', 'I-X', 'L-X', 'U-X', 'B-Y', 'I-Y', 'L-Y', 'U-Y'] # start tag, end tag
        #                0     1      2      3      4      5      6      7      8          9        10
        allowed = allowed_transitions("BIOUL", dict(enumerate(bioul_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                                                   # Extra column for end tag.
            (0, 0), (0, 1),                 (0, 4), (0, 5),                 (0, 8),       (0, 10),
                            (1, 2), (1, 3),
                            (2, 2), (2, 3),
            (3, 0), (3, 1),                 (3, 4), (3, 5),                 (3, 8),       (3, 10),
            (4, 0), (4, 1),                 (4, 4), (4, 5),                 (4, 8),       (4, 10),
                                                            (5, 6), (5, 7),
                                                            (6, 6), (6, 7),
            (7, 0), (7, 1),                 (7, 4), (7, 5),                 (7, 8),       (7, 10),
            (8, 0), (8, 1),                 (8, 4), (8, 5),                 (8, 8),       (8, 10),
            # Extra row for start tag.
            (9, 0), (9, 1),                 (9, 4), (9, 5),                 (9, 8)
        }

        iob1_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y'] # start tag, end tag
        #              0     1      2      3      4         5          6
        allowed = allowed_transitions("IOB1", dict(enumerate(iob1_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                            # Extra column for end tag.
            (0, 0),         (0, 2),         (0, 4),         (0, 6),
            (1, 0), (1, 1), (1, 2),         (1, 4),         (1, 6),
            (2, 0), (2, 1), (2, 2),         (2, 4),         (2, 6),
            (3, 0),         (3, 2), (3, 3), (3, 4),         (3, 6),
            (4, 0),         (4, 2), (4, 3), (4, 4),         (4, 6),
            (5, 0),         (5, 2),         (5, 4),                # Extra row for start tag
        }
        with raises(ConfigurationError):
            allowed_transitions("allennlp", {})
Example #4
    def populate_dd_vars(self, vocab):  #NOTE

        # Set up allowed transitions
        all_labels = vocab.get_index_to_token_vocabulary(namespace="labels")
        num_labels = len(all_labels)
        constraints = allowed_transitions("BIOUL", all_labels)
        # One extra row/column for the start tag, plus another for the end
        # tag when EOS transitions are modelled explicitly.
        size = num_labels + 1 + int(self.include_eos)
        self.transition_matrix = torch.zeros([size, size])

        for c in constraints:
            # Without an explicit EOS, drop transitions involving the end tag.
            if self.include_eos or (c[0] <= num_labels and c[1] <= num_labels):
                self.transition_matrix[c[0], c[1]] = 1.0

        self.nconstraints = size

        aggregate_type = self.config.get("aggregate_type", "max")
        if aggregate_type.lower() not in ("sum", "max"):
            raise ValueError(f"Invalid transition aggregation type: {aggregate_type}")
        if aggregate_type.lower() == "sum":
            self.aggregate_mat = lambda x: x.sum(dim=3)
        elif aggregate_type.lower() == "max":
            self.aggregate_mat = lambda x: x.max(dim=3)[0]
Example #5
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 feedforward: Optional[FeedForward] = None,
                 label_encoding: Optional[str] = None,
                 include_start_end_transitions: bool = True,
                 constrain_crf_decoding: bool = None,
                 calculate_span_f1: bool = None,
                 dropout: Optional[float] = None,
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:

        super().__init__(vocab, text_field_embedder, encoder,
                         label_namespace, feedforward, label_encoding,
                         include_start_end_transitions, constrain_crf_decoding, calculate_span_f1,
                         dropout, verbose_metrics, initializer, regularizer)

        # Get the kwargs needed to initialize the WISER CRF. We skip some
        # configuration checks because they are already performed in the
        # super constructor.
        if constrain_crf_decoding:
            labels = self.vocab.get_index_to_token_vocabulary(
                self.label_namespace)
            constraints = allowed_transitions(self.label_encoding, labels)
        else:
            constraints = None

        # Replaces the CRF created by the super constructor with the WISER CRF
        self.crf = WiserConditionalRandomField(
            self.num_tags, constraints,
            include_start_end_transitions=include_start_end_transitions
        )
Example #6
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        label_namespace: str = "labels",
        feature_namespace: str = None,
        feature_encoder: Seq2VecEncoder = None,
        label_encoding: Optional[str] = None,
        include_start_end_transitions: bool = True,
        constrain_crf_decoding: bool = None,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes))

        if feature_namespace:
            self.feature_encoder = feature_encoder
            self.feat_classification_layer = Linear(
                self.feature_encoder.get_output_dim(),
                self.vocab.get_vocab_size(feature_namespace))
            # print("num_features:", self.vocab.get_vocab_size(feature_namespace))

        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
            self.num_classes,
            constraints,
            include_start_end_transitions=include_start_end_transitions)

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            encoder.get_input_dim(),
            "text field embedding dim",
            "encoder input dim",
        )

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3),
        }

        self._f1_metric = None

        initializer(self)
Example #7
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(self.num_tags, constraints)

        self.span_metric = SpanBasedF1Measure(vocab, tag_namespace=label_namespace)

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        initializer(self)
Example #8
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(self.num_tags, constraints)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type
                                              or "BIO")

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        initializer(self)
Example #9
def get_viterbi_pairwise_potentials(vocab, label_encoding):
    """
    Generate a matrix of pairwise transition potentials for the BIO labels.
    The only constraint implemented here is that I-XXX labels must be preceded
    by either an identical I-XXX tag or a B-XXX tag. In order to achieve this
    constraint, pairs of labels which do not satisfy this constraint have a
    pairwise potential of -inf.

    Returns
    -------
    transition_matrix : torch.Tensor
        A (num_labels, num_labels) matrix of pairwise potentials.
    """
    all_labels = vocab.get_index_to_token_vocabulary("labels")
    num_labels = len(all_labels)
    transition_matrix = torch.full([num_labels + 2, num_labels + 2],
                                   float("-inf"))

    constraints = allowed_transitions(label_encoding, all_labels)
    for c in constraints:
        transition_matrix[c[0], c[1]] = 0.0
    return transition_matrix
Example #10
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 dropout: Optional[float] = 0,
                 label_encoding: Optional[str] = 'BIO',
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:

        super(CharBertCrfModel, self).__init__(vocab, regularizer)
        self._text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size('labels')

        self._labels_predictor = Linear(
            self._text_field_embedder.get_output_dim(), self.num_tags)
        self.dropout = torch.nn.Dropout(dropout)
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace='labels',
                                             label_encoding=label_encoding)
        labels = self.vocab.get_index_to_token_vocabulary('labels')
        constraints = allowed_transitions(label_encoding, labels)
        self.label_to_index = self.vocab.get_token_to_index_vocabulary(
            'labels')
        self.crf = ConditionalRandomField(self.num_tags,
                                          constraints,
                                          include_start_end_transitions=False)
        # self.loss = torch.nn.CrossEntropyLoss()
        initializer(self)
Example #11
    def __init__(
        self,
        rnn2seqencoder: Lstm2SeqEncoder,
        encoding_dim: int,
        datasets_manager: DatasetsManager,
        device: torch.device = torch.device("cpu"),
        namespace_to_constraints: Dict[str, List[Tuple[int, int]]] = None,
        tagging_type=None,
        include_start_end_transitions: bool = True,
    ):
        """

        Parameters
        ----------
        rnn2seqencoder : Lstm2SeqEncoder
            Lstm2SeqEncoder that encodes a set of instances to a sequence of hidden states
        encoding_dim : int
            Hidden dimension of the lstm2seq encoder
        namespace_to_constraints: Dict[str, List[Tuple[int, int]]]
            A set of constraints that are valid transitions
        include_start_end_transitions: bool
            Whether to include start and end transitions
        """
        super(RnnSeqCrfTagger, self).__init__()
        self.rnn2seqencoder = rnn2seqencoder
        self.encoding_dim = encoding_dim
        self.datasets_manager = datasets_manager

        self.label_namespaces = datasets_manager.label_namespaces
        self.device = device
        self.tagging_type = tagging_type
        self.crfs = nn.ModuleDict()
        self.linear_clfs = nn.ModuleDict()
        self.include_start_end_transitions = include_start_end_transitions

        if namespace_to_constraints is None and self.tagging_type is not None:
            namespace_to_constraints = defaultdict(list)
            for namespace in self.label_namespaces:
                idx2label_mapping = self.datasets_manager.get_idx_label_mapping(
                    label_namespace=namespace
                )
                transitions_allowed = allowed_transitions(
                    constraint_type=self.tagging_type, labels=idx2label_mapping
                )
                namespace_to_constraints[namespace] = transitions_allowed
        elif namespace_to_constraints is None:
            # A bare ``else`` here would silently discard constraints passed
            # in by the caller; only default to empty lists when none given.
            namespace_to_constraints = defaultdict(list)

        self.namespace_to_constraints = namespace_to_constraints
        for namespace in self.label_namespaces:
            num_labels = self.datasets_manager.num_labels[namespace]
            crf = CRF(
                num_tags=num_labels,
                constraints=self.namespace_to_constraints.get(namespace),
                include_start_end_transitions=self.include_start_end_transitions,
            )  # we do not add start and end tags to our labels
            clf = nn.Linear(self.encoding_dim, num_labels)
            self.crfs.update({namespace: crf})
            self.linear_clfs.update({namespace: clf})
Example #12
    def __init__(self,
                 vocab,
                 text_field_embedder,
                 encoder,
                 label_namespace=u"labels",
                 constraint_type=None,
                 feedforward=None,
                 include_start_end_transitions=True,
                 dropout=None,
                 verbose_metrics=False,
                 initializer=InitializerApplicator(),
                 regularizer=None):
        super(CrfTagger, self).__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(
            Linear(output_dim, self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
            self.num_tags,
            constraints,
            include_start_end_transitions=include_start_end_transitions)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type
                                              or u"BIO")

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               encoder.get_input_dim(),
                               u"text field embedding dim",
                               u"encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(),
                                   feedforward.get_input_dim(),
                                   u"encoder output dim",
                                   u"feedforward input dim")
        initializer(self)
Example #13
    def __init__(self, 
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True) -> None:
        super().__init__(vocab)

        if bert_embedder:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens": Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn = Seq2SeqEncoder.from_params(Params({     
                "type": "lstm",
                "input_size": 1024,
                "hidden_size": 512,
                "bidirectional": True,
                "batch_first": True
            }))

        self.encoder = encoder

        if encoder:
            hidden2tag_in_dim = encoder.get_output_dim()
        else:
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        self.hidden2tag = TimeDistributed(torch.nn.Linear(
            in_features=hidden2tag_in_dim,
            out_features=vocab.get_vocab_size("labels")))
        
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        
        self.use_crf = use_crf
        if use_crf:
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels")
            )
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True
            )
        
        self.f1 = SpanBasedF1Measure(vocab, 
                                     tag_namespace="labels",
                                     ignore_classes=["news/type","negation",
                                                     "demonstrative_reference",
                                                     "timer/noun","timer/attributes"],
                                     label_encoding="BIO")
Example #14
    def __init__(
        self,
        task: str,
        vocab: Vocabulary,
        input_dim: int,
        loss_weight: float = 1.0,
        label_encoding: Optional[str] = 'BIO',
        include_start_end_transitions: bool = True,
        constrain_crf_decoding: bool = True,
        calculate_span_f1: bool = None,
        verbose_metrics: bool = False,
        metric: str = 'span_f1',
        top_k: int = 1,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)

        self.task = task
        self.input_dim = input_dim
        self.loss_weight = loss_weight
        self.num_tags = self.vocab.get_vocab_size(task)
        self.top_k = top_k
        self._verbose_metrics = verbose_metrics

        self.tag_projection_layer = TimeDistributed(
            Linear(input_dim, self.num_tags))

        # If constrain_crf_decoding and calculate_span_f1 are not provided
        # (i.e., they are None), default them to True when label_encoding
        # is given and to False otherwise.
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.label_encoding = label_encoding
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError(
                    "constrain_crf_decoding is True, but no label_encoding was specified."
                )
            labels = self.vocab.get_index_to_token_vocabulary(task)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
            self.num_tags,
            constraints,
            include_start_end_transitions=include_start_end_transitions)
        self.metrics = {
            "span_f1":
            SpanBasedF1Measure(self.vocab,
                               tag_namespace=self.task,
                               label_encoding="BIO")
        }
Example #15
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 feedforward: FeedForward = None,
                 include_start_end_transitions: bool = True,
                 dropout: float = None,
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type or "BIO")


        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                                   "encoder output dim", "feedforward input dim")
        initializer(self)
Example #16
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 binary_feature_dim: int,
                 embedding_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 label_smoothing: float = None,
                 label_namespace: str = "labels",
                 ignore_span_metric: bool = False,
                 label_encoding: Optional[str] = 'BIO',
                 include_start_end_transitions: bool = True,
                 constrain_crf_decoding: bool = True) -> None:
        super(OieLabelerCRF, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")

        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace="labels",
                                              ignore_classes=["V"])
        self.label_namespace = label_namespace
        self.encoder = encoder
        # There are exactly 2 binary features for the verb predicate embedding.
        self.binary_feature_embedding = Embedding(2, binary_feature_dim)
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes))
        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        self.include_start_end_transitions = include_start_end_transitions
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if constrain_crf_decoding:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            print(labels)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None
        self.crf = ConditionalRandomField(
            self.num_classes,
            constraints,
            include_start_end_transitions=include_start_end_transitions)

        check_dimensions_match(
            text_field_embedder.get_output_dim() + binary_feature_dim,
            encoder.get_input_dim(),
            "text embedding dim + verb indicator embedding dim",
            "encoder input dim")
        initializer(self)
Example #17
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 feedforward: Optional[FeedForward] = None,
                 dropout: Optional[float] = None,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:

        super().__init__(vocab, regularizer)
        label_namespace = 'labels'
        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder

        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None

        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        # Unlike CrfTagger, this model hard-codes the label encoding and
        # always constrains decoding.
        self.label_encoding = 'BIOUL'
        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(self.label_encoding, labels)

        self.include_start_end_transitions = True
        self.crf = ConditionalRandomField(
            self.num_tags, constraints,
            include_start_end_transitions=self.include_start_end_transitions
        )

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }

        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace=label_namespace,
                                             label_encoding=self.label_encoding)
Example #18
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        clauses_encoder: Seq2VecEncoder,
        outer_encoder: Seq2SeqEncoder,
        label_namespace: str = "labels",
        constraint_type: str = None,
        include_start_end_transitions: bool = True,
        dropout: float = None,
        loss_weights: Optional[List] = [],
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super(JCC, self).__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.clauses_encoder = clauses_encoder
        self.outer_encoder = outer_encoder
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self.label_projection_layer = TimeDistributed(
            Linear(outer_encoder.get_output_dim(), self.num_tags))

        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(constraint_type, labels)
        self.crf = ConditionalRandomField(
            self.num_tags,
            constraints,
            include_start_end_transitions=include_start_end_transitions,
        )
        self.metrics = {"accuracy": Accuracy()}

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            clauses_encoder.get_input_dim(),
            "text field embedding dim",
            "clauses encoder input dim",
        )
        check_dimensions_match(
            clauses_encoder.get_output_dim(),
            outer_encoder.get_input_dim(),
            "clauses encoder output dim",
            "outer encoder input dim",
        )
        initializer(self)
Example #19
    def __init__(self, model_path, vocab: Vocabulary):
        super().__init__(vocab)
        self.pretrained_tokenizer = BertForPreTraining.from_pretrained(
            model_path)
        config = BertConfig.from_pretrained(model_path)
        bert_model = BertForPreTraining(config)
        self.bert = bert_model.bert
        tags = vocab.get_index_to_token_vocabulary("tags")
        num_tags = len(tags)
        constraints = allowed_transitions(constraint_type="BMES", labels=tags)
        self.projection = torch.nn.Linear(768, num_tags)
        self.crf = ConditionalRandomField(num_tags=num_tags,
                                          constraints=constraints,
                                          include_start_end_transitions=False)
Example #20
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder

        self.last_layer = Linear(400, 64)

        self.bias_outside = torch.nn.Parameter(torch.zeros(1) - 1.,
                                               requires_grad=True)

        self.stds = torch.autograd.Variable(torch.ones(self.num_tags))
        self.sums = np.zeros(self.num_tags) + 10
        self.amount = np.zeros(self.num_tags) + 11
        self.loss = torch.nn.CrossEntropyLoss()

        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(self.num_tags, constraints)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace)

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        initializer(self)

        self.hash = 0
        self.new = True
Example #21
    def __init__(self, num_input_features: int, hidden_size: int, num_layers: int,
                 dropout_gru: float, bidirectional: bool, tags: dict, dropout_FCN: float):
        """
        num_input_features: number of input features
        hidden_size: number of hidden features (outputs also have hidden_size features)
        num_layers: number of stacked GRU layers
        dropout_gru: dropout between GRU layers
        bidirectional: if True, use a bidirectional GRU
        tags: index-to-tag mapping, e.g. {0: 'I', 1: 'B', 2: 'O', 3: '<PAD>'}
        dropout_FCN: dropout probability for the fully connected layers
        """
        super().__init__()
        self.gru = nn.GRU(input_size=num_input_features, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True,
                          dropout=dropout_gru, bidirectional=bidirectional)

        all_transition = allowed_transitions('BIO', tags)
        #self.crf = CRF(num_tags=len(tags), batch_first= True)
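        # NOTE: hidden_size * 2 below assumes bidirectional=True.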
        self.linear = nn.Linear(hidden_size * 2, hidden_size)
        self.BN = nn.BatchNorm1d(num_layers)
        self.linear2 = nn.Linear(hidden_size, len(tags))
        self.BN2 = nn.BatchNorm1d(num_layers)
        self.crf = ConditionalRandomField(len(tags), all_transition)
        self.dropout = nn.Dropout(dropout_FCN)
Example #22
    def __init__(
        self,
        backbone: ModelBackbone,
        labels: List[str],
        label_encoding: Optional[str] = "BIOUL",
        top_k: int = 1,
        dropout: Optional[float] = 0.0,
        feedforward: Optional[FeedForwardConfiguration] = None,
    ) -> None:
        super(TokenClassification, self).__init__(backbone)
        vocabulary.set_labels(self.backbone.vocab, labels)

        self.top_k = top_k
        self.dropout = torch.nn.Dropout(dropout)
        self._feedforward: FeedForward = (
            None if not feedforward else feedforward.input_dim(
                backbone.encoder.get_output_dim()).compile())
        # output layers
        self._classifier_input_dim = (self._feedforward.get_output_dim()
                                      if self._feedforward else
                                      backbone.encoder.get_output_dim())
        # we want this linear applied to each token in the sequence
        self._label_projection_layer = TimeDistributed(
            torch.nn.Linear(self._classifier_input_dim, self.num_labels))
        constraints = allowed_transitions(
            label_encoding,
            vocabulary.get_index_to_labels_dictionary(self.backbone.vocab),
        )
        self._crf = ConditionalRandomField(self.num_labels,
                                           constraints,
                                           include_start_end_transitions=True)

        self.metrics = {"accuracy": CategoricalAccuracy()}
        if self.top_k:
            self.metrics.update({
                f"accuracy_{self.top_k}":
                CategoricalAccuracy(top_k=self.top_k)
            })
        self.f1_metric = SpanBasedF1Measure(
            self.backbone.vocab,
            tag_namespace=vocabulary.LABELS_NAMESPACE,
            label_encoding=label_encoding,
        )

        self.__all_metrics = [self.f1_metric]
        self.__all_metrics.extend(self.metrics.values())
Example #23
    def __init__(self, hparams):
        """
        input:
            hparams: namespace with the following items:
                'data_dir' (str): Data Directory. default: './official/ebm_nlp_1_00'
                'bioelmo_dir' (str): BioELMo Directory. default: './models/bioelmo'
                'max_length' (int): Max Length. default: 1024
                'lr' (float): Learning Rate. default: 1e-2
                'fine_tune_bioelmo' (bool): Whether to Fine Tune BioELMo. default: False
                'lr_bioelmo' (float): Learning Rate in BioELMo Fine-tuning. default: 1e-4
        """
        super().__init__()
        self.hparams = hparams
        self.itol = ID_TO_LABEL
        self.ltoi = {v: k for k, v in self.itol.items()}

        # Load Pretrained BioELMo
        DIR_ELMo = Path(str(self.hparams.bioelmo_dir))
        self.bioelmo = Elmo(DIR_ELMo / 'biomed_elmo_options.json',
                            DIR_ELMo / 'biomed_elmo_weights.hdf5',
                            1,
                            requires_grad=bool(self.hparams.fine_tune_bioelmo),
                            dropout=0)
        self.bioelmo_output_dim = self.bioelmo.get_output_dim()

        # ELMo Padding token (In ELMo token with ID 0 is used for padding)
        VOCAB_FILE_PATH = DIR_ELMo / 'vocab.txt'
        command = shlex.split(f"head -n 1 {VOCAB_FILE_PATH}")
        res = subprocess.Popen(command, stdout=subprocess.PIPE)
        self.bioelmo_pad_token = res.communicate()[0].decode('utf-8').strip()

        # Initialize Intermediate Affine Layer
        self.hidden_to_tag = nn.Linear(int(self.bioelmo_output_dim),
                                       len(self.itol))

        # Initialize CRF
        TRANSITIONS = conditional_random_field.allowed_transitions(
            constraint_type='BIO', labels=self.itol)
        self.crf = conditional_random_field.ConditionalRandomField(
            # set to 7 because here "tags" means ['O', 'B-P', 'I-P', 'B-I', 'I-I', 'B-O', 'I-O']
            # no need to include 'BOS' and 'EOS' in "tags"
            num_tags=len(self.itol),
            constraints=TRANSITIONS,
            include_start_end_transitions=False)
        self.crf.reset_parameters()
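The shlex/subprocess dance above only reads the first line of vocab.txt; a pure-Python equivalent (a sketch, same file layout assumed) would be:

with open(VOCAB_FILE_PATH, encoding='utf-8') as f:
    self.bioelmo_pad_token = f.readline().strip()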
Example #24
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 lang_dim: int,
                 lang_map: Dict[str, int],
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 include_start_end_transitions: bool = True,
                 dropout: float = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        lang_num = len(lang_map)
        self.lang_embedding = Embedding(lang_num, lang_dim)

        self.encoder = encoder
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type or "BIO")

        check_dimensions_match(text_field_embedder.get_output_dim() + lang_dim, encoder.get_input_dim(),
                               "text field embedding dim + lang_dim", "encoder input dim")
        initializer(self)
Example #25
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'ConstrainedConditionalModule':
        hard_constraints = params.pop("hard_constraints", [])
        soft_constraints = params.pop("soft_constraints", {})
        label_namespace = params.pop("label_namespace", "labels")
        sentence_penalty_map_dict = params.pop("sentence_penalty_map", None)
        constrain_crf_decoding = params.pop("constrain_crf_decoding", False)
        label_encoding = params.pop("label_encoding", None)

        sentence_penalty_map = None
        if sentence_penalty_map_dict:
            assert len(sentence_penalty_map_dict) == 1, "multiple sentence constraints not supported"
            tag, penalty = list(sentence_penalty_map_dict.items())[0]
            tag_index = vocab.get_token_index(tag, label_namespace)
            sentence_penalty_map = (tag_index, penalty)

        hard_constraints_to_indices: Dict[str, List[int]] = {}
        for tag in hard_constraints:
            hard_constraints_to_indices[tag] = []
            for label, index in vocab.get_token_to_index_vocabulary(label_namespace).items():
                if re.match(rf"^.*-{tag}", label):
                    hard_constraints_to_indices[tag].append(index)
        soft_constraints = soft_constraints or {}
        soft_constraints_to_indices: Dict[str, Tuple[List[int], float]] = {}
        for tag, penalty in soft_constraints.items():
            indices = []
            for label, index in vocab.get_token_to_index_vocabulary(label_namespace).items():
                if re.match(rf"^.*-{tag}", label):
                    indices.append(index)
            soft_constraints_to_indices[tag] = (indices, penalty)
        num_tags = vocab.get_vocab_size(label_namespace)
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None
        params.assert_empty(cls.__name__)
        return ConstrainedConditionalModule(num_tags, constraints,
                                            hard_constraints_to_indices,
                                            soft_constraints_to_indices,
                                            sentence_penalty_map)
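A hypothetical Params blob exercising the keys popped by from_params above (the tag names "ADR" and "Drug" and the penalty value are made up for illustration):

params = Params({
    "hard_constraints": ["ADR"],
    "soft_constraints": {"Drug": -5.0},
    "label_namespace": "labels",
    "constrain_crf_decoding": True,
    "label_encoding": "BIO",
})
ccm = ConstrainedConditionalModule.from_params(vocab, params)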
Example #26
    def __init__(
        self,
        base_model,
        basic_tokenizer,
        subword_tokenizer,
        model_dir=DEFAULT_MODEL_PATH,
        normalizer=None,
    ):
        """初期化

        非推奨です

        Args:
            base_model (nn.Module): BertCrfモデル
            basic_tokenizer (callable): 単語分割用トークナイザ
            subword_tokenizer (callable): サブワード分割用トークナイザ
            label_vocab (dict): {label:label_idx, ...}
            model_dir (pathlib.Path or str): モデルフォルダのpath.labels.txtとfinal.model
            normalizer (callable): 単語正規化関数
        """
        if not isinstance(model_dir, pathlib.PurePath):
            model_dir = Path(model_dir)

        #self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = "cpu"

        label_vocab = create_label_vocab_from_file(
            str(model_dir / "labels.txt"))
        self.itol = {i: l for l, i in label_vocab.items()}
        constraints = allowed_transitions(
            "BIO", {i: w
                    for w, i in label_vocab.items()})
        self.model = BertCrf(base_model, len(label_vocab), constraints)
        self.model.load_state_dict(
            torch.load(str(model_dir / "final.model"),
                       map_location=self.device))
        self.model.to(self.device)
        self.model.eval()

        self.basic_tokenizer = basic_tokenizer
        self.subword_tokenizer = subword_tokenizer

        self.normalizer = normalizer
Example #27
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 dropout: Optional[float] = 0,
                 label_encoding: Optional[str] = 'BIO',
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        """
        :param vocab: ``Vocabulary``
        :param text_field_embedder: ``TextFieldEmbedder``
        Used to embed the ``question`` and ``passage`` ``TextFields`` we get as input to the model.
        :param dropout:
        :param label_encoding: BIO
        :param initializer:``InitializerApplicator``, optional (default=``InitializerApplicator()``)
        Used to initialize the model parameters.
        :param regularizer:``RegularizerApplicator``, optional (default=``None``)
        :param print_bad_case: 是否将出错的case打印出来
        If provided, will be used to calculate the regularization penalty during training.
        """
        super(BertCrfTaggerModel, self).__init__(vocab, regularizer)
        self._text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size('labels')

        self._labels_predictor = Linear(
            self._text_field_embedder.get_output_dim(), self.num_tags)
        self.dropout = torch.nn.Dropout(dropout)
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace='labels',
                                             label_encoding=label_encoding)
        labels = self.vocab.get_index_to_token_vocabulary('labels')
        constraints = allowed_transitions(label_encoding, labels)
        self.label_to_index = self.vocab.get_token_to_index_vocabulary(
            'labels')
        self.crf = ConditionalRandomField(self.num_tags,
                                          constraints,
                                          include_start_end_transitions=True)
        self.loss = torch.nn.CrossEntropyLoss()
        initializer(self)
Example #28
    def __init__(
        self,
        vocab: Vocabulary,
        mention_feedforward: FeedForward,
        label_namespace: str = "ner_type_labels",
        label_encoding: str = "BIOUL",
        exact_match: bool = False,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:
        super(NERTagger, self).__init__(vocab, regularizer)

        self._vocab = vocab
        self.label_namespace = label_namespace
        self._n_labels = vocab.get_vocab_size(label_namespace)

        self.label_map = self.vocab.get_index_to_token_vocabulary(label_namespace)
        print(self.label_map)

        self._mention_feedforward = TimeDistributed(mention_feedforward)

        self._ner_scorer = TimeDistributed(
            torch.nn.Linear(mention_feedforward.get_output_dim(), self._n_labels)
        )
        constraints = allowed_transitions(
            label_encoding, self.vocab.get_index_to_token_vocabulary(label_namespace)
        )
        self._ner_crf = ConditionalRandomField(
            self._n_labels, constraints, include_start_end_transitions=False
        )

        if exact_match:
            self._ner_metrics = SpanBasedF1Measure(self.label_map, label_encoding=label_encoding)
        else:
            self._ner_metrics = SpanBasedF1MeasureAllennlp(
                vocabulary=vocab, tag_namespace=label_namespace, label_encoding=label_encoding
            )

        initializer(self)
Example #29
    def __init__(self, vocab: Vocabulary, embedding_dim=300, embedder_type=None, bert_trainable=True, **kwargs):
        super().__init__(vocab)
        for k in kwargs:
            self.__setattr__(k, kwargs[k])

        text_field_embedder = get_embeddings(embedder_type, self.vocab, embedding_dim, bert_trainable)
        embedding_dim = text_field_embedder.get_output_dim()

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(embedding_dim, self.num_rnn_units, batch_first=True, bidirectional=True, dropout=self.dropout_rate))

        self.label_namespace = label_namespace = 'ner_bio_labels'
        self.num_tags = self.vocab.get_vocab_size(label_namespace)

        self.text_field_embedder = text_field_embedder
        self.encoder = encoder
        self.dropout = torch.nn.Dropout(self.dropout_rate)

        output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        self.label_encoding = label_encoding = 'BIO'
        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(self.label_encoding, labels)

        self.include_start_end_transitions = True
        self.crf = ConditionalRandomField(
            self.num_tags, constraints,
            include_start_end_transitions=True
        )

        self._f1_metric = SpanBasedF1Measure(self.vocab,
                                             tag_namespace=label_namespace,
                                             label_encoding=label_encoding)
        self._verbose_metrics = False
Example #30
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        label_namespace: str = "labels",
        feedforward: Optional[FeedForward] = None,
        label_encoding: Optional[str] = None,
        include_start_end_transitions: bool = True,
        constrain_crf_decoding: bool = None,
        calculate_span_f1: bool = None,
        dropout: Optional[float] = None,
        verbose_metrics: bool = False,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
        top_k: int = 1,
    ) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self.top_k = top_k
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(
            Linear(output_dim, self.num_tags))

        # If constrain_crf_decoding and calculate_span_f1 are not provided
        # (i.e., they are None), default them to True when label_encoding
        # is given and to False otherwise.
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.label_encoding = label_encoding
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
            self.num_tags,
            constraints,
            include_start_end_transitions=include_start_end_transitions)

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3),
        }
        self.calculate_span_f1 = calculate_span_f1
        if calculate_span_f1:
            if not label_encoding:
                raise ConfigurationError("calculate_span_f1 is True, but "
                                         "no label_encoding was specified.")
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=label_encoding)

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            encoder.get_input_dim(),
            "text field embedding dim",
            "encoder input dim",
        )
        if feedforward is not None:
            check_dimensions_match(
                encoder.get_output_dim(),
                feedforward.get_input_dim(),
                "encoder output dim",
                "feedforward input dim",
            )
        initializer(self)
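
The defaulting logic above means passing only `label_encoding` is enough to opt into both constrained decoding and span F1. A standalone illustration of that rule (the helper name `resolve_defaults` is hypothetical, but the None-handling mirrors the constructor exactly):

def resolve_defaults(label_encoding=None, constrain_crf_decoding=None,
                     calculate_span_f1=None):
    # Both flags default to whether a label_encoding was supplied.
    if constrain_crf_decoding is None:
        constrain_crf_decoding = label_encoding is not None
    if calculate_span_f1 is None:
        calculate_span_f1 = label_encoding is not None
    return constrain_crf_decoding, calculate_span_f1

assert resolve_defaults("BIO") == (True, True)
assert resolve_defaults() == (False, False)
assert resolve_defaults("BIO", constrain_crf_decoding=False) == (False, True)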
Example #31
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 feedforward: Optional[FeedForward] = None,
                 label_encoding: Optional[str] = None,
                 constraint_type: Optional[str] = None,
                 include_start_end_transitions: bool = True,
                 constrain_crf_decoding: bool = None,
                 calculate_span_f1: bool = None,
                 dropout: Optional[float] = None,
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        if constraint_type is not None:
            warnings.warn("'constraint_type' was removed and replaced with "
                          "'label_encoding', 'constrain_crf_decoding', and "
                          "'calculate_span_f1' in version 0.6.1. It will be "
                          "removed in version 0.8.", DeprecationWarning)
            label_encoding = constraint_type

        # If constrain_crf_decoding and calculate_span_f1 are not provided
        # (i.e., they're None), default them to True when label_encoding is
        # provided and to False when it isn't.
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.label_encoding = label_encoding
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.metrics = {
                "accuracy": CategoricalAccuracy(),
                "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.calculate_span_f1 = calculate_span_f1
        if calculate_span_f1:
            if not label_encoding:
                raise ConfigurationError("calculate_span_f1 is True, but "
                                         "no label_encoding was specified.")
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=label_encoding)
        elif constraint_type is not None:
            # Maintain deprecated behavior if constraint_type is provided
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=constraint_type)

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                                   "encoder output dim", "feedforward input dim")
        initializer(self)
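
Per the deprecation warning above, the single `constraint_type` argument was split into three. A migration sketch (the helper name is hypothetical; the rewrite matches what the shim above does implicitly, since the None defaults then resolve both flags to True):

def migrate_crf_tagger_kwargs(kwargs: dict) -> dict:
    # Rewrite the deprecated 'constraint_type' argument into the 0.6.1+ trio.
    kwargs = dict(kwargs)
    constraint_type = kwargs.pop("constraint_type", None)
    if constraint_type is not None:
        kwargs.setdefault("label_encoding", constraint_type)
        kwargs.setdefault("constrain_crf_decoding", True)
        kwargs.setdefault("calculate_span_f1", True)
    return kwargs

assert migrate_crf_tagger_kwargs({"constraint_type": "BIOUL"}) == {
    "label_encoding": "BIOUL",
    "constrain_crf_decoding": True,
    "calculate_span_f1": True,
}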
Example #32
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 relation_scorer: RelationScorer,
                 ner_tag_namespace: str = 'tags',
                 evaluated_ner_labels: List[str] = None,
                 re_loss_weight: float = 1.0,
                 ner_tag_embedder: TokenEmbedder = None,
                 use_aux_ner_labels: bool = False,
                 aux_coarse_namespace: str = 'coarse_tags',
                 aux_modifier_namespace: str = 'modifier_tags',
                 aux_loss_weight: float = 1.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab=vocab, regularizer=regularizer)

        self.text_field_embedder = text_field_embedder
        self.encoder = encoder

        # NER subtask 2
        self._ner_label_encoding = 'BIO'
        self._ner_tag_namespace = ner_tag_namespace
        ner_input_dim = self.encoder.get_output_dim()
        num_ner_tags = self.vocab.get_vocab_size(ner_tag_namespace)
        self.tag_projection_layer = TimeDistributed(
            Linear(ner_input_dim, num_ner_tags))

        self._use_aux_ner_labels = use_aux_ner_labels
        if self._use_aux_ner_labels:
            self._coarse_tag_namespace = aux_coarse_namespace
            self._num_coarse_tags = self.vocab.get_vocab_size(
                self._coarse_tag_namespace)
            self._coarse_projection_layer = TimeDistributed(
                Linear(ner_input_dim, self._num_coarse_tags))
            self._modifier_tag_namespace = aux_modifier_namespace
            self._num_modifier_tags = self.vocab.get_vocab_size(
                self._modifier_tag_namespace)
            self._modifier_projection_layer = TimeDistributed(
                Linear(ner_input_dim, self._num_modifier_tags))
            self._coarse_acc = CategoricalAccuracy()
            self._modifier_acc = CategoricalAccuracy()
            self._aux_loss_weight = aux_loss_weight

        self.ner_accuracy = CategoricalAccuracy()
        if evaluated_ner_labels is None:
            ignored_classes = None
        else:
            assert self._ner_label_encoding == 'BIO', 'expected BIO encoding'
            all_ner_tags = self.vocab.get_token_to_index_vocabulary(
                ner_tag_namespace).keys()
            ner_tag_classes = {bio_tag[2:] for bio_tag in all_ner_tags
                               if len(bio_tag) > 2}
            ignored_classes = list(
                ner_tag_classes.difference(evaluated_ner_labels))
        self.ner_f1 = SpanBasedF1Measure(
            vocabulary=vocab,
            tag_namespace=ner_tag_namespace,
            label_encoding=self._ner_label_encoding,
            ignore_classes=ignored_classes)

        # Use constrained crf decoding with the BIO labeling scheme
        ner_labels = self.vocab.get_index_to_token_vocabulary(
            ner_tag_namespace)
        constraints = allowed_transitions(self._ner_label_encoding, ner_labels)

        self.crf = ConditionalRandomField(num_ner_tags,
                                          constraints,
                                          include_start_end_transitions=True)

        # RE subtask 3
        self.ner_tag_embedder = ner_tag_embedder
        self.relation_scorer = relation_scorer
        self._re_loss_weight = re_loss_weight

        initializer(self)
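
To make the `ignored_classes` computation above concrete, a minimal worked example with a hypothetical label set (only classes that survive the set difference are scored by SpanBasedF1Measure):

all_ner_tags = ['O', 'B-Task', 'I-Task', 'B-Material', 'I-Material']
evaluated_ner_labels = ['Task']

ner_tag_classes = {tag[2:] for tag in all_ner_tags if len(tag) > 2}
# -> {'Task', 'Material'}
ignored_classes = list(ner_tag_classes.difference(evaluated_ner_labels))
# -> ['Material']: Material spans are excluded from the span F1.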
Example #33
    def test_allowed_transitions(self):
        # pylint: disable=bad-whitespace,bad-continuation
        bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y'] # start tag, end tag
        #              0     1      2      3      4         5          6
        allowed = allowed_transitions("BIO", dict(enumerate(bio_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                         # Extra column for end tag.
            (0, 0), (0, 1),         (0, 3),              (0, 6),
            (1, 0), (1, 1), (1, 2), (1, 3),              (1, 6),
            (2, 0), (2, 1), (2, 2), (2, 3),              (2, 6),
            (3, 0), (3, 1),         (3, 3), (3, 4),      (3, 6),
            (4, 0), (4, 1),         (4, 3), (4, 4),      (4, 6),
            (5, 0), (5, 1),         (5, 3)                      # Extra row for start tag
        }

        bioul_labels = ['O', 'B-X', 'I-X', 'L-X', 'U-X', 'B-Y', 'I-Y', 'L-Y', 'U-Y'] # start tag, end tag
        #                0     1      2      3      4      5      6      7      8          9        10
        allowed = allowed_transitions("BIOUL", dict(enumerate(bioul_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                                                   # Extra column for end tag.
            (0, 0), (0, 1),                 (0, 4), (0, 5),                 (0, 8),       (0, 10),
                            (1, 2), (1, 3),
                            (2, 2), (2, 3),
            (3, 0), (3, 1),                 (3, 4), (3, 5),                 (3, 8),       (3, 10),
            (4, 0), (4, 1),                 (4, 4), (4, 5),                 (4, 8),       (4, 10),
                                                            (5, 6), (5, 7),
                                                            (6, 6), (6, 7),
            (7, 0), (7, 1),                 (7, 4), (7, 5),                 (7, 8),       (7, 10),
            (8, 0), (8, 1),                 (8, 4), (8, 5),                 (8, 8),       (8, 10),
            # Extra row for start tag.
            (9, 0), (9, 1),                 (9, 4), (9, 5),                 (9, 8)
        }

        iob1_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y'] # start tag, end tag
        #              0     1      2      3      4         5          6
        allowed = allowed_transitions("IOB1", dict(enumerate(iob1_labels)))

        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {                            # Extra column for end tag.
            (0, 0),         (0, 2),         (0, 4),         (0, 6),
            (1, 0), (1, 1), (1, 2),         (1, 4),         (1, 6),
            (2, 0), (2, 1), (2, 2),         (2, 4),         (2, 6),
            (3, 0),         (3, 2), (3, 3), (3, 4),         (3, 6),
            (4, 0),         (4, 2), (4, 3), (4, 4),         (4, 6),
            (5, 0),         (5, 2),         (5, 4),                # Extra row for start tag
        }
        with raises(ConfigurationError):
            allowed_transitions("allennlp", {})

        bmes_labels = ['B-X', 'M-X', 'E-X', 'S-X', 'B-Y', 'M-Y', 'E-Y', 'S-Y'] # start tag, end tag
        #               0      1      2      3      4      5      6      7       8          9
        allowed = allowed_transitions("BMES", dict(enumerate(bmes_labels)))
        # The empty spaces in this matrix indicate disallowed transitions.
        assert set(allowed) == {
                    (0, 1), (0, 2),
                    (1, 1), (1, 2),                                         # Extra column for end tag.
            (2, 0),                 (2, 3), (2, 4),                 (2, 7), (2, 9),
            (3, 0),                 (3, 3), (3, 4),                 (3, 7), (3, 9),
                                                    (4, 5), (4, 6),
                                                    (5, 5), (5, 6),
            (6, 0),                 (6, 3), (6, 4),                 (6, 7), (6, 9),
            (7, 0),                 (7, 3), (7, 4),                 (7, 7), (7, 9),
            (8, 0),                 (8, 3), (8, 4),                 (8, 7),  # Extra row for start tag
        }
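
The "extra row/column" comments in these matrices refer to the start and end pseudo-tags, which `allowed_transitions` appends after the real labels. A quick standalone check of the indexing (assuming the semantics exercised by the tests above):

labels = {0: 'O', 1: 'B-X', 2: 'I-X'}
pairs = allowed_transitions("BIO", labels)

start_index, end_index = len(labels), len(labels) + 1  # here 3 and 4
assert (start_index, 1) in pairs      # START -> B-X is allowed
assert (start_index, 2) not in pairs  # START -> I-X is not (no dangling I-)
assert (2, end_index) in pairs        # in BIO, any tag may precede END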
Example #34
def default_crf() -> ConditionalRandomField:
    include_start_end_transitions = True
    constraints = allowed_transitions('BIO', {0: 'O', 1: 'B', 2: 'I'})
    return ConditionalRandomField(3, constraints,
                                  include_start_end_transitions)
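
A minimal usage sketch for the CRF returned above (tensor shapes and the mask dtype are assumptions; older AllenNLP versions accept a long mask, newer ones expect a bool mask):

import torch

crf = default_crf()

logits = torch.randn(2, 6, 3)              # (batch, seq_len, num_tags)
tags = torch.randint(0, 3, (2, 6))         # gold tag indices
mask = torch.ones(2, 6, dtype=torch.long)  # 1 = real token, 0 = padding

log_likelihood = crf(logits, tags, mask)   # training loss is -log_likelihood
best_paths = crf.viterbi_tags(logits, mask)
# best_paths is a list of (tag_sequence, viterbi_score) pairs, one per batch row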
Example #35
    }

    task_idx2classnames = {
        idx: classname
        for idx, classname in idx2classnames.items() if idx in range(0, 8)
    }
    process_idx2classnames = {
        idx - 8: classname
        for idx, classname in idx2classnames.items() if idx in range(8, 16)
    }
    material_idx2classnames = {
        idx - 16: classname
        for idx, classname in idx2classnames.items() if idx in range(16, 24)
    }

    task_constraints = allowed_transitions(constraint_type="BIOUL",
                                           labels=task_idx2classnames)
    process_constraints = allowed_transitions(constraint_type="BIOUL",
                                              labels=process_idx2classnames)
    material_constraints = allowed_transitions(constraint_type="BIOUL",
                                               labels=material_idx2classnames)

    embedder = VanillaEmbedder(embedding=embedding,
                               embedding_dim=EMBEDDING_DIMENSION)

    if USE_CHAR_ENCODER:
        char_embedder = VanillaEmbedder(embedding=char_embedding,
                                        embedding_dim=CHAR_EMBEDDING_DIMENSION)
        char_encoder = CharLSTMEncoder(
            char_emb_dim=CHAR_EMBEDDING_DIMENSION,
            char_embedder=char_embedder,
            bidirectional=True,