from typing import Dict

import torch
from allennlp.data import Vocabulary
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2VecEncoder
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy, FBetaMeasure


class SimpleClassifier(Model):
    def __init__(self, vocab: Vocabulary, embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder):
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        num_labels = vocab.get_vocab_size("labels")
        self.classifier = torch.nn.Linear(encoder.get_output_dim(), num_labels)
        self.accuracy = CategoricalAccuracy()
        self.macrof1 = FBetaMeasure(average='macro')
        self.microf1 = FBetaMeasure(average='micro')
        self.weightedf1 = FBetaMeasure(average='weighted')

    def forward(self, text: Dict[str, torch.Tensor],
                label: torch.Tensor) -> Dict[str, torch.Tensor]:
        # Shape: (batch_size, num_tokens, embedding_dim)
        embedded_text = self.embedder(text)
        # Shape: (batch_size, num_tokens)
        mask = util.get_text_field_mask(text)
        # Shape: (batch_size, encoding_dim)
        encoded_text = self.encoder(embedded_text, mask)
        # Shape: (batch_size, num_labels)
        logits = self.classifier(encoded_text)
        self.accuracy(logits, label)
        self.macrof1(logits, label)
        self.microf1(logits, label)
        self.weightedf1(logits, label)
        # Shape: (batch_size, num_labels)
        probs = torch.nn.functional.softmax(logits, dim=-1)
        # Shape: (1,)
        loss = torch.nn.functional.cross_entropy(logits, label)
        return {'loss': loss, 'probs': probs}

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        result_macro = self.macrof1.get_metric(reset)
        result_micro = self.microf1.get_metric(reset)
        result_weighted = self.weightedf1.get_metric(reset)
        return {
            "accuracy": self.accuracy.get_metric(reset),
            "macrof1_precision": result_macro["precision"],
            "macrof1_recall": result_macro["recall"],
            "macrof1_fscore": result_macro["fscore"],
            "microf1_precision": result_micro["precision"],
            "microf1_recall": result_micro["recall"],
            "microf1_fscore": result_macro["fscore"],
            "weightedf1_precision": result_weighted["precision"],
            "weightedf1_recall": result_weighted["recall"],
            "weightedf1_fscore": result_weighted["fscore"]
        }
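
For context, here is a minimal wiring sketch for this classifier, assuming a toy
vocabulary and a bag-of-embeddings encoder (the names and sizes are illustrative,
not part of the original snippet):

from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

vocab = Vocabulary()
vocab.add_tokens_to_namespace(["the", "movie", "was", "great"], namespace="tokens")
vocab.add_tokens_to_namespace(["pos", "neg"], namespace="labels")
embedder = BasicTextFieldEmbedder(
    {"tokens": Embedding(embedding_dim=16, num_embeddings=vocab.get_vocab_size("tokens"))}
)
encoder = BagOfEmbeddingsEncoder(embedding_dim=16)
model = SimpleClassifier(vocab, embedder, encoder)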
Example #2
class TransformerSrlSpan(SrlBert):
    """

    # Parameters

    vocab : `Vocabulary`, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    bert_model : `Union[str, AutoModel]`, required.
        A string describing the BERT model to load or an already constructed `AutoModel`.
    initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`)
        Used to initialize the model parameters.
    label_smoothing : `float`, optional (default = `None`)
        Whether or not to use label smoothing on the labels when computing cross entropy loss.
    ignore_span_metric : `bool`, optional (default = `False`)
        Whether to calculate span loss, which is irrelevant when predicting BIO for Open Information Extraction.
    srl_eval_path : `str`, optional (default=`DEFAULT_SRL_EVAL_PATH`)
        The path to the srl-eval.pl script. By default, will use the srl-eval.pl included with allennlp,
        which is located at allennlp/tools/srl-eval.pl . If `None`, srl-eval.pl is not used.
    """
    def __init__(
        self,
        vocab: Vocabulary,
        bert_model: Union[str, AutoModel],
        embedding_dropout: float = 0.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        label_smoothing: float = None,
        ignore_span_metric: bool = False,
        srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
        inventory: str = "verbatlas",
        **kwargs,
    ) -> None:
        # bypass SrlBert constructor
        Model.__init__(self, vocab, **kwargs)
        if isinstance(bert_model, str):
            self.transformer = AutoModel.from_pretrained(bert_model)
        else:
            self.transformer = bert_model
        self.frame_criterion = nn.CrossEntropyLoss()
        if inventory == "verbatlas":
            # add missing frame labels
            frame_list = load_label_list(FRAME_LIST_PATH)
            self.vocab.add_tokens_to_namespace(frame_list, "frames_labels")
            # add missing role labels
            role_list = load_label_list(ROLE_LIST_PATH)
            self.vocab.add_tokens_to_namespace(role_list, "labels")
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.frame_num_classes = self.vocab.get_vocab_size("frames_labels")
        if srl_eval_path is not None:
            # For the span based evaluation, we don't want to consider labels
            # for verb, because the verb index is provided to the model.
            self.span_metric = SrlEvalScorer(srl_eval_path,
                                             ignore_classes=["V"])
        else:
            self.span_metric = None
        self.f1_frame_metric = FBetaMeasure(average="micro")
        self.tag_projection_layer = nn.Linear(
            self.transformer.config.hidden_size, self.num_classes)
        self.frame_projection_layer = nn.Linear(
            self.transformer.config.hidden_size, self.frame_num_classes)
        self.embedding_dropout = nn.Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        initializer(self)

    def forward(  # type: ignore
        self,
        tokens: TextFieldTensors,
        verb_indicator: torch.Tensor,
        frame_indicator: torch.Tensor,
        metadata: List[Any],
        tags: torch.LongTensor = None,
        frame_tags: torch.LongTensor = None,
    ):
        """
        # Parameters

        tokens : `TextFieldTensors`, required
            The output of `TextField.as_array()`, which should typically be passed directly to a
            `TextFieldEmbedder`. For this model, this must be a `SingleIdTokenIndexer` which
            indexes wordpieces from the BERT vocabulary.
        verb_indicator: `torch.LongTensor`, required.
            An integer `SequenceFeatureField` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        frame_indicator : `torch.LongTensor`, required.
            An integer `SequenceFeatureField` representation of the position of the frame
            in the sentence. This should have shape (batch_size, num_tokens). Similar to
            verb_indicator, but handles the BERT wordpiece tokenizer by considering only
            the first subtoken of a frame.
        tags : `torch.LongTensor`, optional (default = `None`)
            A torch tensor representing the sequence of integer gold class labels
            of shape `(batch_size, num_tokens)`
        frame_tags : `torch.LongTensor`, optional (default = `None`)
            A torch tensor representing the gold frames
            of shape `(batch_size, num_tokens)`
        metadata : `List[Dict[str, Any]]`, optional, (default = `None`)
            metadata containing the original words in the sentence, the verb to compute the
            frame for, and start offsets for converting wordpieces back to a sequence of words,
            under 'words', 'verb' and 'offsets' keys, respectively.

        # Returns

        An output dictionary consisting of:
        logits : `torch.FloatTensor`
            A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : `torch.FloatTensor`
            A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
            a distribution of the tag classes per word.
        loss : `torch.FloatTensor`, optional
            A scalar loss to be optimised.
        """
        mask = get_text_field_mask(tokens)
        input_ids = util.get_token_ids_from_text_field_tensors(tokens)
        bert_embeddings, _ = self.transformer(
            input_ids=input_ids,
            token_type_ids=verb_indicator,
            attention_mask=mask,
            return_dict=False,
        )
        # extract embeddings
        embedded_text_input = self.embedding_dropout(bert_embeddings)
        frame_embeddings = embedded_text_input[frame_indicator == 1]
        # get sizes
        batch_size, sequence_length, _ = embedded_text_input.size()
        # outputs
        logits = self.tag_projection_layer(embedded_text_input)
        frame_logits = self.frame_projection_layer(frame_embeddings)

        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_classes])

        frame_probabilities = F.softmax(frame_logits, dim=-1)
        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.make_output_human_readable.
        output_dict = {
            "logits": logits,
            "frame_logits": frame_logits,
            "class_probabilities": class_probabilities,
            "frame_probabilities": frame_probabilities,
            "mask": mask,
        }
        # We add in the offsets here so we can compute the un-wordpieced tags.
        words, verbs, offsets = zip(*[(x["words"], x["verb"], x["offsets"])
                                      for x in metadata])
        lemmas = [l for x in metadata for l in x["lemmas"]]
        output_dict["words"] = list(words)
        output_dict["lemma"] = list(lemmas)
        output_dict["verb"] = list(verbs)
        output_dict["wordpiece_offsets"] = list(offsets)

        if tags is not None:
            # compute role loss
            role_loss = sequence_cross_entropy_with_logits(
                logits, tags, mask, label_smoothing=self._label_smoothing)
            # compute frame loss
            frame_tags_filtered = frame_tags[frame_indicator == 1]
            frame_loss = self.frame_criterion(frame_logits,
                                              frame_tags_filtered)
            if not self.ignore_span_metric and self.span_metric is not None and not self.training:
                batch_verb_indices = [
                    example_metadata["verb_index"]
                    for example_metadata in metadata
                ]
                batch_sentences = [
                    example_metadata["words"] for example_metadata in metadata
                ]
                # Get the BIO tags from make_output_human_readable()
                batch_bio_predicted_tags = self.make_output_human_readable(
                    output_dict).pop("tags")
                from allennlp_models.structured_prediction.models.srl import (
                    convert_bio_tags_to_conll_format, )

                batch_conll_predicted_tags = [
                    convert_bio_tags_to_conll_format(tags)
                    for tags in batch_bio_predicted_tags
                ]
                batch_bio_gold_tags = [
                    example_metadata["gold_tags"]
                    for example_metadata in metadata
                ]
                batch_conll_gold_tags = [
                    convert_bio_tags_to_conll_format(tags)
                    for tags in batch_bio_gold_tags
                ]
                self.span_metric(
                    batch_verb_indices,
                    batch_sentences,
                    batch_conll_predicted_tags,
                    batch_conll_gold_tags,
                )
            self.f1_frame_metric(frame_logits, frame_tags_filtered)
            output_dict["frame_loss"] = frame_loss
            output_dict["role_loss"] = role_loss
            output_dict["loss"] = (role_loss + frame_loss) / 2
        return output_dict

    def decode_frames(
            self, output_dict: Dict[str,
                                    torch.Tensor]) -> Dict[str, torch.Tensor]:
        # frame prediction
        frame_probabilities = output_dict["frame_probabilities"]
        frame_predictions = frame_probabilities.argmax(
            dim=-1).cpu().data.numpy()
        output_dict["frame_tags"] = [
            self.vocab.get_token_from_index(f, namespace="frames_labels")
            for f in frame_predictions
        ]
        output_dict["frame_scores"] = [
            fp[f] for f, fp in zip(frame_predictions, frame_probabilities)
        ]
        return output_dict

    @overrides
    def make_output_human_readable(
            self, output_dict: Dict[str,
                                    torch.Tensor]) -> Dict[str, torch.Tensor]:
        output_dict = self.decode_frames(output_dict)
        output_dict = super().make_output_human_readable(output_dict)
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False):
        if self.ignore_span_metric:
            # Return an empty dictionary if ignoring the
            # span metric
            return {}

        else:
            metric_dict = self.span_metric.get_metric(reset=reset)
            frame_metric_dict = self.f1_frame_metric.get_metric(reset=reset)
            # This can be a lot of metrics, as there are 3 per class.
            # we only really care about the overall metrics, so we filter for them here.
            metric_dict_filtered = {
                x.split("-")[0] + "_role": y
                for x, y in metric_dict.items() if "overall" in x
            }
            frame_metric_dict = {
                x + "_frame": y
                for x, y in frame_metric_dict.items()
            }
            return {**metric_dict_filtered, **frame_metric_dict}

    def _get_label_tokens(self, namespace: str = "labels"):
        return self.vocab.get_token_to_index_vocabulary(namespace).keys()

    def _get_label_ids(self, namespace: str = "labels"):
        return self.vocab.get_index_to_token_vocabulary(namespace).keys()

    default_predictor = "transformer_srl"
Example #3
class BasicClassifierF1(Model):
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        seq2vec_encoder: Seq2VecEncoder,
        seq2seq_encoder: Seq2SeqEncoder = None,
        dropout: float = None,
        num_labels: int = None,
        label_namespace: str = "labels",
        loss: str = None,  # focal_loss
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:

        super().__init__(vocab, regularizer)
        self._text_field_embedder = text_field_embedder

        if seq2seq_encoder:
            self._seq2seq_encoder = seq2seq_encoder
        else:
            self._seq2seq_encoder = None

        self._seq2vec_encoder = seq2vec_encoder
        self._classifier_input_dim = self._seq2vec_encoder.get_output_dim()

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = None

        self._label_namespace = label_namespace
        if num_labels:
            self._num_labels = num_labels
        else:
            self._num_labels = vocab.get_vocab_size(
                namespace=self._label_namespace)

        self._classification_layer = torch.nn.Linear(
            self._classifier_input_dim, self._num_labels)
        self._accuracy = CategoricalAccuracy()

        if loss is None:
            self._loss = torch.nn.CrossEntropyLoss()
        elif loss == 'focal_loss':
            self._loss = FocalLoss(alpha=0.25,
                                   num_classes=self._num_labels)  # focal loss
        elif loss == 'cross_entropy_loss':
            self._loss = torch.nn.CrossEntropyLoss()
        else:
            raise ValueError(f"unknown loss type: {loss}")

        self._f1_measure = FBetaMeasure()
        initializer(self)

    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:

        embedded_text = self._text_field_embedder(tokens)
        mask = get_text_field_mask(tokens).float()

        if self._seq2seq_encoder:
            embedded_text = self._seq2seq_encoder(embedded_text, mask=mask)

        embedded_text = self._seq2vec_encoder(embedded_text, mask=mask)

        if self._dropout:
            embedded_text = self._dropout(embedded_text)

        logits = self._classification_layer(embedded_text)
        probs = F.softmax(logits, dim=-1)

        output_dict = {"logits": logits, "probs": probs}

        if label is not None:
            loss = self._loss(logits, label.long().view(-1))
            output_dict["loss"] = loss
            self._accuracy(logits, label)
            self._f1_measure(logits, label)

        return output_dict

    @overrides
    def decode(
            self, output_dict: Dict[str,
                                    torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Does a simple argmax over the probabilities, converts the index to a string label,
        and adds a ``"label"`` key to the dictionary with the result.
        """
        predictions = output_dict["probs"]
        if predictions.dim() == 2:
            predictions_list = [
                predictions[i] for i in range(predictions.shape[0])
            ]
        else:
            predictions_list = [predictions]

        labels = []
        for prediction in predictions_list:
            label_idx = prediction.argmax(dim=-1).item()
            label_str = self.vocab.get_index_to_token_vocabulary(
                self._label_namespace).get(label_idx, str(label_idx))
            labels.append(label_str)

        output_dict["label"] = labels
        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        f1_dict = self._f1_measure.get_metric(reset)
        output = {'accuracy': self._accuracy.get_metric(reset=reset)}
        counter = 0
        for precision, recall, fscore in zip(f1_dict['precision'],
                                             f1_dict['recall'],
                                             f1_dict['fscore']):
            output[str(counter) + '_precision'] = precision
            output[str(counter) + '_recall'] = recall
            output[str(counter) + '_fscore'] = fscore
            counter += 1
        return output
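
A note on the get_metrics loop above: constructed without an `average` argument,
FBetaMeasure.get_metric returns per-class lists, so the metrics come out under keys
like `0_precision` and `1_fscore`, one triple per label index. A small self-contained
sketch (illustrative values, not from the original snippet):

import torch
from allennlp.training.metrics import FBetaMeasure

measure = FBetaMeasure()  # no average -> per-class lists
logits = torch.tensor([[2.0, 0.1], [0.2, 1.5]])
labels = torch.tensor([0, 1])
measure(logits, labels)
print(measure.get_metric())
# {'precision': [1.0, 1.0], 'recall': [1.0, 1.0], 'fscore': [1.0, 1.0]}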
Example #4
class TransformerSrlDependency(Model):
    """

    # Parameters

    vocab : `Vocabulary`, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    model_name : `Union[str, AutoModel]`, required.
        A string describing the BERT model to load or an already constructed `AutoModel`.
    initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`)
        Used to initialize the model parameters.
    label_smoothing : `float`, optional (default = `None`)
        Whether or not to use label smoothing on the labels when computing cross entropy loss.
    ignore_span_metric : `bool`, optional (default = `False`)
        Whether to calculate span loss, which is irrelevant when predicting BIO for Open Information Extraction.
    srl_eval_path : `str`, optional (default=`DEFAULT_SRL_EVAL_PATH`)
        The path to the srl-eval.pl script. By default, will use the srl-eval.pl included with allennlp,
        which is located at allennlp/tools/srl-eval.pl . If `None`, srl-eval.pl is not used.
    """

    def __init__(
        self,
        vocab: Vocabulary,
        model_name: Union[str, AutoModel],
        embedding_dropout: float = 0.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        label_smoothing: float = None,
        ignore_span_metric: bool = False,
        srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
        restrict_frames: bool = False,
        restrict_roles: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)
        self.lemma_frame_dict = load_lemma_frame(LEMMA_FRAME_PATH)
        self.frame_role_dict = load_role_frame(FRAME_ROLE_PATH)
        self.restrict_frames = restrict_frames
        self.restrict_roles = restrict_roles

        if isinstance(model_name, str):
            self.transformer = AutoModel.from_pretrained(model_name)
        else:
            self.transformer = model_name
        # loss
        self.role_criterion = nn.CrossEntropyLoss(ignore_index=0)
        self.frame_criterion = nn.CrossEntropyLoss()
        # number of classes
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.frame_num_classes = self.vocab.get_vocab_size("frames_labels")
        # metrics
        role_set = self.vocab.get_token_to_index_vocabulary("labels")
        role_set_filter = [v for k, v in role_set.items() if k != "O"]
        self.f1_role_metric = FBetaMeasure(average="micro", labels=role_set_filter)
        self.f1_frame_metric = FBetaMeasure(average="micro")
        # output layer
        self.tag_projection_layer = nn.Linear(self.transformer.config.hidden_size, self.num_classes)
        self.frame_projection_layer = nn.Linear(
            self.transformer.config.hidden_size, self.frame_num_classes
        )
        self.embedding_dropout = nn.Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        initializer(self)

    def forward(  # type: ignore
        self,
        tokens: TextFieldTensors,
        verb_indicator: torch.Tensor,
        frame_indicator: torch.Tensor,
        metadata: List[Any],
        tags: torch.LongTensor = None,
        frame_tags: torch.LongTensor = None,
    ):

        """
        # Parameters

        tokens : `TextFieldTensors`, required
            The output of `TextField.as_array()`, which should typically be passed directly to a
            `TextFieldEmbedder`. For this model, this must be a `SingleIdTokenIndexer` which
            indexes wordpieces from the BERT vocabulary.
        verb_indicator: `torch.LongTensor`, required.
            An integer `SequenceFeatureField` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        tags : `torch.LongTensor`, optional (default = `None`)
            A torch tensor representing the sequence of integer gold class labels
            of shape `(batch_size, num_tokens)`
        frame_tags : `torch.LongTensor`, optional (default = `None`)
            A torch tensor representing the gold frames
            of shape `(batch_size, num_tokens)`
        metadata : `List[Dict[str, Any]]`, optional, (default = `None`)
            metadata containing the original words in the sentence, the verb to compute the
            frame for, and start offsets for converting wordpieces back to a sequence of words,
            under 'words', 'verb' and 'offsets' keys, respectively.

        # Returns

        An output dictionary consisting of:
        logits : `torch.FloatTensor`
            A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : `torch.FloatTensor`
            A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
            a distribution of the tag classes per word.
        loss : `torch.FloatTensor`, optional
            A scalar loss to be optimised.
        """
        mask = get_text_field_mask(tokens)
        bert_embeddings, _ = self.transformer(
            input_ids=util.get_token_ids_from_text_field_tensors(tokens),
            token_type_ids=verb_indicator,
            attention_mask=mask,
            return_dict=False,
        )

        # extract embeddings
        embedded_text_input = self.embedding_dropout(bert_embeddings)
        frame_embeddings = embedded_text_input[frame_indicator == 1]
        # get sizes
        batch_size, sequence_length, _ = embedded_text_input.size()
        # outputs
        logits = self.tag_projection_layer(embedded_text_input)
        frame_logits = self.frame_projection_layer(frame_embeddings)

        reshaped_log_probs = logits.view(-1, self.num_classes)
        role_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_classes]
        )
        frame_probabilities = F.softmax(frame_logits, dim=-1)
        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.make_output_human_readable.
        output_dict = {
            "logits": logits,
            "frame_logits": frame_logits,
            "role_probabilities": role_probabilities,
            "frame_probabilities": frame_probabilities,
            "mask": mask,
        }
        # We add in the offsets here so we can compute the un-wordpieced tags.
        words, verbs = zip(*[(x["words"], x["verb"]) for x in metadata])
        lemmas = [l for x in metadata for l in x["lemmas"]]
        output_dict["words"] = list(words)
        output_dict["verb"] = list(verbs)
        output_dict["lemma"] = list(lemmas)

        if tags is not None:
            # compute role loss
            # role_loss = sequence_cross_entropy_with_logits(
            #     logits, tags, mask, label_smoothing=self._label_smoothing
            # )
            role_loss = self.role_criterion(logits.view(-1, self.num_classes), tags.view(-1))
            # compute frame loss
            frame_tags_filtered = frame_tags[frame_indicator == 1]
            frame_loss = self.frame_criterion(frame_logits, frame_tags_filtered)

            self.f1_role_metric(role_probabilities, tags)
            self.f1_frame_metric(frame_logits, frame_tags_filtered)

            output_dict["frame_loss"] = frame_loss
            output_dict["role_loss"] = role_loss
            output_dict["loss"] = (role_loss + frame_loss) / 2
        return output_dict

    def decode_frames(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        # frame prediction
        frame_probabilities = output_dict["frame_probabilities"]
        if self.restrict_frames:
            frame_probabilities = frame_probabilities.cpu().data.numpy()
            lemmas = output_dict["lemma"]
            candidate_labels = [self.lemma_frame_dict.get(l, []) for l in lemmas]
            # clear candidates from unknowns
            label_set = set(k for k in self._get_label_tokens("frames_labels"))
            candidate_labels_ids = [
                [
                    self.vocab.get_token_index(l, namespace="frames_labels")
                    for l in cl
                    if l in label_set
                ]
                for cl in candidate_labels
            ]

            frame_predictions = []
            for cl, fp in zip(candidate_labels_ids, frame_probabilities):
                # restrict candidates from verbatlas inventory
                fp_candidates = np.take(fp, cl)
                if fp_candidates.size > 0:
                    frame_predictions.append(cl[fp_candidates.argmax(axis=-1)])
                else:
                    frame_predictions.append(fp.argmax(axis=-1))
        else:
            frame_predictions = frame_probabilities.argmax(dim=-1).cpu().data.numpy()

        output_dict["frame_tags"] = [
            self.vocab.get_token_from_index(f, namespace="frames_labels") for f in frame_predictions
        ]
        output_dict["frame_scores"] = [
            fp[f] for f, fp in zip(frame_predictions, frame_probabilities)
        ]
        return output_dict

    @overrides
    def make_output_human_readable(
        self, output_dict: Dict[str, torch.Tensor]
    ) -> Dict[str, torch.Tensor]:
        output_dict = self.decode_frames(output_dict)
        # if self.restrict:
        #     output_dict = self._mask_args(output_dict)
        # output_dict = super().make_output_human_readable(output_dict)
        roles_probabilities = output_dict["role_probabilities"]
        roles_predictions = roles_probabilities.argmax(dim=-1).cpu().data.numpy()

        output_dict["tags"] = [
            [self.vocab.get_token_from_index(r, namespace="labels") for r in roles]
            for roles in roles_predictions
        ]
        return output_dict

    def _mask_args(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        class_probs = output_dict["class_probabilities"]
        device = get_device_of(class_probs)
        lemmas = output_dict["lemma"]
        frames = output_dict["frame_tags"]
        candidate_mask = torch.ones_like(class_probs, dtype=torch.bool).to(device)
        for i, (l, f) in enumerate(zip(lemmas, frames)):
            candidates = self.frame_role_dict.get((l, f), [])
            if candidates:
                candidate_ids = [
                    self.vocab.get_token_index(r, namespace="labels") for r in candidates
                ]
                candidate_ids = torch.tensor(candidate_ids).to(device)
                candidate_ids = candidate_ids.repeat(candidate_mask.shape[1], 1)
                candidate_mask[i].scatter_(1, candidate_ids, False)
            else:
                candidate_mask[i].fill_(False)
        class_probs.masked_fill_(candidate_mask, 0)
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False):
        role_metric_dict = self.f1_role_metric.get_metric(reset=reset)
        frame_metric_dict = self.f1_frame_metric.get_metric(reset=reset)
        # This can be a lot of metrics, as there are 3 per class.
        # we only really care about the overall metrics, so we filter for them here.
        # metric_dict_filtered = {
        #     x.split("-")[0] + "_role": y for x, y in metric_dict.items() if "overall" in x
        # }
        metric_dict = {
            "f1_role": role_metric_dict["fscore"],
            "f1_frame": frame_metric_dict["fscore"],
        }
        return metric_dict

    def _get_label_tokens(self, namespace: str = "labels"):
        return self.vocab.get_token_to_index_vocabulary(namespace).keys()

    def _get_label_ids(self, namespace: str = "labels"):
        return self.vocab.get_index_to_token_vocabulary(namespace).keys()

    default_predictor = "transformer_srl"
Example #5
class AlbertClassifierF1(Model):
    """
    Text classification model.
    """
    def __init__(
        self,
        vocab: Vocabulary,
        vocab_path: str = None,
        config_path: str = None,
        model_path: str = None,
        dropout: float = None,
        label_namespace: str = "labels",
        num_labels: int = None,
        loss: str = None,  # focal_loss
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ) -> None:

        super().__init__(vocab, regularizer)
        config, tokenizer, model = get_albert_total(config_path, vocab_path,
                                                    model_path)

        self._bert = model

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = None
        self._label_namespace = label_namespace

        if num_labels:
            self._num_labels = num_labels
        else:
            self._num_labels = vocab.get_vocab_size(
                namespace=self._label_namespace)

        self._classification_layer = torch.nn.Linear(config.hidden_size,
                                                     self._num_labels)

        self._accuracy = CategoricalAccuracy()
        if loss is None:
            self._loss = torch.nn.CrossEntropyLoss()
        elif loss == 'focal_loss':
            self._loss = FocalLoss(alpha=0.25,
                                   num_classes=self._num_labels)  # focal loss
        elif loss == 'cross_entropy_loss':
            self._loss = torch.nn.CrossEntropyLoss()
        else:
            raise ValueError(f"unknown loss type: {loss}")
        self._f1_measure = FBetaMeasure()
        initializer(self)

    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
        outputs = self._bert(tokens['bert'],
                             attention_mask=None,
                             token_type_ids=None,
                             position_ids=None,
                             head_mask=None)
        # outputs[1] is the pooled sentence-level vector from the transformer's
        # tuple output (outputs[0] holds the per-token states).
        embedded_text = outputs[1]
        if self._dropout:
            embedded_text = self._dropout(embedded_text)

        logits = self._classification_layer(embedded_text)
        probs = torch.nn.functional.softmax(logits, dim=-1)

        output_dict = {"logits": logits, "probs": probs}

        if label is not None:
            loss = self._loss(logits, label.long().view(-1))
            output_dict["loss"] = loss
            self._accuracy(logits, label)
            self._f1_measure(logits, label)

        return output_dict

    @overrides
    def decode(
            self, output_dict: Dict[str,
                                    torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Does a simple argmax over the probabilities, converts the index to a string label,
        and adds a ``"label"`` key to the dictionary with the result.
        """
        predictions = output_dict["probs"]
        if predictions.dim() == 2:
            predictions_list = [
                predictions[i] for i in range(predictions.shape[0])
            ]
        else:
            predictions_list = [predictions]
        classes = []
        for prediction in predictions_list:
            label_idx = prediction.argmax(dim=-1).item()
            label_str = self.vocab.get_index_to_token_vocabulary(
                self._label_namespace).get(label_idx, str(label_idx))
            classes.append(label_str)
        output_dict["label"] = classes
        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        f1_dict = self._f1_measure.get_metric(reset)
        output = {}
        output['accuracy'] = self._accuracy.get_metric(reset=reset)
        counter = 0
        for precision, recall, fscore in zip(f1_dict['precision'],
                                             f1_dict['recall'],
                                             f1_dict['fscore']):
            output[str(counter) + '_precision'] = precision
            output[str(counter) + '_recall'] = recall
            output[str(counter) + '_fscore'] = fscore
            counter += 1
        return output
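
FocalLoss here is project-specific and its implementation is not shown in the snippet.
For reference, a minimal sketch of the standard formulation it presumably follows,
FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t); this is an assumption, not the project's
actual code:

import torch
import torch.nn.functional as F

class FocalLossSketch(torch.nn.Module):
    """Minimal multi-class focal loss: down-weights well-classified examples."""
    def __init__(self, alpha: float = 0.25, gamma: float = 2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        log_probs = F.log_softmax(logits, dim=-1)
        log_p_t = log_probs.gather(1, target.unsqueeze(1)).squeeze(1)
        p_t = log_p_t.exp()
        return (-self.alpha * (1 - p_t) ** self.gamma * log_p_t).mean()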
Example #6
class LieDetector(Model):
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 posclass_weight: Optional[float] = 1,
                 use_power: Optional[bool] = False,
                 dropout: Optional[float] = 0) -> None:
        super().__init__(vocab)
        
        self.embedder = embedder
        self.encoder = encoder
        if use_power:
            self.classifier = torch.nn.Linear(
                in_features=encoder.get_output_dim() + 1,
                out_features=vocab.get_vocab_size('labels')
            )
        else:
            self.classifier = torch.nn.Linear(
                in_features=encoder.get_output_dim(),
                out_features=vocab.get_vocab_size('labels')
            )
        self.use_power = use_power
    
        self.f1_lie = F1Measure(vocab.get_token_index('False', 'labels'))
        self.f1_truth = F1Measure(vocab.get_token_index('True', 'labels'))
        self.micro_f1 = FBetaMeasure(average='micro')
        self.macro_f1 = FBetaMeasure(average='macro')
        
        weights = [1.0, 1.0]
        weights[vocab.get_token_index('False', 'labels')] = posclass_weight
        self.loss = torch.nn.CrossEntropyLoss(weight=torch.Tensor(weights))

        self.dropout = torch.nn.Dropout(dropout)
    
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        lie_precision, lie_recall, lie_fscore = self.f1_lie.get_metric(reset)
        truth_precision, truth_recall, truth_fscore = self.f1_truth.get_metric(reset)
        micro_metrics = self.micro_f1.get_metric(reset)
        macro_metrics = self.macro_f1.get_metric(reset)

        return {
            'truth_precision': truth_precision,
            'truth_recall': truth_recall,
            'truth_fscore': truth_fscore,
            'lie_precision': lie_precision,
            'lie_recall': lie_recall,
            'lie_fscore': lie_fscore,
            'macro_fscore': macro_metrics['fscore'],
            'micro_precision': micro_metrics['precision'],
            'micro_recall': micro_metrics['recall'],
            'micro_fscore': micro_metrics['fscore']
        }

    def forward(self,
                message: Dict[str, torch.Tensor],                
                score_delta: torch.Tensor,
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:

        mask = get_text_field_mask(message)
        embedded = self.embedder(message)

        encoded = self.encoder(embedded, mask)
        if self.use_power:
            encoded = torch.cat((score_delta.view(-1, 1), encoded), 1)
        encoded = self.dropout(encoded)
        
        classified = self.classifier(encoded)

        output = {}
        output["logits"] = classified
        if label is not None:
            self.f1_lie(classified, label)
            self.f1_truth(classified, label)
            self.micro_f1(classified, label)
            self.macro_f1(classified, label)            
            output["loss"] = self.loss(classified, label)
        
        return output
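
The posclass_weight knob above feeds a class-weight vector into CrossEntropyLoss, so a
misclassified lie (the 'False' label) costs more than a misclassified truth. A quick
sketch of the effect, using reduction="sum" to make the scaling visible (with the
default mean reduction the weights also rescale the denominator); the numbers are
illustrative, not from the original snippet:

import torch

logits = torch.tensor([[0.2, 0.8]])  # model leans "truth"
lie_label = torch.tensor([0])
unweighted = torch.nn.CrossEntropyLoss(reduction="sum")
weighted = torch.nn.CrossEntropyLoss(weight=torch.tensor([3.0, 1.0]), reduction="sum")
print(unweighted(logits, lie_label))  # tensor(1.0374)
print(weighted(logits, lie_label))    # tensor(3.1121): the lie class costs 3x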