# Example no. 1
class TagDecoder(Model):
    """
    A basic sequence tagger that decodes from inputs of word embeddings.

    The input representation is passed through ``encoder`` and projected onto
    the label namespace of ``task``.  When a previous task is configured, an
    embedding of that task's output is concatenated to the input first.  One
    additional linear classifier is trained for every namespace listed in
    ``features``.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 task: str,
                 encoder: Seq2SeqEncoder,
                 prev_task: str,
                 prev_task_embed_dim: int = None,
                 label_smoothing: float = 0.0,
                 dropout: float = 0.0,
                 adaptive: bool = False,
                 features: List[str] = None,
                 metric: str = "acc",
                 loss_weight: float = 1.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        """
        Parameters
        ----------
        vocab : ``Vocabulary``, required.
            Vocabulary holding the label namespaces used by this decoder.
        task : ``str``, required.
            Name of the task; also the vocabulary namespace of its labels.
        encoder : ``Seq2SeqEncoder``, required.
            Encoder applied to the (possibly augmented) input representation.
        prev_task : ``str``, required.
            Name of the task whose output is fed into this decoder, or ``None``.
        prev_task_embed_dim : ``int``, optional (default = None)
            Size of the previous-task tag embedding; ``None`` or ``0`` disables it.
        label_smoothing : ``float``, optional (default = 0.0)
            Label smoothing value forwarded to the loss functions.
        dropout : ``float``, optional (default = 0.0)
            Dropout probability applied to the embedded previous-task tags.
        adaptive : ``bool``, optional (default = False)
            Use an adaptive log-softmax output layer instead of a linear one.
        features : ``List[str]``, optional (default = None)
            Namespaces of auxiliary labels predicted alongside the main task.
        metric : ``str``, optional (default = "acc")
            Either ``"acc"`` or ``"span_f1"``; anything else falls back to accuracy.
        loss_weight : ``float``, optional (default = 1.0)
            Multiplier applied to the main (non-adaptive) task loss.
        initializer : ``InitializerApplicator``, optional
            Applied to the fully constructed module.
        regularizer : ``RegularizerApplicator``, optional (default = None)
            Passed through to the base ``Model``.
        """
        super(TagDecoder, self).__init__(vocab, regularizer)

        self.task = task
        self.dropout = torch.nn.Dropout(p=dropout)
        self.encoder = encoder
        self.output_dim = encoder.get_output_dim()
        self.label_smoothing = label_smoothing
        self.num_classes = self.vocab.get_vocab_size(task)
        self.adaptive = adaptive
        self.features = features if features else []
        self.metric = metric
        self.loss_weight = loss_weight

        # Add all possible relative encodings to the vocabulary.
        # NOTE(review): the lookup below uses the default namespace although
        # the tokens are added to 'dep_encoded' -- confirm this is intended.
        if self.vocab.get_token_index('100,root') == 1:
            for head in self.vocab.get_token_to_index_vocabulary('head_tags'):
                all_encodings = get_all_relative_encodings(head)
                self.vocab.add_tokens_to_namespace(tokens=all_encodings, namespace='dep_encoded')
            # Make sure the end token '100,root' is present as well.
            self.vocab.add_token_to_namespace(token='100,root', namespace='dep_encoded')

        # Optional embedding of the tags produced by the previous task.
        self.prev_task_tag_embedding = None
        if prev_task_embed_dim is not None and prev_task_embed_dim != 0 and prev_task is not None:
            if prev_task != 'rependency':
                tag_namespace = prev_task
            else:
                tag_namespace = 'dep_encoded'
            self.prev_task_tag_embedding = Embedding(self.vocab.get_vocab_size(tag_namespace),
                                                     prev_task_embed_dim)

        # Choose the metric to use for the evaluation (from the defined
        # "metric" value of the task). If not specified, default to accuracy.
        if self.metric == "acc":
            self.metrics = {"acc": CategoricalAccuracy()}
        elif self.metric == "span_f1":
            self.metrics = {"span_f1": SpanBasedF1Measure(
                self.vocab, tag_namespace=self.task, label_encoding="BIO")}
        else:
            logger.warning(f"ERROR. Metric: {self.metric} unrecognized. Using accuracy instead.")
            self.metrics = {"acc": CategoricalAccuracy()}

        if self.adaptive:
            # TODO
            adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
            self.task_output = AdaptiveLogSoftmaxWithLoss(self.output_dim,
                                                          self.num_classes,
                                                          cutoffs=adaptive_cutoffs)
        else:
            self.task_output = TimeDistributed(Linear(self.output_dim, self.num_classes))

        # One classifier and one metric set per auxiliary feature namespace.
        self.feature_outputs = torch.nn.ModuleDict()
        self.features_metrics = {}
        for feature in self.features:
            self.feature_outputs[feature] = TimeDistributed(Linear(self.output_dim,
                                                                   vocab.get_vocab_size(feature)))
            self.features_metrics[feature] = {
                "acc": CategoricalAccuracy(),
            }

        initializer(self)

    def forward(self,
                encoded_text: torch.FloatTensor,
                mask: torch.LongTensor,
                gold_tags: Dict[str, torch.LongTensor],
                prev_task_classes: torch.LongTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        """
        Encode the input and compute the task output (and loss, if gold tags exist).

        Parameters
        ----------
        encoded_text : ``torch.FloatTensor``, required.
            Three-dimensional input representation (its first size is the batch size).
        mask : ``torch.LongTensor``, required.
            Mask passed to the encoder, the loss and the metrics.
        gold_tags : ``Dict[str, torch.LongTensor]``, required.
            Gold labels keyed by namespace; the entry for ``self.task`` may be absent.
        prev_task_classes : optional (default = None)
            Indexable pair: element 0 is the previous task's output and element 1
            is a flag.  A truthy flag multiplies element 0 with the embedding
            matrix; otherwise element 0 is looked up in the embedding.
            NOTE(review): presumably soft class distributions vs. hard class
            indices -- confirm against the caller.
        metadata : ``List[Dict[str, Any]]``, optional (default = None)
            Unused.

        Returns
        -------
        The dictionary produced by the selected loss function.
        """
        batch_size, _, _ = encoded_text.size()

        if prev_task_classes is not None and self.prev_task_tag_embedding is not None:
            if prev_task_classes[1]:
                embedded_tags = torch.matmul(prev_task_classes[0], self.prev_task_tag_embedding.weight)
            else:
                prev_embed_size = self.prev_task_tag_embedding.get_output_dim()
                embedded_tags = self.dropout(self.prev_task_tag_embedding(prev_task_classes[0]))
                embedded_tags = embedded_tags.view(batch_size, -1, prev_embed_size)
            encoded_text = torch.cat([encoded_text, embedded_tags], -1)

        hidden = self.encoder(encoded_text, mask)

        batch_size, sequence_length, _ = hidden.size()
        output_dim = [batch_size, sequence_length, self.num_classes]

        loss_fn = self._adaptive_loss if self.adaptive else self._loss

        output_dict = loss_fn(hidden, mask, gold_tags.get(self.task, None), output_dim)
        self._features_loss(hidden, mask, gold_tags, output_dict)

        return output_dict

    def _adaptive_loss(self, hidden, mask, gold_tags, output_dim):
        """
        Score ``hidden`` with the adaptive log-softmax output layer.

        Note that ``"class_probabilities"`` holds the values returned by the
        layer's ``log_prob`` and ``"logits"`` is ``hidden`` itself.
        """
        logits = hidden
        reshaped_log_probs = logits.view(-1, logits.size(2))

        class_probabilities = self.task_output.log_prob(reshaped_log_probs).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            output_dict["loss"] = sequence_cross_entropy(class_probabilities,
                                                         gold_tags,
                                                         mask,
                                                         label_smoothing=self.label_smoothing)
            for metric in self.metrics.values():
                metric(class_probabilities, gold_tags, mask.float())

        return output_dict

    def _loss(self, hidden, mask, gold_tags, output_dim):
        """
        Project ``hidden`` onto the label space and, when gold tags are given,
        compute the weighted cross entropy loss and update the metrics.
        """
        logits = self.task_output(hidden)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            output_dict["loss"] = self.loss_weight * sequence_cross_entropy_with_logits(
                logits,
                gold_tags,
                mask,
                label_smoothing=self.label_smoothing)
            for metric in self.metrics.values():
                metric(logits, gold_tags, mask.float())

        return output_dict

    def _features_loss(self, hidden, mask, gold_tags, output_dict):
        """
        Add the averaged auxiliary feature losses to ``output_dict["loss"]``
        in place and update the feature metrics.  Does nothing without gold tags.
        """
        if gold_tags is None:
            return

        for feature in self.features:
            logits = self.feature_outputs[feature](hidden)
            loss = sequence_cross_entropy_with_logits(logits,
                                                      gold_tags[feature],
                                                      mask,
                                                      label_smoothing=self.label_smoothing)
            # Each feature contributes an equal share of the auxiliary loss.
            loss /= len(self.features)
            output_dict["loss"] += loss

            for metric in self.features_metrics[feature].values():
                metric(logits, gold_tags[feature], mask.float())

    @staticmethod
    def _decode_lemma(word, rule):
        """Turn a predicted lemma rule into the lemma of ``word``."""
        if rule == "_":
            return "_"
        if rule == "@@UNKNOWN@@":
            return word
        return apply_lemma_rule(word, rule)

    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Convert the class scores of this task into label strings.

        Reads ``output_dict["words"]`` and
        ``output_dict["class_probabilities"][self.task]`` and stores the decoded
        tags (one list per sentence) under ``output_dict[self.task]``.
        """
        all_words = output_dict["words"]

        all_predictions = output_dict["class_probabilities"][self.task].cpu().data.numpy()
        if all_predictions.ndim == 3:
            predictions_list = list(all_predictions)
        else:
            # A single unbatched sentence.
            predictions_list = [all_predictions]

        all_tags = []
        for predictions, words in zip(predictions_list, all_words):
            argmax_indices = numpy.argmax(predictions, axis=-1)
            tags = [self.vocab.get_token_from_index(x, namespace=self.task)
                    for x in argmax_indices]

            # TODO: specific task
            if self.task == "lemmas":
                tags = [self._decode_lemma(word, rule) for word, rule in zip(words, tags)]

            all_tags.append(tags)
        output_dict[self.task] = all_tags

        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the main and auxiliary feature metrics, keyed by task path."""
        main_metrics = {
            f".run/{self.task}/{metric_name}": metric.get_metric(reset)
            for metric_name, metric in self.metrics.items()
        }

        features_metrics = {
            f"_run/{self.task}/{feature}/{metric_name}": metric.get_metric(reset)
            for feature in self.features
            for metric_name, metric in self.features_metrics[feature].items()
        }

        return {**main_metrics, **features_metrics}
# Example no. 2
class TagDecoder(Model):
    """
    A basic sequence tagger that decodes from inputs of word embeddings.

    The input representation is optionally concatenated with a language
    embedding (and passed through a feed-forward layer), run through
    ``encoder`` and projected onto the label namespace of ``task``.  One
    additional linear classifier is trained for every namespace listed in
    ``features``.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 task: str,
                 encoder: Seq2SeqEncoder,
                 lang_embed_dim: int = None,
                 use_lang_feedforward: bool = False,
                 lang_feedforward: FeedForward = None,
                 label_smoothing: float = 0.0,
                 dropout: float = 0.0,
                 adaptive: bool = False,
                 features: List[str] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        """
        Parameters
        ----------
        vocab : ``Vocabulary``, required.
            Vocabulary holding the label namespaces (and ``"langs"``).
        task : ``str``, required.
            Name of the task; also the vocabulary namespace of its labels.
        encoder : ``Seq2SeqEncoder``, required.
            Encoder applied to the (possibly augmented) input representation.
        lang_embed_dim : ``int``, optional (default = None)
            Size of the language embedding; ``None`` disables it.
        use_lang_feedforward : ``bool``, optional (default = False)
            Apply a feed-forward layer after concatenating the language embedding.
        lang_feedforward : ``FeedForward``, optional (default = None)
            Feed-forward layer to use; a default one is built when omitted.
        label_smoothing : ``float``, optional (default = 0.0)
            Label smoothing value forwarded to the loss functions.
        dropout : ``float``, optional (default = 0.0)
            Dropout probability applied to the language embedding.
        adaptive : ``bool``, optional (default = False)
            Use an adaptive log-softmax output layer instead of a linear one.
        features : ``List[str]``, optional (default = None)
            Namespaces of auxiliary labels predicted alongside the main task.
        initializer : ``InitializerApplicator``, optional
            Applied to the fully constructed module.
        regularizer : ``RegularizerApplicator``, optional (default = None)
            Passed through to the base ``Model``.
        """
        super(TagDecoder, self).__init__(vocab, regularizer)

        self.lang_embedding = None
        if lang_embed_dim is not None:
            self.lang_embedding = Embedding(self.vocab.get_vocab_size("langs"), lang_embed_dim)

        self.dropout = torch.nn.Dropout(p=dropout)

        self.task = task
        self.encoder = encoder
        self.output_dim = encoder.get_output_dim()
        self.label_smoothing = label_smoothing
        self.num_classes = self.vocab.get_vocab_size(task)
        self.adaptive = adaptive
        self.features = features if features else []

        self.use_lang_feedforward = use_lang_feedforward
        if self.lang_embedding is not None and use_lang_feedforward:
            # NOTE(review): only ``FeedForward(self.output_dim, 1, ...`` of the
            # default layer is known; its hidden size and activation below are
            # assumptions -- confirm against the intended configuration.
            self.lang_feedforward = lang_feedforward or \
                                     FeedForward(self.output_dim, 1,
                                                 self.output_dim,
                                                 torch.nn.ELU())

        self.metrics = {
            "acc": CategoricalAccuracy(),
        }

        if self.adaptive:
            # TODO
            adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
            self.task_output = AdaptiveLogSoftmaxWithLoss(self.output_dim,
                                                          self.num_classes,
                                                          cutoffs=adaptive_cutoffs)
        else:
            self.task_output = TimeDistributed(Linear(self.output_dim, self.num_classes))

        # One classifier and one metric set per auxiliary feature namespace.
        self.feature_outputs = torch.nn.ModuleDict()
        self.features_metrics = {}
        for feature in self.features:
            self.feature_outputs[feature] = TimeDistributed(Linear(self.output_dim,
                                                                   vocab.get_vocab_size(feature)))
            self.features_metrics[feature] = {
                "acc": CategoricalAccuracy(),
            }

        initializer(self)

    def forward(self,
                encoded_text: torch.FloatTensor,
                mask: torch.LongTensor,
                gold_tags: Dict[str, torch.LongTensor],
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        """
        Encode the input and compute the task output (and loss, if gold tags exist).

        Parameters
        ----------
        encoded_text : ``torch.FloatTensor``, required.
            Three-dimensional input representation (its first size is the batch size).
        mask : ``torch.LongTensor``, required.
            Mask passed to the encoder, the loss and the metrics.
        gold_tags : ``Dict[str, torch.LongTensor]``, required.
            Gold labels keyed by namespace.  Must contain ``self.task`` and,
            when the language embedding is enabled, ``'langs'``.
        metadata : ``List[Dict[str, Any]]``, optional (default = None)
            Unused.

        Returns
        -------
        The dictionary produced by the selected loss function.
        """
        if self.lang_embedding is not None:
            batch_size, _, _ = encoded_text.size()
            lang_embed_size = self.lang_embedding.get_output_dim()
            embedded_lang = self.dropout(self.lang_embedding(gold_tags['langs']))
            embedded_lang = embedded_lang.view(batch_size, -1, lang_embed_size)
            encoded_text = torch.cat([encoded_text, embedded_lang], -1)

            if self.use_lang_feedforward:
                encoded_text = self.lang_feedforward(encoded_text)

        hidden = self.encoder(encoded_text, mask)

        batch_size, sequence_length, _ = hidden.size()
        output_dim = [batch_size, sequence_length, self.num_classes]

        loss_fn = self._adaptive_loss if self.adaptive else self._loss

        output_dict = loss_fn(hidden, mask, gold_tags[self.task], output_dim)
        self._features_loss(hidden, mask, gold_tags, output_dict)

        return output_dict

    def _adaptive_loss(self, hidden, mask, gold_tags, output_dim):
        """
        Score ``hidden`` with the adaptive log-softmax output layer.

        Note that ``"class_probabilities"`` holds the values returned by the
        layer's ``log_prob`` and ``"logits"`` is ``hidden`` itself.
        """
        logits = hidden
        reshaped_log_probs = logits.view(-1, logits.size(2))

        class_probabilities = self.task_output.log_prob(reshaped_log_probs).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            output_dict["loss"] = sequence_cross_entropy(class_probabilities,
                                                         gold_tags,
                                                         mask,
                                                         label_smoothing=self.label_smoothing)
            for metric in self.metrics.values():
                metric(class_probabilities, gold_tags, mask.float())

        return output_dict

    def _loss(self, hidden, mask, gold_tags, output_dim):
        """
        Project ``hidden`` onto the label space and, when gold tags are given,
        compute the cross entropy loss and update the metrics.
        """
        logits = self.task_output(hidden)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            output_dict["loss"] = sequence_cross_entropy_with_logits(logits,
                                                                     gold_tags,
                                                                     mask,
                                                                     label_smoothing=self.label_smoothing)
            for metric in self.metrics.values():
                metric(logits, gold_tags, mask.float())

        return output_dict

    def _features_loss(self, hidden, mask, gold_tags, output_dict):
        """
        Add the averaged auxiliary feature losses to ``output_dict["loss"]``
        in place and update the feature metrics.  Does nothing without gold tags.
        """
        if gold_tags is None:
            return

        for feature in self.features:
            logits = self.feature_outputs[feature](hidden)
            loss = sequence_cross_entropy_with_logits(logits,
                                                      gold_tags[feature],
                                                      mask,
                                                      label_smoothing=self.label_smoothing)
            # Each feature contributes an equal share of the auxiliary loss.
            loss /= len(self.features)
            output_dict["loss"] += loss

            for metric in self.features_metrics[feature].values():
                metric(logits, gold_tags[feature], mask.float())

    @staticmethod
    def _decode_lemma(word, rule):
        """Turn a predicted lemma rule into the lemma of ``word``."""
        if rule == "_":
            return "_"
        if rule == "@@UNKNOWN@@":
            return word
        return apply_lemma_rule(word, rule)

    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Convert the class scores of this task into label strings.

        Reads ``output_dict["words"]`` and
        ``output_dict["class_probabilities"][self.task]`` and stores the decoded
        tags (one list per sentence) under ``output_dict[self.task]``.
        """
        all_words = output_dict["words"]

        all_predictions = output_dict["class_probabilities"][self.task].cpu().data.numpy()
        if all_predictions.ndim == 3:
            predictions_list = list(all_predictions)
        else:
            # A single unbatched sentence.
            predictions_list = [all_predictions]

        all_tags = []
        for predictions, words in zip(predictions_list, all_words):
            argmax_indices = numpy.argmax(predictions, axis=-1)
            tags = [self.vocab.get_token_from_index(x, namespace=self.task)
                    for x in argmax_indices]

            # TODO: specific task
            if self.task == "lemmas":
                tags = [self._decode_lemma(word, rule) for word, rule in zip(words, tags)]

            all_tags.append(tags)
        output_dict[self.task] = all_tags

        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the main and auxiliary feature metrics, keyed by task path."""
        main_metrics = {
            f".run/{self.task}/{metric_name}": metric.get_metric(reset)
            for metric_name, metric in self.metrics.items()
        }

        features_metrics = {
            f"_run/{self.task}/{feature}/{metric_name}": metric.get_metric(reset)
            for feature in self.features
            for metric_name, metric in self.features_metrics[feature].items()
        }

        return {**main_metrics, **features_metrics}
# Example no. 3
class MultiTagDecoder(Model):
    """A basic sequence tagger that decodes from inputs of word embeddings."""
    def __init__(self,
                 vocab: Vocabulary,
                 task: str,
                 encoder: Seq2SeqEncoder,
                 prev_task: str,
                 prev_task_embed_dim: int = None,
                 label_smoothing: float = 0.0,
                 dropout: float = 0.0,
                 adaptive: bool = False,
                 features: List[str] = None,
                 metric: str = "acc",
                 loss_weight: float = 1.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 threshold: float = 0.5,
                 max_heads: int = 2,
                 focal_gamma: float = None,
                 focal_alpha: float = None) -> None:
        """
        Parameters
        ----------
        vocab : ``Vocabulary``, required.
            Vocabulary holding the label namespaces used by this decoder.
        task : ``str``, required.
            Name of the task; also the vocabulary namespace of its labels.
        encoder : ``Seq2SeqEncoder``, required.
            Encoder applied to the (possibly augmented) input representation.
        prev_task : ``str``, required.
            Name of the task whose output is fed into this decoder, or ``None``.
        prev_task_embed_dim : ``int``, optional (default = None)
            Size of the previous-task tag embedding; ``None`` or ``0`` disables it.
        label_smoothing : ``float``, optional (default = 0.0)
            Label smoothing value, stored on the instance.
        dropout : ``float``, optional (default = 0.0)
            Dropout probability applied to the embedded previous-task tags.
        adaptive : ``bool``, optional (default = False)
            Use an adaptive log-softmax output layer instead of a linear one.
        features : ``List[str]``, optional (default = None)
            Accepted but unused: auxiliary feature classifiers are disabled here.
        metric : ``str``, optional (default = "acc")
            Either ``"acc"`` or ``"multi_span_f1"``; anything else falls back to
            accuracy.
        loss_weight : ``float``, optional (default = 1.0)
            Multiplier for the task loss, stored as ``self.loss_weight``.
        initializer : ``InitializerApplicator``, optional
            Applied to the fully constructed module.
        regularizer : ``RegularizerApplicator``, optional (default = None)
            Passed through to the base ``Model``.
        threshold : ``float``, optional (default = 0.5)
            Forwarded to the multi-span F1 metric.
        max_heads : ``int``, optional (default = 2)
            Forwarded to the multi-span F1 metric.
        focal_gamma : ``float``, optional (default = None)
            Stored as ``self.gamma``.
        focal_alpha : ``float``, optional (default = None)
            Stored as ``self.alpha``.
        """
        super(MultiTagDecoder, self).__init__(vocab, regularizer)

        self.task = task
        self.dropout = torch.nn.Dropout(p=dropout)
        self.encoder = encoder
        self.output_dim = encoder.get_output_dim()
        self.label_smoothing = label_smoothing
        self.num_classes = self.vocab.get_vocab_size(task)
        self.adaptive = adaptive
        self.metric = metric

        self._loss3 = torch.nn.BCEWithLogitsLoss()

        self.threshold = threshold
        self.max_heads = max_heads
        self.gamma = focal_gamma
        self.alpha = focal_alpha
        self.loss_weight = loss_weight

        # Add all possible relative encodings to the vocabulary.
        # NOTE(review): the lookup below uses the default namespace although
        # the tokens are added to 'dep_encoded' -- confirm this is intended.
        if self.vocab.get_token_index('100,root') == 1:
            for head in self.vocab.get_token_to_index_vocabulary('head_tags'):
                all_encodings = get_all_relative_encodings(head)
                self.vocab.add_tokens_to_namespace(tokens=all_encodings, namespace='dep_encoded')
            # Make sure the end token '100,root' is present as well.
            self.vocab.add_token_to_namespace(token='100,root', namespace='dep_encoded')

        # Optional embedding of the tags produced by the previous task.
        self.prev_task_tag_embedding = None
        if prev_task_embed_dim is not None and prev_task_embed_dim != 0 and prev_task is not None:
            if prev_task != 'rependency':
                tag_namespace = prev_task
            else:
                tag_namespace = 'dep_encoded'
            self.prev_task_tag_embedding = Embedding(self.vocab.get_vocab_size(tag_namespace),
                                                     prev_task_embed_dim)

        # Choose the metric to use for the evaluation (from the defined
        # "metric" value of the task). If not specified, default to accuracy.
        if self.metric == "acc":
            self.metrics = {"acc": CategoricalAccuracy()}
        elif self.metric == "multi_span_f1":
            self.metrics = {"multi_span_f1": MultiSpanBasedF1Measure(
                self.vocab, tag_namespace=self.task, label_encoding="BIO",
                threshold=self.threshold, max_heads=self.max_heads)}
        else:
            logger.warning(f"ERROR. Metric: {self.metric} unrecognized. Using accuracy instead.")
            self.metrics = {"acc": CategoricalAccuracy()}

        if self.adaptive:
            # TODO
            adaptive_cutoffs = [round(self.num_classes / 15), 3 * round(self.num_classes / 15)]
            self.task_output = AdaptiveLogSoftmaxWithLoss(self.output_dim,
                                                          self.num_classes,
                                                          cutoffs=adaptive_cutoffs)
        else:
            self.task_output = TimeDistributed(Linear(self.output_dim, self.num_classes))

        # Auxiliary feature classifiers are disabled in this decoder; the
        # ``features`` argument is accepted but not used.

        initializer(self)


    def forward(self,
                encoded_text: torch.FloatTensor,
                mask: torch.LongTensor,
                gold_tags: Dict[str, torch.LongTensor],
                prev_task_classes: torch.LongTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        """
        Encode the input and compute the task output (and loss, if gold tags exist).

        Parameters
        ----------
        encoded_text : ``torch.FloatTensor``, required.
            Three-dimensional input representation (its first size is the batch size).
        mask : ``torch.LongTensor``, required.
            Mask passed to the encoder, the loss and the metrics.
        gold_tags : ``Dict[str, torch.LongTensor]``, required.
            Gold labels keyed by namespace; the entry for ``self.task`` may be absent.
        prev_task_classes : optional (default = None)
            Indexable pair: element 0 is the previous task's output and element 1
            is a flag.  A truthy flag multiplies element 0 with the embedding
            matrix; otherwise element 0 is looked up in the embedding.
            NOTE(review): presumably soft class distributions vs. hard class
            indices -- confirm against the caller.
        metadata : ``List[Dict[str, Any]]``, optional (default = None)
            Unused.

        Returns
        -------
        The dictionary produced by the selected loss function.
        """
        batch_size, _, _ = encoded_text.size()

        if prev_task_classes is not None and self.prev_task_tag_embedding is not None:
            if prev_task_classes[1]:
                embedded_tags = torch.matmul(prev_task_classes[0], self.prev_task_tag_embedding.weight)
            else:
                prev_embed_size = self.prev_task_tag_embedding.get_output_dim()
                embedded_tags = self.dropout(self.prev_task_tag_embedding(prev_task_classes[0]))
                embedded_tags = embedded_tags.view(batch_size, -1, prev_embed_size)
            encoded_text = torch.cat([encoded_text, embedded_tags], -1)

        hidden = self.encoder(encoded_text, mask)

        batch_size, sequence_length, _ = hidden.size()
        output_dim = [batch_size, sequence_length, self.num_classes]

        loss_fn = self._adaptive_loss if self.adaptive else self._loss

        return loss_fn(hidden, mask, gold_tags.get(self.task, None), output_dim)

    def _adaptive_loss(self, hidden, mask, gold_tags, output_dim):
        logits = hidden
        reshaped_log_probs = logits.view(-1, logits.size(2))

        class_probabilities = self.task_output.log_prob(reshaped_log_probs).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            output_dict["loss"] = sequence_cross_entropy(class_probabilities,
            for metric in self.metrics.values():
                metric(class_probabilities, gold_tags, mask.float())

        return output_dict

    def _loss2(self, hidden, mask, gold_tags, output_dim):
        logits = self.task_output(hidden)
        reshaped_log_probs = logits.view(-1, self.num_classes)

        # Use the sigmoid for class_probabilities instead of the softmax
        #class_probabilities = torch.sigmoid(reshaped_log_probs).view(output_dim) #logits)
        class_probabilities = torch.sigmoid(logits)
        # class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            # Compute the loss
            output_dict["loss"] = self.multi_class_cross_entropy_loss(
                scores=logits, labels=gold_tags, mask=mask

            for metric in self.metrics.values():
                # metric(logits, gold_tags, mask.float())
                metric(class_probabilities, gold_tags, mask.float())

        return output_dict

    def multi_class_cross_entropy_loss(self, scores, labels, mask):
        """
        Compute the masked binary cross entropy between ``scores`` and ``labels``.

        Parameters
        ----------
        scores : three-dimensional tensor of unnormalized scores; the last
            dimension is the class dimension.
        labels : tensor with the same number of elements as ``scores``.
        mask : two-dimensional tensor matching the first two dimensions of
            ``scores``.

        Returns
        -------
        ``100 *`` the value of ``self._loss3`` on the masked, flattened inputs.
        Padded positions are zeroed rather than removed, so they are still part
        of the tensors handed to ``self._loss3``.
        """
        # Broadcast the sentence-level mask over the class dimension.
        class_mask = mask.float().unsqueeze(-1).repeat(1, 1, scores.size(-1))

        # The scores (and gold labels) are masked and then flattened over
        # every dimension but the batch one before using the binary cross
        # entropy loss.
        flat_size = scores.size()
        flat_shape = (flat_size[0], flat_size[1] * scores.size(-1))
        scores_flat = (scores * class_mask).view(*flat_shape)
        labels_flat = (labels * class_mask).view(*flat_shape)

        loss = self._loss3(scores_flat, labels_flat)

        # Amplify the loss to actually see something...
        return 100 * loss

    def _loss(self, hidden, mask, gold_tags, output_dim):
        logits = self.task_output(hidden)
        reshaped_log_probs = logits.view(-1, self.num_classes)

        # Use the sigmoid for class_probabilities instead of the softmax
        #class_probabilities = torch.sigmoid(reshaped_log_probs).view(output_dim) #logits)
        class_probabilities = torch.sigmoid(logits)
        # class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(output_dim)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if gold_tags is not None:
            # Compute the loss
            #output_dict["loss"] = self.multi_class_cross_entropy_loss(
            #    scores=logits, labels=gold_tags, mask=mask
            output_dict["loss"] = self.loss_weight * self.sequence_cross_entropy_with_logits(logits,

            for metric in self.metrics.values():
                # metric(logits, gold_tags, mask.float())
                metric(class_probabilities, gold_tags, mask.float())

        return output_dict

    def sequence_cross_entropy_with_logits(self,
                                       logits: torch.FloatTensor,
                                       targets: torch.LongTensor,
                                       weights: torch.FloatTensor,
                                       average: str = "batch",
                                       label_smoothing: float = None,
                                       gamma: float = None,
                                       alpha: Union[float, List[float], torch.FloatTensor] = None
                                      ) -> torch.FloatTensor:
        Computes the cross entropy loss of a sequence, weighted with respect to
        some user provided weights. Note that the weighting here is not the same as
        in the :func:`torch.nn.CrossEntropyLoss()` criterion, which is weighting
        classes; here we are weighting the loss contribution from particular elements
        in the sequence. This allows loss computations for models which use padding.

        logits : ``torch.FloatTensor``, required.
            A ``torch.FloatTensor`` of size (batch_size, sequence_length, num_classes)
            which contains the unnormalized probability for each class.
        targets : ``torch.LongTensor``, required.
            A ``torch.LongTensor`` of size (batch, sequence_length) which contains the
            index of the true class for each corresponding step.
        weights : ``torch.FloatTensor``, required.
            A ``torch.FloatTensor`` of size (batch, sequence_length)
        average: str, optional (default = "batch")
            If "batch", average the loss across the batches. If "token", average
            the loss across each item in the input. If ``None``, return a vector
            of losses per batch element.
        label_smoothing : ``float``, optional (default = None)
            Whether or not to apply label smoothing to the cross-entropy loss.
            For example, with a label smoothing value of 0.2, a 4 class classification
            target would look like ``[0.05, 0.05, 0.85, 0.05]`` if the 3rd class was
            the correct label.
        gamma : ``float``, optional (default = None)
            Focal loss[*] focusing parameter ``gamma`` to reduces the relative loss for
            well-classified examples and put more focus on hard. The greater value
            ``gamma`` is, the more focus on hard examples.
        alpha : ``float`` or ``List[float]``, optional (default = None)
            Focal loss[*] weighting factor ``alpha`` to balance between classes. Can be
            used independently with ``gamma``. If a single ``float`` is provided, it
            is assumed binary case using ``alpha`` and ``1 - alpha`` for positive and
            negative respectively. If a list of ``float`` is provided, with the same
            length as the number of classes, the weights will match the classes.
            [*] T. Lin, P. Goyal, R. Girshick, K. He and P. Dollár, "Focal Loss for
            Dense Object Detection," 2017 IEEE International Conference on Computer
            Vision (ICCV), Venice, 2017, pp. 2999-3007.

        A torch.FloatTensor representing the cross entropy loss.
        If ``average=="batch"`` or ``average=="token"``, the returned loss is a scalar.
        If ``average is None``, the returned loss is a vector of shape (batch_size,).

        if average not in {None, "token", "batch"}:
            raise ValueError("Got average f{average}, expected one of "
                             "None, 'token', or 'batch'")

        label_smoothing = None

        # make sure weights are float
        # weights = weights.float()

        # Compute the mask before computing the loss
        # Transform the mask that is at the sentence level (#Size: n_batches x padded_document_length)
        # to a suitable format for the relation labels level
        #mask (2x3)
        padded_document_length = weights.size(1) # prendi la seconda dimensione (3)
        weights = weights.float()  # Size: n_batches x padded_document_length (2x3)

        # Make weights be of the right shape (i.e., extend a dimension to NUM_CLASSES)
        NUM_CLASSES = logits.size(-1)
        #weights = weights.unsqueeze_(-1)
        #weights = weights.expand(weights.shape[0], weights.shape[1], NUM_CLASSES)
        #weights = weights.unsqueeze(2).expand(weights.shape[0], weights.shape[1], NUM_CLASSES)

        # [e.view(padded_document_length, 1) * e for e in mask] ([3x3, 3x3])
        #squared_mask = torch.stack([e.view(padded_document_length, 1) * e for e in mask], dim=0) (2x3x3)
        #squared_mask = squared_mask.unsqueeze(-1).repeat(
        weights = weights.unsqueeze(-1).repeat(
            #1, 1, 1, self._n_classes
            1, 1, logits.size(-1)
        )  # Size: n_batches x padded_document_length x padded_document_length x n_classes (2x3x3x5)

        # sum all dim except batch
        non_batch_dims = tuple(range(1, len(weights.shape)))

        # shape : (batch_size,)
        weights_batch_sum = weights.sum(dim=non_batch_dims)
        weights_batch_sum2 = weights.sum(dim=(1,))[:,0]

        # shape : (batch * sequence_length, num_classes)
        # logits_flat = logits.view(-1, logits.size(-1))

        # Use log_sigmoid instead of log_softmax
        # log_probs_flat = torch.nn.functional.logsigmoid(logits_flat)
        # shape : (batch * sequence_length, num_classes)
        # log_probs_flat = torch.nn.functional.log_softmax(logits_flat, dim=-1)

        # Make the target handle NUM_CLASSES instead of one-best
        # shape : (batch * max_len, NUM_CLASSES)
        # targets_flat = targets.view(-1, NUM_CLASSES)
        # shape : (batch * max_len, 1)
        # targets_flat = targets.view(-1, 1).long()

        # The scores (and gold labels) are flattened before using
        # the binary cross entropy loss.
        # We thus transform
        flat_size = logits.size()
        logits = logits * weights  # Size: n_batches x padded_document_length x padded_document_length x n_classes
        logits_flat = logits.view(
            flat_size[0], flat_size[1] * logits.size(-1)
        #    flat_size[0], flat_size[1], flat_size[2] * self._n_classes
        )  # Size: n_batches x padded_document_length x (padded_document_length x n_classes)
        targets = targets * weights  # Size: n_batches x padded_document_length x padded_document_length x n_classes
        targets_flat = targets.view(
            flat_size[0], flat_size[1] * logits.size(-1)
        #    flat_size[0], flat_size[1], flat_size[2] * self._n_classes
        )  # Size: n_batches x padded_document_length x (padded_document_length x n_classes)

        # focal loss coefficient
        # if gamma:
        #     # shape : (batch * sequence_length, num_classes)
        #     probs_flat = log_probs_flat.exp()
        #     # shape : (batch * sequence_length,)
        #     probs_flat = torch.gather(probs_flat, dim=1, index=targets_flat)
        #     # shape : (batch * sequence_length,)
        #     focal_factor = (1. - probs_flat) ** gamma
        #     # shape : (batch, sequence_length)
        #     focal_factor = focal_factor.view(*targets.size())
        #     weights = weights * focal_factor

        if alpha is not None:
            # shape : () / (num_classes,)
            if isinstance(alpha, (float, int)):
                # pylint: disable=not-callable
                # shape : (2,)
                alpha_factor = torch.tensor([1. - float(alpha), float(alpha)],
                                            dtype=weights.dtype, device=weights.device)
                # pylint: enable=not-callable
            elif isinstance(alpha, (list, numpy.ndarray, torch.Tensor)):
                # pylint: disable=not-callable
                # shape : (c,)
                alpha_factor = torch.tensor(alpha, dtype=weights.dtype, device=weights.device)
                # pylint: enable=not-callable
                if not alpha_factor.size():
                    # shape : (1,)
                    alpha_factor = alpha_factor.view(1)
                    # shape : (2,)
                    alpha_factor = torch.cat([1 - alpha_factor, alpha_factor])
                raise TypeError(('alpha must be float, list of float, or torch.FloatTensor, '
                                 '{} provided.').format(type(alpha)))
            # shape : (batch, max_len)
            #alpha_factor = torch.gather(alpha_factor, dim=0, index=targets_flat.view(-1)).view(*targets.size())
            #weights = weights * alpha_factor

        if label_smoothing is not None and label_smoothing > 0.0:
            negative_log_likelihood_ = torch.nn.functional.binary_cross_entropy_with_logits(logits_flat, targets_flat, reduction='none') 

            num_classes = logits.size(-1)
            smoothing_value = label_smoothing / num_classes
            # Fill all the correct indices with 1 - smoothing value.

            #one_hot_targets = torch.zeros_like(negative_log_likelihood_).scatter_(-1, targets_flat.long(), 1.0 - label_smoothing)
            one_hot_targets = targets_flat.clone()
            one_hot_targets[one_hot_targets==1] = 1.0 - label_smoothing
            smoothed_targets = one_hot_targets + smoothing_value
            #negative_log_likelihood_flat = - logits_flat * smoothed_targets
            negative_log_likelihood_ = negative_log_likelihood_ * smoothed_targets

            # Keep all the classes instead of only the best one
            # negative_log_likelihood_flat = negative_log_likelihood_flat.sum(-1, keepdim=True)
            # Contribution to the negative log likelihood only comes from the exact indices
            # of the targets, as the target distributions are one-hot. Here we use torch.gather
            # to extract the indices of the num_classes dimension which contribute to the loss.
            # shape : (batch * sequence_length, 1)
            # negative_log_likelihood_flat = - torch.gather(log_probs_flat, dim=1, index=targets_flat)
            # negative_log_likelihood_flat = - log_probs_flat
            negative_log_likelihood_ = torch.nn.functional.binary_cross_entropy_with_logits(logits_flat, targets_flat, reduction='none') #self._loss3(logits_new, targets_new)
        # shape : (batch, sequence_length)
        # negative_log_likelihood = negative_log_likelihood_.view(*targets.size())
        # negative_log_likelihood = negative_log_likelihood_flat.view(*targets.size())
        # shape : (batch, sequence_length)
        #negative_log_likelihood = negative_log_likelihood * weights

        if gamma:
            # shape : (batch * sequence_length, num_classes)
            # probs_flat = log_probs_flat.exp()
            probs_flat = negative_log_likelihood_.exp()
            # shape : (batch * sequence_length,)
            # probs_flat = torch.gather(probs_flat, dim=1, index=targets_flat)
            # shape : (batch * sequence_length,)
            focal_factor = (1. - probs_flat) ** gamma
            # shape : (batch, sequence_length)
            focal_factor = focal_factor.view(*targets.size())
            weights = weights * focal_factor

        if alpha is not None:
            # shape : (batch, max_len)
            alpha_factor = torch.gather(alpha_factor, dim=0, index=targets_flat.long().view(-1)).view(*targets.size())
            weights = weights * alpha_factor

        negative_log_likelihood = negative_log_likelihood_.view(*targets.size())
        negative_log_likelihood = negative_log_likelihood * weights

        if average == "batch":
            # shape : (batch_size,)
            per_token_loss = negative_log_likelihood.sum((2,)) / NUM_CLASSES
            #print(per_token_loss, per_token_loss.shape)
            #per_batch_loss = negative_log_likelihood.sum(non_batch_dims) / (weights_batch_sum + 1e-13)
            per_batch_loss = per_token_loss.sum((1,)) / (weights_batch_sum2 + 1e-13)

            num_non_empty_sequences = ((weights_batch_sum2 > 0).float().sum() + 1e-13)

            return (per_batch_loss.sum() / num_non_empty_sequences) * 100 # amplify it to see something
        elif average == "token":
            return negative_log_likelihood.sum() / (weights_batch_sum.sum() + 1e-13)
            # shape : (batch_size,)
            per_batch_loss = negative_log_likelihood.sum(non_batch_dims) / (weights_batch_sum + 1e-13)
            return per_batch_loss

    # def _features_loss(self, hidden, mask, gold_tags, output_dict):
    #     if gold_tags is None:
    #         return

    #     for feature in self.features:
    #         logits = self.feature_outputs[feature](hidden)
    #         loss = sequence_cross_entropy_with_logits(logits,
    #                                                   gold_tags[feature],
    #                                                   mask,
    #                                                   label_smoothing=self.label_smoothing)
    #         loss /= len(self.features)
    #         output_dict["loss"] += loss

    #         for metric in self.features_metrics[feature].values():
    #             metric(logits, gold_tags[feature], mask.float())

    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Turn per-token class scores into lists of label strings (multi-label decoding).

        Reads ``output_dict["class_probabilities"][self.task]`` and
        ``output_dict["words"]`` and writes the decoded labels to
        ``output_dict[self.task]``: one list per sequence, containing one list of
        label strings per token.

        Per token, scores below ``self.threshold`` are zeroed out, then:

        * if zero or one score survives, the arg-max label of the raw scores is used;
        * otherwise the ``min(survivors, self.max_heads)`` highest-scoring labels are
          kept, ordered from lowest to highest score. When the ``"O"`` label is among
          them it either wins outright (if it is the best one) or it and every label
          scoring below it are dropped.

        NOTE(review): a 3-D score array is split along its first axis into one entry
        per sequence; anything else is treated as a single sequence. The expected
        layout is presumably (batch, tokens, classes) -- confirm against the caller.

        Parameters
        ----------
        output_dict : the forward-pass output; it is modified in place.

        Returns
        -------
        The same ``output_dict`` with ``output_dict[self.task]`` filled in.
        """
        all_words = output_dict["words"]
        all_predictions = output_dict["class_probabilities"][self.task].cpu().data.numpy()

        if all_predictions.ndim == 3:
            predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
        else:
            predictions_list = [all_predictions]

        threshold = self.threshold
        max_heads = self.max_heads
        outside_index = self.vocab.get_token_index("O", namespace=self.task)

        all_tags = []
        # ``words`` is not used, but zipping with it keeps the number of decoded
        # sequences bounded by the number of word sequences.
        for predictions, _words in zip(predictions_list, all_words):
            # Zero out every score that does not reach the threshold.
            pred_over_thresh = (predictions >= threshold) * predictions
            best_indices = numpy.argmax(predictions, axis=-1).tolist()

            sequence_token_labels = []
            for pred, best_index in zip(pred_over_thresh, best_indices):
                num_pred_over_thresh = int(numpy.count_nonzero(pred))

                if num_pred_over_thresh <= 1:
                    # Nothing (or a single label) passed the threshold: plain arg-max.
                    sequence_token_labels.append([best_index])
                    continue

                # Keep at most ``max_heads`` labels. A full sort (rather than
                # argpartition) guarantees ascending score order, so the last
                # element really is the best one, as the "O" handling below assumes.
                keep = min(num_pred_over_thresh, max_heads)
                pred_idx_list = numpy.argsort(pred, kind="stable")[-keep:].tolist()

                if outside_index in pred_idx_list:
                    outside_position = pred_idx_list.index(outside_index)
                    if outside_position == len(pred_idx_list) - 1:
                        # "O" is the best label: ignore all the others.
                        pred_idx_list = [outside_index]
                    else:
                        # Otherwise keep only the labels scoring above "O".
                        pred_idx_list = pred_idx_list[outside_position + 1:]

                sequence_token_labels.append(pred_idx_list)

            # Map label indices back to their string form.
            tags = [
                [
                    self.vocab.get_token_from_index(token_label, namespace=self.task)
                    for token_label in token_labels
                ]
                for token_labels in sequence_token_labels
            ]
            all_tags.append(tags)

        output_dict[self.task] = all_tags

        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """
        Collect the current value of every metric tracked for this task.

        Parameters
        ----------
        reset : forwarded to each metric's ``get_metric`` call.

        Returns
        -------
        A new dict mapping ``".run/<task>/<metric_name>"`` to the value returned by
        the corresponding entry of ``self.metrics``.
        """
        main_metrics = {
            f".run/{self.task}/{metric_name}": metric.get_metric(reset)
            for metric_name, metric in self.metrics.items()
        }

        return {**main_metrics}