def __init__(self,
             vocab: Vocabulary,
             bert_model: Union[str, BertModel],
             embedding_dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             label_smoothing: float = None,
             ignore_span_metric: bool = False,
             srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
             parser_path: str = "/home/rizwan/.allennlp/cache/elmo-allennlp_constituency_parser") -> None:
    super().__init__(vocab, regularizer)
    if isinstance(bert_model, str):
        self.bert_model = BertModel.from_pretrained(bert_model)
    else:
        self.bert_model = bert_model

    self.num_classes = self.vocab.get_vocab_size("labels")
    if srl_eval_path is not None:
        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
    else:
        self.span_metric = None
    self.tag_projection_layer = Linear(2 * self.bert_model.config.hidden_size, self.num_classes)

    self.embedding_dropout = Dropout(p=embedding_dropout)
    self._label_smoothing = label_smoothing
    self.ignore_span_metric = ignore_span_metric

    device = 0 if torch.cuda.is_available() else -1
    self.parser = Predictor.from_path(parser_path, cuda_device=device)
    self.syntax_roberta = RobertaModel.from_pretrained('../fairseq/checkpoints_768', 'checkpoint_best.pt')
    self.syntax_roberta.eval()
    self.matrix_attention = DotProductMatrixAttention()
    initializer(self)
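# NOTE: the ``__init__`` above appears to be a syntax-augmented variant of the
# BERT SRL model defined below; its forward pass is not included in this
# section, so the sketch that follows is only a guess at why the projection
# layer expects ``2 * hidden_size`` inputs: the BERT wordpiece states are
# aligned to the syntax encoder's states with dot-product attention (the same
# score that ``DotProductMatrixAttention`` computes) and the attended syntax
# vectors are concatenated onto the BERT vectors.  All names and shapes in
# this sketch are assumptions, not part of the original code.
import torch

bert_states = torch.randn(2, 7, 768)     # (batch, wordpieces, hidden_size)
syntax_states = torch.randn(2, 9, 768)   # (batch, syntax tokens, hidden_size)

# Dot-product attention scores between every wordpiece and every syntax token.
scores = torch.bmm(bert_states, syntax_states.transpose(1, 2))       # (2, 7, 9)
attention = torch.softmax(scores, dim=-1)
attended_syntax = torch.bmm(attention, syntax_states)                # (2, 7, 768)

# Concatenation doubles the feature size, matching Linear(2 * hidden_size, num_classes).
projected_input = torch.cat([bert_states, attended_syntax], dim=-1)  # (2, 7, 1536)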
class SrlBert(Model):
    """
    Parameters
    ----------
    vocab : ``Vocabulary``, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    bert_model : ``Union[str, BertModel]``, required.
        A string describing the BERT model to load or an already constructed BertModel.
    embedding_dropout : ``float``, optional (default = 0.0)
        The dropout probability applied to the BERT embeddings.
    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
        Used to initialize the model parameters.
    regularizer : ``RegularizerApplicator``, optional (default=``None``)
        If provided, will be used to calculate the regularization penalty during training.
    label_smoothing : ``float``, optional (default = 0.0)
        Whether or not to use label smoothing on the labels when computing cross entropy loss.
    ignore_span_metric : ``bool``, optional (default = False)
        Whether to calculate span loss, which is irrelevant when predicting BIO for Open Information
        Extraction.
    srl_eval_path : ``str``, optional (default=``DEFAULT_SRL_EVAL_PATH``)
        The path to the srl-eval.pl script. By default, will use the srl-eval.pl included with allennlp,
        which is located at allennlp/tools/srl-eval.pl . If ``None``, srl-eval.pl is not used.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 bert_model: Union[str, BertModel],
                 embedding_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 label_smoothing: float = None,
                 ignore_span_metric: bool = False,
                 srl_eval_path: str = DEFAULT_SRL_EVAL_PATH) -> None:
        super().__init__(vocab, regularizer)
        if isinstance(bert_model, str):
            self.bert_model = BertModel.from_pretrained(bert_model)
        else:
            self.bert_model = bert_model

        self.num_classes = self.vocab.get_vocab_size("labels")
        if srl_eval_path is not None:
            # For the span based evaluation, we don't want to consider labels
            # for verb, because the verb index is provided to the model.
            self.span_metric = SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
        else:
            self.span_metric = None
        self.tag_projection_layer = Linear(self.bert_model.config.hidden_size, self.num_classes)

        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        initializer(self)

    def forward(self,  # type: ignore
                tokens: Dict[str, torch.Tensor],
                verb_indicator: torch.Tensor,
                metadata: List[Any],
                tags: torch.LongTensor = None):
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. For this model, this must be a ``SingleIdTokenIndexer`` which
            indexes wordpieces from the BERT vocabulary.
        verb_indicator : torch.LongTensor, required.
            An integer ``SequenceFeatureField`` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape ``(batch_size, num_tokens)``
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence, the verb to compute the
            frame for, and start offsets for converting wordpieces back to a sequence of words,
            under 'words', 'verb' and 'offsets' keys, respectively.
        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        mask = get_text_field_mask(tokens)
        bert_embeddings, _ = self.bert_model(input_ids=tokens["tokens"],
                                             token_type_ids=verb_indicator,
                                             attention_mask=mask,
                                             output_all_encoded_layers=False)
        embedded_text_input = self.embedding_dropout(bert_embeddings)
        batch_size, sequence_length, _ = embedded_text_input.size()
        logits = self.tag_projection_layer(embedded_text_input)

        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
                                                                          sequence_length,
                                                                          self.num_classes])
        output_dict = {"logits": logits, "class_probabilities": class_probabilities}
        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.decode.
        output_dict["mask"] = mask
        # We add in the offsets here so we can compute the un-wordpieced tags.
        words, verbs, offsets = zip(*[(x["words"], x["verb"], x["offsets"]) for x in metadata])
        output_dict["words"] = list(words)
        output_dict["verb"] = list(verbs)
        output_dict["wordpiece_offsets"] = list(offsets)

        if tags is not None:
            loss = sequence_cross_entropy_with_logits(logits,
                                                      tags,
                                                      mask,
                                                      label_smoothing=self._label_smoothing)
            if not self.ignore_span_metric and self.span_metric is not None and not self.training:
                batch_verb_indices = [example_metadata["verb_index"] for example_metadata in metadata]
                batch_sentences = [example_metadata["words"] for example_metadata in metadata]
                # Get the BIO tags from decode()
                # TODO (nfliu): This is kind of a hack, consider splitting out part
                # of decode() to a separate function.
                batch_bio_predicted_tags = self.decode(output_dict).pop("tags")
                batch_conll_predicted_tags = [convert_bio_tags_to_conll_format(tags) for
                                              tags in batch_bio_predicted_tags]
                batch_bio_gold_tags = [example_metadata["gold_tags"] for example_metadata in metadata]
                batch_conll_gold_tags = [convert_bio_tags_to_conll_format(tags) for
                                         tags in batch_bio_gold_tags]
                self.span_metric(batch_verb_indices,
                                 batch_sentences,
                                 batch_conll_predicted_tags,
                                 batch_conll_gold_tags)
            output_dict["loss"] = loss
        return output_dict

    @overrides
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Does constrained viterbi decoding on class probabilities output in :func:`forward`.  The
        constraint simply specifies that the output tags must be a valid BIO sequence.  We add a
        ``"tags"`` key to the dictionary with the result.

        NOTE: First, we decode a BIO sequence on top of the wordpieces. This is important; viterbi
        decoding produces low quality output if you decode on top of word representations directly,
        because the model gets confused by the 'missing' positions (which is sensible as it is
        trained to perform tagging on wordpieces, not words).

        Secondly, it's important that the indices we use to recover words from the wordpieces are
        the start_offsets (i.e. offsets which correspond to using the first wordpiece of words
        which are tokenized into multiple wordpieces), as otherwise we might get an ill-formed BIO
        sequence when we select out the word tags from the wordpiece tags.
        This happens in the case that a word is split into multiple word pieces, and then we take
        the last tag of the word, which might correspond to, e.g., I-V, which would not be allowed
        as it is not preceded by a B tag.
        """
        all_predictions = output_dict['class_probabilities']
        sequence_lengths = get_lengths_from_binary_sequence_mask(output_dict["mask"]).data.tolist()

        if all_predictions.dim() == 3:
            predictions_list = [all_predictions[i].detach().cpu()
                                for i in range(all_predictions.size(0))]
        else:
            predictions_list = [all_predictions]
        wordpiece_tags = []
        word_tags = []
        transition_matrix = self.get_viterbi_pairwise_potentials()
        start_transitions = self.get_start_transitions()
        # **************** Different ********************
        # We add in the offsets here so we can compute the un-wordpieced tags.
        for predictions, length, offsets in zip(predictions_list,
                                                sequence_lengths,
                                                output_dict["wordpiece_offsets"]):
            max_likelihood_sequence, _ = viterbi_decode(predictions[:length],
                                                        transition_matrix,
                                                        allowed_start_transitions=start_transitions)
            tags = [self.vocab.get_token_from_index(x, namespace="labels")
                    for x in max_likelihood_sequence]
            wordpiece_tags.append(tags)
            word_tags.append([tags[i] for i in offsets])
        output_dict['wordpiece_tags'] = wordpiece_tags
        output_dict['tags'] = word_tags
        return output_dict

    def get_metrics(self, reset: bool = False):
        if self.ignore_span_metric:
            # Return an empty dictionary if ignoring the
            # span metric
            return {}
        else:
            metric_dict = self.span_metric.get_metric(reset=reset)
            # This can be a lot of metrics, as there are 3 per class.
            # we only really care about the overall metrics, so we filter for them here.
            return {x: y for x, y in metric_dict.items() if "overall" in x}

    def get_viterbi_pairwise_potentials(self):
        """
        Generate a matrix of pairwise transition potentials for the BIO labels.
        The only constraint implemented here is that I-XXX labels must be preceded
        by either an identical I-XXX tag or a B-XXX tag. In order to achieve this
        constraint, pairs of labels which do not satisfy this constraint have a
        pairwise potential of -inf.

        Returns
        -------
        transition_matrix : torch.Tensor
            A (num_labels, num_labels) matrix of pairwise potentials.
        """
        all_labels = self.vocab.get_index_to_token_vocabulary("labels")
        num_labels = len(all_labels)
        transition_matrix = torch.zeros([num_labels, num_labels])

        for i, previous_label in all_labels.items():
            for j, label in all_labels.items():
                # I labels can only be preceded by themselves or
                # their corresponding B tag.
                if i != j and label[0] == 'I' and not previous_label == 'B' + label[1:]:
                    transition_matrix[i, j] = float("-inf")
        return transition_matrix

    def get_start_transitions(self):
        """
        In the BIO sequence, we cannot start the sequence with an I-XXX tag.
        This transition sequence is passed to viterbi_decode to specify this constraint.

        Returns
        -------
        start_transitions : torch.Tensor
            The pairwise potentials between a START token and the first token of the sequence.
        """
        all_labels = self.vocab.get_index_to_token_vocabulary("labels")
        num_labels = len(all_labels)

        start_transitions = torch.zeros(num_labels)

        for i, label in all_labels.items():
            if label[0] == "I":
                start_transitions[i] = float("-inf")

        return start_transitions
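# A small, self-contained illustration (toy label set, not part of the model)
# of the BIO constraints that ``get_viterbi_pairwise_potentials`` and
# ``get_start_transitions`` encode for ``viterbi_decode`` above.
import torch

toy_labels = {0: "O", 1: "B-ARG0", 2: "I-ARG0", 3: "B-V", 4: "I-V"}
toy_transitions = torch.zeros(len(toy_labels), len(toy_labels))
for i, previous_label in toy_labels.items():
    for j, label in toy_labels.items():
        if i != j and label[0] == "I" and not previous_label == "B" + label[1:]:
            toy_transitions[i, j] = float("-inf")

# toy_transitions[prev, cur]: "O" -> "I-ARG0" is forbidden, "B-ARG0" -> "I-ARG0" is allowed.
assert toy_transitions[0, 2] == float("-inf")
assert toy_transitions[1, 2] == 0

# Sequences may not start with an I- tag.
toy_start_transitions = torch.tensor([0.0 if label[0] != "I" else float("-inf")
                                      for label in toy_labels.values()])
assert toy_start_transitions[2] == float("-inf")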
class SemanticRoleLabeler(Model):
    """
    This model performs semantic role labeling using BIO tags using Propbank semantic roles.
    Specifically, it is an implementation of [Deep Semantic Role Labeling - What works
    and what's next](https://www.aclweb.org/anthology/P17-1044).

    This implementation is effectively a series of stacked interleaved LSTMs with highway
    connections, applied to embedded sequences of words concatenated with a binary indicator
    containing whether or not a word is the verbal predicate to generate predictions for in
    the sentence. Additionally, during inference, Viterbi decoding is applied to constrain
    the predictions to contain valid BIO sequences.

    Specifically, the model expects and outputs IOB2-formatted tags, where the
    B- tag is used in the beginning of every chunk (i.e. all chunks start with the B- tag).

    # Parameters

    vocab : `Vocabulary`, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    text_field_embedder : `TextFieldEmbedder`, required
        Used to embed the `tokens` `TextField` we get as input to the model.
    encoder : `Seq2SeqEncoder`
        The encoder (with its own internal stacking) that we will use in between embedding tokens
        and predicting output tags.
    binary_feature_dim : int, required.
        The dimensionality of the embedding of the binary verb predicate features.
    initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`)
        Used to initialize the model parameters.
    label_smoothing : `float`, optional (default = 0.0)
        Whether or not to use label smoothing on the labels when computing cross entropy loss.
    ignore_span_metric : `bool`, optional (default = False)
        Whether to calculate span loss, which is irrelevant when predicting BIO for Open Information
        Extraction.
    srl_eval_path : `str`, optional (default=`DEFAULT_SRL_EVAL_PATH`)
        The path to the srl-eval.pl script. By default, will use the srl-eval.pl included with allennlp,
        which is located at allennlp/tools/srl-eval.pl . If `None`, srl-eval.pl is not used.
    """

    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        binary_feature_dim: int,
        embedding_dropout: float = 0.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        label_smoothing: float = None,
        ignore_span_metric: bool = False,
        srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        if srl_eval_path is not None:
            # For the span based evaluation, we don't want to consider labels
            # for verb, because the verb index is provided to the model.
            self.span_metric = SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
        else:
            self.span_metric = None

        self.encoder = encoder
        # There are exactly 2 binary features for the verb predicate embedding.
        self.binary_feature_embedding = Embedding(num_embeddings=2, embedding_dim=binary_feature_dim)
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes)
        )
        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric

        check_dimensions_match(
            text_field_embedder.get_output_dim() + binary_feature_dim,
            encoder.get_input_dim(),
            "text embedding dim + verb indicator embedding dim",
            "encoder input dim",
        )
        initializer(self)

    def forward(  # type: ignore
        self,
        tokens: TextFieldTensors,
        verb_indicator: torch.LongTensor,
        tags: torch.LongTensor = None,
        metadata: List[Dict[str, Any]] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        # Parameters

        tokens : TextFieldTensors, required
            The output of `TextField.as_array()`, which should typically be passed directly to a
            `TextFieldEmbedder`. This output is a dictionary mapping keys to `TokenIndexer`
            tensors.  At its most basic, using a `SingleIdTokenIndexer` this is :
            `{"tokens": Tensor(batch_size, num_tokens)}`. This dictionary will have the same keys
            as were used for the `TokenIndexers` when you created the `TextField` representing
            your sequence.  The dictionary is designed to be passed directly to a
            `TextFieldEmbedder`, which knows how to combine different word representations into a
            single vector per token in your input.
        verb_indicator : torch.LongTensor, required.
            An integer `SequenceFeatureField` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape `(batch_size, num_tokens)`
        metadata : `List[Dict[str, Any]]`, optional, (default = None)
            metadata containing the original words in the sentence and the verb to compute the
            frame for, under 'words' and 'verb' keys, respectively.

        # Returns

        An output dictionary consisting of:

        logits : torch.FloatTensor
            A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
        mask = get_text_field_mask(tokens)
        embedded_verb_indicator = self.binary_feature_embedding(verb_indicator.long())
        # Concatenate the verb feature onto the embedded text. This now
        # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
        embedded_text_with_verb_indicator = torch.cat(
            [embedded_text_input, embedded_verb_indicator], -1
        )
        batch_size, sequence_length, _ = embedded_text_with_verb_indicator.size()
        encoded_text = self.encoder(embedded_text_with_verb_indicator, mask)

        logits = self.tag_projection_layer(encoded_text)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_classes]
        )
        output_dict = {"logits": logits, "class_probabilities": class_probabilities}
        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.decode.
output_dict["mask"] = mask if tags is not None: loss = sequence_cross_entropy_with_logits( logits, tags, mask, label_smoothing=self._label_smoothing ) if not self.ignore_span_metric and self.span_metric is not None and not self.training: batch_verb_indices = [ example_metadata["verb_index"] for example_metadata in metadata ] batch_sentences = [example_metadata["words"] for example_metadata in metadata] # Get the BIO tags from decode() # TODO (nfliu): This is kind of a hack, consider splitting out part # of decode() to a separate function. batch_bio_predicted_tags = self.decode(output_dict).pop("tags") batch_conll_predicted_tags = [ convert_bio_tags_to_conll_format(tags) for tags in batch_bio_predicted_tags ] batch_bio_gold_tags = [ example_metadata["gold_tags"] for example_metadata in metadata ] batch_conll_gold_tags = [ convert_bio_tags_to_conll_format(tags) for tags in batch_bio_gold_tags ] self.span_metric( batch_verb_indices, batch_sentences, batch_conll_predicted_tags, batch_conll_gold_tags, ) output_dict["loss"] = loss words, verbs = zip(*[(x["words"], x["verb"]) for x in metadata]) if metadata is not None: output_dict["words"] = list(words) output_dict["verb"] = list(verbs) return output_dict @overrides def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ Does constrained viterbi decoding on class probabilities output in :func:`forward`. The constraint simply specifies that the output tags must be a valid BIO sequence. We add a `"tags"` key to the dictionary with the result. """ all_predictions = output_dict["class_probabilities"] sequence_lengths = get_lengths_from_binary_sequence_mask(output_dict["mask"]).data.tolist() if all_predictions.dim() == 3: predictions_list = [ all_predictions[i].detach().cpu() for i in range(all_predictions.size(0)) ] else: predictions_list = [all_predictions] all_tags = [] transition_matrix = self.get_viterbi_pairwise_potentials() start_transitions = self.get_start_transitions() for predictions, length in zip(predictions_list, sequence_lengths): max_likelihood_sequence, _ = viterbi_decode( predictions[:length], transition_matrix, allowed_start_transitions=start_transitions ) tags = [ self.vocab.get_token_from_index(x, namespace="labels") for x in max_likelihood_sequence ] all_tags.append(tags) output_dict["tags"] = all_tags return output_dict def get_metrics(self, reset: bool = False): if self.ignore_span_metric: # Return an empty dictionary if ignoring the # span metric return {} else: metric_dict = self.span_metric.get_metric(reset=reset) # This can be a lot of metrics, as there are 3 per class. # we only really care about the overall metrics, so we filter for them here. return {x: y for x, y in metric_dict.items() if "overall" in x} def get_viterbi_pairwise_potentials(self): """ Generate a matrix of pairwise transition potentials for the BIO labels. The only constraint implemented here is that I-XXX labels must be preceded by either an identical I-XXX tag or a B-XXX tag. In order to achieve this constraint, pairs of labels which do not satisfy this constraint have a pairwise potential of -inf. # Returns transition_matrix : torch.Tensor A (num_labels, num_labels) matrix of pairwise potentials. """ all_labels = self.vocab.get_index_to_token_vocabulary("labels") num_labels = len(all_labels) transition_matrix = torch.zeros([num_labels, num_labels]) for i, previous_label in all_labels.items(): for j, label in all_labels.items(): # I labels can only be preceded by themselves or # their corresponding B tag. 
if i != j and label[0] == "I" and not previous_label == "B" + label[1:]: transition_matrix[i, j] = float("-inf") return transition_matrix def get_start_transitions(self): """ In the BIO sequence, we cannot start the sequence with an I-XXX tag. This transition sequence is passed to viterbi_decode to specify this constraint. # Returns start_transitions : torch.Tensor The pairwise potentials between a START token and the first token of the sequence. """ all_labels = self.vocab.get_index_to_token_vocabulary("labels") num_labels = len(all_labels) start_transitions = torch.zeros(num_labels) for i, label in all_labels.items(): if label[0] == "I": start_transitions[i] = float("-inf") return start_transitions
class GCN_model(Model):
    """
    This model performs semantic role labeling using BIO tags using Propbank semantic roles.
    Specifically, it is an implementation of `Deep Semantic Role Labeling - What works
    and what's next <https://homes.cs.washington.edu/~luheng/files/acl2017_hllz.pdf>`_ .

    This implementation is effectively a series of stacked interleaved LSTMs with highway
    connections, applied to embedded sequences of words concatenated with a binary indicator
    containing whether or not a word is the verbal predicate to generate predictions for in
    the sentence. Additionally, during inference, Viterbi decoding is applied to constrain
    the predictions to contain valid BIO sequences.

    Specifically, the model expects and outputs IOB2-formatted tags, where the
    B- tag is used in the beginning of every chunk (i.e. all chunks start with the B- tag).

    Parameters
    ----------
    vocab : ``Vocabulary``, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    text_field_embedder : ``TextFieldEmbedder``, required
        Used to embed the ``tokens`` ``TextField`` we get as input to the model.
    encoder : ``Seq2SeqEncoder``
        The encoder (with its own internal stacking) that we will use in between embedding tokens
        and predicting output tags.
    binary_feature_dim : int, required.
        The dimensionality of the embedding of the binary verb predicate features.
    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
        Used to initialize the model parameters.
    regularizer : ``RegularizerApplicator``, optional (default=``None``)
        If provided, will be used to calculate the regularization penalty during training.
    label_smoothing : ``float``, optional (default = 0.0)
        Whether or not to use label smoothing on the labels when computing cross entropy loss.
    ignore_span_metric : ``bool``, optional (default = False)
        Whether to calculate span loss, which is irrelevant when predicting BIO for Open Information
        Extraction.
    srl_eval_path : ``str``, optional (default=``DEFAULT_SRL_EVAL_PATH``)
        The path to the srl-eval.pl script. By default, will use the srl-eval.pl included with allennlp,
        which is located at allennlp/tools/srl-eval.pl . If ``None``, srl-eval.pl is not used.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 binary_feature_dim: int,
                 embedding_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 label_smoothing: float = None,
                 ignore_span_metric: bool = False,
                 srl_eval_path: str = DEFAULT_SRL_EVAL_PATH) -> None:
        super(GCN_model, self).__init__(vocab, regularizer)
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        if srl_eval_path is not None:
            # For the span based evaluation, we don't want to consider labels
            # for verb, because the verb index is provided to the model.
            self.span_metric = SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
        else:
            self.span_metric = None

        self.encoder = encoder
        self.gcn_layer = GCN(nfeat=self.encoder.get_output_dim(), nhid=200, nclass=64, dropout=0.1)
        self.decoder = PytorchSeq2SeqWrapper(
            StackedAlternatingLstm(input_size=64,
                                   hidden_size=32,
                                   num_layers=2,
                                   recurrent_dropout_probability=0.1,
                                   use_highway=True))
        self.tag_projection_layer = TimeDistributed(Linear(32, self.num_classes))
        # self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(), self.num_classes))

        # There are exactly 2 binary features for the verb predicate embedding.
        self.binary_feature_embedding = Embedding(2, binary_feature_dim)
        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing
        self.ignore_span_metric = ignore_span_metric
        check_dimensions_match(text_field_embedder.get_output_dim() + binary_feature_dim,
                               encoder.get_input_dim(),
                               "text embedding dim + verb indicator embedding dim",
                               "encoder input dim")
        initializer(self)

    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                verb_indicator: torch.LongTensor,
                adj: torch.LongTensor,
                tags: torch.LongTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is:
            ``{"tokens": Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys
            as were used for the ``TokenIndexers`` when you created the ``TextField`` representing
            your sequence.  The dictionary is designed to be passed directly to a
            ``TextFieldEmbedder``, which knows how to combine different word representations into a
            single vector per token in your input.
        verb_indicator : torch.LongTensor, required.
            An integer ``SequenceFeatureField`` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        adj : torch.LongTensor, required.
            Adjacency information for each sentence's dependency graph, one entry per example in
            the batch, which is passed to the GCN layer.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape ``(batch_size, num_tokens)``
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence and the verb to compute the
            frame for, under 'words' and 'verb' keys, respectively.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
        mask = get_text_field_mask(tokens)
        embedded_verb_indicator = self.binary_feature_embedding(verb_indicator.long())
        # Concatenate the verb feature onto the embedded text. This now
        # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
        embedded_text_with_verb_indicator = torch.cat([embedded_text_input, embedded_verb_indicator], -1)
        batch_size, sequence_length, _ = embedded_text_with_verb_indicator.size()

        encoded_text = self.encoder(embedded_text_with_verb_indicator, mask)

        # Run the GCN over each sentence's encoded tokens and its adjacency matrix,
        # then re-assemble the batch.  Collecting the per-sentence outputs in a list
        # and concatenating once avoids repeatedly concatenating onto an empty CPU
        # tensor (which also breaks when the model runs on a GPU).
        gcn_outputs = []
        for i, single_adj in enumerate(adj):
            gcn_output = self.gcn_layer(encoded_text[i], single_adj)
            gcn_outputs.append(gcn_output.unsqueeze(0))
        logits = torch.cat(gcn_outputs, dim=0)

        #######################################
        # print(adj)
        # print(adj['elmo'])
        # for j, i in enumerate(adj['tokens']):
        #     # numpy can map, but tensor can not map
        #     idx = tokens['tokens'][j].cpu().numpy()
        #     # idx = np.array(idx, dtype=np.int32)
        #     id_map = {a: b for b, a in enumerate(idx)}
        #     _i = i.cpu().numpy()
        #     _i = _i.ravel()[np.flatnonzero(_i.flatten())]
        #     adj_map = np.array(list(map(id_map.get, _i))).reshape(-1, 2)
        #     # delete_list = []
        #     # for k in range(adj_map.shape[0]):
        #     #     if None in adj_map[k]:
        #     #         delete_list.append(k)
        #     # adj_map = np.delete(adj_map, delete_list, 0)
        #     # # because the difference between dependency parsing word and sentence word like U. M's
        #     # # maybe the word in dependency parsing is different with the sentence vocabulary,
        #     # # so there will be None in adj_map
        #     # # be careful with the copy, do not use =
        #     # tmp = adj_map[:]
        #     # len_tmp = len(tmp)
        #     # for k in range(len_tmp):
        #     #     if None not in adj_map:
        #     #         break
        #     #     if tmp[k] == None:
        #     #         if tmp[k-1] == None:
        #     #             continue
        #     #         if k%2:
        #     #             adj_map.remove(tmp[k])
        #     #             adj_map.remove(tmp[k-1])
        #     #         else:
        #     #             adj_map.remove(tmp[k])
        #     #             adj_map.remove(tmp[k+1])
        #
        #     edges = np.array(adj_map, dtype=np.int32).reshape(-1, 2)
        #     single_adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
        #                                shape=(tags.shape[1], tags.shape[1]), dtype=np.float32)
        #
        #     single_adj = single_adj + single_adj.T.multiply(single_adj.T > single_adj) - single_adj.multiply(
        #         single_adj.T > single_adj)
        #     single_adj = self.normalize(single_adj + sp.eye(single_adj.shape[0]))
        #     single_adj = self.sparse_mx_to_torch_sparse_tensor(single_adj)
        #
        #     gcn_output = self.gcn_layer(encoded_text[j], single_adj)
        #
        #     gcn_output = gcn_output.expand(1, -1, -1)
        #     logits = torch.cat((logits, gcn_output))
        #########################################

        # logits = self.tag_projection_layer(encoded_text)
        logits = self.decoder(logits, mask)
        logits = self.tag_projection_layer(logits)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
                                                                          sequence_length,
                                                                          self.num_classes])
        output_dict = {"logits": logits, "class_probabilities": class_probabilities}
        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.decode.
        output_dict["mask"] = mask

        if tags is not None:
            loss = sequence_cross_entropy_with_logits(logits,
                                                      tags,
                                                      mask,
                                                      label_smoothing=self._label_smoothing)
            if not self.ignore_span_metric and self.span_metric is not None and not self.training:
                batch_verb_indices = [example_metadata["verb_index"] for example_metadata in metadata]
                batch_sentences = [example_metadata["words"] for example_metadata in metadata]
                # Get the BIO tags from decode()
                # TODO (nfliu): This is kind of a hack, consider splitting out part
                # of decode() to a separate function.
                batch_bio_predicted_tags = self.decode(output_dict).pop("tags")
                batch_conll_predicted_tags = [convert_bio_tags_to_conll_format(tags) for
                                              tags in batch_bio_predicted_tags]
                batch_bio_gold_tags = [example_metadata["gold_tags"] for example_metadata in metadata]
                batch_conll_gold_tags = [convert_bio_tags_to_conll_format(tags) for
                                         tags in batch_bio_gold_tags]
                self.span_metric(batch_verb_indices,
                                 batch_sentences,
                                 batch_conll_predicted_tags,
                                 batch_conll_gold_tags)
            output_dict["loss"] = loss

        if metadata is not None:
            words, verbs = zip(*[(x["words"], x["verb"]) for x in metadata])
            output_dict["words"] = list(words)
            output_dict["verb"] = list(verbs)
        return output_dict

    @overrides
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Does constrained viterbi decoding on class probabilities output in :func:`forward`.  The
        constraint simply specifies that the output tags must be a valid BIO sequence.  We add a
        ``"tags"`` key to the dictionary with the result.
        """
        all_predictions = output_dict['class_probabilities']
        sequence_lengths = get_lengths_from_binary_sequence_mask(output_dict["mask"]).data.tolist()

        if all_predictions.dim() == 3:
            predictions_list = [all_predictions[i].detach().cpu()
                                for i in range(all_predictions.size(0))]
        else:
            predictions_list = [all_predictions]
        all_tags = []
        transition_matrix = self.get_viterbi_pairwise_potentials()
        start_transitions = self.get_start_transitions()
        for predictions, length in zip(predictions_list, sequence_lengths):
            max_likelihood_sequence, _ = viterbi_decode(predictions[:length],
                                                        transition_matrix,
                                                        allowed_start_transitions=start_transitions)
            tags = [self.vocab.get_token_from_index(x, namespace="labels")
                    for x in max_likelihood_sequence]
            all_tags.append(tags)
        output_dict['tags'] = all_tags
        return output_dict

    def get_metrics(self, reset: bool = False):
        if self.ignore_span_metric:
            # Return an empty dictionary if ignoring the
            # span metric
            return {}
        else:
            metric_dict = self.span_metric.get_metric(reset=reset)
            # This can be a lot of metrics, as there are 3 per class.
            # we only really care about the overall metrics, so we filter for them here.
            return {x: y for x, y in metric_dict.items() if "overall" in x}

    def get_viterbi_pairwise_potentials(self):
        """
        Generate a matrix of pairwise transition potentials for the BIO labels.
        The only constraint implemented here is that I-XXX labels must be preceded
        by either an identical I-XXX tag or a B-XXX tag. In order to achieve this
        constraint, pairs of labels which do not satisfy this constraint have a
        pairwise potential of -inf.

        Returns
        -------
        transition_matrix : torch.Tensor
            A (num_labels, num_labels) matrix of pairwise potentials.
        """
        all_labels = self.vocab.get_index_to_token_vocabulary("labels")
        num_labels = len(all_labels)
        transition_matrix = torch.zeros([num_labels, num_labels])

        for i, previous_label in all_labels.items():
            for j, label in all_labels.items():
                # I labels can only be preceded by themselves or
                # their corresponding B tag.
                if i != j and label[0] == 'I' and not previous_label == 'B' + label[1:]:
                    transition_matrix[i, j] = float("-inf")
        return transition_matrix

    def get_start_transitions(self):
        """
        In the BIO sequence, we cannot start the sequence with an I-XXX tag.
        This transition sequence is passed to viterbi_decode to specify this constraint.

        Returns
        -------
        start_transitions : torch.Tensor
            The pairwise potentials between a START token and the first token of the sequence.
""" all_labels = self.vocab.get_index_to_token_vocabulary("labels") num_labels = len(all_labels) start_transitions = torch.zeros(num_labels) for i, label in all_labels.items(): if label[0] == "I": start_transitions[i] = float("-inf") return start_transitions def normalize(self, mx): """Row-normalize sparse matrix""" rowsum = np.array(mx.sum(1)) r_inv = np.power(rowsum, -1).flatten() r_inv[np.isinf(r_inv)] = 0. r_mat_inv = sp.diags(r_inv) mx = r_mat_inv.dot(mx) return mx def sparse_mx_to_torch_sparse_tensor(self, sparse_mx): """Convert a scipy sparse matrix to a torch sparse tensor.""" sparse_mx = sparse_mx.tocoo().astype(np.float32) indices = torch.from_numpy( np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) values = torch.from_numpy(sparse_mx.data) shape = torch.Size(sparse_mx.shape) return torch.sparse.FloatTensor(indices, values, shape)