def __init__(self,
             support_dim: int,
             query_dim: int,
             candidates_dim: int,
             num_step: int = 1,
             reason_type: int = 0,
             reason_dropout_p: float = 0.2,
             dropout_p: float = 0.4) -> None:
    """
    Parameters
    ----------
    reason_type:
        0: random
        1: only last
        2: avg
    """
    super().__init__()
    assert num_step > 0
    assert reason_type < 3 and reason_type >= 0
    self.num_step = num_step
    self.reason_type = reason_type
    self.dropout_p = dropout_p
    self.reason_dropout_p = reason_dropout_p
    self.supports_predictor = BilinearAttention(query_dim, support_dim, normalize=False)
    self.candidates_predictor = BilinearAttention(support_dim, candidates_dim, normalize=False)
    self.rnn = nn.GRUCell(support_dim, query_dim)
    self.alpha = Parameter(torch.zeros(1, 1))
def get_attention(st_ds_conf, attn_type, *dims):
    emb_sz = st_ds_conf['emb_sz']  # dim for both the decoder output and the encoder output
    attn_type = attn_type.lower()
    if attn_type == "bilinear":
        if len(dims) < 2:
            dims = [emb_sz, emb_sz]
        attn = BilinearAttention(vector_dim=dims[0], matrix_dim=dims[1])
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "dot_product":
        if len(dims) >= 2:
            assert dims[0] == dims[1], "encoder hidden states must be able to multiply with decoder output"
        attn = DotProductAttention()
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "multihead":
        attn = GeneralMultiHeadAttention(num_heads=st_ds_conf['num_heads'],
                                         input_dim=emb_sz,
                                         total_attention_dim=emb_sz,
                                         total_value_dim=emb_sz,
                                         attend_to_dim=emb_sz,
                                         output_dim=emb_sz,
                                         attention_dropout=st_ds_conf['attention_dropout'],
                                         use_future_blinding=False)
        attn = SingleTokenMHAttentionWrapper(attn)
    elif attn_type == "none":
        attn = None
    else:
        assert False
    return attn
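# A minimal usage sketch for the factory above. The `st_ds_conf` dictionary and its
# values here are hypothetical; only the keys the function actually reads
# (`emb_sz`, `attention_dropout`, `num_heads`) come from the snippet, and the wrapper
# classes (AllenNLPAttentionWrapper etc.) belong to the surrounding project.
st_ds_conf = {
    "emb_sz": 256,             # shared encoder/decoder output dim (assumed value)
    "attention_dropout": 0.1,  # assumed value
    "num_heads": 8,            # assumed value
}

# Bilinear attention between a 256-dim decoder state and 300-dim encoder states.
enc_dec_attn = get_attention(st_ds_conf, "bilinear", 256, 300)

# "none" disables attention entirely and returns None.
no_attn = get_attention(st_ds_conf, "none")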
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.out = torch.nn.Linear(
        in_features=self.word_embeddings.get_output_dim() * 4,
        out_features=vocab.get_vocab_size('labels'))
    self.accuracy = CategoricalAccuracy()
    self.f_score_0 = F1Measure(positive_label=0)
    self.f_score_1 = F1Measure(positive_label=1)
    self.f_score_2 = F1Measure(positive_label=2)
    self.loss = CrossEntropyLoss()
    self.attention = BilinearAttention(word_embeddings.get_output_dim() * 3,
                                       word_embeddings.get_output_dim())
def get_attention(st_ds_conf, attn_type):
    emb_sz = st_ds_conf['emb_sz']  # dim for both the decoder output and the encoder output
    attn_type = attn_type.lower()
    if attn_type == "bilinear":
        attn = BilinearAttention(vector_dim=emb_sz, matrix_dim=emb_sz)
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "dot_product":
        attn = DotProductAttention()
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "multihead":
        attn = GeneralMultiHeadAttention(num_heads=st_ds_conf['num_heads'],
                                         input_dim=emb_sz,
                                         total_attention_dim=emb_sz,
                                         total_value_dim=emb_sz,
                                         attend_to_dim=emb_sz,
                                         output_dim=emb_sz,
                                         attention_dropout=st_ds_conf['attention_dropout'],
                                         use_future_blinding=False)
        attn = SingleTokenMHAttentionWrapper(attn)
    elif attn_type == "none":
        attn = None
    else:
        assert False
    return attn
def __init__(self,
             input_size: int,
             hidden_size: int,
             tag_embedding_size: int,
             num_layers: int,
             bidirectional: bool,
             tag_vocab_size: int,
             output_dropout: float) -> None:
    super().__init__()
    self.hidden_size = hidden_size
    self.encoder = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           batch_first=True)
    bidir_mul = 2 if bidirectional else 1
    self.attention = BilinearAttention(vector_dim=hidden_size * bidir_mul,
                                       matrix_dim=hidden_size * bidir_mul)
    self.tag_embed = nn.Embedding(num_embeddings=tag_vocab_size,
                                  embedding_dim=tag_embedding_size)
    self.decoder = nn.LSTM(input_size=2 * bidir_mul * hidden_size + tag_embedding_size,
                           hidden_size=bidir_mul * hidden_size,
                           num_layers=num_layers,
                           bidirectional=False,
                           batch_first=True)
    self.output2tag = torch.nn.Linear(2 * bidir_mul * hidden_size, tag_vocab_size)
    self.output_dropout = torch.nn.Dropout(p=output_dropout)
def test_forward_does_a_bilinear_product(self):
    params = Params({"vector_dim": 2, "matrix_dim": 2, "normalize": False})
    bilinear = BilinearAttention.from_params(params)
    bilinear._weight_matrix = Parameter(torch.FloatTensor([[-0.3, 0.5], [2.0, -1.0]]))
    bilinear._bias = Parameter(torch.FloatTensor([0.1]))
    a_vectors = torch.FloatTensor([[1, 1]])
    b_vectors = torch.FloatTensor([[[1, 0], [0, 1]]])
    result = bilinear(a_vectors, b_vectors).detach().numpy()
    assert result.shape == (1, 2)
    assert_almost_equal(result, [[1.8, -0.4]])
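# The expected values in the test above follow directly from the unnormalized bilinear
# score x^T W y_j + b. A minimal NumPy sketch of the same arithmetic, using only the
# tensors fixed in the test (independent of AllenNLP):
import numpy as np

x = np.array([1.0, 1.0])                   # the query vector (a_vectors)
W = np.array([[-0.3, 0.5], [2.0, -1.0]])   # the weight matrix set in the test
b = 0.1                                    # the bias set in the test
Y = np.array([[1.0, 0.0], [0.0, 1.0]])     # the two attended vectors (b_vectors)

scores = x @ W @ Y.T + b                   # x^T W y_j + b for each row y_j
print(scores)                              # -> [ 1.8 -0.4], matching the assertion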
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             vocab: Vocabulary) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.text_seq_encoder = PytorchSeq2VecWrapper(
        LSTM(word_embeddings.get_output_dim(),
             int(word_embeddings.get_output_dim() / 2),
             batch_first=True,
             bidirectional=True))
    self.out = torch.nn.Linear(
        in_features=self.word_embeddings.get_output_dim() * 4,
        out_features=vocab.get_vocab_size('labels'))
    self.accuracy = CategoricalAccuracy()
    self.f_score_0 = F1Measure(positive_label=0)
    self.f_score_1 = F1Measure(positive_label=1)
    self.f_score_2 = F1Measure(positive_label=2)
    self.loss = CrossEntropyLoss()
    self.attention = BilinearAttention(word_embeddings.get_output_dim() * 3,
                                       word_embeddings.get_output_dim())
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             dropout: float = 0.0,
             input_dropout: float = 0.0,
             label_smoothing: float = 0.1,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(SentimentClassifier, self).__init__(vocab, regularizer)

    self._text_field_embedder = text_field_embedder
    share_rnn = nn.LSTM(input_size=self._text_field_embedder.get_output_dim(),
                        hidden_size=150,
                        batch_first=True,
                        # dropout=dropout,
                        bidirectional=True)
    share_encoder = PytorchSeq2SeqWrapper(share_rnn)

    self._encoder = RNNEncoder(vocab, share_encoder, input_dropout, regularizer)
    self._seq_vec = CnnEncoder(self._encoder.get_output_dim(), 25)
    self._de_dim = len(TASKS_NAME)
    weight = torch.empty(self._de_dim, self._text_field_embedder.get_output_dim())
    torch.nn.init.orthogonal_(weight)
    self._domain_embeddings = Embedding(self._de_dim,
                                        self._text_field_embedder.get_output_dim(),
                                        weight=weight)
    self._de_attention = BilinearAttention(self._seq_vec.get_output_dim(),
                                           self._domain_embeddings.get_output_dim())
    self._de_feedforward = FeedForward(self._domain_embeddings.get_output_dim(), 1,
                                       self._seq_vec.get_output_dim(),
                                       Activation.by_name("elu")())

    self._num_classes = self.vocab.get_vocab_size("label")
    self._sentiment_discriminator = Discriminator(self._seq_vec.get_output_dim(),
                                                  self._num_classes)
    self._s_domain_discriminator = Discriminator(self._seq_vec.get_output_dim(),
                                                 len(TASKS_NAME))
    self._valid_discriminator = Discriminator(self._domain_embeddings.get_output_dim(), 2)
    self._dropout = InputVariationalDropout(dropout)
    self._input_dropout = Dropout(input_dropout)
    self._label_smoothing = label_smoothing

    self.metrics = {
        "s_domain_acc": CategoricalAccuracy(),
        "valid_acc": CategoricalAccuracy()
    }
    for task_name in TASKS_NAME:
        self.metrics["{}_stm_acc".format(task_name)] = CategoricalAccuracy()

    self._loss = torch.nn.CrossEntropyLoss()
    self._domain_loss = torch.nn.CrossEntropyLoss()
    # TODO torch.nn.BCELoss
    self._valid_loss = torch.nn.BCEWithLogitsLoss()

    initializer(self)
def test_forward_does_a_bilinear_product(self):
    params = Params({
        'vector_dim': 2,
        'matrix_dim': 2,
        'normalize': False,
    })
    bilinear = BilinearAttention.from_params(params)
    bilinear._weight_matrix = Parameter(torch.FloatTensor([[-.3, .5], [2.0, -1.0]]))
    bilinear._bias = Parameter(torch.FloatTensor([.1]))
    a_vectors = torch.FloatTensor([[1, 1]])
    b_vectors = torch.FloatTensor([[[1, 0], [0, 1]]])
    result = bilinear(a_vectors, b_vectors).detach().numpy()
    assert result.shape == (1, 2)
    assert_almost_equal(result, [[1.8, -.4]])
def build_parsing_recombination_seq2seq_copy_model(
        flags,
        data_reader,
        vocab: Vocabulary,
        source_namespace: str = 'source_tokens',
        target_namespace: str = 'target_tokens') -> Model:
    source_embedding = Embedding(vocab.get_vocab_size(namespace=source_namespace),
                                 embedding_dim=flags.source_embedding_dim)
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(flags.source_embedding_dim,
                      flags.encoder_hidden_dim,
                      batch_first=True,
                      bidirectional=flags.encoder_bidirectional))
    attention = BilinearAttention(flags.attention_hidden_dim,
                                  flags.attention_hidden_dim,
                                  normalize=False)
    source_embedder = BasicTextFieldEmbedder({'tokens': source_embedding})
    initializer = InitializerApplicator.from_params([
        (".*bias", Params({"type": "constant", "val": 0})),
        ('.*', Params({"type": "uniform", "a": -0.1, "b": 0.1}))
    ])
    metric = SequenceAccuracy()
    model = RecombinationSeq2SeqWithCopy(
        vocab,
        source_embedder,
        lstm,
        flags.max_decode_length,
        seq_metrics=metric,
        source_namespace=source_namespace,
        target_namespace=target_namespace,
        target_embedding_dim=flags.target_embedding_dim,
        attention=attention,
        beam_size=flags.beam_size,
        use_bleu=False,
        encoder_input_dropout=flags.encoder_input_dropout,
        encoder_output_dropout=flags.encoder_output_dropout,
        dropout=flags.dropout,
        feed_output_attention_to_decoder=True,
        keep_decoder_output_dim_same_as_encoder=True,
        initializer=initializer)
    return model
def __init__(self, embedding_dim):
    self.embedding_dim = embedding_dim
    encoder = MultilayerCnnEncoder(embedding_dim=self.embedding_dim,
                                   num_filters=self.embedding_dim * 2,
                                   layers=2,
                                   conv_layer_activation=Activation.by_name('tanh')(),
                                   ngram_filter_sizes=(3,),
                                   output_dim=self.embedding_dim,
                                   pooling='avg')
    attention = BilinearAttention(vector_dim=self.embedding_dim * 2,
                                  matrix_dim=self.embedding_dim,
                                  normalize=True)
    super(BiLinearSelectionGenerator, self).__init__(
        encoder=encoder,
        attention=attention,
    )
def __init__(self,
             input_size: int,
             hidden_size: int,
             num_layers: int,
             bidirectional: bool,
             vocab_size: int,
             output_dropout: float) -> None:
    super().__init__()
    self.encoder = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bidirectional=bidirectional,
                           batch_first=True)
    bidir_mul = 2 if bidirectional else 1
    self.attention = BilinearAttention(vector_dim=hidden_size * bidir_mul,
                                       matrix_dim=hidden_size * bidir_mul)
    self.output2label = torch.nn.Linear(2 * bidir_mul * hidden_size, vocab_size)
    self.output_dropout = torch.nn.Dropout(p=output_dropout)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             seq2seq_encoder: Seq2SeqEncoder,
             initializer: InitializerApplicator) -> None:
    super(ProLocalModel, self).__init__(vocab)

    self.text_field_embedder = text_field_embedder
    self.seq2seq_encoder = seq2seq_encoder

    self.attention_layer = BilinearAttention(2 * seq2seq_encoder.get_output_dim(),
                                             seq2seq_encoder.get_output_dim(),
                                             normalize=True)

    self.num_types = self.vocab.get_vocab_size("state_change_type_labels")
    self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(), self.num_types)

    # self.span_metric = SpanBasedF1Measure(vocab,
    #                                       tag_namespace="state_change_tags")
    # by default "O" is ignored in metric computation
    # self.num_tags = self.vocab.get_vocab_size("state_change_tags")
    # self.tag_projection_layer = TimeDistributed(
    #     Linear(self.seq2seq_encoder.get_output_dim() + 2, self.num_tags))

    self._type_accuracy = CategoricalAccuracy()
    self.type_f1_metrics = {}
    self.type_labels_vocab = self.vocab.get_index_to_token_vocabulary("state_change_type_labels")
    for type_label in self.type_labels_vocab.values():
        self.type_f1_metrics["type_" + type_label] = F1Measure(
            self.vocab.get_token_index(type_label, "state_change_type_labels"))

    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)
encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True))

train_iterator = BucketIterator(batch_size=batch_size,
                                sorting_keys=[("source_tokens", "num_tokens")],
                                instances_per_epoch=INSTANCES_PER_EPOCH)
validation_iterator = BucketIterator(batch_size=batch_size,
                                     sorting_keys=[("source_tokens", "num_tokens")])
train_iterator.index_with(vocab)
validation_iterator.index_with(vocab)

model = SimpleSeq2Seq(vocab,
                      source_embedder,
                      encoder,
                      max_decoding_steps=max_decoding_steps,
                      target_embedding_dim=embedding_dim,
                      target_namespace='target_tokens',
                      attention=BilinearAttention(hidden_dim * 2, hidden_dim * 2),
                      beam_size=beam_size)


def train():
    model.cuda(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=validation_iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=num_epochs,
                      serialization_dir=serialization_dir,
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             lexical_feedforward: FeedForward,
             contextual_encoder: Seq2SeqEncoder,
             attention_feedforward: FeedForward,
             matrix_attention: MatrixAttention,
             memory_encoder: Seq2SeqEncoder,
             output_feedforward: FeedForward,
             output_logit: FeedForward,
             answer_steps: int = 5,
             dropout: float = 0.5,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self._text_field_embedder = text_field_embedder
    self._lexical_feedforward = TimeDistributed(lexical_feedforward)
    self._contextual_encoder = contextual_encoder
    self._attention_feedforward = TimeDistributed(attention_feedforward)
    self._matrix_attention = matrix_attention
    self._memory_encoder = memory_encoder
    self._output_feedforward = output_feedforward
    self._output_logit = output_logit
    self._answer_steps = answer_steps

    self._answer_gru_cell = torch.nn.GRUCell(
        self._memory_encoder.get_output_dim(),
        self._memory_encoder.get_output_dim(),
    )
    self._answer_attention = TimeDistributed(
        torch.nn.Linear(self._memory_encoder.get_output_dim(), 1))
    self._answer_bilinear = BilinearAttention(
        self._memory_encoder.get_output_dim(),
        self._memory_encoder.get_output_dim(),
    )

    check_dimensions_match(text_field_embedder.get_output_dim(),
                           lexical_feedforward.get_input_dim(),
                           "text field embedding dim", "lexical feedforward input dim")
    check_dimensions_match(lexical_feedforward.get_output_dim(),
                           contextual_encoder.get_input_dim(),
                           "lexical feedforward output dim", "contextual layer input dim")
    check_dimensions_match(contextual_encoder.get_output_dim(),
                           attention_feedforward.get_input_dim(),
                           "contextual layer output dim", "attention feedforward input dim")
    check_dimensions_match(contextual_encoder.get_output_dim() * 2,
                           memory_encoder.get_input_dim(),
                           "contextual layer output dim", "memory encoder input dim")
    check_dimensions_match(memory_encoder.get_output_dim() * 4,
                           output_feedforward.get_input_dim(),
                           "memory encoder output dim", "output feedforward input")
    check_dimensions_match(output_feedforward.get_output_dim(),
                           output_logit.get_input_dim(),
                           "output feedforward output dim", "output logit input")

    self._dropout = torch.nn.Dropout(dropout) if dropout else None
    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.NLLLoss()
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             context_field_embedder: TextFieldEmbedder,
             context_encoder: Seq2SeqEncoder,
             target_encoder: Seq2SeqEncoder,
             feedforward: Optional[FeedForward] = None,
             context_attention_activation_function: str = 'tanh',
             target_attention_activation_function: str = 'tanh',
             target_field_embedder: Optional[TextFieldEmbedder] = None,
             inter_target_encoding: Optional[InterTarget] = None,
             target_position_weight: Optional[TargetPositionWeight] = None,
             target_position_embedding: Optional[TextFieldEmbedder] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             dropout: float = 0.0,
             label_name: str = 'target-sentiment-labels',
             loss_weights: Optional[List[float]] = None,
             use_target_sequences: bool = False) -> None:
    super().__init__(vocab, regularizer)
    '''
    :param vocab: A Vocabulary, required in order to compute sizes for
                  input/output projections.
    :param context_field_embedder: Used to embed the context/sentence and target text if
                                   target_field_embedder is None but the target_encoder is
                                   NOT None.
    :param context_encoder: Encoder that will create the representation for the
                            sentence/context that the target appears in.
    :param target_encoder: Encoder that will create the representation of target text tokens.
    :param feedforward: An optional feed forward layer to apply after the encoder.
    :param context_attention_activation_function: The attention method to be used on the
                                                  context.
    :param target_attention_activation_function: The attention method to be used on the
                                                 target text.
    :param target_field_embedder: Used to embed the target text to give as input to the
                                  target_encoder. Thus this allows a separate embedding for
                                  context and target text.
    :param inter_target_encoding: Whether to model the relationship between targets/aspects.
    :param target_position_weight: Whether to weight the output of the context encoding based
                                   on the position of the tokens to the target tokens. This
                                   weighting is applied before any attention is applied.
    :param target_position_embedding: Whether or not to concatenate a position embedding on to
                                      the input embeddings before being an input to the
                                      `context_encoder`.
    :param initializer: Used to initialize the model parameters.
    :param regularizer: If provided, will be used to calculate the regularization penalty
                        during training.
    :param dropout: To apply dropout after each layer apart from the last layer. All dropout
                    that is applied to time-based data will be
                    `variational dropout <https://arxiv.org/abs/1512.05287>`_, all else will
                    be standard dropout. Variational dropout is applied to the target vectors
                    after they have been processed by the `inter_target_encoding` if this is
                    set.
    :param label_name: Name of the label namespace.
    :param loss_weights: The amount of weight to give the negative, neutral, positive classes
                         respectively. e.g. [0.2, 0.5, 0.3] would weight the negative class by
                         a factor of 0.2, neutral by 0.5 and positive by 0.3. NOTE It assumes
                         the sentiment labels are the following: [negative, neutral, positive].
    :param use_target_sequences: Whether or not to use target tokens within the context as the
                                 target's contextualized word representation (CWR). This would
                                 only make sense to use if the word representation i.e. field
                                 embedder is a contextualized embedder e.g. ELMo etc. This also
                                 requires that the dataset reader has the following argument
                                 set to True `target_sequences`. ANOTHER reason why you would
                                 want to use this even when not using CWR is that you want to
                                 get contextualised POS/Dep tags etc.

    This is based on the `Interactive Attention Networks for Aspect-Level Sentiment
    Classification <https://www.ijcai.org/proceedings/2017/0568.pdf>`_. The model is also
    known as `IAN`.

    .. _variational dropout:
       https://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf
    '''
    self.label_name = label_name
    self.context_field_embedder = context_field_embedder
    self.target_field_embedder = target_field_embedder
    self.num_classes = self.vocab.get_vocab_size(self.label_name)
    self.target_encoder = target_encoder
    self.context_encoder = context_encoder
    self.feedforward = feedforward
    self._use_target_sequences = use_target_sequences
    if self._use_target_sequences and self.target_field_embedder:
        raise ConfigurationError('`use_target_sequences` cannot be True at'
                                 ' the same time as a value for '
                                 '`target_field_embedder` as the embeddings'
                                 ' come from the context and not a separate embedder')

    context_attention_activation_function = Activation.by_name(
        f'{context_attention_activation_function}')()
    target_attention_activation_function = Activation.by_name(
        f'{target_attention_activation_function}')()

    target_encoder_out = self.target_encoder.get_output_dim()
    context_encoder_out = self.context_encoder.get_output_dim()
    self.context_attention_layer = BilinearAttention(target_encoder_out,
                                                     context_encoder_out,
                                                     context_attention_activation_function,
                                                     normalize=True)
    self.target_attention_layer = BilinearAttention(context_encoder_out,
                                                    target_encoder_out,
                                                    target_attention_activation_function,
                                                    normalize=True)
    # To be used as the pooled input into the target attention layer as
    # the query vector.
    self._context_averager = BagOfEmbeddingsEncoder(context_encoder_out, averaged=True)
    # To be used as the pooled input into the context attention layer as
    # the query vector.
    self._target_averager = BagOfEmbeddingsEncoder(target_encoder_out, averaged=True)

    # Set the loss weights (have to sort them by order of label index in
    # the vocab)
    self.loss_weights = target_sentiment.util.loss_weight_order(self, loss_weights,
                                                                self.label_name)

    # Inter target modelling
    self.inter_target_encoding = inter_target_encoding

    if feedforward is not None:
        output_dim = self.feedforward.get_output_dim()
    elif self.inter_target_encoding is not None:
        output_dim = self.inter_target_encoding.get_output_dim()
    else:
        output_dim = target_encoder_out + context_encoder_out
    self.label_projection = Linear(output_dim, self.num_classes)

    self.metrics = {"accuracy": CategoricalAccuracy()}
    self.f1_metrics = {}
    # F1 Scores
    label_index_name = self.vocab.get_index_to_token_vocabulary(self.label_name)
    for label_index, _label_name in label_index_name.items():
        _label_name = f'F1_{_label_name.capitalize()}'
        self.f1_metrics[_label_name] = F1Measure(label_index)
    # Dropout
    self._variational_dropout = InputVariationalDropout(dropout)
    self._naive_dropout = Dropout(dropout)

    # Position embeddings
    self.target_position_embedding = target_position_embedding
    # Ensure that the dimensions of the text field embedder and text encoder
    # match
    if self.target_position_embedding:
        context_and_position_dim = (context_field_embedder.get_output_dim() +
                                    self.target_position_embedding.get_output_dim())
        check_dimensions_match(context_and_position_dim,
                               context_encoder.get_input_dim(),
                               "context field embedding dim and the position embeddings",
                               "text encoder input dim")
    else:
        check_dimensions_match(context_field_embedder.get_output_dim(),
                               context_encoder.get_input_dim(),
                               "context field embedding dim", "text encoder input dim")
    # Ensure that the dimensions of the target or text field embedder and
    # the target encoder match
    target_field_embedder_dim = context_field_embedder.get_output_dim()
    target_field_error = "context field embedding dim"
    if self.target_field_embedder:
        target_field_embedder_dim = target_field_embedder.get_output_dim()
        target_field_error = "target field embedding dim"
    check_dimensions_match(target_field_embedder_dim,
                           target_encoder.get_input_dim(),
                           target_field_error, "target encoder input dim")

    if self.inter_target_encoding:
        check_dimensions_match(target_encoder_out + context_encoder_out,
                               self.inter_target_encoding.get_input_dim(),
                               'Output from target and context encoders',
                               'Inter Target encoder input dim')

    self.target_position_weight = target_position_weight
    # TimeDistributed anything that is related to the targets.
    if self.feedforward is not None:
        self.feedforward = TimeDistributed(self.feedforward)
    self.label_projection = TimeDistributed(self.label_projection)
    self._time_naive_dropout = TimeDistributed(self._naive_dropout)

    initializer(self)
def common_init(
        self,
        encoder_output_dim: int,
        decoder: DecoderNet,
        decoder_type: str,
        decoder_num_layers: int,
        share_decoder_params: bool,
        start_token: str = "[CLS]",
        end_token: str = "[SEP]",
        index_name: str = "bert",
        beam_size: int = 4,
        min_dec_len: int = 4,
        max_dec_len: int = 30,
        coverage_factor: float = 0.0,
        device: Union[int, str, List[int]] = -1,
        metrics: Optional[List[Metric]] = None,
        valid_metric_keys: List[str] = None,
        seed: int = 42,
        initializer: InitializerApplicator = InitializerApplicator()):
    """Initialization steps shared by several different models."""
    seed_everything(seed)  # initialize the random seed

    # ----------- metric-related initialization -------------
    # define the metrics
    self._metrics = [TokenBasedBLEU(), TokenBasedROUGE()]
    if metrics is not None:
        self._metrics = metrics
    self._rewrite_em = RewriteEM()
    self._restore_score = RestorationScore(compute_restore_tokens=True)
    self._cov_loss_value = Average()
    self.valid_metric_keys = valid_metric_keys

    # ----------- parameter-related initialization -------------
    # define the special tokens and other parameters
    self._start_token = start_token
    self._end_token = end_token
    self._index_name = index_name
    # Even when using a BERT model the vocabulary still has to be read in advance,
    # so the corresponding vocabulary namespace needs to be adjusted.
    # This is very important: with the wrong namespace an assert_trigger_error
    # is easily raised.
    if "bert" in self._index_name:
        self._vocab_namespace = "tokens"
    else:
        self._vocab_namespace = self._index_name
    self.coverage_factor = coverage_factor
    self.decoder_num_layers = decoder_num_layers
    decoder_type = decoder_type.lower()
    # save some important parameters
    self.params = Params(params={
        "beam_size": beam_size,
        "min_dec_len": min_dec_len,
        "max_dec_len": max_dec_len,
        "decoder_type": decoder_type
    })

    # ----------- device-related initialization -------------
    device = parse_cuda_device(device)
    check_for_gpu(device)  # check that the GPU setting is within the available range
    if isinstance(device, list):
        device = device[0]
    if device < 0:
        self._device = torch.device("cpu")
    else:
        self._device = torch.device(f"cuda:{device}")

    # ----------- decoder-related initialization -------------
    # define the decoder
    self.decoder = decoder
    self._share_decoder_params = share_decoder_params
    # If the decoder is an LSTM we need to decide whether the coverage mechanism is used.
    # Using coverage with the Transformer is cumbersome, so there we directly use the
    # attention distribution computed internally.
    if self.params['decoder_type'] == 'lstm':
        # for the LSTM decoder
        if self.coverage_factor > 0.0:
            # Define the attention layer that, at every decoder step, attends over the
            # encoder outputs, as well as the layer that computes the weights of the
            # attention distributions over the current turn and the history turns.
            self.attention = BilinearAttention(
                vector_dim=encoder_output_dim,
                matrix_dim=encoder_output_dim + 1,
                activation=Activation.by_name('linear')())
            self.lamb_linear = torch.nn.Linear(encoder_output_dim * 3 + 2, 2)
        else:
            self.attention = BilinearAttention(
                vector_dim=encoder_output_dim,
                matrix_dim=encoder_output_dim,
                activation=Activation.by_name('linear')())
            self.lamb_linear = torch.nn.Linear(encoder_output_dim * 3, 2)
    else:
        # for the Transformer decoder
        self.lamb_linear = torch.nn.Linear(encoder_output_dim * 3, 2)

    # ----------- vocabulary-related initialization -------------
    self._vocab_size = self.vocab.get_vocab_size(namespace=self._vocab_namespace)
    self._unk_id = self.vocab.get_token_index(self.vocab._oov_token,
                                              namespace=self._vocab_namespace)

    # ----------- initialize model parameters -------------
    self._initializer = initializer
    self._initializer(self.lamb_linear)
    self._initializer(self.decoder)
embedding_dim2 = 16
sequence_length = 10

# Attention

# dot product attention only allows vector/matrix of the same size
vector = torch.rand((batch_size, embedding_dim1,))
matrix = torch.rand((batch_size, sequence_length, embedding_dim1))
attention = DotProductAttention()
output = attention(vector, matrix)
print('Output from DotProductAttention:', output.size(), output)

# bilinear & linear attention allow inputs of different sizes
vector = torch.rand((batch_size, embedding_dim1,))
matrix = torch.rand((batch_size, sequence_length, embedding_dim2))
attention = BilinearAttention(vector_dim=embedding_dim1, matrix_dim=embedding_dim2)
output = attention(vector, matrix)
print('Output from BilinearAttention:', output.size(), output)

tanh = Activation.by_name('tanh')()
attention = LinearAttention(tensor_1_dim=embedding_dim1,
                            tensor_2_dim=embedding_dim2,
                            combination='x,y',
                            activation=tanh)
output = attention(vector, matrix)
print('Output from LinearAttention:', output)

# MatrixAttention
sequence_length1 = 10
sequence_length2 = 15

# dot product attention only allows matrices of the same size
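# The excerpt above cuts off just as the MatrixAttention demo begins. The following is a
# minimal sketch (not the original script's continuation) of how the matrix case could look,
# assuming the same `batch_size` and `embedding_dim1` variables defined earlier in that
# script and using AllenNLP's matrix-attention modules.
from allennlp.modules.matrix_attention import (BilinearMatrixAttention,
                                               DotProductMatrixAttention)

# dot product matrix attention requires both matrices to share the embedding dim
matrix1 = torch.rand((batch_size, sequence_length1, embedding_dim1))
matrix2 = torch.rand((batch_size, sequence_length2, embedding_dim1))
matrix_attention = DotProductMatrixAttention()
print('Output from DotProductMatrixAttention:',
      matrix_attention(matrix1, matrix2).size())  # (batch, seq_len1, seq_len2)

# bilinear matrix attention allows a different embedding dim per matrix
matrix2 = torch.rand((batch_size, sequence_length2, embedding_dim2))
matrix_attention = BilinearMatrixAttention(matrix_1_dim=embedding_dim1,
                                           matrix_2_dim=embedding_dim2)
print('Output from BilinearMatrixAttention:',
      matrix_attention(matrix1, matrix2).size())  # (batch, seq_len1, seq_len2)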
def __init__(
        self,
        # Vocabulary.
        vocab: Vocabulary,

        # Embeddings.
        source_field_embedder: TextFieldEmbedder,
        target_embedding_size: int,

        # Encoders and Decoders.
        encoder: Seq2SeqEncoder,
        decoder_type: str,
        output_projection_layer: FeedForward,
        source_namespace: str = "source",
        target_namespace: str = "target",

        # Hyperparameters and flags.
        decoder_attention_function: BilinearAttention = None,
        decoder_is_bidirectional: bool = False,
        decoder_num_layers: int = 1,
        apply_attention: Optional[bool] = False,
        max_decoding_steps: int = 100,
        scheduled_sampling_ratio: float = 0.4,

        # Logistical.
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)
    if encoder.get_input_dim() != source_field_embedder.get_output_dim():
        raise ConfigurationError(
            "The input dimension of the encoder must match the embedding "
            "size of the source_field_embedder. Found {} and {}, respectively."
            .format(encoder.get_input_dim(), source_field_embedder.get_output_dim()))
    if output_projection_layer.get_output_dim() != vocab.get_vocab_size(target_namespace):
        raise ConfigurationError(
            "The output dimension of the output_projection_layer must match the "
            "size of the French vocabulary. Found {} and {}, "
            "respectively.".format(output_projection_layer.get_output_dim(),
                                   vocab.get_vocab_size(target_namespace)))
    if decoder_type not in SequenceToSequence.DECODERS:
        raise ConfigurationError("Unrecognized decoder option '{}'".format(decoder_type))

    # For dealing with input.
    self.source_vocab_size = vocab.get_vocab_size(source_namespace)
    self.target_vocab_size = vocab.get_vocab_size(target_namespace)
    self.source_field_embedder = source_field_embedder or TextFieldEmbedder()
    self.encoder = encoder

    # For dealing with / producing output.
    self.target_vocab_size = vocab.get_vocab_size(target_namespace)
    self.target_embedder = Embedding(self.target_vocab_size, target_embedding_size)

    # Input size will either be the target embedding size or the target embedding size plus the
    # encoder hidden size to attend on the input.
    #
    # When making a custom attention function that uses neither of those input sizes, you will
    # have to define the decoder yourself.
    decoder_input_size = target_embedding_size
    if apply_attention:
        decoder_input_size += encoder.get_output_dim()

    # Hidden size of the encoder and decoder should match.
    decoder_hidden_size = encoder.get_output_dim()
    self.decoder = SequenceToSequence.DECODERS[decoder_type](
        decoder_input_size,
        decoder_hidden_size,
        num_layers=decoder_num_layers,
        batch_first=True,
        bias=True,
        bidirectional=decoder_is_bidirectional)
    self.output_projection_layer = output_projection_layer
    self.apply_attention = apply_attention
    self.decoder_attention_function = decoder_attention_function or BilinearAttention(
        matrix_dim=encoder.get_output_dim(),
        vector_dim=encoder.get_output_dim())

    # Hyperparameters.
    self._max_decoding_steps = max_decoding_steps
    self._scheduled_sampling_ratio = scheduled_sampling_ratio

    # Used for prepping the translation primer (initialization of the target word-level
    # encoder's hidden state).
    #
    # If the decoder is an LSTM, both hidden states and cell states must be initialized.
    # Also, hidden states that prime translation via this encoder must be duplicated
    # across the number of layers they have.
    self._decoder_is_lstm = isinstance(self.decoder, torch.nn.LSTM)
    self._decoder_num_layers = decoder_num_layers

    self._start_index = vocab.get_token_index(START_SYMBOL, target_namespace)
    self._end_index = vocab.get_token_index(END_SYMBOL, target_namespace)
    self._source_namespace = source_namespace
    self._target_namespace = target_namespace
    self._batch_size = None

    initializer(self)
def __init__(self, options: Options, statistics: GraphEmbeddingStatisticsBase):
    super().__init__()
    self.use_char_embedding = options.use_char_embedding
    self.use_property_embeddings = options.use_property_embeddings
    self.use_highway = options.use_highway
    self.compress_node_embedding = options.compress_node_embedding
    self.num_rnn_layers = options.num_rnn_layers
    self.dropout = nn.Dropout(p=options.dropout_rate)

    self.word_embedding = Embedding(len(statistics.words),
                                    statistics.get_embedding_dim_of("words"),
                                    padding_idx=0)
    self.conn_label_embedding = Embedding(len(statistics.conn_labels),
                                          statistics.get_embedding_dim_of("conn_labels"),
                                          padding_idx=0)
    self.embeddings = {}
    self.properties = statistics.get_properties()
    for vocab_name in self.properties:
        vocab = getattr(statistics, vocab_name)
        embedding = Embedding(len(vocab),
                              statistics.get_embedding_dim_of(vocab_name),
                              padding_idx=0)
        embedding_name = '{}_embedding'.format(vocab_name)
        self.embeddings[vocab_name] = embedding
        self.add_module(embedding_name, embedding)

    if not options.word_vector_trainable:
        # Do not train embeddings
        self.word_embedding.weight.requires_grad_(False)

    node_embedding_dim = self.word_embedding.embedding_dim
    if self.use_char_embedding:
        char_dim = self.char_embedding.embedding_dim
        node_embedding_dim += options.char_lstm_hidden_size
        # TODO: change data format to avoid batch_first=True
        self.char_lstm = nn.LSTM(char_dim,
                                 options.char_lstm_hidden_size,
                                 options.num_char_lstm_layers,
                                 batch_first=True)

    if self.use_property_embeddings:
        for prop_name in self.properties:
            node_embedding_dim += self.embeddings[prop_name].embedding_dim

    if self.compress_node_embedding:
        self.compress_linear = nn.Linear(node_embedding_dim,
                                         options.compressed_embedding_dim)
        node_embedding_dim = options.compressed_embedding_dim

    if self.use_highway:
        self.multi_highway = Highway(node_embedding_dim,
                                     options.num_highway_layers,
                                     f=torch.tanh)

    conn_label_dim = self.conn_label_embedding.embedding_dim
    hidden_size = options.model_hidden_size
    self.hidden_size = hidden_size
    self.node_embedding_dim = node_embedding_dim

    self.neighbor_linear = nn.Linear(node_embedding_dim + conn_label_dim, hidden_size)

    self.use_out = use_out = statistics.use_out
    self.input_gate = GraphRNNGate(hidden_size, torch.sigmoid, use_out)
    self.output_gate = GraphRNNGate(hidden_size, torch.sigmoid, use_out)
    self.forget_gate = GraphRNNGate(hidden_size, torch.sigmoid, use_out)
    self.cell = GraphRNNGate(hidden_size, torch.tanh, use_out)

    if options.use_attention:
        self.embedding_attention = BilinearAttention(
            self.node_embedding_dim,
            self.node_embedding_dim + self.conn_label_embedding.embedding_dim,
            activation=torch.nn.functional.tanh)
        self.hidden_attention = BilinearAttention(
            self.hidden_size,
            self.hidden_size,
            activation=torch.nn.functional.tanh)
    else:
        # use sum instead of attention
        self.embedding_attention = self.hidden_attention = None
def __init__(self,
             vocab: Vocabulary,
             model_name: str = None,
             start_attention: Attention = None,
             end_attention: Attention = None,
             text_field_embedder: TextFieldEmbedder = None,
             task_pretrained_file: str = None,
             neg_sample_ratio: float = 0.0,
             max_turn_len: int = 3,
             start_token: str = "[CLS]",
             end_token: str = "[SEP]",
             index_name: str = "bert",
             eps: float = 1e-8,
             seed: int = 42,
             loss_factor: float = 1.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: RegularizerApplicator = None):
    super().__init__(vocab, regularizer)

    if model_name is None and text_field_embedder is None:
        raise ValueError("`model_name` and `text_field_embedder` can't both be None.")
    # For the plain resolution task, only the embeddings of the last layer are needed.
    self._text_field_embedder = text_field_embedder or PretrainedChineseBertMismatchedEmbedder(
        model_name,
        return_all=False,
        output_hidden_states=False,
        max_turn_length=max_turn_len)

    seed_everything(seed)
    self._neg_sample_ratio = neg_sample_ratio
    self._start_token = start_token
    self._end_token = end_token
    self._index_name = index_name
    self._initializer = initializer

    linear_input_size = self._text_field_embedder.get_output_dim()
    # attention-based approach
    self.start_attention = start_attention or BilinearAttention(
        vector_dim=linear_input_size, matrix_dim=linear_input_size)
    self.end_attention = end_attention or BilinearAttention(
        vector_dim=linear_input_size, matrix_dim=linear_input_size)
    # Metrics for the mask: we mainly care about the F-score,
    # and in particular the recall of label `1`.
    self._span_start_accuracy = CategoricalAccuracy()
    self._span_end_accuracy = CategoricalAccuracy()
    self._span_accuracy = BooleanAccuracy()

    self._rewrite_em = RewriteEM(valid_keys="semr,nr_semr,re_semr")
    self._restore_score = RestorationScore(compute_restore_tokens=True)
    self._metrics = [TokenBasedBLEU(mode="1,2"), TokenBasedROUGE(mode="1r,2r")]
    self._eps = eps
    self._loss_factor = loss_factor

    self._initializer(self.start_attention)
    self._initializer(self.end_attention)

    # Load weights pretrained on a related task.
    if task_pretrained_file is not None and os.path.isfile(task_pretrained_file):
        logger.info("loading related task pretrained weights...")
        self.load_state_dict(torch.load(task_pretrained_file), strict=False)
def __init__(
        self,
        # Vocabulary.
        vocab: Vocabulary,
        cuda_device,

        # Embeddings.
        source_text_field_embedder: TextFieldEmbedder,
        target_embedding_size: int,
        hidden_size: int,
        decoder_type: str = "gru",
        source_namespace: str = "tokens",
        target_namespace: str = "target",

        # Hyperparameters and flags.
        drop_out_rate: float = 0.0,
        decoder_attention_function: BilinearAttention = None,
        decoder_is_bidirectional: bool = False,
        decoder_num_layers: int = 1,
        apply_attention: bool = False,
        max_decoding_steps: int = 100,
        # scheduled_sampling_ratio: float = 0.0,
        attention_file: str = "attention_data.jsonl",
        regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)
    assert decoder_type in SequenceToSequence.DECODERS

    self.source_vocab_size = vocab.get_vocab_size(source_namespace)
    self.target_vocab_size = vocab.get_vocab_size(target_namespace)
    self.source_field_embedder = source_text_field_embedder
    self.encoder = torch.nn.LSTM(self.source_field_embedder.get_output_dim(),
                                 hidden_size,
                                 num_layers=1,
                                 bidirectional=False,
                                 batch_first=True)
    self.metrics = {"BELU": BELU()}
    self.target_vocab_size = vocab.get_vocab_size(target_namespace)
    self.target_embedder = Embedding(self.target_vocab_size, target_embedding_size)

    if apply_attention:
        decoder_input_size = target_embedding_size + hidden_size
    else:
        decoder_input_size = target_embedding_size + hidden_size

    # self.analyze_this_target = START_SYMBOL + " S T A I R C A S E . . . " + END_SYMBOL
    self.attention_file = attention_file
    self.dropout = torch.nn.Dropout(p=drop_out_rate)

    # Hidden size of the encoder and decoder should match.
    decoder_hidden_size = hidden_size
    self.decoder = SequenceToSequence.DECODERS[decoder_type](
        decoder_input_size,
        decoder_hidden_size,
        num_layers=decoder_num_layers,
        batch_first=True,
        bias=True,
        bidirectional=decoder_is_bidirectional)
    self.output_projection_layer = torch.nn.Linear(hidden_size,
                                                   len(vocab._token_to_index['target']))
    self.apply_attention = apply_attention
    self.decoder_attention_function = decoder_attention_function or BilinearAttention(
        matrix_dim=hidden_size, vector_dim=hidden_size)

    # Hyperparameters.
    self._max_decoding_steps = max_decoding_steps
    # self._scheduled_sampling_ratio = scheduled_sampling_ratio
    self._decoder_is_lstm = isinstance(self.decoder, torch.nn.LSTM)
    self._decoder_is_gru = isinstance(self.decoder, torch.nn.GRU)
    self._decoder_num_layers = decoder_num_layers

    self._start_index = vocab.get_token_index(START_SYMBOL, target_namespace)
    self._end_index = vocab.get_token_index(END_SYMBOL, target_namespace)
    self._source_namespace = source_namespace
    self._target_namespace = target_namespace
    self.count = 0
    self.first_dump = True

    if cuda_device[0] == -1 or cuda_device == -1:
        self.device = torch.device("cpu")
    else:
        cuda = "cuda:" + str(cuda_device[0])
        self.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             text_encoder: Seq2SeqEncoder,
             target_encoder: Seq2VecEncoder,
             feedforward: Optional[FeedForward] = None,
             target_field_embedder: Optional[TextFieldEmbedder] = None,
             attention_activation_function: Optional[str] = 'tanh',
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             word_dropout: float = 0.0,
             dropout: float = 0.0) -> None:
    '''
    :param vocab: A Vocabulary, required in order to compute sizes for input/output
                  projections.
    :param text_field_embedder: Used to embed the text and target text if
                                target_field_embedder is None but the target_encoder is
                                not None.
    :param text_encoder: Sequence Encoder that will create the representation of each token
                         in the context sentence.
    :param target_encoder: Encoder that will create the representation of target text tokens.
    :param feedforward: An optional feed forward layer to apply after either the text encoder
                        if target encoder is None. Else it would be after the target and the
                        text encoded representations have been concatenated.
    :param target_field_embedder: Used to embed the target text to give as input to the
                                  target_encoder. Thus this allows a separate embedding for
                                  text and target text.
    :param attention_activation_function: The name of the activation function applied after
                                          the ``h^T W t + b`` calculation. Activation names
                                          can be found `here <https://allenai.github.io/
                                          allennlp-docs/api/allennlp.nn.activations.html>`_.
                                          Default is tanh.
    :param initializer: Used to initialize the model parameters.
    :param regularizer: If provided, will be used to calculate the regularization penalty
                        during training.
    :param word_dropout: Dropout that is applied after the embedding of the tokens/words.
                         It will drop entire words with this probability.
    :param dropout: To apply dropout after each layer apart from the last layer. All dropout
                    that is applied to time-based data will be `variational dropout`_,
                    all else will be standard dropout.

    This attention target classifier is based on the model in `Exploiting Document Knowledge
    for Aspect-level Sentiment Classification Ruidan
    <https://aclanthology.info/papers/P18-2092/p18-2092>`_ where the attention on the encoded
    context words is based on the encoded target vector.

    .. _variational dropout:
       https://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf
    '''
    super().__init__(vocab, regularizer)

    self.text_field_embedder = text_field_embedder
    self.target_field_embedder = target_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.text_encoder = text_encoder
    self.target_encoder = target_encoder
    self.feedforward = feedforward
    attention_activation_function = Activation.by_name(
        f'{attention_activation_function}')()
    self.attention_layer = BilinearAttention(self.target_encoder.get_output_dim(),
                                             self.text_encoder.get_output_dim(),
                                             attention_activation_function,
                                             normalize=True)

    if feedforward is not None:
        output_dim = self.feedforward.get_output_dim()
    else:
        output_dim = self.text_encoder.get_output_dim()
    self.label_projection = Linear(output_dim, self.num_classes)
    self.metrics = {"accuracy": CategoricalAccuracy()}
    self.f1_metrics = {}
    # F1 Scores
    label_index_name = self.vocab.get_index_to_token_vocabulary('labels')
    for label_index, label_name in label_index_name.items():
        label_name = f'F1_{label_name.capitalize()}'
        self.f1_metrics[label_name] = F1Measure(label_index)

    self._word_dropout = WordDrouput(word_dropout)
    self._variational_dropout = InputVariationalDropout(dropout)
    self._naive_dropout = Dropout(dropout)

    self.loss = torch.nn.CrossEntropyLoss()

    # Ensure that the dimensions of the text field embedder and text encoder
    # match
    check_dimensions_match(text_field_embedder.get_output_dim(),
                           text_encoder.get_input_dim(),
                           "text field embedding dim", "text encoder input dim")
    # Ensure that the dimensions of the target or text field embedder and
    # the target encoder match
    target_field_embedder_dim = text_field_embedder.get_output_dim()
    target_field_error = "text field embedding dim"
    if self.target_field_embedder:
        target_field_embedder_dim = target_field_embedder.get_output_dim()
        target_field_error = "target field embedding dim"
    check_dimensions_match(target_field_embedder_dim,
                           target_encoder.get_input_dim(),
                           target_field_error, "target encoder input dim")
    initializer(self)
def __init__(
        self,
        vocab: Vocabulary,
        input_dim: int,
        decoder_hidden_size: int,
        max_decoding_steps: int,
        output_proj_input_dim: int,
        target_namespace: str = "targets",
        target_embedding_dim: int = None,
        attention: str = "none",
        dropout: float = 0.0,
        scheduled_sampling_ratio: float = 0.0,
) -> None:
    super(Seq2SeqDecoder, self).__init__(vocab)

    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace

    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._unk_index = self.vocab.get_token_index("@@UNKNOWN@@", self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)

    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
    # we're using attention with ``DotProductSimilarity``, this is needed.
    self._encoder_output_dim = input_dim
    self._decoder_hidden_dim = decoder_hidden_size
    if self._encoder_output_dim != self._decoder_hidden_dim:
        self._projection_encoder_out = Linear(self._encoder_output_dim,
                                              self._decoder_hidden_dim)
    else:
        self._projection_encoder_out = lambda x: x
    self._decoder_output_dim = self._decoder_hidden_dim
    self._output_proj_input_dim = output_proj_input_dim
    self._target_embedding_dim = target_embedding_dim
    self._target_embedder = Embedding(num_classes, self._target_embedding_dim)

    # Used to get an initial hidden state from the encoder states
    self._sent_pooler = Pooler(project=True, d_inp=input_dim, d_proj=decoder_hidden_size)

    if attention == "Bahdanau":
        self._decoder_attention = BahdanauAttention(
            decoder_hidden_size + target_embedding_dim, input_dim)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = input_dim + target_embedding_dim
    elif attention == "bilinear":
        self._decoder_attention = BilinearAttention(
            decoder_hidden_size + target_embedding_dim, input_dim)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = input_dim + target_embedding_dim
    elif attention == "none":
        self._decoder_attention = None
        self._decoder_input_dim = target_embedding_dim
    else:
        raise Exception("attention not implemented {}".format(attention))

    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim)

    # Allow for a bottleneck layer between encoder outputs and distribution over vocab.
    # The bottleneck layer consists of a linear transform and helps to reduce
    # the number of parameters.
    if self._output_proj_input_dim != self._decoder_output_dim:
        self._projection_bottleneck = Linear(self._decoder_output_dim,
                                             self._output_proj_input_dim)
    else:
        self._projection_bottleneck = lambda x: x
    self._output_projection_layer = Linear(self._output_proj_input_dim, num_classes)
    self._dropout = torch.nn.Dropout(p=dropout)
class Baseline(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.text_seq_encoder = PytorchSeq2VecWrapper(
            LSTM(word_embeddings.get_output_dim(),
                 int(word_embeddings.get_output_dim() / 2),
                 batch_first=True,
                 bidirectional=True))

        self.out = torch.nn.Linear(
            in_features=self.word_embeddings.get_output_dim() * 4,
            out_features=vocab.get_vocab_size('labels'))

        self.accuracy = CategoricalAccuracy()
        self.f_score_0 = F1Measure(positive_label=0)
        self.f_score_1 = F1Measure(positive_label=1)
        self.f_score_2 = F1Measure(positive_label=2)
        self.loss = CrossEntropyLoss()
        self.attention = BilinearAttention(word_embeddings.get_output_dim() * 3,
                                           word_embeddings.get_output_dim())

    def forward(self,
                article: Dict[str, torch.Tensor],
                outcome: Dict[str, torch.Tensor],
                intervention: Dict[str, torch.Tensor],
                comparator: Dict[str, torch.Tensor],
                labels: torch.Tensor = None,
                evidence: torch.Tensor = None) -> Dict[str, torch.Tensor]:

        p_mask = get_text_field_mask(article, 1)
        p_size = p_mask.size()
        a_mask = (torch.sum(p_mask, dim=2) > 0)
        unf_p_mask = p_mask.reshape(p_size[0] * p_size[1], p_size[2])

        a_embeddings = self.word_embeddings(article)
        unf_a_embeddings = a_embeddings.reshape(p_size[0] * p_size[1], p_size[2], -1)
        unf_a_vec = self.text_seq_encoder(unf_a_embeddings, unf_p_mask)
        a_vec = unf_a_vec.reshape(p_size[0], p_size[1], -1)

        o_mask = get_text_field_mask(outcome)
        o_embeddings = self.word_embeddings(outcome)
        o_vec = self.text_seq_encoder(o_embeddings, o_mask)

        i_mask = get_text_field_mask(intervention)
        i_embeddings = self.word_embeddings(intervention)
        i_vec = self.text_seq_encoder(i_embeddings, i_mask)

        c_mask = get_text_field_mask(comparator)
        c_embeddings = self.word_embeddings(comparator)
        c_vec = self.text_seq_encoder(c_embeddings, c_mask)

        prompt_vec = torch.cat((o_vec, i_vec, c_vec), dim=1)
        a_attentions = self.attention.forward(prompt_vec, a_vec, a_mask)
        attended_a_vec = torch.sum(a_vec * a_attentions.unsqueeze(2), dim=1)

        logits = self.out(torch.cat((attended_a_vec, o_vec, i_vec, c_vec), dim=1))
        output = {'logits': logits, 'attentions': a_attentions}

        if (labels is not None) and (evidence is not None):
            evidence_one_hot = get_one_hot(evidence, p_mask.size(1))
            skip_no_evidence_mask = (torch.sum(evidence_one_hot, dim=1) > 0).unsqueeze(1).float()
            att_loss = -1 * torch.mean(
                ((evidence_one_hot * torch.log(torch.clamp(a_attentions, min=1e-9, max=1))) +
                 ((1 - evidence_one_hot) * torch.log(torch.clamp(1 - a_attentions, min=1e-9, max=1)))) *
                a_mask.float() * skip_no_evidence_mask)

            classification_loss = self.loss(logits, labels)

            self.accuracy(logits, labels)
            self.f_score_0(logits, labels)
            self.f_score_1(logits, labels)
            self.f_score_2(logits, labels)

            output['loss'] = classification_loss + (5 * att_loss)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        _, _, f_score0 = self.f_score_0.get_metric(reset)
        _, _, f_score1 = self.f_score_1.get_metric(reset)
        _, _, f_score2 = self.f_score_2.get_metric(reset)
        return {
            'accuracy': self.accuracy.get_metric(reset),
            'f-score': np.mean([f_score0, f_score1, f_score2]),
        }