Example #1
    def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 question_encoder: Seq2SeqEncoder,
                 passage_encoder: Seq2SeqEncoder,
                 r: float = 0.8,
                 dropout: float = 0.1,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(EvidenceExtraction, self).__init__(vocab, regularizer)

        self._embedder = embedder

        self._question_encoder = question_encoder
        self._passage_encoder = passage_encoder

        # size: 2H
        encoding_dim = question_encoder.get_output_dim()

        self._gru_cell = nn.GRUCell(2 * encoding_dim, encoding_dim)

        self._gate = nn.Linear(2 * encoding_dim, 2 * encoding_dim)

        self._match_layer_1 = nn.Linear(2 * encoding_dim, encoding_dim)
        self._match_layer_2 = nn.Linear(encoding_dim, 1)

        self._question_attention_for_passage = Attention(
            NonlinearSimilarity(encoding_dim))
        self._question_attention_for_question = Attention(
            NonlinearSimilarity(encoding_dim))
        self._passage_attention_for_answer = Attention(
            NonlinearSimilarity(encoding_dim), normalize=False)
        self._passage_attention_for_ranking = Attention(
            NonlinearSimilarity(encoding_dim))

        self._passage_self_attention = Attention(
            NonlinearSimilarity(encoding_dim))
        self._self_gru_cell = nn.GRUCell(2 * encoding_dim, encoding_dim)
        self._self_gate = nn.Linear(2 * encoding_dim, encoding_dim)

        self._answer_net = nn.GRUCell(encoding_dim, encoding_dim)

        self._v_r_Q = nn.Parameter(torch.rand(encoding_dim))
        self._r = r

        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._squad_metrics = SquadEmAndF1()

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x

        initializer(self)
Example #2
    def __init__(self,
                 encoder_output_dim: int,
                 action_embedding_dim: int,
                 attention_function: SimilarityFunction,
                 checklist_size: int = None) -> None:
        super(NlvrDecoderStep, self).__init__()
        self._input_attention = Attention(attention_function)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        output_dim = encoder_output_dim
        input_dim = output_dim
        # Our decoder input will be the concatenation of the decoder hidden state and the previous
        # action embedding, and we'll project that down to the decoder's `input_dim`, which we
        # arbitrarily set to be the same as `output_dim`.
        self._input_projection_layer = Linear(
            output_dim + action_embedding_dim, input_dim)
        # Before making a prediction, we'll compute an attention over the input given our updated
        # hidden state, and optionally a difference between the current checklist vector and its
        # target, if we are training to maximize coverage using a checklist. Then we concatenate
        # those with the decoder state and project to `action_embedding_dim` to make a prediction.
        if checklist_size is None:
            self._output_projection_layer = Linear(
                output_dim + encoder_output_dim, action_embedding_dim)
        else:
            self._output_projection_layer = Linear(
                output_dim + encoder_output_dim + checklist_size,
                action_embedding_dim)

        # TODO(pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(input_dim, output_dim)
Example #3
    def __init__(self,
                 encoder_output_dim: int,
                 action_embedding_dim: int,
                 attention_function: SimilarityFunction,
                 dropout: float = 0.0,
                 use_coverage: bool = False) -> None:
        super(NlvrDecoderStep, self).__init__()
        self._input_attention = Attention(attention_function)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        output_dim = encoder_output_dim
        input_dim = output_dim
        # Our decoder input will be the concatenation of the decoder hidden state and the previous
        # action embedding, and we'll project that down to the decoder's `input_dim`, which we
        # arbitrarily set to be the same as `output_dim`.
        self._input_projection_layer = Linear(
            output_dim + action_embedding_dim, input_dim)
        # Before making a prediction, we'll compute an attention over the input given our updated
        # hidden state. Then we concatenate those with the decoder state and project to
        # `action_embedding_dim` to make a prediction.
        self._output_projection_layer = Linear(output_dim + encoder_output_dim,
                                               action_embedding_dim)
        if use_coverage:
            # This is a multiplicative factor that is used to add the embeddings of yet to be
            # produced actions to the predicted embedding and bias it.
            self._checklist_embedding_multiplier = Parameter(
                torch.FloatTensor([1.0]))
        # TODO(pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(input_dim, output_dim)
        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
Example #4
    def __init__(self, vocab_size, max_len, embed_size, hidden_size, sos_id=2, eos_id=3, n_layers=1,
                 rnn_cell='GRU', input_dropout_p=0, dropout_p=0, use_attention=False):
        super(Decoder, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.input_dropout = nn.Dropout(p=input_dropout_p)
        if rnn_cell == 'LSTM':
            self.rnn_cell = nn.LSTM
        elif rnn_cell == 'GRU':
            self.rnn_cell = nn.GRU
        else:
            raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))
        self.rnn = self.rnn_cell(embed_size, hidden_size, n_layers, batch_first=True, dropout=dropout_p)

        self.output_size = vocab_size
        self.max_length = max_len
        self.use_attention = use_attention
        self.eos_id = eos_id
        self.sos_id = sos_id

        self.init_input = None

        self.embedding = nn.Embedding(self.output_size, embed_size)
        if use_attention:
            self.attention = Attention(self.hidden_size)

        self.out = nn.Linear(self.hidden_size, self.output_size)
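
The rnn_cell dispatch above can be isolated into a small self-contained sketch. The helper name get_rnn_class and the sizes below are hypothetical, not part of the example; they only illustrate the same string-to-class selection and the ValueError raised for unsupported names.

import torch.nn as nn

# Hypothetical standalone form of the cell selection in Example #4: the string
# maps to an nn.Module class, and an unsupported name raises ValueError.
def get_rnn_class(rnn_cell: str):
    cells = {'LSTM': nn.LSTM, 'GRU': nn.GRU}
    if rnn_cell not in cells:
        raise ValueError("Unsupported RNN Cell: {0}".format(rnn_cell))
    return cells[rnn_cell]

# Build a single-layer, batch-first GRU, as the Decoder above does.
rnn = get_rnn_class('GRU')(input_size=256, hidden_size=512, num_layers=1, batch_first=True)
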
Example #5
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 seq2seq_encoder: Seq2SeqEncoder,
                 initializer: InitializerApplicator) -> None:
        super(ProLocalModel, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.seq2seq_encoder = seq2seq_encoder

        self.attention_layer = \
            Attention(similarity_function=BilinearSimilarity(2 * seq2seq_encoder.get_output_dim(),
                                                             seq2seq_encoder.get_output_dim()), normalize=True)

        self.num_types = self.vocab.get_vocab_size("state_change_type_labels")
        self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(),
                                            self.num_types)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace="state_change_tags")  # by default "O" is ignored in metric computation
        self.num_tags = self.vocab.get_vocab_size("state_change_tags")

        self.tag_projection_layer = TimeDistributed(
            Linear(self.seq2seq_encoder.get_output_dim() + 2, self.num_tags))
        self._type_accuracy = CategoricalAccuracy()

        self.type_f1_metrics = {}
        self.type_labels_vocab = self.vocab.get_index_to_token_vocabulary("state_change_type_labels")
        for type_label in self.type_labels_vocab.values():
            self.type_f1_metrics["type_" + type_label] = F1Measure(self.vocab.get_token_index(type_label, "state_change_type_labels"))

        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example #6
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "target_tags",
                 target_embedding_dim: int = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SimpleSeq2SeqCrf, self).__init__(vocab, regularizer)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        self._crf = ConditionalRandomField(num_classes)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim()
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = Attention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_output_dim)
        # self._decoder_cell = GRUCell(self._decoder_input_dim, self._decoder_output_dim, bias=False)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.span_metric = SpanBasedF1Measure(
            vocab,
            tag_namespace=target_namespace,
            ignore_classes=[START_SYMBOL[2:], END_SYMBOL[2:]])
        initializer(self)

        # Initialize forget gate
        """
 def test_masked(self):
     attention = Attention()
     # Testing general masked non-batched case.
     vector = Variable(torch.FloatTensor([[0.3, 0.1, 0.5]]))
     matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.1, 0.4, 0.3]]]))
     mask = Variable(torch.FloatTensor([[1.0, 0.0, 1.0]]))
     result = attention(vector, matrix, mask).data.numpy()
     assert_almost_equal(result, numpy.array([[0.52248482, 0.0, 0.47751518]]))
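
The expected values in test_masked are consistent with dot-product scoring followed by a softmax restricted to the unmasked positions. A minimal plain-PyTorch sketch (not the library's implementation) that reproduces the 0.5225 / 0.4775 split:

import torch

# Dot-product attention with a masked softmax; reproduces the expected values
# asserted in test_masked above. This is a sketch, not the library's own code.
vector = torch.tensor([0.3, 0.1, 0.5])
matrix = torch.tensor([[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.1, 0.4, 0.3]])
mask = torch.tensor([1.0, 0.0, 1.0])

scores = matrix @ vector                 # raw dot products: [0.31, 0.195, 0.22]
exp_scores = scores.exp() * mask         # zero out the masked position
weights = exp_scores / exp_scores.sum()  # renormalize over unmasked positions
print(weights)                           # tensor([0.5225, 0.0000, 0.4775])
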
Example #8
    def __init__(self,
                 encoder_output_dim: int,
                 action_embedding_dim: int,
                 attention_function: SimilarityFunction,
                 num_start_types: int,
                 num_entity_types: int,
                 mixture_feedforward: FeedForward = None,
                 dropout: float = 0.0,
                 unlinked_terminal_indices: List[int] = None) -> None:
        super(WikiTablesDecoderStep, self).__init__()
        self._mixture_feedforward = mixture_feedforward
        self._entity_type_embedding = Embedding(num_entity_types,
                                                action_embedding_dim)
        self._input_attention = Attention(attention_function)

        self._num_start_types = num_start_types
        self._start_type_predictor = Linear(encoder_output_dim,
                                            num_start_types)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        output_dim = encoder_output_dim
        input_dim = output_dim
        # Our decoder input will be the concatenation of the decoder hidden state and the previous
        # action embedding, and we'll project that down to the decoder's `input_dim`, which we
        # arbitrarily set to be the same as `output_dim`.
        self._input_projection_layer = Linear(
            output_dim + action_embedding_dim, input_dim)
        # Before making a prediction, we'll compute an attention over the input given our updated
        # hidden state, and optionally a difference between the current checklist vector and its
        # target, if we are training to maximize coverage using a checklist. Then we concatenate
        # those with the decoder state and project to `action_embedding_dim` to make a prediction.
        if unlinked_terminal_indices is None:
            self._output_projection_layer = Linear(
                output_dim + encoder_output_dim, action_embedding_dim)
        else:
            unlinked_checklist_size = len(unlinked_terminal_indices)
            self._output_projection_layer = Linear(
                output_dim + encoder_output_dim + unlinked_checklist_size,
                action_embedding_dim)

        self._unlinked_terminal_indices = unlinked_terminal_indices
        # TODO(pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(input_dim, output_dim)

        if mixture_feedforward is not None:
            check_dimensions_match(output_dim,
                                   mixture_feedforward.get_input_dim(),
                                   "hidden state embedding dim",
                                   "mixture feedforward input dim")
            check_dimensions_match(mixture_feedforward.get_output_dim(), 1,
                                   "mixture feedforward output dim",
                                   "dimension for scalar value")

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
Example #9
 def test_non_normalized_attention_works(self):
     attention = Attention(normalize=False)
     sentence_tensor = Variable(torch.FloatTensor([[[-1, 0, 4],
                                                    [1, 1, 1],
                                                    [-1, 0, 4],
                                                    [-1, 0, -1]]]))
     query_tensor = Variable(torch.FloatTensor([[.1, .8, .5]]))
     result = attention(query_tensor, sentence_tensor).data.numpy()
     assert_almost_equal(result, [[1.9, 1.4, 1.9, -.6]])
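
With normalize=False the expected values are exactly the raw dot products of the query against each row of the sentence tensor, which is easy to check directly (a sanity check, not library code):

import torch

query = torch.tensor([0.1, 0.8, 0.5])
sentence = torch.tensor([[-1., 0., 4.], [1., 1., 1.], [-1., 0., 4.], [-1., 0., -1.]])
print(sentence @ query)  # tensor([ 1.9000,  1.4000,  1.9000, -0.6000])
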
Example #10
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 spans_per_word: float,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.0,
                 spans_extractor: SpanExtractor = None,
                 spans_scorer_feedforward: FeedForward = None) -> None:
        super(SpanAe, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim() + 1
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = Attention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        self._decoder_cell = LSTMCell(self._decoder_input_dim + 1,
                                      self._decoder_output_dim)
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)

        self._span_extractor = spans_extractor

        feedforward_scorer = torch.nn.Sequential(
            TimeDistributed(spans_scorer_feedforward),
            TimeDistributed(
                torch.nn.Linear(spans_scorer_feedforward.get_output_dim(), 1)))
        self._span_pruner = SpanPruner(feedforward_scorer)

        self._spans_per_word = spans_per_word
Example #11
    def test_batched_no_mask(self):
        attention = Attention()

        # Testing general batched case.
        vector = Variable(torch.FloatTensor([[0.3, 0.1, 0.5], [0.3, 0.1, 0.5]]))
        matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2]],
                                             [[0.6, 0.8, 0.1], [0.15, 0.5, 0.2]]]))

        result = attention(vector, matrix).data.numpy()
        assert_almost_equal(result, numpy.array([[0.52871835, 0.47128162],
                                                 [0.52871835, 0.47128162]]))
Example #12
 def __init__(self,
              vocab: Vocabulary,
              source_embedder: TextFieldEmbedder,
              encoder: Seq2SeqEncoder,
              max_decoding_steps: int,
              target_namespace: str = "tokens",
              target_embedding_dim: int = None,
              attention_function: SimilarityFunction = None,
              scheduled_sampling_ratio: float = 0.0) -> None:
     super(SimpleSeq2Seq, self).__init__(vocab)
     self._source_embedder = source_embedder
     self._encoder = encoder
     self._max_decoding_steps = max_decoding_steps
     self._target_namespace = target_namespace
     self._attention_function = attention_function
     self._scheduled_sampling_ratio = scheduled_sampling_ratio
     # We need the start symbol to provide as the input at the first timestep of decoding, and
     # end symbol as a way to indicate the end of the decoded sequence.
     self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                    self._target_namespace)
     self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                  self._target_namespace)
     num_classes = self.vocab.get_vocab_size(self._target_namespace)
     self.num_classes = num_classes
     # Decoder output dim needs to be the same as the encoder output dim since we initialize the
     # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
     # we're using attention with ``DotProductSimilarity``, this is needed.
     self._decoder_output_dim = self._encoder.get_output_dim()
     target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
     self._target_embedder = Embedding(num_classes, target_embedding_dim)
     if self._attention_function:
         self._decoder_attention = Attention(self._attention_function)
         # The output of attention, a weighted average over encoder outputs, will be
         # concatenated to the input vector of the decoder at each time step.
         self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
     else:
         self._decoder_input_dim = target_embedding_dim
     # TODO (pradeep): Do not hardcode decoder cell type.
     self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                   self._decoder_output_dim)
     self._output_projection_layer = Linear(self._decoder_output_dim,
                                            num_classes)
Example #13
    def test_batched_masked(self):
        attention = Attention()

        # Testing general masked non-batched case.
        vector = Variable(torch.FloatTensor([[0.3, 0.1, 0.5], [0.3, 0.1, 0.5]]))
        matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]],
                                             [[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]]]))
        mask = Variable(torch.FloatTensor([[1.0, 1.0, 0.0], [1.0, 0.0, 1.0]]))
        result = attention(vector, matrix, mask).data.numpy()
        assert_almost_equal(result, numpy.array([[0.52871835, 0.47128162, 0.0],
                                                 [0.50749944, 0.0, 0.49250056]]))

        # Test the case where a mask is all 0s and an input is all 0s.
        vector = Variable(torch.FloatTensor([[0.0, 0.0, 0.0], [0.3, 0.1, 0.5]]))
        matrix = Variable(torch.FloatTensor([[[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]],
                                             [[0.6, 0.8, 0.1], [0.15, 0.5, 0.2], [0.5, 0.3, 0.2]]]))
        mask = Variable(torch.FloatTensor([[1.0, 1.0, 0.0], [0.0, 0.0, 0.0]]))
        result = attention(vector, matrix, mask).data.numpy()
        assert_almost_equal(result, numpy.array([[0.5, 0.5, 0.0],
                                                 [0.0, 0.0, 0.0]]))
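
Note the two edge cases asserted above: with a zero query vector and a partial mask, the weights are uniform over the unmasked positions (0.5, 0.5, 0.0), and an all-zero mask yields an all-zero attention vector.
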
Example #14
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder, use_attention: bool,
                 seq2seq_encoder: Seq2SeqEncoder,
                 seq2vec_encoder: Seq2VecEncoder,
                 span_end_encoder_after: Seq2SeqEncoder,
                 use_decoder_trainer: bool, decoder_beam_search: BeamSearch,
                 kb_configs: dict, other_configs: dict,
                 initializer: InitializerApplicator) -> None:
        super(ProStructModel, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.num_actions = len(Action)  # number of actions is hardcoded here.
        # They are defined in Action enum in propara_dataset_reader.py
        self.other_configs = other_configs

        # kb_coefficient * kb_score + (1-kb_coefficient) * model_score
        self.kb_coefficient = torch.nn.Parameter(
            torch.ones(1).mul(kb_configs.get('kb_coefficient', 0.5)))

        self.use_attention = use_attention
        self.use_decoder_trainer = use_decoder_trainer
        if self.use_attention:
            self.seq2seq_encoder = seq2seq_encoder
            self.time_distributed_seq2seq_encoder = TimeDistributed(
                TimeDistributed(self.seq2seq_encoder))
            self.time_distributed_attention_layer = \
                TimeDistributed(TimeDistributed(
                    Attention(similarity_function=BilinearSimilarity(2 * seq2seq_encoder.get_output_dim(),
                                                                     seq2seq_encoder.get_output_dim()),
                              normalize=True)))
            self.aggregate_feedforward = Linear(
                seq2seq_encoder.get_output_dim(), self.num_actions)
        else:
            self.seq2vec_encoder = seq2vec_encoder
            self.time_distributed_seq2vec_encoder = TimeDistributed(
                TimeDistributed(self.seq2vec_encoder))
            self.aggregate_feedforward = Linear(
                seq2vec_encoder.get_output_dim(), self.num_actions)

        self.span_end_encoder_after = span_end_encoder_after
        # per step per participant
        self.time_distributed_encoder_span_end_after = TimeDistributed(
            TimeDistributed(self.span_end_encoder_after))

        # Fixme: dimensions

        self._span_start_predictor_after = TimeDistributed(
            TimeDistributed(
                torch.nn.Linear(2 + 2 * seq2seq_encoder.get_output_dim(), 1)))

        self._span_end_predictor_after = TimeDistributed(
            TimeDistributed(
                torch.nn.Linear(span_end_encoder_after.get_output_dim(), 1)))

        self._type_accuracy = BooleanAccuracy()  # Fixme WRONG. Categorical accuracy should be right!
        self._loss = torch.nn.CrossEntropyLoss(ignore_index=-1)  # Fixme: This is less robust. If the masking value

        # Fixme: add a metric for location span strings
        self.span_metric = SquadEmAndF1()

        if self.use_decoder_trainer:
            self.decoder_trainer = MaximumMarginalLikelihood()
            if kb_configs['kb_to_use'] == 'lexicalkb':
                kb = KBLexical(lexical_kb_path=kb_configs['lexical_kb_path'],
                               fullgrid_prompts_load_path=kb_configs[
                                   'fullgrid_prompts_load_path'])

            # Makeshift arrangement to get number of participants in tiny.tsv .
            self.commonsense_based_action_generator = CommonsenseBasedActionGenerator(
                self.num_actions)
            self.rules_activated = [
                int(rule_val.strip()) > 0
                for rule_val in self.other_configs.get(
                    'constraint_rules_to_turn_on', '0,0,0,1').split(",")
            ]
            self.rule_2_fraction_participants = self.other_configs.get(
                'rule_2_fraction_participants', 0.5)
            self.rule_3_fraction_steps = self.other_configs.get(
                'rule_3_fraction_steps', 0.5)

            self.commonsense_based_action_generator.set_rules_used(
                self.rules_activated, self.rule_2_fraction_participants,
                self.rule_3_fraction_steps)
            # [self.rules_activated[0],  # C/D/C/D cannot happen
            #  self.rules_activated[1],  # > 1/2 partic
            #  self.rules_activated[2],  # > 1/2 steps cannot change
            #  self.rules_activated[3]  # until mentioned
            #  ])
            self.decoder_step = ProParaDecoderStep(
                KBBasedActionScorer(kb=kb, kb_coefficient=self.kb_coefficient),
                valid_action_generator=self.commonsense_based_action_generator)

        self.beam_search = decoder_beam_search
        initializer(self)
Example #15
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 max_decoding_steps: int,
                 target_namespace: str = "tokens",
                 target_embedder: TextFieldEmbedder = None,
                 attention_function: SimilarityFunction = None,
                 scheduled_sampling_ratio: float = 0.25) -> None:
        super(PointerGeneratorPattern, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        self._pattern_pos = [
            '@@np@@', '@@ns@@', '@@ni@@', '@@nz@@', '@@m@@', '@@i@@', '@@id@@',
            '@@t@@', '@@j@@'
        ]
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        self._target_embedder = target_embedder or source_embedder
        #!!! attention on decoder output, not on decoder input !!!#
        self._decoder_input_dim = self._target_embedder.get_output_dim()

        # decoder use UniLSTM while encoder use BiLSTM
        self._decoder_hidden_dim = self._encoder.get_output_dim()

        # decoder: h0 c0 projection_layer from final_encoder_output
        self.decode_h0_projection_layer = Linear(
            self._encoder.get_output_dim(), self._decoder_hidden_dim)
        self.decode_c0_projection_layer = Linear(
            self._encoder.get_output_dim(), self._decoder_hidden_dim)

        self._decoder_attention = Attention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the decoder_hidden of the decoder at each time step.
        # V[s_t, h*_t] + b
        self._decoder_output_dim = self._decoder_hidden_dim + self._encoder.get_output_dim()  # [s_t, h*_t]

        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_hidden_dim)
        self._output_attention_layer = Linear(self._decoder_output_dim,
                                              self._decoder_hidden_dim)
        #V[s_t, h*_t] + b
        self._output_projection_layer = Linear(self._decoder_hidden_dim,
                                               num_classes)
        # num_classes->V'
        # generation probability
        self._pointer_gen_layer = Linear(
            self._decoder_hidden_dim + self._encoder.get_output_dim() +
            self._decoder_input_dim, 1)
        # metrics
        self.metrics = {
            "ROUGE-1": Rouge(1),
            "ROUGE-2": Rouge(2),
        }
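
The _pointer_gen_layer above maps the concatenation of the decoder hidden state, the attention context, and the decoder input to a single logit. Below is a hedged sketch of how such a generation probability is typically computed in pointer-generator models (See et al., 2017); the forward pass of this class is not shown here, so the exact wiring and the dimensions are assumptions.

import torch
import torch.nn as nn

# Hypothetical dimensions; in the model above they come from the encoder and embedder.
decoder_hidden_dim, encoder_output_dim, decoder_input_dim = 256, 256, 128
pointer_gen_layer = nn.Linear(decoder_hidden_dim + encoder_output_dim + decoder_input_dim, 1)

batch = 4
s_t = torch.randn(batch, decoder_hidden_dim)       # decoder hidden state
context = torch.randn(batch, encoder_output_dim)   # attention-weighted encoder summary (h*_t)
x_t = torch.randn(batch, decoder_input_dim)        # current decoder input embedding

# p_gen in (0, 1): probability of generating from the vocabulary rather than
# copying from the source, per the standard pointer-generator formulation.
p_gen = torch.sigmoid(pointer_gen_layer(torch.cat([s_t, context, x_t], dim=-1)))
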