Example #1
  def build(self):
    """Build the model """
    print("Building the bow - sequence to sequence model ... ")

    vocab_size = self.vocab_size
    state_size = self.state_size
    enc_layers = self.enc_layers
    max_enc_bow = self.max_enc_bow
    num_paraphrase = self.num_paraphrase

    # Placeholders
    with tf.name_scope("placeholders"):
      enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
      enc_lens = tf.placeholder(tf.int32, [None], "enc_lens")
      self.drop_out = tf.placeholder(tf.float32, (), "drop_out")
      self.max_len = tf.placeholder(tf.int32, (), "max_len")
      dec_bow = tf.placeholder(tf.int32, [None, None], "dec_bow")
      dec_bow_len = tf.placeholder(tf.int32, [None], "dec_bow_len")

      self.enc_inputs = enc_inputs
      self.enc_lens = enc_lens
      self.dec_bow = dec_bow 
      self.dec_bow_len = dec_bow_len

      if(self.mode == "train"):
        enc_targets = tf.placeholder(tf.int32, [None, None], "enc_targets")
        enc_seq2seq_inputs = tf.placeholder(
          tf.int32, [None, num_paraphrase, None], "enc_seq2seq_inputs")
        enc_seq2seq_targets = tf.placeholder(
          tf.int32, [None, num_paraphrase, None], "enc_seq2seq_targets")
        enc_seq2seq_lens = tf.placeholder(
          tf.int32, [None, num_paraphrase], "enc_seq2seq_lens")

        dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
        dec_targets = tf.placeholder(tf.int32, [None, None], "dec_targets")
        dec_lens = tf.placeholder(tf.int32, [None], "dec_lens")

        self.enc_targets = enc_targets
        self.enc_seq2seq_inputs = enc_seq2seq_inputs
        self.enc_seq2seq_targets = enc_seq2seq_targets
        self.enc_seq2seq_lens = enc_seq2seq_lens
        self.dec_inputs = dec_inputs
        self.dec_targets = dec_targets
        self.dec_lens = dec_lens

    enc_batch_size = tf.shape(enc_inputs)[0]
    max_len = self.max_len

    dec_batch_size = tf.shape(dec_bow)[0]
    max_dec_bow = tf.shape(dec_bow)[1]

    # Embedding 
    with tf.variable_scope("embeddings"):
      embedding_matrix = tf.get_variable(
        name="embedding_matrix", 
        shape=[vocab_size, state_size],
        dtype=tf.float32,
        initializer=tf.random_normal_initializer(stddev=0.05))
      enc_inputs = tf.nn.embedding_lookup(embedding_matrix, enc_inputs)

      if(self.mode == "train"): 
        dec_inputs = tf.nn.embedding_lookup(embedding_matrix, dec_inputs)
        dec_bow = tf.nn.embedding_lookup(embedding_matrix, dec_bow)

    # Encoder
    with tf.variable_scope("encoder"):
      # TODO: residual LSTM, layer normalization
      enc_cell = [create_cell("enc-%d" % i, state_size, self.drop_out) 
        for i in range(enc_layers)]
      enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
      enc_outputs, enc_state = tf.nn.dynamic_rnn(enc_cell, enc_inputs,
        sequence_length=enc_lens, dtype=tf.float32)

    # Encoder bow prediction
    with tf.variable_scope("bow_output"):
      if(self.bow_pred_method == "mix_softmax"): 
        bow_topk_prob = bow_predict_mix_softmax(
          enc_batch_size, vocab_size, max_enc_bow, enc_state)

      elif(self.bow_pred_method == "seq_tag"):
        bow_topk_prob, _, _, _ = bow_predict_seq_tag(
          vocab_size, enc_batch_size, enc_outputs, enc_lens, max_len)

      elif(self.bow_pred_method == "seq2seq"):
        bow_topk_prob, enc_seq2seq_loss, enc_infer_pred = \
                                    bow_predict_seq2seq(enc_seq2seq_inputs, 
                                                        enc_seq2seq_targets,
                                                        enc_seq2seq_lens, 
                                                        embedding_matrix,
                                                        enc_outputs,
                                                        enc_state,
                                                        enc_layers,
                                                        num_paraphrase,
                                                        max_len,
                                                        enc_lens,
                                                        enc_batch_size,
                                                        vocab_size,
                                                        state_size, 
                                                        self.drop_out, 
                                                        self.dec_start_id)
      
    with tf.variable_scope("enc_optimizer"):
      enc_optimizer = tf.train.AdamOptimizer(self.learning_rate_enc)

    with tf.name_scope("enc_output"):
      # top k prediction 
      pred_prob, pred_ind = tf.nn.top_k(bow_topk_prob, max_enc_bow)
      pred_prob_unnorm = pred_prob
      pred_prob /= tf.expand_dims(tf.reduce_sum(pred_prob, axis=1), [1])

      pred_prob_dec, pred_ind_dec = tf.nn.top_k(bow_topk_prob, self.sample_size)
      pred_prob_dec /= tf.expand_dims(tf.reduce_sum(pred_prob_dec, axis=1), [1])

      if(self.mode == "train"):
        with tf.name_scope("enc_loss"):
          # loss function 
          enc_targets = _enc_target_list_to_khot(
            enc_targets, vocab_size, self.pad_id)
          enc_bow_loss = enc_loss_fn(
            self.bow_loss_fn, enc_targets, bow_topk_prob, max_enc_bow)
          if(self.bow_pred_method == "seq2seq"): 
            # pure sequence to sequence for now 
            enc_loss = enc_seq2seq_loss + 0.0 * enc_bow_loss
          else: 
            enc_loss = enc_bow_loss
          enc_train_op = enc_optimizer.minimize(enc_loss)

        # prediction performance monitor during training 
        # write this in a function 
        # TODO: top 10 recall 
        with tf.name_scope("train_output"):
          # encoder training output
          self.enc_train_output = { "enc_train_op": enc_train_op, 
                                    "enc_bow_loss": enc_bow_loss,
                                    "enc_loss": enc_loss}
          bow_metrics_dict = bow_train_monitor(
            bow_topk_prob, pred_ind, vocab_size, enc_batch_size, enc_targets)
          self.enc_train_output.update(bow_metrics_dict)

          if(self.bow_pred_method == "seq2seq"): 
            self.enc_train_output["enc_seq2seq_loss"] = enc_seq2seq_loss

      # encoder inference output
      with tf.name_scope("infer_output"):
        if(self.bow_pred_method == "seq2seq"): 
          (infer_overlap, infer_pred_support, infer_target_support, infer_prec, 
            infer_recl) = bow_seq2seq_metrics(
              enc_targets, enc_infer_pred, vocab_size, self.pad_id)
          self.enc_infer_output = { 
            "enc_infer_overlap": infer_overlap,
            "enc_infer_pred_support": infer_pred_support,
            "enc_infer_target_support": infer_target_support,
            "enc_infer_precision": infer_prec,
            "enc_infer_recall": infer_recl,
            "enc_infer_pred": enc_infer_pred}
        else:
          self.enc_infer_output = { "pred_prob": pred_prob,
                                    "pred_ind": pred_ind,
                                    "pred_prob_dec": pred_prob_dec,
                                    "pred_ind_dec": pred_ind_dec}
        

    # Decoder bow encoding
    # TODO: sampling from encoder topk prediction
    with tf.variable_scope("dec_bow_encoding"):
      dec_bow_mask = tf.expand_dims(
        tf.sequence_mask(dec_bow_len, max_dec_bow, dtype=tf.float32), [2])
      
      # TODO: transformer based encoding, but our primary goal is to test the 
      # effectiveness of sampling, so we skip it for now 
      dec_bow_enc = tf.reduce_mean(dec_bow_mask * dec_bow, axis = 1) # [B, S]

    with tf.variable_scope("decoder"):
      dec_cell = [create_cell("dec-%d" % i, state_size, self.drop_out) 
        for i in range(enc_layers)]
      dec_cell = tf.nn.rnn_cell.MultiRNNCell(dec_cell)

      dec_init_state = (LSTMStateTuple(dec_bow_enc, dec_bow_enc), 
                        LSTMStateTuple(dec_bow_enc, dec_bow_enc))
      dec_proj = tf.layers.Dense(vocab_size, name="dec_proj",
        kernel_initializer=tf.random_normal_initializer(stddev=0.05),
        bias_initializer=tf.constant_initializer(0.))
      dec_memory = dec_bow
      dec_mem_len = dec_bow_len
      dec_max_mem_len = max_dec_bow


      # greedy decoding
      # _, dec_outputs_predict = decoding_infer(self.dec_start_id,
      #                                         dec_cell,
      #                                         dec_proj,
      #                                         embedding_matrix,
      #                                         dec_init_state,
      #                                         dec_bow,
      #                                         dec_batch_size,
      #                                         max_len,
      #                                         dec_bow_len,
      #                                         max_dec_bow,
      #                                         self.is_attn)

      # if(self.mode == "train"):
      #   # training decoding
      #   dec_outputs_train = decoding_train( dec_inputs, 
      #                                       dec_cell, 
      #                                       dec_init_state, 
      #                                       dec_bow,  
      #                                       max_len, 
      #                                       dec_bow_len, 
      #                                       max_dec_bow,
      #                                       self.is_attn)
      #   dec_logits_train = dec_proj(dec_outputs_train)

      dec_outputs_predict, dec_logits_train = decode( 
        self.dec_start_id, dec_inputs, 
        dec_cell, dec_proj, embedding_matrix, 
        dec_init_state, dec_memory, dec_mem_len, dec_max_mem_len, 
        dec_batch_size, max_len, self.sampling_method, self.topk_sampling_size,
        state_size, multi_source=False)

    all_variables = slim.get_variables_to_restore()
    model_variables = [var for var in all_variables 
      if var.name.split("/")[0] == self.model_name]
    print("%s model, variable list:" % self.model_name)
    for v in model_variables: print("  %s" % v.name)
    self.model_saver = tf.train.Saver(model_variables, max_to_keep=3)  

    with tf.variable_scope("dec_optimizer"):
      dec_optimizer = tf.train.AdamOptimizer(self.learning_rate_dec)

    with tf.name_scope("dec_output"):
      if(self.mode == "train"):
        dec_mask = tf.sequence_mask(dec_lens, max_len, dtype=tf.float32)
        dec_loss = tf.contrib.seq2seq.sequence_loss(
          dec_logits_train, dec_targets, dec_mask)
        dec_train_op = dec_optimizer.minimize(dec_loss)

        self.dec_train_output = { "dec_train_op": dec_train_op, 
                                  "dec_loss": dec_loss}
    
      self.dec_infer_output = {"dec_predict": dec_outputs_predict}
    return 
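All three examples build their RNN layers with a create_cell helper that is not included in this excerpt. The sketch below is an assumed, minimal implementation inferred from the call sites (name, state size, dropout rate, plus the optional no_residual flag that only appears in Example #3); the real helper may differ.

import tensorflow as tf

def create_cell(name, state_size, drop_out, no_residual=True):
  """Assumed sketch: one LSTM cell with dropout and an optional residual wrapper."""
  cell = tf.nn.rnn_cell.LSTMCell(
    state_size,
    initializer=tf.random_normal_initializer(stddev=0.05),
    name=name)
  # drop_out is fed through a placeholder in the models above, so keep_prob = 1 - drop_out
  cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=1.0 - drop_out)
  if not no_residual:
    cell = tf.nn.rnn_cell.ResidualWrapper(cell)
  return cell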
Example #2
    def build(self):
        print("Building the language model ... ")

        vocab_size = self.vocab_size
        state_size = self.state_size
        enc_layers = self.enc_layers

        with tf.name_scope("placeholders"):
            enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
            targets = tf.placeholder(tf.int32, [None, None], "targets")
            inp_lens = tf.placeholder(tf.int32, [None], "inp_lens")
            self.drop_out = tf.placeholder(tf.float32, (), "drop_out")

            self.enc_inputs = enc_inputs
            self.inp_lens = inp_lens
            self.targets = targets

        batch_size = tf.shape(enc_inputs)[0]
        max_len = tf.shape(enc_inputs)[1]

        with tf.variable_scope("embeddings"):
            embedding_matrix = tf.get_variable("embedding_matrix",
                                               [vocab_size, state_size])
            enc_inputs = tf.nn.embedding_lookup(embedding_matrix, enc_inputs)

        with tf.variable_scope("encoder"):
            # TODO: residual LSTM, layer normalization
            enc_cell = [
                create_cell("enc-%d" % i, state_size, self.drop_out)
                for i in range(enc_layers)
            ]
            enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
            enc_outputs, enc_state = tf.nn.dynamic_rnn(
                enc_cell,
                enc_inputs,
                sequence_length=inp_lens,
                dtype=tf.float32)

            enc_proj = tf.layers.Dense(vocab_size, name="enc_proj")
            enc_logits = enc_proj(enc_outputs)

            mask = tf.sequence_mask(inp_lens, max_len, dtype=tf.float32)
            loss = tf.contrib.seq2seq.sequence_loss(enc_logits, targets, mask)

            # get variables before optimizer
            all_variables = slim.get_variables_to_restore()
            lm_variables = [
                var for var in all_variables if var.name[:2] == "lm"
            ]
            print("lm model, variable list:")
            for v in lm_variables:
                print("  %s" % v.name)
            self.model_saver = tf.compat.v1.train.Saver(lm_variables,
                                                        max_to_keep=10)

            optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate=self.learning_rate)
            train_op = optimizer.minimize(loss)

            self.train_output = {
                "train_op": train_op,
                "loss": loss,
                "ppl": tf.exp(loss)
            }
            self.eval_output = {"loss": loss, "ppl": tf.exp(loss)}

        return
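A hedged usage sketch for the language model above. The LanguageModel class name and its config argument are hypothetical placeholders, not from the source; the only requirement implied by the variable filter (var.name[:2] == "lm") is that the graph is built inside a variable scope whose name starts with "lm".

import numpy as np
import tensorflow as tf

with tf.variable_scope("lm"):      # so the "lm" variable filter above matches
  lm = LanguageModel(config)       # hypothetical wrapper holding vocab_size, state_size, etc.
  lm.build()

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  feed = {lm.enc_inputs: np.zeros([2, 6], np.int32),  # [batch, time] word ids
          lm.targets:    np.zeros([2, 6], np.int32),
          lm.inp_lens:   np.array([6, 4], np.int32),
          lm.drop_out:   0.0}
  out = sess.run(lm.train_output, feed_dict=feed)
  print("loss %.4f  ppl %.2f" % (out["loss"], out["ppl"]))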
Example #3
def bow_predict_seq2seq(enc_seq2seq_inputs, 
                        enc_seq2seq_targets,
                        enc_seq2seq_lens, 
                        embedding_matrix, 
                        enc_outputs, 
                        enc_state, 
                        enc_layers,
                        num_paraphrase,  
                        max_len, 
                        enc_lens, 
                        batch_size,
                        vocab_size, 
                        state_size, 
                        drop_out, 
                        dec_start_id):
  """bow prediction as sequence to sequence"""

  enc_seq2seq_inputs = tf.nn.embedding_lookup(
    embedding_matrix, enc_seq2seq_inputs) 
  # [B, P, T, S] -> [P, B, T, S]
  enc_seq2seq_inputs = tf.transpose(enc_seq2seq_inputs, [1, 0, 2, 3]) 
  # [B, P, T] -> [P, B, T]
  enc_seq2seq_targets = tf.transpose(enc_seq2seq_targets, [1, 0, 2]) 
  # [B, P] -> [P, B]
  enc_seq2seq_lens = tf.transpose(enc_seq2seq_lens, [1, 0])
  
  init_state = enc_state
  enc_pred_loss = 0.0
  bow_topk_prob = tf.zeros([batch_size, vocab_size])
  enc_infer_pred = []
  for i in range(num_paraphrase):

    # encoder prediction cell 
    enc_pred_cell = [create_cell("enc_pred_p_%d_l_%d" % (i, j), state_size, drop_out) 
          for j in range(enc_layers)]
    enc_pred_cell = tf.nn.rnn_cell.MultiRNNCell(enc_pred_cell)

    # projection 
    enc_pred_proj = tf.layers.Dense(vocab_size, name="enc_pred_proj",
        kernel_initializer=tf.random_normal_initializer(stddev=0.05),
        bias_initializer=tf.constant_initializer(0.))

    # greedy decoding and training 
    _, enc_seq_predict = decoding_infer(dec_start_id,
                                        enc_pred_cell,
                                        enc_pred_proj,
                                        embedding_matrix,
                                        init_state,
                                        enc_outputs,
                                        batch_size,
                                        max_len,
                                        enc_lens,
                                        max_len,
                                        is_attn=True)
    enc_infer_pred.append(enc_seq_predict)

    enc_pred_inputs = enc_seq2seq_inputs[i]
    enc_seq_train = decoding_train( enc_pred_inputs, 
                                    enc_pred_cell, 
                                    init_state, 
                                    enc_outputs,  
                                    max_len, 
                                    enc_lens, 
                                    max_len,
                                    is_attn=True)
    enc_seq_train_logits = enc_pred_proj(enc_seq_train)

    # sequence to sequence loss 
    enc_seq_mask = tf.sequence_mask(
      enc_seq2seq_lens[i], max_len, dtype=tf.float32)
    enc_seq_loss = tf.contrib.seq2seq.sequence_loss(
      enc_seq_train_logits, enc_seq2seq_targets[i], enc_seq_mask)
    enc_pred_loss += enc_seq_loss

    # prediction probability 
    enc_pred_prob = tf.nn.softmax(enc_seq_train_logits) # [B, T, V]
    enc_pred_prob *= tf.expand_dims(enc_seq_mask, [2]) # [B, T, 1]
    enc_pred_prob = tf.reduce_sum(enc_pred_prob, axis=1) # [B, V]
    # NOTE: probability mass for a word that appears in several paraphrases is added repeatedly
    bow_topk_prob += enc_pred_prob 

  enc_pred_loss /= num_paraphrase

  enc_infer_pred = tf.stack(enc_infer_pred) # [P, B, T]
  enc_infer_pred = tf.transpose(enc_infer_pred, [1, 0, 2]) # [B, P, T]
  return bow_topk_prob, enc_pred_loss, enc_infer_pred

  def build(self):
    """Build the model"""
    print("Building the Latent BOW - sequence to sequence model ... ")

    vocab_size = self.vocab_size
    key_size = self.key_size
    state_size = self.state_size
    enc_layers = self.enc_layers
    max_enc_bow = self.max_enc_bow
    lambda_enc_loss = self.lambda_enc_loss

    # Placeholders
    with tf.name_scope("placeholders"):
      enc_keys = tf.placeholder(tf.int32, [None, None], "enc_keys")
      enc_locs = tf.placeholder(tf.int32, [None, None], "enc_locs")
      enc_vals = tf.placeholder(tf.int32, [None, None], "enc_vals")
      enc_lens = tf.placeholder(tf.int32, [None], "enc_lens")
      self.drop_out = tf.placeholder(tf.float32, (), "drop_out")
      self.gumbel_tau = tf.placeholder(tf.float32, (), "gumbel_tau")

      self.enc_keys = enc_keys
      self.enc_locs = enc_locs
      self.enc_vals = enc_vals
      self.enc_lens = enc_lens

      enc_targets = tf.placeholder(tf.int32, [None, None], "enc_targets")
      dec_inputs = tf.placeholder(tf.int32, [None, None], "dec_inputs")
      dec_targets = tf.placeholder(tf.int32, [None, None], "dec_targets")
      dec_lens = tf.placeholder(tf.int32, [None], "dec_lens")

      self.enc_targets = enc_targets
      self.dec_inputs = dec_inputs
      self.dec_targets = dec_targets
      self.dec_lens = dec_lens

    batch_size = tf.shape(enc_keys)[0]
    max_enc_len = tf.shape(enc_keys)[1]
    max_dec_len = tf.shape(dec_targets)[1]

    # Embedding 
    with tf.variable_scope("embeddings"):
      embedding_matrix_vals = tf.get_variable(
        name="embedding_matrix_vals", 
        shape=[vocab_size, state_size],
        dtype=tf.float32,
        initializer=tf.random_normal_initializer(stddev=0.05))
      embedding_matrix_keys = tf.get_variable(
        name="embedding_matrix_keys", 
        shape=[key_size, state_size],
        dtype=tf.float32,
        initializer=tf.random_normal_initializer(stddev=0.05))
      embedding_matrix_locs = tf.get_variable(
        name="embedding_matrix_locs", 
        shape=[100, state_size],
        dtype=tf.float32,
        initializer=tf.random_normal_initializer(stddev=0.05))

      enc_keys = tf.nn.embedding_lookup(embedding_matrix_keys, enc_keys)
      enc_vals = tf.nn.embedding_lookup(embedding_matrix_vals, enc_vals)
      enc_locs = tf.nn.embedding_lookup(embedding_matrix_locs, enc_locs)
      enc_inputs = (enc_keys + enc_vals + enc_locs) / 3.
      dec_inputs = tf.nn.embedding_lookup(embedding_matrix_vals, dec_inputs)

    # Encoder
    with tf.variable_scope("encoder"):
      # TODO: residual LSTM, layer normalization
      enc_cell = [create_cell(
        "enc-%d" % i, state_size, self.drop_out, self.no_residual) 
        for i in range(enc_layers)]
      enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
      enc_outputs, enc_state = tf.nn.dynamic_rnn(enc_cell, enc_inputs,
        sequence_length=enc_lens, dtype=tf.float32)

    # Encoder bow prediction
    with tf.variable_scope("bow_output"):
      bow_topk_prob, gumbel_topk_prob, seq_neighbor_ind, seq_neighbor_prob = \
        bow_predict_seq_tag(vocab_size, batch_size, enc_outputs, enc_lens, 
        max_enc_len, self.is_gumbel, self.gumbel_tau)
      seq_neighbor_output = {"seq_neighbor_ind": seq_neighbor_ind, 
        "seq_neighbor_prob": seq_neighbor_prob}
  
    # Encoder output, loss and metrics 
    with tf.name_scope("enc_output"):
      # top k prediction 
      bow_pred_prob, pred_ind = tf.nn.top_k(bow_topk_prob, max_enc_bow)

      # loss function 
      enc_targets = _enc_target_list_to_khot(
        enc_targets, vocab_size, self.pad_id)
      enc_loss = enc_loss_fn(
        self.bow_loss_fn, enc_targets, bow_topk_prob, max_enc_bow)
      self.train_output = {"enc_loss": enc_loss}

      # performance monitor 
      bow_metrics_dict = bow_train_monitor(
        bow_topk_prob, pred_ind, vocab_size, batch_size, enc_targets)
      self.train_output.update(bow_metrics_dict)

    # Encoder soft sampling 
    with tf.name_scope("gumbel_topk_sampling"):
      sample_ind, sample_prob, sample_memory = bow_gumbel_topk_sampling(
        gumbel_topk_prob, embedding_matrix_vals, self.sample_size, vocab_size)
      sample_memory_lens = tf.ones(batch_size, tf.int32) * self.sample_size
      sample_memory_avg = tf.reduce_mean(sample_memory, 1) # [B, S]

      sample_memory_output = {"bow_pred_ind": pred_ind, 
                              "bow_pred_prob": bow_pred_prob, 
                              "sample_memory_ind": sample_ind, 
                              "sample_memory_prob": sample_prob }

    # Decoder 
    # The initial state of the decoder = 
    #   encoder meaning vector z + encoder bow vector b 
    with tf.variable_scope("decoder"):
      dec_cell = [create_cell(
        "dec-%d" % i, state_size, self.drop_out, self.no_residual) 
        for i in range(enc_layers)]
      dec_cell = tf.nn.rnn_cell.MultiRNNCell(dec_cell)
      dec_proj = tf.layers.Dense(vocab_size, name="dec_proj",
        kernel_initializer=tf.random_normal_initializer(stddev=0.05),
        bias_initializer=tf.constant_initializer(0.))
      dec_ptr_k_proj = [
        tf.layers.Dense(state_size, name="dec_ptr_k_proj_%d" % pi,
        kernel_initializer=tf.random_normal_initializer(stddev=0.05),
        bias_initializer=tf.constant_initializer(0.)) 
        for pi in range(self.num_pointers)]
      dec_ptr_g_proj = tf.layers.Dense(1, name="dec_ptr_g_proj",
        kernel_initializer=tf.random_normal_initializer(stddev=0.05),
        bias_initializer=tf.constant_initializer(0.),
        activation=tf.nn.sigmoid)
      bow_cond_gate_proj = tf.layers.Dense(1, name="bow_cond_gate_proj",
        kernel_initializer=tf.random_normal_initializer(stddev=0.05),
        bias_initializer=tf.constant_initializer(0.),
        activation=tf.nn.sigmoid)

      dec_init_state = []
      for l in range(enc_layers):
        dec_init_state.append(LSTMStateTuple(c=enc_state[0].c, 
                                h=enc_state[0].h + sample_memory_avg))
      dec_init_state = tuple(dec_init_state)

      # if(enc_layers == 2):
      #   dec_init_state = (LSTMStateTuple( c=enc_state[0].c, 
      #                                     h=enc_state[0].h + sample_memory_avg),
      #                     LSTMStateTuple( c=enc_state[1].c, 
      #                                     h=enc_state[1].h + sample_memory_avg) )
      # elif(enc_layers == 4):
      #   dec_init_state = (LSTMStateTuple(c=enc_state[0].c, 
      #                       h=enc_state[0].h + sample_memory_avg),
      #                     LSTMStateTuple( c=enc_state[1].c, 
      #                       h=enc_state[1].h + sample_memory_avg) )
      # else: raise Exception('enc_layers not in [2, 4]')

      if(self.source_attn):
        # [B, M + T, S]
        dec_memory = [sample_memory, enc_outputs]
        dec_mem_len = [sample_memory_lens, enc_lens]
        dec_max_mem_len = [self.sample_size, max_enc_len]
      else:
        dec_memory = sample_memory
        dec_mem_len = sample_memory_lens
        dec_max_mem_len = tf.shape(dec_memory)[1] 

      if(self.bow_cond): bow_cond = sample_memory_avg
      else: bow_cond = None

      if(self.bow_cond_gate == False): bow_cond_gate_proj = None

      (dec_outputs_predict, dec_logits_train, dec_prob_train, pointer_ent, 
        avg_max_ptr, avg_num_copy) = decode( 
        self.dec_start_id, dec_inputs, 
        dec_cell, dec_proj, embedding_matrix_vals, 
        dec_init_state, dec_memory, dec_mem_len, dec_max_mem_len, 
        batch_size, max_dec_len, self.sampling_method, self.topk_sampling_size,
        state_size, multi_source=True, copy=self.copy, copy_ind=sample_ind,
        dec_ptr_g_proj=dec_ptr_g_proj, dec_ptr_k_proj=dec_ptr_k_proj,
        bow_cond=bow_cond, bow_cond_gate_proj=bow_cond_gate_proj)

    # model saver, before the optimizer 
    all_variables = slim.get_variables_to_restore()
    model_variables = [var for var in all_variables 
      if var.name.split("/")[0] == self.model_name]
    print("%s model, variable list:" % self.model_name)
    for v in model_variables: print("  %s" % v.name)
    self.model_saver = tf.train.Saver(model_variables, max_to_keep=3)

    with tf.variable_scope("optimizer"):
      optimizer = tf.train.AdamOptimizer(self.learning_rate)

    # decoder output, training and inference, combined with encoder loss 
    with tf.name_scope("dec_output"):
      dec_mask = tf.sequence_mask(dec_lens, max_dec_len, dtype=tf.float32)
      if(self.copy == False):
        dec_loss = tf.contrib.seq2seq.sequence_loss(
          dec_logits_train, dec_targets, dec_mask)
      else: 
        dec_loss = _copy_loss(dec_prob_train, dec_targets, dec_mask)

      loss = dec_loss + lambda_enc_loss * enc_loss
      train_op = optimizer.minimize(loss)

      dec_output = {"train_op": train_op, "dec_loss": dec_loss, "loss": loss}
      self.train_output.update(dec_output)
      if(self.copy):
        pointer_ent =\
          tf.reduce_sum(pointer_ent * dec_mask) / tf.reduce_sum(dec_mask)
        self.train_output['pointer_ent'] = pointer_ent
        avg_max_ptr =\
          tf.reduce_sum(avg_max_ptr * dec_mask) / tf.reduce_sum(dec_mask)
        self.train_output['avg_max_ptr'] = avg_max_ptr
        avg_num_copy = tf.reduce_sum(avg_num_copy * dec_mask, 1)
        avg_num_copy = tf.reduce_mean(avg_num_copy)
        self.train_output['avg_num_copy'] = avg_num_copy

      self.infer_output = {"dec_predict": dec_outputs_predict}
      dec_out_mem_ratio = _calculate_dec_out_mem_ratio(dec_outputs_predict, 
        sample_ind, vocab_size, self.pad_id, self.dec_start_id, self.dec_end_id)
      self.infer_output.update(dec_out_mem_ratio)
      self.infer_output.update(sample_memory_output)
      self.infer_output.update(seq_neighbor_output)
    return
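Both Example #1 and Example #3 convert the padded enc_targets ids into a bag-of-words indicator with _enc_target_list_to_khot before computing the BOW loss. That helper is not shown in this excerpt; the following is a minimal sketch under the assumption that it builds a k-hot [batch, vocab] vector and zeroes the pad_id column so padding never counts as a target.

import tensorflow as tf

def _enc_target_list_to_khot(enc_targets, vocab_size, pad_id):
  """Assumed sketch: padded [B, T] word ids -> k-hot [B, V] indicator with pad_id masked."""
  # one-hot every position, then collapse the time axis: [B, T, V] -> [B, V]
  khot = tf.reduce_max(
    tf.one_hot(enc_targets, vocab_size, dtype=tf.float32), axis=1)
  # zero out the padding column
  pad_mask = 1.0 - tf.one_hot([pad_id], vocab_size, dtype=tf.float32)  # [1, V]
  return khot * pad_mask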