예제 #1
0
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)
예제 #2
0
def train_action(opt, logger, writer, train_num):
    try:
        opt.rl_weight = 1 - opt.mle_weight

        if opt.load_model:
            opt.load_model = "/%s/%s" % (opt.word_emb_type, opt.load_model)

        logger.info(u'------Training Setting--------')

        logger.info("Traing Type :%s" % (config.data_type))
        if opt.train_mle == True:
            logger.info("Training mle: %s, mle weight: %.2f" %
                        (opt.train_mle, opt.mle_weight))

        if opt.train_rl == True:
            logger.info("Training rl: %s, rl weight: %.2f \n" %
                        (opt.train_rl, opt.rl_weight))

        if opt.word_emb_type == 'bert': config.emb_dim = 768
        if opt.pre_train_emb:
            logger.info('use pre_train_%s vocab_size %s \n' %
                        (opt.word_emb_type, config.vocab_size))

        else:
            logger.info('use %s vocab_size %s \n' %
                        (opt.word_emb_type, config.vocab_size))

        logger.info("intra_encoder: %s intra_decoder: %s \n" %
                    (config.intra_encoder, config.intra_decoder))
        if opt.word_emb_type in ['word2Vec', 'glove', 'FastText']:
            config.vocab_path = config.Data_path + "Embedding/%s/word.vocab" % (
                opt.word_emb_type)
            # config.vocab_size = len(open(config.vocab_path).readlines())
            vocab = Vocab(config.vocab_path, config.vocab_size)
        train_processor = Train(opt, vocab, logger, writer, train_num)
        train_processor.trainIters()
    except KeyError as e:
        print(e)
        traceback = sys.exc_info()[2]
        logger.error(sys.exc_info())
        logger.error(traceback.tb_lineno)
        logger.error(e)
    logger.info(u'------Training END--------')
예제 #3
0
 def __init__(self, model_file_or_model, vocab=None):
     if vocab is None:
         self.vocab = Vocab(config.vocab_path, config.vocab_size)
     else:
         assert isinstance(vocab, Vocab)
         self.vocab = vocab
     self.batcher = Batcher(config.eval_data_path,
                            self.vocab,
                            mode='eval',
                            batch_size=config.batch_size,
                            single_pass=True)
     time.sleep(15)
     if isinstance(model_file_or_model, str):
         self.model = Model(device, model_file_or_model, is_eval=True)
     elif isinstance(model_file_or_model, Model):
         self.model = model_file_or_model
     else:
         raise ValueError("Cannot build model from type %s" %
                          type(model_file_or_model))
예제 #4
0
    def __init__(self, model_file_path, vocab=None):
        if vocab != None:
            self.vocab = vocab
        else:
            self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.compat.v1.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)
예제 #5
0
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)
    def __init__(self):
        """
        Input:
            vocab_path = "xxx/finished_files/vocab",
            vocab_size = 50000

        Output:
            class object: self.vocab --> (dicts `_word_to_id` and `_id_to_word`)
        """
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        """
        Input:
            train_data_path = "xxx/finished_files/chunked/train_*",
            self.vocab: class object,
            mode = 'train', for training,
            batch_size = 8,
            single_pass = False

        Output:
            class object: self.vocab,
                (dicts `_word_to_id` and `_id_to_word`)
        """
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)
예제 #7
0
    def __init__(self, model_file_path=None):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        if not model_file_path:
            train_dir = os.path.join(config.log_root,
                                     'train_%d' % (int(time.time())))
            if not os.path.exists(train_dir):
                os.mkdir(train_dir)
        else:
            train_dir = re.sub('/model/model.*', '', model_file_path)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.create_file_writer(train_dir)
예제 #8
0
    def __init__(self, args, model_name = None):
        self.args = args
        vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
        print(args.vocab_path)
        print(vocab)
        self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
        self.batcher = Batcher(args.train_data_path, self.vocab, mode='train',
                               batch_size=args.batch_size, single_pass=False, args=args)
        self.eval_batcher = Batcher(args.eval_data_path, self.vocab, mode='eval',
                                    batch_size=args.batch_size, single_pass=True, args=args)
        time.sleep(15)

        if model_name is None:
            self.train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        else:
            self.train_dir = os.path.join(config.log_root, model_name)

        if not os.path.exists(self.train_dir):
            os.mkdir(self.train_dir)

        self.model_dir = os.path.join(self.train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
예제 #9
0
    print("Finished writing vocab file")

# In[51]:

word = wvmodel.wv.index2entity[25]
vector = wvmodel.wv.vectors[25]
print(word)
# print(vector)

# In[52]:

import torch
from data_util.data import Vocab
vocab_size = len(wvmodel.vocab) + 1

vocab = Vocab('Embedding/category/word2Vec/word.vocab', vocab_size)

embed_size = 300
weight = torch.zeros(vocab_size, embed_size)

for i in range(len(vocab._id_to_word.keys())):
    try:
        vocab_word = vocab._id_to_word[i + 4]
        w2vec_word = w2vec.wv.index2entity[i]
    except Exception as e:
        continue
    if i + 4 > vocab_size: break
    #     print(vocab_word,w2vec_word)
    weight[i + 4, :] = torch.from_numpy(w2vec.wv.vectors[i])

embedding = torch.nn.Embedding.from_pretrained(weight)
예제 #10
0
 def __init__(self, vocab_path, model_path, model):
     self.vocab = vocab = Vocab(vocab_path)
     self.model_path = model_path
     self.model = model
예제 #11
0
class Train(object):
    def __init__(self, opt):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        self.opt = opt
        self.start_id = self.vocab.word2id(data.START_DECODING)
        self.end_id = self.vocab.word2id(data.STOP_DECODING)
        self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        time.sleep(5)

    def save_model(self, iter):
        save_path = config.save_model_path + "/%07d.tar" % iter
        T.save(
            {
                "iter": iter + 1,
                "model_dict": self.model.state_dict(),
                "trainer_dict": self.trainer.state_dict()
            }, save_path)

    def setup_train(self):
        self.model = Model()
        self.model = get_cuda(self.model)
        self.trainer = T.optim.Adam(self.model.parameters(), lr=config.lr)
        start_iter = 0
        if self.opt.load_model is not None:
            load_model_path = os.path.join(config.save_model_path,
                                           self.opt.load_model)
            checkpoint = T.load(load_model_path)
            start_iter = checkpoint["iter"]
            self.model.load_state_dict(checkpoint["model_dict"])
            self.trainer.load_state_dict(checkpoint["trainer_dict"])
            print("Loaded model at " + load_model_path)
        if self.opt.new_lr is not None:
            self.trainer = T.optim.Adam(self.model.parameters(),
                                        lr=self.opt.new_lr)
        return start_iter

    def train_batch_MLE(self, enc_out, enc_hidden, enc_padding_mask, ct_e,
                        extra_zeros, enc_batch_extend_vocab, batch):
        ''' Calculate Negative Log Likelihood Loss for the given batch. In order to reduce exposure bias,
                pass the previous generated token as input with a probability of 0.25 instead of ground truth label
        Args:
        :param enc_out: Outputs of the encoder for all time steps (batch_size, length_input_sequence, 2*hidden_size)
        :param enc_hidden: Tuple containing final hidden state & cell state of encoder. Shape of h & c: (batch_size, hidden_size)
        :param enc_padding_mask: Mask for encoder input; Tensor of size (batch_size, length_input_sequence) with values of 0 for pad tokens & 1 for others
        :param ct_e: encoder context vector for time_step=0 (eq 5 in https://arxiv.org/pdf/1705.04304.pdf)
        :param extra_zeros: Tensor used to extend vocab distribution for pointer mechanism
        :param enc_batch_extend_vocab: Input batch that stores OOV ids
        :param batch: batch object
        '''
        dec_batch, max_dec_len, dec_lens, target_batch = get_dec_data(
            batch)  #Get input and target batchs for training decoder
        step_losses = []
        s_t = (enc_hidden[0], enc_hidden[1])  #Decoder hidden states
        x_t = get_cuda(T.LongTensor(len(enc_out)).fill_(
            self.start_id))  #Input to the decoder
        prev_s = None  #Used for intra-decoder attention (section 2.2 in https://arxiv.org/pdf/1705.04304.pdf)
        sum_temporal_srcs = None  #Used for intra-temporal attention (section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)
        for t in range(min(max_dec_len, config.max_dec_steps)):
            use_gound_truth = get_cuda((T.rand(len(enc_out)) > 0.25)).long(
            )  #Probabilities indicating whether to use ground truth labels instead of previous decoded tokens
            x_t = use_gound_truth * dec_batch[:, t] + (
                1 - use_gound_truth
            ) * x_t  #Select decoder input based on use_ground_truth probabilities
            x_t = self.model.embeds(x_t)
            final_dist, s_t, ct_e, sum_temporal_srcs, prev_s = self.model.decoder(
                x_t, s_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
                enc_batch_extend_vocab, sum_temporal_srcs, prev_s)
            target = target_batch[:, t]
            log_probs = T.log(final_dist + config.eps)
            step_loss = F.nll_loss(log_probs,
                                   target,
                                   reduction="none",
                                   ignore_index=self.pad_id)
            step_losses.append(step_loss)
            x_t = T.multinomial(final_dist, 1).squeeze(
            )  #Sample words from final distribution which can be used as input in next time step
            is_oov = (x_t >= config.vocab_size
                      ).long()  #Mask indicating whether sampled word is OOV
            x_t = (1 - is_oov) * x_t.detach() + (
                is_oov) * self.unk_id  #Replace OOVs with [UNK] token

        losses = T.sum(
            T.stack(step_losses, 1), 1
        )  #unnormalized losses for each example in the batch; (batch_size)
        batch_avg_loss = losses / dec_lens  #Normalized losses; (batch_size)
        mle_loss = T.mean(batch_avg_loss)  #Average batch loss
        return mle_loss

    def train_batch_RL(self, enc_out, enc_hidden, enc_padding_mask, ct_e,
                       extra_zeros, enc_batch_extend_vocab, article_oovs,
                       greedy):
        '''Generate sentences from decoder entirely using sampled tokens as input. These sentences are used for ROUGE evaluation
        Args
        :param enc_out: Outputs of the encoder for all time steps (batch_size, length_input_sequence, 2*hidden_size)
        :param enc_hidden: Tuple containing final hidden state & cell state of encoder. Shape of h & c: (batch_size, hidden_size)
        :param enc_padding_mask: Mask for encoder input; Tensor of size (batch_size, length_input_sequence) with values of 0 for pad tokens & 1 for others
        :param ct_e: encoder context vector for time_step=0 (eq 5 in https://arxiv.org/pdf/1705.04304.pdf)
        :param extra_zeros: Tensor used to extend vocab distribution for pointer mechanism
        :param enc_batch_extend_vocab: Input batch that stores OOV ids
        :param article_oovs: Batch containing list of OOVs in each example
        :param greedy: If true, performs greedy based sampling, else performs multinomial sampling
        Returns:
        :decoded_strs: List of decoded sentences
        :log_probs: Log probabilities of sampled words
        '''
        s_t = enc_hidden  #Decoder hidden states
        x_t = get_cuda(T.LongTensor(len(enc_out)).fill_(
            self.start_id))  #Input to the decoder
        prev_s = None  #Used for intra-decoder attention (section 2.2 in https://arxiv.org/pdf/1705.04304.pdf)
        sum_temporal_srcs = None  #Used for intra-temporal attention (section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)
        inds = []  #Stores sampled indices for each time step
        decoder_padding_mask = []  #Stores padding masks of generated samples
        log_probs = []  #Stores log probabilites of generated samples
        mask = get_cuda(
            T.LongTensor(len(enc_out)).fill_(1)
        )  #Values that indicate whether [STOP] token has already been encountered; 1 => Not encountered, 0 otherwise

        for t in range(config.max_dec_steps):
            x_t = self.model.embeds(x_t)
            probs, s_t, ct_e, sum_temporal_srcs, prev_s = self.model.decoder(
                x_t, s_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
                enc_batch_extend_vocab, sum_temporal_srcs, prev_s)
            if greedy is False:
                multi_dist = Categorical(probs)
                x_t = multi_dist.sample()  #perform multinomial sampling
                log_prob = multi_dist.log_prob(x_t)
                log_probs.append(log_prob)
            else:
                _, x_t = T.max(probs, dim=1)  #perform greedy sampling
            x_t = x_t.detach()
            inds.append(x_t)
            mask_t = get_cuda(T.zeros(
                len(enc_out)))  #Padding mask of batch for current time step
            mask_t[
                mask ==
                1] = 1  #If [STOP] is not encountered till previous time step, mask_t = 1 else mask_t = 0
            mask[
                (mask == 1) + (x_t == self.end_id) ==
                2] = 0  #If [STOP] is not encountered till previous time step and current word is [STOP], make mask = 0
            decoder_padding_mask.append(mask_t)
            is_oov = (x_t >= config.vocab_size
                      ).long()  #Mask indicating whether sampled word is OOV
            x_t = (1 - is_oov) * x_t + (
                is_oov) * self.unk_id  #Replace OOVs with [UNK] token

        inds = T.stack(inds, dim=1)
        decoder_padding_mask = T.stack(decoder_padding_mask, dim=1)
        if greedy is False:  #If multinomial based sampling, compute log probabilites of sampled words
            log_probs = T.stack(log_probs, dim=1)
            log_probs = log_probs * decoder_padding_mask  #Not considering sampled words with padding mask = 0
            lens = T.sum(decoder_padding_mask,
                         dim=1)  #Length of sampled sentence
            log_probs = T.sum(
                log_probs, dim=1
            ) / lens  # (bs,)                                     #compute normalizied log probability of a sentence
        decoded_strs = []
        for i in range(len(enc_out)):
            id_list = inds[i].cpu().numpy()
            oovs = article_oovs[i]
            S = data.outputids2words(
                id_list, self.vocab,
                oovs)  # Generate sentence corresponding to sampled words
            try:
                end_idx = S.index(data.STOP_DECODING)
                S = S[:end_idx]
            except ValueError:
                S = S
            if len(
                    S
            ) < 2:  #If length of sentence is less than 2 words, replace it with "xxx"; Avoids setences like "." which throws error while calculating ROUGE
                S = ["xxx"]
            S = " ".join(S)
            decoded_strs.append(S)

        return decoded_strs, log_probs

    def reward_function(self, decoded_sents, original_sents):
        rouge = Rouge()
        try:
            scores = rouge.get_scores(decoded_sents, original_sents)
        except Exception:
            print(
                "Rouge failed for multi sentence evaluation.. Finding exact pair"
            )
            scores = []
            for i in range(len(decoded_sents)):
                try:
                    score = rouge.get_scores(decoded_sents[i],
                                             original_sents[i])
                except Exception:
                    print("Error occured at:")
                    print("decoded_sents:", decoded_sents[i])
                    print("original_sents:", original_sents[i])
                    score = [{"rouge-l": {"f": 0.0}}]
                scores.append(score[0])
        rouge_l_f1 = [score["rouge-l"]["f"] for score in scores]
        rouge_l_f1 = get_cuda(T.FloatTensor(rouge_l_f1))
        return rouge_l_f1

    # def write_to_file(self, decoded, max, original, sample_r, baseline_r, iter):
    #     with open("temp.txt", "w") as f:
    #         f.write("iter:"+str(iter)+"\n")
    #         for i in range(len(original)):
    #             f.write("dec: "+decoded[i]+"\n")
    #             f.write("max: "+max[i]+"\n")
    #             f.write("org: "+original[i]+"\n")
    #             f.write("Sample_R: %.4f, Baseline_R: %.4f\n\n"%(sample_r[i].item(), baseline_r[i].item()))

    def train_one_batch(self, batch, iter):
        enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, context = get_enc_data(
            batch)

        enc_batch = self.model.embeds(
            enc_batch)  #Get embeddings for encoder input
        enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)

        # -------------------------------Summarization-----------------------
        if self.opt.train_mle == "yes":  #perform MLE training
            mle_loss = self.train_batch_MLE(enc_out, enc_hidden,
                                            enc_padding_mask, context,
                                            extra_zeros,
                                            enc_batch_extend_vocab, batch)
        else:
            mle_loss = get_cuda(T.FloatTensor([0]))
        # --------------RL training-----------------------------------------------------
        if self.opt.train_rl == "yes":  #perform reinforcement learning training
            # multinomial sampling
            sample_sents, RL_log_probs = self.train_batch_RL(
                enc_out,
                enc_hidden,
                enc_padding_mask,
                context,
                extra_zeros,
                enc_batch_extend_vocab,
                batch.art_oovs,
                greedy=False)
            with T.autograd.no_grad():
                # greedy sampling
                greedy_sents, _ = self.train_batch_RL(enc_out,
                                                      enc_hidden,
                                                      enc_padding_mask,
                                                      context,
                                                      extra_zeros,
                                                      enc_batch_extend_vocab,
                                                      batch.art_oovs,
                                                      greedy=True)

            sample_reward = self.reward_function(sample_sents,
                                                 batch.original_abstracts)
            baseline_reward = self.reward_function(greedy_sents,
                                                   batch.original_abstracts)
            # if iter%200 == 0:
            #     self.write_to_file(sample_sents, greedy_sents, batch.original_abstracts, sample_reward, baseline_reward, iter)
            rl_loss = -(
                sample_reward - baseline_reward
            ) * RL_log_probs  #Self-critic policy gradient training (eq 15 in https://arxiv.org/pdf/1705.04304.pdf)
            rl_loss = T.mean(rl_loss)

            batch_reward = T.mean(sample_reward).item()
        else:
            rl_loss = get_cuda(T.FloatTensor([0]))
            batch_reward = 0

    # ------------------------------------------------------------------------------------
        self.trainer.zero_grad()
        (self.opt.mle_weight * mle_loss +
         self.opt.rl_weight * rl_loss).backward()
        self.trainer.step()
        return mle_loss.item(), batch_reward

    def trainIters(self):
        iter = self.setup_train()
        count = mle_total = r_total = 0
        while iter <= config.max_iterations:
            batch = self.batcher.next_batch()
            try:
                mle_loss, r = self.train_one_batch(batch, iter)
            except KeyboardInterrupt:
                print(
                    "-------------------Keyboard Interrupt------------------")
                exit(0)

            mle_total += mle_loss
            r_total += r
            count += 1
            iter += 1

            if iter % 50 == 0:
                mle_avg = mle_total / count
                r_avg = r_total / count
                logger.info("iter:" + str(iter) + "  mle_loss:" +
                            "%.3f" % mle_avg + "  reward:" + "%.4f" % r_avg)
                count = mle_total = r_total = 0

            if iter % 5000 == 0:
                self.save_model(iter)
예제 #12
0
class Validate(object):
    def __init__(self, data_path, batch_size=config.batch_size):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=batch_size,
                               single_pass=True)

        time.sleep(5)

    def setup_valid(self):
        self.model = Model()
        self.model = get_cuda(self.model)
        checkpoint = T.load(config.load_model_path)
        self.model.load_state_dict(checkpoint["model_dict"])

    def validate_batch(self):

        self.setup_valid()
        batch = self.batcher.next_batch()
        start_id = self.vocab.word2id(data.START_DECODING)
        end_id = self.vocab.word2id(data.STOP_DECODING)
        unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        decoded_sents = []
        original_sents = []
        rouge = Rouge()
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, c_t_1 = get_enc_data(
                batch)

            with T.autograd.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)

            with T.autograd.no_grad():
                pred_ids = beam_search_on_batch(enc_hidden, enc_out,
                                                enc_padding_mask, c_t_1,
                                                extra_zeros,
                                                enc_batch_extend_vocab,
                                                self.model, start_id, end_id,
                                                unk_id)

            for i in range(len(pred_ids)):
                decoded_words = data.outputids2words(pred_ids[i], self.vocab,
                                                     batch.art_oovs[i])
                if len(decoded_words) < 2:
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                tar = batch.original_abstracts[i]
                original_sents.append(tar)

            batch = self.batcher.next_batch()

        load_file = config.load_model_path.split("/")[-1]

        scores = rouge.get_scores(decoded_sents, original_sents, avg=True)
        rouge_l = scores["rouge-l"]["f"]
        print(load_file, "rouge_l:", "%.4f" % rouge_l)
예제 #13
0
파일: sandbox.py 프로젝트: amitz25/LongSum
from data_util.batcher import Batcher
from data_util.data import Vocab
import numpy as np
import json
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
#import nltk
#from nltk.corpus import stopwords

vocab = Vocab(config.vocab_path, config.vocab_size)
batcher = Batcher(config.train_data_path, vocab, mode='train', batch_size=config.batch_size, single_pass=False)
batches = 1

def google_encoder_metric(abstract_sents, article_sents):
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
    rotation = 90

    flatten = lambda l: [item for article_sents in l for item in article_sents]
    article_sentences = flatten(article_sents)

    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])

        #article_sentences = article_sentences[:3]
        #abstract_sents = abstract_sents[:2]
예제 #14
0
파일: decode.py 프로젝트: amitz25/LongSum
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)


    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            with torch.no_grad():
                best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(output_ids, self.vocab,
                                                 (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            print("===============SUMMARY=============")
            print(' '.join(decoded_words))

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d example in %d sec'%(counter, time.time() - start))
                start = time.time()

            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)


    def beam_search(self, batch):
        #batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0, sent_lens = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)
        if config.use_maxpool_init_ctx:
            c_t_0 = max_encoder_output

        gamma = None
        if config.is_sentence_filtering:
            gamma, sent_dists = self.model.sentence_filterer(encoder_outputs, sent_lens)

        section_outputs, section_hidden = self.model.section_encoder(s_t_0)
        s_t_0 = self.model.section_reduce_state(section_hidden)

        dec_h, dec_c = s_t_0 # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        #decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                      log_probs=[0.0],
                      state=(dec_h[0], dec_c[0]),
                      context = c_t_0[0],
                      coverage=(coverage_t_0[0] if config.is_coverage else None))
                 for _ in range(config.beam_size)]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h =[]
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(y_t_1, s_t_1,
                                                        encoder_outputs, section_outputs, enc_padding_mask,
                                                        c_t_1, extra_zeros, enc_batch_extend_vocab, coverage_t_1, gamma)

            topk_log_probs, topk_ids = torch.topk(final_dist, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                   log_prob=topk_log_probs[i, j].item(),
                                   state=state_i,
                                   context=context_i,
                                   coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
class Main(object):
    def __init__(self):
        self.vocab = Vocab(VOCAB_PATH, VOCAB_SIZE)
        self.batcher = Batcher(TRAIN_DATA_PATH, self.vocab, mode = 'train',batch_size = BATCH_SIZE, single_pass = False)
        self.start_id = self.vocab.word2id(data.START_DECODING)
        self.end_id = self.vocab.word2id(data.STOP_DECODING)
        self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
        self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        self.model = MyModel().to(DEVICE)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LR)


    def beamSearch(self,enc_hid, enc_out, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab):
        batch_size = len(enc_hid[0])
        beam_idx = torch.LongTensor(list(range(batch_size)))
        beams = [Beam(self.start_id, self.end_id, self.unk_id, (enc_hid[0][i], enc_hid[1][i]), ct_e[i]) for i in range(batch_size)]
        n_rem = batch_size
        sum_exp = None
        prev_s = None

        for t in range(MAX_DEC_STEPS):
            x_t = torch.stack(
                [beam.getTokens() for beam in beams if beam.done == False]  # remaining(rem),beam
            ).contiguous().view(-1)
            x_t = self.model.embeds(x_t)

            dec_h = torch.stack(
                [beam.hid_h for beam in beams if beam.done == False]
            ).contiguous().view(-1, HIDDEN_DIM)
            dec_c = torch.stack(
                [beam.hid_c for beam in beams if beam.done == False]
            ).contiguous().view(-1, HIDDEN_DIM)

            ct_e = torch.stack(
                [beam.context for beam in beams if beam.done == False]
            ).contiguous().view(-1, 2 * HIDDEN_DIM)

            if sum_exp is not None:
                sum_exp = torch.stack(
                    [beam.sum_exp for beam in beams if beam.done == False]
                ).contiguous().view(-1, enc_out.size(1))

            if prev_s is not None:
                prev_s = torch.stack(
                    [beam.prev_s for beam in beams if beam.done == False]
                )
                # try:
                prev_s = prev_s.contiguous().view(-1, t + 1, HIDDEN_DIM)
                # print(prev_s.shape)
                # except:
                #     continue
            s_t = (dec_h, dec_c)
            enc_out_beam = enc_out[beam_idx].view(n_rem, -1).repeat(1, BEAM_SIZE).view(-1, enc_out.size(1),
                                                                                              enc_out.size(2))
            enc_pad_mask_beam = enc_padding_mask[beam_idx].repeat(1, BEAM_SIZE).view(-1,
                                                                                            enc_padding_mask.size(1))

            extra_zeros_beam = None
            if extra_zeros is not None:
                extra_zeros_beam = extra_zeros[beam_idx].repeat(1, BEAM_SIZE).view(-1, extra_zeros.size(1))

            enc_extend_vocab_beam = enc_batch_extend_vocab[beam_idx].repeat(1, BEAM_SIZE).view(-1,enc_batch_extend_vocab.size(1))
            # print(enc_out_beam.shape)
            # try:
            final_dist, (dec_h, dec_c), ct_e, sum_exp, prev_s = self.model.decoder(x_t, s_t, enc_out_beam,enc_pad_mask_beam, ct_e,extra_zeros_beam,enc_extend_vocab_beam,sum_exp,prev_s)

            # except:
            #     continue
            # print(prev_s.shape)
            final_dist = final_dist.view(n_rem, BEAM_SIZE, -1)
            dec_h = dec_h.view(n_rem, BEAM_SIZE, -1)
            dec_c = dec_c.view(n_rem, BEAM_SIZE, -1)
            ct_e = ct_e.view(n_rem, BEAM_SIZE, -1)

            if sum_exp is not None:
                sum_exp = sum_exp.view(n_rem, BEAM_SIZE, -1)  # rem, beam, n_seq

            if prev_s is not None:
                prev_s = prev_s.view(n_rem, BEAM_SIZE, -1, HIDDEN_DIM)  # rem, beam, t
                # print("prev_s",prev_s.shape)

            active = []

            for i in range(n_rem):
                b = beam_idx[i].item()
                beam = beams[b]
                if beam.done:
                    continue

                sum_exp_i = prev_s_i = None
                if sum_exp is not None:
                    sum_exp_i = sum_exp[i]
                if prev_s is not None:
                    prev_s_i = prev_s[i]
                beam.advance(final_dist[i], (dec_h[i], dec_c[i]), ct_e[i], sum_exp_i, prev_s_i)
                if beam.done == False:
                    active.append(b)

            # print(len(active))
            if len(active) == 0:
                break

            beam_idx = torch.LongTensor(active)
            n_rem = len(beam_idx)

        predicted_words = []
        for beam in beams:
            predicted_words.append(beam.getBest())
        # print(len(predicted_words[0]))
        return predicted_words

    #获取encode的数据
    def getEncData(self,batch):
        batch_size = len(batch.enc_lens)
        enc_batch = torch.from_numpy(batch.enc_batch).long()
        enc_padding_mask = torch.from_numpy(batch.enc_padding_mask).float()

        enc_lens = batch.enc_lens

        ct_e = torch.zeros(batch_size, 2 * HIDDEN_DIM)

        enc_batch = enc_batch.to(DEVICE)
        enc_padding_mask = enc_padding_mask.to(DEVICE)

        ct_e = ct_e.to(DEVICE)

        enc_batch_extend_vocab = None
        if batch.enc_batch_extend_vocab is not None:
            enc_batch_extend_vocab = torch.from_numpy(batch.enc_batch_extend_vocab).long()
            enc_batch_extend_vocab = enc_batch_extend_vocab.to(DEVICE)

        extra_zeros = None
        if batch.max_art_oovs > 0:
            extra_zeros = torch.zeros(batch_size, batch.max_art_oovs)
            extra_zeros = extra_zeros.to(DEVICE)

        return enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e

    #获取decode数据
    def getDecData(self,batch):
        dec_batch = torch.from_numpy(batch.dec_batch).long()
        dec_lens = batch.dec_lens
        max_dec_len = np.max(dec_lens)
        dec_lens = torch.from_numpy(batch.dec_lens).float()

        target_batch = torch.from_numpy(batch.target_batch).long()

        dec_batch = dec_batch.to(DEVICE)
        dec_lens = dec_lens.to(DEVICE)
        target_batch = target_batch.to(DEVICE)

        return dec_batch, max_dec_len, dec_lens, target_batch

    #最大似然训练
    def trainMLEStep(self, batch):
        enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = self.getEncData(batch)
        # print(enc_lens)
        enc_batch = self.model.embeds(enc_batch)
        #将输入直接传入编码器
        enc_output,enc_hidden = self.model.encoder(enc_batch,enc_lens)
        # print(enc_output.shape)
        dec_batch, max_dec_len, dec_lens, target_batch = self.getDecData(batch)
        total_loss = 0
        s_t = (enc_hidden[0], enc_hidden[1])
        x_t =torch.LongTensor(len(enc_output)).fill_(self.start_id).to(DEVICE)
        # print(x_t.shape)
        prev_s = None
        sum_exp = None
        #最大解码步数,每次解码一个单词
        for t in range(min(max_dec_len, MAX_DEC_STEPS)):
            # print(max_dec_len)
            choice = (torch.rand(len(enc_output)) > 0.25).long().to(DEVICE)
            #选取部分decoder_input和一部分
            x_t = choice * dec_batch[:, t] + (1 - choice) * x_t
            x_t = self.model.embeds(x_t)
            # print(x_t.shape)
            final_dist, s_t, ct_e, sum_exp, prev_s = self.model.decoder(x_t, s_t, enc_output, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab, sum_exp, prev_s)
            # print(prev_s.shape)
            target = target_batch[:, t]
            # print(target_batch.shape)
            log_probs = torch.log(final_dist + EPS)
            step_loss = F.nll_loss(log_probs, target, reduction="none", ignore_index=self.pad_id)
            total_loss = total_loss + step_loss
            x_t = torch.multinomial(final_dist, 1).squeeze()
            is_oov = (x_t >= VOCAB_SIZE).long()
            x_t = (1 - is_oov) * x_t.detach() + (is_oov) * self.unk_id

        batch_avg_loss = total_loss / dec_lens
        loss = torch.mean(batch_avg_loss)
        return loss

    #强化学习
    def trainRLStep(self,enc_output, enc_hidden, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab, article_oovs, type):


        s_t = enc_hidden
        x_t = torch.LongTensor(len(enc_output)).fill_(self.start_id).to(DEVICE)
        prev_s = None
        sum_exp = None
        inds = []
        decoder_padding_mask = []
        log_probs = []
        mask = torch.LongTensor(len(enc_output)).fill_(1).to(DEVICE)

        for t in range(MAX_DEC_STEPS):
            x_t = self.model.embeds(x_t)
            probs, s_t, ct_e, sum_exp, prev_s = self.model.decoder(x_t, s_t, enc_output, enc_padding_mask, ct_e,
                                                                             extra_zeros, enc_batch_extend_vocab,
                                                                             sum_exp, prev_s)
            if type == "sample":
                #根据概率产生sample
                multi_dist = Categorical(probs)
                # print(multi_dist)
                x_t = multi_dist.sample()
                # print(x_t.shape)
                log_prob = multi_dist.log_prob(x_t)
                log_probs.append(log_prob)
            else:
                #greedy sample
                _, x_t = torch.max(probs, dim=1)
            x_t = x_t.detach()
            inds.append(x_t)
            mask_t = torch.zeros(len(enc_output)).to(DEVICE)
            mask_t[mask == 1] = 1
            mask[(mask == 1) + (x_t == self.end_id) == 2] = 0
            decoder_padding_mask.append(mask_t)
            is_oov = (x_t >= VOCAB_SIZE).long()
            #判断是否有超限,若有则用UNK
            x_t = (1 - is_oov) * x_t + (is_oov) * self.unk_id

        inds = torch.stack(inds, dim=1)
        decoder_padding_mask = torch.stack(decoder_padding_mask, dim=1)
        if type == "sample":
            log_probs = torch.stack(log_probs, dim=1)
            #将pad的去除
            log_probs = log_probs * decoder_padding_mask
            lens = torch.sum(decoder_padding_mask, dim=1)
            #对应公式15 logp
            log_probs = torch.sum(log_probs,dim=1) / lens
            # print(log_prob.shape)
        decoded_strs = []
        #将output的id转换为word
        for i in range(len(enc_output)):
            id_list = inds[i].cpu().numpy()
            oovs = article_oovs[i]
            S = data.outputids2words(id_list, self.vocab, oovs)  # Generate sentence corresponding to sampled words
            try:
                end_idx = S.index(data.STOP_DECODING)
                S = S[:end_idx]
            except ValueError:
                S = S
            if len(S) < 2:
                S = ["xxx"]
            S = " ".join(S)
            decoded_strs.append(S)

        return decoded_strs, log_probs


    def rewardFunction(self, decoded_sents, original_sents):
        rouge = Rouge()
        scores = rouge.get_scores(decoded_sents, original_sents)
        rouge_l_f1 = [score["rouge-l"]["f"] for score in scores]
        rouge_l_f1 = (torch.FloatTensor(rouge_l_f1)).to(DEVICE)
        return rouge_l_f1



    #利用beamSearch测试
    def test(self):
        # time.sleep(5)

        batcher = Batcher(TEST_DATA_PATH, self.vocab, mode='test',batch_size=BATCH_SIZE, single_pass=True)
        batch = batcher.next_batch()
        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()
        count = 0
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = self.getEncData(batch)
            with torch.autograd.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)

            with torch.autograd.no_grad():
                pred_ids = self.beamSearch(enc_hidden, enc_out, enc_padding_mask, ct_e, extra_zeros, enc_batch_extend_vocab)
                # print(len(pred_ids[0]))
            for i in range(len(pred_ids)):
                # print('t',pred_ids[i])
                decoded_words = data.outputids2words(pred_ids[i], self.vocab, batch.art_oovs[i])
                # print(decoded_words)
                if len(decoded_words) < 2:
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                abstract = batch.original_abstracts[i]
                article = batch.original_articles[i]
                ref_sents.append(abstract)
                article_sents.append(article)
            # print(decoded_sents)
            batch = batcher.next_batch()
            scores = rouge.get_scores(decoded_sents, ref_sents, avg=True)
            #统计结果
            if count == 1:
                k0_sum = scores[KEYS[0]]
                k1_sum = scores[KEYS[1]]
                k2_sum = scores[KEYS[2]]

            if count > 1:
                k0_sum = dict(Counter(Counter(k0_sum) + Counter(scores[KEYS[0]])))
                k1_sum = dict(Counter(Counter(k1_sum) + Counter(scores[KEYS[1]])))
                k2_sum = dict(Counter(Counter(k2_sum) + Counter(scores[KEYS[2]])))
            if count == 10:
                break

            count += 1


        # print(scores)
        print(KEYS[0], end=' ')
        for k in k0_sum:
            print(k,k0_sum[k] / count,end = ' ')
        print('\n')
        print(KEYS[1],end = ' ')
        for k in k1_sum:
            print(k,k1_sum[k] / count,end = ' ')
        print('\n')
        print(KEYS[2], end=' ')
        for k in k2_sum:
            print(k,k2_sum[k] / count,end = ' ')
        print('\n')






    def train(self):
        iter = 1
        count = 0
        total_loss = 0
        total_reward = 0
        while iter <= MAX_ITERATIONS:
            batch = self.batcher.next_batch()

            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, context = self.getEncData(batch)

            enc_batch = self.model.embeds(enc_batch)  # Get embeddings for encoder input
            enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)
            # print(enc_out.shape)
            #将enc_batch传入
            mle_loss = self.trainMLEStep(batch)
            sample_sents, RL_log_probs = self.trainRLStep(enc_out, enc_hidden, enc_padding_mask, context, extra_zeros, enc_batch_extend_vocab, batch.art_oovs,"sample")
            with torch.autograd.no_grad():
                # greedy sampling
                greedy_sents, _ = self.trainRLStep(enc_out, enc_hidden, enc_padding_mask, context, extra_zeros, enc_batch_extend_vocab, batch.art_oovs, "greedy")

            sample_reward = self.rewardFunction(sample_sents, batch.original_abstracts)
            baseline_reward = self.rewardFunction(greedy_sents, batch.original_abstracts)
            #公式15
            rl_loss = -(sample_reward - baseline_reward) * RL_log_probs
            rl_loss = torch.mean(rl_loss)

            batch_reward = torch.mean(sample_reward).item()
            loss = LAMBDA * mle_loss + (1 - LAMBDA) * rl_loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            #计算批loss值
            total_loss += loss.item()
            #计算批reward值
            total_reward += batch_reward
            count += 1
            iter += 1

            if iter % PRINT_PER_ITER == 0:


                loss_avg = total_loss / count
                reward_avg = total_reward / count
                total_loss = 0
                print("iter:", iter, "loss:", "%.3f" % loss_avg ,"reward:", "%.3f" % reward_avg)
                count = 0


            if iter % TEST_PER_ITER == 0:
                self.test()
예제 #16
0
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.decode_dir, model_name)
        self._decode_dir = os.path.splitext(self._decode_dir)[0]
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')

        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            Path(p).mkdir(parents=True, exist_ok=True)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
        #                        batch_size=config.beam_size, single_pass=True)
        # time.sleep(15)
        self.get_batches(config.decode_pk_path)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def get_batches(self, path):
        """
        load batches dumped by pickle
        see batch_saver.py for more information
        """
        with open(path, 'rb') as f:
            batches = pickle.load(f, encoding="bytes")
        self.batches = batches

        print("loaded: {}".format(path))

    def if_already_exists(self, idx):
        ref_file = os.path.join(self._rouge_ref_dir,
                                "{}_reference.txt".format(idx))
        decoded_file = os.path.join(self._rouge_dec_dir,
                                    "{}_decoded.txt".format(idx))
        return os.path.isfile(ref_file) and os.path.isfile(decoded_file)

    def decode(self, file_id_start, file_id_stop):
        if file_id_stop > MAX_TEST_ID: file_id_stop = MAX_TEST_ID

        # while batch is not None:

        # do this for faster stack CPU machines - to replace those that fail!!
        idx_list = [i for i in range(file_id_start, file_id_stop)]
        random.shuffle(idx_list)

        for idx in idx_list:

            # check if this is written already
            if self.if_already_exists(idx):
                # print("ID {} already exists".format(idx))
                continue

            # batch = self.batcher.next_batch()
            batch = self.batches[idx]

            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, idx,
                            self._rouge_ref_dir, self._rouge_dec_dir)

            print("decoded idx = {}".format(idx))
        print("Finished decoding idx [{},{})".format(file_id_start,
                                                     file_id_stop))

        # print("Starting ROUGE eval...")
        # results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        # rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        # batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        if not config.is_hierarchical:

            encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
                enc_batch, enc_lens)
            s_t_0 = self.model.reduce_state(encoder_hidden)

        else:
            stop_id = self.vocab.word2id('.')
            enc_sent_pos = get_sent_position(enc_batch, stop_id)

            encoder_outputs, encoder_feature, encoder_hidden, sent_enc_outputs, sent_enc_feature, sent_enc_hidden, sent_enc_padding_mask = \
                                                                    self.model.encoder(enc_batch, enc_lens, enc_sent_pos)
            s_t_0, _ = self.model.reduce_state(encoder_hidden, sent_enc_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]

        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda: y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h,
                                 0).unsqueeze(0), torch.stack(all_state_c,
                                                              0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            if not config.is_hierarchical:
                final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature,
                    enc_padding_mask, c_t_1, extra_zeros,
                    enc_batch_extend_vocab, coverage_t_1, steps)

            else:
                final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                    y_t_1, s_t_1, enc_sent_pos, encoder_outputs,
                    encoder_feature, enc_padding_mask, None, sent_enc_outputs,
                    sent_enc_feature, sent_enc_padding_mask, c_t_1,
                    extra_zeros, enc_batch_extend_vocab, coverage_t_1, steps)

            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs,
                                                  config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size *
                               2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
class Decoder(object):
    def __init__(self):
        self.vocab = Vocab(args.vocab_path, args.vocab_size)
        self.batcher = Batcher(
            args.decode_data_path,
            self.vocab,
            mode='decode',
            batch_size=1,
            single_pass=True)  # support only 1 item at a time
        time.sleep(15)
        vocab_size = self.vocab.size()
        self.beam_size = args.beam_size
        # self.bertClient = BertClient()
        self.encoder = EncoderLSTM(args.hidden_size, self.vocab.size())
        self.decoder = DecoderLSTM(args.hidden_size, self.vocab.size())
        if use_cuda:
            self.encoder = self.encoder.cuda()
            self.decoder = self.decoder.cuda()

        # Prepare the output folder and files
        output_dir = os.path.join(args.logs, "outputs")
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        output_file = os.path.join(output_dir,
                                   "decoder_{}.txt".format(args.output_name))
        self.file = open(output_file, "w+")

    def load_model(self, checkpoint_file):
        print("Loading Checkpoint: ", checkpoint_file)
        checkpoint = torch.load(checkpoint_file)
        encoder_state_dict = checkpoint['encoder_state_dict']
        decoder_state_dict = checkpoint['decoder_state_dict']
        self.encoder.load_state_dict(encoder_state_dict)
        self.decoder.load_state_dict(decoder_state_dict)
        self.encoder.eval()
        self.decoder.eval()
        print("Weights Loaded")

    def decode(self):
        batch = self.batcher.next_batch()
        count = 0

        while batch is not None:
            # Preparing
            enc_batch, enc_padding_mask, enc_lens = get_input_from_batch(
                batch, use_cuda=use_cuda)
            dec_batch, target_batch, dec_padding_mask, max_dec_len = get_output_from_batch(
                batch, use_cuda=use_cuda)
            batch_size = len(enc_batch)
            proba_list = [0] * batch_size
            generated_list = [[]]

            # Encoding sentences
            outputs, hidden_state = self.encoder(enc_batch)

            x = torch.LongTensor([2])  # start sequence
            if use_cuda:
                x = x.cuda()
            """ Normal Approach """
            #answers = torch.ones((batch_size, args.max_dec_steps), dtype=torch.long)
            #for t in range(max(args.max_dec_steps, max_dec_len)):
            #    output, hidden_state = self.decoder(x, hidden_state)  # Output: batch * vocab_size (prob.)
            #    idx = torch.argmax(output, dim=1)
            #    answers[:, t] = idx.detach()
            #    x = idx
            """ Beam Approach """
            for t in range(max(args.max_dec_steps, max_dec_len) - 1):
                output, hidden_state = self.decoder(x, hidden_state)

                # For each sentence, find b best answers (beam search)
                states = []  # (probab, generated, hidden_state_index)
                for i, each_decode in enumerate(output):
                    prev_proba = proba_list[i]
                    prev_generated = generated_list[i]
                    arr = each_decode.detach().cpu().numpy(
                    )  # log-probab of each word
                    indices = arr.argsort()[-self.beam_size:][::-1]  #index
                    for idx in indices:
                        proba = arr[idx] + proba_list[i]  # new probab and prev
                        generated = prev_generated.copy()
                        generated.append(idx)
                        states.append((proba, generated, i))

                # Sort for the best generated sequence among all
                states.sort(key=lambda x: x[0], reverse=True)

                # Variables
                new_proba_list = []
                new_generated = []
                new_hidden = torch.Tensor()
                new_cell = torch.Tensor()
                new_x = torch.LongTensor()

                if use_cuda:
                    new_hidden = new_hidden.cuda()
                    new_cell = new_cell.cuda()
                    new_x = new_x.cuda()

                # Select top b sequences
                for state in states[:self.beam_size]:
                    new_proba_list.append(state[0])
                    new_generated.append(state[1])
                    idx = state[2]

                    h_0 = hidden_state[0].squeeze(0)[idx].unsqueeze(0)
                    c_0 = hidden_state[1].squeeze(0)[idx].unsqueeze(0)
                    new_hidden = torch.cat((new_hidden, h_0), dim=0)
                    new_cell = torch.cat((new_cell, c_0), dim=0)
                    generated_idx = torch.LongTensor([state[1][-1]])
                    if use_cuda:
                        generated_idx = generated_idx.cuda()

                    new_x = torch.cat((new_x, generated_idx))

                # Save the list
                proba_list = new_proba_list
                generated_list = new_generated
                hidden_state = (new_hidden.unsqueeze(0), new_cell.unsqueeze(0))
                x = new_x

            # Convert from id to word
            # answer = answers[0].numpy()
            answer = new_generated[0]
            sentence = ids2words(answer, self.vocab)
            self.file.write("{}\n".format(sentence))
            print("Writing line #{} to file ...".format(count + 1))
            self.file.flush()
            sys.stdout.flush()

            count += 1
            batch = self.batcher.next_batch()
예제 #18
0
 def __init__(self, data_path, opt, batch_size = 1):
     self.vocab = Vocab(config.vocab_path, config.vocab_size)
     self.batcher = Batcher(data_path, self.vocab, mode='eval',
                            batch_size=batch_size, single_pass=True)
     self.opt = opt
예제 #19
0
class Evaluate(object):
    def __init__(self, data_path, opt, batch_size=config.batch_size):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=batch_size,
                               single_pass=True)
        self.opt = opt
        time.sleep(5)

    def setup_valid(self):
        self.model = Model()
        self.model = get_cuda(self.model)
        checkpoint = T.load(
            os.path.join(config.save_model_path, self.opt.load_model))
        self.model.load_state_dict(checkpoint["model_dict"])
        # save the light version of picked model
        '''
        print(config.save_model_path,' ','light_'+self.opt.load_model)
        save_path = config.save_model_path + '/light_'+self.opt.load_model 
        print(save_path)
        T.save({
            "model_dict": self.model.state_dict(),
        }, save_path)      
        exit()
        ''' #-- end --

    def print_original_predicted(self, decoded_sents, ref_sents, article_sents,
                                 loadfile):
        filename = "test_" + loadfile.split(".")[0] + ".txt"

        with open(os.path.join("data", filename), "w") as f:
            for i in range(len(decoded_sents)):
                f.write("article: " + article_sents[i] + "\n")
                f.write("ref: " + ref_sents[i] + "\n")
                f.write("dec: " + decoded_sents[i] + "\n\n")

    def evaluate_batch(self, print_sents=False):

        self.setup_valid()
        batch = self.batcher.next_batch()
        start_id = self.vocab.word2id(data.START_DECODING)
        end_id = self.vocab.word2id(data.STOP_DECODING)
        unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = get_enc_data(
                batch)

            with T.autograd.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)

            #-----------------------Summarization----------------------------------------------------
            with T.autograd.no_grad():
                pred_ids = beam_search(enc_hidden, enc_out, enc_padding_mask,
                                       ct_e, extra_zeros,
                                       enc_batch_extend_vocab, self.model,
                                       start_id, end_id, unk_id)

            for i in range(len(pred_ids)):
                decoded_words = data.outputids2words(pred_ids[i], self.vocab,
                                                     batch.art_oovs[i])
                if len(decoded_words) < 2:
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                abstract = batch.original_abstracts[i]
                article = batch.original_articles[i]
                ref_sents.append(abstract)
                article_sents.append(article)

            batch = self.batcher.next_batch()

        load_file = self.opt.load_model

        if print_sents:
            self.print_original_predicted(decoded_sents, ref_sents,
                                          article_sents, load_file)

        scores = rouge.get_scores(decoded_sents, ref_sents, avg=True)
        if self.opt.task == "test":
            print(load_file, "scores:", scores)
        else:
            rouge_l = scores["rouge-l"]["f"]
            print(load_file, "rouge_l:", "%.4f" % rouge_l)
            with open("test_rg.txt", "a") as f:
                f.write("\n" + load_file + " - rouge_l: " + str(rouge_l))
                f.close()
예제 #20
0
class Evaluate(object):

    def __init__(self, data_path, opt, batch_size=config.batch_size):

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='eval', batch_size=batch_size,
                               single_pass=True)

        self.opt =opt
        time.sleep(5)

    def setup_valid(self):
        self.model = Model()
        self.model = get_cuda(self.model)
        checkpoint = torch.load(os.path.join(config.save_model_path,
                                             self.opt.load_model))
        # 加载在train中保存得模型
        self.model.load_state_dict(checkpoint['model_dict'])

    def print_original_predicted(self, decoded_sents, ref_sents, article_sents,
                                 loadfile):

        # 这里可能会存在一点问题,debug得时候剋注意一下
        filename = 'test_' + loadfile.split('.')[:-1] + '.txt'

        with open(os.path.join('data', filename), 'w') as f:
            for i in range(len(decoded_sents)):
                f.write('article' + article_sents[i] + '\n')
                f.write('reference:' + ref_sents[i] + '\n')
                f.write('decoder:' + decoded_sents[i] + '\n')

    def evaluate_batch(self, print_sents =False):

        self.setup_valid()
        batch = self.batcher.next_batch()

        start_id = self.vocab.word2id(data.START_DECODING)
        end_id = self.vocab.word2id(data.STOP_DECODING)
        unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)

        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()

        batch_number = 0

        while batch is not None:

            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, \
                extra_zeros, ct_e = get_enc_data(batch)

            with torch.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)

            with torch.no_grad():
                pred_ids = beam_search(enc_hidden, enc_out, enc_padding_mask, ct_e,
                                       extra_zeros, enc_batch_extend_vocab, self.model, start_id, end_id, unk_id)

            for i in range(len(pred_ids)):
                # 返回的是一个 单词列表。
                decoded_words = data.outputids2words(pred_ids[i], self.vocab, batch.art_oovs[i])

                if len(decoded_words) < 2:
                    decoded_words = 'xxx'
                else:
                    decoded_words = ' '.join(decoded_words)

                decoded_sents.append(decoded_words)
                summary = batch.original_summarys[i]
                article = batch.original_articles[i]
                ref_sents.append(summary)
                article_sents.append(article)

            batch = self.batcher.next_batch()
            batch_number += 1

            if batch_number < 100:
                continue
            else:
                break

        load_file = self.opt.load_model

        if print_sents:
            self.print_original_predicted(decoded_sents, ref_sents, article_sents, load_file)

        scores = rouge.get_scores(decoded_sents, ref_sents, avg=True)

        if self.opt.task == 'test':
            print(load_file, 'scores:', scores)
            sys.stdout.flush()
        else:
            rouge_l = scores['rouge-l']['f']
            print(load_file, 'rouge-l:', '%.4f' % rouge_l)
예제 #21
0
word = wvmodel.wv.index2entity[25]
vector = wvmodel.wv.vectors[25]
print(word)
# print(vector)


# In[ ]:


import torch
from data_util.data import Vocab
vocab_size = len(wvmodel.vocab) + 1


vocab = Vocab('Embedding/main_cat/word2Vec/word.vocab', vocab_size)

embed_size = 300
weight = torch.zeros(vocab_size, embed_size)

for i in range(len(vocab._id_to_word.keys())):
    try:
        vocab_word = vocab._id_to_word[i+4]
        w2vec_word = w2vec.wv.index2entity[i]
    except Exception as e :
        continue
    if i + 4 > vocab_size: break
#     print(vocab_word,w2vec_word)
    weight[i+4, :] = torch.from_numpy(w2vec.wv.vectors[i])
        
embedding = torch.nn.Embedding.from_pretrained(weight)
예제 #22
0
class Evaluate(object):
    def __init__(self, data_path, opt, batch_size=config.batch_size):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=batch_size,
                               single_pass=True)
        self.opt = opt
        time.sleep(5)

    def setup_valid(self):
        self.model = Model()
        self.model = get_cuda(self.model)
        if T.cuda.is_available(): map_location = T.device('cuda')
        else: map_location = T.device('cpu')
        checkpoint = T.load(
            os.path.join(config.save_model_path, self.opt.load_model),
            map_location)
        self.model.load_state_dict(checkpoint["model_dict"])


#        mlflow.pytorch.save_model(self.model,config.save_model_path+'_2')
#        mlflow.pytorch.load_model(config.save_model_path+'_2')

    def print_original_predicted(self, decoded_sents, ref_sents, article_sents,
                                 loadfile):
        filename = "test_" + loadfile.split(".")[0] + ".txt"

        with open(os.path.join("data", filename), "w") as f:
            for i in range(len(decoded_sents)):
                f.write("article: " + article_sents[i] + "\n")
                f.write("ref: " + ref_sents[i] + "\n")
                f.write("dec: " + decoded_sents[i] + "\n\n")

    def evaluate_batch(self, print_sents=False):

        self.setup_valid()
        batch = self.batcher.next_batch()
        start_id = self.vocab.word2id(data.START_DECODING)
        end_id = self.vocab.word2id(data.STOP_DECODING)
        unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
        decoded_sents = []
        ref_sents = []
        article_sents = []
        rouge = Rouge()
        while batch is not None:
            enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, ct_e = get_enc_data(
                batch)

            with T.autograd.no_grad():
                enc_batch = self.model.embeds(enc_batch)
                enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)

            #-----------------------Summarization----------------------------------------------------
            with T.autograd.no_grad():
                pred_ids = beam_search(enc_hidden, enc_out, enc_padding_mask,
                                       ct_e, extra_zeros,
                                       enc_batch_extend_vocab, self.model,
                                       start_id, end_id, unk_id)

            for i in range(len(pred_ids)):
                decoded_words = data.outputids2words(pred_ids[i], self.vocab,
                                                     batch.art_oovs[i])
                if len(decoded_words) < 2:
                    decoded_words = "xxx"
                else:
                    decoded_words = " ".join(decoded_words)
                decoded_sents.append(decoded_words)
                abstract = batch.original_abstracts[i]
                article = batch.original_articles[i]
                ref_sents.append(abstract)
                article_sents.append(article)
                article_art_oovs = batch.art_oovs[i]

            #batch = self.batcher.next_batch()
            break

        load_file = self.opt.load_model  # just a model name

        #if print_sents:
        #    self.print_original_predicted(decoded_sents, ref_sents, article_sents, load_file)
        Batcher.article_summary = decoded_sents[0]
        Batcher.oovs = " ".join(article_art_oovs)

        #        print('Article: ',article_sents[0], '\n==> Summary: [',decoded_sents[0],']\nOut of vocabulary: ', " ".join(article_art_oovs),'\nModel used: ', load_file)
        scores = 0  #rouge.get_scores(decoded_sents, ref_sents, avg = True)
        if self.opt.task == "test":
            print('Done.')
            #print(load_file, "scores:", scores)
        else:
            rouge_l = scores["rouge-l"]["f"]
예제 #23
0
def main():
    vocab_file = os.path.join(vocab_path, 'vocab3.txt')
    vocab = Vocab(vocab_file)
    loader = WordVectorLoader(embed_dim=200)
    loader.create_embedding_matrix(embeddings_file_name, vocab._word_to_id)
    loader.save_embed_mat(embed_path)
예제 #24
0
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.concept_vocab = Concept_vocab(config.concept_vocab_path,
                                           config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               self.concept_vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        self.ds_batcher = Batcher(config.train_ds_data_path,
                                  self.vocab,
                                  self.concept_vocab,
                                  mode='train',
                                  batch_size=500,
                                  single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = AdagradCustom(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def calc_Rouge_1(self, sub, string):
        new_sub = [str(x) for x in sub]
        new_sub.insert(0, '"')
        new_sub.append('"')
        token_c = ' '.join(new_sub)
        summary = [[token_c]]
        new_string = [str(x) for x in string]
        new_string.insert(0, '"')
        new_string.append('"')
        token_r = ' '.join(new_string)
        reference = [[[token_r]]]

        rouge = Pythonrouge(summary_file_exist=False,
                            summary=summary,
                            reference=reference,
                            n_gram=2,
                            ROUGE_SU4=False,
                            ROUGE_L=False,
                            recall_only=True,
                            stemming=True,
                            stopwords=True,
                            word_level=True,
                            length_limit=True,
                            length=30,
                            use_cf=False,
                            cf=95,
                            scoring_formula='average',
                            resampling=False,
                            samples=10,
                            favor=True,
                            p=0.5)
        score = rouge.calc_score()
        return score['ROUGE-1']

    def calc_Rouge_2_recall(self, sub, string):
        token_c = sub
        token_r = string
        model = []
        ref = []
        if len(string) == 1 or len(string) == 1:
            score = 0.0
        else:
            i = 1
            while i < len(string):
                ref.append(str(token_r[i - 1]) + str(token_r[i]))
                i += 1
            i = 1
            while i < len(sub):
                model.append(str(token_c[i - 1]) + str(token_c[i]))
                i += 1
            sam = 0
            i = 0
            for i in range(len(ref)):
                for j in range(len(model)):
                    if ref[i] == model[j]:
                        sam += 1
                        model[j] = '-1'
                        break

            score = sam / float(len(ref))

        return score

    def calc_Rouge_L(self, sub, string):
        beta = 1.
        token_c = sub
        token_r = string
        if (len(string) < len(sub)):
            sub, string = string, sub
        lengths = [[0 for i in range(0,
                                     len(sub) + 1)]
                   for j in range(0,
                                  len(string) + 1)]
        for j in range(1, len(sub) + 1):
            for i in range(1, len(string) + 1):
                if (string[i - 1] == sub[j - 1]):
                    lengths[i][j] = lengths[i - 1][j - 1] + 1
                else:
                    lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])
        lcs = lengths[len(string)][len(sub)]

        prec = lcs / float(len(token_c))
        rec = lcs / float(len(token_r))

        if (prec != 0 and rec != 0):
            score = ((1 + beta**2) * prec * rec) / float(rec + beta**2 * prec)
        else:
            score = 0.0
        return rec

    def calc_kl(self, dec, enc):
        kl = 0.
        dec = np.exp(dec)
        enc = np.exp(enc)
        all_dec = np.sum(dec)
        all_enc = np.sum(enc)
        for d, c in zip(dec, enc):
            d = d / all_dec
            c = c / all_enc
            kl = kl + c * np.log(c / d)
        return kl

    def calc_euc(self, dec, enc):
        euc = 0.
        for d, c in zip(dec, enc):
            euc = euc + np.sqrt(np.square(d - c))
        #print euc
        return euc

    def ds_loss(self, enc_batch_ds_emb, enc_padding_mask_ds, dec_batch_emb,
                dec_padding_mask):
        b1, t_k1, emb1 = list(enc_batch_ds_emb.size())
        b2, t_k2, emb2 = list(dec_batch_emb.size())
        enc_padding_mask_ds = enc_padding_mask_ds.unsqueeze(2).expand(
            b1, t_k1, emb1).contiguous()
        dec_padding_mask = dec_padding_mask.unsqueeze(2).expand(
            b2, t_k2, emb2).contiguous()
        enc_batch_ds_emb = enc_batch_ds_emb * enc_padding_mask_ds
        dec_batch_emb = dec_batch_emb * dec_padding_mask
        enc_batch_ds_emb = torch.sum(enc_batch_ds_emb, 1)
        dec_batch_emb = torch.sum(dec_batch_emb, 1)
        dec_title = dec_batch_emb.tolist()
        enc_article = enc_batch_ds_emb.tolist()
        dec_title_len = len(dec_title)
        enc_article_len = len(enc_article)
        dsloss = 0.
        for dec in dec_title:
            for enc in enc_article:
                dsloss = dsloss + self.calc_kl(dec, enc)
        dsloss = dsloss / float(dec_title_len * enc_article_len)
        print(dsloss)
        return dsloss

    def train_one_batch(self, batch, steps, batch_ds):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, enc_batch_concept_extend_vocab, concept_p, position, concept_mask, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)
        enc_batch_ds, enc_padding_mask_ds, enc_lens_ds, _, _, _, _, _, _, _, _ = \
            get_input_from_batch(batch_ds, use_cuda)

        self.optimizer.zero_grad()
        encoder_outputs, encoder_hidden, max_encoder_output, enc_batch_ds_emb, dec_batch_emb = self.model.encoder(
            enc_batch, enc_lens, enc_batch_ds, dec_batch)
        if config.DS_train:
            ds_final_loss = self.ds_loss(enc_batch_ds_emb, enc_padding_mask_ds,
                                         dec_batch_emb, dec_padding_mask)
        s_t_1 = self.model.reduce_state(encoder_hidden)
        s_t_0 = s_t_1
        c_t_0 = c_t_1
        if config.use_maxpool_init_ctx:
            c_t_1 = max_encoder_output
            c_t_0 = c_t_1

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                'train', y_t_1, s_t_1, encoder_outputs, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab,
                enc_batch_concept_extend_vocab, concept_p, position,
                concept_mask, coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        if config.DS_train:
            ds_final_loss = Variable(torch.FloatTensor([ds_final_loss]),
                                     requires_grad=False)
            ds_final_loss = ds_final_loss.cuda()
            loss = (config.pi - ds_final_loss) * torch.mean(batch_avg_loss)
        else:
            loss = torch.mean(batch_avg_loss)
        if steps > config.traintimes:
            scores = []
            sample_y = []
            s_t_1 = s_t_0
            c_t_1 = c_t_0
            for di in range(min(max_dec_len, config.max_dec_steps)):
                if di == 0:
                    y_t_1 = dec_batch[:, di]
                    sample_y.append(y_t_1.cpu().numpy().tolist())
                else:
                    sample_latest_tokens = sample_y[-1]
                    sample_latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                                            for t in sample_latest_tokens]

                    y_t_1 = Variable(torch.LongTensor(sample_latest_tokens))
                    y_t_1 = y_t_1.cuda()

                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                    'train', y_t_1, s_t_1, encoder_outputs, enc_padding_mask,
                    c_t_1, extra_zeros, enc_batch_extend_vocab,
                    enc_batch_concept_extend_vocab, concept_p, position,
                    concept_mask, coverage, di)
                sample_select = torch.multinomial(final_dist, 1).view(-1)
                sample_log_probs = torch.gather(
                    final_dist, 1, sample_select.unsqueeze(1)).squeeze()
                sample_y.append(sample_select.cpu().numpy().tolist())
                sample_step_loss = -torch.log(sample_log_probs + config.eps)
                sample_step_mask = dec_padding_mask[:, di]
                sample_step_loss = sample_step_loss * sample_step_mask
                scores.append(sample_step_loss)
            sample_sum_losses = torch.sum(torch.stack(scores, 1), 1)
            sample_batch_avg_loss = sample_sum_losses / dec_lens_var

            sample_y = np.transpose(sample_y).tolist()

            base_y = []
            s_t_1 = s_t_0
            c_t_1 = c_t_0
            for di in range(min(max_dec_len, config.max_dec_steps)):
                if di == 0:
                    y_t_1 = dec_batch[:, di]
                    base_y.append(y_t_1.cpu().numpy().tolist())
                else:
                    base_latest_tokens = base_y[-1]
                    base_latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                                            for t in base_latest_tokens]

                    y_t_1 = Variable(torch.LongTensor(base_latest_tokens))
                    y_t_1 = y_t_1.cuda()

                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                    'train', y_t_1, s_t_1, encoder_outputs, enc_padding_mask,
                    c_t_1, extra_zeros, enc_batch_extend_vocab,
                    enc_batch_concept_extend_vocab, concept_p, position,
                    concept_mask, coverage, di)
                base_log_probs, base_ids = torch.topk(final_dist, 1)
                base_y.append(base_ids[:, 0].cpu().numpy().tolist())

            base_y = np.transpose(base_y).tolist()

            refs = dec_batch.cpu().numpy().tolist()
            sample_dec_lens_var = map(int, dec_lens_var.cpu().numpy().tolist())
            sample_rougeL = [
                self.calc_Rouge_L(sample[:reflen],
                                  ref[:reflen]) for sample, ref, reflen in zip(
                                      sample_y, refs, sample_dec_lens_var)
            ]
            base_rougeL = [
                self.calc_Rouge_L(base[:reflen], ref[:reflen])
                for base, ref, reflen in zip(base_y, refs, sample_dec_lens_var)
            ]
            sample_rougeL = Variable(torch.FloatTensor(sample_rougeL),
                                     requires_grad=False)
            base_rougeL = Variable(torch.FloatTensor(base_rougeL),
                                   requires_grad=False)
            sample_rougeL = sample_rougeL.cuda()
            base_rougeL = base_rougeL.cuda()
            word_loss = -sample_batch_avg_loss * (base_rougeL - sample_rougeL)
            reinforce_loss = torch.mean(word_loss)
            loss = (1 - config.rein) * loss + config.rein * reinforce_loss

        loss.backward()

        clip_grad_norm(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm(self.model.reduce_state.parameters(),
                       config.max_grad_norm)

        self.optimizer.step()

        return loss.data[0]

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            batch_ds = self.ds_batcher.next_batch()
            loss = self.train_one_batch(batch, iter, batch_ds)
            loss = loss.cpu()

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 5
            if iter % print_interval == 0:
                print('steps %d , loss: %f' % (iter, loss))
                start = time.time()
            if iter % 50000 == 0:
                self.save_model(running_avg_loss, iter)
예제 #25
0
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.decode_dir, model_name)
        self._decode_dir = os.path.splitext(self._decode_dir)[0]
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')

        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            Path(p).mkdir(parents=True, exist_ok=True)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.pad_id = self.vocab.word2id(PAD_TOKEN)
        self.start_id = self.vocab.word2id(START_DECODING)
        self.stop_id = self.vocab.word2id(STOP_DECODING)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def if_already_exists(self, idx):
        decoded_file = os.path.join(self._rouge_dec_dir,
                                    "file.{}.txt".format(idx))
        return os.path.isfile(decoded_file)

    def decode(self, file_id_start, file_id_stop, ami_id='191209'):
        print("AMI transcription:", ami_id)

        test_data = load_ami_data(ami_id, 'test')

        # do this for faster stack CPU machines - to replace those that fail!!
        idx_list = [i for i in range(file_id_start, file_id_stop)]
        random.shuffle(idx_list)
        for idx in idx_list:

            # for idx in range(file_id_start, file_id_stop):
            # check if this is written already
            if self.if_already_exists(idx):
                print("ID {} already exists".format(idx))
                continue

            # Run beam search to get best Hypothesis
            best_summary, art_oovs = self.beam_search(test_data, idx)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            # original_abstract_sents = batch.original_abstracts_sents[0]
            original_abstract_sents = []

            write_for_rouge(original_abstract_sents, decoded_words, idx,
                            self._rouge_ref_dir, self._rouge_dec_dir)

            print("decoded idx = {}".format(idx))
        print("Finished decoding idx [{},{})".format(file_id_start,
                                                     file_id_stop))

        # print("Starting ROUGE eval...")
        # results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        # rouge_log(results_dict, self._decode_dir)

    def beam_search(self, test_data, idx):
        # batch should have only one example
        # enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
        #       get_input_from_batch(batch, use_cuda)

        enc_pack, art_oovs = get_a_batch_decode(test_data,
                                                idx,
                                                self.vocab,
                                                config.beam_size,
                                                config.max_enc_steps,
                                                config.max_dec_steps,
                                                self.start_id,
                                                self.stop_id,
                                                self.pad_id,
                                                sum_type='short',
                                                use_cuda=use_cuda)
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = enc_pack
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state.forward1(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]

        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda: y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h,
                                 0).unsqueeze(0), torch.stack(all_state_c,
                                                              0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)

            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs,
                                                  config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size *
                               2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0], art_oovs
예제 #26
0
class Train(object):
    def __init__(self):
        if config.is_hierarchical:
            raise Exception("Hierarchical PGN-AMI not supported!")

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.pad_id = self.vocab.word2id(PAD_TOKEN)
        self.start_id = self.vocab.word2id(START_DECODING)
        self.stop_id = self.vocab.word2id(STOP_DECODING)

        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'iter{}.pt'.format(iter))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, ami_data, idx):
        # enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
        #     get_ami_input_from_batch(batch, use_cuda)
        # dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
        #     get_ami_output_from_batch(batch, use_cuda)

        enc_pack, dec_pack = get_a_batch(ami_data,
                                         idx,
                                         self.vocab,
                                         config.batch_size,
                                         config.max_enc_steps,
                                         config.max_dec_steps,
                                         self.start_id,
                                         self.stop_id,
                                         self.pad_id,
                                         sum_type='short',
                                         use_cuda=use_cuda)
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = enc_pack
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = dec_pack

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state.forward1(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing

            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)

            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()

        ami_data = load_ami_data('train')
        valid_data = load_ami_data('valid')
        # make the training data 100
        random.shuffle(valid_data)
        ami_data.extend(valid_data[:6])
        valid_data = valid_data[6:]

        num_batches = len(ami_data)
        idx = 0

        # validation & stopping
        best_valid_loss = 1000000000
        stop_counter = 0

        while iter < n_iters:
            if idx == 0:
                print("shuffle training data")
                random.shuffle(ami_data)

            loss = self.train_one_batch(ami_data, idx)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)

            iter += 1
            idx += config.batch_size
            if idx == num_batches: idx = 0

            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(
                    str(datetime.now()), iter, loss))
                sys.stdout.flush()

            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)

            if iter % config.eval_every == 0:
                valid_loss = self.run_eval(valid_data)
                print("valid_loss = {:.5f}".format(valid_loss))
                if valid_loss < best_valid_loss:
                    stop_counter = 0
                    best_valid_loss = valid_loss
                    print("VALID better")
                else:
                    stop_counter += 1
                    print(
                        "VALID NOT better, counter = {}".format(stop_counter))
                    if stop_counter == config.stop_after:
                        print("Stop training")
                        return

        print("Finished training!")

    def eval_one_batch(self, eval_data, idx):

        enc_pack, dec_pack = get_a_batch(eval_data,
                                         idx,
                                         self.vocab,
                                         1,
                                         config.max_enc_steps,
                                         config.max_dec_steps,
                                         self.start_id,
                                         self.stop_id,
                                         self.pad_id,
                                         sum_type='short',
                                         use_cuda=use_cuda)

        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = enc_pack
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = dec_pack

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state.forward1(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)

            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist,
                                      dim=1,
                                      index=target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.data.item()

    def run_eval(self, eval_data):
        running_avg_loss, iter = 0, 0
        batch_losses = []
        num_batches = len(eval_data)
        print("valid data size = {}".format(num_batches))
        for idx in range(num_batches):
            loss = self.eval_one_batch(eval_data, idx)
            batch_losses.append(loss)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            print("#", end="")
            sys.stdout.flush()
        print()

        avg_loss = sum(batch_losses) / len(batch_losses)
        return avg_loss
예제 #27
0
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)

        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        # print("MODE MUST BE train")
        # time.sleep(15)
        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        # self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir, 'iter{}.pt'.format(iter))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path, map_location= lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        if not config.is_hierarchical:
            encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
            s_t_1 = self.model.reduce_state.forward1(encoder_hidden)

        else:
            stop_id = self.vocab.word2id('.')
            pad_id  = self.vocab.word2id('[PAD]')
            enc_sent_pos = get_sent_position(enc_batch, stop_id, pad_id)
            dec_sent_pos = get_sent_position(dec_batch, stop_id, pad_id)

            encoder_outputs, encoder_feature, encoder_hidden, sent_enc_outputs, sent_enc_feature, sent_enc_hidden, sent_enc_padding_mask, sent_lens, seq_lens2 = \
                                                                    self.model.encoder(enc_batch, enc_lens, enc_sent_pos)

            s_t_1, sent_s_t_1 = self.model.reduce_state(encoder_hidden, sent_enc_hidden)
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            if not config.is_hierarchical:
                # start = datetime.now()

                final_dist, s_t_1,  c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(y_t_1, s_t_1,
                                                            encoder_outputs, encoder_feature, enc_padding_mask, c_t_1,
                                                            extra_zeros, enc_batch_extend_vocab,
                                                                               coverage, di)
                # print('NO HIER Time: ',datetime.now() - start)
                # import pdb; pdb.set_trace()
            else:
                # start = datetime.now()
                max_doc_len = enc_batch.size(1)
                final_dist, sent_s_t_1,  c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(y_t_1, sent_s_t_1,
                                                            encoder_outputs, encoder_feature, enc_padding_mask, seq_lens2,
                                                            sent_s_t_1, sent_enc_outputs, sent_enc_feature, sent_enc_padding_mask,
                                                            sent_lens, max_doc_len,
                                                            c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
                # print('DO HIER Time: ',datetime.now() - start)
                # import pdb; pdb.set_trace()


            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses/dec_lens_var
        loss = torch.mean(batch_avg_loss)

        # start = datatime.now()
        loss.backward()
        # print('{} HIER Time: {}'.format(config.is_hierarchical ,datetime.now() - start))
        # import pdb; pdb.set_trace()

        clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()

        # data_path = "lib/data/batches_train.vocab50000.batch16.pk.bin"
        # with open(data_path, 'rb') as f:
        #     stored_batches = pickle.load(f, encoding="bytes")
        # print("loaded data: {}".format(data_path))
        # num_batches = len(stored_batches)

        while iter < n_iters:
            batch = self.batcher.next_batch()
            # batch_id = iter%num_batches
            # batch = stored_batches[batch_id]

            loss = self.train_one_batch(batch)

            # running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)

            iter += 1

            # if iter % 100 == 0:
            #     self.summary_writer.flush()

            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()), iter, loss))
                sys.stdout.flush()

            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)

        print("Finished training!")
예제 #28
0
class Evaluate_pg(object):
    def __init__(self, model_file_path, is_word_level, is_combined, alpha):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
        #                        batch_size=config.batch_size, single_pass=True)
        self.dataset = DailyMailDataset("val", self.vocab)
        # time.sleep(15)
        model_name = os.path.basename(model_file_path)

        self.is_word_level = is_word_level
        self.is_combined = is_combined
        self.alpha = alpha

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)

        self.model = Model(model_file_path, is_eval=True)

    def compute_policy_grads_using_rewards(self, sentence_rewards,
                                           word_rewards, sentence_losses,
                                           word_losses, word_to_sent_ind):
        if self.is_combined:
            pg_losses = [[(self.alpha * word_reward + (1 - self.alpha) *
                           sentence_rewards[i][word_to_sent_ind[i][j]]) *
                          word_losses[i][j]
                          for j, word_reward in enumerate(abstract_rewards)
                          if j < len(word_to_sent_ind[i])]
                         for i, abstract_rewards in enumerate(word_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        elif self.is_word_level:
            pg_losses = [[
                word_reward * word_losses[i][j]
                for j, word_reward in enumerate(abstract_rewards)
                if j < len(word_to_sent_ind[i])
            ] for i, abstract_rewards in enumerate(word_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        else:
            pg_losses = [[
                rs * sentence_losses[ri][rsi] for rsi, rs in enumerate(r)
            ] for ri, r in enumerate(sentence_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        return pg_losses

    def compute_pg_loss(self, orig, pred, sentence_losses, split_predictions,
                        word_losses, word_to_sent_ind):
        sentence_rewards = None
        word_rewards = None
        # First compute the rewards
        if not self.is_word_level or self.is_combined:
            sentence_rewards = get_sentence_rewards(orig, pred)

        if self.is_word_level or self.is_combined:
            word_rewards = get_word_level_rewards(orig, split_predictions)

        pg_losses = self.compute_policy_grads_using_rewards(
            sentence_rewards=sentence_rewards,
            word_rewards=word_rewards,
            sentence_losses=sentence_losses,
            word_losses=word_losses,
            word_to_sent_ind=word_to_sent_ind)

        return pg_losses

    def compute_batched_loss(self, word_losses, orig, pred):
        orig_sum = []
        new_pred = []
        pred_sum = []
        sentence_losses = []

        # Convert the original sum as one single string per article
        for i in range(len(orig)):
            orig_sum.append(' '.join(map(str, orig[i])))
            new_pred.append([])
            pred_sum.append([])
            sentence_losses.append([])

        batch_sent_indices = []
        for i in range(len(pred)):
            sentence = []
            sentence = pred[i]
            losses = word_losses[i]
            sentence_indices = []
            count = 0
            while len(sentence) > 0:
                try:
                    idx = sentence.index(".")
                except ValueError:
                    idx = len(sentence)

                sentence_indices.extend([count for _ in range(idx)])

                if count > 0:
                    new_pred[i].append(new_pred[i][count - 1] +
                                       sentence[:idx + 1])
                else:
                    new_pred[i].append(sentence[:idx + 1])

                sentence_losses[i].append(sum(losses[:idx + 1]))

                sentence = sentence[idx + 1:]
                losses = losses[idx + 1:]
                count += 1
            batch_sent_indices.append(sentence_indices)

        for i in range(len(pred)):
            for j in range(len(new_pred[i])):
                pred_sum[i].append(' '.join(map(str, new_pred[i][j])))

        pg_losses = self.compute_pg_loss(orig_sum,
                                         pred_sum,
                                         sentence_losses,
                                         split_predictions=pred,
                                         word_losses=word_losses,
                                         word_to_sent_ind=batch_sent_indices)

        return pg_losses

    def eval_one_batch(self, batch):
        batch_size = batch.batch_size

        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        output_ids = []
        y_t_1 = torch.ones(batch_size, dtype=torch.long) * self.vocab.word2id(
            data.START_DECODING)

        if config.use_gpu:
            y_t_1 = y_t_1.cuda()

        for _ in range(batch_size):
            output_ids.append([])
            step_losses.append([])

        for di in range(min(max_dec_len, config.max_dec_steps)):
            #y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)  #NLL
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask

            # Move on to the next token
            _, idx = torch.max(final_dist, 1)
            idx = idx.reshape(batch_size, -1).squeeze()
            y_t_1 = idx

            for i, pred in enumerate(y_t_1):
                if not pred.item() == data.PAD_TOKEN:
                    output_ids[i].append(pred.item())

            for i, loss in enumerate(step_loss):
                step_losses[i].append(step_loss[i])

        # Obtain the original and predicted summaries
        original_abstracts = batch.original_abstracts_sents
        predicted_abstracts = [
            data.outputids2words(ids, self.vocab, None) for ids in output_ids
        ]

        # Compute the batched loss
        batched_losses = self.compute_batched_loss(step_losses,
                                                   original_abstracts,
                                                   predicted_abstracts)
        losses = torch.stack(batched_losses)
        losses = losses / dec_lens_var

        loss = torch.mean(losses)

        return loss.item()

    def run_eval(self, model_dir, train_iter_id):
        dataloader = DataLoader(self.dataset,
                                batch_size=config.batch_size,
                                shuffle=False,
                                num_workers=1,
                                collate_fn=create_batch_collate(
                                    self.vocab, config.batch_size))
        running_avg_loss, iter = 0, 0
        start = time.time()
        # batch = self.batcher.next_batch()
        pg_losses = []
        run_avg_losses = []
        for batch in dataloader:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            print("Iteration:", iter, "  loss:", loss, "  Running avg loss:",
                  running_avg_loss)
            iter += 1

            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start,
                       running_avg_loss))
                start = time.time()

            pg_losses.append(loss)
            run_avg_losses.append(running_avg_loss)

        # Dump val losses
        pickle.dump(
            pg_losses,
            open(
                os.path.join(model_dir,
                             'val_pg_losses_{}.p'.format(train_iter_id)),
                'wb'))
        pickle.dump(
            run_avg_losses,
            open(
                os.path.join(model_dir,
                             'val_run_avg_losses_{}.p'.format(train_iter_id)),
                'wb'))

        return run_avg_losses
예제 #29
0
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)


    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(output_ids, self.vocab,
                                                 (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d example in %d sec'%(counter, time.time() - start))
                start = time.time()

            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)


    def beam_search(self, batch):
        #batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        if config.use_maxpool_init_ctx:
            c_t_0 = max_encoder_output

        dec_h, dec_c = s_t_0 # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        #decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                      log_probs=[0.0],
                      state=(dec_h[0], dec_c[0]),
                      context = c_t_0[0],
                      coverage=(coverage_t_0[0] if config.is_coverage else None))
                 for _ in xrange(config.beam_size)]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h =[]
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(y_t_1, s_t_1,
                                                        encoder_outputs, enc_padding_mask, c_t_1,
                                                        extra_zeros, enc_batch_extend_vocab, coverage_t_1)

            topk_log_probs, topk_ids = torch.topk(final_dist, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in xrange(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in xrange(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].data[0],
                                   log_prob=topk_log_probs[i, j].data[0],
                                   state=state_i,
                                   context=context_i,
                                   coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)
        self.val_batcher = Batcher(config.eval_data_path,
                                   self.vocab,
                                   mode='eval',
                                   batch_size=config.batch_size,
                                   single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)
        self.scheduler = ExponentialLR(self.optimizer, gamma=0.99)
        start_iter, start_loss = 0, 0

        if model_file_path is not None:  #途中から始める場合
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:  #coverageが無しの場合
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch, iter):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []

        words = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            words.append(self.vocab.id2word(final_dist[0].argmax().item()))

            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            # print('step_loss',step_loss)
            # print('step_loss.size()',step_loss.size())
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        if iter % 100 == 0:
            print(words)
            print([self.vocab.id2word(idx.item()) for idx in dec_batch[0]])
            print([self.vocab.id2word(idx.item()) for idx in target_batch[0]])

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def eval_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        # print(loss)
        # print(type(loss))
        # print(loss.data)
        # print(loss.data.item())
        # return loss.data[0]
        return loss.data.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch, iter)
            val_loss = None
            if iter % 100 == 0:
                val_batch = self.val_batcher.next_batch()
                val_loss = self.eval_one_batch(val_batch)
                # print("val_loss",val_loss)
                self.scheduler.step()
                print("lr", self.scheduler.get_lr())

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1
            if iter % print_interval == 0:
                if val_loss is not None:
                    print(
                        'steps %d, seconds for %d batch: %.2f , loss: %f , eval_loss: %f'
                        % (iter, print_interval, time.time() - start, loss,
                           val_loss))
                else:
                    print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                          (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 1000 == 0:
                self.save_model(running_avg_loss, iter)
예제 #31
0
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        '''self.batcher = Batcher(config.oped_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)'''
        self.batches = self.read_opeds(config.oped_data_path, self.vocab,
                                       config.beam_size)

        self.model = Model(model_file_path, is_eval=True)

    def read_opeds(self, config_path, vocab, beam_size):
        file_list = glob.glob(config_path)
        #file_list = os.listdir(config_path)
        batch_list = []
        for file in file_list:
            with open(file, 'rb') as f:
                text = f.read().lower().decode('utf-8')
                text = re.sub('\n', '', text)
                text = re.sub(r'([.,!?()"])', r' \1 ', text).encode('utf-8')
                print(text)

                ex = Example(text, [], vocab)

                # text = text.split()
                # if len(text) > config.max_enc_steps:
                #     text = text[:config.max_enc_steps]
                # enc_input = [vocab.word2id(w.decode('utf-8')) for w in text]
                # assert(sum(enc_input) != 0)

                enc_input = [ex for _ in range(beam_size)]
                batch = Batch(enc_input, vocab, beam_size)
                batch_list.append(batch)
                print(batch.enc_batch)
        return batch_list

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        start = time.time()
        counter = 0
        for batch in self.batches:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            write_results(decoded_words, counter, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d example in %d sec' % (counter, time.time() - start))
                start = time.time()
        '''print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)'''

    def beam_search(self, batch):
        #batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        #decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h,
                                 0).unsqueeze(0), torch.stack(all_state_c,
                                                              0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)
            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs,
                                                  config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size *
                               2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
예제 #32
0
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()

        # 新的架构里写在训练的decode部分
        while batch is not None:

            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.MARK_EOS)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_abstract_sents = batch.original_abstracts_sents[0]
            original_article = batch.original_articles[0]

            # 英文
            # write_for_rouge(original_abstract_sents, decoded_words, counter,
            #                 self._rouge_ref_dir, self._rouge_dec_dir)
            # 中文
            self.write_result(original_article, original_abstract_sents,
                              decoded_words, counter)
            counter += 1
            #     if counter % 1000 == 0:
            #         print('%d example in %d sec'%(counter, time.time() - start))
            #         start = time.time()

            batch = self.batcher.next_batch()

        # print("Decoder has finished reading dataset for single_pass.")
        # print("Now starting ROUGE eval...")
        # results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        # rouge_log(results_dict, self._decode_dir)

    def write_result(self, original_title, reference_summarization,
                     decoded_words, ex_index):
        """
        Write output to file.

        Args:
            reference_sents: list of strings
            decoded_words: list of strings
            ex_index: int, the index with which to label the files
        """
        summarization = ''.join(decoded_words)

        # Write to file
        result_file = os.path.join(self._decode_dir, "result.txt")

        with open(result_file, 'w') as f:
            f.write(original_title + '\t\t' + reference_summarization +
                    '\t\t' + summarization + "\n")

        print("Wrote example %i to file" % ex_index)

    def beam_search(self, batch):
        #batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        #decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [
            Beam(tokens=[self.vocab.word2id(data.MARK_GO)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.MARK_UNK) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))  # 向量
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h,
                                 0).unsqueeze(0), torch.stack(all_state_c,
                                                              0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)
            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs,
                                                  config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):  # 对于不同的句子
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size *
                               2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.MARK_EOS):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(
                        results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]