Example #1
    def __init__(self, args, model_name=None):
        self.args = args
        vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
        self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
        self.batcher = Batcher(args.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=args.batch_size,
                               single_pass=False,
                               args=args)
        self.eval_batcher = Batcher(args.eval_data_path,
                                    self.vocab,
                                    mode='eval',
                                    batch_size=args.batch_size,
                                    single_pass=True,
                                    args=args)
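        # give the Batcher's background threads time to start filling the batch queues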
        time.sleep(15)

        if model_name is None:
            self.train_dir = os.path.join(config.log_root,
                                          'train_%d' % (int(time.time())))
        else:
            self.train_dir = os.path.join(config.log_root, model_name)

        if not os.path.exists(self.train_dir):
            os.mkdir(self.train_dir)

        self.model_dir = os.path.join(self.train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
Example #2
    def __init__(self, train_dir=None, eval_dir=None, vocab=None, vectors=None):
        self.vectors = vectors
        if vocab is None:
            self.vocab = Vocab(config.vocab_path, config.vocab_size)
        else:
            self.vocab = vocab

        print(self.vocab)
        self.batcher_train = Batcher(config.train_data_path, self.vocab, mode='train',
                                     batch_size=config.batch_size, single_pass=False)
        time.sleep(15)
        self.batcher_eval = Batcher(config.eval_data_path, self.vocab, mode='eval',
                                    batch_size=config.batch_size, single_pass=True)
        time.sleep(15)

        cur_time = int(time.time())
        if train_dir is None:
            train_dir = os.path.join(config.log_root, 'train_%d' % (cur_time))
            if not os.path.exists(train_dir):
                os.mkdir(train_dir)

        if eval_dir is None:
            eval_dir = os.path.join(config.log_root, 'eval_%s' % (cur_time))
            if not os.path.exists(eval_dir):
                os.mkdir(eval_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer_train = writer.FileWriter(train_dir)
        self.summary_writer_eval = writer.FileWriter(eval_dir)
Example #3
    def __init__(self):
        self.vocab = Vocab(args.vocab_path, args.vocab_size)
        self.batcher = Batcher(
            args.decode_data_path,
            self.vocab,
            mode='decode',
            batch_size=1,
            single_pass=True)  # support only 1 item at a time
        time.sleep(15)
        vocab_size = self.vocab.size()
        self.beam_size = args.beam_size
        # self.bertClient = BertClient()
        self.encoder = EncoderLSTM(args.hidden_size, self.vocab.size())
        self.decoder = DecoderLSTM(args.hidden_size, self.vocab.size())
        if use_cuda:
            self.encoder = self.encoder.cuda()
            self.decoder = self.decoder.cuda()

        # Prepare the output folder and files
        output_dir = os.path.join(args.logs, "outputs")
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        output_file = os.path.join(output_dir,
                                   "decoder_{}.txt".format(args.output_name))
        self.file = open(output_file, "w+")
Example #4
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        #         self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        #         self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        #         for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
        #             if not os.path.exists(p):
        #                 os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        #         self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
        #                                batch_size=config.beam_size, single_pass=True)
        decode_data_path = "/Users/rowancassius/Desktop/pointer_summarizer-master/training_ptr_gen/decode_file.txt"
        # decode_data_path = "/Users/rowancassius/Desktop/pointer_summarizer-master/training_ptr_gen/data_file.txt"
        self.batcher = Batcher(data_path=decode_data_path,
                               vocab=self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)

        # time.sleep(15)
        time.sleep(2)

        self.model = Model(model_file_path, is_eval=True)
Example #5
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)
        self.val_batcher = Batcher(config.eval_data_path,
                                   self.vocab,
                                   mode='eval',
                                   batch_size=config.batch_size,
                                   single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)
Example #6
class Evaluate(object):
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, 'eval',
                               config.batch_size, single_pass=True)
        time.sleep(5)
        eval_dir = os.path.join(config.log_root, 'eval_%d' % (int(time.time())))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)

    def eval(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1 = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            # assign back to s_t_1 and c_t_1 so decoder state and context carry across steps
            final_dist, s_t_1, c_t_1, _, _ = self.model.decoder(y_t_1, s_t_1,
                                                                encoder_outputs, enc_padding_mask, c_t_1,
                                                                extra_zeros, enc_batch_extend_vocab)

            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.item()  # .data[0] is deprecated in modern PyTorch

    def run_eval(self):
        start = time.time()
        running_avg_loss, iter = 0, 0
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f, loss: %f' % (
                    iter, print_interval, time.time() - start, loss))
                start = time.time()

            batch = self.batcher.next_batch()
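
A minimal usage sketch for this class; the checkpoint path is hypothetical:

    evaluator = Evaluate('log/train_1234/model/model_50000')  # hypothetical checkpoint path
    evaluator.run_eval()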
Example #7
    def __init__(self, data_path, opt, batch_size=config.batch_size):

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='eval', batch_size=batch_size,
                               single_pass=True)

        self.opt = opt
        time.sleep(5)
Example #8
 def __init__(self):
     self.vocab = Vocab(VOCAB_PATH, VOCAB_SIZE)
     self.batcher = Batcher(TRAIN_DATA_PATH, self.vocab, mode='train',
                            batch_size=BATCH_SIZE, single_pass=False)
     self.start_id = self.vocab.word2id(data.START_DECODING)
     self.end_id = self.vocab.word2id(data.STOP_DECODING)
     self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
     self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
     self.model = MyModel().to(DEVICE)
     self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LR)
Example #9
 def __init__(self):
     self.vocab = Vocab(config.vocab_path, config.vocab_size)
     self.train_batcher = Batcher(config.train_data_path,
                                  self.vocab,
                                  hps=config.hps,
                                  single_pass=False)
     self.val_batcher = Batcher(config.eval_data_path,
                                self.vocab,
                                hps=config.hps,
                                single_pass=False)
Example #10
 def __init__(self, opt):
     self.vocab = Vocab(config.vocab_path, config.vocab_size)
     self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                            batch_size=config.batch_size, single_pass=False)
     self.opt = opt
     self.start_id = self.vocab.word2id(data.START_DECODING)
     self.end_id = self.vocab.word2id(data.STOP_DECODING)
     self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
     self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
     time.sleep(5)
Example #11
    def __init__(self, opt):
        '''
        opt needs to contain:
            - model_file_path
            - n_best
            - max_token_seq_len
        '''
        self.opt = opt
        self.device = torch.device('cuda' if use_cuda else 'cpu')

        print("Max article len", config.max_article_len)
        model = Model(config.vocab_size, config.vocab_size,
                      config.max_article_len)

        checkpoint = torch.load(opt["model_file_path"],
                                map_location=lambda storage, location: storage)

        # model saved as:
        # state = {
        #     'iter': iter,
        #     'transformer_state_dict': self.model.state_dict(),
        #     'optimizer': self.optimizer.state_dict(),
        #     'current_loss': running_avg_loss
        # }

        model.load_state_dict(checkpoint['transformer_state_dict'])

        print('[Info] Trained model state loaded.')

        #model.word_prob_prj = nn.LogSoftmax(dim=1)

        self.model = model.to(self.device)

        self.model.eval()

        self._decode_dir = os.path.join(
            config.log_root,
            'decode_%s' % (opt["model_file_path"].split("/")[-1]))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.batch_size,
                               single_pass=True)

        time.sleep(15)

        print('[Info] Summarizer object created.')
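
A minimal usage sketch, assuming the class is named Summarizer (suggested by the log message); the checkpoint path and opt values are hypothetical:

    summarizer = Summarizer({'model_file_path': 'log/train_1234/model/model_50000',
                             'n_best': 1,
                             'max_token_seq_len': config.max_article_len})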
Example #12
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)
Example #13
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, 'eval',
                               config.batch_size, single_pass=True)
        time.sleep(5)
        eval_dir = os.path.join(config.log_root, 'eval_%d' % (int(time.time())))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)
Example #14
    def __init__(self, model_file_path, destination_dir):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.encode_data_path,
                               self.vocab,
                               mode='encode',
                               batch_size=config.batch_size,
                               single_pass=True)
        time.sleep(5)

        self.output = {}
        self.destination_dir = destination_dir
        self.model = Model(model_file_path, is_eval=True)
Example #15
    def __init__(self, model_file_path, model_type="stem", load_batcher=True):

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        if load_batcher:
            self.batcher = Batcher(config.decode_data_path,
                                   self.vocab,
                                   mode='decode',
                                   batch_size=config.beam_size,
                                   single_pass=True)
            time.sleep(15)
        self.model = Model(model_file_path, is_eval=True)
        self.model_type = model_type
Example #16
def load_batches_decode():

    vocab = Vocab(config.vocab_path, config.vocab_size)
    batcher = Batcher(config.decode_data_path, vocab, mode='decode',
                      batch_size=config.beam_size, single_pass=True)

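    # TEST_DATA_SIZE is assumed to be a module-level constant (the CNN/DailyMail test set has 11,490 articles)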
    batches = [None for _ in range(TEST_DATA_SIZE)]
    for i in range(TEST_DATA_SIZE):
        batch = batcher.next_batch()
        batches[i] = batch

    with open("lib/data/batches_test.vocab{}.beam{}.pk.bin".format(vocab.size(), config.beam_size), "wb") as f:
        pickle.dump(batches, f)
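
The pickled batches can later be restored without constructing a Batcher; a minimal sketch, assuming the same naming scheme:

    with open("lib/data/batches_test.vocab{}.beam{}.pk.bin".format(vocab.size(), config.beam_size), "rb") as f:
        batches = pickle.load(f)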
Example #17
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        #train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        train_dir = './train_log'
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
Example #18
    def __init__(self, model_file_path):
        self._decode_dir = os.path.join(config.log_root, 'decode_%d' % (int(time.time())))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
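        # in decode mode the Batcher fills each batch with beam_size copies of a single article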
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)
Example #19
def load_batches_train():

    vocab = Vocab(config.vocab_path, config.vocab_size)
    batcher = Batcher(config.decode_data_path, vocab, mode='train',
                      batch_size=config.batch_size, single_pass=False)

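    # 287,226 is the number of article/abstract pairs in the CNN/DailyMail training set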
    TRAIN_DATA_SIZE = 287226
    num_batches = int(TRAIN_DATA_SIZE / config.batch_size)
    batches = [None for _ in range(num_batches)]
    for i in tqdm(range(num_batches)):
        batch = batcher.next_batch()
        batches[i] = batch

    with open("lib/data/batches_train.vocab{}.batch{}.pk.bin".format(vocab.size(), config.batch_size), "wb") as f:
        pickle.dump(batches, f)
Example #20
 def __init__(self, args):
     self.hparams = hp()
     self.model = Model(self.hparams)
     self.vocab = Vocab(config.vocab_path, self.hparams.vocab_size)
     self.batcher = Batcher(config.train_data_path,
                            self.vocab,
                            mode='train',
                            batch_size=self.hparams.batch_size,
                            single_pass=False)
     self.args = args
     self.start_id = self.vocab.word2id(data.START_DECODING)
     self.end_id = self.vocab.word2id(data.STOP_DECODING)
     self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
     self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
     time.sleep(3)
Example #21
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)

        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        # print("MODE MUST BE train")
        # time.sleep(15)
        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
Example #22
    def __init__(self, use_elmo=False, finetune_glove=False):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        self.use_elmo = use_elmo
        self.finetune_glove = finetune_glove

        time.sleep(15)

        # train_dir was not defined in this snippet; define it following the other examples
        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')

        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)
Example #23
 def __init__(self, model_file_or_model, vocab=None):
     if vocab is None:
         self.vocab = Vocab(config.vocab_path, config.vocab_size)
     else:
         assert isinstance(vocab, Vocab)
         self.vocab = vocab
     self.batcher = Batcher(config.eval_data_path,
                            self.vocab,
                            mode='eval',
                            batch_size=config.batch_size,
                            single_pass=True)
     time.sleep(15)
     if isinstance(model_file_or_model, str):
         self.model = Model(device, model_file_or_model, is_eval=True)
     elif isinstance(model_file_or_model, Model):
         self.model = model_file_or_model
     else:
         raise ValueError("Cannot build model from type %s" %
                          type(model_file_or_model))
Example #24
class Encode(object):
    def __init__(self, model_file_path, destination_dir):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.encode_data_path,
                               self.vocab,
                               mode='encode',
                               batch_size=config.batch_size,
                               single_pass=True)
        time.sleep(5)

        self.output = {}
        self.destination_dir = destination_dir
        self.model = Model(model_file_path, is_eval=True)

    def save_output(self, output, destination_dir):
        if destination_dir is None:
            torch.save(output, "output")
        else:
            torch.save(output, destination_dir)

    def encode_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)

        h, c = self.model.reduce_state(encoder_hidden)
        h, c = h.squeeze(0), c.squeeze(0)
        encodes = torch.cat((h, c), 1)

        for id, encode in zip(batch.original_abstracts, encodes):
            print(encode)
            self.output[id] = encode

    def run_encode(self):
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            self.encode_one_batch(batch)
            batch = self.batcher.next_batch()
        self.save_output(self.output, self.destination_dir)
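
A minimal usage sketch for this class; both paths are hypothetical:

    encoder = Encode('log/train_1234/model/model_50000', 'abstract_encodes.pt')
    encoder.run_encode()  # saves the {abstract: encoding} dict via torch.save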
Example #25
 def __init__(self, opt, vocab, logger, writer, train_num):
     self.vocab = vocab
     self.train_batcher = Batcher(config.train_data_path,
                                  self.vocab,
                                  mode='train',
                                  batch_size=config.batch_size,
                                  single_pass=False)
     self.test_batcher = Batcher(config.test_data_path,
                                 self.vocab,
                                 mode='eval',
                                 batch_size=config.batch_size,
                                 single_pass=True)
     self.opt = opt
     self.start_id = self.vocab.word2id(data.START_DECODING)
     self.end_id = self.vocab.word2id(data.STOP_DECODING)
     self.pad_id = self.vocab.word2id(data.PAD_TOKEN)
     self.unk_id = self.vocab.word2id(data.UNKNOWN_TOKEN)
     self.logger = logger
     self.writer = writer
     self.train_num = train_num
     time.sleep(5)
Example #26
    def __init__(self, args, model_file_path, save_path):
        self.args = args
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, save_path,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)
        vocab = args.vocab_path if args.vocab_path is not None else config.vocab_path
        self.vocab = Vocab(vocab, config.vocab_size, config.embedding_file)
        self.batcher = Batcher(args.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=args.beam_size,
                               single_pass=True,
                               args=args)
        time.sleep(15)

        self.model = Model(self.vocab, model_file_path, is_eval=True)
Example #27
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)
Example #28
    def __init__(self, model_file_path=None):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        if not model_file_path:
            train_dir = os.path.join(config.log_root,
                                     'train_%d' % (int(time.time())))
            if not os.path.exists(train_dir):
                os.mkdir(train_dir)
        else:
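            # recover the original train_dir from a checkpoint path like .../train_123/model/model_5000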
            train_dir = re.sub('/model/model.*', '', model_file_path)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.create_file_writer(train_dir)
Example #29
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.output_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)

        self.checkpoint_dir = os.path.join(train_dir, 'checkpoints')
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.train_summary_writer = tf.summary.create_file_writer(
            os.path.join(train_dir, 'log', 'train'))
        self.eval_summary_writer = tf.summary.create_file_writer(
            os.path.join(train_dir, 'log', 'eval'))
Example #30
    def __init__(self):
        """
        Input:
            vocab_path = "xxx/finished_files/vocab",
            vocab_size = 50000

        Output:
            class object: self.vocab --> (dicts `_word_to_id` and `_id_to_word`)
        """
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        """
        Input:
            train_data_path = "xxx/finished_files/chunked/train_*",
            self.vocab: class object,
            mode = 'train', for training,
            batch_size = 8,
            single_pass = False

        Output:
            class object: self.batcher
                (background threads that fill a queue of Batch objects)
        """
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)
Example #31
    def __init__(self):
        self.vocab = Vocab(args.vocab_path, args.vocab_size)
        sys.stdout.flush()
        self.batcher = Batcher(args.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=args.batch_size,
                               single_pass=False)
        time.sleep(15)
        vocab_size = self.vocab.size()
        self.model = BertLSTMModel(args.hidden_size, self.vocab.size(),
                                   args.max_dec_steps)
        # self.model = Seq2SeqLSTM(args.hidden_size, self.vocab.size(), args.max_dec_steps)
        if use_cuda:
            self.model = self.model.cuda()

        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=args.lr)

        train_logs = os.path.join(args.logs, "train_logs")
        eval_logs = os.path.join(args.logs, "eval_logs")
        self.train_summary_writer = tf.summary.FileWriter(train_logs)
        self.eval_summary_writer = tf.summary.FileWriter(eval_logs)
Example #32
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)
Example #33
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)


    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(output_ids, self.vocab,
                                                 (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                pass  # no [STOP] token was produced

            original_abstract_sents = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract_sents, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d examples in %d sec' % (counter, time.time() - start))
                start = time.time()

            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)


    def beam_search(self, batch):
        #batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        if config.use_maxpool_init_ctx:
            c_t_0 = max_encoder_output

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation: initially there are beam_size copies of the same example
        beams = [Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                      log_probs=[0.0],
                      state=(dec_h[0], dec_c[0]),
                      context = c_t_0[0],
                      coverage=(coverage_t_0[0] if config.is_coverage else None))
                 for _ in range(config.beam_size)]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(y_t_1, s_t_1,
                                                        encoder_outputs, enc_padding_mask, c_t_1,
                                                        extra_zeros, enc_batch_extend_vocab, coverage_t_1)

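            # take 2*beam_size candidates so enough hypotheses remain after [STOP] beams are set aside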
            topk_log_probs, topk_ids = torch.topk(final_dist, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
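
A minimal usage sketch for this class; the checkpoint path is hypothetical:

    decoder = BeamSearch('log/train_1234/model/model_50000')  # hypothetical checkpoint path
    decoder.decode()  # writes reference/decoded files, then runs ROUGE evaluation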
Example #34
class Evaluate(object):
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path, self.vocab, mode='eval',
                               batch_size=config.batch_size, single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)

    def eval_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        if config.use_maxpool_init_ctx:
            c_t_1 = max_encoder_output

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1,attn_dist, p_gen, coverage = self.model.decoder(y_t_1, s_t_1,
                                                                encoder_outputs, enc_padding_mask, c_t_1,
                                                                extra_zeros, enc_batch_extend_vocab, coverage)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.item()  # .data[0] is deprecated in modern PyTorch

    def run_eval(self):
        running_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f, loss: %f' % (
                    iter, print_interval, time.time() - start, running_avg_loss))
                start = time.time()
            batch = self.batcher.next_batch()