Example #1
class Evaluate(object):
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=config.batch_size,
                               single_pass=True)
        self.model_file_path = model_file_path
        time.sleep(5)

        self.model = Model(model_file_path, is_eval=True)

    def eval_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        with torch.no_grad():
            encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
                enc_batch, enc_lens)
            s_t_1 = self.model.reduce_state(encoder_hidden)

            step_losses = []
            for di in range(min(max_dec_len, config.max_dec_steps)):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature,
                    enc_padding_mask, c_t_1, extra_zeros,
                    enc_batch_extend_vocab, coverage, di)

                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1,
                                          target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + config.eps)
                if config.is_coverage:
                    step_coverage_loss = torch.sum(
                        torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage

                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        return loss.item()

    def run_eval(self):
        batch = self.batcher.next_batch()
        loss_list = []
        while batch is not None:
            loss = self.eval_one_batch(batch)
            loss_list.append(loss)
            batch = self.batcher.next_batch()
        return np.mean(loss_list)
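A minimal usage sketch for the Evaluate class above, hedged: it assumes the same config module and a trained checkpoint, and the path below is a hypothetical placeholder.

if __name__ == '__main__':
    # Hypothetical checkpoint path; substitute a real file produced by training.
    model_path = os.path.join(config.log_root, 'best_model', 'model_best_00000')
    evaluator = Evaluate(model_path)
    avg_loss = evaluator.run_eval()  # average per-batch loss over the single-pass eval set
    print('eval loss:', avg_loss)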
Example #2
def main():
    args = get_args()
    vocab = Vocab(args.vocab_path, args.vocab_size)  # create a vocabulary
    hps = get_hps()
    if not args.data_path == "":
        batcher = Batcher(args.data_path, vocab, hps, args.single_pass)
        import pdb
        pdb.set_trace()
        x = batcher.next_batch()
        import pdb
        pdb.set_trace()
        pass
    else:
        with open(args.json_path) as f:
            art = json.load(f)
        article = neologdn.normalize(art['body'])
        abstract = neologdn.normalize(art['title'])
        m = MeCab('-Owakati')
        parsed_article = m.parse(article)
        abs_words = m.parse(abstract).split()
        ex = B.Example(parsed_article, abs_words, vocab, hps)
        b = B.Batch([ex], hps, vocab)
        import pdb
        pdb.set_trace()
        pass
Example #3
def fit_tfidf_vectorizer(hps, vocab):
    if not os.path.exists(
            os.path.join(FLAGS.actual_log_root, 'tfidf_vectorizer')):
        os.makedirs(os.path.join(FLAGS.actual_log_root, 'tfidf_vectorizer'))

    # The model is configured with max_dec_steps=1 because we only ever run one step of the
    # decoder at a time (to do beam search). Note that the batcher is initialized with
    # max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries.
    decode_model_hps = hps._replace(max_dec_steps=1, batch_size=1)

    batcher = Batcher(FLAGS.data_path, vocab, decode_model_hps,
                      single_pass=FLAGS.single_pass)
    all_sentences = []
    while True:
        batch = batcher.next_batch()  # 1 example repeated across batch
        if batch is None:  # finished decoding dataset in single_pass mode
            break
        all_sentences.extend(batch.raw_article_sents[0])

    stemmer = PorterStemmer()

    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_analyzer(self):
            analyzer = super(TfidfVectorizer, self).build_analyzer()
            return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

    tfidf_vectorizer = StemmedTfidfVectorizer(analyzer='word',
                                              stop_words='english',
                                              ngram_range=(1, 3), max_df=0.7)
    tfidf_vectorizer.fit_transform(all_sentences)
    return tfidf_vectorizer
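A hedged follow-up sketch: the returned object is a fitted scikit-learn vectorizer, so new sentences can be projected into the same stemmed n-gram TF-IDF space. The sentences below are made up for illustration.

tfidf_vectorizer = fit_tfidf_vectorizer(hps, vocab)
sample_sentences = ['The quick brown fox jumps over the lazy dog.',
                    'Foxes keep jumping over lazy dogs.']
tfidf_matrix = tfidf_vectorizer.transform(sample_sentences)  # sparse matrix, n_sentences x n_features
print(tfidf_matrix.shape)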
Example #4
def train_generator(args, load_recent=True):
    '''Train the generator via classical approach'''
    logging.debug('Batcher...')
    batcher = Batcher(args.data_dir, args.batch_size, args.seq_length)

    logging.debug('Vocabulary...')
    with open(os.path.join(args.save_dir_gen, 'config.pkl'), 'w') as f:
        cPickle.dump(args, f)
    with open(os.path.join(args.save_dir_gen, 'real_beer_vocab.pkl'),
              'w') as f:
        cPickle.dump((batcher.chars, batcher.vocab), f)

    logging.debug('Creating generator...')
    generator = Generator(args, is_training=True)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=True)) as sess:
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())

        if load_recent:
            ckpt = tf.train.get_checkpoint_state(args.save_dir_gen)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

        for epoch in xrange(args.num_epochs):
            # Anneal learning rate
            new_lr = args.learning_rate * (args.decay_rate**epoch)
            sess.run(tf.assign(generator.lr, new_lr))
            batcher.reset_batch_pointer()
            state = generator.initial_state.eval()

            for batch in xrange(batcher.num_batches):
                start = time.time()
                x, y = batcher.next_batch()
                feed = {
                    generator.input_data: x,
                    generator.targets: y,
                    generator.initial_state: state
                }
                # train_loss, state, _ = sess.run([generator.cost, generator.final_state, generator.train_op], feed)
                train_loss, _ = sess.run([generator.cost, generator.train_op],
                                         feed)
                end = time.time()

                print '{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}' \
                 .format(epoch * batcher.num_batches + batch,
                  args.num_epochs * batcher.num_batches,
                  epoch, train_loss, end - start)

                if (epoch * batcher.num_batches +
                        batch) % args.save_every == 0:
                    checkpoint_path = os.path.join(args.save_dir_gen,
                                                   'model.ckpt')
                    saver.save(sess,
                               checkpoint_path,
                               global_step=epoch * batcher.num_batches + batch)
                    print 'Generator model saved to {}'.format(checkpoint_path)
Example #5
def test_batcher():
    batcher = Batcher(hps.data_path, vocab, hps, hps.single_pass)
    #batcher = newbatcher(vocab, hps, hps.data_path, hps.single_pass)
    #time.sleep(15)
    while True:
        start = time.time()
        #batch = next(batcher)#.next_batch()
        batch = batcher.next_batch()
        print('elapsed:', time.time() - start)
Example #6
def train(params):
    data_loader = Batcher(params)
    params.vocab_size = data_loader.vocab_size

    if not os.path.isdir(params.save_dir):
        os.makedirs(params.save_dir)

    with open(os.path.join(params.save_dir, 'config.pkl'), 'wb') as f:
        cPickle.dump(params, f)
    with open(os.path.join(params.save_dir, 'chars_vocab.pkl'), 'wb') as f:
        cPickle.dump((data_loader.chars, data_loader.vocab), f)

    model = Model(params)

    with tf.Session() as sess:
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(
            os.path.join(params.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
        writer.add_graph(sess.graph)

        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)

        for e in range(params.num_epochs):
            sess.run(tf.assign(model.lr, params.learning_rate * (0.97**e)))

            data_loader.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for b in range(data_loader.num_batches):
                start = time.time()

                x, y = data_loader.next_batch()
                feed = {model.input_data: x, model.targets: y}
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                # Run one training step and fetch the summaries in the same call
                # (a separate sess.run of train_op here would apply the gradient update twice).
                summ, train_loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op],
                    feed)
                writer.add_summary(summ, e * data_loader.num_batches + b)

                end = time.time()
                logging.info(
                    "Epoch #{e} / Batch #{b} -- Loss {train_loss:.3f} "
                    "Time {time_diff:.3f}".format(e=e,
                                                  b=b,
                                                  train_loss=train_loss,
                                                  time_diff=end - start))

            if e % params.save_every == 0 or e == params.num_epochs - 1:
                checkpoint_path = os.path.join(params.save_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=e)
Example #7
    def generate_batch(self, mode):  # mode: train/test/val
        hps = self._hps
        hps['mode'] = mode
        batcher = Batcher(hps['data_path'] + '/{}.bin'.format(mode),
                          self._vocab,
                          hps,
                          single_pass=True)
        while True:
            batch = batcher.next_batch()
            feed_dict = self.make_feed_dict(batch)
            yield [feed_dict['enc_batch'],
                   feed_dict['dec_batch']], feed_dict['target_batch']
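A hedged usage sketch for the generator above, assuming it is a method of a wrapper object (called seq2seq here, a hypothetical name) whose Keras model takes the encoder and decoder batches as inputs and the target batch as labels; tf.keras Model.fit accepts such an endless generator when steps_per_epoch is supplied. The step counts are hypothetical.

seq2seq.model.fit(seq2seq.generate_batch('train'),
                  steps_per_epoch=1000,
                  validation_data=seq2seq.generate_batch('val'),
                  validation_steps=100,
                  epochs=5)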
Example #8
def get_decode_results(sess, model, vocab, hps, data_path):

  eval_batcher = Batcher(data_path, vocab, hps, True)
  total_loss = 0.0
  total_correct_preds = 0.0
  predictions = np.array([])
  original_comments = []
  gold_labels = []
  attention_scores = []
  labelvalues = np.array(["male", "female"])
  predicted_labels = []
  probabilities = np.array([])

  n=0

  while True:
    try:
      eval_batch = eval_batcher.next_batch()
      if eval_batch is None:
        break
      eval_results = model.run_eval_step(sess, eval_batch)
      batch = eval_batch
      batch_size = FLAGS.batch_size
      loss = eval_results['loss']
      correct_predictions = eval_results['correct_predictions']
      predictions = eval_results['predictions']
      predicted_labels = np.concatenate((predicted_labels, labelvalues[predictions]))
      # print eval_results['probs']
      # print eval_results['batch']
      # print batch.enc_batch[0]
      # print batch.enc_batch[1]
      # print batch.enc_batch[2]
      # raw_input()
      probabilities = np.concatenate((probabilities, eval_results['probs']))
      gold_labels += batch.original_labels
      original_comments += batch.original_comments
      attention_scores += list(eval_results['attention_scores'])

      total_loss += loss*batch_size
      total_correct_preds += correct_predictions
      n+=batch_size
    except StopIteration:
      break

  eval_loss = total_loss/n
  accuracy = total_correct_preds/n

  return eval_loss, accuracy, original_comments, gold_labels, predicted_labels, attention_scores, np.array(probabilities, dtype=str)
Example #10
def get_eval_loss(sess, model, vocab, hps, data_path):

  eval_batcher = Batcher(data_path, vocab, hps, True)
  total_loss = 0.0
  total_ce_loss = 0.0
  total_correct_preds = 0.0
  preds = []
  truey = []
  n=0

  if FLAGS.mode == 'decode':
    pass

  while True:
    try:
      eval_batch = eval_batcher.next_batch()
      if eval_batch is None:
        break
      eval_results = model.run_eval_step(sess, eval_batch)

      batch_size = FLAGS.batch_size
      loss = eval_results['loss']
      ce_loss = eval_results['ce_loss']
      correct_predictions = eval_results['correct_predictions']
      predictions = eval_results['predictions']
      true_labels = eval_batch.labels
      preds += list(predictions)
      truey += list(true_labels)

      total_loss += loss*batch_size
      total_ce_loss += ce_loss*batch_size
      total_correct_preds += correct_predictions
      n+=batch_size
    except StopIteration:
      break

  eval_loss = total_loss/n
  eval_ce_loss = total_ce_loss/n
  accuracy = total_correct_preds/n

  print " Precision Score:", precision_score(truey, preds),
  print " Recall Score:", recall_score(truey, preds),
  print " F1 Score:", f1_score(truey, preds)
  print n

  return eval_loss, eval_ce_loss, accuracy
Example #11
    def train(self, images, labels, load_model=True):
        train_log_dir = self.log_dir
        if not tf.gfile.Exists(train_log_dir):
            tf.gfile.MakeDirs(train_log_dir)

        with tf.Graph().as_default() as graph:

            # image_batch, label_batch = utils.get_batch_data(images, labels, batch_size=self.batch_size)

            inputs = tf.placeholder(dtype=tf.float32,
                                    shape=(self.batch_size, self.width,
                                           self.height, 1),
                                    name="inputs")
            outputs = tf.placeholder(dtype=tf.float32,
                                     shape=(self.batch_size, 1, 1,
                                            self.num_classes),
                                     name="outputs")

            predictions = self.build_vgg16(inputs)
            # Specify the loss function:

            tf.losses.softmax_cross_entropy(outputs, predictions)

            total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('losses/total_loss', total_loss)

            # Specify the optimization scheme:
            optimizer = tf.train.AdadeltaOptimizer(
                learning_rate=self.learning_rate)

            # create_train_op that ensures that when we evaluate it to get the loss,
            # the update_ops are done and the gradient updates are computed.
            train_tensor = slim.learning.create_train_op(total_loss, optimizer)

            tf.logging.set_verbosity(tf.logging.INFO)
            best_loss = None
            # prepare
            saver = tf.train.Saver()
            sv = tf.train.Supervisor(
                logdir=self.log_dir,
                is_chief=True,
                saver=saver,
                summary_op=None,
                save_summaries_secs=60,  # save summaries for tensorboard every 60 secs
                save_model_secs=60)  # checkpoint every 60 secs
            summary_writer = sv.summary_writer
            tf.logging.info("Preparing or waiting for session...")
            sess_context_manager = sv.prepare_or_wait_for_session(
                config=utils.get_config())
            tf.logging.info("Created session.")
            # Actually runs training.
            with sess_context_manager as sess:
                batcher = Batcher(images, labels, self.batch_size)
                epoch = 0
                turn = 0
                total_turn = 0
                while True:
                    real_images, real_labels, finished = batcher.next_batch()
                    if finished:
                        epoch += 1
                        turn = 0
                    real_labels = np.eye(self.num_classes)[real_labels]
                    real_labels = np.reshape(
                        real_labels,
                        [real_labels.shape[0], 1, 1, real_labels.shape[1]])
                    feed_dict = {
                        "inputs:0": real_images,
                        "outputs:0": real_labels
                    }
                    _, loss, r = sess.run(
                        [train_tensor, total_loss, predictions], feed_dict)
                    turn += 1
                    total_turn += 1
                    if turn % 100 == 0:
                        tf.logging.info("epch: %d\tturn: %d/%d" %
                                        (epoch, turn, batcher.batch_count))
                        tf.logging.info("total loss: %f" % loss)
                        summary_writer.flush()
Example #12
class BeamSearch(object):
    def __init__(self, model_file_path, data_path, data_class='val'):
        self.data_class = data_class
        if self.data_class not in ['val', 'test']:
            raise ValueError("data_class must be 'val' or 'test'.")

        # model_file_path e.g. --> ../log/{MODE NAME}/best_model/model_best_XXXXX
        model_name = os.path.basename(model_file_path)
        # log_root e.g. --> ../log/{MODE NAME}/
        log_root = os.path.dirname(os.path.dirname(model_file_path))
        # _decode_dir e.g. --> ../log/{MODE NAME}/decode_model_best_XXXXX/
        self._decode_dir = os.path.join(log_root, 'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        self._result_path = os.path.join(self._decode_dir, 'result_%s_%s.txt' \
                                                        % (model_name, self.data_class))
        # remove result file if exist
        if os.path.isfile(self._result_path):
            os.remove(self._result_path)
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(data_path, self.vocab, mode='decode',
                               batch_size=config.beam_size, single_pass=True)
        time.sleep(5)

        self.model = Model(model_file_path, is_eval=True)


    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)


    def beam_search(self, batch):
        # batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0 # 1 x 2H
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation, it has beam_size example initially everything is repeated
        beams = [Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                      log_probs=[0.0],
                      state=(dec_h[0], dec_c[0]),
                      context = c_t_0[0],
                      coverage=(coverage_t_0[0] if config.is_coverage else None))
                 for _ in range(config.beam_size)]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]

            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h =[]
            all_state_c = []
            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)
                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(y_t_1, s_t_1,
                                                        encoder_outputs, encoder_feature, enc_padding_mask, c_t_1,
                                                        extra_zeros, enc_batch_extend_vocab, coverage_t_1, steps)

            log_probs = torch.log(final_dist)
            topk_log_probs, topk_ids = torch.topk(log_probs, config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps:
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                   log_prob=topk_log_probs[i, j].item(),
                                   state=state_i,
                                   context=context_i,
                                   coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)
        return beams_sorted[0]
    
    
    def decode(self):
        start = time.time()
        counter = 0
        bleu_scores = []
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(output_ids, self.vocab,
                                                 (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                decoded_words = decoded_words

            original_articles = batch.original_articles[0]
            original_abstracts = batch.original_abstracts_sents[0]
            reference = original_abstracts[0].strip().split()
            bleu = nltk.translate.bleu_score.sentence_bleu([reference], decoded_words, weights = (0.5, 0.5))
            bleu_scores.append(bleu)

            # write_for_rouge(original_abstracts, decoded_words, counter,
            #                 self._rouge_ref_dir, self._rouge_dec_dir)

            write_for_result(original_articles, original_abstracts, decoded_words, \
                                                self._result_path, self.data_class)

            counter += 1
            if counter % 1000 == 0:
                print('%d examples in %d sec' % (counter, time.time() - start))
                start = time.time()

            batch = self.batcher.next_batch()
        
        '''
        # uncomment this if you successfully install `pyrouge`
        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)
        '''

        if self.data_class == 'val':
            print('Average BLEU score:', np.mean(bleu_scores))
            with open(self._result_path, "a") as f:
                print('Average BLEU score:', np.mean(bleu_scores), file=f)

    def get_processed_path(self):
        # ../log/{MODE NAME}/decode_model_best_XXXXX/result_model_best_2800_{data_class}.txt
        input_path = self._result_path
        temp = os.path.splitext(input_path)
        # ../log/{MODE NAME}/decode_model_best_XXXXX/result_model_best_2800_{data_class}_processed.txt
        output_path = temp[0] + "_processed" + temp[1]
        return input_path, output_path
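A minimal usage sketch for the BeamSearch class, hedged: the checkpoint path is a hypothetical placeholder, and the data path reuses config.eval_data_path from the earlier example (the exact attribute depends on the project's config).

model_file_path = '../log/my_model/best_model/model_best_00000'  # hypothetical checkpoint
searcher = BeamSearch(model_file_path, config.eval_data_path, data_class='val')
searcher.decode()  # beam-search the whole split; for 'val' it also reports average BLEU
result_file, processed_file = searcher.get_processed_path()
print(result_file, processed_file)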
Example #13
from data import Vocab
from batcher import Batcher

import config
import data
import os

FLAGS = config.FLAGS

vocab_in, vocab_out = data.load_dict_data(FLAGS)

batcher_train = Batcher(FLAGS.data_path,
                        vocab_in,
                        vocab_out,
                        FLAGS,
                        data_file='train.txt.tags')
epoch = 0
while True:
    print(epoch)
    while batcher_train.c_epoch == epoch:
        batch = batcher_train.next_batch()
    epoch += 1
    print("done")
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               batch_size=config.batch_size)
        train_dir = os.path.join(config.log_root)
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, loss, iter_step, name=None):
        state = {
            'iter': iter_step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': loss
        }
        if name is None:
            name = 'model_{}_{}'.format(iter_step, loss)
        model_save_path = os.path.join(self.model_dir, name)
        torch.save(state, model_save_path)
        print('saved loss:', loss)
        print('******************')
        #print('\n')

    def setup_train(self, model_file_path=None):
        # Initialize the model
        self.model = Model(model_file_path)
        # List of model parameters
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        # Define the optimizer
        self.optimizer = optim.Adam(params, lr=config.adam_lr)
        #self.optimizer = optim.Adagrad(params, lr=0.15, initial_accumulator_value=0.1, eps=1e-10)

        # Initialize the iteration count and loss
        start_iter, start_loss = 0, 0
        # If an existing model path is passed in, load it and continue training
        if model_file_path is not None:
            print('loading saved model:', model_file_path)
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if USE_CUDA:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.to(DEVICE)

        return start_iter, start_loss

    def train_one_batch(self, batch):
        # enc_batch is the source sequence containing UNK ids
        # c_t_1 is the initial context vector
        # extra_zeros: placeholder probabilities for OOV words, [batch_size, batch.max_art_oovs]
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        # dec_batch is the decoder-input summary sequence (contains UNK); target_batch is the target
        # sequence without UNK: OOV words are replaced by len(vocab) + their relative OOV index
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)
        self.optimizer.zero_grad()

        # [batch, seq_lens, 2*hid_dim],[batch*max(seq_lens), 2*hid_dim],[2, batch, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)

        # (h,c) = ([1, batch, hid_dim], [1, batch, hid_dim])
        # The previous hidden state is bidirectional [2, batch, hid_dim]; reduce it to
        # [1, batch, hid_dim] to serve as the decoder's initial hidden state
        s_t_1 = self.model.reduce_state(encoder_hidden)  # h,c

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            # One summary word: the token id at position di for every example in the batch
            y_t_1 = dec_batch[:, di]
            # final_dist is the probability over the extended vocabulary (larger than the preset vocab size)
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            # Id of the next summary word, [B]
            target = target_batch[:, di]
            # [B,1]
            target_i = target.unsqueeze(1)
            # Gather the predicted probability of each target word from final_dist
            gold_probs = torch.gather(final_dist, 1, target_i).squeeze()

            #print(gold_probs)

            # if gold_probs <= 0:
            #     print('*******loss less than 0 ***********')
            #     gold_probs = 1e-2
            #     print('pro has been modified', gold_probs)
            #     print('\n')

            # Prediction loss for a single word
            # (abs() added as a safeguard)
            step_loss = -torch.log(torch.abs(gold_probs) + 1e-8)

            #print('')
            if config.is_coverage:
                # Take the element-wise min of the current step's attention vector and the
                # accumulated attention of the previous steps (coverage), and sum it as an extra
                # coverage loss that suppresses repetition: it pushes the current attention to be
                # small wherever the accumulated attention is already large (a large accumulated
                # value means the word was likely generated before).
                step_coverage_loss = torch.sum(
                    torch.min(torch.abs(attn_dist), torch.abs(coverage)), 1)
                #print('step_coverage_loss is ', step_coverage_loss)
                # Weight by the cov_loss_wt (lambda) coefficient: how strongly the repetition-suppressing coverage loss counts
                step_loss = step_loss + config.cov_loss_wt * torch.abs(
                    step_coverage_loss)
                # Update the coverage vector to the new accumulated one
                coverage = next_coverage
            # Masked positions do not count toward the loss
            step_mask = dec_padding_mask[:, di]
            step_loss = torch.abs(step_loss) * torch.abs(step_mask)
            step_losses.append(step_loss)

        sum_losses = torch.abs(torch.sum(torch.stack(step_losses, 1), 1))
        # print('sum_losses is ',sum_losses)
        # Overall loss of each sequence
        # print('dec_lens_var is ', dec_lens_var)
        batch_avg_loss = sum_losses / (torch.abs(dec_lens_var) + 1)

        # Overall loss of the whole batch
        loss = torch.mean(batch_avg_loss)
        #print('loss from one_batch is ', loss)

        loss.backward()

        #         self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        #         clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        #         clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, model_file_path=None):
        # Training setup
        iter_data, loss = self.setup_train(model_file_path)
        start = time.time()
        # Total data size; loop over the data iter_loop times
        data_size = 80000
        i = 0
        min_loss = 10000
        cum_loss = 0
        while iter_data < data_size * config.iter_loop:
            # Get the next batch of data
            batch = self.batcher.next_batch()
            iter_data += batch.batch_size
            loss = self.train_one_batch(batch)

            cum_loss += loss
            i += 1
            if i % 10 == 0:
                #print('loss of one batch is', loss)
                avg_loss = cum_loss / 10
                print('cum_loss over 10 batches:', cum_loss)
                print('steps %d, seconds elapsed: %d' % (i, time.time() - start))
                print('avg_loss over 10 batches:', avg_loss)
                start = time.time()
                cum_loss = 0
                # Only start saving after step 100, since the model may have been reloaded
                if avg_loss < min_loss and i > 100:
                    min_loss = avg_loss
                    self.save_model(avg_loss, i, name='best_model2')
Example #15
    tokens = [t if t != '\s' else ' ' for t in chars]
    tokens = ''.join(tokens)

    print(tokens)


if __name__ == "__main__":
    message_start = "Select the run mode for the NN:\n\t1. Greedy Search\n\t2. Beam Search\n-> "
    mode = input(message_start)

    message_iterations = "Specify the number of iterations (min 2500, more is suggested) -> "
    max_iterations = input(message_iterations)

    iteration = 0
    while iteration < int(max_iterations) + 1:
        batch = batcher.next_batch()
        model.train_on_batch(batch.input, batch.target)

        if iteration % 500 == 0:
            print('Names generated after iteration {}:'.format(iteration))

            if int(mode) == 1:
                for i in range(3):
                    make_name(model, vocab, hps)
            else:
                # il "for i in range(3)" funziona solo col greedy, perché il greedy
                # è non deterministico, con il beam ti stampa 3 nomi uguali perché il
                # primo è sempre il più probabile
                make_name_beam(model, vocab, hps)

            print("")
Example #16
            fd = fd.cuda()
        final_lists.append(fd)
    attn_lists = []
    for j in range(max_dec_steps):
        ad = Variable(torch.rand(batch_size, attn_len))
        if use_cuda:
            ad = ad.cuda()
        attn_lists.append(ad)
    return final_lists, attn_lists


final_dists, attn_dists = forward(attn_len=args.max_enc_steps,
                                  max_dec_steps=args.max_dec_steps,
                                  batch_size=args.batch_size,
                                  extended_vsize=args.extended_vsize,
                                  use_cuda=False,
                                  mode=args.mode,
                                  pointer_gen=args.pointer_gen,
                                  use_coverage=args.coverage)

for n in range(args.num_steps):
    print("lalallalalallalalallalalallalallalla:", n)
    batch = dataloader.next_batch()
    batch = batch2var(batch, use_cuda=False)
    loss = loss_function(final_dists, attn_dists, batch)
    print(loss)
    log('loss', loss.data[0], step=n)
    if n % args.check_n == 0:
        save_checkpoint({'step': n + 1}, is_best=False)
    if n > 20:
        break
Example #17
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(5)

        if not os.path.exists(config.log_root):
            os.makedirs(config.log_root)

        self.model_dir = os.path.join(config.log_root, 'train_model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.eval_log = os.path.join(config.log_root, 'eval_log')
        if not os.path.exists(self.eval_log):
            os.mkdir(self.eval_log)
        self.summary_writer = tf.compat.v1.summary.FileWriter(self.eval_log)

    def save_model(self, running_avg_loss, iter, mode):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        if mode == 'train':
            save_model_dir = self.model_dir
        else:
            best_model_dir = os.path.join(config.log_root, 'best_model')
            if not os.path.exists(best_model_dir):
                os.mkdir(best_model_dir)
            save_model_dir = best_model_dir

        if len(os.listdir(save_model_dir)) > 0:
            shutil.rmtree(save_model_dir)
            time.sleep(2)
            os.mkdir(save_model_dir)
        train_model_path = os.path.join(save_model_dir,
                                        'model_best_%d' % (iter))
        torch.save(state, train_model_path)
        return train_model_path

    def setup_train(self,
                    model_file_path=None,
                    emb_v_path=None,
                    emb_list_path=None,
                    vocab=None,
                    log=None):
        self.model = Model(model_file_path)
        if model_file_path is None:
            set_embedding(self.model,
                          emb_v_path=emb_v_path,
                          emb_list_path=emb_list_path,
                          vocab=self.vocab,
                          use_cuda=use_cuda,
                          log=log)
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        if config.mode == 'MLE':
            self.optimizer = Adagrad(params,
                                     lr=0.15,
                                     initial_accumulator_value=0.1)
        else:
            self.optimizer = Adam(params, lr=initial_lr)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
        return start_iter, start_loss

    def train_one_batch(self, batch, alpha, beta):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        nll_list = []
        gen_summary = torch.LongTensor(
            config.batch_size * [config.sample_size * [[2]]])  # B x S x 1
        if use_cuda: gen_summary = gen_summary.cuda()
        preds_y = gen_summary.squeeze(2)  # B x S
        for di in range(min(config.max_dec_steps, dec_batch.size(1))):
            # Select the current input word
            p1 = np.random.uniform()
            if p1 < alpha:  # use ground truth word
                y_t_1 = dec_batch[:, di]
            else:  # use decoded word
                y_t_1 = preds_y[:, 0]

            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)

            # Select the current output word
            p2 = np.random.uniform()
            if p2 < beta:  # sample the ground truth word
                target = target_batch[:, di]
                sampled_batch = torch.stack(config.sample_size * [target],
                                            1)  # B x S
            else:  # randomly sample a word with given probabilities
                sampled_batch = torch.multinomial(final_dist,
                                                  config.sample_size,
                                                  replacement=True)  # B x S

            # Compute the NLL
            probs = torch.gather(final_dist, 1, sampled_batch).squeeze()
            step_nll = -torch.log(probs + config.eps)

            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_nll = step_nll + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            nll_list.append(step_nll)

            # Store the decoded words in preds_y
            preds_y = gen_preds(sampled_batch, use_cuda)
            # Add the decoded words into gen_summary (mixed with ground truth and decoded words)
            gen_summary = torch.cat((gen_summary, preds_y.unsqueeze(2)),
                                    2)  # B x S x L

        # compute the REINFORCE score
        nll = torch.sum(torch.stack(nll_list, 2), 2)  # B x S
        all_rewards, avg_reward = compute_reward(batch, gen_summary,
                                                 self.vocab, config.mode,
                                                 use_cuda)  # B x S, 1
        batch_loss = torch.sum(nll * all_rewards, dim=1)  # B
        loss = torch.mean(batch_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()
        return loss.item(), avg_reward.item()

    def trainIters(self, n_iters, model_file_path=None):
        if config.mode not in [
                "MLE", "RL", "GTI", "SO", "SIO", "DAGGER", "DAGGER*"
        ]:
            print("\nTRAINING MODE ERROR\n")
            raise ValueError
        # log file path
        log_path = os.path.join(config.log_root, 'log')
        log = open(log_path, 'w')
        print_log("==============================", file=log)
        iter, running_avg_loss = self.setup_train(
            model_file_path,
            emb_v_path=config.emb_v_path,
            emb_list_path=config.vocab_path,
            vocab=self.vocab,
            log=log)
        min_val_loss = np.inf

        alpha = config.alpha
        beta = config.beta
        k1 = config.k1
        k2 = config.k2
        delay = iter  # set to 0 in the original code (wyu-du)

        print("\nLog root is %s" % config.log_root)
        print_log("Train mode is %s" % config.mode, file=log)
        print_log("k1: %s, k2: %s" % (config.k1, config.k2), file=log)
        print_log("==============================", file=log)

        cur_time = time.time()
        while iter < n_iters:
            if config.mode == 'RL':
                alpha = 0.
                beta = 0.
            elif config.mode == 'GTI':
                alpha = 1.
                beta = 0.
            elif config.mode == 'SO':
                alpha = 1.
                beta = k2 / (k2 + np.exp((iter - delay) / k2))
            elif config.mode == 'SIO':
                alpha *= k1
                if alpha < 0.01:
                    beta = k2 / (k2 + np.exp((iter - delay) / k2))
                else:
                    beta = 1.
                    delay += 1
            elif config.mode == 'DAGGER':
                alpha *= k1
                beta = 1.
            elif config.mode == 'DAGGER*':
                alpha = config.alpha
                beta = 1.
            else:
                alpha = 1.
                beta = 1.

            batch = self.batcher.next_batch()
            loss, avg_reward = self.train_one_batch(batch, alpha, beta)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % config.print_interval == 0:
                print_log('steps %d, current_loss: %f, avg_reward: %f, alpha: %f, beta: %f, delay: %d' % \
                            (iter, loss, avg_reward, alpha, beta, delay), file=log)

            if iter % config.save_model_iter == 0:
                model_file_path = self.save_model(running_avg_loss,
                                                  iter,
                                                  mode='train')
                evl_model = Evaluate(model_file_path)
                val_avg_loss = evl_model.run_eval()
                if val_avg_loss < min_val_loss:
                    min_val_loss = val_avg_loss
                    best_model_file_path = self.save_model(running_avg_loss,
                                                           iter,
                                                           mode='eval')
                    print_log('Save best model at %s' % best_model_file_path,
                              file=log)
                print_log('steps %d, train_loss: %f, val_loss: %f, time: %ds' % \
                                        (iter, loss, val_avg_loss, time.time()-cur_time), file=log)
                # write val_loss into tensorboard
                loss_sum = tf.compat.v1.Summary()
                loss_sum.value.add(tag='val_avg_loss',
                                   simple_value=val_avg_loss)
                self.summary_writer.add_summary(loss_sum, global_step=iter)
                self.summary_writer.flush()
                cur_time = time.time()

        log.close()
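For reference, the 'SO' branch above anneals beta with an inverse-sigmoid schedule, beta = k2 / (k2 + exp((iter - delay) / k2)). A small self-contained sketch of that curve, using k2 = 500 and delay = 0 purely for illustration (the real values come from config):

import numpy as np

k2, delay = 500.0, 0  # hypothetical values for illustration
for step in (0, 250, 1000, 2500, 5000):
    beta = k2 / (k2 + np.exp((step - delay) / k2))
    print(step, round(float(beta), 4))
# beta stays close to 1 early in training and decays toward 0 later,
# gradually replacing ground-truth target words with sampled ones.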
Example #18
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage, wr_attention = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di, wr_attention)

            target = target_batch[:, di]
            print(target)
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()

            step_loss = -torch.log(gold_probs + config.eps)

            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)

                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)
        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()

        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)
            print("loss: ", loss, "step:", iter)
            if (math.isnan(loss)):
                exit()

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
Example #19
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        train_dir = os.path.join(config.log_root, 'train_{}'.format(stamp))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter_step):
        """保存模型"""
        state = {
            'iter': iter_step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime()) 
        model_save_path = os.path.join(self.model_dir, 'model_{}_{}'.format(iter_step, stamp))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        """模型初始化或加载、初始化迭代次数、损失、优化器"""
        # 初始化模型
        self.model = Model(model_file_path)
        # 模型参数的列表
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        total_params = sum(param.nelement() for param in params)
        print('The number of params of model: %.3f million\n' % (total_params / 1e6))  # million
        # Define the optimizer
        # self.optimizer = optim.Adam(params, lr=config.adam_lr)
        # Use AdagradCustom as the optimizer
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = AdagradCustom(params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc)
        # Initialize the iteration count and loss
        start_iter, start_loss = 0, 0
        # If an existing model path is passed in, load it and continue training
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location = lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if USE_CUDA:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.to(DEVICE)

        return start_iter, start_loss

    def train_one_batch(self, batch):
        """
        训练一个batch,返回该batch的loss。
        enc_batch:             torch.Size([16, 400]), 16篇文章的编码,不足400词的用pad的编码补足, oov词汇用0编码;
        enc_padding_mask:      torch.Size([16, 400]), 对应pad的位置为0,其余为1;
        enc_lens:              numpy.ndarray, 列表内每个元素表示每篇article的单词数;
        enc_batch_extend_vocab:torch.Size([16, 400]), 16篇文章的编码;oov词汇用超过词汇表的编码;
        extra_zeros:           torch.Size([16, 文章oov词汇数量]) zero tensor;
        c_t_1:                 torch.Size([16, 512]) zero tensor;
        coverage:              Variable(torch.zeros(batch_size, max_enc_seq_len)) if is_coverage==True else None;coverage模式时后续有值
        ----------------------------------------
        dec_batch:             torch.Size([16, 100]) 摘要编码含有开始符号编码以及PAD;
        dec_padding_mask:      torch.Size([16, 100]) 对应pad的位置为0,其余为1;
        max_dec_len:           标量,摘要词语数量,不包含pad
        dec_lens_var:          torch.Size([16] 摘要词汇数量         
        target_batch:          torch.Size([16, 100]) 目标摘要编码含有STOP符号编码以及PAD
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)
        self.optimizer.zero_grad()
        """
        # 记得修改Batch类添加vocab属性
 
        print("模型输入文章编码:", "*"*100)
        print("enc_batch:", enc_batch, enc_batch.size())
        print("enc_batch[-1]:", enc_batch[-1])
        # print("batch._id_to_word:", batch.vocab._id_to_word)
        print("enc_batch[-1]原文:", [batch.vocab.id2word(idx) for idx in enc_batch[-1].cpu().numpy()])
        print("-"*50)
        print("enc_padding_mask:", enc_padding_mask, enc_padding_mask.size())
        print("-"*50)
        print("enc_lens:", enc_lens, enc_lens.shape)
        print("-"*50)
        print("enc_batch_extend_vocab", enc_batch_extend_vocab, enc_batch_extend_vocab.size())
        print("enc_batch_extend_vocab[-1]:", enc_batch_extend_vocab[-1])
        print("enc_batch_extend_vocab[-1]的原文:", [batch.vocab.id2word(idx) if idx<50000 else '[UNK]+{}'.format(idx-50000) for idx in enc_batch_extend_vocab[-1].cpu().numpy()])
        print("-"*50)
        print("extra_zeros:", extra_zeros, extra_zeros.size())
        print("-"*50)
        print("c_t_1:", c_t_1, c_t_1.size())
        print("-"*50)
        print("coverage:", coverage)
        print("*"*100)
        
        print("模型输入摘要编码,包括源和目标:", "*"*100)
        print("dec_batch:", dec_batch, dec_batch.size())
        print("dec_batch[0]:", dec_batch[0])
        # print("batch._id_to_word:", batch.vocab._id_to_word)
        print("dec_batch[0]原文:", [batch.vocab.id2word(idx) for idx in dec_batch[0].cpu().numpy()])
        print("-"*50)
        print("dec_padding_mask:", dec_padding_mask, dec_padding_mask.size())
        print("-"*50)
        print("max_dec_len:", max_dec_len)
        print("-"*50)
        print("dec_lens_var", dec_lens_var, dec_lens_var.size())
        print("-"*50)
        print("target_batch:", target_batch, target_batch.size())
        print("-"*50)
        print("target_batch[0]:", target_batch[0], target_batch[0].size())
        print("target_batch[0]的原文:", [batch.vocab.id2word(idx) if idx<50000 else '[UNK]+{}'.format(idx-50000) for idx in target_batch[0].cpu().numpy()])
        print("*"*100)
        input("任意键继续>>>")
        """
        # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)   # (h,c) = ([1, B, hid_dim], [1, B, hid_dim])
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]      # one summary token: the token id at the same position in every sequence of the batch
            # print("y_t_1:", y_t_1, y_t_1.size())
            final_dist, s_t_1,  c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(y_t_1, s_t_1,
                                                        encoder_outputs, encoder_feature, enc_padding_mask, c_t_1,
                                                        extra_zeros, enc_batch_extend_vocab, coverage, di)
            target = target_batch[:, di]  # id of the next summary token (the gold target)
            # print("target-iter:", target, target.size())
            # print("final_dist:", final_dist, final_dist.size())
            # input("go on>>")
            # final_dist is the probability over every token in the extended vocabulary, whose size can exceed the preset 50_000
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()   # pick out the probability of the target token (gold_probs)
            step_loss = -torch.log(gold_probs + config.eps)  # maximizing gold_probs == minimizing step_loss (hence the minus sign)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
                
            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses/dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        # Training setup: build/load the model and optimizer, get the starting iteration count and loss
        iter_step, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter_step < n_iters:
            # Fetch the next batch of data
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter_step)
            iter_step += 1

            if iter_step % 100 == 0:
                self.summary_writer.flush()
            
            # print_interval = 1000
            if iter_step % 100 == 0:
                # lr = self.optimizer.state_dict()['param_groups'][0]['lr']
                print('steps %d, seconds for %d steps: %.2f, loss: %f' % (iter_step, 100,
                                                                          time.time() - start, loss))
                start = time.time()
            # Save the model every 1000 iterations
            if iter_step % 1000 == 0:
                self.save_model(running_avg_loss, iter_step)
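
As an aside, the per-step loss above is just the negative log-probability of the gold token, picked out of final_dist with torch.gather. A self-contained toy sketch of that one step (all tensor values invented for illustration):

import torch

# Toy batch: 2 examples, extended vocabulary of 5 tokens (values invented).
final_dist = torch.tensor([[0.10, 0.20, 0.40, 0.20, 0.10],
                           [0.05, 0.70, 0.10, 0.10, 0.05]])
target = torch.tensor([2, 1])   # gold token id for each example
eps = 1e-12

gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze(1)
step_loss = -torch.log(gold_probs + eps)
print(gold_probs)  # tensor([0.4000, 0.7000])
print(step_loss)   # tensor([0.9163, 0.3567]), i.e. -log(0.4) and -log(0.7)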
Пример #20
0
class Evaluate(object):
    def __init__(self, model_file_path):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.eval_data_path,
                               self.vocab,
                               mode='eval',
                               batch_size=config.batch_size,
                               single_pass=True)
        time.sleep(15)
        model_name = os.path.basename(model_file_path)

        eval_dir = os.path.join(config.log_root, 'eval_%s' % (model_name))
        if not os.path.exists(eval_dir):
            os.mkdir(eval_dir)
        self.summary_writer = tf.summary.FileWriter(eval_dir)

        self.model = Model(model_file_path, is_eval=True)

    def eval_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.item()  # .data[0] is deprecated in modern PyTorch; .item() returns a Python float

    def run_eval(self):
        running_avg_loss, iter = 0, 0
        start = time.time()
        batch = self.batcher.next_batch()
        while batch is not None:
            loss = self.eval_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start,
                       running_avg_loss))
                start = time.time()
            batch = self.batcher.next_batch()
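
run_eval above smooths the per-batch loss with calc_running_avg_loss, whose implementation is not shown in this snippet. A minimal sketch of what such a smoother typically looks like (the 0.99 decay and the "0 means uninitialized" convention are assumptions, not taken from this code):

def running_avg(loss, running_avg_loss, decay=0.99):
    # Exponential moving average; 0 is treated as "not initialized yet" (assumed convention).
    if running_avg_loss == 0:
        return loss
    return decay * running_avg_loss + (1 - decay) * loss

avg = 0
for batch_loss in [5.0, 4.0, 3.0]:
    avg = running_avg(batch_loss, avg)
print(avg)  # ~4.97: the average drifts only slowly toward the most recent losses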
Пример #21
0
out_bid = tf.nn.softmax(tf.matmul(output, softmax_w), name='out_bid')

for i, next_i in enumerate(next_state):
    tf.identity(next_i.c, name='next_c_{}'.format(i))
    tf.identity(next_i.h, name='next_h_{}'.format(i))

cost = tf.losses.softmax_cross_entropy(out_bid_target, out_bid_logit)

train_step = tf.train.AdamOptimizer(0.001).minimize(cost)

batch = Batcher(n_examples, batch_size)
cost_batch = Batcher(n_examples, 10000)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(max_to_keep=20)

    for i in range(n_iterations):
        x_batch, y_batch = batch.next_batch([X_train, y_train])
        if i % display_step == 0:
            x_cost, y_cost = cost_batch.next_batch([X_train, y_train])
            c_train = sess.run(cost, feed_dict={seq_in: x_cost, seq_out: y_cost, keep_prob: 1.0})
            c_valid = sess.run(cost, feed_dict={seq_in: X_val, seq_out: y_val, keep_prob: 1.0})
            print('{}. c_train={} c_valid={}'.format(i, c_train, c_valid))
            sys.stdout.flush()
            saver.save(sess, model_path, global_step=i)
        sess.run(train_step, feed_dict={seq_in: x_batch, seq_out: y_batch, keep_prob: 0.8})

    saver.save(sess, model_path, global_step=n_iterations)
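
The Batcher used in this snippet is not defined here; from its call sites (Batcher(n_examples, batch_size) and next_batch([X_train, y_train])) it appears to hand out aligned random mini-batches from parallel arrays. A hypothetical minimal stand-in, written only from that usage and not from the original implementation:

import numpy as np

class MiniBatcher:
    """Illustrative stand-in: returns aligned random mini-batches from parallel arrays."""
    def __init__(self, n_examples, batch_size):
        self.n_examples = n_examples
        self.batch_size = batch_size

    def next_batch(self, arrays):
        # Sample one set of row indices and apply it to every array, keeping rows aligned.
        idx = np.random.randint(0, self.n_examples, self.batch_size)
        return [a[idx] for a in arrays]

# Usage mirroring the loop above:
X_train = np.random.rand(1000, 8)
y_train = np.random.rand(1000, 1)
batcher = MiniBatcher(n_examples=1000, batch_size=64)
x_batch, y_batch = batcher.next_batch([X_train, y_train])
print(x_batch.shape, y_batch.shape)  # (64, 8) (64, 1)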
Пример #22
0
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(5)

        # print("BATCH")
        # print(self.batcher)

        if not os.path.exists(config.log_root):
            os.mkdir(config.log_root)

        self.model_dir = os.path.join(config.log_root, 'train_model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.eval_log = os.path.join(config.log_root, 'eval_log')
        if not os.path.exists(self.eval_log):
            os.mkdir(self.eval_log)
        self.summary_writer = tf.compat.v1.summary.FileWriter(self.eval_log)

    def save_model(self, running_avg_loss, iter, mode):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        if mode == 'train':
            save_model_dir = self.model_dir
        else:
            best_model_dir = os.path.join(config.log_root, 'best_model')
            if not os.path.exists(best_model_dir):
                os.mkdir(best_model_dir)
            save_model_dir = best_model_dir

        if len(os.listdir(save_model_dir)) > 0:
            shutil.rmtree(save_model_dir)
            time.sleep(2)
            os.mkdir(save_model_dir)
        train_model_path = os.path.join(save_model_dir,
                                        'model_best_%d' % (iter))
        torch.save(state, train_model_path)
        return train_model_path

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        if config.mode == 'MLE':
            self.optimizer = Adagrad(params,
                                     lr=0.15,
                                     initial_accumulator_value=0.1)
        else:
            self.optimizer = Adam(params, lr=initial_lr)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']
        return start_iter, start_loss

    def train_one_batch(self, batch, alpha, beta):

        #
        # print("BATCH")
        # print(batch)


        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        #
        # print("ENC_BATCH")
        # print(len(enc_batch))
        # print(len(enc_batch[0]))
        # print((enc_batch[0]))
        #
        # print("enc_padding_mask")
        # print(enc_padding_mask)
        # print(len(enc_padding_mask))
        # print(len(enc_padding_mask[0]))

        # print("enc_lens")
        # print(enc_lens)
        # print("enc_batch_extend_vocab")
        # print(enc_batch_extend_vocab)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)

        # print("encoder_outputs")
        # print(encoder_outputs.siz)

        s_t_1 = self.model.reduce_state(encoder_hidden)

        nll_list = []

        # what does sample_size mean?

        gen_summary = torch.LongTensor(
            config.batch_size * [config.sample_size * [[2]]])  # B x S x 1

        # print("gen_summary")
        # print(gen_summary.size())
        # print(gen_summary)

        if use_cuda: gen_summary = gen_summary.cuda()
        preds_y = gen_summary.squeeze(2)  # B x S

        # TODO: Print Gold Here!!!!
        # print("preds_y")
        # print(preds_y.size())
        # print(preds_y)
        # print(self.vocab.size())
        # print("temp")
        # from data import outputids2words
        # temp = outputids2words(list(map(lambda x : x.item(), dec_batch[1])),self.vocab,None)
        # print(temp)
        # # for item in dec_batch[1]:
        # #     temp = self.vocab.id2word(item.item())
        # #     from data import outputids2words(dec_batch[1])
        # #     print(temp)

        from data import outputids2words

        # print("dec_batch")
        # print(dec_batch[0])
        # temp = outputids2words(list(map(lambda x : x.item(), dec_batch[0])),self.vocab,None)
        # print(temp)
        # print()
        # print("target_batch")
        # print(target_batch[0])
        # temp = outputids2words(list(map(lambda x : x.item(), target_batch[0])),self.vocab,None)
        # print(temp)
        # print()

        for di in range(min(config.max_dec_steps, dec_batch.size(1))):
            # Select the current input word
            p1 = np.random.uniform()
            if p1 < alpha:  # use ground truth word
                y_t_1 = dec_batch[:, di]
            else:  # use decoded word
                y_t_1 = preds_y[:, 0]

            # print("y_t_1")
            # # print(y_t_1)
            # print("dec_batch")
            # print(dec_batch)
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)

            # Select the current output word
            p2 = np.random.uniform()
            if p2 < beta:  # sample the ground truth word
                target = target_batch[:, di]
                sampled_batch = torch.stack(config.sample_size * [target],
                                            1)  # B x S
            else:  # randomly sample a word with given probabilities
                sampled_batch = torch.multinomial(final_dist,
                                                  config.sample_size,
                                                  replacement=True)  # B x S

            # Compute the NLL
            probs = torch.gather(final_dist, 1, sampled_batch).squeeze()
            step_nll = -torch.log(probs + config.eps)

            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_nll = step_nll + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage
            nll_list.append(step_nll)

            # Store the decoded words in preds_y
            preds_y = gen_preds(sampled_batch, use_cuda)
            # Add the decoded words into gen_summary (mixed with ground truth and decoded words)
            gen_summary = torch.cat((gen_summary, preds_y.unsqueeze(2)),
                                    2)  # B x S x L

        # compute the REINFORCE score
        nll = torch.sum(torch.stack(nll_list, 2), 2)  # B x S
        all_rewards, avg_reward = compute_reward(batch, gen_summary,
                                                 self.vocab, config.mode,
                                                 use_cuda)  # B x S, 1
        batch_loss = torch.sum(nll * all_rewards, dim=1)  # B
        loss = torch.mean(batch_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()
        return loss.item(), avg_reward.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        min_val_loss = np.inf

        alpha = config.alpha
        beta = config.beta
        k1 = config.k1
        k2 = config.k2
        delay = 0
        while iter < n_iters:
            if config.mode == 'RL':
                alpha = 0.
                beta = 0.
            elif config.mode == 'GTI':
                alpha = 1.
                beta = 0.
            elif config.mode == 'SO':
                alpha = 1.
                beta = k2 / (k2 + np.exp((iter - delay) / k2))
            elif config.mode == 'SIO':
                alpha *= k1
                if alpha < 0.01:
                    beta = k2 / (k2 + np.exp((iter - delay) / k2))
                else:
                    beta = 1.
                    delay += 1
            elif config.mode == 'DAGGER':
                alpha *= k1
                beta = 1.
            elif config.mode == 'DAGGER*':
                alpha = config.alpha
                beta = 1.
            else:
                alpha = 1.
                beta = 1.

            batch = self.batcher.next_batch()
            loss, avg_reward = self.train_one_batch(batch, alpha, beta)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % config.print_interval == 0:
                print('steps %d, current_loss: %f, avg_reward: %f' %
                      (iter, loss, avg_reward))

            if iter % config.save_model_iter == 0:
                model_file_path = self.save_model(running_avg_loss,
                                                  iter,
                                                  mode='train')
                evl_model = Evaluate(model_file_path)
                val_avg_loss = evl_model.run_eval()
                if val_avg_loss < min_val_loss:
                    min_val_loss = val_avg_loss
                    best_model_file_path = self.save_model(running_avg_loss,
                                                           iter,
                                                           mode='eval')
                    print('Save best model at %s' % best_model_file_path)
                print('steps %d, train_loss: %f, val_loss: %f' %
                      (iter, loss, val_avg_loss))
                # write val_loss into tensorboard
                loss_sum = tf.compat.v1.Summary()
                loss_sum.value.add(tag='val_avg_loss',
                                   simple_value=val_avg_loss)
                self.summary_writer.add_summary(loss_sum, global_step=iter)
                self.summary_writer.flush()
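
In the 'SO' and 'SIO' branches above, beta follows an inverse-sigmoid decay k2 / (k2 + exp((iter - delay) / k2)), so sampling from the ground truth fades out as training proceeds. A small sketch of how that schedule behaves (the k2 value is chosen only for illustration):

import numpy as np

def beta_schedule(step, k2, delay=0):
    # Inverse-sigmoid decay: close to 1 early in training, falling toward 0 later.
    return k2 / (k2 + np.exp((step - delay) / k2))

k2 = 1000.0
for step in (0, 1000, 5000, 10000):
    print(step, round(beta_schedule(step, k2), 3))
# 0 0.999, 1000 0.997, 5000 0.871, 10000 0.043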
Пример #23
0
batch = Batcher(n_examples, batch_size)
cost_train_batch = Batcher(n_examples, 10000)
cost_val_batch = Batcher(100000, 10000)

# run the session

model_path = sys.argv[3]

with tf.Session() as sess:
    sess.run(init)

    saver = tf.train.Saver(max_to_keep=100)

    for iteration in range(n_iterations // display_step):
        for i in range(display_step):
            x_batch, y_batch = batch.next_batch([X_train, y_train])
            train_step.run(feed_dict={
                X: x_batch,
                Y: y_batch,
                keep_prob: dropout_keep
            })

        saver.save(sess, model_path, global_step=iteration * display_step)

        sys.stdout.write('*')
        x_batch_c, y_batch_c = cost_train_batch.next_batch([X_train, y_train])
        x_batch_v, y_batch_v = cost_val_batch.next_batch([X_val, y_val])
        c = sess.run(cost,
                     feed_dict={
                         X: x_batch_c,
                         Y: y_batch_c,
Пример #24
0
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        train_dir = os.path.join(config.log_root)
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)


#         self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter_step):
        """保存模型"""
        state = {
            'iter': iter_step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        model_save_path = os.path.join(self.model_dir,
                                       'model_{}'.format(iter_step))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        # Initialize the model
        self.model = Model(model_file_path)
        # List of all trainable model parameters
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        # Define the optimizer
        self.optimizer = optim.Adam(params, lr=config.adam_lr)
        # Initialize the iteration count and loss
        start_iter, start_loss = 0, 0
        # If an existing model checkpoint path is given, load it and resume training
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if USE_CUDA:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.to(DEVICE)

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)
        self.optimizer.zero_grad()
        # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(
            encoder_hidden)  # (h,c) = ([1, B, hid_dim], [1, B, hid_dim])
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # one summary token: the token id at the same position in every sequence of the batch
            # print("y_t_1:", y_t_1, y_t_1.size())
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]  # id of the next summary token (the gold target)
            # print("target-iter:", target, target.size())
            # print("final_dist:", final_dist, final_dist.size())
            # input("go on>>")
            # final_dist is the probability over every token in the extended vocabulary, whose size can exceed the preset 50_000
            gold_probs = torch.gather(
                final_dist, 1,
                target.unsqueeze(1)).squeeze()  # pick out the probability of the target token (gold_probs)
            step_loss = -torch.log(
                gold_probs + config.eps)  # maximizing gold_probs == minimizing step_loss (hence the minus sign)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        # Training setup: build/load the model and optimizer, get the starting iteration count and loss
        iter_step, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter_step < n_iters:
            # Fetch the next batch of data
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter_step)
            iter_step += 1
            if running_avg_loss < 0.01:
                break

            if iter_step % 100 == 0:
                print('steps %d, seconds for 100 steps: %.2f' %
                      (iter_step, time.time() - start))
                print('loss:', loss)

                start = time.time()

            if iter_step % 500 == 0 and running_avg_loss > 0.001:
                self.save_model(running_avg_loss, iter_step)
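
When config.is_coverage is enabled, the training loops above add sum(min(attn_dist, coverage)) to each step's loss, which penalizes attending again to source positions that have already received attention. A toy illustration of that term (numbers invented):

import torch

attn_dist = torch.tensor([[0.6, 0.3, 0.1],
                          [0.2, 0.2, 0.6]])   # current attention over 3 source positions
coverage  = torch.tensor([[0.5, 0.0, 0.0],
                          [0.0, 0.9, 0.1]])   # attention accumulated over previous steps

# Overlap between current and past attention; large when the decoder re-attends.
step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
print(step_coverage_loss)  # tensor([0.5000, 0.3000])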
Пример #25
0
def main(unused_argv):
    print("unused_argv: ", unused_argv)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    tf.logging.set_verbosity(
        tf.logging.INFO)  # choose what level of logging you want
    tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

    # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
    FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
    if not os.path.exists(FLAGS.log_root):
        if FLAGS.mode == "train":
            os.makedirs(FLAGS.log_root)
        else:
            raise Exception(
                "Logdir %s doesn't exist. Run in train mode to create it." %
                (FLAGS.log_root))
    print("FLAGS.vocab_size: ", FLAGS.vocab_size)
    vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size)  # create a vocabulary
    print("vocab size: ", vocab.size())
    # If in decode mode, set batch_size = beam_size
    # Reason: in decode mode, we decode one example at a time.
    # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
    if FLAGS.mode == 'decode':
        FLAGS.batch_size = FLAGS.beam_size

    # If single_pass=True, check we're in decode mode
    if FLAGS.single_pass and FLAGS.mode != 'decode':
        raise Exception(
            "The single_pass flag should only be True in decode mode")

    # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
    hparam_list = [
        'mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag',
        'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim',
        'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage',
        'cov_loss_wt', 'pointer_gen', 'fine_tune', 'train_size', 'subred_size',
        'use_doc_vec', 'use_multi_attn', 'use_multi_pgen', 'use_multi_pvocab',
        'create_ckpt'
    ]
    hps_dict = {}
    for key, val in FLAGS.__flags.items():  # for each flag
        if key in hparam_list:  # if it's in the list
            hps_dict[key] = val  # add it to the dict
    hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)

    # Create a batcher object that will create minibatches of data
    batcher = Batcher(FLAGS.data_path,
                      vocab,
                      hps,
                      single_pass=FLAGS.single_pass)

    tf.set_random_seed(111)  # a seed value for randomness

    #   return

    if hps.mode.value == 'train':
        print("creating model...")
        model = SummarizationModel(hps, vocab)

        # -------------------------------------
        if hps.create_ckpt.value:
            step = 0

            model.build_graph()
            print("get value")
            pretrained_ckpt = '/home/cs224u/pointer/log/pretrained_model_tf1.2.1/train/model-238410'
            reader = pywrap_tensorflow.NewCheckpointReader(pretrained_ckpt)
            var_to_shape_map = reader.get_variable_to_shape_map()
            value = {}
            for key in var_to_shape_map:
                value[key] = reader.get_tensor(key)

            print("assign op")
            assign_op = []
            if hps.use_multi_pvocab.value:
                new_key = [
                    "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_0/Bias",
                    "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_1/Bias"
                ]
                for v in tf.trainable_variables():
                    key = v.name.split(":")[0]
                    if key in new_key:
                        origin_key = "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/" + key.split(
                            "/")[-1]
                        a_op = v.assign(tf.convert_to_tensor(
                            value[origin_key]))
                    else:
                        a_op = v.assign(tf.convert_to_tensor(value[key]))
                    # if key == "seq2seq/embedding/embedding":
                    # a_op = v.assign(tf.convert_to_tensor(value[key]))
                    assign_op.append(a_op)
            else:
                for v in tf.trainable_variables():
                    key = v.name.split(":")[0]
                    if key == "seq2seq/embedding/embedding":
                        a_op = v.assign(tf.convert_to_tensor(value[key]))
                        assign_op.append(a_op)
            # ratio = 1
            # for v in tf.trainable_variables():
            #   key = v.name.split(":")[0]
            #   # embedding (50000, 128) -> (50000, 32)

            #   if key == "seq2seq/embedding/embedding":
            #       print (key)
            #       print (value[key].shape)
            #       d1 = value[key].shape[1]
            #       a_op = v.assign(tf.convert_to_tensor(value[key][:,:d1//ratio]))
            #   # kernel (384, 1024) -> (96, 256)
            #   # w_reduce_c (512, 256) -> (128, 64)
            #   elif key == "seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/kernel" or \
            #   key == "seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/kernel" or \
            #   key == "seq2seq/reduce_final_st/w_reduce_c" or \
            #   key == "seq2seq/reduce_final_st/w_reduce_h" or \
            #   key == "seq2seq/decoder/attention_decoder/Linear/Matrix" or \
            #   key == "seq2seq/decoder/attention_decoder/lstm_cell/kernel" or \
            #   key == "seq2seq/decoder/attention_decoder/Attention/Linear/Matrix" or \
            #   key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/Matrix":
            #       print (key)
            #       print (value[key].shape)
            #       d0, d1 = value[key].shape[0], value[key].shape[1]
            #       a_op = v.assign(tf.convert_to_tensor(value[key][:d0//ratio, :d1//ratio]))
            #   # bias (1024,) -> (256,)
            #   elif key == "seq2seq/encoder/bidirectional_rnn/fw/lstm_cell/bias" or \
            #   key == "seq2seq/encoder/bidirectional_rnn/bw/lstm_cell/bias" or \
            #   key == "seq2seq/reduce_final_st/bias_reduce_c" or \
            #   key == "seq2seq/reduce_final_st/bias_reduce_h" or \
            #   key == "seq2seq/decoder/attention_decoder/lstm_cell/bias" or \
            #   key == "seq2seq/decoder/attention_decoder/v" or \
            #   key == "seq2seq/decoder/attention_decoder/Attention/Linear/Bias" or \
            #   key == "seq2seq/decoder/attention_decoder/Linear/Bias" or \
            #   key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/Bias":
            #       print (key)
            #       print (value[key].shape)
            #       d0 = value[key].shape[0]
            #       a_op = v.assign(tf.convert_to_tensor(value[key][:d0//ratio]))
            #   # W_h (1, 1, 512, 512) -> (1, 1, 128, 128)
            #   elif key == "seq2seq/decoder/attention_decoder/W_h":
            #       print (key)
            #       print (value[key].shape)
            #       d2, d3 = value[key].shape[2], value[key].shape[3]
            #       a_op = v.assign(tf.convert_to_tensor(value[key][:,:,:d2//ratio,:d3//ratio]))
            #   # Matrix (1152, 1) -> (288, 1)
            #   elif key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear/Matrix" or \
            #   key == "seq2seq/output_projection/w":
            #       print (key)
            #       print (value[key].shape)
            #       d0 = value[key].shape[0]
            #       a_op = v.assign(tf.convert_to_tensor(value[key][:d0//ratio,:]))
            #   # Bias (1,) -> (1,)
            #   elif key == "seq2seq/output_projection/v" or \
            #   key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear/Bias":
            #       print (key)
            #       print (value[key].shape)
            #       a_op = v.assign(tf.convert_to_tensor(value[key]))

            #   # multi_attn
            #   if hps.use_multi_attn.value:
            #     if key == "seq2seq/decoder/attention_decoder/attn_0/v" or \
            #     key == "seq2seq/decoder/attention_decoder/attn_1/v":
            #     # key == "seq2seq/decoder/attention_decoder/attn_2/v":
            #       k = "seq2seq/decoder/attention_decoder/v"
            #       print (key)
            #       print (value[k].shape)
            #       d0 = value[k].shape[0]
            #       a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio]))
            #     if key == "seq2seq/decoder/attention_decoder/Attention/Linear_0/Bias" or \
            #     key == "seq2seq/decoder/attention_decoder/Attention/Linear_1/Bias":
            #     # key == "seq2seq/decoder/attention_decoder/Attention/Linear_2/Bias":
            #       k = "seq2seq/decoder/attention_decoder/Attention/Linear/Bias"
            #       print (key)
            #       print (value[k].shape)
            #       d0 = value[k].shape[0]
            #       a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio]))
            #   elif hps.use_multi_pgen.value:
            #     if key == "seq2seq/decoder/attention_decoder/Linear_0/Bias" or \
            #     key == "seq2seq/decoder/attention_decoder/Linear_1/Bias":
            #     # key == "seq2seq/decoder/attention_decoder/Linear_2/Bias":
            #       k = "seq2seq/decoder/attention_decoder/Linear/Bias"
            #       print (key)
            #       print (value[k].shape)
            #       d0 = value[k].shape[0]
            #       a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio]))
            #     if key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear_0/Bias" or \
            #     key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear_1/Bias":
            #     # key == "seq2seq/decoder/attention_decoder/calculate_pgen/Linear_2/Bias":
            #       k = "seq2seq/decoder/attention_decoder/calculate_pgen/Linear/Bias"
            #       print (key)
            #       print (value[k].shape)
            #       a_op = v.assign(tf.convert_to_tensor(value[k]))
            #   elif hps.use_multi_pvocab.value:
            #     if key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_0/Bias" or \
            #     key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_1/Bias":
            #     # key == "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear_2/Bias":
            #       k = "seq2seq/decoder/attention_decoder/AttnOutputProjection/Linear/Bias"
            #       print (key)
            #       print (value[k].shape)
            #       d0 = value[k].shape[0]
            #       a_op = v.assign(tf.convert_to_tensor(value[k][:d0//ratio]))

            #    assign_op.append(a_op)

            # Add an op to initialize the variables.
            init_op = tf.global_variables_initializer()
            # Add ops to save and restore all the variables.
            saver = tf.train.Saver()
            with tf.Session(config=util.get_config()) as sess:
                sess.run(init_op)
                # Do some work with the model.
                for a_op in assign_op:
                    a_op.op.run()

                for _ in range(0):
                    batch = batcher.next_batch()
                    results = model.run_train_step(sess, batch)

                # Save the variables to disk.
                if hps.use_multi_attn.value:
                    ckpt_tag = "multi_attn_2_attn_proj"
                elif hps.use_multi_pgen.value:
                    ckpt_tag = "multi_attn_2_pgen_proj"
                elif hps.use_multi_pvocab.value:
                    ckpt_tag = "big_multi_attn_2_pvocab_proj"
                else:
                    ckpt_tag = "pointer_proj"

                ckpt_to_save = '/home/cs224u/pointer/log/ckpt/' + ckpt_tag + '/model.ckpt-' + str(
                    step)
                save_path = saver.save(sess, ckpt_to_save)
                print("Model saved in path: %s" % save_path)

        # -------------------------------------
        else:
            setup_training(model, batcher, hps)

    elif hps.mode.value == 'eval':
        model = SummarizationModel(hps, vocab)
        run_eval(model, batcher, vocab)
    elif hps.mode.value == 'decode':
        decode_model_hps = hps  # This will be the hyperparameters for the decoder model
        decode_model_hps = hps._replace(
            max_dec_steps=1
        )  # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
        model = SummarizationModel(decode_model_hps, vocab)
        decoder = BeamSearchDecoder(model, batcher, vocab)
        decoder.decode()  # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
    else:
        raise ValueError("The 'mode' flag must be one of train/eval/decode")
Пример #26
0
    for i in range(len(predictions)):
        result.append(costs[i, predicted_indexes[i]])

    return result


batch = Batcher(n_examples, batch_size)
cost_batch = Batcher(n_examples, 10000)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(max_to_keep=50)

    for i in range(n_iterations):
        a_batch, h_batch, c_batch = batch.next_batch(
            [A_train, H_train, C_train])
        if i % display_step == 0:
            a_cost, h_cost, c_cost = cost_batch.next_batch(
                [A_train, H_train, C_train])
            c_train_pred = sess.run(cost_pred,
                                    feed_dict={
                                        seq_in: a_cost,
                                        H: h_cost,
                                        C: c_cost,
                                        keep_prob: 1.0
                                    })
            c_train_reg = sess.run(cost_reg,
                                   feed_dict={
                                       seq_in: a_cost,
                                       H: h_cost,
                                       C: c_cost,
Пример #27
0
def main(unused_argv):
  if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly
    raise Exception("Problem with flags: %s" % unused_argv)

  tf.logging.set_verbosity(tf.logging.INFO) # choose what level of logging you want
  tf.logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode))

  # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary
  FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name)
  if not os.path.exists(FLAGS.log_root):
    if FLAGS.mode=="train":
      os.makedirs(FLAGS.log_root)
    else:
      raise Exception("Logdir %s doesn't exist. Run in train mode to create it." % (FLAGS.log_root))

  
  vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary
  link_mat_size = vocab._count
  print (link_mat_size)
  
  # If in decode mode, set batch_size = beam_size
  # Reason: in decode mode, we decode one example at a time.
  # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses.
  if FLAGS.mode == 'decode':
    FLAGS.batch_size = FLAGS.beam_size
  
  # If single_pass=True, check we're in decode mode
  if FLAGS.single_pass and FLAGS.mode!='decode':
    raise Exception("The single_pass flag should only be True in decode mode")

  # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
  hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps','pointer_gen']
  hps_dict = {}
  for key, val in FLAGS.__flags.items():  # for each flag
    if key in hparam_list: # if it's in the list
      hps_dict[key] = val # add it to the dict
  hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)
  print (hps)
  
  # Create a batcher object that will create minibatches of data
  batcher = Batcher(FLAGS.data_path, vocab, hps, single_pass=FLAGS.single_pass)
  batch = batcher.next_batch()
  print (batch.art_oovs)
  tf.set_random_seed(111) # a seed value for randomness
  
  if hps.mode == 'train':
    print "creating model..."
    model = SummarizationModel(hps, vocab)
    
    setup_training(model, batcher, vocab)
    
  
  elif hps.mode == 'eval':
    model = SummarizationModel(hps, vocab)
    run_eval(model, batcher, vocab)
  elif hps.mode == 'decode':
    decode_model_hps = hps  # This will be the hyperparameters for the decoder model
    decode_model_hps = hps._replace(max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries
    model = SummarizationModel(decode_model_hps, vocab)
    
    decoder = BeamSearchDecoder(model, batcher, vocab)
    
    decoder.decode() # decode indefinitely (unless single_pass=True, in which case decode the dataset exactly once)
  else:
    raise ValueError("The 'mode' flag must be one of train/eval/decode")
Пример #28
0
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(5)

        # check the existence of log root file
        if not os.path.exists(config.log_root):
            os.mkdir(config.log_root)

        # check the existence of training model file
        self.model_dir = os.path.join(config.log_root, 'train_model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        # check the existence of training log file
        self.train_log = os.path.join(config.log_root, 'train_log')
        if not os.path.exists(self.train_log):
            os.mkdir(self.train_log)
        self.summary_writer = tf.summary.FileWriter(self.train_log)

    def save_model(self, running_avg_loss, iter, mode):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        if mode == 'train':
            save_model_dir = self.model_dir
        else:
            best_model_dir = os.path.join(config.log_root, 'best_model')
            if not os.path.exists(best_model_dir):
                os.mkdir(best_model_dir)
            save_model_dir = best_model_dir

        if len(os.listdir(save_model_dir)) > 0:
            shutil.rmtree(save_model_dir)
            time.sleep(2)
            os.mkdir(save_model_dir)
        model_save_path = os.path.join(save_model_dir, 'model_%d' % (iter))
        torch.save(state, model_save_path)
        return model_save_path

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        min_val_loss = np.inf
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % config.print_interval == 0:
                tf.logging.info(
                    'steps %d, seconds for %d batch: %.2f , loss: %f, min_val_loss: %f'
                    % (iter, config.print_interval, time.time() - start, loss,
                       min_val_loss))
                start = time.time()
            if iter % config.model_save_iters == 0:
                self.summary_writer.flush()
                model_save_path = self.save_model(running_avg_loss,
                                                  iter,
                                                  mode='train')
                tf.logging.info('Evaluate the model %s at validation set....' %
                                model_save_path)
                evl_model = Evaluate(model_save_path)
                val_avg_loss = evl_model.run_eval()
                if val_avg_loss < min_val_loss:
                    min_val_loss = val_avg_loss
                    best_model_save_path = self.save_model(running_avg_loss,
                                                           iter,
                                                           mode='eval')
                    tf.logging.info('Save best model at %s' %
                                    best_model_save_path)
Пример #29
0
    def step(self):
        rollout = []
        hyperparameters = self.hyperparameters

        env_info = self.environment.reset(train_mode=True)[self.brain_name]    
        self.states = env_info.vector_observations  
        states = self.states

        for _ in range(hyperparameters['rollout_length']):
            actions, log_probs, _, values = self.network(states)
            env_info = self.environment.step(actions.cpu().detach().numpy())[self.brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            terminals = np.array([1 if t else 0 for t in env_info.local_done])
            self.all_rewards += rewards
            
            for i, terminal in enumerate(terminals):
                if terminals[i]:
                    self.episode_rewards.append(self.all_rewards[i])
                    self.all_rewards[i] = 0
                    
            rollout.append([states, values.detach(), actions.detach(), log_probs.detach(), rewards, 1 - terminals])
            states = next_states

        self.states = states
        pending_value = self.network(states)[-1]
        rollout.append([states, pending_value, None, None, None, None])

        processed_rollout = [None] * (len(rollout) - 1)
        advantages = torch.Tensor(np.zeros((self.config['environment']['number_of_agents'], 1)))
        returns = pending_value.detach()
        for i in reversed(range(len(rollout) - 1)):
            states, value, actions, log_probs, rewards, terminals = rollout[i]
            terminals = torch.Tensor(terminals).unsqueeze(1)
            rewards = torch.Tensor(rewards).unsqueeze(1)
            actions = torch.Tensor(actions.cpu())
            states = torch.Tensor(states)
            next_value = rollout[i + 1][1]
            returns = rewards + hyperparameters['discount_rate'] * terminals * returns.cpu()
            td_error = rewards + hyperparameters['discount_rate'] * terminals * next_value.detach().cpu() - value.detach().cpu()

            advantages = advantages * hyperparameters['tau'] * hyperparameters['discount_rate'] * terminals + td_error
            processed_rollout[i] = [states, actions, log_probs, returns, advantages]

        states, actions, log_probs_old, returns, advantages = map(lambda x: torch.cat(x, dim=0), zip(*processed_rollout))
        advantages = (advantages - advantages.mean()) / advantages.std()

        batcher = Batcher(states.size(0) // hyperparameters['mini_batch_number'], [np.arange(states.size(0))])
        for _ in range(hyperparameters['optimization_epochs']):
            batcher.shuffle()
            while not batcher.end():
                batch_indices = batcher.next_batch()[0]
                batch_indices = torch.Tensor(batch_indices).long()
                sampled_states = states[batch_indices]
                sampled_actions = actions[batch_indices]
                sampled_log_probs_old = log_probs_old[batch_indices]
                sampled_returns = returns[batch_indices]
                sampled_advantages = advantages[batch_indices]

                _, log_probs, entropy_loss, values = self.network(sampled_states, sampled_actions)
                ratio = (log_probs - sampled_log_probs_old).exp()
                obj = ratio * sampled_advantages
                obj_clipped = ratio.clamp(1.0 - hyperparameters['ppo_clip'],
                                          1.0 + hyperparameters['ppo_clip']) * sampled_advantages
                policy_loss = -torch.min(obj, obj_clipped).mean(0) - hyperparameters['entropy_coefficent'] * entropy_loss.mean()
                value_loss = 0.5 * (sampled_returns - values.cpu()).pow(2).mean()

                self.optimizier.zero_grad()
                (policy_loss + value_loss).backward()
                nn.utils.clip_grad_norm_(self.network.parameters(), hyperparameters['gradient_clip'])
                self.optimizier.step()

        steps = hyperparameters['rollout_length'] * self.config['environment']['number_of_agents']
        self.total_steps += steps
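A minimal sketch isolating the clipped surrogate objective used in the optimization loop above, assuming log-probability and advantage tensors of matching shape; the function name and signature are illustrative and not part of the agent class:

import torch

def ppo_clipped_loss(log_probs, log_probs_old, advantages, clip_eps):
    # Clipped PPO policy loss: -mean(min(r * A, clip(r, 1 - eps, 1 + eps) * A)),
    # where r is the probability ratio between the new and the old policy.
    ratio = (log_probs - log_probs_old).exp()
    unclipped = ratio * advantages
    clipped = ratio.clamp(1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return -torch.min(unclipped, clipped).mean()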
Example #30
0
class BeamSearch(object):
    def __init__(self, model_file_path):
        model_name = os.path.basename(model_file_path)
        self._decode_dir = os.path.join(config.log_root,
                                        'decode_%s' % (model_name))
        self._rouge_ref_dir = os.path.join(self._decode_dir, 'rouge_ref')
        self._rouge_dec_dir = os.path.join(self._decode_dir, 'rouge_dec_dir')
        for p in [self._decode_dir, self._rouge_ref_dir, self._rouge_dec_dir]:
            if not os.path.exists(p):
                os.mkdir(p)

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.decode_data_path,
                               self.vocab,
                               mode='decode',
                               batch_size=config.beam_size,
                               single_pass=True)
        time.sleep(15)

        self.model = Model(model_file_path, is_eval=True)

    def sort_beams(self, beams):
        return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True)

    def decode(self):
        start = time.time()
        counter = 0
        batch = self.batcher.next_batch()
        while batch is not None:
            # Run beam search to get best Hypothesis
            best_summary = self.beam_search(batch)

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = data.outputids2words(
                output_ids, self.vocab,
                (batch.art_oovs[0] if config.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                pass  # no [STOP] token in the output; keep the full decoded sequence

            original_abstract = batch.original_abstracts_sents[0]

            write_for_rouge(original_abstract, decoded_words, counter,
                            self._rouge_ref_dir, self._rouge_dec_dir)
            counter += 1
            if counter % 1000 == 0:
                print('%d examples decoded in %.1f sec' % (counter, time.time() - start))
                start = time.time()

            batch = self.batcher.next_batch()

        print("Decoder has finished reading dataset for single_pass.")
        print("Now starting ROUGE eval...")
        results_dict = rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir)
        rouge_log(results_dict, self._decode_dir)

    def beam_search(self, batch):
        # batch should have only one example
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = \
            get_input_from_batch(batch, use_cuda)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_0 = self.model.reduce_state(encoder_hidden)

        dec_h, dec_c = s_t_0  # 1 x 2*hidden_size
        dec_h = dec_h.squeeze()
        dec_c = dec_c.squeeze()

        # decoder batch preparation: it holds beam_size examples, which are initially all identical
        beams = [
            Beam(tokens=[self.vocab.word2id(data.START_DECODING)],
                 log_probs=[0.0],
                 state=(dec_h[0], dec_c[0]),
                 context=c_t_0[0],
                 coverage=(coverage_t_0[0] if config.is_coverage else None))
            for _ in range(config.beam_size)
        ]
        results = []
        steps = 0
        while steps < config.max_dec_steps and len(results) < config.beam_size:
            latest_tokens = [h.latest_token for h in beams]
            latest_tokens = [t if t < self.vocab.size() else self.vocab.word2id(data.UNKNOWN_TOKEN) \
                             for t in latest_tokens]
            y_t_1 = Variable(torch.LongTensor(latest_tokens))
            if use_cuda:
                y_t_1 = y_t_1.cuda()
            all_state_h = []
            all_state_c = []

            all_context = []

            for h in beams:
                state_h, state_c = h.state
                all_state_h.append(state_h)
                all_state_c.append(state_c)

                all_context.append(h.context)

            s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0),
                     torch.stack(all_state_c, 0).unsqueeze(0))
            c_t_1 = torch.stack(all_context, 0)

            coverage_t_1 = None
            if config.is_coverage:
                all_coverage = []
                for h in beams:
                    all_coverage.append(h.coverage)
                coverage_t_1 = torch.stack(all_coverage, 0)

            final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage_t_1, steps)

            topk_log_probs, topk_ids = torch.topk(final_dist,
                                                  config.beam_size * 2)

            dec_h, dec_c = s_t
            dec_h = dec_h.squeeze()
            dec_c = dec_c.squeeze()

            all_beams = []
            num_orig_beams = 1 if steps == 0 else len(beams)
            for i in range(num_orig_beams):
                h = beams[i]
                state_i = (dec_h[i], dec_c[i])
                context_i = c_t[i]
                coverage_i = (coverage_t[i] if config.is_coverage else None)

                for j in range(config.beam_size * 2):  # for each of the top 2*beam_size hyps
                    new_beam = h.extend(token=topk_ids[i, j].item(),
                                        log_prob=topk_log_probs[i, j].item(),
                                        state=state_i,
                                        context=context_i,
                                        coverage=coverage_i)
                    all_beams.append(new_beam)

            beams = []
            for h in self.sort_beams(all_beams):
                if h.latest_token == self.vocab.word2id(data.STOP_DECODING):
                    if steps >= config.min_dec_steps:
                        results.append(h)
                else:
                    beams.append(h)
                if len(beams) == config.beam_size or len(results) == config.beam_size:
                    break

            steps += 1

        if len(results) == 0:
            results = beams

        beams_sorted = self.sort_beams(results)

        return beams_sorted[0]
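A minimal sketch of how the class above is typically driven, assuming BeamSearch is defined in the same module and the checkpoint path is passed on the command line:

import sys

if __name__ == '__main__':
    # Decode the test split with beam search, then run ROUGE on the output.
    beam_processor = BeamSearch(sys.argv[1])
    beam_processor.decode()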
Example #31
0
def main():
    embedding_dict_file = os.path.join(os.path.dirname(hps.word_count_path),
                                       'emb_dict_50000.pkl')
    vocab = Vocab(hps.word_count_path, hps.glove_path, hps.embedding_dim,
                  hps.max_vocab_size, embedding_dict_file)
    train_file = os.path.join(hps.data_path, 'train_raw.json')
    dev_file = os.path.join(hps.data_path, 'dev_raw.json')

    if (not os.path.exists(train_file)) \
    or (not os.path.exists(dev_file)):
        raise Exception(
            'train and dev data do not exist in data_path, please check')

    if hps.save and not hps.exp_dir:
        raise Exception(
            'please specify exp_dir when you want to save experiment info')

    print(vars(hps))
    if hps.save:
        utils.save_hps(hps.exp_dir, hps)

    net = PointerNet(hps, vocab.emb_mat)
    net = net.cuda()

    model_parameters = list(filter(lambda p: p.requires_grad,
                                   net.parameters()))
    print('the number of parameters in model:',
          sum(p.numel() for p in model_parameters))
    optimizer = optim.Adam(model_parameters)

    train_data_batcher = Batcher(train_file, vocab, hps, hps.single_pass)
    dev_data_batcher = Batcher(dev_file, vocab, hps, hps.single_pass)

    if hps.reward_metric == 'bleu':
        reward = get_batch_bleu
    elif hps.self_critic:
        raise ValueError('unsupported reward_metric for self-critic training: %s' %
                         hps.reward_metric)

    global_step = 0
    dev_loss_track = []
    min_dev_loss = math.inf
    for i in range(hps.num_epoch):
        epoch_loss_track = []
        train_data_batcher.setup()
        while True:
            start = time.time()
            try:
                batch = train_data_batcher.next_batch()
                #print('get next batch time:', time.time()-start)
            except StopIteration:
                # do evaluation here, if necessary, to save best model
                dev_data_batcher.setup()
                dev_loss = run_eval(dev_data_batcher, net)
                print(
                    "epoch {}: avg train loss: {:>10.4f}, dev_loss: {:>10.4f}".
                    format(i + 1,
                           sum(epoch_loss_track) / len(epoch_loss_track),
                           dev_loss))
                dev_loss_track.append(dev_loss)

                if i > hps.early_stopping_from:
                    # compare the summed dev loss of the last 5 epochs to that of the 5 before them
                    last5devloss = sum(dev_loss_track[i - 4:i + 1])
                    prev5devloss = sum(dev_loss_track[i - 9:i - 4])
                    if hps.early_stopping_from and last5devloss >= prev5devloss:
                        print("early stopping by dev_loss!")
                        sys.exit()

                if dev_loss < min_dev_loss:
                    min_dev_loss = dev_loss
                    if hps.save:
                        utils.save_model(hps.exp_dir, net, min_dev_loss)
                break

            paragraph_tensor = torch.tensor(batch.enc_batch,
                                            dtype=torch.int64,
                                            requires_grad=False).cuda()
            question_tensor = torch.tensor(batch.dec_batch,
                                           dtype=torch.int64,
                                           requires_grad=False).cuda()
            answer_position_tensor = torch.tensor(batch.ans_indices,
                                                  dtype=torch.int64,
                                                  requires_grad=False).cuda()
            target_tensor = torch.tensor(batch.target_batch,
                                         dtype=torch.int64,
                                         requires_grad=False).cuda()

            paragraph_batch_extend_vocab = None
            max_para_oovs = None
            if hps.pointer_gen:
                paragraph_batch_extend_vocab = torch.tensor(
                    batch.enc_batch_extend_vocab,
                    dtype=torch.int64,
                    requires_grad=False).cuda()
                max_para_oovs = batch.max_para_oovs

            optimizer.zero_grad()
            net.train()

            vocab_scores, vocab_dists, attn_dists, final_dists = net(
                paragraph_tensor, question_tensor, answer_position_tensor,
                paragraph_batch_extend_vocab, max_para_oovs)

            dec_padding_mask = torch.ne(target_tensor, 0).float().cuda()
            # for self-critic
            if hps.self_critic:
                greedy_seq = [
                    torch.argmax(dist, dim=1, keepdim=True)
                    for dist in final_dists
                ]  # each dist = [batch_size, vsize]
                greedy_seq_tensor = torch.cat(greedy_seq,
                                              dim=1)  # [batch_size, seq_len]

                sample_seq = []
                for dist in final_dists:
                    m = torch.distributions.categorical.Categorical(probs=dist)
                    sample_seq.append(m.sample())  # each is [batch_size,]
                sample_seq_tensor = torch.stack(sample_seq, dim=1)

                if hps.pointer_gen:
                    loss_per_step = []
                    for dist, sample_tgt in zip(final_dists, sample_seq):
                        # dist = [batch_size, extended_vsize]
                        probs = torch.gather(
                            dist, 1, sample_tgt.unsqueeze(1)).squeeze()
                        losses = -torch.log(probs)
                        loss_per_step.append(losses)  # a list of [batch_size,]
                    rl_loss = mask_and_avg(loss_per_step,
                                           dec_padding_mask,
                                           batch_average=False,
                                           step_average=False)
                    # this rl_loss = [batch_size, ]
                else:
                    # a list of dec_max_len (vocab_scores)
                    loss_batch_by_step = F.cross_entropy(
                        torch.stack(vocab_scores,
                                    dim=1).reshape(-1, vocab.size()),
                        sample_seq_tensor.reshape(-1),
                        reduction='none')  # keep per-token losses
                    # loss [batch_size*dec_max_len,]
                    mask_loss_batch_by_step = loss_batch_by_step * dec_padding_mask.reshape(
                        -1)
                    batch_size = vocab_scores[0].size(0)
                    rl_loss = torch.sum(mask_loss_batch_by_step.reshape(
                        batch_size, -1),
                                        dim=1)

                r1 = reward(target_tensor, greedy_seq_tensor)
                r2 = reward(target_tensor, sample_seq_tensor)
                reward_diff = r1 - r2
                final_rl_loss = reward_diff * rl_loss
                loss = torch.mean(final_rl_loss)
                print(
                    'max r1: %.3f, max r2: %.3f, max reward_diff: %.3f, max rl loss: %.3f, batch mean loss: %.3f'
                    % (torch.max(r1).item(), torch.max(r2).item(),
                       torch.max(reward_diff).item(),
                       torch.max(final_rl_loss).item(), loss.item()))

            # for maximum likelihood
            if hps.maxium_likelihood:
                if hps.pointer_gen:
                    loss_per_step = []
                    for dec_step, dist in enumerate(final_dists):
                        # dist = [batch_size, extended_vsize]
                        targets = target_tensor[:, dec_step]
                        gold_probs = torch.gather(
                            dist, 1, targets.unsqueeze(1)).squeeze()
                        losses = -torch.log(gold_probs)
                        loss_per_step.append(losses)  # a list of [batch_size,]
                    loss = mask_and_avg(loss_per_step, dec_padding_mask)
                else:
                    # a list of dec_max_len (vocab_scores)
                    loss_batch_by_step = F.cross_entropy(
                        torch.stack(vocab_scores,
                                    dim=1).reshape(-1, vocab.size()),
                        target_tensor.reshape(-1),
                        reduction='none')  # keep per-token losses
                    # loss [batch_size*dec_max_len,]
                    loss = torch.sum(loss_batch_by_step *
                                     dec_padding_mask.reshape(-1)) / torch.sum(
                                         dec_padding_mask)

            epoch_loss_track.append(loss.item())
            global_step += 1

            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), max_norm=hps.norm_limit)
            optimizer.step()
            #print('time one step:', time.time()-start)
            if (global_step == 1) or (global_step % hps.print_every == 0):
                print('Step {:>5}: ave loss: {:>10.4f}, speed: {:.1f} case/s'.
                      format(global_step,
                             sum(epoch_loss_track) / len(epoch_loss_track),
                             hps.batch_size / (time.time() - start)))
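A minimal sketch of the masked negative log-likelihood computed in the maximum-likelihood branch above, assuming a list of per-step extended-vocabulary distributions, the gold target tensor, and a 0/1 padding mask; the function name is illustrative and it mirrors the gather/log/mask pattern rather than the project's mask_and_avg helper:

import torch

def masked_nll(final_dists, target_tensor, dec_padding_mask):
    # final_dists: list of [batch, extended_vsize] distributions, one per decoder step
    # target_tensor: [batch, dec_len] gold token ids
    # dec_padding_mask: [batch, dec_len], 1 for real tokens, 0 for padding
    step_losses = []
    for dec_step, dist in enumerate(final_dists):
        gold_probs = torch.gather(dist, 1,
                                  target_tensor[:, dec_step].unsqueeze(1)).squeeze(1)
        step_losses.append(-torch.log(gold_probs))
    losses = torch.stack(step_losses, dim=1) * dec_padding_mask
    return losses.sum() / dec_padding_mask.sum()  # average over real tokens only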