Example #1
def create_load_vocab(arg,
                      file_name,
                      out_file_name,
                      pad=True,
                      unk=True,
                      sos_eos=False):
    """Creates and loads the vocab file for a given corpus.

    Args:
    arg: The output of the parser.
    file_name: The name of the file containing the corpus.
    out_file_name: The file into which the vocab should be written.
    pad: A boolean to indicate if the pad token should be included
        in the vocabulary.
    unk: A boolean to indicate if the unknown token should be included
        in the vocabulary.
    sos_eos: A boolean to indicate if the SOS and EOS tokens should be included
        in the vocabulary.

    Returns:
    A dictionary mapping each vocabulary item to its corresponding index. It
    also includes a list of all the vocabulary items.
    """

    full_path = os.path.join('./top_data', arg.train_data_path, file_name)
    output_path = os.path.join(arg.vocab_path, out_file_name)

    create_vocabulary(full_path, output_path, pad, unk, sos_eos)
    vocab = load_vocabulary(output_path)

    return vocab
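
A minimal call to this helper might look like the sketch below. The argparse flags, directory layout, and file names are hypothetical assumptions used only to show how arg.train_data_path and arg.vocab_path are consumed; they are not taken from the original project.

# Hypothetical usage sketch; flag names, paths and file names are illustrative.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train_data_path', default='train')
parser.add_argument('--vocab_path', default='./vocab')
args = parser.parse_args([])

# Builds ./top_data/train/train.tsv -> ./vocab/vocab.txt and loads the result.
vocab = create_load_vocab(args, 'train.tsv', 'vocab.txt', sos_eos=True)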
Example #2
  def __init__(self, train_file=None, valid_file=None, test_file=None):
    """Create G2P model and initialize or load parameters in session."""
    self.test_file = test_file

    # Preliminary actions before model creation.
    if FLAGS.train:
      # Save model parameters.
      num_layers, size = data_utils.save_params(FLAGS.num_layers, FLAGS.size,
                                                FLAGS.model)
      batch_size = FLAGS.batch_size
      # Prepare G2P data.
      print("Preparing G2P data")
      train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, self.gr_vocab,\
      self.ph_vocab = data_utils.prepare_g2p_data(FLAGS.model, train_file,
                                                  valid_file)
      # Read data into buckets and compute their sizes.
      print ("Reading development and training data.")
      self.valid_set = self.__put_into_buckets(valid_gr_ids, valid_ph_ids)
      self.train_set = self.__put_into_buckets(train_gr_ids, train_ph_ids)
    else:
      # Load model parameters.
      num_layers, size = data_utils.load_params(FLAGS.num_layers, FLAGS.size,
                                                FLAGS.model)
      batch_size = 1 # We decode one word at a time.
      # Load vocabularies
      self.gr_vocab = data_utils.load_vocabulary(os.path.join(FLAGS.model,
                                                              "vocab.grapheme"))
      self.ph_vocab = data_utils.load_vocabulary(os.path.join(FLAGS.model,
                                                              "vocab.phoneme"))

    self.rev_ph_vocab =\
        data_utils.load_vocabulary(os.path.join(FLAGS.model, "vocab.phoneme"),
                                   reverse=True)

    self.session = tf.Session()

    # Create model.
    print("Creating %d layers of %d units." % (num_layers, size))
    self.model = seq2seq_model.Seq2SeqModel(len(self.gr_vocab),
                                            len(self.ph_vocab), self._BUCKETS,
                                            size, num_layers,
                                            FLAGS.max_gradient_norm, batch_size,
                                            FLAGS.learning_rate,
                                            FLAGS.learning_rate_decay_factor,
                                            forward_only=not FLAGS.train)
    self.model.saver = tf.train.Saver(tf.all_variables(), max_to_keep=1)
    self.__create_model()
Example #3
  def __init__(self, train_dic=None, valid_dic=None, test_dic=None):
    """Create G2P model and initialize or load parameters in session."""
    self.test_dic = test_dic

    # Preliminary actions before model creation.
    if FLAGS.train:
      # Save model parameters.
      num_layers, size = data_utils.save_params(FLAGS.num_layers, FLAGS.size,
                                                FLAGS.model)
      batch_size = FLAGS.batch_size
      # Prepare G2P data.
      print("Preparing G2P data")
      train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, self.gr_vocab,\
      self.ph_vocab = data_utils.prepare_g2p_data(FLAGS.model, train_dic,
                                                  valid_dic)
      # Read data into buckets and compute their sizes.
      print ("Reading development and training data.")
      self.valid_set = self.__put_into_buckets(valid_gr_ids, valid_ph_ids)
      self.train_set = self.__put_into_buckets(train_gr_ids, train_ph_ids)
    else:
      # Load model parameters.
      num_layers, size = data_utils.load_params(FLAGS.num_layers, FLAGS.size,
                                                FLAGS.model)
      batch_size = 1 # We decode one word at a time.
      # Load vocabularies
      self.gr_vocab = data_utils.load_vocabulary(os.path.join(FLAGS.model,
                                                              "vocab.grapheme"))
      self.ph_vocab = data_utils.load_vocabulary(os.path.join(FLAGS.model,
                                                              "vocab.phoneme"))

    self.rev_ph_vocab =\
        data_utils.load_vocabulary(os.path.join(FLAGS.model, "vocab.phoneme"),
                                   reverse=True)

    self.session = tf.Session()

    # Create model.
    print("Creating %d layers of %d units." % (num_layers, size))
    self.model = seq2seq_model.Seq2SeqModel(len(self.gr_vocab),
                                            len(self.ph_vocab), self._BUCKETS,
                                            size, num_layers,
                                            FLAGS.max_gradient_norm, batch_size,
                                            FLAGS.learning_rate,
                                            FLAGS.learning_rate_decay_factor,
                                            forward_only=not FLAGS.train)

    self.__create_model()
Example #4
def load_vocabs_load_model(sess):
  """Load vocabularies and saved model.

  Returns:
    gr_vocab: Grapheme vocabulary;
    rev_ph_vocab: Reversed phoneme vocabulary;
    model: Trained model.
  """
  # Load vocabularies
  gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
  ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
  gr_vocab = data_utils.load_vocabulary(gr_vocab_path, False)
  rev_ph_vocab = data_utils.load_vocabulary(ph_vocab_path, True)

  # Get vocabulary sizes
  gr_vocab_size = len(gr_vocab)
  ph_vocab_size = len(rev_ph_vocab)
  # Load model
  model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
  model.batch_size = 1  # We decode one word at a time.
  return (gr_vocab, rev_ph_vocab, model)
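
Most of the snippets here rely on data_utils.load_vocabulary returning a symbol-to-id mapping, or an id-to-symbol mapping when the second (reverse) argument is true. The helper itself is not shown in any example, so the following is only a minimal sketch of that assumed contract, with one symbol per line in the vocab file; the project's actual implementation may differ.

def load_vocabulary(vocab_path, reverse=False):
    """Sketch of the assumed contract: line number in the file = symbol id."""
    with open(vocab_path) as vocab_file:
        symbols = [line.strip() for line in vocab_file]
    if reverse:
        # id -> symbol, used to turn decoder output ids back into phonemes.
        return symbols
    # symbol -> id, used to map input graphemes to their ids.
    return dict((sym, idx) for idx, sym in enumerate(symbols))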
Example #5
def load_vocabs_load_model(sess):
    """Load vocabularies and saved model.

    Returns:
      gr_vocab: Grapheme vocabulary;
      rev_ph_vocab: Reversed phoneme vocabulary;
      model: Trained model.
    """
    # Load vocabularies
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab = data_utils.load_vocabulary(gr_vocab_path, False)
    rev_ph_vocab = data_utils.load_vocabulary(ph_vocab_path, True)

    # Get vocabulary sizes
    gr_vocab_size = len(gr_vocab)
    ph_vocab_size = len(rev_ph_vocab)
    # Load model
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one word at a time.
    return (gr_vocab, rev_ph_vocab, model)
Example #6
    def load_decode_model(self):
        """Load G2P model and initialize or load parameters in session."""
        if not os.path.exists(os.path.join(self.model_dir, 'checkpoint')):
            raise RuntimeError("Model not found in %s" % self.model_dir)

        self.batch_size = 1  # We decode one word at a time.
        # Load model parameters.
        num_layers, size = data_utils.load_params(self.model_dir)
        # Load vocabularies
        print("Loading vocabularies from %s" % self.model_dir)
        self.gr_vocab = data_utils.load_vocabulary(
            os.path.join(self.model_dir, "vocab.grapheme"))
        self.ph_vocab = data_utils.load_vocabulary(
            os.path.join(self.model_dir, "vocab.phoneme"))

        self.rev_ph_vocab =\
          data_utils.load_vocabulary(os.path.join(self.model_dir, "vocab.phoneme"),
                                     reverse=True)

        self.session = tf.Session()

        # Restore model.
        print("Creating %d layers of %d units." % (num_layers, size))
        self.model = seq2seq_model.Seq2SeqModel(len(self.gr_vocab),
                                                len(self.ph_vocab),
                                                self._BUCKETS,
                                                size,
                                                num_layers,
                                                0,
                                                self.batch_size,
                                                0,
                                                0,
                                                forward_only=True)
        self.model.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        # Check for saved models and restore them.
        print("Reading model parameters from %s" % self.model_dir)
        self.model.saver.restore(self.session,
                                 os.path.join(self.model_dir, "model"))
Example #7
def train():
    embedding_mat = data_utils.load_embedding_mat(FLAGS.save_embedding_file)
    vocab = data_utils.load_vocabulary(FLAGS.vocabulary_file)
    model = cnn_model.CNN_MODEL(embed_dim=FLAGS.embedding_dim,
                                filter_sizes=FLAGS.filter_sizes,
                                max_sent_len=FLAGS.max_sentence_len,
                                embedding_mat=embedding_mat,
                                word_nums=len(vocab),
                                filter_nums=FLAGS.filter_nums,
                                label_nums=FLAGS.label_nums,
                                learning_rate=FLAGS.learning_rate,
                                model_path=FLAGS.model_path,
                                epoch=FLAGS.num_epochs,
                                batch_size=FLAGS.batch_size,
                                dropout_prob=FLAGS.dropout_keep_prob)

    train_data = data_utils.generate_data('./data/train_data.ids',
                                          FLAGS.max_sentence_len, vocab)
    valid_data = data_utils.generate_data('./data/valid_data.ids',
                                          FLAGS.max_sentence_len, vocab)
    print('train data size is {}, valid data size is {}.'.format(
        len(train_data[0]), len(valid_data[0])))

    model.train(train_data, valid_data)
Example #8
def train(train_dic, valid_dic, test_dic):
  """Train a gr->ph translation model using G2P data."""
  if not os.path.exists(FLAGS.model):
    os.makedirs(FLAGS.model)
  # Save model's architecture
  params_path = os.path.join(FLAGS.model, "model.params")
  with open(params_path, 'w') as param_file:
    param_file.write("num_layers:" + str(FLAGS.num_layers) + "\n")
    param_file.write("size:" + str(FLAGS.size))
  # Prepare G2P data.
  print("Preparing G2P data")
  train_gr, train_ph = data_utils.split_to_grapheme_phoneme(train_dic)
  valid_gr, valid_ph = data_utils.split_to_grapheme_phoneme(valid_dic)
  train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, gr_vocab, ph_vocab =\
    data_utils.prepare_g2p_data(FLAGS.model, train_gr, train_ph,
                                valid_gr, valid_ph)
  gr_vocab_size = len(gr_vocab)
  ph_vocab_size = len(ph_vocab)
  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False, gr_vocab_size, ph_vocab_size)

    # Read data into buckets and compute their sizes.
    print ("Reading development and training data.")
    valid_set = put_into_buckets(valid_gr_ids, valid_ph_ids)
    train_set = put_into_buckets(train_gr_ids, train_ph_ids)

    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_BUCKETS))]
    train_total_size = float(sum(train_bucket_sizes))
    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size of the i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while (FLAGS.max_steps == 0
           or model.global_step.eval() <= FLAGS.max_steps):
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        print ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        # Stop early if the loss has not improved over the last 34 checkpoints.
        if len(previous_losses) > 34 and \
           previous_losses[-35] <= min(previous_losses[-35:]):
          break
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.model, "translate.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
        # Run evals on development set and print their perplexity.
        for bucket_id in xrange(len(_BUCKETS)):
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              valid_set, bucket_id)
          _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
          eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
          print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
    print('Training process stopped.')

    print('Beginning word error rate (WER) calculation on the test sample.')
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    rev_ph_vocab = data_utils.load_vocabulary(ph_vocab_path, True)
    model.forward_only = True
    model.batch_size = 1  # We decode one word at a time.
    evaluate(test_dic, sess, model, gr_vocab, rev_ph_vocab, ph_vocab)
Example #9
def build_vocabularies():
    vocabulary = du.load_vocabulary()
    rev_vocabulary = du.build_reverse_vocabulary(vocabulary)
    return (vocabulary, rev_vocabulary)
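
This example builds the reverse mapping in a separate step instead of passing a reverse flag. Assuming du.load_vocabulary returns a token-to-id dict, build_reverse_vocabulary is presumably just its inversion; a rough sketch:

def build_reverse_vocabulary(vocabulary):
    # Assumed shape: invert a token -> id dict into an id -> token dict.
    return {idx: token for token, idx in vocabulary.items()}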
Example #10
def train(train_dic, valid_dic, test_dic):
    """Train a gr->ph translation model using G2P data."""
    if not os.path.exists(FLAGS.model):
        os.makedirs(FLAGS.model)
    # Save model's architecture
    params_path = os.path.join(FLAGS.model, "model.params")
    with open(params_path, 'w') as param_file:
        param_file.write("num_layers:" + str(FLAGS.num_layers) + "\n")
        param_file.write("size:" + str(FLAGS.size))
    # Prepare G2P data.
    print("Preparing G2P data")
    train_gr, train_ph = data_utils.split_to_grapheme_phoneme(train_dic)
    valid_gr, valid_ph = data_utils.split_to_grapheme_phoneme(valid_dic)
    train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, gr_vocab, ph_vocab =\
      data_utils.prepare_g2p_data(FLAGS.model, train_gr, train_ph,
                                  valid_gr, valid_ph)
    gr_vocab_size = len(gr_vocab)
    ph_vocab_size = len(ph_vocab)
    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, False, gr_vocab_size, ph_vocab_size)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data.")
        valid_set = put_into_buckets(valid_gr_ids, valid_ph_ids)
        train_set = put_into_buckets(train_gr_ids, train_ph_ids)

        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_BUCKETS))]
        train_total_size = float(sum(train_bucket_sizes))
        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while (FLAGS.max_steps == 0
               or model.global_step.eval() <= FLAGS.max_steps):
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in xrange(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" %
                    (model.global_step.eval(), model.learning_rate.eval(),
                     step_time, perplexity))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                # Stop early if the loss has not improved over the last 34
                # checkpoints.
                if len(previous_losses) > 34 and \
                   previous_losses[-35] <= min(previous_losses[-35:]):
                    break
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.model, "translate.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                for bucket_id in xrange(len(_BUCKETS)):
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        valid_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights, bucket_id,
                                                 True)
                    eval_ppx = math.exp(
                        eval_loss) if eval_loss < 300 else float('inf')
                    print("  eval: bucket %d perplexity %.2f" %
                          (bucket_id, eval_ppx))
        print('Training process stopped.')

        print('Beginning word error rate (WER) calculation on the test sample.')
        ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
        rev_ph_vocab = data_utils.load_vocabulary(ph_vocab_path, True)
        model.forward_only = True
        model.batch_size = 1  # We decode one word at a time.
        evaluate(test_dic, sess, model, gr_vocab, rev_ph_vocab, ph_vocab)