Example #1
def interactive():
  with tf.Session() as sess:
    # Create model and load parameters.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
    _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    w = sys.stdin.readline()
    word = " ".join(list(w))
    while word:
      gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab]
      if not gr_absent:
        res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
        print(res_phoneme_seq)
      else:
        print("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) )
      print("> ", end="")
      sys.stdout.flush()
      w = sys.stdin.readline()
      word = " ".join(list(w))
Example #2
    def load_data(self, debug=False):
        """Loads train/valid/test data and sentence encoding"""

        en_train, fr_train, en_dev, fr_dev, en_vocab_path, fr_vocab_path = data_utils.prepare_data(
            'tmp', 40000, 40000)

        self.source_vocab_to_id, self.source_id_to_vocab = data_utils.initialize_vocabulary(
            en_vocab_path)
        self.target_vocab_to_id, self.target_id_to_vocab = data_utils.initialize_vocabulary(
            fr_vocab_path)

        if self.config.train_mode:
            source_path = './tmp/train.ids40000.questions'
            target_path = './tmp/train.ids40000.answers'
            sources, targets = data_utils.read_data(source_path, target_path)
        else:
            source_path = './tmp/test.ids40000.questions'
            target_path = './tmp/test.ids40000.answers'
            sources, targets = data_utils.read_data(source_path, target_path)

        self.train, self.valid, self.max_t_len, self.max_input_len, self.max_sen_len = data_utils.pad_length_bucket(
            sources, targets, self.config)

        source_vocab_path = './tmp/vocab40000.questions'
        target_vocab_path = './tmp/vocab40000.answers'
        self.source_vocab_size = data_utils.get_vocab_size(source_vocab_path)
        self.target_vocab_size = data_utils.get_vocab_size(target_vocab_path)

        self.word_embedding = np.random.uniform(
            -self.config.embedding_init, self.config.embedding_init,
            (self.source_vocab_size, self.config.embed_size))
Example #3
    def load_data(self, debug=False):
        """Loads train/valid/test data and sentence encoding"""
        '''
        en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
        '''

        en_train, fr_train, en_dev, fr_dev, en_vocab_path, fr_vocab_path = data_utils.prepare_data(
            'tmp', 40000, 40000)

        self.source_vocab_to_id, self.source_id_to_vocab = data_utils.initialize_vocabulary(
            en_vocab_path)
        self.target_vocab_to_id, self.target_id_to_vocab = data_utils.initialize_vocabulary(
            fr_vocab_path)

        if self.config.train_mode:
            source_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/train.ids40000.questions'
            target_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/train.ids40000.answers'
            sources, targets = data_utils.read_data(source_path, target_path)
        else:
            source_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/test.ids40000.questions'
            target_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/test.ids40000.answers'
            sources, targets = data_utils.read_data(source_path, target_path)

        self.train, self.valid, self.max_t_len, self.max_input_len, self.max_sen_len = data_utils.pad_length_bucket(
            sources, targets, self.config)

        source_vocab_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/vocab40000.questions'
        target_vocab_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/vocab40000.answers'
        self.source_vocab_size = data_utils.get_vocab_size(source_vocab_path)
        self.target_vocab_size = data_utils.get_vocab_size(target_vocab_path)

        self.word_embedding = np.random.uniform(
            -self.config.embedding_init, self.config.embedding_init,
            (self.source_vocab_size, self.config.embed_size))
Example #4
def load_model(sess, gr_vocab_path, ph_vocab_path):
  """Load saved model.

  Args:
    sess: current session;
    gr_vocab_path: Path to the graphemes vocabulary;
    ph_vocab_path: Path to the phonemes vocabulary.

  Returns:
    model: Trained model.
  """
  # Get vocabulary sizes
  gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
  ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
  # Load model
  model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
  model.batch_size = 1  # We decode one word at a time.
  return model
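
# A minimal usage sketch for load_model, assuming the same FLAGS.model layout
# and vocabulary file names as in the neighbouring examples (an assumption,
# not part of this snippet):
with tf.Session() as sess:
  gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
  ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
  model = load_model(sess, gr_vocab_path, ph_vocab_path)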
Example #5
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
    _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

    # Decode from input file.
    graphemes = open(FLAGS.decode).readlines()

    output_file_path = FLAGS.output

    if output_file_path:
      with gfile.GFile(output_file_path, mode="w") as output_file:
        for w in graphemes:
          word = " ".join(list(w))
          gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab]
          if not gr_absent:
            res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
            output_file.write(w.replace('\n',' '))
            output_file.write(res_phoneme_seq)
            output_file.write('\n')
          else:
            raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) )
    else:
      for w in graphemes:
        word = " ".join(list(w))
        gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab]
        if not gr_absent:
          res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
          print(w.replace('\n',' ') + res_phoneme_seq)
          sys.stdout.flush()
        else:
          raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) )
Example #6
# External libraries
import torch
import pandas as pd

# Internal libraries
import lstm_sentiment as lstm
import data_utils as du
import string_processing as sp

# Values necessary to load the network.
vocab = pd.read_csv("vocabulary.txt",
                    names=['ind', 'word'],
                    encoding='iso-8859-1')
vocab = pd.Series(vocab['ind'].values, index=vocab['word']).to_dict()
vocab_size = du.get_vocab_size("vocabulary.txt")

# Load the network.
network = lstm.LSTMSentiment(vocab_size)
network.load_state_dict(torch.load('model'))
network.eval()

# Get user input.
user_sentence = input("Enter a review: ")

# Process user input and convert it to tokens.
user_sentence = sp.normalize(user_sentence)
user_sentence = sp.tokenize(user_sentence)
user_sentence = sp.get_numbers(user_sentence, vocab)
user_sentence = sp.padding(user_sentence, 30)

# Predict and output results.
Example #7
def evaluate():
  with tf.Session() as sess:
    # Create model and load parameters.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one word at a time.

    # Load vocabularies.
    gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
    _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

    # Decode from input file.
    test = open(FLAGS.evaluate).read().split('\n')
    test_graphemes = []
    test_phonemes = []

    for line in test:
      lst = line.split()
      if len(lst)>=2:
        test_graphemes.append(lst[0])
        test_phonemes.append(" ".join(lst[1:]))

    duplicates = {}
    total_dupl_num = 0
    for i, gr in enumerate(test_graphemes):
      if test_graphemes.count(gr) > 1:
        total_dupl_num += test_graphemes.count(gr) - 1
        if gr in duplicates:
          duplicates[gr].append(test_phonemes[i])
        else:
          duplicates[gr] = [test_phonemes[i]]

    errors = 0
    counter = 0
    dupl_error_calculated = []
    for i, w in enumerate(test_graphemes):
      if w not in duplicates:
        counter += 1
        word = " ".join(list(w))
        gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab]
        if not gr_absent:
          model_assumption = decode_word(word, sess, model, gr_vocab, rev_ph_vocab) 
          if model_assumption != test_phonemes[i]:
            errors += 1
        else:
          raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) ) 
      elif w not in dupl_error_calculated:
        counter += 1
        dupl_error_calculated.append(w)
        word = " ".join(list(w))
        gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab]
        if not gr_absent:
          model_assumption = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
          if model_assumption not in duplicates[w]:
            errors += 1
        else:
          raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) )

    print("WER : ", errors/counter )
    print("Accuracy : ", (1-errors/counter) )
Example #8
def train(train_gr, train_ph, valid_gr, valid_ph):
  """Train a gr->ph translation model using G2P data."""
  # Prepare G2P data.
  print("Preparing G2P data")
  (train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids,
   gr_vocab_path, ph_vocab_path) = data_utils.prepare_g2p_data(
       FLAGS.model, train_gr, train_ph, valid_gr, valid_ph)
  gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
  ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False, gr_vocab_size, ph_vocab_size)

    # Read data into buckets and compute their sizes.
    print ("Reading development and training data (limit: %d)."
           % FLAGS.max_train_data_size)
    valid_set = read_data(valid_gr_ids, valid_ph_ids)
    train_set = read_data(train_gr_ids, train_ph_ids, FLAGS.max_train_data_size)
    
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size of the i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]
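    # For example, with bucket sizes [100, 300, 600] the cumulative scale is
    # [0.1, 0.4, 1.0]; a random draw of 0.35 then selects bucket 1, the first
    # index whose scale value exceeds 0.35.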

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while FLAGS.max_steps == 0 or current_step <= FLAGS.max_steps:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save a checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        print ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        # Stop if no improvement was seen over the last 34 checkpoints.
        if len(previous_losses) > 34 and previous_losses[-35] <= min(previous_losses[-35:]):
          break
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.model, "translate.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
        # Run evals on development set and print their perplexity.
        for bucket_id in xrange(len(_buckets)):
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              valid_set, bucket_id)
          _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
          eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
          print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
        sys.stdout.flush()