def create_load_vocab(arg, file_name, out_file_name, pad=True, unk=True,
                      sos_eos=False):
    """Creates and loads the vocab file for a given corpus.

    Args:
        arg: The output of the parser.
        file_name: The name of the file containing the corpus.
        out_file_name: The file to which the vocab should be written.
        pad: A boolean indicating whether the pad token should be included in
            the vocabulary.
        unk: A boolean indicating whether the unknown token should be included
            in the vocabulary.
        sos_eos: A boolean indicating whether the SOS and EOS tokens should be
            included in the vocabulary.

    Returns:
        A dictionary mapping each vocabulary item to its index, along with a
        list of all the vocabulary items.
    """
    full_path = os.path.join('./top_data', arg.train_data_path, file_name)
    output_path = os.path.join(arg.vocab_path, out_file_name)
    create_vocabulary(full_path, output_path, pad, unk, sos_eos)
    vocab = load_vocabulary(output_path)
    return vocab
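The create_vocabulary and load_vocabulary helpers called above are not shown in this snippet. Below is a minimal sketch of what load_vocabulary might look like, assuming the vocab file written by create_vocabulary stores one token per line in index order; the helper body and the file layout are assumptions for illustration only.

def load_vocabulary_sketch(vocab_path):
    # Read tokens in index order, one per line (assumed file layout).
    with open(vocab_path) as vocab_file:
        tokens = [line.strip() for line in vocab_file if line.strip()]
    # Map each token to its index, and also return the token list itself.
    token_to_id = {token: idx for idx, token in enumerate(tokens)}
    return token_to_id, tokens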
def __init__(self, train_file=None, valid_file=None, test_file=None):
    """Create G2P model and initialize or load parameters in session."""
    self.test_file = test_file

    # Preliminary actions before model creation.
    if FLAGS.train:
        # Load model parameters.
        num_layers, size = data_utils.save_params(FLAGS.num_layers, FLAGS.size,
                                                  FLAGS.model)
        batch_size = FLAGS.batch_size

        # Prepare G2P data.
        print("Preparing G2P data")
        (train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids,
         self.gr_vocab, self.ph_vocab) = data_utils.prepare_g2p_data(
             FLAGS.model, train_file, valid_file)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data.")
        self.valid_set = self.__put_into_buckets(valid_gr_ids, valid_ph_ids)
        self.train_set = self.__put_into_buckets(train_gr_ids, train_ph_ids)
    else:
        # Load model parameters.
        num_layers, size = data_utils.load_params(FLAGS.num_layers, FLAGS.size,
                                                  FLAGS.model)
        batch_size = 1  # We decode one word at a time.

        # Load vocabularies.
        self.gr_vocab = data_utils.load_vocabulary(
            os.path.join(FLAGS.model, "vocab.grapheme"))
        self.ph_vocab = data_utils.load_vocabulary(
            os.path.join(FLAGS.model, "vocab.phoneme"))
        self.rev_ph_vocab = data_utils.load_vocabulary(
            os.path.join(FLAGS.model, "vocab.phoneme"), reverse=True)

    self.session = tf.Session()

    # Create model.
    print("Creating %d layers of %d units." % (num_layers, size))
    self.model = seq2seq_model.Seq2SeqModel(
        len(self.gr_vocab), len(self.ph_vocab), self._BUCKETS, size,
        num_layers, FLAGS.max_gradient_norm, batch_size, FLAGS.learning_rate,
        FLAGS.learning_rate_decay_factor, forward_only=not FLAGS.train)
    self.model.saver = tf.train.Saver(tf.all_variables(), max_to_keep=1)
    self.__create_model()
def __init__(self, train_dic=None, valid_dic=None, test_dic=None):
    """Create G2P model and initialize or load parameters in session."""
    self.test_dic = test_dic

    # Preliminary actions before model creation.
    if FLAGS.train:
        # Load model parameters.
        num_layers, size = data_utils.save_params(FLAGS.num_layers, FLAGS.size,
                                                  FLAGS.model)
        batch_size = FLAGS.batch_size

        # Prepare G2P data.
        print("Preparing G2P data")
        (train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids,
         self.gr_vocab, self.ph_vocab) = data_utils.prepare_g2p_data(
             FLAGS.model, train_dic, valid_dic)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data.")
        self.valid_set = self.__put_into_buckets(valid_gr_ids, valid_ph_ids)
        self.train_set = self.__put_into_buckets(train_gr_ids, train_ph_ids)
    else:
        # Load model parameters.
        num_layers, size = data_utils.load_params(FLAGS.num_layers, FLAGS.size,
                                                  FLAGS.model)
        batch_size = 1  # We decode one word at a time.

        # Load vocabularies.
        self.gr_vocab = data_utils.load_vocabulary(
            os.path.join(FLAGS.model, "vocab.grapheme"))
        self.ph_vocab = data_utils.load_vocabulary(
            os.path.join(FLAGS.model, "vocab.phoneme"))
        self.rev_ph_vocab = data_utils.load_vocabulary(
            os.path.join(FLAGS.model, "vocab.phoneme"), reverse=True)

    self.session = tf.Session()

    # Create model.
    print("Creating %d layers of %d units." % (num_layers, size))
    self.model = seq2seq_model.Seq2SeqModel(
        len(self.gr_vocab), len(self.ph_vocab), self._BUCKETS, size,
        num_layers, FLAGS.max_gradient_norm, batch_size, FLAGS.learning_rate,
        FLAGS.learning_rate_decay_factor, forward_only=not FLAGS.train)
    self.__create_model()
def load_vocabs_load_model(sess):
    """Load vocabularies and saved model.

    Returns:
        gr_vocab: Grapheme vocabulary.
        rev_ph_vocab: Reversed phoneme vocabulary.
        model: Trained model.
    """
    # Load vocabularies.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab = data_utils.load_vocabulary(gr_vocab_path, False)
    rev_ph_vocab = data_utils.load_vocabulary(ph_vocab_path, True)

    # Get vocabulary sizes.
    gr_vocab_size = len(gr_vocab)
    ph_vocab_size = len(rev_ph_vocab)

    # Load model.
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one word at a time.

    return (gr_vocab, rev_ph_vocab, model)
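For context, a hedged illustration of how the two vocabularies returned here are typically used around decoding: gr_vocab maps graphemes to ids for the encoder input, and rev_ph_vocab maps output ids back to phoneme symbols. The helper names and the UNK id default below are assumptions for illustration, not part of this code.

def word_to_grapheme_ids(word, gr_vocab, unk_id=3):
    # Map each character of the word to its grapheme id, falling back to UNK.
    return [gr_vocab.get(ch, unk_id) for ch in word]

def phoneme_ids_to_symbols(output_ids, rev_ph_vocab):
    # Map decoded ids back to phoneme symbols via the reversed vocabulary.
    return [rev_ph_vocab[i] for i in output_ids]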
def load_decode_model(self):
    """Load G2P model and initialize or load parameters in session."""
    if not os.path.exists(os.path.join(self.model_dir, 'checkpoint')):
        raise RuntimeError("Model not found in %s" % self.model_dir)

    self.batch_size = 1  # We decode one word at a time.

    # Load model parameters.
    num_layers, size = data_utils.load_params(self.model_dir)

    # Load vocabularies.
    print("Loading vocabularies from %s" % self.model_dir)
    self.gr_vocab = data_utils.load_vocabulary(
        os.path.join(self.model_dir, "vocab.grapheme"))
    self.ph_vocab = data_utils.load_vocabulary(
        os.path.join(self.model_dir, "vocab.phoneme"))
    self.rev_ph_vocab = data_utils.load_vocabulary(
        os.path.join(self.model_dir, "vocab.phoneme"), reverse=True)

    self.session = tf.Session()

    # Restore model.
    print("Creating %d layers of %d units." % (num_layers, size))
    self.model = seq2seq_model.Seq2SeqModel(
        len(self.gr_vocab), len(self.ph_vocab), self._BUCKETS, size,
        num_layers, 0, self.batch_size, 0, 0, forward_only=True)
    self.model.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

    # Check for saved models and restore them.
    print("Reading model parameters from %s" % self.model_dir)
    self.model.saver.restore(self.session,
                             os.path.join(self.model_dir, "model"))
def train():
    embedding_mat = data_utils.load_embedding_mat(FLAGS.save_embedding_file)
    vocab = data_utils.load_vocabulary(FLAGS.vocabulary_file)
    model = cnn_model.CNN_MODEL(embed_dim=FLAGS.embedding_dim,
                                filter_sizes=FLAGS.filter_sizes,
                                max_sent_len=FLAGS.max_sentence_len,
                                embedding_mat=embedding_mat,
                                word_nums=len(vocab),
                                filter_nums=FLAGS.filter_nums,
                                label_nums=FLAGS.label_nums,
                                learning_rate=FLAGS.learning_rate,
                                model_path=FLAGS.model_path,
                                epoch=FLAGS.num_epochs,
                                batch_size=FLAGS.batch_size,
                                dropout_prob=FLAGS.dropout_keep_prob)
    train_data = data_utils.generate_data('./data/train_data.ids',
                                          FLAGS.max_sentence_len, vocab)
    valid_data = data_utils.generate_data('./data/valid_data.ids',
                                          FLAGS.max_sentence_len, vocab)
    print('train data size is {}, valid data size is {}.'.format(
        len(train_data[0]), len(valid_data[0])))
    model.train(train_data, valid_data)
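data_utils.generate_data is not shown here; from the call and the size print above it appears to return a (sentences, labels) pair padded to max_sentence_len. A rough sketch under that assumption; the "label<TAB>space-separated ids" file format and the pad id are assumptions for illustration only.

def generate_data_sketch(ids_path, max_sentence_len, vocab, pad_id=0):
    # vocab may be used for OOV handling in the real helper; unused here.
    sentences, labels = [], []
    with open(ids_path) as ids_file:
        for line in ids_file:
            label, id_str = line.rstrip('\n').split('\t')
            # Truncate, then pad each sentence to max_sentence_len ids.
            ids = [int(tok) for tok in id_str.split()][:max_sentence_len]
            ids += [pad_id] * (max_sentence_len - len(ids))
            sentences.append(ids)
            labels.append(int(label))
    return sentences, labels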
def train(train_dic, valid_dic, test_dic):
    """Train a gr->ph translation model using G2P data."""
    if not os.path.exists(FLAGS.model):
        os.makedirs(FLAGS.model)

    # Save model's architecture.
    params_path = os.path.join(FLAGS.model, "model.params")
    with open(params_path, 'w') as param_file:
        param_file.write("num_layers:" + str(FLAGS.num_layers) + "\n")
        param_file.write("size:" + str(FLAGS.size))

    # Prepare G2P data.
    print("Preparing G2P data")
    train_gr, train_ph = data_utils.split_to_grapheme_phoneme(train_dic)
    valid_gr, valid_ph = data_utils.split_to_grapheme_phoneme(valid_dic)
    (train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids, gr_vocab,
     ph_vocab) = data_utils.prepare_g2p_data(FLAGS.model, train_gr, train_ph,
                                             valid_gr, valid_ph)
    gr_vocab_size = len(gr_vocab)
    ph_vocab_size = len(ph_vocab)

    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers,
                                                   FLAGS.size))
        model = create_model(sess, False, gr_vocab_size, ph_vocab_size)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data.")
        valid_set = put_into_buckets(valid_gr_ids, valid_ph_ids)
        train_set = put_into_buckets(train_gr_ids, train_ph_ids)
        train_bucket_sizes = [len(train_set[b])
                              for b in xrange(len(_BUCKETS))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that
        # we'll use to select a bucket. Length of [scale[i], scale[i+1]] is
        # proportional to the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in xrange(len(train_bucket_sizes))]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while (FLAGS.max_steps == 0
               or model.global_step.eval() <= FLAGS.max_steps):
            # Choose a bucket according to data distribution. We pick a random
            # number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save a checkpoint, print statistics, and run
            # evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("global step %d learning rate %.4f step-time %.2f "
                      "perplexity %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))

                # Decrease learning rate if no improvement was seen over the
                # last 3 checkpoints.
                if (len(previous_losses) > 2
                        and loss > max(previous_losses[-3:])):
                    sess.run(model.learning_rate_decay_op)
                # Stop training if no improvement was seen over the last 34
                # checkpoints.
                if (len(previous_losses) > 34
                        and previous_losses[-35] <= min(previous_losses[-35:])):
                    break
                previous_losses.append(loss)

                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.model, "translate.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0

                # Run evals on development set and print their perplexity.
                for bucket_id in xrange(len(_BUCKETS)):
                    encoder_inputs, decoder_inputs, target_weights = \
                        model.get_batch(valid_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights, bucket_id,
                                                 True)
                    eval_ppx = (math.exp(eval_loss) if eval_loss < 300
                                else float('inf'))
                    print(" eval: bucket %d perplexity %.2f"
                          % (bucket_id, eval_ppx))

        print('Training process stopped.')
        print('Beginning calculation of word error rate (WER) on the test '
              'sample.')
        ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
        rev_ph_vocab = data_utils.load_vocabulary(ph_vocab_path, True)
        model.forward_only = True
        model.batch_size = 1  # We decode one word at a time.
        evaluate(test_dic, sess, model, gr_vocab, rev_ph_vocab, ph_vocab)
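put_into_buckets (and the __put_into_buckets method in the snippets above) is not shown here. It presumably follows the standard TensorFlow seq2seq bucketing pattern: append EOS to each target sequence and place the pair into the smallest bucket that fits. A sketch under that assumption; the eos_id default is also an assumption.

def put_into_buckets_sketch(source_ids, target_ids, buckets, eos_id=2):
    # One list of [source, target] pairs per bucket.
    data_set = [[] for _ in buckets]
    for source, target in zip(source_ids, target_ids):
        target = target + [eos_id]  # Decoder targets end with EOS.
        for bucket_id, (source_size, target_size) in enumerate(buckets):
            # Place the pair into the smallest bucket that fits it.
            if len(source) < source_size and len(target) < target_size:
                data_set[bucket_id].append([source, target])
                break
    return data_set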
def build_vocabularies():
    vocabulary = du.load_vocabulary()
    rev_vocabulary = du.build_reverse_vocabulary(vocabulary)
    return (vocabulary, rev_vocabulary)
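du.build_reverse_vocabulary is not shown; assuming du.load_vocabulary returns a token-to-id dict, the reverse vocabulary is presumably just the inverted mapping, as sketched below.

def build_reverse_vocabulary_sketch(vocabulary):
    # Invert the token -> id mapping into id -> token.
    return {idx: token for token, idx in vocabulary.items()}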