def interactive():
    """Decode words typed on standard input, one per line."""
    with tf.Session() as sess:
        # Create model and load parameters.
        gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
        ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
        gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
        ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
        model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
        model.batch_size = 1  # We decode one word at a time.

        # Load vocabularies.
        gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
        _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        w = sys.stdin.readline()
        word = " ".join(list(w))
        while word:
            gr_absent = [gr for gr in w.replace('\n', '') if gr not in gr_vocab]
            if not gr_absent:
                res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
                print(res_phoneme_seq)
            else:
                print("Symbols: %s not in trained model's vocabulary"
                      % ",".join(gr_absent))
            print("> ", end="")
            sys.stdout.flush()
            w = sys.stdin.readline()
            word = " ".join(list(w))
def load_data(self, debug=False):
    """Loads train/valid/test data and sentence encoding."""
    en_train, fr_train, en_dev, fr_dev, en_vocab_path, fr_vocab_path = data_utils.prepare_data(
        'tmp', 40000, 40000)
    self.source_vocab_to_id, self.source_id_to_vocab = data_utils.initialize_vocabulary(
        en_vocab_path)
    self.target_vocab_to_id, self.target_id_to_vocab = data_utils.initialize_vocabulary(
        fr_vocab_path)

    if self.config.train_mode:
        source_path = './tmp/train.ids40000.questions'
        target_path = './tmp/train.ids40000.answers'
    else:
        source_path = './tmp/test.ids40000.questions'
        target_path = './tmp/test.ids40000.answers'
    sources, targets = data_utils.read_data(source_path, target_path)

    (self.train, self.valid, self.max_t_len, self.max_input_len,
     self.max_sen_len) = data_utils.pad_length_bucket(sources, targets, self.config)

    source_vocab_path = './tmp/vocab40000.questions'
    target_vocab_path = './tmp/vocab40000.answers'
    self.source_vocab_size = data_utils.get_vocab_size(source_vocab_path)
    self.target_vocab_size = data_utils.get_vocab_size(target_vocab_path)
    self.word_embedding = np.random.uniform(
        -self.config.embedding_init, self.config.embedding_init,
        (self.source_vocab_size, self.config.embed_size))
def load_data(self, debug=False):
    """Loads train/valid/test data and sentence encoding."""
    en_train, fr_train, en_dev, fr_dev, en_vocab_path, fr_vocab_path = data_utils.prepare_data(
        'tmp', 40000, 40000)
    self.source_vocab_to_id, self.source_id_to_vocab = data_utils.initialize_vocabulary(
        en_vocab_path)
    self.target_vocab_to_id, self.target_id_to_vocab = data_utils.initialize_vocabulary(
        fr_vocab_path)

    data_dir = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/'
    if self.config.train_mode:
        source_path = data_dir + 'train.ids40000.questions'
        target_path = data_dir + 'train.ids40000.answers'
    else:
        source_path = data_dir + 'test.ids40000.questions'
        target_path = data_dir + 'test.ids40000.answers'
    sources, targets = data_utils.read_data(source_path, target_path)

    (self.train, self.valid, self.max_t_len, self.max_input_len,
     self.max_sen_len) = data_utils.pad_length_bucket(sources, targets, self.config)

    source_vocab_path = data_dir + 'vocab40000.questions'
    target_vocab_path = data_dir + 'vocab40000.answers'
    self.source_vocab_size = data_utils.get_vocab_size(source_vocab_path)
    self.target_vocab_size = data_utils.get_vocab_size(target_vocab_path)
    self.word_embedding = np.random.uniform(
        -self.config.embedding_init, self.config.embedding_init,
        (self.source_vocab_size, self.config.embed_size))
def load_model(sess, gr_vocab_path, ph_vocab_path):
    """Load saved model.

    Args:
        sess: current session;
        gr_vocab_path: path to the grapheme vocabulary;
        ph_vocab_path: path to the phoneme vocabulary.

    Returns:
        model: trained model.
    """
    # Get vocabulary sizes.
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)

    # Load model.
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one word at a time.
    return model
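# Illustrative usage of load_model() above, a minimal sketch rather than part of the
# original file. It assumes the same FLAGS.model layout used by the other g2p snippets
# in this listing, i.e. "vocab.grapheme" and "vocab.phoneme" files inside the model
# directory.
with tf.Session() as sess:
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    model = load_model(sess, gr_vocab_path, ph_vocab_path)
    # The returned model is ready for single-word decoding (model.batch_size == 1).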
def decode():
    """Decode words from the file given by FLAGS.decode and output their pronunciations."""
    with tf.Session() as sess:
        # Create model and load parameters.
        gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
        ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
        gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
        ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
        model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
        model.batch_size = 1  # We decode one word at a time.

        # Load vocabularies.
        gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
        _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

        # Decode from input file.
        graphemes = open(FLAGS.decode).readlines()

        output_file_path = FLAGS.output
        if output_file_path:
            with gfile.GFile(output_file_path, mode="w") as output_file:
                for w in graphemes:
                    word = " ".join(list(w))
                    gr_absent = [gr for gr in w.replace('\n', '') if gr not in gr_vocab]
                    if not gr_absent:
                        res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
                        output_file.write(w.replace('\n', ' '))
                        output_file.write(res_phoneme_seq)
                        output_file.write('\n')
                    else:
                        raise ValueError("Symbols: %s not in trained model's vocabulary"
                                         % ",".join(gr_absent))
        else:
            for w in graphemes:
                word = " ".join(list(w))
                gr_absent = [gr for gr in w.replace('\n', '') if gr not in gr_vocab]
                if not gr_absent:
                    res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
                    print(w.replace('\n', ' ') + res_phoneme_seq)
                    sys.stdout.flush()
                else:
                    raise ValueError("Symbols: %s not in trained model's vocabulary"
                                     % ",".join(gr_absent))
# External libraries
import torch
import pandas as pd

# Internal libraries
import lstm_sentiment as lstm
import data_utils as du
import string_processing as sp

# Values necessary to load the network.
vocab = pd.read_csv("vocabulary.txt", names=['ind', 'word'], encoding='iso-8859-1')
vocab = pd.Series(vocab['ind'].values, index=vocab['word']).to_dict()
vocab_size = du.get_vocab_size("vocabulary.txt")

# Load the network.
network = lstm.LSTMSentiment(vocab_size)
network.load_state_dict(torch.load('model'))
network.eval()

# Get user input.
user_sentence = input("Enter a review: ")

# Process user input and convert it to tokens.
user_sentence = sp.normalize(user_sentence)
user_sentence = sp.tokenize(user_sentence)
user_sentence = sp.get_numbers(user_sentence, vocab)
user_sentence = sp.padding(user_sentence, 30)

# Predict and output results.
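# The snippet above ends at the "Predict and output results." comment. Below is a
# minimal sketch of that step, not taken from the source: it assumes LSTMSentiment's
# forward pass accepts a (1, seq_len) LongTensor and returns a single sigmoid
# probability, which is a guess about this project's API rather than a documented fact.
with torch.no_grad():
    # Wrap the padded token ids in a batch dimension.
    input_tensor = torch.tensor([user_sentence], dtype=torch.long)
    prediction = network(input_tensor)
    score = prediction.item()
    sentiment = "positive" if score >= 0.5 else "negative"
    print("Predicted sentiment: %s (score %.3f)" % (sentiment, score))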
def evaluate():
    """Compute word error rate and accuracy on the file given by FLAGS.evaluate."""
    with tf.Session() as sess:
        # Create model and load parameters.
        gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
        ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
        gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
        ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
        model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
        model.batch_size = 1  # We decode one word at a time.

        # Load vocabularies.
        gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
        _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

        # Read the evaluation file: one "word pronunciation" pair per line.
        test = open(FLAGS.evaluate).read().split('\n')
        test_graphemes = []
        test_phonemes = []
        for line in test:
            lst = line.split()
            if len(lst) >= 2:
                test_graphemes.append(lst[0])
                test_phonemes.append(" ".join(lst[1:]))

        # Collect words that occur more than once; any of their reference
        # pronunciations counts as correct.
        duplicates = {}
        total_dupl_num = 0
        for i, gr in enumerate(test_graphemes):
            if test_graphemes.count(gr) > 1:
                total_dupl_num += test_graphemes.count(gr) - 1
                if gr in duplicates:
                    duplicates[gr].append(test_phonemes[i])
                else:
                    duplicates[gr] = [test_phonemes[i]]

        errors = 0
        counter = 0
        dupl_error_calculated = []
        for i, w in enumerate(test_graphemes):
            if w not in duplicates:
                counter += 1
                word = " ".join(list(w))
                gr_absent = [gr for gr in w.replace('\n', '') if gr not in gr_vocab]
                if not gr_absent:
                    model_assumption = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
                    if model_assumption != test_phonemes[i]:
                        errors += 1
                else:
                    raise ValueError("Symbols: %s not in trained model's vocabulary"
                                     % ",".join(gr_absent))
            elif w not in dupl_error_calculated:
                counter += 1
                dupl_error_calculated.append(w)
                word = " ".join(list(w))
                gr_absent = [gr for gr in w.replace('\n', '') if gr not in gr_vocab]
                if not gr_absent:
                    model_assumption = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
                    if model_assumption not in duplicates[w]:
                        errors += 1
                else:
                    raise ValueError("Symbols: %s not in trained model's vocabulary"
                                     % ",".join(gr_absent))

        print("WER : ", errors / counter)
        print("Accuracy : ", 1 - errors / counter)
def train(train_gr, train_ph, valid_gr, valid_ph):
    """Train a gr->ph translation model using G2P data."""
    # Prepare G2P data.
    print("Preparing G2P data")
    (train_gr_ids, train_ph_ids, valid_gr_ids, valid_ph_ids,
     gr_vocab_path, ph_vocab_path) = data_utils.prepare_g2p_data(
        FLAGS.model, train_gr, train_ph, valid_gr, valid_ph)
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)

    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, False, gr_vocab_size, ph_vocab_size)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)
        valid_set = read_data(valid_gr_ids, valid_ph_ids)
        train_set = read_data(train_gr_ids, train_ph_ids, FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. The length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while FLAGS.max_steps == 0 or current_step <= FLAGS.max_steps:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous checkpoint interval.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("global step %d learning rate %.4f step-time %.2f perplexity "
                      "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                                step_time, perplexity))

                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                # Stop training if no improvement was seen over the last 34 checkpoints.
                if (len(previous_losses) > 34 and
                        previous_losses[-35] <= min(previous_losses[-35:])):
                    break
                previous_losses.append(loss)

                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.model, "translate.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                step_time, loss = 0.0, 0.0

                # Run evals on development set and print their perplexity.
                for bucket_id in xrange(len(_buckets)):
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        valid_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                    eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                    print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                sys.stdout.flush()