import os
import sys
import logging
from typing import List

import numpy as np
import tensorflow as tf

# Project-internal imports. The module paths below are assumptions about the
# package layout, since the import statements are not part of this excerpt.
# The older functions additionally reference constants such as MODEL_FILENAME,
# VOCAB_FILENAME, NUM_STEPS, NUM_LAYERS and STATE_SIZE directly; these are
# assumed to come from the same constants module.
from romanesco import reader
from romanesco import const as C
from romanesco.vocab import Vocabulary
from romanesco.compgraph import define_computation_graph


def score(data: str, load_from: str, batch_size: int, **kwargs):
    """Scores a text using a trained language model. See argument description in `bin/romanesco`."""
    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, 'vocab.json'))

    # convert input data to a list of word ids
    raw_data = reader.read(data, vocab)

    inputs, targets, loss, _, _, _, _, init_state = define_computation_graph(vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, MODEL_FILENAME))

        # zero initial state of the stacked RNN: (layer, c/h, batch, state size)
        _current_state = np.zeros((NUM_LAYERS, 2, batch_size, STATE_SIZE))

        total_loss = 0.0
        total_iter = 0
        for x, y in reader.iterate(raw_data, batch_size, NUM_STEPS):
            l = session.run([loss], feed_dict={
                inputs: x,
                targets: y,
                init_state: _current_state,
            })
            total_loss += l[0]
            total_iter += 1
        perplexity = np.exp(total_loss / total_iter)
        return perplexity

def train(data: str,
          epochs: int = C.NUM_EPOCHS,
          batch_size: int = C.BATCH_SIZE,
          hidden_size: int = C.HIDDEN_SIZE,
          embedding_size: int = C.EMBEDDING_SIZE,
          vocab_max_size: int = C.VOCAB_SIZE,
          save_to: str = C.MODEL_PATH,
          log_to: str = C.LOGS_PATH,
          num_steps: int = C.NUM_STEPS,
          **kwargs):
    """Trains a language model. See argument description in `bin/romanesco`."""

    # create vocabulary to map words to ids
    vocab = Vocabulary()
    vocab.build(data, max_size=vocab_max_size)
    vocab.save(os.path.join(save_to, C.VOCAB_FILENAME))

    # convert training data to list of word ids
    raw_data = reader.read(data, vocab)

    # define computation graph
    inputs, targets, loss, train_step, _, summary = define_computation_graph(
        vocab_size=vocab.size,
        batch_size=batch_size,
        num_steps=num_steps,
        hidden_size=hidden_size,
        embedding_size=embedding_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())

        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to, graph=tf.get_default_graph())

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            total_iter = 0
            # use `num_steps` here rather than C.NUM_STEPS, so that a non-default
            # value passed as an argument is actually respected
            for x, y in reader.iterate(raw_data, batch_size, num_steps):
                l, _, s = session.run([loss, train_step, summary],
                                      feed_dict={inputs: x, targets: y})
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % 100 == 0:
                    logging.debug("Epoch=%s, iteration=%s", epoch, total_iter)
            perplexity = np.exp(total_loss / total_iter)
            logging.info("Perplexity on training data after epoch %s: %.2f", epoch, perplexity)
            saver.save(session, os.path.join(save_to, C.MODEL_FILENAME))
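
# Example (hedged): a minimal sketch of how `train` might be called directly,
# assuming a plain-text training corpus at `data/train.txt` (a hypothetical
# path) and the default hyperparameters for everything not given explicitly.
# In normal use these functions are driven by the `bin/romanesco` command line
# tool rather than imported and called by hand.
#
#     train(data='data/train.txt',
#           epochs=10,
#           batch_size=32,
#           save_to='model',
#           log_to='logs')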

def train(data: str, epochs: int, batch_size: int, vocab_max_size: int,
          save_to: str, log_to: str, **kwargs):
    """Trains a language model. See argument description in `bin/romanesco`."""

    # create folders for model and logs if they don't exist yet
    for folder in [save_to, log_to]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    # create vocabulary to map words to ids
    vocab = Vocabulary()
    vocab.build(data, max_size=vocab_max_size)
    vocab.save(os.path.join(save_to, VOCAB_FILENAME))

    # convert training data to list of word ids
    raw_data = reader.read(data, vocab)

    # define computation graph
    inputs, targets, loss, train_step, _, summary, current_state, init_state = define_computation_graph(
        vocab.size, batch_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # init
        session.run(tf.global_variables_initializer())

        # write logs (@tensorboard)
        summary_writer = tf.summary.FileWriter(log_to, graph=tf.get_default_graph())

        # iterate over training data `epochs` times
        for epoch in range(1, epochs + 1):
            # zero initial state of the stacked RNN: (layer, c/h, batch, state size)
            _current_state = np.zeros((NUM_LAYERS, 2, batch_size, STATE_SIZE))

            total_loss = 0.0
            total_iter = 0
            for x, y in reader.iterate(raw_data, batch_size, NUM_STEPS):
                # feed the state back in, so the RNN state carries over between batches
                l, _, _current_state, s = session.run(
                    [loss, train_step, current_state, summary],
                    feed_dict={
                        inputs: x,
                        targets: y,
                        init_state: _current_state,
                    })
                summary_writer.add_summary(s, total_iter)
                total_loss += l
                total_iter += 1
                if total_iter % 100 == 0:
                    logging.debug("Epoch=%s, iteration=%s", epoch, total_iter)
            perplexity = np.exp(total_loss / total_iter)
            logging.info("Perplexity on training data after epoch %s: %.2f", epoch, perplexity)
            saver.save(session, os.path.join(save_to, MODEL_FILENAME))

def score(data: str,
          load_from: str = C.MODEL_PATH,
          batch_size: int = C.BATCH_SIZE,
          hidden_size: int = C.HIDDEN_SIZE,
          embedding_size: int = C.EMBEDDING_SIZE,
          num_steps: int = C.NUM_STEPS,
          **kwargs):
    """Scores a text using a trained language model. See argument description in `bin/romanesco`."""
    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, C.VOCAB_FILENAME))

    raw_data = reader.read(data, vocab)
    data_length = len(raw_data)

    if data_length < num_steps:
        logging.warning("Length of input data is shorter than NUM_STEPS. Will try to reduce NUM_STEPS.")
        num_steps = data_length - 1
    if data_length < batch_size * num_steps:
        logging.warning("Length of input data is shorter than BATCH_SIZE * NUM_STEPS. Will try to set batch size to 1.")
        batch_size = 1

    inputs, targets, loss, _, _, _ = define_computation_graph(
        vocab_size=vocab.size,
        batch_size=batch_size,
        num_steps=num_steps,
        hidden_size=hidden_size,
        embedding_size=embedding_size)

    saver = tf.train.Saver()

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        total_loss = 0.0
        total_iter = 0
        for x, y in reader.iterate(raw_data, batch_size, num_steps):
            l = session.run([loss], feed_dict={inputs: x, targets: y})
            total_loss += l[0]
            total_iter += 1
        perplexity = np.exp(total_loss / total_iter)
        return perplexity
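
# Example (hedged): scoring a held-out text with a previously trained model.
# The paths are hypothetical; `load_from` must point to the directory in which
# `train` saved the model checkpoint and vocabulary.
#
#     ppl = score(data='data/test.txt', load_from='model')
#     print('Perplexity: {:.2f}'.format(ppl))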

def sample(length: int, load_from: str, first_symbol: str = None, **kwargs):
    """Generates a text by sampling from a trained language model. See argument description in `bin/romanesco`."""
    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, 'vocab.json'))

    inputs, targets, _, _, logits, _ = define_computation_graph(vocab.size, 1)

    saver = tf.train.Saver()

    sampled_sequence = []

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, MODEL_FILENAME))

        if first_symbol:
            try:
                sampled_symbol = vocab.get_id(first_symbol)
            except KeyError:
                logging.error('Unknown symbol `%s`. Try with another start symbol.', first_symbol)
                sys.exit(0)
        else:
            sampled_symbol = vocab.get_random_id()

        x = np.array(np.zeros(NUM_STEPS, dtype=int))  # padding with zeros (UNK)
        y = np.array(np.zeros(NUM_STEPS, dtype=int))  # we don't care about gold targets here

        UNK_ID = vocab.get_id(UNK)

        for _ in range(length):
            sampled_sequence.append(sampled_symbol)
            x = np.roll(x, -1)
            x[NUM_STEPS - 1] = sampled_symbol
            l = session.run([logits], feed_dict={inputs: [x], targets: [y]})
            next_symbol_logits = l[0][0][-1]  # first returned session variable, first batch, last symbol
            next_symbol_probs = softmax(next_symbol_logits)
            # avoid generating unknown words
            sampled_symbol = UNK_ID
            while sampled_symbol == UNK_ID:  # TODO: avoid infinite loop
                sampled_symbol = np.random.choice(range(vocab.size), p=next_symbol_probs)

    words = vocab.get_words(sampled_sequence)
    return ' '.join(words).replace(' ' + EOS + ' ', '\n')  # OPTIMIZE: remove <eos> at the very end
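
# Both `sample` functions rely on a `softmax` helper that is not part of this
# excerpt. The sketch below is an assumption about what that helper does: a
# minimal, numerically stable softmax that turns a logit vector into the
# probability distribution passed to `np.random.choice`.

def softmax(logits: np.ndarray) -> np.ndarray:
    """Turns a vector of logits into a probability distribution (assumed helper)."""
    shifted = logits - np.max(logits)  # subtract the max for numerical stability
    exp = np.exp(shifted)
    return exp / np.sum(exp)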

def sample(length: int = C.SAMPLE_LENGTH,
           load_from: str = C.MODEL_PATH,
           first_symbols: List[str] = [],
           hidden_size: int = C.HIDDEN_SIZE,
           embedding_size: int = C.EMBEDDING_SIZE,
           num_steps: int = C.NUM_STEPS,
           **kwargs):
    """Generates a text by sampling from a trained language model. See argument description in `bin/romanesco`."""
    vocab = Vocabulary()
    vocab.load(os.path.join(load_from, C.VOCAB_FILENAME))

    inputs, targets, _, _, logits, _ = define_computation_graph(
        vocab_size=vocab.size,
        batch_size=1,
        num_steps=num_steps,
        hidden_size=hidden_size,
        embedding_size=embedding_size)

    saver = tf.train.Saver()

    sampled_sequence = []

    with tf.Session() as session:
        # load model
        saver.restore(session, os.path.join(load_from, C.MODEL_FILENAME))

        if first_symbols != []:
            try:
                first_symbol_ids = [vocab.get_id(symbol, strict=True) for symbol in first_symbols]
            except KeyError:
                logging.error('Unknown first symbol. Try with other first symbols.')
                sys.exit(0)
        else:
            # if no prime text, then just sample a single symbol
            first_symbol_ids = [vocab.get_random_id()]

        x = np.array(np.zeros(num_steps, dtype=int))  # padding with zeros (UNK)
        y = np.array(np.zeros(num_steps, dtype=int))  # we don't care about gold targets here

        UNK_ID = vocab.get_id(C.UNK)

        sampled_symbol = first_symbol_ids.pop(0)

        for _ in range(length):
            sampled_sequence.append(sampled_symbol)
            x = np.roll(x, -1)
            x[num_steps - 1] = sampled_symbol
            l = session.run([logits], feed_dict={inputs: [x], targets: [y]})
            next_symbol_logits = l[0][0][-1]  # first returned session variable, first batch, last symbol
            next_symbol_probs = softmax(next_symbol_logits)
            try:
                # use the next priming symbol if any are left
                sampled_symbol = first_symbol_ids.pop(0)
            except IndexError:  # list of priming symbols is exhausted
                # avoid generating unknown words
                sampled_symbol = UNK_ID
                while sampled_symbol == UNK_ID:  # TODO: avoid infinite loop
                    sampled_symbol = np.random.choice(range(vocab.size), p=next_symbol_probs)

    words = vocab.get_words(sampled_sequence)
    # replace end-of-sentence symbols with line breaks
    for index, word in enumerate(words):
        if word == C.EOS:
            words[index] = "\n"
    return ' '.join(words)
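
# Example (hedged): generating a short text from a trained model, primed with a
# few starting words. Paths and prime text are hypothetical:
#
#     text = sample(length=100, load_from='model', first_symbols=['the', 'cat'])
#     print(text)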