import os
import random
import sys

import tensorflow as tf
import tensorflow.contrib.eager as tfe

import config                 # project module: provides max_guesses and reinforce_alpha
from episode import Episode   # project module: the Episode roll-out helper
from policy import Policy     # assumed module name for the Policy class shown below


def train(num_episodes=1000, save_every=100, checkpoint_dir="checkpoints",
          tensorboard_dir="tensorboard", tboard_every=10, find_target_prop=0):
    pol = Policy()
    writer = tf.contrib.summary.create_file_writer(tensorboard_dir)
    for j in range(1, num_episodes + 1):
        # Roll out one episode against a randomly chosen secret.
        random_secret = random.randint(0, config.max_guesses - 1)
        e = Episode(pol, random_secret, find_target_prop, True)
        history = e.generate()
        print("Episode: {}, length: {}".format(j, len(history)))

        # REINFORCE update. Every guess costs -1, so G tracks the return-to-go
        # of the action currently being updated; the learning rate is
        # alpha * G and is refreshed as we walk the episode backwards.
        G = -1
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=config.reinforce_alpha * G)
        for i in reversed(range(1, len(history))):
            # Each recorded action is the cross-entropy target for the
            # history that preceded it.
            history_so_far = history[:i]
            next_action, _ = history[i]
            with tfe.GradientTape() as tape:
                action_logits = pol(history_so_far, with_softmax=False)
                loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=tf.one_hot(tf.convert_to_tensor([next_action]),
                                      config.max_guesses),
                    logits=action_logits)
            grads = tape.gradient(loss, pol.variables)
            optimizer.apply_gradients(zip(grads, pol.variables))
            G -= 1
            optimizer._learning_rate = G * config.reinforce_alpha
            optimizer._learning_rate_tensor = None
            # hack. Should be able to pass a callable as learning_rate, see
            # https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer#args
            # can I perhaps submit a PR to fix this bug?
            sys.stdout.write("{}/{}\r".format(len(history) - i, len(history)))

        if j % save_every == 0 or j == num_episodes:
            saver = tfe.Saver(pol.named_variables)
            save_path = os.path.join(
                checkpoint_dir,
                "episode{}".format(str(j).zfill(len(str(num_episodes)))))
            saver.save(save_path)

        if j % tboard_every == 0:
            with writer.as_default():
                with tf.contrib.summary.always_record_summaries():
                    tf.contrib.summary.scalar('total_return',
                                              tf.convert_to_tensor([G]),
                                              step=j)
    return pol
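
# A minimal, untested sketch of the callable-learning-rate alternative that
# the "hack" comment in train() points at: the GradientDescentOptimizer docs
# linked there say that, with eager execution enabled, learning_rate may be a
# zero-argument callable, re-evaluated on every apply_gradients call. The toy
# variable, loss, and function name below are illustrative only and not part
# of the project; the point is just that the closure picks up updates to G.
def _callable_lr_sketch(alpha=0.01, steps=3):
    G = -1
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=lambda: alpha * G)
    w = tfe.Variable(1.0)
    for _ in range(steps):
        with tfe.GradientTape() as tape:
            loss = tf.square(w)
        grads = tape.gradient(loss, [w])
        # Each apply_gradients call re-evaluates the lambda, so the step size
        # follows the current G without touching private optimizer attributes.
        optimizer.apply_gradients(zip(grads, [w]))
        G -= 1
    return w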

    # Policy forward pass (the enclosing class and method signature are not
    # shown in this excerpt): run the (guess, feedback) history through the
    # LSTM one step at a time, then map the final output to action logits.
    state = self.lstm.zero_state(1, tf.float32)
    for guess, feedback in game_state:
        guess_tensor = tf.reshape(tf.convert_to_tensor(guess), (1,))
        feedback_tensor = tf.reshape(tf.convert_to_tensor(feedback), (1,))
        guess_embedded = self.guess_embedding(guess_tensor)
        feedback_embedded = self.feedback_embedding(feedback_tensor)
        combined_embedded = tf.concat([guess_embedded, feedback_embedded],
                                      axis=-1)
        # can I do multiple inputs to the LSTM instead of concatenating?
        output, state = self.lstm(combined_embedded, state)
    logits = self.dense(output)
    if with_softmax:
        return tf.nn.softmax(logits)
    return logits


if __name__ == "__main__":
    from episode import Episode
    import numpy as np

    np.random.seed(123)
    p = Policy()
    e = Episode(p, "0000")
    x = p(e.generate())
    print(x.numpy())

    # Policy forward pass as above, followed by a slightly extended smoke test.
    state = self.lstm.zero_state(1, tf.float32)
    for guess, feedback in game_state:
        guess_tensor = tf.reshape(tf.convert_to_tensor(guess), (1,))
        feedback_tensor = tf.reshape(tf.convert_to_tensor(feedback), (1,))
        guess_embedded = self.guess_embedding(guess_tensor)
        feedback_embedded = self.feedback_embedding(feedback_tensor)
        combined_embedded = tf.concat([guess_embedded, feedback_embedded],
                                      axis=-1)
        # can I do multiple inputs to the LSTM instead of concatenating?
        output, state = self.lstm(combined_embedded, state)
    logits = self.dense(output)
    if with_softmax:
        return tf.nn.softmax(logits)
    return logits


if __name__ == "__main__":
    from episode import Episode
    import numpy as np

    np.random.seed(123)
    p = Policy()
    print("lstm")
    print(p.lstm)
    e = Episode(p, "0000", find_target_proba=0.3, train=True)
    g = e.generate()
    print(g)
    x = p(g)
    print(x)
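
# Re: the "multiple inputs to the LSTM" question in the comments above.
# Standard cells (e.g. tf.nn.rnn_cell.LSTMCell, which matches the zero_state
# call used here) take a single input tensor per step, so the two embeddings
# have to be merged before the cell. Concatenation is the usual choice; a
# hedged, illustrative alternative is element-wise addition, which only works
# when both embeddings have the same size. Shapes, sizes, and the function
# name below are made up for this sketch and are not part of the project.
def _merge_embeddings_sketch():
    guess_embedded = tf.random_normal((1, 8))     # stand-in for self.guess_embedding(...)
    feedback_embedded = tf.random_normal((1, 8))  # stand-in for self.feedback_embedding(...)

    # Option 1: concatenate along the feature axis (what Policy does above);
    # the cell's input size is the sum of the two embedding sizes.
    cell_concat = tf.nn.rnn_cell.LSTMCell(16)
    concat_in = tf.concat([guess_embedded, feedback_embedded], axis=-1)
    out_concat, _ = cell_concat(concat_in, cell_concat.zero_state(1, tf.float32))

    # Option 2: element-wise addition keeps the cell input the size of one
    # embedding, at the cost of mixing the two signals before the cell.
    cell_sum = tf.nn.rnn_cell.LSTMCell(16)
    sum_in = guess_embedded + feedback_embedded
    out_sum, _ = cell_sum(sum_in, cell_sum.zero_state(1, tf.float32))

    return out_concat, out_sum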