def __init__(self, name, env, policy_net, value_net, global_counter,
             discount_factor=0.99, summary_writer=None, max_global_steps=None,
             log_frequency=1000):
    self.name = name
    self.discount_factor = discount_factor
    self.max_global_steps = max_global_steps
    self.global_step = tf.contrib.framework.get_global_step()
    self.global_policy_net = policy_net
    self.global_value_net = value_net
    self.global_counter = global_counter
    self.local_counter = itertools.count()
    self.sp = StateProcessor()
    self.summary_writer = summary_writer
    self.env = env
    self.log_frequency = log_frequency

    # Create local policy/value nets that are not updated asynchronously
    with tf.variable_scope(name):
        self.policy_net = PolicyEstimator(policy_net.num_outputs)
        self.value_net = ValueEstimator(reuse=True)

    # Op to copy params from the global policy/value nets
    self.copy_params_op = make_copy_params_op(
        tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
        tf.contrib.slim.get_variables(scope=self.name + '/', collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
    self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

    self.state = None
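# The helpers make_copy_params_op and make_train_op used above are not defined in this
# section. Below is a minimal sketch of what they might look like, assuming each
# estimator exposes `grads_and_vars` (as the tests in this section do) and an
# `optimizer` attribute (an assumption); this is an illustration, not the
# authoritative implementation.
def make_copy_params_op(v1_list, v2_list):
    """Creates ops that copy the values of the variables in v1_list to v2_list.

    Variables are matched by name order, so both lists must contain the same
    variables defined in the same order in their respective scopes.
    """
    v1_list = list(sorted(v1_list, key=lambda v: v.name))
    v2_list = list(sorted(v2_list, key=lambda v: v.name))
    update_ops = []
    for v1, v2 in zip(v1_list, v2_list):
        update_ops.append(v2.assign(v1))
    return update_ops


def make_train_op(local_estimator, global_estimator):
    """Creates an op that applies the local estimator's gradients to the
    global estimator's variables (the core A3C update)."""
    local_grads, _ = zip(*local_estimator.grads_and_vars)
    # Clip gradients to avoid destabilizing the shared parameters
    local_grads, _ = tf.clip_by_global_norm(local_grads, 5.0)
    _, global_vars = zip(*global_estimator.grads_and_vars)
    local_global_grads_and_vars = list(zip(local_grads, global_vars))
    return global_estimator.optimizer.apply_gradients(
        local_global_grads_and_vars,
        global_step=tf.contrib.framework.get_global_step())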
def __init__(self, name, env, policy_net, value_net, global_counter,
             discount_factor=0.99, summary_writer=None, max_global_steps=None):
    self.name = name
    self.discount_factor = discount_factor
    self.max_global_steps = max_global_steps
    self.global_step = tf.contrib.framework.get_global_step()
    self.global_policy_net = policy_net
    self.global_value_net = value_net
    self.global_counter = global_counter
    self.local_counter = itertools.count()
    self.sp = StateProcessor()
    self.summary_writer = summary_writer
    self.env = env

    # Create local policy/value nets that are not updated asynchronously
    with tf.variable_scope(name):
        self.policy_net = PolicyEstimator(policy_net.num_outputs)
        self.value_net = ValueEstimator(reuse=True)

    # Op to copy params from the global policy/value nets
    self.copy_params_op = make_copy_params_op(
        tf.contrib.slim.get_variables(
            scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
        tf.contrib.slim.get_variables(
            scope=self.name + '/', collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    # Ops to apply local gradients to the global networks
    self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
    self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

    self.state = None
def __init__(self, name, env, model_net, global_counter, discount_factor=0.99,
             summary_writer=None, max_global_steps=None):
    self.name = name
    self.discount_factor = discount_factor
    self.max_global_steps = max_global_steps
    self.global_step = tf.contrib.framework.get_global_step()
    self.global_model_net = model_net
    self.global_counter = global_counter
    self.local_counter = itertools.count()
    self.sp = StateProcessor()
    self.summary_writer = summary_writer
    self.env = env
    self.total_reward = 0
    self.episode_length = 0

    # Create a local model net that is not updated asynchronously
    with tf.variable_scope(name):
        self.model_net = Model(model_net.num_outputs)

    # Op to copy params from the global model net
    self.copy_params_op = make_copy_params_op(
        tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
        tf.contrib.slim.get_variables(scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    self.mnet_train_op = make_train_op(self.model_net, self.global_model_net)

    self.state = None
def __init__(self, env, policy_net, summary_writer, saver=None):
    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)

    self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
    self.global_policy_net = policy_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
        os.makedirs(self.video_dir)
    except FileExistsError:
        pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
        self.policy_net = PolicyEstimator(policy_net.num_outputs)

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
        tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
        tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))
def testPredict(self):
    env = make_env()
    sp = StateProcessor()
    estimator = PolicyEstimator(len(VALID_ACTIONS))

    with self.test_session() as sess:
        sess.run(tf.initialize_all_variables())

        # Generate a state
        state = sp.process(env.reset())
        processed_state = atari_helpers.atari_make_initial_state(state)
        processed_states = np.array([processed_state])

        # Run feeds
        feed_dict = {
            estimator.states: processed_states,
            estimator.targets: [1.0],
            estimator.actions: [1]
        }
        loss = sess.run(estimator.loss, feed_dict)
        pred = sess.run(estimator.predictions, feed_dict)

        # Assertions
        self.assertTrue(loss != 0.0)
        self.assertEqual(pred["probs"].shape, (1, len(VALID_ACTIONS)))
        self.assertEqual(pred["logits"].shape, (1, len(VALID_ACTIONS)))
def testGradient(self):
    env = make_env()
    sp = StateProcessor()
    estimator = PolicyEstimator(len(VALID_ACTIONS))
    grads = [g for g, _ in estimator.grads_and_vars]

    with self.test_session() as sess:
        sess.run(tf.initialize_all_variables())

        # Generate a state
        state = sp.process(env.reset())
        processed_state = atari_helpers.atari_make_initial_state(state)
        processed_states = np.array([processed_state])

        # Run feeds to get gradients
        feed_dict = {
            estimator.states: processed_states,
            estimator.targets: [1.0],
            estimator.actions: [1]
        }
        grads_ = sess.run(grads, feed_dict)

        # Apply calculated gradients
        grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
        _ = sess.run(estimator.train_op, grad_feed_dict)
def __init__(self, env, policy_net, summary_writer, saver=None):
    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)

    self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
    self.global_policy_net = policy_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.checkpoint_path = os.path.abspath(
        os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
        os.makedirs(self.video_dir)
    except FileExistsError:
        pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
        self.policy_net = PolicyEstimator(policy_net.num_outputs)

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
        tf.contrib.slim.get_variables(
            scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
        tf.contrib.slim.get_variables(
            scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))
def setUp(self):
    super(WorkerTest, self).setUp()
    self.env = make_env()
    self.discount_factor = 0.99
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.global_counter = itertools.count()
    self.sp = StateProcessor()

    with tf.variable_scope("global") as vs:
        self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS))
        self.global_value_net = ValueEstimator(reuse=True)
class WorkerTest(tf.test.TestCase):

    def setUp(self):
        super(WorkerTest, self).setUp()
        self.env = make_env()
        self.discount_factor = 0.99
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.global_counter = itertools.count()
        self.sp = StateProcessor()

        with tf.variable_scope("global") as vs:
            print("length of the actions: {}".format(len(VALID_ACTIONS)))
            self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS))
            self.global_value_net = ValueEstimator(reuse=True)

    def testPolicyNetPredict(self):
        w = Worker(
            name="test",
            env=make_env(),
            policy_net=self.global_policy_net,
            value_net=self.global_value_net,
            global_counter=self.global_counter,
            discount_factor=self.discount_factor)

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())
            state = self.sp.process(self.env.reset())
            processed_state = atari_helpers.atari_make_initial_state(state)
            action_values = w._policy_net_predict(processed_state, sess)
            self.assertEqual(action_values.shape, (4,))

    def testRunNStepsAndUpdate(self):
        w = Worker(
            name="test",
            env=make_env(),
            policy_net=self.global_policy_net,
            value_net=self.global_value_net,
            global_counter=self.global_counter,
            discount_factor=self.discount_factor)

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())
            state = self.sp.process(self.env.reset())
            processed_state = atari_helpers.atari_make_initial_state(state)
            w.state = processed_state
            transitions, local_t, global_t = w.run_n_steps(10, sess)
            policy_net_loss, value_net_loss, policy_net_summaries, value_net_summaries = w.update(transitions, sess)

            self.assertEqual(len(transitions), 10)
            self.assertIsNotNone(policy_net_loss)
            self.assertIsNotNone(value_net_loss)
            self.assertIsNotNone(policy_net_summaries)
            self.assertIsNotNone(value_net_summaries)
def __init__(self, name, env, policy_net, value_net, global_counter,
             discount_factor=0.99, summary_writer=None, max_global_steps=None):
    self.name = name
    self.discount_factor = discount_factor
    self.max_global_steps = max_global_steps
    self.global_step = tf.contrib.framework.get_global_step()
    self.global_policy_net = policy_net
    self.global_value_net = value_net
    self.global_counter = global_counter
    self.local_counter = itertools.count()
    self.sp = StateProcessor()
    self.summary_writer = summary_writer
    self.env = env

    # Create local policy/value nets that are not updated asynchronously
    with tf.variable_scope(name):
        self.policy_net = PolicyEstimator(policy_net.num_outputs)
        self.value_net = ValueEstimator(reuse=True)

    # Op to copy params from the global policy/value nets
    self.copy_params_op = make_copy_params_op(
        tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
        tf.contrib.slim.get_variables(scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
    self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

    self.state = None
def __init__(self, env, global_net, summary_writer, saver=None):
    self.env = env
    self.global_net = global_net
    self.summary_writer = summary_writer
    self.saver = saver
    self.sp = StateProcessor()

    self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
    self.video_dir = os.path.abspath(self.video_dir)
    self.checkpoint_path = os.path.abspath(
        os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

    try:
        os.makedirs(self.video_dir)
    except FileExistsError:
        pass

    # Local policy net
    with tf.variable_scope("policy_eval"):
        self.local_net = PolicyValueEstimator()

    # Op to copy params from global policy/value net parameters
    self.copy_params_op = make_copy_params_op(
        tf.contrib.slim.get_variables(
            scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
        tf.contrib.slim.get_variables(
            scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))
class PolicyMonitor(object):
    """
    Helps evaluate a policy by running an episode in an environment,
    saving a video, and plotting summaries to Tensorboard.

    Args:
        env: environment to run in
        policy_net: A policy estimator
        summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries
    """
    def __init__(self, env, policy_net, summary_writer, saver=None):
        self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
        self.video_dir = os.path.abspath(self.video_dir)

        self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
        self.global_policy_net = policy_net
        self.summary_writer = summary_writer
        self.saver = saver
        self.sp = StateProcessor()

        self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

        try:
            os.makedirs(self.video_dir)
        except FileExistsError:
            pass

        # Local policy net
        with tf.variable_scope("policy_eval"):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)

        # Op to copy params from global policy/value net parameters
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def eval_once(self, sess):
        with sess.as_default(), sess.graph.as_default():
            # Copy params to local model
            global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op])

            # Run an episode
            done = False
            state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
            total_reward = 0.0
            episode_length = 0
            while not done:
                action_probs = self._policy_net_predict(state, sess)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = self.env.step(action)
                next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state))
                total_reward += reward
                episode_length += 1
                state = next_state

            # Add summaries
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
            episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
            self.summary_writer.add_summary(episode_summary, global_step)
            self.summary_writer.flush()

            if self.saver is not None:
                self.saver.save(sess, self.checkpoint_path)

            tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(
                global_step, total_reward, episode_length))

            # f_reward is assumed to be a log file handle opened elsewhere in the module
            f_reward.write(str(global_step) + " " + str(total_reward) + " " + str(episode_length) + "\n")

            return total_reward, episode_length

    def continuous_eval(self, eval_every, sess, coord):
        """
        Continuously evaluates the policy every [eval_every] seconds.
        """
        try:
            while not coord.should_stop():
                self.eval_once(sess)
                # Sleep until next evaluation cycle
                time.sleep(eval_every)
        except tf.errors.CancelledError:
            return
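# A minimal sketch of how the PolicyMonitor above might be used, assuming a training
# script has already created the shared "global" policy net (`policy_net`), a summary
# writer (`summary_writer`), a tf.Session (`sess`), and a tf.train.Coordinator
# (`coord`); `make_env()` is the environment factory used by the tests in this section.
# Illustrative only, not the project's actual training script.
import threading

monitor = PolicyMonitor(
    env=make_env(),
    policy_net=policy_net,
    summary_writer=summary_writer,
    saver=tf.train.Saver())

# Evaluate the current global policy every 60 seconds in a background thread
monitor_thread = threading.Thread(
    target=lambda: monitor.continuous_eval(60, sess, coord))
monitor_thread.start()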
class Worker(object):
    """
    An A3C worker thread. Runs episodes locally and updates global shared value and policy nets.

    Args:
        name: A unique name for this worker
        env: The Gym environment used by this worker
        policy_net: Instance of the globally shared policy net
        value_net: Instance of the globally shared value net
        global_counter: Iterator that holds the global step
        discount_factor: Reward discount factor
        summary_writer: A tf.train.SummaryWriter for Tensorboard summaries
        max_global_steps: If set, stop coordinator when global_counter > max_global_steps
    """
    def __init__(self, name, env, policy_net, value_net, global_counter,
                 discount_factor=0.99, summary_writer=None, max_global_steps=None):
        self.name = name
        self.discount_factor = discount_factor
        self.max_global_steps = max_global_steps
        self.global_step = tf.contrib.framework.get_global_step()
        self.global_policy_net = policy_net
        self.global_value_net = value_net
        self.global_counter = global_counter
        self.local_counter = itertools.count()
        self.sp = StateProcessor()
        self.summary_writer = summary_writer
        self.env = env

        # Create local policy/value nets that are not updated asynchronously
        with tf.variable_scope(name):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)
            self.value_net = ValueEstimator(reuse=True)

        # Op to copy params from the global policy/value nets
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(
                scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(
                scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES))

        self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
        self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

        self.state = None

    def run(self, sess, coord, t_max):
        with sess.as_default(), sess.graph.as_default():
            # Initial state
            self.state = atari_helpers.atari_make_initial_state(
                self.sp.process(self.env.reset()))
            try:
                while not coord.should_stop():
                    # Copy parameters from the global networks
                    sess.run(self.copy_params_op)

                    # Collect some experience
                    transitions, local_t, global_t = self.run_n_steps(t_max, sess)

                    if self.max_global_steps is not None and global_t >= self.max_global_steps:
                        tf.logging.info("Reached global step {}. Stopping.".format(global_t))
                        coord.request_stop()
                        return

                    # Update the global networks
                    self.update(transitions, sess)
            except tf.errors.CancelledError:
                return

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def _value_net_predict(self, state, sess):
        feed_dict = {self.value_net.states: [state]}
        preds = sess.run(self.value_net.predictions, feed_dict)
        return preds["logits"][0]

    def run_n_steps(self, n, sess):
        transitions = []
        for _ in range(n):
            # Take a step
            action_probs = self._policy_net_predict(self.state, sess)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = self.env.step(action)
            next_state = atari_helpers.atari_make_next_state(
                self.state, self.sp.process(next_state))

            # Store transition
            transitions.append(
                Transition(state=self.state, action=action, reward=reward,
                           next_state=next_state, done=done))

            # Increase local and global counters
            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            if local_t % 100 == 0:
                tf.logging.info("{}: local step {}, global step {}".format(
                    self.name, local_t, global_t))

            if done:
                self.state = atari_helpers.atari_make_initial_state(
                    self.sp.process(self.env.reset()))
                break
            else:
                self.state = next_state
        return transitions, local_t, global_t

    def update(self, transitions, sess):
        """
        Updates global policy and value networks based on collected experience

        Args:
            transitions: A list of experience transitions
            sess: A Tensorflow session
        """
        # If the episode was not done we bootstrap the value from the last state
        reward = 0.0
        if not transitions[-1].done:
            reward = self._value_net_predict(transitions[-1].next_state, sess)

        # Accumulate minibatch examples
        states = []
        policy_targets = []
        value_targets = []
        actions = []

        for transition in transitions[::-1]:
            reward = transition.reward + self.discount_factor * reward
            policy_target = (reward - self._value_net_predict(transition.state, sess))
            # Accumulate updates
            states.append(transition.state)
            actions.append(transition.action)
            policy_targets.append(policy_target)
            value_targets.append(reward)

        feed_dict = {
            self.policy_net.states: np.array(states),
            self.policy_net.targets: policy_targets,
            self.policy_net.actions: actions,
            self.value_net.states: np.array(states),
            self.value_net.targets: value_targets,
        }

        # Train the global estimators using local gradients
        global_step, pnet_loss, vnet_loss, _, _, pnet_summaries, vnet_summaries = sess.run([
            self.global_step,
            self.policy_net.loss,
            self.value_net.loss,
            self.pnet_train_op,
            self.vnet_train_op,
            self.policy_net.summaries,
            self.value_net.summaries
        ], feed_dict)

        # Write summaries
        if self.summary_writer is not None:
            self.summary_writer.add_summary(pnet_summaries, global_step)
            self.summary_writer.add_summary(vnet_summaries, global_step)
            self.summary_writer.flush()

        return pnet_loss, vnet_loss, pnet_summaries, vnet_summaries
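# A minimal sketch of how several Workers could be launched against the shared "global"
# networks, using the make_env() factory from the tests in this section; NUM_WORKERS and
# the t_max value are assumptions. This illustrates the threading pattern, not the
# project's actual training script.
import itertools
import threading

NUM_WORKERS = 4  # assumed number of worker threads

with tf.device("/cpu:0"):
    # Global step variable and globally shared networks
    global_step = tf.Variable(0, name="global_step", trainable=False)
    with tf.variable_scope("global"):
        policy_net = PolicyEstimator(len(VALID_ACTIONS))
        value_net = ValueEstimator(reuse=True)

    global_counter = itertools.count()
    workers = []
    for worker_id in range(NUM_WORKERS):
        workers.append(Worker(
            name="worker_{}".format(worker_id),
            env=make_env(),
            policy_net=policy_net,
            value_net=value_net,
            global_counter=global_counter,
            discount_factor=0.99))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()

    # Each worker runs its own episode loop; all of them update the shared parameters
    worker_threads = []
    for worker in workers:
        t = threading.Thread(target=lambda w=worker: w.run(sess, coord, t_max=5))
        t.start()
        worker_threads.append(t)

    coord.join(worker_threads)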
class PolicyMonitor(object):
    """
    Helps evaluate a policy by running an episode in an environment,
    saving a video, and plotting summaries to Tensorboard.

    Args:
        env: environment to run in
        policy_net: A policy estimator
        summary_writer: a tf.train.SummaryWriter used to write Tensorboard summaries
    """
    def __init__(self, env, policy_net, summary_writer, saver=None):
        self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
        self.video_dir = os.path.abspath(self.video_dir)

        self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
        self.global_policy_net = policy_net
        self.summary_writer = summary_writer
        self.saver = saver
        self.sp = StateProcessor()

        self.checkpoint_path = os.path.abspath(os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

        try:
            os.makedirs(self.video_dir)
        except FileExistsError:
            pass

        # Local policy net
        with tf.variable_scope("policy_eval"):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)

        # Op to copy params from global policy/value net parameters
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def eval_once(self, sess):
        with sess.as_default(), sess.graph.as_default():
            # Copy params to local model
            global_step, _ = sess.run([tf.contrib.framework.get_global_step(), self.copy_params_op])

            # Run an episode
            done = False
            state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
            total_reward = 0.0
            episode_length = 0
            while not done:
                action_probs = self._policy_net_predict(state, sess)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = self.env.step(action)
                next_state = atari_helpers.atari_make_next_state(state, self.sp.process(next_state))
                total_reward += reward
                episode_length += 1
                state = next_state

            # Add summaries
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
            episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
            self.summary_writer.add_summary(episode_summary, global_step)
            self.summary_writer.flush()

            if self.saver is not None:
                self.saver.save(sess, self.checkpoint_path)

            tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(
                global_step, total_reward, episode_length))

            return total_reward, episode_length

    def continuous_eval(self, eval_every, sess, coord):
        """
        Continuously evaluates the policy every [eval_every] seconds.
        """
        try:
            while not coord.should_stop():
                self.eval_once(sess)
                # Sleep until next evaluation cycle
                time.sleep(eval_every)
        except tf.errors.CancelledError:
            return
class Worker(object):
    """
    An A3C worker thread. Runs episodes locally and updates global shared value and policy nets.

    Args:
        name: A unique name for this worker
        env: The Gym environment used by this worker
        policy_net: Instance of the globally shared policy net
        value_net: Instance of the globally shared value net
        global_counter: Iterator that holds the global step
        discount_factor: Reward discount factor
        summary_writer: A tf.train.SummaryWriter for Tensorboard summaries
        max_global_steps: If set, stop coordinator when global_counter > max_global_steps
    """
    def __init__(self, name, env, policy_net, value_net, global_counter,
                 discount_factor=0.99, summary_writer=None, max_global_steps=None):
        self.name = name
        self.discount_factor = discount_factor
        self.max_global_steps = max_global_steps
        self.global_step = tf.contrib.framework.get_global_step()
        self.global_policy_net = policy_net
        self.global_value_net = value_net
        self.global_counter = global_counter
        self.local_counter = itertools.count()
        self.sp = StateProcessor()
        self.summary_writer = summary_writer
        self.env = env

        # Create local policy/value nets that are not updated asynchronously
        with tf.variable_scope(name):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)
            self.value_net = ValueEstimator(reuse=True)

        # Op to copy params from the global policy/value nets
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(scope=self.name, collection=tf.GraphKeys.TRAINABLE_VARIABLES))

        self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
        self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

        self.state = None

    def run(self, sess, coord, t_max):
        with sess.as_default(), sess.graph.as_default():
            # Initial state
            self.state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
            try:
                while not coord.should_stop():
                    # Copy parameters from the global networks
                    sess.run(self.copy_params_op)

                    # Collect some experience
                    transitions, local_t, global_t = self.run_n_steps(t_max, sess)

                    if self.max_global_steps is not None and global_t >= self.max_global_steps:
                        tf.logging.info("Reached global step {}. Stopping.".format(global_t))
                        coord.request_stop()
                        return

                    # Update the global networks
                    self.update(transitions, sess)
            except tf.errors.CancelledError:
                return

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def _value_net_predict(self, state, sess):
        feed_dict = {self.value_net.states: [state]}
        preds = sess.run(self.value_net.predictions, feed_dict)
        return preds["logits"][0]

    def run_n_steps(self, n, sess):
        transitions = []
        for _ in range(n):
            # Take a step
            action_probs = self._policy_net_predict(self.state, sess)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = self.env.step(action)
            next_state = atari_helpers.atari_make_next_state(self.state, self.sp.process(next_state))

            # Store transition
            transitions.append(Transition(
                state=self.state, action=action, reward=reward, next_state=next_state, done=done))

            # Increase local and global counters
            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            if local_t % 100 == 0:
                tf.logging.info("{}: local step {}, global step {}".format(self.name, local_t, global_t))

            if done:
                self.state = atari_helpers.atari_make_initial_state(self.sp.process(self.env.reset()))
                break
            else:
                self.state = next_state
        return transitions, local_t, global_t

    def update(self, transitions, sess):
        """
        Updates global policy and value networks based on collected experience

        Args:
            transitions: A list of experience transitions
            sess: A Tensorflow session
        """
        # If the episode was not done we bootstrap the value from the last state
        reward = 0.0
        if not transitions[-1].done:
            reward = self._value_net_predict(transitions[-1].next_state, sess)

        # Accumulate minibatch examples
        states = []
        policy_targets = []
        value_targets = []
        actions = []

        for transition in transitions[::-1]:
            reward = transition.reward + self.discount_factor * reward
            policy_target = (reward - self._value_net_predict(transition.state, sess))
            # Accumulate updates
            states.append(transition.state)
            actions.append(transition.action)
            policy_targets.append(policy_target)
            value_targets.append(reward)

        feed_dict = {
            self.policy_net.states: np.array(states),
            self.policy_net.targets: policy_targets,
            self.policy_net.actions: actions,
            self.value_net.states: np.array(states),
            self.value_net.targets: value_targets,
        }

        # Train the global estimators using local gradients
        global_step, pnet_loss, vnet_loss, _, _, pnet_summaries, vnet_summaries = sess.run([
            self.global_step,
            self.policy_net.loss,
            self.value_net.loss,
            self.pnet_train_op,
            self.vnet_train_op,
            self.policy_net.summaries,
            self.value_net.summaries
        ], feed_dict)

        # Write summaries
        if self.summary_writer is not None:
            self.summary_writer.add_summary(pnet_summaries, global_step)
            self.summary_writer.add_summary(vnet_summaries, global_step)
            self.summary_writer.flush()

        return pnet_loss, vnet_loss, pnet_summaries, vnet_summaries
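# The reversed loop in update() computes n-step discounted returns and the advantages
# used as policy targets. A small pure-Python sketch of that calculation, with a
# hypothetical bootstrap_value standing in for the value-net prediction of the state
# after the last step; illustrative only.
def n_step_targets(rewards, values, bootstrap_value, discount_factor=0.99):
    """Given per-step rewards and value estimates V(s_t) (oldest first), plus a
    bootstrap value for the state after the last step, return the value targets
    R_t = r_t + gamma * R_{t+1} and the policy targets (advantages) R_t - V(s_t)."""
    value_targets = []
    policy_targets = []
    ret = bootstrap_value
    for r, v in zip(reversed(rewards), reversed(values)):
        ret = r + discount_factor * ret
        value_targets.append(ret)
        policy_targets.append(ret - v)
    # Reverse back to oldest-first order
    return value_targets[::-1], policy_targets[::-1]

# Example: three steps with reward 1.0 each, V(s_t) = 0.5, bootstrap value 2.0
value_targets, policy_targets = n_step_targets([1.0, 1.0, 1.0], [0.5, 0.5, 0.5], 2.0)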
class Worker(object):
    def __init__(self, name, env, policy_net, value_net, global_counter,
                 discount_factor=0.99, summary_writer=None, max_global_steps=None):
        self.name = name
        self.discount_factor = discount_factor
        self.max_global_steps = max_global_steps
        self.global_step = tf.contrib.framework.get_global_step()
        self.global_policy_net = policy_net
        self.global_value_net = value_net
        self.global_counter = global_counter
        self.local_counter = itertools.count()
        self.sp = StateProcessor()
        self.summary_writer = summary_writer
        self.env = env

        # Create local policy/value nets that are not updated asynchronously
        with tf.variable_scope(name):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)
            self.value_net = ValueEstimator(reuse=True)

        # Op to copy params from the global policy/value nets
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(
                scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(
                scope=self.name + '/', collection=tf.GraphKeys.TRAINABLE_VARIABLES))

        # Ops to apply local gradients to the global networks
        self.vnet_train_op = make_train_op(self.value_net, self.global_value_net)
        self.pnet_train_op = make_train_op(self.policy_net, self.global_policy_net)

        self.state = None

    def run(self, sess, coord, t_max):
        with sess.as_default(), sess.graph.as_default():
            self.state = atari_helpers.atari_make_initial_state(
                self.sp.process(self.env.reset()))
            try:
                while not coord.should_stop():
                    sess.run(self.copy_params_op)
                    transitions, local_t, global_t = self.run_n_steps(t_max, sess)
                    if self.max_global_steps is not None and global_t >= self.max_global_steps:
                        tf.logging.info("Reached global step {}. Stopping.".format(global_t))
                        coord.request_stop()
                        return
                    self.update(transitions, sess)
            except tf.errors.CancelledError:
                return

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def _value_net_predict(self, state, sess):
        feed_dict = {self.value_net.states: [state]}
        preds = sess.run(self.value_net.predictions, feed_dict)
        return preds["logits"][0]

    def run_n_steps(self, n, sess):
        transitions = []
        for _ in range(n):
            action_probs = self._policy_net_predict(self.state, sess)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = self.env.step(action)
            next_state = atari_helpers.atari_make_next_state(
                self.state, self.sp.process(next_state))

            transitions.append(Transition(
                state=self.state, action=action, reward=reward,
                next_state=next_state, done=done))

            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            if local_t % 100 == 0:
                tf.logging.info("{}: local step {}, global step {}".format(
                    self.name, local_t, global_t))

            if done:
                self.state = atari_helpers.atari_make_initial_state(
                    self.sp.process(self.env.reset()))
                break
            else:
                self.state = next_state
        return transitions, local_t, global_t
class PolicyMonitor(object):
    def __init__(self, env, policy_net, summary_writer, saver=None):
        self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
        self.video_dir = os.path.abspath(self.video_dir)

        self.env = Monitor(env, directory=self.video_dir, video_callable=lambda x: True, resume=True)
        self.global_policy_net = policy_net
        self.summary_writer = summary_writer
        self.saver = saver
        self.sp = StateProcessor()

        self.checkpoint_path = os.path.abspath(
            os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

        try:
            os.makedirs(self.video_dir)
        except FileExistsError:
            pass

        # Local policy net
        with tf.variable_scope("policy_eval"):
            self.policy_net = PolicyEstimator(policy_net.num_outputs)

        # Op to copy params from the global policy/value net parameters
        self.copy_params_op = make_copy_params_op(
            tf.contrib.slim.get_variables(
                scope="global", collection=tf.GraphKeys.TRAINABLE_VARIABLES),
            tf.contrib.slim.get_variables(
                scope="policy_eval", collection=tf.GraphKeys.TRAINABLE_VARIABLES))

    def _policy_net_predict(self, state, sess):
        feed_dict = {self.policy_net.states: [state]}
        preds = sess.run(self.policy_net.predictions, feed_dict)
        return preds["probs"][0]

    def eval_once(self, sess):
        with sess.as_default(), sess.graph.as_default():
            # Copy params to the local model
            global_step, _ = sess.run(
                [tf.contrib.framework.get_global_step(), self.copy_params_op])

            # Run an episode
            done = False
            state = atari_helpers.atari_make_initial_state(
                self.sp.process(self.env.reset()))
            total_reward = 0.0
            episode_length = 0
            while not done:
                action_probs = self._policy_net_predict(state, sess)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = self.env.step(action)
                next_state = atari_helpers.atari_make_next_state(
                    state, self.sp.process(next_state))
                total_reward += reward
                episode_length += 1
                state = next_state

            # Add summaries
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
            episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
            self.summary_writer.add_summary(episode_summary, global_step)
            self.summary_writer.flush()

            if self.saver is not None:
                self.saver.save(sess, self.checkpoint_path)

            tf.logging.info(
                "Eval results at step {}: total_reward {}, episode_length {}".format(
                    global_step, total_reward, episode_length))

            return total_reward, episode_length

    def continuous_eval(self, eval_every, sess, coord):
        """Continuously evaluates the policy every [eval_every] seconds."""
        try:
            while not coord.should_stop():
                self.eval_once(sess)
                # Sleep until next evaluation cycle
                time.sleep(eval_every)
        except tf.errors.CancelledError:
            return
class WorkerTest(tf.test.TestCase):

    def setUp(self):
        super(WorkerTest, self).setUp()
        self.env = make_env()
        self.discount_factor = 0.99
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.global_counter = itertools.count()
        self.sp = StateProcessor()

        with tf.variable_scope("global") as vs:
            self.global_policy_net = PolicyEstimator(len(VALID_ACTIONS))
            self.global_value_net = ValueEstimator(reuse=True)

    def testPolicyNetPredict(self):
        w = Worker(
            name="test",
            env=make_env(),
            policy_net=self.global_policy_net,
            value_net=self.global_value_net,
            global_counter=self.global_counter,
            discount_factor=self.discount_factor)

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())
            state = self.sp.process(self.env.reset())
            processed_state = atari_helpers.atari_make_initial_state(state)
            action_values = w._policy_net_predict(processed_state, sess)
            self.assertEqual(action_values.shape, (4,))

    def testValueNetPredict(self):
        w = Worker(
            name="test",
            env=make_env(),
            policy_net=self.global_policy_net,
            value_net=self.global_value_net,
            global_counter=self.global_counter,
            discount_factor=self.discount_factor)

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())
            state = self.sp.process(self.env.reset())
            processed_state = atari_helpers.atari_make_initial_state(state)
            state_value = w._value_net_predict(processed_state, sess)
            self.assertEqual(state_value.shape, ())

    def testRunNStepsAndUpdate(self):
        w = Worker(
            name="test",
            env=make_env(),
            policy_net=self.global_policy_net,
            value_net=self.global_value_net,
            global_counter=self.global_counter,
            discount_factor=self.discount_factor)

        with self.test_session() as sess:
            sess.run(tf.initialize_all_variables())
            state = self.sp.process(self.env.reset())
            processed_state = atari_helpers.atari_make_initial_state(state)
            w.state = processed_state
            transitions, local_t, global_t = w.run_n_steps(10, sess)
            policy_net_loss, value_net_loss, policy_net_summaries, value_net_summaries = w.update(transitions, sess)

            self.assertEqual(len(transitions), 10)
            self.assertIsNotNone(policy_net_loss)
            self.assertIsNotNone(value_net_loss)
            self.assertIsNotNone(policy_net_summaries)
            self.assertIsNotNone(value_net_summaries)
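# Standard entry point so the test cases above can be run directly with TensorFlow's
# test runner.
if __name__ == "__main__":
    tf.test.main()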