def main(args):
    logging.debug('Configuration: {}'.format(args))

    # Build the network/environment factories and hand everything to the PAAC learner.
    network_creator, env_creator = get_network_and_environment_creator(args)
    learner = PAACLearner(network_creator, env_creator, args, SolowRunner, SolowStateProcessor())
    setup_kill_signal_handler(learner)

    logging.info('Starting training')
    learner.train()
    logging.info('Finished training')
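# A minimal sketch (an assumption, not part of the original script) of how main() could be
# wired up as a command-line entry point. The '--debug' flag and the logging format are
# illustrative placeholders; the real script presumably builds a much richer `args` object
# for get_network_and_environment_creator().
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train a PAAC agent on the Solow environment')
    parser.add_argument('--debug', action='store_true', help='hypothetical flag: enable verbose logging')
    cli_args, _ = parser.parse_known_args()

    logging.basicConfig(level=logging.DEBUG if cli_args.debug else logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    main(cli_args)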
def testEvalOnce(self):
    pe = PolicyMonitor(
        env=make_env(),
        state_processor=SolowStateProcessor(),
        global_policy_net=self.global_policy_net,
        summary_writer=self.summary_writer,
        num_actions=self.num_actions,
        input_size=self.input_size,
        temporal_size=self.temporal_size)
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        total_reward, episode_length, rewards = pe.eval_once(sess)
        self.assertTrue(episode_length > 10)
def __init__(self, name, env, policy_net, value_net, shared_layer, global_counter,
             discount_factor=0.99, summary_writer=None, max_global_steps=None):
    # Delegate to the base worker with a fixed scale of 100. and the Solow state processor.
    super(SolowWorker, self).__init__(
        name, env, policy_net, value_net, shared_layer, global_counter,
        discount_factor, summary_writer, max_global_steps, 100., SolowStateProcessor())
def policy_monitor_worker_equal(self):
    global_counter = itertools.count()

    # Worker rollout on a seeded environment with a deterministic policy.
    worker_env = make_env()
    worker_env.seed(1692)
    worker = SolowWorker(
        'test_worker',
        env=worker_env,
        policy_net=self.global_policy_net,
        value_net=None,
        shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
        global_counter=global_counter,
    )

    env = make_env()
    pe = PolicyMonitor(
        env=env,
        state_processor=SolowStateProcessor(),
        global_policy_net=self.global_policy_net,
        summary_writer=self.summary_writer,
        num_actions=self.num_actions,
        input_size=self.input_size,
        temporal_size=self.temporal_size)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        worker.state = worker_env.reset()
        worker.history.append(worker.process_state(worker.state))
        sess.run(worker.copy_params_op)
        transitions = worker.run_n_steps(10, sess, stochastic=False)
        worker_rewards = [t.reward for t in transitions[0]]

        # Re-seed the monitor's environment identically and reuse the worker's policy,
        # so both rollouts should yield the same reward sequence.
        pe.env = make_env()
        pe.env.seed(1692)
        pe.policy_net = worker.policy_net
        total_reward, episode_length, rewards = pe.eval_once(sess)
        monitor_rewards = rewards[:10]

        np.testing.assert_almost_equal(monitor_rewards, worker_rewards, decimal=4)
def __init__(self, name, env, policy_net, value_net, shared_layer, global_counter,
             discount_factor=0.99, summary_writer=None, max_global_steps=None,
             scale=1., ub=0.99, lb=0.01, n_grid=51):
    super(GridSolowWorker, self).__init__(
        name, env, policy_net, value_net, shared_layer, global_counter,
        discount_factor, summary_writer, max_global_steps, scale, SolowStateProcessor())
    # Map each discrete action index to a point on an evenly spaced grid over [lb, ub].
    self.idx_to_grid = {idx: v for idx, v in enumerate(np.linspace(lb, ub, n_grid))}
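# A small standalone illustration (not part of the original class) of the idx_to_grid
# mapping built above. With the defaults lb=0.01, ub=0.99, n_grid=51, discrete action
# indices map onto an evenly spaced grid of controls (e.g. candidate savings rates,
# assuming that is what the Solow environment expects).
import numpy as np

idx_to_grid = dict(enumerate(np.linspace(0.01, 0.99, 51)))
assert abs(idx_to_grid[0] - 0.01) < 1e-9    # lowest grid point
assert abs(idx_to_grid[25] - 0.50) < 1e-9   # grid midpoint
assert abs(idx_to_grid[50] - 0.99) < 1e-9   # highest grid point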
def __init__(self, id, emulators, variables, queue, barrier):
    super().__init__(id, emulators, variables, queue, barrier)
    self.state_processor = SolowStateProcessor()
            policy_net=policy_net,
            value_net=value_net,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            global_counter=global_counter,
            discount_factor=0.99,
            summary_writer=worker_summary_writer,
            max_global_steps=FLAGS.max_global_steps)
        workers.append(worker)

    saver = tf.train.Saver(keep_checkpoint_every_n_hours=2.0, max_to_keep=10)

    # Used to periodically evaluate the policy net
    # and write episode rewards to TensorBoard
    pe = PolicyMonitor(
        env=make_eval_env(p, q),
        global_policy_net=policy_net,
        state_processor=SolowStateProcessor(),
        summary_writer=summary_writer,
        saver=saver,
        num_actions=NUM_ACTIONS,
        input_size=INPUT_SIZE,
        temporal_size=TEMPORAL_SIZE)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()

        # Load a previous checkpoint if it exists
        latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
        if latest_checkpoint:
            print("Loading model checkpoint: {}".format(latest_checkpoint))
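            # The original driver is truncated above. What follows is a hedged sketch of
            # the usual tf.train.Coordinator pattern such a setup leads into (restore the
            # checkpoint, run each worker in its own thread, evaluate periodically, join).
            # The worker.run(...) and pe.continuous_eval(...) signatures and the
            # FLAGS.t_max / FLAGS.eval_every flags are hypothetical, and `import threading`
            # is assumed at the top of the file.
            saver.restore(sess, latest_checkpoint)

        worker_threads = []
        for worker in workers:
            t = threading.Thread(target=lambda w=worker: w.run(sess, coord, FLAGS.t_max))
            t.start()
            worker_threads.append(t)

        # Periodic evaluation of the global policy in its own thread.
        monitor_thread = threading.Thread(
            target=lambda: pe.continuous_eval(FLAGS.eval_every, sess, coord))
        monitor_thread.start()

        # Wait for all workers to finish (or for the coordinator to request a stop).
        coord.join(worker_threads)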