def test_step(self):
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))
    indexed_tf_env = IndexedTFEnv(tf_env, 5)

    # take first action
    a1 = PolicyStep(action=tf.convert_to_tensor([1]), state=(), info=())
    time_step_0 = indexed_tf_env.step(a1)
    self.assertEqual(time_step_0["env_id"], 5)
    self.assertEqual(time_step_0["ts_id"], 0)
    self.assertEqual(time_step_0["reward"], 0)
    self.assertEqual(time_step_0["step_type"], 0)
    self.assertEqual(time_step_0["discount"], 1.0)
    self.assertIn("ob_0", time_step_0)
    self.assertIn("ob_1", time_step_0)
    self.assertIn("ob_2", time_step_0)
    self.assertIn("ob_3", time_step_0)

    # take second action
    a2 = PolicyStep(action=tf.convert_to_tensor([0]), state=(), info=())
    time_step_1 = indexed_tf_env.step(a2)
    self.assertEqual(time_step_1["env_id"], 5)
    self.assertEqual(time_step_1["ts_id"], 1)
    self.assertEqual(time_step_1["reward"], 1)
    self.assertEqual(time_step_1["step_type"], 1)
    self.assertEqual(time_step_1["discount"], 1.0)
    self.assertIn("ob_0", time_step_1)
    self.assertIn("ob_1", time_step_1)
    self.assertIn("ob_2", time_step_1)
    self.assertIn("ob_3", time_step_1)
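# A minimal sketch, inferred from the assertions in test_step above, of what
# the IndexedTFEnv wrapper appears to do: tag each time step with the env id
# it was constructed with and a per-env step counter, and flatten the
# observation vector into ob_0..ob_N columns. The class name and internals
# below are assumptions for illustration, not the actual implementation.
import numpy as np


class IndexedTFEnvSketch:
    def __init__(self, tf_env, env_id):
        self._tf_env = tf_env
        self._env_id = env_id
        self._ts_id = 0  # increments once per step taken on this env

    def step(self, policy_step):
        ts = self._tf_env.step(policy_step.action)
        row = {
            "env_id": self._env_id,
            "ts_id": self._ts_id,
            "step_type": int(np.asarray(ts.step_type).item()),
            "reward": float(np.asarray(ts.reward).item()),
            "discount": float(np.asarray(ts.discount).item()),
        }
        # one flat column per observation dimension (4 for CartPole-v0)
        for i, ob in enumerate(np.asarray(ts.observation).flatten()):
            row["ob_%d" % i] = float(ob)
        self._ts_id += 1
        return row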
def make_batch_tfenv(make_env, config, start_dt, training_interval,
                     spark_session):
    """
    This returns a TFEnvRLApplication, which keeps 'config.env.num_envs' envs
    running in parallel.

    :param make_env: A function that returns an Environment
    :param config: An ApplicationConfig
    :param start_dt: A datetime used to simulate the first action
    :param training_interval: A timedelta indicating the lag between when an
        observation is generated and when it can be used for training. This
        simulates real-world environments where there is a delay between data
        collection and Agent updates.
    :param spark_session: A Spark session
    :return: A TFEnvRLApplication wrapping the indexed environments
    """
    envs = [IndexedTFEnv(make_env(), i) for i in range(config.env.num_envs)]

    # setup app
    training_config = {
        "num_iterations": config.training.num_iterations,
        "agent_discount": config.trajectory.agent_discount,
        "mini_batch_size": config.training.batch_size,
        "eps_start": config.policy.eps_start,
        "eps_final": config.policy.eps_final,
        "eps_steps": config.policy.eps_steps,
        "initial_collect_steps": config.policy.initial_collect_steps,
        "log_interval": config.project.log_interval
    }
    return TFEnvRLApplication(envs, spark_session, training_config,
                              config.env.num_steps_per_run, start_dt,
                              training_interval)
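# A hedged usage sketch for make_batch_tfenv. The nested config shape below is
# reverse-engineered from the attribute accesses above (config.env.*,
# config.training.*, config.trajectory.*, config.policy.*, config.project.*);
# the real ApplicationConfig may differ, and the SparkSession is assumed to be
# created by the caller.
from datetime import datetime, timedelta
from types import SimpleNamespace


def example_make_batch_tfenv(spark_session):
    config = SimpleNamespace(
        env=SimpleNamespace(num_envs=4, num_steps_per_run=50),
        training=SimpleNamespace(num_iterations=1000, batch_size=64),
        trajectory=SimpleNamespace(agent_discount=0.99),
        policy=SimpleNamespace(eps_start=1.0,
                               eps_final=0.1,
                               eps_steps=1000,
                               initial_collect_steps=500),
        project=SimpleNamespace(log_interval=100))
    make_env = lambda: tf_py_environment.TFPyEnvironment(
        suite_gym.load('CartPole-v0'))
    return make_batch_tfenv(make_env, config,
                            datetime(year=2019, month=8, day=7, hour=10),
                            timedelta(days=1), spark_session)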
def test_init_application(self, mock_dm):
    # init an RL env application
    envs = []
    for i in range(2):
        envs.append(
            IndexedTFEnv(
                tf_py_environment.TFPyEnvironment(
                    suite_gym.load('CartPole-v0')), i))
    training_config = {
        "fc_layer_params": (100, ),
        "learning_rate": 0.001,
        "agent_discount": 0.99,
        "mini_batch_size": 64,
        "num_training_iterations": 10000,
        "epsilon_greediness": 0.1,
        "gradient_clipping": 1.0
    }
    steps_num_per_run = 3
    app = TFEnvRLApplication(envs, training_config, steps_num_per_run,
                             datetime.now(), 2)
    self.assertListEqual(app.obs_cols, ['ob_0', 'ob_1', 'ob_2', 'ob_3'])
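# A small sketch of where obs_cols presumably comes from: one column name per
# dimension of the environment's observation. CartPole-v0 observations are
# 4-dimensional (cart position, cart velocity, pole angle, pole angular
# velocity), hence ob_0..ob_3. The derivation below is an assumption, not the
# verified TFEnvRLApplication internals.
obs_dim = 4
obs_cols = ["ob_%d" % i for i in range(obs_dim)]
assert obs_cols == ['ob_0', 'ob_1', 'ob_2', 'ob_3']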
def __init__(
        self,
        # Params for the Q network
        fc_layer_params=(100, ),
        # Params for training
        learning_rate=0.001,
        agent_discount=0.99,
        mini_batch_size=128,
        num_iterations=5000,
        gradient_clipping=None,
        trajectory_training_window=100,
        log_interval=200,
        # Params for simulated environments
        envs_num=10,
        runs_num=10,
        steps_num_per_run=100,
        # Params for evaluation
        eval_interval=1,
        num_eval_episodes=100,
        # Params for data collection
        eps_start=1.0,
        eps_final=0.1,
        eps_steps=10000,
        initial_collect_steps=3000,
        tb_path=None):
    # store configs used during the training run
    self._runs_num = runs_num
    self._eval_interval = eval_interval
    self._num_eval_episodes = num_eval_episodes

    # setup rl_app
    envs = [IndexedTFEnv(self._make_env(), i) for i in range(envs_num)]
    application_name = "CartPole-example"
    version = str(time.time())

    # setup training engine
    training_interval = timedelta(days=1)
    start_dt = datetime(year=2019, month=8, day=7, hour=10)
    self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                            trajectory_training_window,
                                            application_name, version)
    if tb_path:
        self._engine_config.tensorboard_path = tb_path

    # setup app
    training_config = {
        "num_iterations": num_iterations,
        "agent_discount": agent_discount,
        "mini_batch_size": mini_batch_size,
        "eps_start": eps_start,
        "eps_final": eps_final,
        "eps_steps": eps_steps,
        "initial_collect_steps": initial_collect_steps,
        "log_interval": log_interval
    }
    self._rl_app = TFEnvRLApplication(envs, training_config,
                                      steps_num_per_run, start_dt,
                                      training_interval)

    @staticmethod
    def init_agent():
        """A DQN agent is set by default in the application."""
        # get the global step
        global_step = tf.compat.v1.train.get_or_create_global_step()
        # TODO: update this to get the optimizer from tensorflow 2.0 if possible
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)
        q_net = q_network.QNetwork(self._rl_app.observation_spec,
                                   self._rl_app.action_spec,
                                   fc_layer_params=fc_layer_params)
        time_step_spec = ts.time_step_spec(self._rl_app.observation_spec)
        tf_agent = dqn_agent.DqnAgent(
            time_step_spec,
            self._rl_app.action_spec,
            q_network=q_net,
            optimizer=optimizer,
            epsilon_greedy=eps_final,
            gradient_clipping=gradient_clipping,
            td_errors_loss_fn=common.element_wise_squared_loss,
            train_step_counter=global_step,
            debug_summaries=True,
            summarize_grads_and_vars=True)
        tf_agent.initialize()
        logger.info("tf_agent initialization is complete")
        # Optimize by wrapping some of the code in a graph using TF function.
        tf_agent.train = common.function(tf_agent.train)
        return tf_agent

    # staticmethod.__get__ unwraps init_agent to the plain closure before
    # attaching it to the application as its agent factory
    self._rl_app.init_agent = init_agent.__get__(object)
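# A hedged sketch of the schedule the eps_start/eps_final/eps_steps knobs above
# suggest: epsilon annealed linearly from eps_start down to eps_final over
# eps_steps collect steps, then held constant. The helper name and the linear
# shape are assumptions about the application's behavior, shown here only to
# make the three knobs concrete.
def epsilon_at(step, eps_start=1.0, eps_final=0.1, eps_steps=10000):
    frac = min(float(step) / eps_steps, 1.0)
    return eps_start + frac * (eps_final - eps_start)


assert abs(epsilon_at(0) - 1.0) < 1e-9       # fully random at the start
assert abs(epsilon_at(5000) - 0.55) < 1e-9   # halfway through the anneal
assert abs(epsilon_at(20000) - 0.1) < 1e-9   # held at eps_final afterwards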
def __init__(
        self,
        # Params for training
        learning_rate=0.001,
        discount=0.99,
        mini_batch_size=64,
        num_iterations=2000,
        gradient_clipping=None,
        trajectory_training_window=1,
        log_interval=100,
        # Params for simulated environments
        envs_num=60,
        runs_num=10,
        steps_num_per_run=50,
        # Params for evaluation
        eval_interval=1,
        num_eval_episodes=100,
        # Params for data collection
        eps_start=1.0,
        eps_final=0.1,
        eps_steps=1000,
        initial_collect_steps=1000,
        tb_path=None):
    # store configs used during the training run
    self._runs_num = runs_num
    self._eval_interval = eval_interval
    self._num_eval_episodes = num_eval_episodes

    # setup rl_app
    envs = [IndexedTFEnv(self._make_env(), i) for i in range(envs_num)]
    application_name = "MountainCar-example"
    version = str(time.time())

    # setup training engine
    training_interval = timedelta(days=1)
    start_dt = datetime(year=2019, month=8, day=7, hour=10)
    self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                            trajectory_training_window,
                                            application_name, version)
    if tb_path:
        self._engine_config.tensorboard_path = tb_path

    # setup app
    training_config = {
        "num_iterations": num_iterations,
        "mini_batch_size": mini_batch_size,
        "eps_start": eps_start,
        "eps_final": eps_final,
        "eps_steps": eps_steps,
        "initial_collect_steps": initial_collect_steps,
        "log_interval": log_interval
    }
    self._rl_app = TFEnvRLApplication(envs, training_config,
                                      steps_num_per_run, start_dt,
                                      training_interval)

    @staticmethod
    def init_agent():
        """A DDPG agent is set by default in the application."""
        # get the global step
        global_step = tf.compat.v1.train.get_or_create_global_step()
        time_step_spec = time_step.time_step_spec(
            self._rl_app.observation_spec)
        actor_net = actor_network.ActorNetwork(
            self._rl_app.observation_spec,
            self._rl_app.action_spec,
            fc_layer_params=(400, 300))
        critic_net = critic_network.CriticNetwork(
            (time_step_spec.observation, self._rl_app.action_spec),
            observation_fc_layer_params=(400, ),
            action_fc_layer_params=None,
            joint_fc_layer_params=(300, ))
        # TODO: update this to get the optimizer from tensorflow 2.0 if possible
        tf_agent = ddpg_agent.DdpgAgent(
            time_step_spec,
            self._rl_app.action_spec,
            actor_network=actor_net,
            critic_network=critic_net,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=1e-4),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=1e-3),
            ou_stddev=0.2,
            ou_damping=0.15,
            target_update_tau=0.05,
            target_update_period=5,
            dqda_clipping=None,
            td_errors_loss_fn=tf.compat.v1.losses.huber_loss,
            gamma=discount,
            reward_scale_factor=1.0,
            gradient_clipping=gradient_clipping,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            train_step_counter=global_step)
        tf_agent.initialize()
        logger.info("tf_agent initialization is complete")
        # Optimize by wrapping some of the code in a graph using TF function.
        tf_agent.train = common.function(tf_agent.train)
        return tf_agent

    # staticmethod.__get__ unwraps init_agent to the plain closure before
    # attaching it to the application as its agent factory
    self._rl_app.init_agent = init_agent.__get__(object)
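# A hedged illustration of the exploration noise that the ou_stddev/ou_damping
# arguments above configure: DDPG adds Ornstein-Uhlenbeck noise to the actor's
# actions, giving temporally correlated (rather than i.i.d.) exploration. The
# recurrence below is one common discrete form; the helper is illustrative and
# not part of the application.
import numpy as np


def ou_step(x, damping=0.15, stddev=0.2, rng=np.random.default_rng()):
    # decay the previous noise toward zero, then add a Gaussian kick
    return (1.0 - damping) * x + rng.normal(scale=stddev, size=np.shape(x))


noise = np.zeros(1)
for _ in range(5):
    noise = ou_step(noise)  # successive samples are correlated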
def __init__(
        self,
        # Params for the Q network
        fc_layer_params=(100, ),
        # Params for training
        learning_rate=0.01,
        agent_discount=0.99,
        mini_batch_size=1,
        num_iterations=10000,
        gradient_clipping=None,
        trajectory_training_window=100,
        log_interval=200,
        # Params for simulated environments
        envs_num=10,
        runs_num=10,
        steps_num_per_run=1000,
        # Params for evaluation
        eval_interval=1,
        num_eval_episodes=100,
        # Params for data collection
        eps_start=1.0,
        eps_final=0.1,
        eps_steps=10000,
        initial_collect_steps=3000,
        tb_path=None):
    # store configs used during the training run
    self._runs_num = runs_num
    self._eval_interval = eval_interval
    self._num_eval_episodes = num_eval_episodes

    # setup rl_app
    envs = [IndexedTFEnv(self._make_env(), i) for i in range(envs_num)]
    application_name = "CartPole-example"
    version = str(time.time())

    # setup training engine
    training_interval = timedelta(days=1)
    start_dt = datetime(year=2019, month=8, day=7, hour=10)
    self._engine_config = TFEnvEngineConfig(start_dt, training_interval,
                                            trajectory_training_window,
                                            application_name, version)
    if tb_path:
        self._engine_config.tensorboard_path = tb_path

    # setup app
    training_config = {
        "n_step": 1,
        "num_iterations": num_iterations,
        # "agent_discount": agent_discount,
        "mini_batch_size": mini_batch_size,
        # "eps_start": eps_start,
        # "eps_final": eps_final,
        # "eps_steps": eps_steps,
        "initial_collect_steps": initial_collect_steps,
        "log_interval": log_interval
    }
    self._rl_app = TFEnvRLApplication(envs, training_config,
                                      steps_num_per_run, start_dt,
                                      training_interval)

    @staticmethod
    def init_agent():
        """A SignAgent is set by default in the application."""
        time_step_spec = ts.time_step_spec(self._rl_app.observation_spec)
        agent = SignAgent(self._rl_app.observation_spec,
                          self._rl_app.action_spec, time_step_spec)
        agent.initialize()
        logger.info("agent initialization is complete")
        # Optimize by wrapping some of the code in a graph using TF function.
        agent.train = common.function(agent.train)
        return agent

    # staticmethod.__get__ unwraps init_agent to the plain closure before
    # attaching it to the application as its agent factory
    self._rl_app.init_agent = init_agent.__get__(object)
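# A minimal demo of the init_agent.__get__(object) idiom used by all three
# constructors above: staticmethod is a descriptor, and calling __get__ on a
# staticmethod object returns the plain wrapped function. The assignment
# therefore attaches an ordinary zero-argument callable (a closure over
# __init__'s locals) to the rl_app.
def demo_staticmethod_unwrap():
    sm = staticmethod(lambda: "built")
    fn = sm.__get__(object)  # unwraps to the underlying function
    assert fn() == "built"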