def testDistributionRaisesNotImplementedError(self):
  mock_tf_py_policy = tf_py_policy.TFPyPolicy(self._get_mock_py_policy())
  observation = tf.ones([5], tf.float32)
  time_step = ts.restart(observation)
  with self.assertRaises(NotImplementedError):
    mock_tf_py_policy.distribution(time_step=time_step)
def testZeroState(self):
  policy_state_length = 5
  batch_size = 3
  mock_py_policy = mock.create_autospec(py_policy.PyPolicy)
  observation_spec = array_spec.ArraySpec((3,), np.float32)
  mock_py_policy.time_step_spec = ts.time_step_spec(observation_spec)
  mock_py_policy.action_spec = array_spec.BoundedArraySpec(
      (7,), np.int32, 1, 1)
  py_policy_state_spec = array_spec.BoundedArraySpec(
      (policy_state_length,), np.int32, 1, 1)
  mock_py_policy.policy_state_spec = py_policy_state_spec
  mock_py_policy.info_spec = ()

  # Set up the mock policy's get_initial_state return value.
  expected_py_policy_state = np.zeros(
      [batch_size] + list(py_policy_state_spec.shape),
      py_policy_state_spec.dtype)
  mock_py_policy.get_initial_state.return_value = expected_py_policy_state

  tf_mock_py_policy = tf_py_policy.TFPyPolicy(mock_py_policy)
  initial_state = tf_mock_py_policy.get_initial_state(batch_size=batch_size)
  initial_state_ = self.evaluate(initial_state)

  self.assertEqual(1, mock_py_policy.get_initial_state.call_count)
  np.testing.assert_equal(initial_state_, expected_py_policy_state)
def testAction(self):
  py_observation_spec = array_spec.BoundedArraySpec((3,), np.int32, 1, 1)
  py_time_step_spec = ts.time_step_spec(py_observation_spec)
  py_action_spec = array_spec.BoundedArraySpec((7,), np.int32, 1, 1)
  py_policy_state_spec = array_spec.BoundedArraySpec((5,), np.int32, 0, 1)
  py_policy_info_spec = array_spec.BoundedArraySpec((3,), np.int32, 0, 1)

  mock_py_policy = mock.create_autospec(py_policy.PyPolicy)
  mock_py_policy.time_step_spec = py_time_step_spec
  mock_py_policy.action_spec = py_action_spec
  mock_py_policy.policy_state_spec = py_policy_state_spec
  mock_py_policy.info_spec = py_policy_info_spec

  expected_py_policy_state = np.ones(
      py_policy_state_spec.shape, py_policy_state_spec.dtype)
  expected_py_time_step = tf.nest.map_structure(
      lambda arr_spec: np.ones((1,) + arr_spec.shape, arr_spec.dtype),
      py_time_step_spec)
  expected_py_action = np.ones(
      (1,) + py_action_spec.shape, py_action_spec.dtype)
  expected_new_py_policy_state = np.zeros(
      py_policy_state_spec.shape, py_policy_state_spec.dtype)
  expected_py_info = np.zeros(
      py_policy_info_spec.shape, py_policy_info_spec.dtype)

  mock_py_policy.action.return_value = policy_step.PolicyStep(
      nest_utils.unbatch_nested_array(expected_py_action),
      expected_new_py_policy_state, expected_py_info)

  tf_mock_py_policy = tf_py_policy.TFPyPolicy(mock_py_policy)
  time_step = tf.nest.map_structure(
      lambda arr_spec: tf.ones((1,) + arr_spec.shape, arr_spec.dtype),
      py_time_step_spec)
  action_step = tf_mock_py_policy.action(
      time_step, tf.ones(py_policy_state_spec.shape, tf.int32))
  py_action_step = self.evaluate(action_step)

  self.assertEqual(1, mock_py_policy.action.call_count)
  np.testing.assert_equal(
      mock_py_policy.action.call_args[1]['time_step'],
      nest_utils.unbatch_nested_array(expected_py_time_step))
  np.testing.assert_equal(
      mock_py_policy.action.call_args[1]['policy_state'],
      expected_py_policy_state)
  np.testing.assert_equal(py_action_step.action, expected_py_action)
  np.testing.assert_equal(py_action_step.state, expected_new_py_policy_state)
  np.testing.assert_equal(py_action_step.info, expected_py_info)
def testRandomPyPolicyGeneratesBatchedActionTensors(self):
  array_action_spec = array_spec.BoundedArraySpec((7,), np.int32, -10, 10)
  observation = tf.ones([3], tf.float32)
  time_step = ts.restart(observation)
  observation_spec = tensor_spec.TensorSpec.from_tensor(observation)
  time_step_spec = ts.time_step_spec(observation_spec)

  tf_py_random_policy = tf_py_policy.TFPyPolicy(
      random_py_policy.RandomPyPolicy(
          time_step_spec=time_step_spec, action_spec=array_action_spec))

  batched_time_step = nest_utils.batch_nested_tensors(time_step)
  action_step = tf_py_random_policy.action(time_step=batched_time_step)
  action, new_policy_state = self.evaluate(
      [action_step.action, action_step.state])

  self.assertEqual((1,) + array_action_spec.shape, action.shape)
  self.assertTrue(np.all(action >= array_action_spec.minimum))
  self.assertTrue(np.all(action <= array_action_spec.maximum))
  self.assertEqual(new_policy_state, ())
def testRandomPyPolicyGeneratesActionTensors(self):
  if tf.executing_eagerly():
    self.skipTest('b/123935604')

  py_action_spec = array_spec.BoundedArraySpec((7,), np.int32, -10, 10)
  observation = tf.ones([3], tf.float32)
  time_step = ts.restart(observation)
  observation_spec = tensor_spec.TensorSpec.from_tensor(observation)
  time_step_spec = ts.time_step_spec(observation_spec)

  tf_py_random_policy = tf_py_policy.TFPyPolicy(
      random_py_policy.RandomPyPolicy(
          time_step_spec=time_step_spec, action_spec=py_action_spec))

  action_step = tf_py_random_policy.action(time_step=time_step)
  py_action, py_new_policy_state = self.evaluate(
      [action_step.action, action_step.state])

  self.assertEqual(py_action.shape, py_action_spec.shape)
  self.assertTrue(np.all(py_action >= py_action_spec.minimum))
  self.assertTrue(np.all(py_action <= py_action_spec.maximum))
  self.assertEqual(py_new_policy_state, ())
def testPyPolicyIsBatchedTrue(self):
  action_dims = 5
  observation_dims = 3
  batch_size = 2
  array_action_spec = array_spec.BoundedArraySpec(
      (action_dims,), np.int32, -10, 10)
  observation_spec = array_spec.ArraySpec((observation_dims,), np.float32)
  array_time_step_spec = ts.time_step_spec(observation_spec)

  observation = tf.ones([batch_size, observation_dims], tf.float32)
  time_step = ts.restart(observation, batch_size=batch_size)

  tf_py_random_policy = tf_py_policy.TFPyPolicy(
      random_py_policy.RandomPyPolicy(
          time_step_spec=array_time_step_spec,
          action_spec=array_action_spec),
      py_policy_is_batched=True)

  action_step = tf_py_random_policy.action(time_step=time_step)
  action = self.evaluate(action_step.action)

  self.assertEqual(action.shape, (batch_size, action_dims))
def testVariables(self):
  mock_tf_py_policy = tf_py_policy.TFPyPolicy(self._get_mock_py_policy())
  np.testing.assert_equal(mock_tf_py_policy.variables(), [])
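# Several tests above call self._get_mock_py_policy(), which is not shown in
# this excerpt. A minimal sketch of such a helper, assuming tensor specs with
# no policy state or info (the exact spec shapes are an assumption):
def _get_mock_py_policy(self):
  mock_py_policy = mock.create_autospec(py_policy.PyPolicy)
  observation_spec = tensor_spec.TensorSpec([5], tf.float32)
  mock_py_policy.time_step_spec = ts.time_step_spec(observation_spec)
  mock_py_policy.action_spec = tensor_spec.BoundedTensorSpec(
      [3], tf.float32, -1.0, 1.0)
  mock_py_policy.policy_state_spec = ()
  mock_py_policy.info_spec = ()
  return mock_py_policy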
def train():
  global VERBOSE

  environment = TradeEnvironment()
  # utils.validate_py_environment(environment, episodes=5)

  # Environments
  train_env = tf_py_environment.TFPyEnvironment(environment)
  eval_env = tf_py_environment.TFPyEnvironment(environment)

  num_iterations = 50

  fc_layer_params = (512,)  # ~ (17 + 1001) / 2
  input_fc_layer_params = (17,)
  output_fc_layer_params = (20,)
  lstm_size = (17,)

  initial_collect_steps = 20
  collect_steps_per_iteration = 1
  batch_size = 64
  replay_buffer_capacity = 10000

  gamma = 0.99  # check if 1 will work here
  target_update_tau = 0.05
  target_update_period = 5
  epsilon_greedy = 0.1
  reward_scale_factor = 1.0
  learning_rate = 1e-2

  log_interval = 30
  num_eval_episodes = 5
  eval_interval = 15

  # q_net = q_network.QNetwork(
  #     train_env.observation_spec(),
  #     train_env.action_spec(),
  #     fc_layer_params=fc_layer_params,
  # )
  q_net = q_rnn_network.QRnnNetwork(
      train_env.observation_spec(),
      train_env.action_spec(),
      input_fc_layer_params=input_fc_layer_params,
      lstm_size=lstm_size,
      output_fc_layer_params=output_fc_layer_params,
  )

  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
  train_step_counter = tf.compat.v2.Variable(0)

  tf_agent = dqn_agent.DqnAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      q_network=q_net,
      optimizer=optimizer,
      epsilon_greedy=epsilon_greedy,
      target_update_tau=target_update_tau,
      target_update_period=target_update_period,
      gamma=gamma,
      reward_scale_factor=reward_scale_factor,
      td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
      train_step_counter=train_step_counter,
      gradient_clipping=None,
      debug_summaries=False,
      summarize_grads_and_vars=False,
  )

  q_policy = FilteredQPolicy(
      tf_agent._time_step_spec,
      tf_agent._action_spec,
      q_network=tf_agent._q_network,
  )

  # Valid policy to pre-fill the replay buffer
  dummy_policy = DummyTradePolicy(
      train_env.time_step_spec(),
      train_env.action_spec(),
  )

  # Main agent's policy; the greedy one
  policy = greedy_policy.GreedyPolicy(q_policy)

  filtered_random_py_policy = FilteredRandomPyPolicy(
      time_step_spec=policy.time_step_spec,
      action_spec=policy.action_spec,
  )
  filtered_random_tf_policy = tf_py_policy.TFPyPolicy(
      filtered_random_py_policy)

  collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
      q_policy, epsilon=tf_agent._epsilon_greedy)
  # Patch the random policy used by the epsilon-greedy collect policy
  filtered_random_tf_policy = FilteredRandomTFPolicy(
      time_step_spec=policy.time_step_spec,
      action_spec=policy.action_spec,
  )
  collect_policy._random_policy = filtered_random_tf_policy

  tf_agent._policy = policy
  tf_agent._collect_policy = collect_policy
  tf_agent.initialize()

  replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
      data_spec=tf_agent.collect_data_spec,
      batch_size=train_env.batch_size,
      max_length=replay_buffer_capacity,
  )

  print('Pre-filling replay buffer in {} steps'.format(initial_collect_steps))
  for _ in range(initial_collect_steps):
    traj = collect_step(train_env, dummy_policy)
    replay_buffer.add_batch(traj)

  dataset = replay_buffer.as_dataset(
      num_parallel_calls=3,
      sample_batch_size=batch_size,
      num_steps=2,
  ).prefetch(3)
  iterator = iter(dataset)

  # Train
  tf_agent.train = common.function(tf_agent.train)
  tf_agent.train_step_counter.assign(0)

  avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                  num_eval_episodes)
  returns = [avg_return]

  print('Starting iterations...')
  for i in range(num_iterations):
    # Fill the replay buffer
    for _ in range(collect_steps_per_iteration):
      traj = collect_step(train_env, tf_agent.collect_policy)
      # Add the trajectory to the replay buffer
      replay_buffer.add_batch(traj)
    experience, _ = next(iterator)
    train_loss = tf_agent.train(experience)

    step = tf_agent.train_step_counter.numpy()

    if step % log_interval == 0:
      print('step = {0}: loss = {1}'.format(step, train_loss.loss))

    if step % eval_interval == 0:
      avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                      num_eval_episodes)
      print('step = {0}: avg return = {1}'.format(step, avg_return))
      returns.append(avg_return)

  print('Finished {} iterations!'.format(num_iterations))

  print('Playing with resulting policy')
  VERBOSE = True
  r = compute_avg_return(eval_env, tf_agent.policy, 1)
  print('Result: {}'.format(r))

  steps = range(0, num_iterations + 1, eval_interval)
  # merged = tf.summary.merge_all()
  # writer = tf.summary.FileWriter(FLAGS.log_dir)
  # writer.close()
  print('Check out chart for learning')
  plt.plot(steps, returns)
  plt.ylabel('Average Return')
  plt.xlabel('Step')
  plt.ylim(top=1000)
  plt.show()
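# train() calls collect_step() and compute_avg_return(), which are not part of
# this excerpt. The sketches below are minimal versions modeled on the standard
# TF-Agents DQN tutorial helpers; the real implementations may differ (e.g. in
# how they thread RNN policy state through policy.action). The import would
# normally live at the top of the file.
from tf_agents.trajectories import trajectory


def collect_step(environment, policy):
  # Take one step in the environment with the given policy and return the
  # resulting (batched) trajectory for the replay buffer.
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  return trajectory.from_transition(time_step, action_step, next_time_step)


def compute_avg_return(environment, policy, num_episodes=10):
  # Average undiscounted return of `policy` over `num_episodes` episodes.
  total_return = 0.0
  for _ in range(num_episodes):
    time_step = environment.reset()
    episode_return = 0.0
    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return
  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]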