Example #1
  def testDistributionRaisesNotImplementedError(self):
    mock_tf_py_policy = tf_py_policy.TFPyPolicy(
        self._get_mock_py_policy())
    observation = tf.ones([5], tf.float32)
    time_step = ts.restart(observation)
    with self.assertRaises(NotImplementedError):
      mock_tf_py_policy.distribution(time_step=time_step)
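
Note: Examples #1 and #7 call a helper named _get_mock_py_policy() that is not shown on this page. Below is a minimal sketch of such a helper, assuming it builds an autospec mock with simple specs in the same way as Example #2; the shapes, bounds, and the py_policy.PyPolicy base class are assumptions and may differ between TF-Agents versions.

  def _get_mock_py_policy(self):
    # Assumed helper: builds a mock python policy with simple array specs,
    # using the same imports as the surrounding examples (mock, numpy as np,
    # array_spec, py_policy, ts).
    mock_py_policy = mock.create_autospec(py_policy.PyPolicy)
    observation_spec = array_spec.ArraySpec((3,), np.float32)
    mock_py_policy.time_step_spec = ts.time_step_spec(observation_spec)
    mock_py_policy.action_spec = array_spec.BoundedArraySpec(
        (7,), np.int32, 1, 1)
    mock_py_policy.policy_state_spec = ()
    mock_py_policy.info_spec = ()
    return mock_py_policy
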
Example #2
  def testZeroState(self):
    policy_state_length = 5
    batch_size = 3
    mock_py_policy = mock.create_autospec(py_policy.Base)
    observation_spec = array_spec.ArraySpec((3,), np.float32)
    mock_py_policy.time_step_spec = ts.time_step_spec(observation_spec)
    mock_py_policy.action_spec = array_spec.BoundedArraySpec(
        (7,), np.int32, 1, 1)
    py_policy_state_spec = array_spec.BoundedArraySpec((policy_state_length,),
                                                       np.int32, 1, 1)
    # Make the mock policy and reset return value.
    mock_py_policy.policy_state_spec = py_policy_state_spec
    mock_py_policy.info_spec = ()

    expected_py_policy_state = np.zeros(
        [batch_size] + list(py_policy_state_spec.shape),
        py_policy_state_spec.dtype)
    mock_py_policy.get_initial_state.return_value = expected_py_policy_state

    tf_mock_py_policy = tf_py_policy.TFPyPolicy(mock_py_policy)
    initial_state = tf_mock_py_policy.get_initial_state(batch_size=batch_size)
    initial_state_ = self.evaluate(initial_state)

    self.assertEqual(1, mock_py_policy.get_initial_state.call_count)
    np.testing.assert_equal(initial_state_, expected_py_policy_state)
Example #3
    def testAction(self):
        py_observation_spec = array_spec.BoundedArraySpec((3, ), np.int32, 1,
                                                          1)
        py_time_step_spec = ts.time_step_spec(py_observation_spec)
        py_action_spec = array_spec.BoundedArraySpec((7, ), np.int32, 1, 1)
        py_policy_state_spec = array_spec.BoundedArraySpec((5, ), np.int32, 0,
                                                           1)
        py_policy_info_spec = array_spec.BoundedArraySpec((3, ), np.int32, 0,
                                                          1)

        mock_py_policy = mock.create_autospec(py_policy.PyPolicy)
        mock_py_policy.time_step_spec = py_time_step_spec
        mock_py_policy.action_spec = py_action_spec
        mock_py_policy.policy_state_spec = py_policy_state_spec
        mock_py_policy.info_spec = py_policy_info_spec

        expected_py_policy_state = np.ones(py_policy_state_spec.shape,
                                           py_policy_state_spec.dtype)
        expected_py_time_step = tf.nest.map_structure(
            lambda arr_spec: np.ones((1, ) + arr_spec.shape, arr_spec.dtype),
            py_time_step_spec)
        expected_py_action = np.ones((1, ) + py_action_spec.shape,
                                     py_action_spec.dtype)
        expected_new_py_policy_state = np.zeros(py_policy_state_spec.shape,
                                                py_policy_state_spec.dtype)
        expected_py_info = np.zeros(py_policy_info_spec.shape,
                                    py_policy_info_spec.dtype)

        mock_py_policy.action.return_value = policy_step.PolicyStep(
            nest_utils.unbatch_nested_array(expected_py_action),
            expected_new_py_policy_state, expected_py_info)

        tf_mock_py_policy = tf_py_policy.TFPyPolicy(mock_py_policy)
        time_step = tf.nest.map_structure(
            lambda arr_spec: tf.ones((1, ) + arr_spec.shape, arr_spec.dtype),
            py_time_step_spec)
        action_step = tf_mock_py_policy.action(
            time_step, tf.ones(py_policy_state_spec.shape, tf.int32))
        py_action_step = self.evaluate(action_step)

        self.assertEqual(1, mock_py_policy.action.call_count)
        np.testing.assert_equal(
            mock_py_policy.action.call_args[1]['time_step'],
            nest_utils.unbatch_nested_array(expected_py_time_step))
        np.testing.assert_equal(
            mock_py_policy.action.call_args[1]['policy_state'],
            expected_py_policy_state)
        np.testing.assert_equal(py_action_step.action, expected_py_action)
        np.testing.assert_equal(py_action_step.state,
                                expected_new_py_policy_state)
        np.testing.assert_equal(py_action_step.info, expected_py_info)
Example #4
  def testRandomPyPolicyGeneratesActionTensors(self):
    array_action_spec = array_spec.BoundedArraySpec((7,), np.int32, -10, 10)
    observation = tf.ones([3], tf.float32)
    time_step = ts.restart(observation)

    observation_spec = tensor_spec.TensorSpec.from_tensor(observation)
    time_step_spec = ts.time_step_spec(observation_spec)

    tf_py_random_policy = tf_py_policy.TFPyPolicy(
        random_py_policy.RandomPyPolicy(time_step_spec=time_step_spec,
                                        action_spec=array_action_spec))

    batched_time_step = nest_utils.batch_nested_tensors(time_step)
    action_step = tf_py_random_policy.action(time_step=batched_time_step)
    action, new_policy_state = self.evaluate(
        [action_step.action, action_step.state])

    self.assertEqual((1,) + array_action_spec.shape, action.shape)
    self.assertTrue(np.all(action >= array_action_spec.minimum))
    self.assertTrue(np.all(action <= array_action_spec.maximum))
    self.assertEqual(new_policy_state, ())
Example #5
  def testRandomPyPolicyGeneratesActionTensors(self):
    if tf.executing_eagerly():
      self.skipTest('b/123935604')

    py_action_spec = array_spec.BoundedArraySpec((7,), np.int32, -10, 10)

    observation = tf.ones([3], tf.float32)
    time_step = ts.restart(observation)
    observation_spec = tensor_spec.TensorSpec.from_tensor(observation)
    time_step_spec = ts.time_step_spec(observation_spec)

    tf_py_random_policy = tf_py_policy.TFPyPolicy(
        random_py_policy.RandomPyPolicy(time_step_spec=time_step_spec,
                                        action_spec=py_action_spec))

    action_step = tf_py_random_policy.action(time_step=time_step)
    py_action, py_new_policy_state = self.evaluate(
        [action_step.action, action_step.state])

    self.assertEqual(py_action.shape, py_action_spec.shape)
    self.assertTrue(np.all(py_action >= py_action_spec.minimum))
    self.assertTrue(np.all(py_action <= py_action_spec.maximum))
    self.assertEqual(py_new_policy_state, ())
Example #6
    def testPyPolicyIsBatchedTrue(self):
        action_dims = 5
        observation_dims = 3
        batch_size = 2
        array_action_spec = array_spec.BoundedArraySpec((action_dims, ),
                                                        np.int32, -10, 10)
        observation_spec = array_spec.ArraySpec((observation_dims, ),
                                                np.float32)
        array_time_step_spec = ts.time_step_spec(observation_spec)

        observation = tf.ones([batch_size, observation_dims], tf.float32)
        time_step = ts.restart(observation, batch_size=batch_size)

        tf_py_random_policy = tf_py_policy.TFPyPolicy(
            random_py_policy.RandomPyPolicy(
                time_step_spec=array_time_step_spec,
                action_spec=array_action_spec),
            py_policy_is_batched=True)

        action_step = tf_py_random_policy.action(time_step=time_step)
        action = self.evaluate(action_step.action)

        self.assertEqual(action.shape, (batch_size, action_dims))
Example #7
  def testVariables(self):
    mock_tf_py_policy = tf_py_policy.TFPyPolicy(
        self._get_mock_py_policy())
    np.testing.assert_equal(mock_tf_py_policy.variables(), [])
Example #8
def train():
    global VERBOSE
    environment = TradeEnvironment()
    # utils.validate_py_environment(environment, episodes=5)
    # Environments
    train_env = tf_py_environment.TFPyEnvironment(environment)
    eval_env = tf_py_environment.TFPyEnvironment(environment)

    num_iterations = 50
    fc_layer_params = (512, )  # ~ (17 + 1001) / 2
    input_fc_layer_params = (17, )
    output_fc_layer_params = (20, )
    lstm_size = (17, )
    initial_collect_steps = 20
    collect_steps_per_iteration = 1
    batch_size = 64
    replay_buffer_capacity = 10000

    gamma = 0.99  # check if 1 will work here
    target_update_tau = 0.05
    target_update_period = 5
    epsilon_greedy = 0.1
    reward_scale_factor = 1.0
    learning_rate = 1e-2
    log_interval = 30
    num_eval_episodes = 5
    eval_interval = 15

    # q_net = q_network.QNetwork(
    #     train_env.observation_spec(),
    #     train_env.action_spec(),
    #     fc_layer_params=fc_layer_params,
    # )

    q_net = q_rnn_network.QRnnNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        input_fc_layer_params=input_fc_layer_params,
        lstm_size=lstm_size,
        output_fc_layer_params=output_fc_layer_params,
    )

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    train_step_counter = tf.compat.v2.Variable(0)

    tf_agent = dqn_agent.DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        epsilon_greedy=epsilon_greedy,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
        train_step_counter=train_step_counter,
        gradient_clipping=None,
        debug_summaries=False,
        summarize_grads_and_vars=False,
    )

    q_policy = FilteredQPolicy(
        tf_agent._time_step_spec,
        tf_agent._action_spec,
        q_network=tf_agent._q_network,
    )

    # Valid policy to pre-fill replay buffer
    dummy_policy = DummyTradePolicy(
        train_env.time_step_spec(),
        train_env.action_spec(),
    )

    # Main agent's policy; greedy one
    policy = greedy_policy.GreedyPolicy(q_policy)
    filtered_random_py_policy = FilteredRandomPyPolicy(
        time_step_spec=policy.time_step_spec,
        action_spec=policy.action_spec,
    )
    filtered_random_tf_policy = tf_py_policy.TFPyPolicy(
        filtered_random_py_policy)
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        q_policy, epsilon=tf_agent._epsilon_greedy)
    # Patch random policy for epsilon greedy collect policy

    filtered_random_tf_policy = FilteredRandomTFPolicy(
        time_step_spec=policy.time_step_spec,
        action_spec=policy.action_spec,
    )
    collect_policy._random_policy = filtered_random_tf_policy

    tf_agent._policy = policy
    tf_agent._collect_policy = collect_policy
    tf_agent.initialize()

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=replay_buffer_capacity,
    )
    print(
        'Pre-filling replay buffer in {} steps'.format(initial_collect_steps))
    for _ in range(initial_collect_steps):
        traj = collect_step(train_env, dummy_policy)
        replay_buffer.add_batch(traj)

    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=2,
    ).prefetch(3)

    iterator = iter(dataset)
    # Train
    tf_agent.train = common.function(tf_agent.train)

    tf_agent.train_step_counter.assign(0)

    avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                    num_eval_episodes)

    returns = [avg_return]

    print('Starting iterations...')
    for i in range(num_iterations):

        # fill replay buffer
        for _ in range(collect_steps_per_iteration):
            traj = collect_step(train_env, tf_agent.collect_policy)
            # Add trajectory to the replay buffer
            replay_buffer.add_batch(traj)

        experience, _ = next(iterator)
        train_loss = tf_agent.train(experience)

        step = tf_agent.train_step_counter.numpy()

        if step % log_interval == 0:
            print('step = {0}: loss = {1}'.format(step, train_loss.loss))

        if step % eval_interval == 0:
            avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                            num_eval_episodes)
            print('step = {0}: avg return = {1}'.format(step, avg_return))
            returns.append(avg_return)

    print('Finished {} iterations!'.format(num_iterations))

    print('Playing with resulting policy')
    VERBOSE = True
    r = compute_avg_return(eval_env, tf_agent.policy, 1)
    print('Result: {}'.format(r))
    steps = range(0, num_iterations + 1, eval_interval)

    # merged = tf.summary.merge_all()
    # writer = tf.summary.FileWriter(FLAGS.log_dir)
    #
    # writer.close()
    print('Check out chart for learning')
    plt.plot(steps, returns)
    plt.ylabel('Average Return')
    plt.xlabel('Step')
    plt.ylim(top=1000)
    plt.show()
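
Note: the train() snippet above relies on collect_step() and compute_avg_return(), which are defined elsewhere in the original repository. Below is a minimal sketch of what they typically look like, modeled on the standard TF-Agents DQN tutorial helpers; the exact bodies in the original code may differ, and a stateful RNN collect policy would also need its policy_state threaded through the loop.

from tf_agents.trajectories import trajectory


def collect_step(environment, policy):
    # Take one step in the environment with the given policy and pack the
    # transition into a Trajectory suitable for replay_buffer.add_batch().
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    return trajectory.from_transition(time_step, action_step, next_time_step)


def compute_avg_return(environment, policy, num_episodes=10):
    # Average undiscounted episode return of the policy over num_episodes.
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    return (total_return / num_episodes).numpy()[0]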