Example #1
    def test_step(self):
        tf_env = tf_py_environment.TFPyEnvironment(
            suite_gym.load('CartPole-v0'))
        indexed_tf_env = IndexedTFEnv(tf_env, 5)
        # take first action
        a1 = PolicyStep(action=tf.convert_to_tensor([1]), state=(), info=())
        time_step_0 = indexed_tf_env.step(a1)
        self.assertEqual(time_step_0["env_id"], 5)
        self.assertEqual(time_step_0["ts_id"], 0)
        self.assertEqual(time_step_0["reward"], 0)
        self.assertEqual(time_step_0["step_type"], 0)
        self.assertEqual(time_step_0["discount"], 1.0)
        self.assertTrue("ob_0" in time_step_0)
        self.assertTrue("ob_1" in time_step_0)
        self.assertTrue("ob_2" in time_step_0)
        self.assertTrue("ob_3" in time_step_0)

        # take second action
        a2 = PolicyStep(action=tf.convert_to_tensor([0]), state=(), info=())
        time_step_1 = indexed_tf_env.step(a2)
        self.assertEqual(time_step_1["env_id"], 5)
        self.assertEqual(time_step_1["ts_id"], 1)
        self.assertEqual(time_step_1["reward"], 1)
        self.assertEqual(time_step_1["step_type"], 1)
        self.assertEqual(time_step_1["discount"], 1.0)
        self.assertTrue("ob_0" in time_step_1)
        self.assertTrue("ob_1" in time_step_1)
        self.assertTrue("ob_2" in time_step_1)
        self.assertTrue("ob_3" in time_step_1)
Example #2
    def _actor_train_step(self, exp: Experience, state: DdpgActorState):
        action, actor_state = self._actor_network(exp.observation,
                                                  exp.step_type,
                                                  network_state=state.actor)

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(action)
            q_value, critic_state = self._critic_network(
                (exp.observation, action), network_state=state.critic)

        dqda = tape.gradient(q_value, action)

        def actor_loss_fn(dqda, action):
            if self._dqda_clipping:
                dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                        self._dqda_clipping)
            loss = 0.5 * losses.element_wise_squared_loss(
                tf.stop_gradient(dqda + action), action)
            loss = tf.reduce_sum(loss, axis=list(range(1, len(loss.shape))))
            return loss

        actor_loss = tf.nest.map_structure(actor_loss_fn, dqda, action)
        state = DdpgActorState(actor=actor_state, critic=critic_state)
        info = LossInfo(loss=tf.add_n(tf.nest.flatten(actor_loss)),
                        extra=actor_loss)
        return PolicyStep(action=action, state=state, info=info)
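The surrogate loss in `actor_loss_fn` above, `0.5 * (stop_gradient(dqda + action) - action)^2`, is built so that its gradient with respect to `action` is exactly `-dqda`; minimizing it therefore moves the action along the critic's gradient, i.e. performs gradient ascent on the Q value (with optional clipping of `dqda`). A minimal standalone check of this identity in plain TensorFlow 2.x, independent of the ALF code above:

    import tensorflow as tf

    action = tf.Variable([0.3, -0.7])
    dqda = tf.constant([1.5, -2.0])          # stand-in for dQ/da from a critic
    with tf.GradientTape() as tape:
        # Same construction as actor_loss_fn above.
        loss = tf.reduce_sum(
            0.5 * tf.math.squared_difference(
                tf.stop_gradient(dqda + action), action))
    print(tape.gradient(loss, action).numpy())   # -> [-1.5  2. ], i.e. -dqda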
Example #3
 def action(self, time_step):
     observation = time_step.observation
     batch_size = observation.shape[0]
     action = tf.constant(self._action,
                          dtype=tf.float32,
                          shape=[batch_size, 1])
     return PolicyStep(action=action)
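Example #3 constructs `PolicyStep` with only an `action`. That works because `PolicyStep` is a namedtuple whose `state` and `info` fields default to the empty tuple (this mirrors `tf_agents.trajectories.policy_step.PolicyStep`); a minimal equivalent definition:

    import collections

    # Minimal sketch of the PolicyStep container used throughout these examples.
    PolicyStep = collections.namedtuple('PolicyStep', ('action', 'state', 'info'))
    PolicyStep.__new__.__defaults__ = ((), ())   # state and info default to ()

    step = PolicyStep(action=[1.0])
    assert step.state == () and step.info == ()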
Example #4
    def rollout(self, time_step: ActionTimeStep, state: AgentState):
        """Rollout for one step."""
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(time_step)
        if self._icm is not None:
            icm_step = self._icm.train_step(
                (observation, time_step.prev_action), state=state.icm)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.rollout(
            time_step._replace(observation=observation), state.rl)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        # TODO: avoid computing this during rollout when training off-policy
        if self._entropy_target_algorithm:
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.action, step_type=time_step.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Example #5
 def predict(self, time_step: ActionTimeStep, state=None):
     observation = self._encode(time_step)
     action_distribution, actor_state = self._actor_network(
         observation, step_type=time_step.step_type, network_state=state)
     return PolicyStep(action=action_distribution,
                       state=actor_state,
                       info=())
Example #6
    def rollout(self,
                time_step: ActionTimeStep,
                state,
                mode,
                epsilon_greedy=1.0):
        """Train one step.

        Args:
            time_step: time_step.observation should be the latent vector
            state: state of the model
        """
        latent_vector = time_step.observation
        rnn_output, rnn_state = self._rnn(latent_vector, state)
        mem_readout = self._memory.genkey_and_read(self._key_net, rnn_output)
        policy_input = tf.concat(
            [tf.stop_gradient(latent_vector), rnn_output, mem_readout],
            axis=-1)
        action_distribution, _ = self._actor_net(policy_input,
                                                 step_type=time_step.step_type,
                                                 network_state=None)

        value, _ = self._value_net(latent_vector,
                                   step_type=time_step.step_type,
                                   network_state=None)

        info = ActorCriticInfo(action_distribution=action_distribution,
                               value=value)
        action = common.epsilon_greedy_sample(action_distribution,
                                              epsilon_greedy)
        return PolicyStep(action=action, state=rnn_state, info=info)
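`common.epsilon_greedy_sample` above is an ALF helper; the sketch below shows the usual epsilon-greedy behaviour it is assumed to implement (sample from the distribution with probability `epsilon_greedy`, otherwise take the greedy mode). It is not necessarily ALF's exact implementation:

    import tensorflow as tf
    import tensorflow_probability as tfp

    def epsilon_greedy_sample(distribution, epsilon):
        # Assumes samples of shape [batch_size], e.g. a Categorical action.
        sampled = distribution.sample()
        greedy = distribution.mode()
        explore = tf.random.uniform(tf.shape(sampled)) < epsilon
        return tf.where(explore, sampled, greedy)

    dist = tfp.distributions.Categorical(logits=[[1.0, 2.0, 0.5]])
    action = epsilon_greedy_sample(dist, epsilon=0.1)   # usually the argmax action

With the default `epsilon_greedy=1.0` used in `rollout` above, this always samples from the distribution.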
Example #7
 def _rollout_partial_state(self, time_step: ActionTimeStep, state=None):
     action, state = self._actor_network(time_step.observation,
                                         step_type=time_step.step_type,
                                         network_state=state.share.actor)
     empty_state = tf.nest.map_structure(lambda x: (),
                                         self.train_state_spec)
     state = empty_state._replace(share=SacShareState(actor=state))
     return PolicyStep(action=action, state=state, info=())
Example #8
    def predict(self, time_step: ActionTimeStep, state: ActorCriticState):
        """Predict for one step."""
        action_distribution, actor_state = self._actor_network(
            time_step.observation,
            step_type=time_step.step_type,
            network_state=state.actor)

        return PolicyStep(action=action_distribution,
                          state=ActorCriticState(actor=actor_state),
                          info=())
Example #9
    def greedy_predict(self, time_step: ActionTimeStep, state=None, eps=0.1):
        observation = self._encode(time_step)

        new_state = AgentState()

        rl_step = self._rl_algorithm.greedy_predict(
            time_step._replace(observation=observation), state.rl)
        new_state = new_state._replace(rl=rl_step.state)

        return PolicyStep(action=rl_step.action, state=new_state, info=())
Example #10
 def greedy_predict(self, time_step: ActionTimeStep, state=None):
     action, state = self._actor_network(
         time_step.observation,
         step_type=time_step.step_type,
         network_state=state.actor.actor)
     empty_state = tf.nest.map_structure(lambda x: (),
                                         self.train_state_spec)
     state = empty_state._replace(
         actor=DdpgActorState(actor=state, critic=()))
     return PolicyStep(action=action, state=state, info=())
Example #11
    def predict(self, time_step: ActionTimeStep, state: AgentState):
        """Predict for one step."""
        observation = self._encode(time_step)

        new_state = AgentState()

        rl_step = self._rl_algorithm.predict(
            time_step._replace(observation=observation), state.rl)
        new_state = new_state._replace(rl=rl_step.state)

        return PolicyStep(action=rl_step.action, state=new_state, info=())
Example #12
    def rollout(self, time_step: ActionTimeStep, state):
        """Train one step."""
        mbp_step = self._mbp.train_step(inputs=(time_step.observation,
                                                time_step.prev_action),
                                        state=state.mbp_state)
        mba_step = self._mba.rollout(
            time_step=time_step._replace(observation=mbp_step.outputs),
            state=state.mba_state)

        return PolicyStep(action=mba_step.action,
                          state=MerlinState(mbp_state=mbp_step.state,
                                            mba_state=mba_step.state),
                          info=MerlinInfo(mbp_info=mbp_step.info,
                                          mba_info=mba_step.info))
Example #13
    def rollout(self, time_step: ActionTimeStep, state: ActorCriticState):
        """Rollout for one step."""
        value, value_state = self._value_network(time_step.observation,
                                                 step_type=time_step.step_type,
                                                 network_state=state.value)

        action_distribution, actor_state = self._actor_network(
            time_step.observation,
            step_type=time_step.step_type,
            network_state=state.actor)

        return PolicyStep(action=action_distribution,
                          state=ActorCriticState(actor=actor_state,
                                                 value=value_state),
                          info=ActorCriticInfo(value=value))
Example #14
def test_eval_logger():
    """
    Tests the per step logging mediated through a custom TensorFlow metric.

    Note that TensorFlow runs the logging inside a graph built through C++, which is only
    triggered when the tensors are evaluated, so it is very difficult to capture the logging
    message even when using mocked output streams. The test therefore checks the attributes
    that can be tested and prints the expected logging values for by-eye comparison. Since
    the logging code itself is simple this is adequate, but the test is in this sense incomplete.
    """
    # Set up the logger using default parameters.
    logger = EvalPerStepLogger()
    # Test that the time step counter is initialised to zero.
    assert logger._t == 0

    # Build one time step's worth of data to be logged.
    observation = tf.convert_to_tensor(np.random.randint(10, size=(1, 1)),
                                       dtype=tf.float32)
    action = tf.convert_to_tensor(np.eye(2)[np.random.randint(2)])
    reward = -1 * observation
    discount = tf.convert_to_tensor(np.array([0.99]))

    # The logger takes a tuple of (TimeStep, PolicyStep, TimeStep); the second time step
    # represents the next period and is not used, so we simply pass a copy of the original one.
    time_step = ts.TimeStep(ts.StepType(1), reward, discount, observation)
    policy_step = PolicyStep(action, state=(), info=())
    next_time_step = copy.deepcopy(time_step)
    # Collect the data in a tuple as required by the logger.
    time_step_data = (time_step, policy_step, next_time_step)

    # Print the expected logging term for comparison by eye.
    tf.print("\nExpected Values\nStep: ",
             0,
             "\t",
             "State: ",
             observation,
             "\t",
             "Action: ",
             action,
             end="\n",
             output_stream=sys.stdout)
    # Run the logging for a single time step.
    logger(time_step_data)
    # Check that the time step counter has incremented.
    assert logger._t == 1
Example #15
File: agent.py Project: runjerry/alf
    def predict(self, time_step: ActionTimeStep, state: AgentState,
                epsilon_greedy):
        """Predict for one step."""
        observation = self._encode(time_step)

        new_state = AgentState()
        if self._goal_generator is not None:
            goal_step = self._goal_generator.predict(
                time_step._replace(observation=observation),
                state.goal_generator, epsilon_greedy)
            new_state = new_state._replace(goal_generator=goal_step.state)
            observation = [observation, goal_step.action]

        rl_step = self._rl_algorithm.predict(
            time_step._replace(observation=observation), state.rl,
            epsilon_greedy)
        new_state = new_state._replace(rl=rl_step.state)

        return PolicyStep(action=rl_step.action, state=new_state, info=())
Example #16
    def train_step(self, exp: Experience, state: SacState):
        action_distribution, share_actor_state = self._actor_network(
            exp.observation,
            step_type=exp.step_type,
            network_state=state.share.actor)
        action = tf.nest.map_structure(lambda d: d.sample(),
                                       action_distribution)
        log_pi = tfa_common.log_probability(action_distribution, action,
                                            self._action_spec)

        actor_state, actor_info = self._actor_train_step(
            exp, state.actor, action_distribution, action, log_pi)
        critic_state, critic_info = self._critic_train_step(
            exp, state.critic, action, log_pi)
        alpha_info = self._alpha_train_step(log_pi)
        state = SacState(share=SacShareState(actor=share_actor_state),
                         actor=actor_state,
                         critic=critic_state)
        info = SacInfo(actor=actor_info, critic=critic_info, alpha=alpha_info)
        return PolicyStep(action_distribution, state, info)
Example #17
    def predict(self, time_step: ActionTimeStep, state, epsilon_greedy):
        action, state = self._actor_network(time_step.observation,
                                            step_type=time_step.step_type,
                                            network_state=state.actor.actor)
        empty_state = tf.nest.map_structure(lambda x: (),
                                            self.train_state_spec)

        def _sample(a, ou):
            return tf.cond(
                tf.less(tf.random.uniform((), 0, 1), epsilon_greedy),
                lambda: a + ou(), lambda: a)

        noisy_action = tf.nest.map_structure(_sample, action, self._ou_process)
        noisy_action = tf.nest.map_structure(tfa_common.clip_to_spec,
                                             noisy_action, self._action_spec)
        state = empty_state._replace(
            actor=DdpgActorState(actor=state, critic=()))
        return PolicyStep(action=noisy_action,
                          state=state,
                          info=DdpgInfo(action_distribution=action))
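Each element of `self._ou_process` above is a stateful callable returning the next value of an Ornstein-Uhlenbeck noise process, the usual exploration noise for DDPG-style actors. A self-contained sketch of such a callable (the names and coefficients here are illustrative, not ALF's actual helper):

    import tensorflow as tf

    class OUProcess:
        """Ornstein-Uhlenbeck noise: x <- x + theta * (mu - x) + sigma * N(0, 1)."""

        def __init__(self, action_shape, theta=0.15, sigma=0.2, mu=0.0):
            self._theta = theta
            self._sigma = sigma
            self._mu = mu
            self._x = tf.Variable(tf.zeros(action_shape), trainable=False)

        def __call__(self):
            dx = (self._theta * (self._mu - self._x)
                  + self._sigma * tf.random.normal(tf.shape(self._x)))
            self._x.assign_add(dx)
            return self._x.value()   # snapshot of the current noise

    ou = OUProcess(action_shape=[1, 2])   # e.g. a batch of one 2-dim action
    noise = ou()                          # what `ou()` yields inside `_sample` above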
Example #18
    def rollout(self, time_step: ActionTimeStep, state=None):
        observation = self._encode(time_step)

        value, value_state = self._value_network(
            observation,
            step_type=time_step.step_type,
            network_state=state.value_state)
        # ValueRnnNetwork will add a time dim to value
        # See value_rnn_network.py L153
        if isinstance(self._value_network, ValueRnnNetwork):
            value = tf.squeeze(value, axis=1)

        action_distribution, actor_state = self._actor_network(
            observation,
            step_type=time_step.step_type,
            network_state=state.actor_state)

        info = ActorCriticInfo(value=value,
                               icm_reward=(),
                               icm_info=(),
                               entropy_target_info=())
        if self._icm is not None:
            icm_step = self._icm.train_step(
                (observation, time_step.prev_action), state=state.icm_state)
            info = info._replace(icm_reward=icm_step.outputs,
                                 icm_info=icm_step.info)
            icm_state = icm_step.state
        else:
            icm_state = ()

        if self._entropy_target_algorithm:
            et_step = self._entropy_target_algorithm.train_step(
                action_distribution)
            info = info._replace(entropy_target_info=et_step.info)

        state = ActorCriticState(actor_state=actor_state,
                                 value_state=value_state,
                                 icm_state=icm_state)

        return PolicyStep(action=action_distribution, state=state, info=info)
Example #19
File: agent.py Project: runjerry/alf
    def rollout(self, time_step: ActionTimeStep, state: AgentState, mode):
        """Rollout for one step."""
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(time_step)

        if self._goal_generator is not None:
            goal_step = self._goal_generator.rollout(
                time_step._replace(observation=time_step.observation),
                state.goal_generator, mode)
            new_state = new_state._replace(goal_generator=goal_step.state)
            info = info._replace(goal_generator=goal_step.info)
            observation = [observation, goal_step.action]

        if self._icm is not None:
            icm_step = self._icm.train_step(
                time_step._replace(observation=observation), state=state.icm)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.rollout(
            time_step._replace(observation=observation), state.rl, mode)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._entropy_target_algorithm:
            # TODO: For off-policy training, skip entropy_target_algorithm
            # during rollout()
            assert 'action_distribution' in rl_step.info._fields, (
                "PolicyStep from rl_algorithm.rollout() does not contain "
                "`action_distribution`, which is required by "
                "`enforce_entropy_target`")
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.info.action_distribution,
                step_type=time_step.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Example #20
    def _prepare_specs(self, algorithm):
        """Prepare various tensor specs."""

        time_step = self.get_initial_time_step()
        self._time_step_spec = common.extract_spec(time_step)
        self._action_spec = self._env.action_spec()

        policy_step = algorithm.rollout(
            algorithm.transform_timestep(time_step), self._initial_state)
        info_spec = common.extract_spec(policy_step.info)
        self._policy_step_spec = PolicyStep(
            action=self._action_spec,
            state=algorithm.train_state_spec,
            info=info_spec)

        self._action_distribution_spec = tf.nest.map_structure(
            common.to_distribution_spec, algorithm.action_distribution_spec)
        self._action_dist_param_spec = tf.nest.map_structure(
            lambda spec: spec.input_params_spec,
            self._action_distribution_spec)

        algorithm.prepare_off_policy_specs(time_step)
Example #21
    def train_step(self, time_step: ActionTimeStep, state):
        """Train one step.

        Args:
            time_step: time_step.observation should be the latent vector
            state: state of the model
        """
        latent_vector = time_step.observation
        rnn_output, rnn_state = self._rnn(latent_vector, state)
        mem_readout = self._memory.genkey_and_read(self._key_net, rnn_output)
        policy_input = tf.concat(
            [tf.stop_gradient(latent_vector), rnn_output, mem_readout],
            axis=-1)
        action_distribution, _ = self._actor_net(
            policy_input, step_type=time_step.step_type, network_state=None)

        value, _ = self._value_net(
            latent_vector, step_type=time_step.step_type, network_state=None)

        info = ActorCriticInfo(
            value=value, icm_reward=(), icm_info=(), entropy_target_info=())
        return PolicyStep(
            action=action_distribution, state=rnn_state, info=info)
Example #22
File: agent.py Project: runjerry/alf
    def train_step(self, exp: Experience, state):
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(exp)

        if self._goal_generator is not None:
            goal_step = self._goal_generator.train_step(
                exp._replace(observation=observation), state.goal_generator)
            info = info._replace(goal_generator=goal_step.info)
            new_state = new_state._replace(goal_generator=goal_step.state)
            observation = [observation, goal_step.action]

        if self._icm is not None:
            icm_step = self._icm.train_step(
                exp._replace(observation=observation),
                state=state.icm,
                calc_intrinsic_reward=False)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.train_step(
            exp._replace(observation=observation,
                         rollout_info=exp.rollout_info.rl), state.rl)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._entropy_target_algorithm:
            assert 'action_distribution' in rl_step.info._fields, (
                "PolicyStep from rl_algorithm.train_step() does not contain "
                "`action_distribution`, which is required by "
                "`enforce_entropy_target`")
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.info.action_distribution, step_type=exp.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Example #23
    def train_step(self, exp: Experience, state):
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(exp)

        if self._icm is not None:
            icm_step = self._icm.train_step((observation, exp.prev_action),
                                            state=state.icm,
                                            calc_intrinsic_reward=False)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.train_step(
            exp._replace(observation=observation), state.rl)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._entropy_target_algorithm:
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.action, step_type=exp.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Example #24
 def greedy_predict(self, time_step: ActionTimeStep, state=None):
     action, state = self._actor_network(time_step.observation,
                                         step_type=time_step.step_type,
                                         network_state=state)
     return PolicyStep(action=action, state=state, info=())
Example #25
 def distribution(self, time_step, policy_state=()):
   del policy_state
   action = self.action(time_step).action
   return PolicyStep(action=_MockDistribution(action))
Example #26
    def _prepare_specs(self, algorithm):
        """Prepare various tensor specs."""
        def extract_spec(nest):
            return tf.nest.map_structure(
                lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)

        time_step = self.get_initial_time_step()
        self._time_step_spec = extract_spec(time_step)
        self._action_spec = self._env.action_spec()

        policy_step = algorithm.predict(time_step, self._initial_state)
        info_spec = extract_spec(policy_step.info)
        self._pred_policy_step_spec = PolicyStep(
            action=self._action_spec,
            state=algorithm.predict_state_spec,
            info=info_spec)

        def _to_distribution_spec(spec):
            if isinstance(spec, tf.TensorSpec):
                return DistributionSpec(tfp.distributions.Deterministic,
                                        input_params_spec={"loc": spec},
                                        sample_spec=spec)
            return spec

        self._action_distribution_spec = tf.nest.map_structure(
            _to_distribution_spec, algorithm.action_distribution_spec)
        self._action_dist_param_spec = tf.nest.map_structure(
            lambda spec: spec.input_params_spec,
            self._action_distribution_spec)

        self._experience_spec = Experience(
            step_type=self._time_step_spec.step_type,
            reward=self._time_step_spec.reward,
            discount=self._time_step_spec.discount,
            observation=self._time_step_spec.observation,
            prev_action=self._action_spec,
            action=self._action_spec,
            info=info_spec,
            action_distribution=self._action_dist_param_spec)

        action_dist_params = common.zero_tensor_from_nested_spec(
            self._experience_spec.action_distribution, self._env.batch_size)
        action_dist = nested_distributions_from_specs(
            self._action_distribution_spec, action_dist_params)
        exp = Experience(step_type=time_step.step_type,
                         reward=time_step.reward,
                         discount=time_step.discount,
                         observation=time_step.observation,
                         prev_action=time_step.prev_action,
                         action=time_step.prev_action,
                         info=policy_step.info,
                         action_distribution=action_dist)

        processed_exp = algorithm.preprocess_experience(exp)
        self._processed_experience_spec = self._experience_spec._replace(
            info=extract_spec(processed_exp.info))

        policy_step = common.algorithm_step(
            algorithm,
            ob_transformer=self._observation_transformer,
            time_step=exp,
            state=common.get_initial_policy_state(self._env.batch_size,
                                                  algorithm.train_state_spec),
            training=True)
        info_spec = extract_spec(policy_step.info)
        self._training_info_spec = make_training_info(
            action=self._action_spec,
            action_distribution=self._action_dist_param_spec,
            step_type=self._time_step_spec.step_type,
            reward=self._time_step_spec.reward,
            discount=self._time_step_spec.discount,
            info=info_spec,
            collect_info=self._processed_experience_spec.info,
            collect_action_distribution=self._action_dist_param_spec)
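The local `extract_spec` helper above turns a batched tensor into a per-example `tf.TensorSpec` by dropping the leading batch dimension via `t.shape[1:]`; Example #20 presumably relies on `common.extract_spec` for the same purpose. For example:

    import tensorflow as tf

    batched = tf.zeros([32, 4], dtype=tf.float32)          # a batch of 32 observations
    spec = tf.TensorSpec(batched.shape[1:], batched.dtype)
    print(spec)   # -> TensorSpec(shape=(4,), dtype=tf.float32, name=None)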
Example #27
 def action(self, time_step):
     del time_step
     action = tf.constant(self._action, dtype=tf.float32, shape=[1])
     return PolicyStep(action=action)