Example #1
    def test_step(self):
        tf_env = tf_py_environment.TFPyEnvironment(
            suite_gym.load('CartPole-v0'))
        indexed_tf_env = IndexedTFEnv(tf_env, 5)
        # take first action
        a1 = PolicyStep(action=tf.convert_to_tensor([1]), state=(), info=())
        time_step_0 = indexed_tf_env.step(a1)
        self.assertEqual(time_step_0["env_id"], 5)
        self.assertEqual(time_step_0["ts_id"], 0)
        self.assertEqual(time_step_0["reward"], 0)
        self.assertEqual(time_step_0["step_type"], 0)
        self.assertEqual(time_step_0["discount"], 1.0)
        self.assertTrue("ob_0" in time_step_0)
        self.assertTrue("ob_1" in time_step_0)
        self.assertTrue("ob_2" in time_step_0)
        self.assertTrue("ob_3" in time_step_0)

        # take second action
        a2 = PolicyStep(action=tf.convert_to_tensor([0]), state=(), info=())
        time_step_1 = indexed_tf_env.step(a2)
        self.assertEqual(time_step_1["env_id"], 5)
        self.assertEqual(time_step_1["ts_id"], 1)
        self.assertEqual(time_step_1["reward"], 1)
        self.assertEqual(time_step_1["step_type"], 1)
        self.assertEqual(time_step_1["discount"], 1.0)
        self.assertTrue("ob_0" in time_step_1)
        self.assertTrue("ob_1" in time_step_1)
        self.assertTrue("ob_2" in time_step_1)
        self.assertTrue("ob_3" in time_step_1)
Example #2
    def _actor_train_step(self, exp: Experience, state: DdpgActorState):
        action, actor_state = self._actor_network(exp.observation,
                                                  exp.step_type,
                                                  network_state=state.actor)

        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(action)
            q_value, critic_state = self._critic_network(
                (exp.observation, action), network_state=state.critic)

        dqda = tape.gradient(q_value, action)

        def actor_loss_fn(dqda, action):
            if self._dqda_clipping:
                dqda = tf.clip_by_value(dqda, -self._dqda_clipping,
                                        self._dqda_clipping)
            loss = 0.5 * losses.element_wise_squared_loss(
                tf.stop_gradient(dqda + action), action)
            loss = tf.reduce_sum(loss, axis=list(range(1, len(loss.shape))))
            return loss

        actor_loss = tf.nest.map_structure(actor_loss_fn, dqda, action)
        state = DdpgActorState(actor=actor_state, critic=critic_state)
        info = LossInfo(loss=tf.add_n(tf.nest.flatten(actor_loss)),
                        extra=actor_loss)
        return PolicyStep(action=action, state=state, info=info)
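The surrogate loss in `actor_loss_fn` above, `0.5 * (stop_gradient(dqda + action) - action)^2`, is built so that its gradient with respect to `action` is exactly `-dqda`; minimizing it therefore moves the action along the critic's gradient, i.e. performs gradient ascent on the Q value (with optional clipping of `dqda`). A minimal standalone check of this identity in plain TensorFlow 2.x, independent of the ALF code above:

    import tensorflow as tf

    action = tf.Variable([0.3, -0.7])
    dqda = tf.constant([1.5, -2.0])          # stand-in for dQ/da from a critic
    with tf.GradientTape() as tape:
        # Same construction as actor_loss_fn above.
        loss = tf.reduce_sum(
            0.5 * tf.math.squared_difference(
                tf.stop_gradient(dqda + action), action))
    print(tape.gradient(loss, action).numpy())   # -> [-1.5  2. ], i.e. -dqda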
Example #3
 def action(self, time_step):
     observation = time_step.observation
     batch_size = observation.shape[0]
     action = tf.constant(self._action,
                          dtype=tf.float32,
                          shape=[batch_size, 1])
     return PolicyStep(action=action)
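Example #3 constructs `PolicyStep` with only an `action`. That works because `PolicyStep` is a namedtuple whose `state` and `info` fields default to the empty tuple (this mirrors `tf_agents.trajectories.policy_step.PolicyStep`); a minimal equivalent definition:

    import collections

    # Minimal sketch of the PolicyStep container used throughout these examples.
    PolicyStep = collections.namedtuple('PolicyStep', ('action', 'state', 'info'))
    PolicyStep.__new__.__defaults__ = ((), ())   # state and info default to ()

    step = PolicyStep(action=[1.0])
    assert step.state == () and step.info == ()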
Example #4
    def rollout(self, time_step: ActionTimeStep, state: AgentState):
        """Rollout for one step."""
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(time_step)
        if self._icm is not None:
            icm_step = self._icm.train_step(
                (observation, time_step.prev_action), state=state.icm)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.rollout(
            time_step._replace(observation=observation), state.rl)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        # TODO: avoid computing this during rollout when training off-policy
        if self._entropy_target_algorithm:
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.action, step_type=time_step.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Example #5
 def predict(self, time_step: ActionTimeStep, state=None):
     observation = self._encode(time_step)
     action_distribution, actor_state = self._actor_network(
         observation, step_type=time_step.step_type, network_state=state)
     return PolicyStep(action=action_distribution,
                       state=actor_state,
                       info=())
Example #6
    def rollout(self,
                time_step: ActionTimeStep,
                state,
                mode,
                epsilon_greedy=1.0):
        """Train one step.

        Args:
            time_step: time_step.observation should be the latent vector
            state: state of the model
        """
        latent_vector = time_step.observation
        rnn_output, rnn_state = self._rnn(latent_vector, state)
        mem_readout = self._memory.genkey_and_read(self._key_net, rnn_output)
        policy_input = tf.concat(
            [tf.stop_gradient(latent_vector), rnn_output, mem_readout],
            axis=-1)
        action_distribution, _ = self._actor_net(policy_input,
                                                 step_type=time_step.step_type,
                                                 network_state=None)

        value, _ = self._value_net(latent_vector,
                                   step_type=time_step.step_type,
                                   network_state=None)

        info = ActorCriticInfo(action_distribution=action_distribution,
                               value=value)
        action = common.epsilon_greedy_sample(action_distribution,
                                              epsilon_greedy)
        return PolicyStep(action=action, state=rnn_state, info=info)
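`common.epsilon_greedy_sample` above is an ALF helper; the sketch below shows the usual epsilon-greedy behaviour it is assumed to implement (sample from the distribution with probability `epsilon_greedy`, otherwise take the greedy mode). It is not necessarily ALF's exact implementation:

    import tensorflow as tf
    import tensorflow_probability as tfp

    def epsilon_greedy_sample(distribution, epsilon):
        # Assumes samples of shape [batch_size], e.g. a Categorical action.
        sampled = distribution.sample()
        greedy = distribution.mode()
        explore = tf.random.uniform(tf.shape(sampled)) < epsilon
        return tf.where(explore, sampled, greedy)

    dist = tfp.distributions.Categorical(logits=[[1.0, 2.0, 0.5]])
    action = epsilon_greedy_sample(dist, epsilon=0.1)   # usually the argmax action

With the default `epsilon_greedy=1.0` used in `rollout` above, this always samples from the distribution.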
Example #7
 def _rollout_partial_state(self, time_step: ActionTimeStep, state=None):
     action, state = self._actor_network(time_step.observation,
                                         step_type=time_step.step_type,
                                         network_state=state.share.actor)
     empty_state = tf.nest.map_structure(lambda x: (),
                                         self.train_state_spec)
     state = empty_state._replace(share=SacShareState(actor=state))
     return PolicyStep(action=action, state=state, info=())
Example #8
    def predict(self, time_step: ActionTimeStep, state: ActorCriticState):
        """Predict for one step."""
        action_distribution, actor_state = self._actor_network(
            time_step.observation,
            step_type=time_step.step_type,
            network_state=state.actor)

        return PolicyStep(action=action_distribution,
                          state=ActorCriticState(actor=actor_state),
                          info=())
Example #9
    def greedy_predict(self, time_step: ActionTimeStep, state=None, eps=0.1):
        observation = self._encode(time_step)

        new_state = AgentState()

        rl_step = self._rl_algorithm.greedy_predict(
            time_step._replace(observation=observation), state.rl)
        new_state = new_state._replace(rl=rl_step.state)

        return PolicyStep(action=rl_step.action, state=new_state, info=())
Example #10
 def greedy_predict(self, time_step: ActionTimeStep, state=None):
     action, state = self._actor_network(
         time_step.observation,
         step_type=time_step.step_type,
         network_state=state.actor.actor)
     empty_state = tf.nest.map_structure(lambda x: (),
                                         self.train_state_spec)
     state = empty_state._replace(
         actor=DdpgActorState(actor=state, critic=()))
     return PolicyStep(action=action, state=state, info=())
Example #11
    def predict(self, time_step: ActionTimeStep, state: AgentState):
        """Predict for one step."""
        observation = self._encode(time_step)

        new_state = AgentState()

        rl_step = self._rl_algorithm.predict(
            time_step._replace(observation=observation), state.rl)
        new_state = new_state._replace(rl=rl_step.state)

        return PolicyStep(action=rl_step.action, state=new_state, info=())
Example #12
    def rollout(self, time_step: ActionTimeStep, state):
        """Train one step."""
        mbp_step = self._mbp.train_step(inputs=(time_step.observation,
                                                time_step.prev_action),
                                        state=state.mbp_state)
        mba_step = self._mba.rollout(
            time_step=time_step._replace(observation=mbp_step.outputs),
            state=state.mba_state)

        return PolicyStep(action=mba_step.action,
                          state=MerlinState(mbp_state=mbp_step.state,
                                            mba_state=mba_step.state),
                          info=MerlinInfo(mbp_info=mbp_step.info,
                                          mba_info=mba_step.info))
Example #13
    def rollout(self, time_step: ActionTimeStep, state: ActorCriticState):
        """Rollout for one step."""
        value, value_state = self._value_network(time_step.observation,
                                                 step_type=time_step.step_type,
                                                 network_state=state.value)

        action_distribution, actor_state = self._actor_network(
            time_step.observation,
            step_type=time_step.step_type,
            network_state=state.actor)

        return PolicyStep(action=action_distribution,
                          state=ActorCriticState(actor=actor_state,
                                                 value=value_state),
                          info=ActorCriticInfo(value=value))
Example #14
def test_eval_logger():
    """
    Tests the per step logging mediated through a custom TensorFlow metric.

    Note that TensorFlow runs the logging inside a graph built through C++, which is only
    triggered when the tensors are evaluated, so it is very difficult to capture the logging
    message even when using mocked output streams. The test therefore checks the attributes
    that can be tested and prints the expected logging values for by-eye comparison. Since
    the logging code itself is simple this is adequate, but the test is in this sense incomplete.
    """
    # Set up the logger using default parameters.
    logger = EvalPerStepLogger()
    # Test that the time step counter is initialised to zero.
    assert logger._t == 0

    # Build one time step's worth of data to be logged.
    observation = tf.convert_to_tensor(np.random.randint(10, size=(1, 1)),
                                       dtype=tf.float32)
    action = tf.convert_to_tensor(np.eye(2)[np.random.randint(2)])
    reward = -1 * observation
    discount = tf.convert_to_tensor(np.array([0.99]))

    # The logger takes a tuple of (TimeStep, PolicyStep, TimeStep); the second time step
    # represents the next period and is not used, so we simply pass a copy of the original one.
    time_step = ts.TimeStep(ts.StepType(1), reward, discount, observation)
    policy_step = PolicyStep(action, state=(), info=())
    next_time_step = copy.deepcopy(time_step)
    # Collect the data in a tuple as required by the logger.
    time_step_data = (time_step, policy_step, next_time_step)

    # Print the expected logging term for comparison by eye.
    tf.print("\nExpected Values\nStep: ",
             0,
             "\t",
             "State: ",
             observation,
             "\t",
             "Action: ",
             action,
             end="\n",
             output_stream=sys.stdout)
    # Run the logging for a single time step.
    logger(time_step_data)
    # Check that the time step counter has incremented.
    assert logger._t == 1
Example #15
File: agent.py Project: runjerry/alf
    def predict(self, time_step: ActionTimeStep, state: AgentState,
                epsilon_greedy):
        """Predict for one step."""
        observation = self._encode(time_step)

        new_state = AgentState()
        if self._goal_generator is not None:
            goal_step = self._goal_generator.predict(
                time_step._replace(observation=observation),
                state.goal_generator, epsilon_greedy)
            new_state = new_state._replace(goal_generator=goal_step.state)
            observation = [observation, goal_step.action]

        rl_step = self._rl_algorithm.predict(
            time_step._replace(observation=observation), state.rl,
            epsilon_greedy)
        new_state = new_state._replace(rl=rl_step.state)

        return PolicyStep(action=rl_step.action, state=new_state, info=())
Example #16
    def train_step(self, exp: Experience, state: SacState):
        action_distribution, share_actor_state = self._actor_network(
            exp.observation,
            step_type=exp.step_type,
            network_state=state.share.actor)
        action = tf.nest.map_structure(lambda d: d.sample(),
                                       action_distribution)
        log_pi = tfa_common.log_probability(action_distribution, action,
                                            self._action_spec)

        actor_state, actor_info = self._actor_train_step(
            exp, state.actor, action_distribution, action, log_pi)
        critic_state, critic_info = self._critic_train_step(
            exp, state.critic, action, log_pi)
        alpha_info = self._alpha_train_step(log_pi)
        state = SacState(share=SacShareState(actor=share_actor_state),
                         actor=actor_state,
                         critic=critic_state)
        info = SacInfo(actor=actor_info, critic=critic_info, alpha=alpha_info)
        return PolicyStep(action_distribution, state, info)
Example #17
    def predict(self, time_step: ActionTimeStep, state, epsilon_greedy):
        action, state = self._actor_network(time_step.observation,
                                            step_type=time_step.step_type,
                                            network_state=state.actor.actor)
        empty_state = tf.nest.map_structure(lambda x: (),
                                            self.train_state_spec)

        def _sample(a, ou):
            return tf.cond(
                tf.less(tf.random.uniform((), 0, 1), epsilon_greedy),
                lambda: a + ou(), lambda: a)

        noisy_action = tf.nest.map_structure(_sample, action, self._ou_process)
        noisy_action = tf.nest.map_structure(tfa_common.clip_to_spec,
                                             noisy_action, self._action_spec)
        state = empty_state._replace(
            actor=DdpgActorState(actor=state, critic=()))
        return PolicyStep(action=noisy_action,
                          state=state,
                          info=DdpgInfo(action_distribution=action))
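Each element of `self._ou_process` above is a stateful callable returning the next value of an Ornstein-Uhlenbeck noise process, the usual exploration noise for DDPG-style actors. A self-contained sketch of such a callable (the names and coefficients here are illustrative, not ALF's actual helper):

    import tensorflow as tf

    class OUProcess:
        """Ornstein-Uhlenbeck noise: x <- x + theta * (mu - x) + sigma * N(0, 1)."""

        def __init__(self, action_shape, theta=0.15, sigma=0.2, mu=0.0):
            self._theta = theta
            self._sigma = sigma
            self._mu = mu
            self._x = tf.Variable(tf.zeros(action_shape), trainable=False)

        def __call__(self):
            dx = (self._theta * (self._mu - self._x)
                  + self._sigma * tf.random.normal(tf.shape(self._x)))
            self._x.assign_add(dx)
            return self._x.value()   # snapshot of the current noise

    ou = OUProcess(action_shape=[1, 2])   # e.g. a batch of one 2-dim action
    noise = ou()                          # what `ou()` yields inside `_sample` above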
Example #18
    def rollout(self, time_step: ActionTimeStep, state=None):
        observation = self._encode(time_step)

        value, value_state = self._value_network(
            observation,
            step_type=time_step.step_type,
            network_state=state.value_state)
        # ValueRnnNetwork will add a time dim to value
        # See value_rnn_network.py L153
        if isinstance(self._value_network, ValueRnnNetwork):
            value = tf.squeeze(value, axis=1)

        action_distribution, actor_state = self._actor_network(
            observation,
            step_type=time_step.step_type,
            network_state=state.actor_state)

        info = ActorCriticInfo(value=value,
                               icm_reward=(),
                               icm_info=(),
                               entropy_target_info=())
        if self._icm is not None:
            icm_step = self._icm.train_step(
                (observation, time_step.prev_action), state=state.icm_state)
            info = info._replace(icm_reward=icm_step.outputs,
                                 icm_info=icm_step.info)
            icm_state = icm_step.state
        else:
            icm_state = ()

        if self._entropy_target_algorithm:
            et_step = self._entropy_target_algorithm.train_step(
                action_distribution)
            info = info._replace(entropy_target_info=et_step.info)

        state = ActorCriticState(actor_state=actor_state,
                                 value_state=value_state,
                                 icm_state=icm_state)

        return PolicyStep(action=action_distribution, state=state, info=info)
Example #19
File: agent.py Project: runjerry/alf
    def rollout(self, time_step: ActionTimeStep, state: AgentState, mode):
        """Rollout for one step."""
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(time_step)

        if self._goal_generator is not None:
            goal_step = self._goal_generator.rollout(
                time_step._replace(observation=time_step.observation),
                state.goal_generator, mode)
            new_state = new_state._replace(goal_generator=goal_step.state)
            info = info._replace(goal_generator=goal_step.info)
            observation = [observation, goal_step.action]

        if self._icm is not None:
            icm_step = self._icm.train_step(
                time_step._replace(observation=observation), state=state.icm)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.rollout(
            time_step._replace(observation=observation), state.rl, mode)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._entropy_target_algorithm:
            # TODO: For off-policy training, skip entropy_target_algorithm
            # during rollout()
            assert 'action_distribution' in rl_step.info._fields, (
                "PolicyStep from rl_algorithm.rollout() does not contain "
                "`action_distribution`, which is required by "
                "`enforce_entropy_target`")
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.info.action_distribution,
                step_type=time_step.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Example #20
    def _prepare_specs(self, algorithm):
        """Prepare various tensor specs."""

        time_step = self.get_initial_time_step()
        self._time_step_spec = common.extract_spec(time_step)
        self._action_spec = self._env.action_spec()

        policy_step = algorithm.rollout(
            algorithm.transform_timestep(time_step), self._initial_state)
        info_spec = common.extract_spec(policy_step.info)
        self._policy_step_spec = PolicyStep(
            action=self._action_spec,
            state=algorithm.train_state_spec,
            info=info_spec)

        self._action_distribution_spec = tf.nest.map_structure(
            common.to_distribution_spec, algorithm.action_distribution_spec)
        self._action_dist_param_spec = tf.nest.map_structure(
            lambda spec: spec.input_params_spec,
            self._action_distribution_spec)

        algorithm.prepare_off_policy_specs(time_step)
Example #21
    def train_step(self, time_step: ActionTimeStep, state):
        """Train one step.

        Args:
            time_step: time_step.observation should be the latent vector
            state: state of the model
        """
        latent_vector = time_step.observation
        rnn_output, rnn_state = self._rnn(latent_vector, state)
        mem_readout = self._memory.genkey_and_read(self._key_net, rnn_output)
        policy_input = tf.concat(
            [tf.stop_gradient(latent_vector), rnn_output, mem_readout],
            axis=-1)
        action_distribution, _ = self._actor_net(
            policy_input, step_type=time_step.step_type, network_state=None)

        value, _ = self._value_net(
            latent_vector, step_type=time_step.step_type, network_state=None)

        info = ActorCriticInfo(
            value=value, icm_reward=(), icm_info=(), entropy_target_info=())
        return PolicyStep(
            action=action_distribution, state=rnn_state, info=info)
Example #22
File: agent.py Project: runjerry/alf
    def train_step(self, exp: Experience, state):
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(exp)

        if self._goal_generator is not None:
            goal_step = self._goal_generator.train_step(
                exp._replace(observation=observation), state.goal_generator)
            info = info._replace(goal_generator=goal_step.info)
            new_state = new_state._replace(goal_generator=goal_step.state)
            observation = [observation, goal_step.action]

        if self._icm is not None:
            icm_step = self._icm.train_step(
                exp._replace(observation=observation),
                state=state.icm,
                calc_intrinsic_reward=False)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.train_step(
            exp._replace(observation=observation,
                         rollout_info=exp.rollout_info.rl), state.rl)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._entropy_target_algorithm:
            assert 'action_distribution' in rl_step.info._fields, (
                "PolicyStep from rl_algorithm.train_step() does not contain "
                "`action_distribution`, which is required by "
                "`enforce_entropy_target`")
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.info.action_distribution, step_type=exp.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Example #23
    def train_step(self, exp: Experience, state):
        new_state = AgentState()
        info = AgentInfo()
        observation = self._encode(exp)

        if self._icm is not None:
            icm_step = self._icm.train_step((observation, exp.prev_action),
                                            state=state.icm,
                                            calc_intrinsic_reward=False)
            info = info._replace(icm=icm_step.info)
            new_state = new_state._replace(icm=icm_step.state)

        rl_step = self._rl_algorithm.train_step(
            exp._replace(observation=observation), state.rl)

        new_state = new_state._replace(rl=rl_step.state)
        info = info._replace(rl=rl_step.info)

        if self._entropy_target_algorithm:
            et_step = self._entropy_target_algorithm.train_step(
                rl_step.action, step_type=exp.step_type)
            info = info._replace(entropy_target=et_step.info)

        return PolicyStep(action=rl_step.action, state=new_state, info=info)
Example #24
 def greedy_predict(self, time_step: ActionTimeStep, state=None):
     action, state = self._actor_network(time_step.observation,
                                         step_type=time_step.step_type,
                                         network_state=state)
     return PolicyStep(action=action, state=state, info=())
Example #25
 def distribution(self, time_step, policy_state=()):
   del policy_state
   action = self.action(time_step).action
   return PolicyStep(action=_MockDistribution(action))
Example #26
    def _prepare_specs(self, algorithm):
        """Prepare various tensor specs."""
        def extract_spec(nest):
            return tf.nest.map_structure(
                lambda t: tf.TensorSpec(t.shape[1:], t.dtype), nest)

        time_step = self.get_initial_time_step()
        self._time_step_spec = extract_spec(time_step)
        self._action_spec = self._env.action_spec()

        policy_step = algorithm.predict(time_step, self._initial_state)
        info_spec = extract_spec(policy_step.info)
        self._pred_policy_step_spec = PolicyStep(
            action=self._action_spec,
            state=algorithm.predict_state_spec,
            info=info_spec)

        def _to_distribution_spec(spec):
            if isinstance(spec, tf.TensorSpec):
                return DistributionSpec(tfp.distributions.Deterministic,
                                        input_params_spec={"loc": spec},
                                        sample_spec=spec)
            return spec

        self._action_distribution_spec = tf.nest.map_structure(
            _to_distribution_spec, algorithm.action_distribution_spec)
        self._action_dist_param_spec = tf.nest.map_structure(
            lambda spec: spec.input_params_spec,
            self._action_distribution_spec)

        self._experience_spec = Experience(
            step_type=self._time_step_spec.step_type,
            reward=self._time_step_spec.reward,
            discount=self._time_step_spec.discount,
            observation=self._time_step_spec.observation,
            prev_action=self._action_spec,
            action=self._action_spec,
            info=info_spec,
            action_distribution=self._action_dist_param_spec)

        action_dist_params = common.zero_tensor_from_nested_spec(
            self._experience_spec.action_distribution, self._env.batch_size)
        action_dist = nested_distributions_from_specs(
            self._action_distribution_spec, action_dist_params)
        exp = Experience(step_type=time_step.step_type,
                         reward=time_step.reward,
                         discount=time_step.discount,
                         observation=time_step.observation,
                         prev_action=time_step.prev_action,
                         action=time_step.prev_action,
                         info=policy_step.info,
                         action_distribution=action_dist)

        processed_exp = algorithm.preprocess_experience(exp)
        self._processed_experience_spec = self._experience_spec._replace(
            info=extract_spec(processed_exp.info))

        policy_step = common.algorithm_step(
            algorithm,
            ob_transformer=self._observation_transformer,
            time_step=exp,
            state=common.get_initial_policy_state(self._env.batch_size,
                                                  algorithm.train_state_spec),
            training=True)
        info_spec = extract_spec(policy_step.info)
        self._training_info_spec = make_training_info(
            action=self._action_spec,
            action_distribution=self._action_dist_param_spec,
            step_type=self._time_step_spec.step_type,
            reward=self._time_step_spec.reward,
            discount=self._time_step_spec.discount,
            info=info_spec,
            collect_info=self._processed_experience_spec.info,
            collect_action_distribution=self._action_dist_param_spec)
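The local `extract_spec` helper above turns a batched tensor into a per-example `tf.TensorSpec` by dropping the leading batch dimension via `t.shape[1:]`; Example #20 presumably relies on `common.extract_spec` for the same purpose. For example:

    import tensorflow as tf

    batched = tf.zeros([32, 4], dtype=tf.float32)          # a batch of 32 observations
    spec = tf.TensorSpec(batched.shape[1:], batched.dtype)
    print(spec)   # -> TensorSpec(shape=(4,), dtype=tf.float32, name=None)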
Example #27
 def action(self, time_step):
     del time_step
     action = tf.constant(self._action, dtype=tf.float32, shape=[1])
     return PolicyStep(action=action)