from tf_agents.environments import tf_py_environment
from tf_agents.policies import tf_policy
from tf_agents.replay_buffers.replay_buffer import ReplayBuffer
from tf_agents.trajectories import trajectory

def collect_steps(env: tf_py_environment.TFPyEnvironment,
                  policy: tf_policy.TFPolicy, buffer: ReplayBuffer):
    # Step the environment once with the given policy.
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    # Package the transition as a Trajectory and store it in the buffer.
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    buffer.add_batch(traj)
Example #2
from tf_agents.environments import tf_py_environment
from tf_agents.trajectories import trajectory

def collect_step(env: tf_py_environment.TFPyEnvironment, policy, buffer):
    time_step = env.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = env.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    buffer.add_batch(traj)
Example #3
import typing

from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.policies import tf_policy
from tf_agents.replay_buffers.replay_buffer import ReplayBuffer
from tf_agents.trajectories import trajectory

def step(
    environment: TFPyEnvironment, policy: tf_policy.TFPolicy, replay_buffer: ReplayBuffer
) -> typing.Tuple[float, bool]:
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    replay_buffer.add_batch(traj)
    # Unbatch the reward and episode-end flag so plain Python values are returned.
    return next_time_step.reward.numpy()[0], bool(next_time_step.is_last().numpy()[0])
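
All three collectors follow the same TF-Agents pattern: read the current batched time step, query the policy for an action, step the environment, and store the resulting transition as a Trajectory. The sketch below shows how such a collector is typically wired into a setup before training; the CartPole environment, DQN agent, and hyperparameters are illustrative assumptions, not taken from the examples above.

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import q_network
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Wrap a Gym environment so time steps and actions are batched tensors.
env = tf_py_environment.TFPyEnvironment(suite_gym.load('CartPole-v0'))

q_net = q_network.QNetwork(env.observation_spec(), env.action_spec())
agent = dqn_agent.DqnAgent(
    env.time_step_spec(),
    env.action_spec(),
    q_network=q_net,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3))
agent.initialize()

# The buffer's data spec must match the trajectories the agent emits.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=env.batch_size,
    max_length=10000)

# Gather initial experience with the agent's collect policy,
# using the collect_step helper from Example #2 above.
for _ in range(100):
    collect_step(env, agent.collect_policy, replay_buffer)

Using agent.collect_data_spec for the buffer is what makes buffer.add_batch(traj) inside the collectors type-check against the trajectories produced by trajectory.from_transition.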