Example #1 — saving a ReplayBuffer to disk and recovering it from the saved trajectory files
from tf_rl.common.memory import ReplayBuffer
from tf_rl.common.wrappers import wrap_deepmind, make_atari

size = 100000

env = wrap_deepmind(make_atari("PongNoFrameskip-v4"))
memory = ReplayBuffer(size=size, traj_dir="./traj/")

# collect a single transition and reuse it to fill the buffer to capacity
state = env.reset()
action = env.action_space.sample()
next_state, reward, done, info = env.step(action)
env.close()

for _ in range(size):
    memory.add(state, action, reward, next_state, done)
print(len(memory))

# write the buffer contents to ./traj/
memory.save()

# rebuild the buffer from disk and confirm it holds the same number of transitions
del memory
memory = ReplayBuffer(size=size, recover_data=True, traj_dir="./traj/")
print(len(memory))
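As a quick follow-up, the recovered buffer can be checked by drawing a minibatch from it. This is a minimal sketch that assumes the same sample(batch_size=...) API used in Example #3; the unpacked variable names are illustrative.

# hypothetical sanity check on the recovered buffer;
# sample(batch_size=...) is assumed from its use in Example #3
states, actions, rewards, next_states, dones = memory.sample(batch_size=32)
print(states.shape, rewards.shape)
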
Example #2 — fragment of a DDPG-style training loop that fills the replay buffer (env, agent, replay_buffer, and the tf.Variable global_timestep are assumed to be created beforehand)
import itertools
import time

for i in itertools.count():
    state = env.reset()
    total_reward = 0
    start = time.time()
    agent.random_process.reset_states()  # reset the exploration noise at the start of each episode
    done = False
    episode_len = 0
    while not done:
        # env.render()
        if global_timestep.numpy() < agent.params.learning_start:
            # pure random exploration until learning starts
            action = env.action_space.sample()
        else:
            action = agent.predict(state)
        # scale for execution in env (in DDPG, every action is clipped to [-1, 1] in agent.predict)
        next_state, reward, done, info = env.step(action * env.action_space.high)
        replay_buffer.add(state, action, reward, next_state, done)

        global_timestep.assign_add(1)
        episode_len += 1
        total_reward += reward
        state = next_state

        # for evaluation purposes
        if global_timestep.numpy() % agent.params.eval_interval == 0:
            agent.eval_flg = True

    """
    ===== After 1 Episode is Done =====
    """

    # train the model at this point
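The fragment stops at the point where a gradient update would happen. Below is a minimal sketch of what that step could look like, assuming the batch-sampling API shown in Example #3 and a hypothetical agent.update(...) that takes one minibatch of transitions; the actual tf_rl training routine may differ.

# hypothetical post-episode training step: replay_buffer.sample(...) follows
# the usage in Example #3, and the unpack order mirrors add(state, action,
# reward, next_state, done); agent.update's argument list is an assumption
if global_timestep.numpy() > agent.params.learning_start:
    for _ in range(episode_len):
        states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size=32)
        agent.update(states, actions, rewards, next_states, dones)
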
Example #3 — imitation learning: the expert labels states drawn from the agent's replay buffer, and the agent is trained to match those labels
import numpy as np
import tensorflow as tf

# env, agent, expert, and buffer are assumed to be created beforehand
reward_total = list()

def ask_expert(states):
    # label the given states with the expert policy's greedy actions
    expert_action = expert.main_model(states)
    expert_action = tf.argmax(expert_action, axis=-1)
    return expert_action

for epoch in range(300):
    state = env.reset()
    done = False
    reward_ep = 0
    while not done:
        if epoch <= 1:
            # seed the buffer with random actions for the first two episodes
            action = env.action_space.sample()
        else:
            action = agent.select_action(state=state)
            action = np.squeeze(action).astype(np.int8)
        next_state, reward, done, info = env.step(action)
        buffer.add(state, action, reward, next_state, done)
        state = next_state
        reward_ep += reward
    reward_total.append(reward_ep)

    # fit the agent to the expert's action labels on states sampled from its own buffer
    losses = list()
    for grad_step in range(10):
        states, _, _, _, _ = buffer.sample(batch_size=32)
        expert_action = ask_expert(states)
        loss = agent.update(states, expert_action)
        losses.append(loss.numpy())

    print("Ep: {} Reward: {} MAR: {:.4f} Loss: {:.4f}".format(
        epoch, reward_ep, np.mean(reward_total), np.mean(losses)))

env.close()