Example #1
            action = actor.act(obs0)  # TODO add noise for exploration
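            # Exploration sketch (an assumption, not part of the original code):
            # one way to address the TODO above is to perturb the deterministic
            # action with Gaussian noise; the DDPG paper instead uses an
            # Ornstein-Uhlenbeck process. The 0.1 scale is illustrative only.
            action = action + np.random.normal(0.0, 0.1, size=action_dim)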
            obs1, reward, done, info = env.step(action)
            # Store the terminal flag (rather than the timestep) so the critic
            # target below can stop bootstrapping at episode boundaries.
            replay_buffer.add(obs0.reshape(state_dim),
                              action.reshape(action_dim), reward, done,
                              obs1.reshape(state_dim))

            if replay_buffer.size() > MINIBATCH_SIZE:
                minibatch = replay_buffer.sample_batch(MINIBATCH_SIZE)
                s0_batch, a_batch, r_batch, done_batch, s1_batch = minibatch

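                # Bellman target from the target networks:
                # y = r + GAMMA * Q'(s1, mu'(s1)), with the bootstrap term
                # dropped for terminal transitions.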
                actor_target_batch = actor.predict_target(s1_batch)
                q_target_batch = critic.predict_target(
                    np.hstack((s1_batch, actor_target_batch)))
                target_batch = r_batch + GAMMA * q_target_batch * (1.0 - done_batch)

                loss = critic.learn(np.hstack((s0_batch, a_batch)),
                                    target_batch)
                # TODO update actor policy
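                # Actor update sketch for the TODO above (an assumption about
                # this codebase's API, not confirmed by the original): DDPG
                # ascends the sampled policy gradient grad_a Q(s, a)|a=mu(s),
                # pushed back through the actor network. actor.predict,
                # critic.action_gradients and actor.learn are hypothetical
                # method names used for illustration.
                a_outs = actor.predict(s0_batch)
                a_grads = critic.action_gradients(np.hstack((s0_batch, a_outs)))
                actor.learn(s0_batch, a_grads)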

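                # Move the target networks toward the learned networks
                # (typically a soft/Polyak update in DDPG).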
                actor.update_target()
                critic.update_target()

            obs0 = obs1
            ep_reward += reward[0]
            if done:
                break
        total += ep_reward
        print("Episode {0:8d}: {1:4d} timesteps, {2:4f} average".format(
            episode, t, total / (episode + 1)))
        print("critic loss: {}".format(loss))