Example #1
        ep_reward = 0
        for t in range(max_steps):
            if episode % 25 == 0:
                env.render()
            action = actor.act(obs0)  # TODO add noise for exploration
            obs1, reward, done, info = env.step(action)
            replay_buffer.add(obs0.reshape(state_dim),
                              action.reshape(action_dim), reward, done,
                              obs1.reshape(state_dim))

            if replay_buffer.size() > MINIBATCH_SIZE:
                minibatch = replay_buffer.sample_batch(MINIBATCH_SIZE)
                s0_batch, a_batch, r_batch, t_batch, s1_batch = minibatch

                actor_target_batch = actor.predict_target(s1_batch)
                q_target_batch = critic.predict_target(
                    np.hstack((s1_batch, actor_target_batch)))
                target_batch = r_batch + GAMMA * q_target_batch

                loss = critic.learn(np.hstack((s0_batch, a_batch)),
                                    target_batch)
                # TODO update actor policy

                actor.update_target()
                critic.update_target()

            obs0 = obs1
            ep_reward += reward[0]
            if done:
                break
        total += ep_reward
        print("Episode {0:8d}: {1:4d} timesteps, {2:4f} average".format(
Example #2
File: main.py Project: yosider/RLSnipets
def main():
    with tf.Session() as sess:

        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        #actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

        #TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # initialize target net
        actor.update_target_network()
        critic.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # main loop.
        for ep in range(MAX_EPISODES):

            episode_reward = 0
            ep_batch_avg_q = 0

            s = ENV.reset()

            for step in range(MAX_EP_STEPS):

                a = actor.predict(np.reshape(s,
                                             (1, STATE_DIM)))  #+ actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                #print(s2)

                replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                                  np.reshape(a, (ACTION_DIM,)),
                                  r,
                                  terminal,
                                  np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                    step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the critic's target values.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:
                            # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    #TODO: predQ comes from a random minibatch, not from one episode, so the episode_avg_max statistic is not appropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    #print(grads[0].shape)
                    #exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this only be done once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)

                    break
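
Example #2 leaves the exploration noise as a TODO and keeps the OrnsteinUhlenbeckActionNoise call commented out. Below is a minimal, self-contained sketch of such a noise process, with a constructor signature matching the commented-out call OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM)); the theta, sigma, and dt defaults are conventional DDPG choices, not values taken from this project:

import numpy as np

class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated noise for exploring continuous action spaces."""

    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x = np.copy(mu)

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, I)
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x

# Usage, matching the commented-out lines above:
# actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))
# a = actor.predict(np.reshape(s, (1, STATE_DIM))) + actor_noise()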
Example #3
File: main.py Project: yosider/RLSnipets
            s2, r, terminal, info = ENV.step(a[0])

            replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                              np.reshape(a, (ACTION_DIM,)),
                              r,
                              terminal,
                              np.reshape(s2, (STATE_DIM,)))

            # Batch sampling.
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Compute the target Q values.
                target_action = actor.predict_target(s2_batch)
                target_q = critic.predict_target(s2_batch, target_action)

                # Compute the critic's target values.
                targets = []
                for i in range(MINIBATCH_SIZE):
                    if t_batch[i]:
                        # terminal
                        targets.append(r_batch[i])
                    else:
                        targets.append(r_batch[i] + GAMMA * target_q[i])

                # Train the critic.
                #TODO: predQ comes from a random minibatch, not from one episode, so the episode_avg_max statistic is not appropriate.
                pred_q, _ = critic.train(
                    s_batch, a_batch, np.reshape(targets, (MINIBATCH_SIZE, 1)))
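
All three snippets refresh the target networks after every training step (update_target() in Example #1, update_target_network() in Examples #2 and #3), but none of them shows the update itself. In standard DDPG this is a soft ("Polyak") update, theta_target <- TAU * theta + (1 - TAU) * theta_target, where TAU is the small mixing factor passed to the ActorNetwork and CriticNetwork constructors. A TF1-style sketch of how such update ops are typically built; the variable lists passed in are placeholders for illustration, not names from this project:

import tensorflow as tf

def build_soft_update_ops(online_vars, target_vars, tau):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target
    return [t_var.assign(tau * o_var + (1.0 - tau) * t_var)
            for o_var, t_var in zip(online_vars, target_vars)]

# Build the ops once after graph construction, then run them every training step:
# update_ops = build_soft_update_ops(actor_vars, actor_target_vars, TAU)
# sess.run(update_ops)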