Example #1
# initialize mdp state structure
mdp = MDP_state(STATE_SIZE_POST, FRAMES)

# initialize replay buffer
R = BipolarReplayBuffer(MDP_STATE_SIZE, 1, BUFFER_SIZE)
buf = R.LoadBuffer(OUT_DIR + BUFFER_FILE)
if buf:
    EXP_PROB = EPSILON
    populated = R.GetOccupency()
    print("Replay buffer loaded from disk, occupied: " + str(populated))
else:
    print("Creating new replay buffer")

# initialize logger
L = Logger()
log_not_empty = L.Load(OUT_DIR + LOG_FILE)
if log_not_empty:
    print("Log file loaded")
else:
    ("Creating new log file")
    L.AddNewLog('network_left')
    L.AddNewLog('network_middle')
    L.AddNewLog('network_right')
    L.AddNewLog('policy_left')
    L.AddNewLog('policy_middle')
    L.AddNewLog('policy_right')

# load saved model
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
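
Note that the checkpoint block above only restores when a checkpoint exists; if none is found, the session variables are left uninitialized. A minimal sketch of the usual restore-or-initialize pattern, assuming sess and a tf.train.Saver named saver were created earlier, as in this snippet:

ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    # resume from the latest saved model
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Model restored from " + ckpt.model_checkpoint_path)
else:
    # no checkpoint on disk: start training from freshly initialized weights
    sess.run(tf.initialize_all_variables())
    print("No checkpoint found, initializing variables from scratch")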
Example #2
File: ddpg.py  Project: ataitler/DQN
def train(sess, env, actor, critic):

    env_left = gym.make(ENV_LEFT)
    env_middle = gym.make(ENV_MIDDLE)
    env_right = gym.make(ENV_RIGHT)
    L = Logger()
    log_not_empty = L.Load(LOG_FILE)
    if log_not_empty:
        print("Log file loaded")
    else:
        ("Creating new log file")
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('total_reward')
        L.AddNewLog('estimated_value')
        L.AddNewLog('network_random')

    simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None)

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    n = OUnoise(INPUT)
    for i in xrange(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0
        n.Reset()
        for j in xrange(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Add exploration noise to the deterministic action
            #a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j))
            a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample()

            s2, r, terminal, info = env.step(a[0])
            r += -0.5

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break

        summary_str = sess.run(summary_ops,
                               feed_dict={
                                   summary_vars[0]: ep_reward,
                                   summary_vars[1]: ep_ave_max_q / float(j)
                               })

        writer.add_summary(summary_str, i)
        writer.flush()

        print 'episode ', i, ' | Reward: %.2i' % int(ep_reward), " | Episode", i, \
            '| Qmax: %.4f' % (ep_ave_max_q / float(j))

        # log statistics
        L.AddRecord(
            'network_left',
            simulator.SimulateContNeuralEpisode(actor, sess, env_left, False))
        L.AddRecord(
            'network_middle',
            simulator.SimulateContNeuralEpisode(actor, sess, env_middle,
                                                False))
        L.AddRecord(
            'network_right',
            simulator.SimulateContNeuralEpisode(actor, sess, env_right, False))
        temp_r = 0
        for rand_i in xrange(10):
            temp_r = temp_r + simulator.SimulateContNeuralEpisode(
                actor, sess, env, False) * 0.1
        L.AddRecord('network_random', temp_r)
        L.AddRecord('total_reward', ep_reward)
        if replay_buffer.size() > V_EST:
            num = V_EST
        else:
            num = replay_buffer.size()
        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
            num)
        Q = critic.predict(s_batch, actor.predict(s_batch))
        V_est = Q.sum() / float(num)
        L.AddRecord('estimated_value', V_est)

        if i % SAVE_RATE == 0:
            L.Save(LOG_FILE)
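
The calls to update_target_network() inside the training loop perform DDPG's soft target update. A hedged sketch of what such a method typically builds, assuming TAU is the mixing rate and that each network exposes its variable lists; the names online_params and target_params are illustrative and not taken from this project:

# Soft (Polyak) target update: target <- TAU * online + (1 - TAU) * target
soft_update_ops = [
    target_var.assign(TAU * online_var + (1.0 - TAU) * target_var)
    for online_var, target_var in zip(online_params, target_params)
]
sess.run(soft_update_ops)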