Example #1
    print("Model loaded from disk")

# define action discretization
max_a = env.action_space.high[0]
min_a = env.action_space.low[0]

act = actions(ACTION_SIZE, max_a)
actions_deque, _ = act.get_action()
discretizer = Discretizer(actions_deque)
policy = ChasePolicy(STATE_SIZE, ACTION_SIZE, max_a, min_a)
n = OUnoise(2, 0.5, 50)

################
# simulators
################
simulator = Simulator(STEPS, STATE_SIZE, FRAMES, T, actions_deque)

# main learning loop
print("Starting to learn in environment: " + ENVIRONMENT)
steps = 0
for episode_i in xrange(1, EPISODES + 1):

    policy_exp = np.random.uniform()
    if policy_exp <= EPSILON:
        onPolicy = True
        n.Reset()
    else:
        onPolicy = False

    st = env.reset()
    mdp.reset()
Example #2
	print("Model loaded from disk")

# define action discretization
max_a = env.action_space.high[0]
min_a = env.action_space.low[0]

act = actions(ACTION_SIZE, max_a)
actions_deque, _ = act.get_action()
discretizer = Discretizer(actions_deque)
policy = ChasePolicy(STATE_SIZE, ACTION_SIZE, max_a, min_a)
n = OUnoise(2, 0.5, NOISE)

################
# simulators
################
simulator = Simulator(STEPS, STATE_SIZE, FRAMES, T, actions_deque)

# main learning loop
print("Starting to learn in environment: " + ENVIRONMENT)
steps = 0
for episode_i in xrange(1, EPISODES + 1):

	policy_exp = np.random.uniform()
	n.Reset()
	if policy_exp <= EPSILON_P:
		onPolicy = True
	else:
		onPolicy = False
	
	st = env.reset()
	mdp.reset() 
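
Examples #1 and #2 decide once per episode, via a uniform draw against EPSILON / EPSILON_P, whether to follow the scripted ChasePolicy (on-policy guidance) or the learned network perturbed by OU noise; both snippets are cut off before the onPolicy flag is actually used. The sketch below shows how such a switch typically looks inside the step loop. It is an assumption for illustration only: the actor object and the policy.action, actor.predict and discretizer.discretize methods are hypothetical and not taken from this page.

# Hypothetical per-step action selection; the actor object and the
# policy.action / actor.predict / discretizer.discretize methods are assumptions.
if onPolicy:
    a_t = policy.action(st)                       # scripted ChasePolicy supplies the action
else:
    a_t = actor.predict(np.reshape(st, (1, STATE_SIZE)))[0] + n.Sample()  # learned actor + OU noise
a_t = discretizer.discretize(a_t)                 # snap to the nearest discretized action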
Example #3
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Model loaded from disk")

# define action discretization
max_a = env.action_space.high[0]
min_a = env.action_space.low[0]

act = actions(ACTION_SIZE, max_a)
actions_deque, _ = act.get_action()
discretizer = Discretizer(actions_deque)

################
# simulators
################
simulator = Simulator(STEPS, STATE_SIZE, FRAMES, T, actions_deque)

# main learning loop
print("Starting to learn in environment: " + ENVIRONMENT)
steps = steps_counter.evaluate(sess)
C_steps_counter.evaluate(sess)
for episode_i in xrange(1, EPISODES + 1):
    episodes_counter.increment(sess)
    st = env.reset()
    mdp.add_frame(st)
    st = mdp.get_MDP_state()
    totalR = 0
    totalE = 0
    for t in xrange(1, STEPS + 1):
        if DISPLAY:
            env.render()
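
All of the snippets on this page drive exploration through an OUnoise object using only Reset() at the start of an episode and Sample() per step; the class itself is not shown here. The following is a minimal, self-contained Ornstein-Uhlenbeck noise sketch with the same two method names, written as an assumption about what the repo's class roughly does; the constructor parameters (dim, theta, sigma, mu, dt) are placeholders and need not match OUnoise's actual signature.

import numpy as np

class OUNoiseSketch(object):
    """Hypothetical stand-in for OUnoise; not the repo's actual implementation."""

    def __init__(self, dim, theta=0.15, sigma=0.3, mu=0.0, dt=1.0):
        self.dim = dim
        self.theta = theta      # mean-reversion rate
        self.sigma = sigma      # diffusion scale
        self.mu = mu            # long-run mean
        self.dt = dt
        self.Reset()

    def Reset(self):
        # restart the process at its long-run mean at the beginning of an episode
        self.state = np.ones(self.dim) * self.mu

    def Sample(self):
        # one Euler step of dx = theta * (mu - x) dt + sigma * sqrt(dt) * dW
        x = self.state
        dx = self.theta * (self.mu - x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.randn(self.dim)
        self.state = x + dx
        return self.state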
Example #4
File: ddpg.py  Project: ataitler/DQN
def train(sess, env, actor, critic):

    env_left = gym.make(ENV_LEFT)
    env_middle = gym.make(ENV_MIDDLE)
    env_right = gym.make(ENV_RIGHT)
    L = Logger()
    log_not_empty = L.Load(LOG_FILE)
    if log_not_empty:
        print("Log file loaded")
    else:
        ("Creating new log file")
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('total_reward')
        L.AddNewLog('estimated_value')
        L.AddNewLog('network_random')

    simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None)

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    n = OUnoise(INPUT)
    for i in xrange(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0
        n.Reset()
        for j in xrange(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Added exploration noise
            #a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j))
            a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample()

            s2, r, terminal, info = env.step(a[0])
            r += -0.5

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r, \
                terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break

        summary_str = sess.run(summary_ops,
                               feed_dict={
                                   summary_vars[0]: ep_reward,
                                   summary_vars[1]: ep_ave_max_q / float(j)
                               })

        writer.add_summary(summary_str, i)
        writer.flush()

        print('Episode: %d | Reward: %.2i | Qmax: %.4f' %
              (i, int(ep_reward), ep_ave_max_q / float(j)))

        # log statistics
        L.AddRecord(
            'network_left',
            simulator.SimulateContNeuralEpisode(actor, sess, env_left, False))
        L.AddRecord(
            'network_middle',
            simulator.SimulateContNeuralEpisode(actor, sess, env_middle,
                                                False))
        L.AddRecord(
            'network_right',
            simulator.SimulateContNeuralEpisode(actor, sess, env_right, False))
        temp_r = 0
        for rand_i in xrange(10):
            temp_r = temp_r + simulator.SimulateContNeuralEpisode(
                actor, sess, env, False) * 0.1
        L.AddRecord('network_random', temp_r)
        L.AddRecord('total_reward', ep_reward)
        if replay_buffer.size() > V_EST:
            num = V_EST
        else:
            num = replay_buffer.size()
        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
            num)
        Q = critic.predict(s_batch, actor.predict(s_batch))
        V_est = Q.sum() / num * 1.0
        L.AddRecord('estimated_value', V_est)

        if i % SAVE_RATE == 0:
            L.Save(LOG_FILE)
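
After every minibatch update the loop above calls actor.update_target_network() and critic.update_target_network(); their bodies are not shown on this page. In DDPG this is conventionally a Polyak (soft) update of the target weights toward the online weights. The helper below is a self-contained TensorFlow 1.x sketch of that idea, under the assumption that the repo follows the standard scheme; the name make_soft_update_op and the tau argument are placeholders, not this repo's API.

import tensorflow as tf

def make_soft_update_op(online_vars, target_vars, tau):
    # target <- tau * online + (1 - tau) * target, applied variable by variable
    return [target.assign(tau * online + (1.0 - tau) * target)
            for online, target in zip(online_vars, target_vars)]

# Usage sketch: build the ops once, then run them after each training step, e.g.
#   soft_update_ops = make_soft_update_op(online_vars, target_vars, tau=0.001)
#   sess.run(soft_update_ops)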
Example #5
if log_not_empty:
    print("Log file loaded")
else:
    ("Creating new log file")
    if ENVIRONMENT_NAME == 'Hockey-v2':
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('network_random')
#	L.AddNewLog('error')
    L.AddNewLog('total_reward')
    L.AddNewLog('estimated_value')
    L.AddNewLog('network_random')

if ENVIRONMENT_NAME == 'Hockey-v2':
    simulator = Simulator(STEPS, STATE_SIZE, FRAMES, T, None)
steps = steps_counter.evaluate(sess)
C_steps_counter.evaluate(sess)
for ep in range(EPISODES):
    episodes_counter.increment(sess)
    # open up a game state
    s_t, r_0, done = env.reset(), 0, False
    n.Reset()
    REWARD = 0
    totalR = 0
    totalE = 0
    # exploration.reset()
    for t in range(STEPS):
        if DISPLAY:
            env.render()
        # select action according to current policy and exploration noise