# initialize mdp state structure
mdp = MDP_state(STATE_SIZE_POST, FRAMES)

# initialize replay buffer
R = BipolarReplayBuffer(MDP_STATE_SIZE, 1, BUFFER_SIZE)
buf = R.LoadBuffer(OUT_DIR + BUFFER_FILE)
if buf:
    EXP_PROB = EPSILON
    populated = R.GetOccupency()
    print("Replay buffer loaded from disk, occupied: " + str(populated))
else:
    print("Creating new replay buffer")

# initialize logger
L = Logger()
log_not_empty = L.Load(OUT_DIR + LOG_FILE)
if log_not_empty:
    print("Log file loaded")
else:
    print("Creating new log file")
    L.AddNewLog('network_left')
    L.AddNewLog('network_middle')
    L.AddNewLog('network_right')
    L.AddNewLog('policy_left')
    L.AddNewLog('policy_middle')
    L.AddNewLog('policy_right')

# load saved model
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
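
# NOTE: the Logger helper used above (and again in train() below) is not
# defined in this snippet. The class below is a minimal illustrative sketch
# that matches the AddNewLog / AddRecord / Load / Save calls made here; the
# pickle-based persistence is an assumption, not the project's actual
# implementation.
import pickle

class Logger:
    def __init__(self):
        # one list of records per named statistic
        self.logs = {}

    def AddNewLog(self, name):
        # register an empty record list for a named statistic
        self.logs[name] = []

    def AddRecord(self, name, value):
        # append one measurement to a named statistic
        self.logs[name].append(value)

    def Load(self, path):
        # return True if an existing log file was found and loaded
        try:
            with open(path, 'rb') as f:
                self.logs = pickle.load(f)
            return True
        except IOError:
            return False

    def Save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.logs, f)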
def train(sess, env, actor, critic):
    # evaluation environments for the three fixed obstacle configurations
    env_left = gym.make(ENV_LEFT)
    env_middle = gym.make(ENV_MIDDLE)
    env_right = gym.make(ENV_RIGHT)

    # initialize logger
    L = Logger()
    log_not_empty = L.Load(LOG_FILE)
    if log_not_empty:
        print("Log file loaded")
    else:
        print("Creating new log file")
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('total_reward')
        L.AddNewLog('estimated_value')
        L.AddNewLog('network_random')

    simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None)

    # Set up summary ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Exploration noise process
    n = OUnoise(INPUT)

    for i in xrange(MAX_EPISODES):

        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0
        n.Reset()

        for j in xrange(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Add exploration noise
            # a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j))
            a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample()

            s2, r, terminal, info = env.step(a[0])
            r += -0.5  # constant per-step penalty

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break

        summary_str = sess.run(summary_ops, feed_dict={
            summary_vars[0]: ep_reward,
            summary_vars[1]: ep_ave_max_q / float(j)
        })
        writer.add_summary(summary_str, i)
        writer.flush()

        print('episode %d | Reward: %.2i | Qmax: %.4f' %
              (i, int(ep_reward), ep_ave_max_q / float(j)))

        # log statistics
        L.AddRecord('network_left',
                    simulator.SimulateContNeuralEpisode(actor, sess, env_left, False))
        L.AddRecord('network_middle',
                    simulator.SimulateContNeuralEpisode(actor, sess, env_middle, False))
        L.AddRecord('network_right',
                    simulator.SimulateContNeuralEpisode(actor, sess, env_right, False))

        # average return over 10 randomized episodes
        temp_r = 0
        for rand_i in xrange(10):
            temp_r = temp_r + simulator.SimulateContNeuralEpisode(
                actor, sess, env, False) * 0.1
        L.AddRecord('network_random', temp_r)
        L.AddRecord('total_reward', ep_reward)

        # estimate the value function over a sample of stored states
        if replay_buffer.size() > V_EST:
            num = V_EST
        else:
            num = replay_buffer.size()
        s_batch, a_batch, r_batch, t_batch, s2_batch = \
            replay_buffer.sample_batch(num)
        Q = critic.predict(s_batch, actor.predict(s_batch))
        V_est = Q.sum() / float(num)
        L.AddRecord('estimated_value', V_est)

        if i % SAVE_RATE == 0:
            L.Save(LOG_FILE)
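
# NOTE: the OUnoise class instantiated in train() is not defined in this
# snippet. The sketch below is one possible implementation of the
# Ornstein-Uhlenbeck exploration noise commonly used with DDPG, matching the
# Reset() / Sample() interface used above; the mu/theta/sigma defaults are
# assumptions, not the project's actual values.
import numpy as np

class OUnoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu          # long-run mean of the process
        self.theta = theta    # mean-reversion rate
        self.sigma = sigma    # scale of the Gaussian perturbation
        self.Reset()

    def Reset(self):
        # start each episode from the mean of the process
        self.state = np.ones(self.action_dim) * self.mu

    def Sample(self):
        # x <- x + theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state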