import numpy as np
import tensorflow as tf

# ActorNetwork, CriticNetwork, ReplayBuffer and OUNoise are defined in the
# accompanying modules of this project.


class DDPG():
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Hyperparameters
        self.actor_lr = 0.0001
        self.critic_lr = 0.001
        self.tau = 0.001
        self.minibatch_size = 64
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        #self.max_episode = 100
        #self.max_episode_len = 100
        self.mu = 0

        # Actor and critic networks (each with its target copy)
        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)
        self.critic = CriticNetwork(self.sess, self.state_size, self.action_size,
                                    self.critic_lr, self.tau, self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)

        self.sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        self.actor.update_target_network()
        self.critic.update_target_network()

        # Ornstein-Uhlenbeck exploration noise
        self.noise = OUNoise(self.action_size, self.mu)

    def reset_episode(self):
        #self.actor_noise.reset()
        state = self.env.reset()
        self.last_state = state
        self.ep_ave_max_q = 0
        self.ep_reward = 0
        return state

    def step(self, s, a, r, terminal, s2):
        # Save experience / reward
        #self.memory.add(self.last_state, action, reward, next_state, done)
        #summary_ops, summary_vars = self.build_summaries()
        self.replay_buffer.add(np.reshape(s, (self.actor.s_dim,)),
                               np.reshape(a, (self.actor.a_dim,)),
                               r, terminal,
                               np.reshape(s2, (self.actor.s_dim,)))

        # Learn, if enough samples are available in memory
        if self.replay_buffer.size() > self.minibatch_size:
            s_batch, a_batch, r_batch, t_batch, s2_batch = \
                self.replay_buffer.sample_batch(self.minibatch_size)
            self.train(s_batch, a_batch, r_batch, t_batch, s2_batch)

        # Roll over last state
        self.last_state = s2

        '''
        self.ep_reward += r
        if terminal:
            summary_str = self.sess.run(
                summary_ops,
                feed_dict={summary_vars[0]: self.ep_reward,
                           summary_vars[1]: self.ep_ave_max_q / float(j)})
            writer.add_summary(summary_str, i)
            #writer.flush()
            print('| Reward: {:d} | Qmax: {:.4f}'.format(
                int(self.ep_reward), self.ep_ave_max_q / float(j)))
        '''

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor.predict(states)[0]
        #actornoises = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size))
        #print(actions)
        return actions + self.noise.sample()  # add some noise for exploration

    def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch):
        # Critic target: Q'(s', mu'(s')) from the target networks
        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))

        y_i = []
        for k in range(self.minibatch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))
        #self.ep_ave_max_q += np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks (soft update with rate tau)
        self.actor.update_target_network()
        self.critic.update_target_network()

    def build_summaries(self):
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("Reward", episode_reward)
        episode_ave_max_q = tf.Variable(0.)
        tf.summary.scalar("Qmax Value", episode_ave_max_q)
        summary_vars = [episode_reward, episode_ave_max_q]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars
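# -----------------------------------------------------------------------------
# A minimal usage sketch for the agent above, assuming `task` follows the usual
# gym-style reset()/step() interface and exposes the state_size / action_size /
# action_low / action_high attributes read in __init__. `make_task` and the
# episode count are illustrative placeholders, not part of the original code.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    task = make_task()  # hypothetical environment wrapper
    with tf.Session() as sess:
        agent = DDPG(task, sess)
        for episode in range(1000):
            state = agent.reset_episode()
            done = False
            while not done:
                action = agent.act(state)                            # noisy action from the current policy
                next_state, reward, done, _ = task.step(action)      # environment step
                agent.step(state, action, reward, done, next_state)  # store transition and learn
                state = next_state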
K.set_session(sess)

actor = ActorNetwork(sess, num_state, num_action, batch_size, tau, actor_alpha)
critic = CriticNetwork(sess, num_state, num_action, batch_size, tau, critic_alpha)
buff = ReplayBuffer(buffer_size)

with open('actor_model.json', 'w') as json_file:
    json_file.write(actor.model.to_json())
with open('critic_model.json', 'w') as json_file:
    json_file.write(critic.model.to_json())

print('start training')
best_r = -10000

actor.update_target_network()
critic.update_target_network()

try:
    for i in range(num_episode):
        total_reward = 0
        s = env.reset()
        s_t = np.hstack((s[0], s[1], s[2]))

        while True:
            #epsilon *= 0.995
            loss = 0.0
            #epsilon -= 1.0/10000.0
            a = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise = Ornstein_Uhlenbeck(a[0])
            #noise = max(epsilon, 0) * noise.function(a[0], 0.0, 0.15, 0.3)
            #a = a[0] + noise
            a = a[0] + noise()[0]
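# -----------------------------------------------------------------------------
# The snippets in this section rely on an Ornstein-Uhlenbeck process for
# exploration (OUNoise / Ornstein_Uhlenbeck) but never show one. Below is a
# minimal sketch of such a process, assuming the common discretisation
# x <- x + theta * (mu - x) + sigma * N(0, 1); the class name and the default
# theta/sigma values are illustrative, not taken from the original code.
# -----------------------------------------------------------------------------
import numpy as np


class OUNoiseSketch:
    """Time-correlated exploration noise for continuous actions."""

    def __init__(self, action_size, mu=0.0, theta=0.15, sigma=0.3):
        self.mu = mu * np.ones(action_size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process at its mean, typically once per episode
        self.state = np.copy(self.mu)

    def sample(self):
        # mean-reverting drift plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state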
def trainer(epochs=1000, MINIBATCH_SIZE=40, GAMMA=0.99, epsilon=1.0,
            min_epsilon=0.01, BUFFER_SIZE=10000, train_indicator=True,
            render=False):
    with tf.Session() as sess:
        # configure the environment
        env = gym.make(ENV_NAME)

        # configure the random seeds
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # environment info to pass to the agent
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = np.float64(10)  # chosen because MountainCarContinuous does not bound its action space

        # Create the agent
        ruido = OUNoise(action_dim, mu=0.4)  # Ornstein-Uhlenbeck noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, DEVICE)
        critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE,
                               TAU, actor.get_num_trainable_vars(), DEVICE)

        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

        goal = 0
        max_state = -1.

        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except:
            pass
            # print('********************************')
            # print('Failed to restore models')
            # print('********************************')

        for i in range(epochs):
            state = env.reset()
            state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1

            epsilon -= epsilon / EXPLORE
            epsilon = np.maximum(min_epsilon, epsilon)

            while not done:
                if render:
                    env.render()
                #print('step', step)

                # 1. get an action from the actor and add exploration noise
                action_original = actor.predict(np.reshape(state, (1, state_dim)))
                # + (10. / (10. + i)) * np.random.randn(1)
                action = action_original + max(epsilon, 0) * ruido.noise()

                # uncomment for a step-by-step trace
                # print(step, 'a', action_original, action, 's', state[0], 'max state', max_state_episode)

                # 2. take the action, observe the next state and reward
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. save the transition in the replay buffer
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                                      np.reshape(action, (actor.a_dim,)),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until there are
                    # at least minibatch-size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample a random minibatch of transitions
                        s_batch, a_batch, r_batch, t_batch, s2_batch = \
                            replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # 5. train the critic on the targets r + gamma * Q'(s', a')
                        # 5.1 critic target prediction Q'(s', a'), where a' = actor'(s')
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch))

                        # 5.2 build the targets y_i
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        # 5.3 train the critic
                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. update the actor with the sampled policy gradient
                        # 6.1 actions the current actor would take on the batch
                        a_outs = actor.predict(s_batch)
                        # 6.2 gradient of Q with respect to those actions
                        grads = critic.action_gradients(s_batch, a_outs)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()

                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]
                ep_reward = ep_reward + reward
                step += 1

                if done:
                    ruido.reset()
                    if state[0] > 0.45:
                        #print('****************************************')
                        #print('got it!')
                        #print('****************************************')
                        goal += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'n steps', step, 'R:', round(ep_reward, 3),
                  'epsilon', round(epsilon, 3),
                  'Efficiency', round(100. * (goal / (i + 1.)), 3))
            # print('Efficiency', 100. * (goal / (i + 1.)))

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
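# -----------------------------------------------------------------------------
# The trainers in this section assume a ReplayBuffer exposing add / size /
# sample_batch. A minimal deque-based sketch of that interface follows; the
# internal layout (a deque of tuples) is an assumption, not the original
# implementation.
# -----------------------------------------------------------------------------
import random
from collections import deque

import numpy as np


class ReplayBufferSketch:
    """Fixed-size experience memory with uniform random sampling."""

    def __init__(self, buffer_size, random_seed=1234):
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        # store one transition (state, action, reward, terminal, next_state)
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # uniform sample without replacement, capped at the current size
        batch = random.sample(list(self.buffer), min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2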
def trainer(env, outdir, epochs=100, MINIBATCH_SIZE=64, GAMMA=0.99,
            epsilon=0.01, min_epsilon=0.01, BUFFER_SIZE=10000,
            train_indicator=False, render=False):
    tf.reset_default_graph()
    with tf.Session(config=config) as sess:
        # configure the environment
        #env = gym.make(ENV_NAME)

        # configure the random seeds
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # environment info to pass to the agent
        state_dim = env.observation_space
        action_dim = env.action_space
        action_bound = np.float64(1)  # chosen because the environment does not bound its action space

        # Create the agent
        # FOR the RNN
        #tf.contrib.rnn.core_rnn_cell.BasicLSTMCell, see https://github.com/tensorflow/tensorflow/issues/8771
        #cell = tf.contrib.rnn.BasicLSTMCell(num_units=300, state_is_tuple=True, reuse=None)
        #cell_target = tf.contrib.rnn.BasicLSTMCell(num_units=300, state_is_tuple=True, reuse=None)
        ruido = OUNoise(action_dim, mu=0.4)  # Ornstein-Uhlenbeck noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, outdir)
        critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE,
                               TAU, actor.get_num_trainable_vars(), outdir)

        #sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()

        #goal = 0
        max_state = -1.

        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception as e:
            print('********************************')
            print(e)
            print('********************************')
            #critic.recover_critic()
            #actor.recover_actor()

        for i in range(epochs):
            state = env.reset()
            #state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1

            epsilon -= epsilon / EXPLORE
            if epsilon < min_epsilon:
                epsilon = min_epsilon

            while not done:
                if render:
                    env.render()
                #print('step', step)

                np.set_printoptions(precision=4)

                # 1. get an action from the actor (exploration noise currently disabled)
                action_original = actor.predict(np.reshape(state, (1, actor.s_dim)))
                # + (10. / (10. + i)) * np.random.randn(1)
                action = action_original  #+ max(epsilon, 0) * ruido.noise()

                # uncomment for a step-by-step trace
                #print(step, 'a', action_original, action, 's', state[0], 'max state', max_state_episode)
                '''
                for j in range(action.shape[1]):
                    if abs(action[0, j]) > 1:
                        act = action[0, j]
                        action[0, j] = act / abs(act)
                    else:
                        continue
                '''
                action = np.reshape(action, (actor.a_dim,))

                # 2. take the action, observe the next state and reward
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. save the transition in the replay buffer
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                                      np.reshape(action, (actor.a_dim,)),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until there are
                    # at least minibatch-size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample a random minibatch of transitions
                        s_batch, a_batch, r_batch, t_batch, s2_batch = \
                            replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # 5. train the critic on the targets r + gamma * Q'(s', a')
                        # 5.1 critic target prediction Q'(s', a'), where a' = actor'(s')
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch), 20)

                        # 5.2 build the targets y_i
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        # 5.3 train the critic
                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)), 20)
                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. update the actor with the sampled policy gradient
                        # 6.1 actions the current actor would take on the batch
                        a_outs = actor.predict(s_batch)
                        # 6.2 gradient of Q with respect to those actions
                        grads = critic.action_gradients(s_batch, a_outs, 20)
                        c = np.array(grads)
                        #print(c.shape)
                        #print('...', c[0].shape)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()

                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]
                ep_reward = ep_reward + reward
                step += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'Step', step, 'Reward:', ep_reward,
                  'Pos', next_state[0], next_state[1], 'epsilon', epsilon)

            print('*************************')
            print('now we save the model')
            critic.save_critic()
            actor.save_actor()
            print('model saved successfully')
            print('*************************')
            replay_buffer.save()

            #proc = Popen(['rosclean', 'purge'], stdout=PIPE, stdin=PIPE,
            #             stderr=PIPE, universal_newlines=True)
            #out, err = proc.communicate(input="{}\n".format("y"))

            #print('maximum state reached', max_state)
            #print('the reward at the end of the episode,', reward)
            #print('Efficiency', 100. * (goal / (i + 1.)))

        '''
        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
        replay_buffer.save()
        #env.close()
        '''
        sess.close()
        return 0
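# -----------------------------------------------------------------------------
# Every version above calls actor.update_target_network() and
# critic.update_target_network() after each learning step. In DDPG this is a
# "soft" update: the target weights track the online weights at rate TAU,
# theta_target <- tau * theta_online + (1 - tau) * theta_target. A sketch of
# how such ops are typically built in TF1-style code is shown below; the helper
# name and the variable lists passed in are assumptions, not the original API.
# -----------------------------------------------------------------------------
import tensorflow as tf


def build_soft_update_ops(online_vars, target_vars, tau):
    """Return assign ops that blend each target variable towards its online
    counterpart; run them in a session after every training step."""
    return [target.assign(tau * online + (1.0 - tau) * target)
            for online, target in zip(online_vars, target_vars)]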