def trainer(epochs=1000, MINIBATCH_SIZE=40, GAMMA=0.99, epsilon=1.0, min_epsilon=0.01,
            BUFFER_SIZE=10000, train_indicator=True, render=False):

    with tf.Session() as sess:

        # configuring environment
        env = gym.make(ENV_NAME)

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # info of the environment to pass to the agent
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = np.float64(10)  # chosen because continuous MountainCar does not bound the action

        # Creating agent
        ruido = OUNoise(action_dim, mu=0.4)  # Ornstein-Uhlenbeck noise for exploration
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, DEVICE)
        critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), DEVICE)

        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

        goal = 0
        max_state = -1.

        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception:
            pass
            # print('********************************')
            # print('Failed to restore models')
            # print('********************************')

        for i in range(epochs):

            state = env.reset()
            state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1
            epsilon -= (epsilon / EXPLORE)
            epsilon = np.maximum(min_epsilon, epsilon)

            while not done:

                if render:
                    env.render()
                # print('step', step)

                # 1. get action with actor, and add exploration noise
                action_original = actor.predict(np.reshape(state, (1, state_dim)))  # + (10. / (10. + i)) * np.random.randn(1)
                action = action_original + max(epsilon, 0) * ruido.noise()

                # remove comment if you want to see a step-by-step update
                # print(step, 'a', action_original, action, 's', state[0], 'max state', max_state_episode)

                # 2. take action, observe next state and reward:
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer:
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                                      np.reshape(action, (actor.a_dim,)),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch-size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:

                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # 5. Train the critic network on targets R + gamma * Q(s', a'):
                        # 5.1 Get the critic prediction Q(s', a'),
                        #     where a' comes from the target actor: a' = actor_target(s')
                        target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch))

                        # 5.2 Build the targets y_i:
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        # 5.3 Train the critic
                        predicted_q_value, _ = critic.train(s_batch, a_batch,
                                                            np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. Train the actor using the critic's action gradients
                        # 6.1 first compute the actions the current actor would take
                        a_outs = actor.predict(s_batch)
                        # 6.2 then get the gradient of Q with respect to those actions
                        grads = critic.action_gradients(s_batch, a_outs)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()

                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]
                ep_reward = ep_reward + reward
                step += 1

            if done:
                ruido.reset()
                if state[0] > 0.45:
                    # print('****************************************')
                    # print('got it!')
                    # print('****************************************')
                    goal += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'n steps', step, 'R:', round(ep_reward, 3),
                  'epsilon', round(epsilon, 3),
                  'Efficiency', round(100. * (goal / (i + 1.)), 3))
            # print('Efficiency', 100. * (goal / (i + 1.)))

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
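# --- Reference sketch (not part of the original file) ---
# The trainer above only assumes that OUNoise exposes OUNoise(action_dim, mu=...),
# noise() and reset(). A minimal Ornstein-Uhlenbeck process compatible with that
# interface could look like the following; theta and sigma are illustrative defaults.
import numpy as np


class OUNoise:
    """Temporally correlated exploration noise: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.3):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart every episode from the long-run mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state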
def actor_critic(epochs=1000, GAMMA=0.99, load_file=False, render=False, temp=False, verbose=False):

    with tf.Session() as sess:

        # define objects
        # the gym environment is wrapped in a class; working this way allows portability
        # with other robots in the lab and keeps the main loop very clear
        # robot = gym_pendulum(render, temp)
        robot = gym_mountaincar(render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE, ACTION_BOUND, device=DEVICE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars(), device=DEVICE)

        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        if load_file:
            actor.recover_actor()
            critic.recover_critic()

        for i in range(epochs):

            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while not done:

                # Choose and take action, and observe reward
                action, mu, sigma = actor.predict(np.reshape(state, (1, robot.state_dim)))
                new_action = action + 0.2 * np.random.rand(1)[0]
                action_noise = np.clip(new_action, -ACTION_BOUND, ACTION_BOUND)
                # print(round(action, 3), round(new_action, 3), round(action_noise, 3), round(mu, 3), round(sigma, 3))
                next_state, reward, done, step = robot.update(action_noise)

                # Train: one-step TD target and TD error for the critic V(s)
                V_minib = critic.predict(np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(np.reshape(next_state, (1, robot.state_dim)))

                if done:
                    td_target = reward
                    td_error = reward - V_minib
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward

                # this print is useful for debugging
                if verbose:
                    print(step, 'action', round(action, 3),
                          'state', round(robot.state[0], 3), round(robot.state[1], 3),
                          'r', round(reward, 3))

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal,
                  'Efficiency', round(100. * (robot.goal / (i + 1.)), 0), '%')
            # time.sleep(1)

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
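# --- Reference sketch (not part of the original file) ---
# In the continuous actor-critic above, actor.predict() returns (action, mu, sigma) and
# actor.train(state, action, td_error) is assumed to follow the usual Gaussian
# policy-gradient update, minimising -log N(action | mu, sigma) * td_error.
# The helper below is a self-contained TF1 sketch of such a loss; layer sizes, the
# learning rate default and the function name are illustrative, not taken from ActorNetwork.
import numpy as np
import tensorflow as tf


def _gaussian_policy_loss_sketch(state_dim=2, learning_rate=1e-4):
    """Illustrative only: builds the assumed Gaussian-policy actor loss and returns its ops."""
    state_ph = tf.placeholder(tf.float32, [None, state_dim], name='state')
    action_ph = tf.placeholder(tf.float32, [None, 1], name='taken_action')
    td_error_ph = tf.placeholder(tf.float32, [None, 1], name='td_error')

    hidden = tf.layers.dense(state_ph, 64, activation=tf.nn.relu)
    mu = tf.layers.dense(hidden, 1, activation=tf.nn.tanh)                # mean of the Gaussian policy
    sigma = tf.layers.dense(hidden, 1, activation=tf.nn.softplus) + 1e-5  # positive standard deviation

    # log-probability of the taken action under N(mu, sigma)
    log_prob = -0.5 * tf.square((action_ph - mu) / sigma) - tf.log(sigma) - 0.5 * np.log(2. * np.pi)
    actor_loss = -tf.reduce_mean(log_prob * td_error_ph)                  # policy gradient weighted by the TD error
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(actor_loss)
    return state_ph, action_ph, td_error_ph, actor_loss, train_op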
def actor_critic(epochs=1000, GAMMA=0.99, train_indicator=True, render=False, temp=False, baseline=True):

    with tf.Session() as sess:

        # define objects
        # the gym environment is wrapped in a class; working this way allows portability
        # with other robots in the lab and keeps the main loop very clear
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim, ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())

        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):

            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0
            total_reward = np.zeros(max_episode)
            total_state = deque()
            total_action = deque()
            k = 0

            while (not done) and k < max_episode:

                # Choose and take action, and observe reward
                action_prob = actor.predict(np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)), p=action_prob)
                next_state, reward, done, step = robot.update(action)

                # store the episode information
                total_reward[k] = reward
                total_state.append(state)
                total_action.append(action)

                state = next_state
                ep_reward = ep_reward + reward
                k = k + 1

            # Train: Monte Carlo return G for every visited state
            # (note: G here is the undiscounted sum of rewards; GAMMA is not applied)
            for l in range(k):
                G = np.sum(total_reward[l:k + 1])
                # print(l, G)  # print for debug
                state = np.reshape(total_state[l], (1, robot.state_dim))
                action = np.reshape(total_action[l], (1, 1))
                if baseline:
                    # REINFORCE with baseline: advantage = G - V(s)
                    delta = G - critic.predict(state)
                    critic.train(state, delta)
                    actor.train(state, action, delta)
                else:
                    # plain REINFORCE: weight the gradient by the raw return
                    actor.train(state, action, G)

            # this print is useful for debugging
            # print(step, 'action', action, 'state', robot.uncodedstate, 'r', round(reward, 3), 'prob', action_prob)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal,
                  'Efficiency', round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
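# --- Reference sketch (not part of the original file) ---
# The REINFORCE variant above weights the update with the undiscounted return. If the
# GAMMA argument is meant to discount future rewards, the per-step returns can be
# computed with a single backward pass; this illustrative helper shows that variant.
import numpy as np


def discounted_returns(rewards, gamma):
    """G_l = r_l + gamma * G_{l+1}, computed backwards over one episode."""
    returns = np.zeros(len(rewards))
    G = 0.0
    for l in reversed(range(len(rewards))):
        G = rewards[l] + gamma * G
        returns[l] = G
    return returns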
def actor_critic(epochs=1000, GAMMA=0.99, train_indicator=True, render=False, temp=False):

    with tf.Session() as sess:

        # define objects
        # the gym environment is wrapped in a class; working this way allows portability
        # with other robots in the lab and keeps the main loop very clear
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim, ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())

        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):

            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while not done:

                # Choose and take action, and observe reward
                action_prob = actor.predict(np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)), p=action_prob)
                next_state, reward, done, step = robot.update(action)

                # Train: one-step TD target and TD error for the critic V(s)
                V_minib = critic.predict(np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(np.reshape(next_state, (1, robot.state_dim)))

                if done:
                    td_target = reward
                    td_error = reward - V_minib
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward

                # this print is useful for debugging
                # print(step, 'action', action, 'state', robot.uncodedstate, 'r', round(reward, 3), 'prob', action_prob)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal,
                  'Efficiency', round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
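# --- Reference sketch (not part of the original file) ---
# The gym_environment wrapper used above is assumed to hand the networks an already
# encoded state vector of length robot.state_dim. For a discrete grid world such as
# FrozenLake 8x8 a common choice is one-hot encoding; this illustrative helper is not
# taken from the wrapper itself.
import numpy as np


def one_hot_state(state_index, state_dim):
    """Encode a discrete state index as a one-hot vector of length state_dim."""
    encoded = np.zeros(state_dim)
    encoded[state_index] = 1.0
    return encoded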
def trainer(env, outdir, epochs=100, MINIBATCH_SIZE=64, GAMMA=0.99, epsilon=0.01, min_epsilon=0.01,
            BUFFER_SIZE=10000, train_indicator=False, render=False):

    tf.reset_default_graph()
    with tf.Session(config=config) as sess:

        # configuring environment
        # env = gym.make(ENV_NAME)

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # info of the environment to pass to the agent
        state_dim = env.observation_space
        action_dim = env.action_space
        action_bound = np.float64(1)  # chosen because continuous MountainCar does not bound the action

        # Creating agent
        # FOR the RNN:
        # tf.contrib.rnn.core_rnn_cell.BasicLSTMCell, from https://github.com/tensorflow/tensorflow/issues/8771
        # cell = tf.contrib.rnn.BasicLSTMCell(num_units=300, state_is_tuple=True, reuse=None)
        # cell_target = tf.contrib.rnn.BasicLSTMCell(num_units=300, state_is_tuple=True, reuse=None)
        ruido = OUNoise(action_dim, mu=0.4)  # Ornstein-Uhlenbeck noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, outdir)
        critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), outdir)

        # sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()

        # goal = 0
        max_state = -1.

        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception as e:
            print('********************************')
            print(e)
            print('********************************')
            # critic.recover_critic()
            # actor.recover_actor()

        for i in range(epochs):

            state = env.reset()
            # state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1
            epsilon -= epsilon / EXPLORE
            if epsilon < min_epsilon:
                epsilon = min_epsilon

            while not done:

                if render:
                    env.render()
                # print('step', step)

                np.set_printoptions(precision=4)

                # remove comment if you want to see a step-by-step update
                # print(step, 'a', action_original, action, 's', state[0], 'max state', max_state_episode)

                # 1. get action with actor (exploration noise is currently disabled)
                action_original = actor.predict(np.reshape(state, (1, actor.s_dim)))  # + (10. / (10. + i)) * np.random.randn(1)
                action = action_original  # + max(epsilon, 0) * ruido.noise()
                '''
                for j in range(action.shape[1]):
                    if abs(action[0, j]) > 1:
                        act = action[0, j]
                        action[0, j] = act / abs(act)
                    else:
                        continue
                '''
                action = np.reshape(action, (actor.a_dim,))

                # 2. take action, observe next state and reward:
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer:
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                                      np.reshape(action, (actor.a_dim,)),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch-size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:

                        # 4. sample random minibatch of transitions:
                        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # 5. Train the critic network on targets R + gamma * Q(s', a'):
                        # 5.1 Get the critic prediction Q(s', a'),
                        #     where a' comes from the target actor: a' = actor_target(s')
                        target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch), 20)

                        # 5.2 Build the targets y_i:
                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        # 5.3 Train the critic
                        predicted_q_value, _ = critic.train(s_batch, a_batch,
                                                            np.reshape(y_i, (MINIBATCH_SIZE, 1)), 20)
                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. Train the actor using the critic's action gradients
                        # 6.1 first compute the actions the current actor would take
                        a_outs = actor.predict(s_batch)
                        # 6.2 then get the gradient of Q with respect to those actions
                        grads = critic.action_gradients(s_batch, a_outs, 20)
                        c = np.array(grads)
                        # print(c.shape)
                        # print('...', c[0].shape)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()

                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]
                ep_reward = ep_reward + reward
                step += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'Step', step, 'Reward:', ep_reward,
                  'Pos', next_state[0], next_state[1], 'epsilon', epsilon)

            print('*************************')
            print('now we save the model')
            critic.save_critic()
            actor.save_actor()
            print('model saved successfully')
            print('*************************')
            replay_buffer.save()

            # proc = Popen(['rosclean', 'purge'], stdout=PIPE, stdin=PIPE, stderr=PIPE, universal_newlines=True)
            # out, err = proc.communicate(input="{}\n".format("y"))

            # print('maximum state reached', max_state)
            # print('the reward at the end of the episode,', reward)
            # print('Efficiency', 100. * (goal / (i + 1.)))

        '''
        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
        replay_buffer.save()
        # env.close()
        '''
        sess.close()
        return 0
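# --- Reference sketch (not part of the original file) ---
# Both DDPG trainers assume a ReplayBuffer with add(s, a, r, t, s2), size(),
# sample_batch(n), and (in the second trainer) save()/load(). A minimal deque-based
# sketch compatible with that interface follows; the pickle file name is illustrative.
import pickle
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, buffer_size, random_seed=123):
        # deque with maxlen silently drops the oldest transition once the buffer is full
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, min(len(self.buffer), batch_size))
        s_batch = np.array([e[0] for e in batch])
        a_batch = np.array([e[1] for e in batch])
        r_batch = np.array([e[2] for e in batch])
        t_batch = np.array([e[3] for e in batch])
        s2_batch = np.array([e[4] for e in batch])
        return s_batch, a_batch, r_batch, t_batch, s2_batch

    def save(self, path='replay_buffer.pkl'):
        with open(path, 'wb') as f:
            pickle.dump(self.buffer, f)

    def load(self, path='replay_buffer.pkl'):
        try:
            with open(path, 'rb') as f:
                self.buffer = pickle.load(f)
        except IOError:
            pass  # start with an empty buffer if no saved file exists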