import numpy as np


class DrlAgent:
    def __init__(self, sess, is_train, dim_state, dim_action, num_paths,
                 actor_learn_rate, critic_learn_rate, tau, buffer_size,
                 mini_batch, ep_begin, epsilon_end, gamma, max_epoch, seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch
        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)
        self.__replay = ReplayBuffer(buffer_size, seed)
        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)
        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(np.ones(dim_action))
        self.__episode = 0
        self.__step = 0

    def target_paras_init(self):
        # synchronise the target networks with the online networks at start-up
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()

    def predict(self, state, reward):
        # pick an action for the current state; in training mode also store the
        # previous transition and run one learning step
        action_original = self.__actor.predict([state])[0]
        if not self.__is_train:
            return action_original
        action = self.__explorer.get_act(action_original)
        self.__replay.add(self.__state_curt, self.__action_curt, reward, state)
        self.__state_curt = state
        self.__action_curt = action
        if len(self.__replay) > self.__mini_batch:
            self.train()
        self.__step += 1
        if self.__step >= self.__max_epoch:
            # episode finished: reset the step counter and the exploration schedule
            self.__step = 0
            self.__episode += 1
            self.__explorer.reset_ep(self.__ep_begin)
        return action

    def train(self):
        # one actor-critic update on a mini-batch sampled from the replay buffer
        batch_state, batch_action, batch_reward, batch_state_next = \
            self.__replay.sample_batch(self.__mini_batch)
        weights = np.expand_dims([1.0] * self.__mini_batch, axis=1)

        # bootstrapped critic target: r + gamma * Q'(s', mu'(s'))
        target_q = self.__critic.predict_target(
            batch_state_next, self.__actor.predict_target(batch_state_next))
        value_q = self.__critic.predict(batch_state, batch_action)
        batch_y = []
        batch_error = []  # absolute TD errors; computed but not used further here
        for k in range(len(batch_reward)):
            target_y = batch_reward[k] + self.__gamma * target_q[k]
            batch_error.append(abs(target_y - value_q[k]))
            batch_y.append(target_y)

        predicted_q, _ = self.__critic.train(batch_state, batch_action,
                                             batch_y, weights)

        # update the actor along the critic's action gradient
        a_outs = self.__actor.predict(batch_state)
        grads = self.__critic.calculate_gradients(batch_state, a_outs)
        weighted_grads = weights * grads[0]
        self.__actor.train(batch_state, weighted_grads)

        # soft-update the target networks
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()
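# --- Usage sketch (not from the original source) ---
# A minimal illustration of how DrlAgent might be driven from an outer loop.
# It assumes the ActorNetwork, CriticNetwork, ReplayBuffer and Explorer classes
# referenced above are available, and it uses a hypothetical `env` object with
# reset() -> state and step(action) -> (state, reward) methods. The
# hyperparameter values are placeholders, not the original settings.
import tensorflow as tf


def run_agent_sketch(env, dim_state, dim_action, num_paths, steps=1000):
    with tf.Session() as sess:
        agent = DrlAgent(sess, True, dim_state, dim_action, num_paths,
                         actor_learn_rate=1e-4, critic_learn_rate=1e-3,
                         tau=0.001, buffer_size=100000, mini_batch=32,
                         ep_begin=1.0, epsilon_end=0.05, gamma=0.99,
                         max_epoch=steps)
        sess.run(tf.global_variables_initializer())
        agent.target_paras_init()  # sync target networks before learning
        state, reward = env.reset(), 0.0
        for _ in range(steps):
            # predict() also stores the previous transition and trains once
            # enough samples have accumulated in the replay buffer
            action = agent.predict(state, reward)
            state, reward = env.step(action)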
def actor_critic(epochs=1000, GAMMA=0.99, train_indicator=True, render=False,
                 temp=False):
    with tf.Session() as sess:
        # define objects
        # The gym environment is wrapped in a class; this keeps the main loop
        # clear and makes the code portable to other robots in the lab.
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())

        # start TensorFlow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while not done:
                # choose and take an action, and observe the reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)

                # train: TD(0) target and error for the critic and actor
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))

                if done:
                    td_target = reward
                    td_error = reward - V_minib
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward

                # this print is useful for debugging:
                # print(step, 'action', action, 'state', robot.uncodedstate,
                #       'r', round(reward, 3), 'prob', action_prob)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal, 'Efficiency',
                  round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
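# --- TD(0) target/error, standalone sketch (illustrative values) ---
# The same bootstrapped quantities the loop above computes: the critic is
# regressed toward r + GAMMA * V(s'), and the actor update is scaled by the
# TD error; in both branches the error reduces to td_target - V(s). The
# numbers below are made up for illustration.
import numpy as np

GAMMA = 0.99
V_s, V_s_next = 0.42, 0.47          # critic estimates for s and s'
reward, done = 0.0, False

td_target = reward if done else reward + GAMMA * V_s_next   # ≈ 0.4653
td_error = td_target - V_s                                   # ≈ 0.0453
td_target_batch = np.reshape(td_target, (1, 1))  # the (1, 1) shape the networks expect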
def actor_critic(epochs=1000, GAMMA=0.99, load_file=False, render=False,
                 temp=False, verbose=False):
    with tf.Session() as sess:
        # define objects
        # The gym environment is wrapped in a class; this keeps the main loop
        # clear and makes the code portable to other robots in the lab.
        # robot = gym_pendulum(render, temp)
        robot = gym_mountaincar(render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE, ACTION_BOUND, device=DEVICE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars(), device=DEVICE)

        # start TensorFlow
        sess.run(tf.global_variables_initializer())

        if load_file:
            actor.recover_actor()
            critic.recover_critic()

        for i in range(epochs):
            # reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while not done:
                # choose and take an action, and observe the reward
                action, mu, sigma = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                # add exploration noise and clip the action to its bounds
                new_action = action + 0.2 * np.random.rand(1)[0]
                action_noise = np.clip(new_action, -ACTION_BOUND, ACTION_BOUND)
                # print(round(action, 3), round(new_action, 3),
                #       round(action_noise, 3), round(mu, 3), round(sigma, 3))
                next_state, reward, done, step = robot.update(action_noise)

                # train: TD(0) target and error for the critic and actor
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))

                if done:
                    td_target = reward
                    td_error = reward - V_minib
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward

                # this print is useful for debugging
                if verbose:
                    print(step, 'action', round(action, 3), 'state',
                          round(robot.state[0], 3), round(robot.state[1], 3),
                          'r', round(reward, 3))

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal, 'Efficiency',
                  round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
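# --- Exploration-noise sketch (alternative, not from the original) ---
# The loop above perturbs the policy output with uniform noise in [0, 0.2)
# before clipping, which biases exploration in one direction. A zero-mean
# Gaussian variant, shown here only as a hedged alternative, would look like
# this; `noise_scale` and `action_bound` are illustrative parameters.
import numpy as np

def noisy_action(action, noise_scale=0.2, action_bound=1.0):
    # add zero-mean Gaussian exploration noise, then clip to the valid range
    noisy = action + noise_scale * np.random.randn()
    return np.clip(noisy, -action_bound, action_bound)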
from collections import deque


def actor_critic(epochs=1000, GAMMA=0.99, train_indicator=True, render=False,
                 temp=False, baseline=True):
    with tf.Session() as sess:
        # define objects
        # The gym environment is wrapped in a class; this keeps the main loop
        # clear and makes the code portable to other robots in the lab.
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())

        # start TensorFlow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # reset the environment
            state, done, step = robot.reset()
            ep_reward = 0
            # max_episode (the episode-length cap) is assumed to be defined at
            # module level, like the learning rates
            total_reward = np.zeros(max_episode)
            total_state = deque()
            total_action = deque()
            k = 0

            while (not done) and k < max_episode:
                # choose and take an action, and observe the reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)

                # store the episode trajectory
                total_reward[k] = reward
                total_state.append(state)
                total_action.append(action)
                state = next_state
                ep_reward = ep_reward + reward
                k = k + 1

            # train: Monte Carlo (REINFORCE) update over the stored episode
            for l in range(k):
                # discounted return from step l to the end of the episode
                G = np.sum(total_reward[l:k] * GAMMA ** np.arange(k - l))
                # print(l, G)  # print for debug
                state_l = np.reshape(total_state[l], (1, robot.state_dim))
                action_l = np.reshape(total_action[l], (1, 1))
                if baseline:
                    # subtract the learned state-value baseline
                    delta = G - critic.predict(state_l)
                    critic.train(state_l, delta)
                    actor.train(state_l, action_l, delta)
                else:
                    actor.train(state_l, action_l, G)

            # this print is useful for debugging:
            # print(step, 'action', action, 'state', robot.uncodedstate,
            #       'r', round(reward, 3), 'prob', action_prob)
            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal, 'Efficiency',
                  round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
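# --- Discounted-return sketch (illustrative, not from the original) ---
# The per-step return G_t = sum_j GAMMA**j * r_{t+j} that the Monte Carlo
# update above accumulates, computed here for a whole episode with a single
# reverse pass: O(T) instead of re-summing the tail for every step.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    G = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        # each step's return is its reward plus the discounted return that follows
        running = rewards[t] + gamma * running
        G[t] = running
    return G

# e.g. discounted_returns([0., 0., 1.]) -> [0.9801, 0.99, 1.0]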