Example #1
class DrlAgent:
    def __init__(self,
                 sess,
                 is_train,
                 dim_state,
                 dim_action,
                 num_paths,
                 actor_learn_rate,
                 critic_learn_rate,
                 tau,
                 buffer_size,
                 mini_batch,
                 ep_begin,
                 epsilon_end,
                 gamma,
                 max_epoch,
                 seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch

        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)

        self.__replay = ReplayBuffer(buffer_size, seed)

        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)

        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(
            np.ones(dim_action))

        self.__episode = 0
        self.__step = 0

    def target_paras_init(self):
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()

    def predict(self, state, reward):
        action_original = self.__actor.predict([state])[0]
        if not self.__is_train:
            return action_original

        # explore around the actor's action and store the latest transition
        action = self.__explorer.get_act(action_original)
        self.__replay.add(self.__state_curt, self.__action_curt, reward, state)
        self.__state_curt = state
        self.__action_curt = action

        # train once the replay buffer holds more than one mini-batch
        if len(self.__replay) > self.__mini_batch:
            self.train()

        self.__step += 1
        # end of an episode: advance the counter and reset the exploration rate
        if self.__step >= self.__max_epoch:
            self.__step = 0
            self.__episode += 1
            self.__explorer.reset_ep(self.__ep_begin)
        return action

    def train(self):
        # sample a mini-batch of transitions from the replay buffer
        batch_state, batch_action, batch_reward, batch_state_next = self.__replay.sample_batch(
            self.__mini_batch)
        # uniform per-sample weights for the critic update
        weights = [1.0] * self.__mini_batch
        weights = np.expand_dims(weights, axis=1)
        # TD target: y_k = r_k + gamma * Q'(s'_k, mu'(s'_k)) from the target networks
        target_q = self.__critic.predict_target(
            batch_state_next, self.__actor.predict_target(batch_state_next))
        value_q = self.__critic.predict(batch_state, batch_action)

        batch_y = []
        batch_error = []
        for k in range(len(batch_reward)):
            target_y = batch_reward[k] + self.__gamma * target_q[k]
            batch_error.append(abs(target_y - value_q[k]))
            batch_y.append(target_y)

        predicted_q, _ = self.__critic.train(batch_state, batch_action,
                                             batch_y, weights)
        # actor update: follow the critic's action gradient dQ/da (deterministic policy gradient)
        a_outs = self.__actor.predict(batch_state)
        grads = self.__critic.calculate_gradients(batch_state, a_outs)
        weighted_grads = weights * grads[0]
        self.__actor.train(batch_state, weighted_grads)
        # soft-update the target networks towards the online networks
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()
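
A minimal NumPy sketch of the TD-target computation that train() performs above (target_y = r + gamma * Q'(s', mu'(s'))); the reward and Q values below are made-up stand-ins for the outputs of the critic and actor networks, not part of the original example:

import numpy as np

# Hypothetical mini-batch values standing in for the network outputs above.
gamma = 0.99
batch_reward = np.array([1.0, 0.0, -0.5])       # r_k
target_q = np.array([[0.8], [0.4], [0.1]])      # Q'(s'_k, mu'(s'_k)) from the target critic
value_q = np.array([[0.7], [0.5], [0.0]])       # Q(s_k, a_k) from the online critic

batch_y, batch_error = [], []
for k in range(len(batch_reward)):
    target_y = batch_reward[k] + gamma * target_q[k]    # TD target y_k
    batch_error.append(abs(target_y - value_q[k]))      # |TD error| per sample
    batch_y.append(target_y)

print(np.array(batch_y).ravel())       # targets fed to the critic update
print(np.array(batch_error).ravel())   # per-sample absolute errors
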
Example #2
def actor_critic(epochs=1000,
                 GAMMA=0.99,
                 train_indicator=True,
                 render=False,
                 temp=False):
    with tf.Session() as sess:

        # define objects
        # the gym environment is wrapped in a class; this keeps the main loop clear and allows portability to other robots in the lab
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())
        # initialize TensorFlow variables
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while (not done):
                # Choose and take action, and observe reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)
                # Train
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))
                if done:
                    td_target = reward
                    td_error = reward - V_minib  # not - V_minib[k] ?
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward
                # this print is useful for debugging
                #print(step,'action', action, 'state', robot.uncodedstate,'r', round(reward,3), 'prob', action_prob)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal, 'Efficiency',
                  round(100. * ((robot.goal) / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
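
The action selection above samples a discrete action from the actor's output probabilities. A self-contained sketch of that pattern, with a made-up probability vector standing in for actor.predict(...):

import numpy as np

# Hypothetical policy output for one state; in the example this comes from
# actor.predict(np.reshape(state, (1, robot.state_dim))).
action_prob = np.array([0.1, 0.6, 0.2, 0.1])   # must sum to 1 over the discrete actions

action = np.random.choice(np.arange(len(action_prob)), p=action_prob)
print(action)   # index of the sampled action, passed on to robot.update(action)
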
Example #3
def actor_critic(epochs=1000,
                 GAMMA=0.99,
                 load_file=False,
                 render=False,
                 temp=False,
                 verbose=False):
    with tf.Session() as sess:

        # define objects
        # the gym environment is wrapped in a class; this keeps the main loop clear and allows portability to other robots in the lab
        #robot = gym_pendulum(render, temp)
        robot = gym_mountaincar(render, temp)
        actor = ActorNetwork(sess,
                             robot.state_dim,
                             robot.action_dim,
                             ACTOR_LEARNING_RATE,
                             ACTION_BOUND,
                             device=DEVICE)
        critic = CriticNetwork(sess,
                               robot.state_dim,
                               CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars(),
                               device=DEVICE)
        # initialize TensorFlow variables
        sess.run(tf.global_variables_initializer())

        if load_file:
            actor.recover_actor()
            critic.recover_critic()

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while (not done):
                # Choose and take action, and observe reward
                action, mu, sigma = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))

                new_action = action + 0.2 * (np.random.rand(1)[0])
                action_noise = np.clip(new_action, -ACTION_BOUND, ACTION_BOUND)
                # print(round(action,3), round(new_action,3), round(action_noise,3),  round(mu,3), round(sigma,3))
                next_state, reward, done, step = robot.update(action_noise)

                # Train
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))
                if done:
                    td_target = reward
                    td_error = reward - V_minib  # not - V_minib[k] ?
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward
                # this print is useful for debugging
                if verbose:
                    print(step, 'action', round(action, 3), 'state',
                          round(robot.state[0], 3), round(robot.state[1], 3),
                          'r', round(reward, 3))

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal, 'Efficiency',
                  round(100. * ((robot.goal) / (i + 1.)), 0), '%')
            #time.sleep(1)

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
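
Exploration in this continuous-action variant is additive noise followed by clipping to the action bound. A small stand-alone sketch; ACTION_BOUND and the actor output below are illustrative values, not taken from the original networks:

import numpy as np

ACTION_BOUND = 1.0    # illustrative symmetric action limit
action = 0.95         # stand-in for the action returned by actor.predict(...)

new_action = action + 0.2 * np.random.rand(1)[0]                 # uniform exploration noise in [0, 0.2)
action_noise = np.clip(new_action, -ACTION_BOUND, ACTION_BOUND)  # keep the action inside the bound
print(round(float(action_noise), 3))
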
Example #4
def actor_critic(epochs=1000,
                 GAMMA=0.99,
                 train_indicator=True,
                 render=False,
                 temp=False,
                 baseline=True):
    with tf.Session() as sess:

        # define objects
        # the gym environment is wrapped in a class; this keeps the main loop clear and allows portability to other robots in the lab
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())
        # initialize TensorFlow variables
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0
            total_reward = np.zeros(max_episode)
            total_state = deque()
            total_action = deque()
            k = 0
            while (not done) and k < max_episode:
                # Choose and take action, and observe reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)
                # store episode information
                total_reward[k] = reward
                total_state.append(state)
                total_action.append(action)
                ep_reward = ep_reward + reward  # accumulate the episode reward for the summary print
                state = next_state
                k = k + 1

            # Train
            # get G: the return from step l to the end of the episode
            for l in range(k):
                G = np.sum(total_reward[l:k + 1])
                #print(l,G) # print for debug
                state = np.reshape(total_state[l], (1, robot.state_dim))
                action = np.reshape(total_action[l], (1, 1))

                if baseline:
                    delta = G - critic.predict(state)
                    critic.train(state, delta)
                    actor.train(state, action, delta)
                else:
                    actor.train(state, action, G)

            # this print is useful for debugging
            #print(step,'action', action, 'state', robot.uncodedstate,'r', round(reward,3), 'prob', action_prob)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal, 'Efficiency',
                  round(100. * ((robot.goal) / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
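
The training block in this last example is a Monte-Carlo (REINFORCE-style) update: after the episode ends, G is the undiscounted return from step l onward, optionally reduced by the critic's prediction as a baseline. A tiny sketch of that computation with made-up rewards and baseline values:

import numpy as np

# Illustrative 4-step episode; the arrays stand in for total_reward and critic.predict(state).
total_reward = np.array([0.0, 0.0, 0.0, 1.0])
baseline_values = np.array([0.2, 0.3, 0.5, 0.8])
k = len(total_reward)

for l in range(k):
    G = np.sum(total_reward[l:k + 1])   # return from step l, as in the example (no discounting)
    delta = G - baseline_values[l]      # advantage used when baseline=True
    print(l, G, round(delta, 2))
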