Пример #1
0
class SAC_v2:
    """Soft Actor-Critic agent with twin critics and target critics.

    The entropy temperature is stored as ``log_alpha`` and is only
    optimised when ``args.train_alpha`` equals ``True``.
    """

    def __init__(self, state_dim, action_dim, args):
        """Create the replay buffer, optimizers, networks and settings.

        Args:
            state_dim: state size handed to the actor and the critics.
            action_dim: action size handed to the actor and the critics.
            args: configuration object. Attributes read here: buffer_size,
                actor_lr, critic_lr, alpha_lr, batch_size, tau, gamma,
                training_start, training_step, critic_update, alpha,
                train_alpha, hidden_dim, log_std_min, log_std_max.
        """
        # Replay storage that train() samples mini-batches from.
        self.buffer = Buffer(args.buffer_size)

        # One Adam optimizer per trained network.
        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim, self.action_dim = state_dim, action_dim

        # Schedule and update coefficients.
        self.current_step = 0
        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.critic_update = args.critic_update
        self.training_start = args.training_start
        self.training_step = args.training_step

        # Temperature is kept in log space; its entropy target is -action_dim.
        self.log_alpha = tf.Variable(np.log(args.alpha),
                                     dtype=tf.float32,
                                     trainable=True)
        self.target_entropy = -action_dim
        self.alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)

        def build_critic():
            return Q_network(self.state_dim, self.action_dim, args.hidden_dim)

        # Online and target critics are created pairwise.
        self.critic1 = build_critic()
        self.target_critic1 = build_critic()
        self.critic2 = build_critic()
        self.target_critic2 = build_critic()

        # Each target critic starts from its online critic's weights.
        for online, target in ((self.critic1, self.target_critic1),
                               (self.critic2, self.target_critic2)):
            copy_weight(online, target)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
        }
        self.name = 'SAC_v2'

    @property
    def alpha(self):
        """Current temperature, i.e. exp(log_alpha)."""
        return tf.math.exp(self.log_alpha)

    def get_action(self, state):
        """Return the actor's default-mode action for one state, clipped to [-1, 1]."""
        batch = np.array(state)[np.newaxis]
        action, _logpi = self.actor(batch)
        return np.clip(action.numpy()[0], -1, 1)

    def eval_action(self, state):
        """Return the actor's ``deterministic=True`` action for one state, clipped to [-1, 1]."""
        batch = np.array(state)[np.newaxis]
        action, _logpi = self.actor(batch, deterministic=True)
        return np.clip(action.numpy()[0], -1, 1)

    def train(self, training_num):
        """Run ``training_num`` update iterations on sampled mini-batches.

        Every iteration updates both critics, then the actor, then (if
        enabled) the temperature, and finally soft-updates the target
        critics every ``critic_update`` steps.

        Returns:
            A list of ``[tag, value]`` pairs: the actor, critic and alpha
            losses summed over all iterations, plus the current alpha.
        """
        actor_loss_sum = 0
        critic1_loss_sum = critic2_loss_sum = 0
        alpha_loss_sum = 0
        tune_alpha = self.train_alpha == True

        for _ in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # Bootstrapped target from the target critics.
            next_action, next_logpi = self.actor(ns)
            next_min_q = tf.minimum(self.target_critic1(ns, next_action),
                                    self.target_critic2(ns, next_action))
            soft_value = next_min_q - self.alpha.numpy() * next_logpi
            target_q = tf.stop_gradient(r + self.gamma * (1 - d) * soft_value)

            # Both critic losses are recorded on one persistent tape.
            with tf.GradientTape(persistent=True) as critic_tape:
                td_error1 = self.critic1(s, a) - target_q
                critic1_loss = 0.5 * tf.reduce_mean(tf.square(td_error1))
                td_error2 = self.critic2(s, a) - target_q
                critic2_loss = 0.5 * tf.reduce_mean(tf.square(td_error2))

            for loss, critic, optimizer in (
                    (critic1_loss, self.critic1, self.critic1_optimizer),
                    (critic2_loss, self.critic2, self.critic2_optimizer)):
                grads = critic_tape.gradient(loss, critic.trainable_variables)
                optimizer.apply_gradients(
                    zip(grads, critic.trainable_variables))

            del critic_tape

            # Actor step: gradients flow through the sampled action.
            with tf.GradientTape() as actor_tape:
                pi_action, pi_logpi = self.actor(s)
                pi_min_q = tf.minimum(self.critic1(s, pi_action),
                                      self.critic2(s, pi_action))
                actor_loss = 0.5 * tf.reduce_mean(
                    self.alpha.numpy() * pi_logpi - pi_min_q)

            actor_grads = actor_tape.gradient(actor_loss,
                                              self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grads, self.actor.trainable_variables))

            del actor_tape

            if tune_alpha:
                # Temperature step on freshly sampled log-probabilities.
                with tf.GradientTape() as alpha_tape:
                    _, fresh_logpi = self.actor(s)
                    entropy_gap = tf.stop_gradient(fresh_logpi +
                                                   self.target_entropy)
                    alpha_loss = tf.nn.compute_average_loss(
                        -(tf.exp(self.log_alpha) * entropy_gap))

                alpha_grads = alpha_tape.gradient(alpha_loss,
                                                  [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grads, [self.log_alpha]))

                del alpha_tape

            if self.current_step % self.critic_update == 0:
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)

            actor_loss_sum += actor_loss.numpy()
            critic1_loss_sum += critic1_loss.numpy()
            critic2_loss_sum += critic2_loss.numpy()
            if tune_alpha:
                alpha_loss_sum += alpha_loss.numpy()

        return [
            ['Loss/Actor', actor_loss_sum],
            ['Loss/Critic1', critic1_loss_sum],
            ['Loss/Critic2', critic2_loss_sum],
            ['Loss/alpha', alpha_loss_sum],
            ['Alpha', self.alpha.numpy()],
        ]
Пример #2
0
class DBC_SACv2:
    """SAC agent acting on encoded observations, with a bisimulation-style encoder loss.

    Observations pass through a ``PixelEncoder``; the actor and the twin
    critics work on the resulting features. A transition model and a
    reward model are trained next to the agent, and the encoder gets an
    additional loss built from their predictions (see ``train``).
    NOTE(review): "DBC" presumably stands for Deep Bisimulation for
    Control - confirm.
    """

    def __init__(self, obs_dim, action_dim, args):
        """Create the replay buffer, networks, optimizers and settings.

        Args:
            obs_dim: observation shape/size handed to the encoders.
            action_dim: action size handed to actor, critics and dynamics.
            args: configuration object. Attributes read here: buffer_size,
                alpha, gamma, batch_size, feature_dim, layer_num,
                filter_num, tau, encoder_tau, actor_update, critic_update,
                training_start, training_step, train_alpha, hidden_dim,
                log_std_min, log_std_max, actor_lr, critic_lr, encoder_lr,
                alpha_lr, decoder_lr.
        """

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        # Temperature is kept in log space; its entropy target is -action_dim.
        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        # Separate soft-update rates for the critics and the encoder.
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau

        # Step intervals for the actor update and the target soft update.
        self.actor_update = args.actor_update
        self.critic_update = args.critic_update

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        # Actor and critics take encoder features (feature_dim), not raw observations.
        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)

        # Latent transition model (constructed with deterministic=False) and reward model.
        self.dynamics_model = Transition_Network(self.feature_dim,
                                                 action_dim,
                                                 deterministic=False)
        self.reward_model = Reward_Network(self.feature_dim)

        # Targets start from the weights of their online counterparts.
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr)

        # Dynamics and reward models share the decoder learning rate.
        self.dynamics_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)
        self.reward_optimizer = tf.keras.optimizers.Adam(args.decoder_lr)

        self.current_step = 0

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder,
            'Dynamics': self.dynamics_model,
            'Reward': self.reward_model
        }

        self.name = 'DBC_SACv2'

    @property
    def alpha(self):
        """Current temperature, i.e. exp(log_alpha)."""
        return tf.exp(self.log_alpha)

    def get_action(self, obs):
        """Encode one observation and return the actor's default-mode action (not clipped)."""
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]

        return action

    def eval_action(self, obs):
        """Encode one observation and return the actor's ``deterministic=True`` action (not clipped)."""
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]

        return action

    def train(self, local_step):
        """Run one training step on a single sampled mini-batch.

        Order of updates: both critics (each step also moves the encoder),
        then every ``actor_update`` steps the actor and, if enabled, the
        temperature; target soft updates every ``critic_update`` steps;
        then the encoder loss, the dynamics model and the reward model.

        Args:
            local_step: not read anywhere in this method.

        Returns:
            A list of ``[tag, value]`` pairs. Actor and alpha losses are
            only present on steps where those updates ran.
        """
        self.current_step += 1
        # Accumulators start at zero and receive exactly one value each per call.
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        total_encoder_loss = 0
        total_dynamics_loss = 0
        total_reward_loss = 0
        loss_list = []
        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        # Next action comes from the online encoder; its value from the target encoder/critics.
        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns), ns_action),
            self.target_critic2(self.target_encoder(ns), ns_action))

        # alpha enters as a NumPy value and the whole target is cut off from gradients.
        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))

        # Persistent tape: two separate gradient() calls are made below.
        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        # Each critic optimizer updates the encoder variables together with its critic.
        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        if self.current_step % self.actor_update == 0:
            with tf.GradientTape() as tape2:

                # Features are detached, so the actor loss cannot move the encoder.
                s_action, s_logpi = self.actor(
                    tf.stop_gradient(self.encoder(s)))

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                    self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

                actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                            min_aq_rep)

            # Only the actor variables are differentiated here.
            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            # Temperature update; it only runs on actor-update steps.
            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(self.encoder(s))
                    # stop_gradient leaves log_alpha as the only differentiable input.
                    alpha_loss = -(
                        tf.exp(self.log_alpha) *
                        tf.stop_gradient(s_logpi + self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)
                    # Alternative kept from the original author:
                    # alpha_loss = tf.reduce_mean(alpha_loss)

                log_alpha_gradients = tape3.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape3

        # Soft-update the target critics (tau) and the target encoder (encoder_tau).
        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        # Train the encoder: feature distance is regressed onto reward distance
        # plus gamma times the distance between predicted transitions.
        with tf.GradientTape() as tape4:
            # Pair every sample with a randomly permuted partner from the same batch.
            new_ids = np.arange(len(s))
            np.random.shuffle(new_ids)
            s2 = tf.gather(s, new_ids)

            feature = self.encoder(s)
            # Alternative kept from the original author:
            # feature2 = tf.gather(feature, new_ids)
            feature2 = self.encoder(s2)

            # Reward, actor and dynamics models all see detached features.
            reward = self.reward_model(tf.stop_gradient(feature))
            # Alternative kept from the original author:
            # reward2 = tf.gather(reward, new_ids)
            reward2 = self.reward_model(tf.stop_gradient(feature2))

            # Second positional argument True - presumably the deterministic
            # flag used by eval_action; confirm against the actor's signature.
            feature_action, _ = self.actor(tf.stop_gradient(feature), True)
            feature2_action, _ = self.actor(tf.stop_gradient(feature2), True)

            mu, sigma = self.dynamics_model(tf.stop_gradient(feature),
                                            feature_action)
            mu2, sigma2 = self.dynamics_model(tf.stop_gradient(feature2),
                                              feature2_action)

            # Huber distances are reshaped into column vectors.
            z_dist = tf.reshape(tf.keras.losses.huber(feature, feature2),
                                shape=[-1, 1])
            r_dist = tf.reshape(tf.keras.losses.huber(reward, reward2),
                                shape=[-1, 1])
            # Element-wise distance between the two predicted (mu, sigma) pairs.
            transition_dist = tf.sqrt(
                tf.square(mu - mu2) + tf.square(sigma - sigma2))

            bisimilarity = r_dist + self.gamma * transition_dist
            encoder_loss = tf.reduce_mean(tf.square(z_dist - bisimilarity))

        # Only the encoder variables are updated with this loss.
        encoder_gradients = tape4.gradient(encoder_loss,
                                           self.encoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))

        # Train the dynamics model (and the encoder) on the detached next-state feature.
        with tf.GradientTape() as tape5:
            feature = self.encoder(s)
            mu, sigma = self.dynamics_model(feature, a)

            # A zero in sigma's first entry is handled by replacing sigma with
            # ones, which keeps the division and the log below finite. "error"
            # is printed when the model is not flagged as deterministic.
            if (sigma[0][0].numpy() == 0):
                if self.dynamics_model.deterministic == False:
                    print("error")
                sigma = tf.ones_like(mu)

            next_feature = self.encoder(ns)
            diff = (mu - tf.stop_gradient(next_feature)) / sigma

            dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                           tf.math.log(sigma))

        dynamics_gradients = tape5.gradient(
            dynamics_loss, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables)
        self.dynamics_optimizer.apply_gradients(
            zip(
                dynamics_gradients, self.encoder.trainable_variables +
                self.dynamics_model.trainable_variables))

        # Train the reward model (and the encoder): reward is predicted from a
        # sampled next feature; the dynamics model's variables are not updated here.
        with tf.GradientTape() as tape6:
            feature = self.encoder(s)
            sample_dynamics = self.dynamics_model.sample(feature, a)
            reward_prediction = self.reward_model(sample_dynamics)

            reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

        reward_gradients = tape6.gradient(
            reward_loss, self.encoder.trainable_variables +
            self.reward_model.trainable_variables)
        self.reward_optimizer.apply_gradients(
            zip(
                reward_gradients, self.encoder.trainable_variables +
                self.reward_model.trainable_variables))

        # Collect the losses in a fixed order for the caller.
        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        # actor_loss only exists on steps where the actor was updated.
        if self.current_step % self.actor_update == 0:
            total_a_loss += actor_loss.numpy()
            loss_list.append(['Loss/Actor', total_a_loss])

        total_encoder_loss += encoder_loss.numpy()
        loss_list.append(['Loss/Encoder', total_encoder_loss])

        total_dynamics_loss += dynamics_loss.numpy()
        loss_list.append(['Loss/Dynamics', total_dynamics_loss])

        total_reward_loss += reward_loss.numpy()
        loss_list.append(['Loss/Reward', total_reward_loss])

        # alpha_loss only exists when the temperature update ran this step.
        if self.current_step % self.actor_update == 0 and self.train_alpha == True:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Пример #3
0
class TD3:
    """TD3 agent: deterministic actor, twin critics, delayed actor/target updates."""

    def __init__(self, state_dim, action_dim, args):
        """Create the replay buffer, optimizers, networks and settings.

        Args:
            state_dim: state size handed to the actor and the critics.
            action_dim: action size handed to the actor and the critics.
            args: configuration object. Attributes read here: buffer_size,
                actor_lr, critic_lr, batch_size, gamma, tau, policy_delay,
                actor_noise, target_noise, noise_clip, training_start,
                training_step, hidden_dim.
        """
        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.actor_lr = args.actor_lr
        self.critic_lr = args.critic_lr
        # The actor and all targets are updated every `policy_delay` steps.
        self.policy_delay = args.policy_delay
        # Std-dev of the exploration noise added in get_action().
        self.actor_noise = args.actor_noise
        # Std-dev and clipping bound of the noise added to target actions.
        self.target_noise = args.target_noise
        self.noise_clip = args.noise_clip
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0

        self.actor = Policy_network(self.state_dim, self.action_dim,
                                    args.hidden_dim)
        self.target_actor = Policy_network(self.state_dim, self.action_dim,
                                           args.hidden_dim)
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic2 = Q_network(self.state_dim, self.action_dim,
                                        args.hidden_dim)

        # Targets start from the weights of their online counterparts.
        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        # NOTE(review): target_actor is not registered here although both
        # target critics are - confirm whether consumers of network_list
        # need it before adding a key.
        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2
        }
        self.name = 'TD3'

    def get_action(self, state):
        """Return the actor's action plus Gaussian noise, clipped to [-1, 1]."""
        state = np.expand_dims(np.array(state), axis=0)
        noise = np.random.normal(loc=0,
                                 scale=self.actor_noise,
                                 size=self.action_dim)
        action = self.actor(state).numpy()[0] + noise
        action = np.clip(action, -1, 1)

        return action

    def eval_action(self, state):
        """Return the actor's noise-free action, clipped to [-1, 1]."""
        state = np.expand_dims(np.array(state), axis=0)
        action = self.actor(state).numpy()[0]
        action = np.clip(action, -1, 1)

        return action

    def train(self, training_num):
        """Run ``training_num`` update iterations on sampled mini-batches.

        Both critics are updated every iteration. The actor and all target
        networks are updated only every ``policy_delay`` steps.

        Returns:
            A list of ``[tag, value]`` pairs. Critic losses are summed over
            all iterations; the actor loss is summed over the iterations in
            which the actor was actually updated.
        """
        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        for _ in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # Evaluate the target actor once and reuse it for the noise shape.
            target_mu = self.target_actor(ns)
            smoothing_noise = tf.clip_by_value(
                tf.random.normal(shape=target_mu.shape,
                                 mean=0,
                                 stddev=self.target_noise),
                -self.noise_clip, self.noise_clip)
            target_action = tf.clip_by_value(target_mu + smoothing_noise,
                                             -1, 1)

            # Bootstrapped target from the smaller of the two target critics.
            target_value = tf.stop_gradient(
                r + self.gamma *
                (1 - d) * tf.minimum(self.target_critic1(ns, target_action),
                                     self.target_critic2(ns, target_action)))

            # Persistent tape: two separate gradient() calls are made below.
            with tf.GradientTape(persistent=True) as tape:
                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic1(s, a)))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - self.critic2(s, a)))

            critic1_grad = tape.gradient(critic1_loss,
                                         self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_grad, self.critic1.trainable_variables))

            critic2_grad = tape.gradient(critic2_loss,
                                         self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_grad, self.critic2.trainable_variables))

            del tape

            if self.current_step % self.policy_delay == 0:
                with tf.GradientTape() as actor_tape:
                    actor_loss = -tf.reduce_mean(self.critic1(
                        s, self.actor(s)))

                actor_grad = actor_tape.gradient(
                    actor_loss, self.actor.trainable_variables)
                self.actor_optimizer.apply_gradients(
                    zip(actor_grad, self.actor.trainable_variables))

                soft_update(self.actor, self.target_actor, self.tau)
                soft_update(self.critic1, self.target_critic1, self.tau)
                soft_update(self.critic2, self.target_critic2, self.tau)

                # The actor loss only exists on delayed-update steps, so it
                # is accumulated here rather than unconditionally.
                total_a_loss += actor_loss.numpy()

            total_c1_loss += critic1_loss.numpy()
            total_c2_loss += critic2_loss.numpy()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
                ['Loss/Critic2', total_c2_loss]]
Пример #4
0
class DDQN:
    """Double DQN agent with an online network and a periodically copied target network."""

    def __init__(self, state_dim, action_dim, args):
        """Create the replay buffer, optimizer, networks and settings.

        Args:
            state_dim: state size handed to the Q-networks.
            action_dim: number of discrete actions.
            args: configuration object. Attributes read here: buffer_size,
                learning_rate, batch_size, gamma, epsilon, training_start,
                training_step, copy_iter, hidden_dim.
        """
        self.buffer = Buffer(args.buffer_size)

        self.optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.lr = args.learning_rate
        # Probability of taking a uniformly random action in get_action().
        self.epsilon = args.epsilon
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        # The target network is overwritten every `copy_iter` training steps.
        self.copy_iter = args.copy_iter

        self.network = Policy_network(self.state_dim, self.action_dim,
                                      args.hidden_dim)
        self.target_network = Policy_network(self.state_dim, self.action_dim,
                                             args.hidden_dim)

        copy_weight(self.network, self.target_network)

        self.network_list = {
            'Network': self.network,
            'Target_Network': self.target_network
        }
        self.name = 'Double DQN'

    def get_action(self, state):
        """Return an epsilon-greedy action index for one state."""
        # Decide on exploration first, so that no forward pass is spent on
        # a Q estimate that would be thrown away.
        if np.random.random() < self.epsilon:
            return np.random.randint(low=0, high=self.action_dim)
        return self.eval_action(state)

    def eval_action(self, state):
        """Return the greedy action index for one state."""
        state = np.expand_dims(np.array(state), axis=0)

        q_value = self.network(state, activation='linear').numpy()
        best_action = np.argmax(q_value, axis=1)[0]

        return best_action

    def train(self, training_num):
        """Run ``training_num`` Double-DQN updates on sampled mini-batches.

        Returns:
            ``[['Loss/Loss', total]]`` with the loss summed over all
            iterations.
        """
        total_loss = 0

        for _ in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # Double DQN: the online network picks the next action and the
            # target network supplies that action's value.
            greedy_next = tf.argmax(self.network(ns, activation='linear'),
                                    axis=1,
                                    output_type=tf.int32)
            greedy_mask = tf.one_hot(greedy_next, depth=self.action_dim)
            next_q = tf.reduce_sum(
                self.target_network(ns, activation='linear') * greedy_mask,
                axis=1,
                keepdims=True)
            target_value = tf.stop_gradient(r + self.gamma * (1 - d) * next_q)

            with tf.GradientTape() as tape:
                # One-hot mask selecting the Q-value of the action taken;
                # the squeeze drops axis 1 of the one-hot result.
                action_mask = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32),
                                                    self.action_dim),
                                         axis=1)
                selected_values = tf.reduce_sum(
                    self.network(s, activation='linear') * action_mask,
                    axis=1,
                    keepdims=True)
                loss = 0.5 * tf.math.reduce_mean(
                    tf.square(target_value - selected_values))

            variables = self.network.trainable_variables
            gradients = tape.gradient(loss, variables)
            self.optimizer.apply_gradients(zip(gradients, variables))

            if self.current_step % self.copy_iter == 0:
                copy_weight(self.network, self.target_network)

            total_loss += loss.numpy()

        return [['Loss/Loss', total_loss]]
Пример #5
0
class ImageDQN:
    """DQN agent whose Q-network works on features from a pixel encoder."""

    def __init__(self, obs_dim, action_dim, args):
        """Create the replay buffer, optimizer, networks and settings.

        Args:
            obs_dim: observation shape/size handed to the encoders.
            action_dim: number of discrete actions.
            args: configuration object. Attributes read here: buffer_size,
                learning_rate, feature_dim, batch_size, gamma, epsilon,
                training_start, training_step, copy_iter, layer_num,
                filter_num, hidden_dim.
        """
        self.buffer = Buffer(args.buffer_size)
        self.optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.feature_dim = args.feature_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.learning_rate = args.learning_rate
        # Probability of taking a uniformly random action in get_action().
        self.epsilon = args.epsilon
        self.training_start = args.training_start
        self.training_step = args.training_step
        self.current_step = 0
        # Both targets are overwritten every `copy_iter` training steps.
        self.copy_iter = args.copy_iter

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num

        # The Q-networks take encoder features (feature_dim), not raw observations.
        self.network = Policy_network(self.feature_dim, self.action_dim,
                                      args.hidden_dim)
        self.target_network = Policy_network(self.feature_dim, self.action_dim,
                                             args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num,
                                    'channels_last')
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num,
                                           'channels_last')

        copy_weight(self.network, self.target_network)
        copy_weight(self.encoder, self.target_encoder)

        # The encoders are trained and copied together with the Q-networks,
        # so they are registered as well (DBC_SACv2 in this file does the
        # same). NOTE(review): network_list is presumably consumed by
        # save/load utilities - confirm that they accept the extra keys.
        self.network_list = {
            'Network': self.network,
            'Target_Network': self.target_network,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder
        }
        self.name = 'ImageDQN'

    def get_action(self, obs):
        """Return an epsilon-greedy action index for one observation."""
        if np.random.random() < self.epsilon:
            return np.random.randint(low=0, high=self.action_dim)
        return self.eval_action(obs)

    def eval_action(self, obs):
        """Return the greedy action index for one observation."""
        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        q_value = self.network(feature, activation='linear').numpy()
        best_action = np.argmax(q_value, axis=1)[0]

        return best_action

    def train(self, training_num):
        """Run ``training_num`` DQN updates on sampled mini-batches.

        The encoder and the Q-network are trained jointly with a single
        optimizer.

        Returns:
            ``[['Loss/Loss', total]]`` with the loss summed over all
            iterations.
        """
        total_loss = 0
        for _ in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # Bootstrapped target: max Q from the target encoder + network.
            target_q = tf.reduce_max(self.target_network(
                self.target_encoder(ns), activation='linear'),
                                     axis=1,
                                     keepdims=True)
            target_value = tf.stop_gradient(r + self.gamma * (1 - d) *
                                            target_q)

            with tf.GradientTape() as tape:
                # One-hot mask selecting the Q-value of the action taken;
                # the squeeze drops axis 1 of the one-hot result.
                action_mask = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32),
                                                    self.action_dim),
                                         axis=1)
                selected_values = tf.reduce_sum(
                    self.network(self.encoder(s), activation='linear') *
                    action_mask,
                    axis=1,
                    keepdims=True)
                loss = 0.5 * tf.reduce_mean(
                    tf.square(target_value - selected_values))

            # Encoder and Q-network variables are updated in one step.
            variables = (self.encoder.trainable_variables +
                         self.network.trainable_variables)
            gradients = tape.gradient(loss, variables)
            self.optimizer.apply_gradients(zip(gradients, variables))

            if self.current_step % self.copy_iter == 0:
                copy_weight(self.network, self.target_network)
                copy_weight(self.encoder, self.target_encoder)

            total_loss += loss.numpy()

        return [['Loss/Loss', total_loss]]
Пример #6
0
class SACv2_AE:
    """SAC (v2, learnable temperature) trained on encoded observations.

    An encoder maps observations to `feature_dim` features that feed the actor
    and the twin critics; a decoder reconstructs the preprocessed observation
    from those features. Target copies of both critics and of the encoder are
    kept for the bootstrapped critic target.

    `args` must provide every hyper-parameter read in `__init__`
    (buffer_size, image_size, alpha, gamma, batch_size, feature_dim, ...).
    """

    def __init__(self, obs_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.image_size = args.image_size
        # Counts calls to train(); drives the actor/decoder/target schedules.
        self.current_step = 0

        # The temperature is optimised in log space; see the `alpha` property.
        self.log_alpha = tf.Variable(initial_value=tf.math.log(args.alpha),
                                     trainable=True)
        self.target_entropy = -action_dim
        self.gamma = args.gamma

        self.batch_size = args.batch_size
        self.feature_dim = args.feature_dim

        self.layer_num = args.layer_num
        self.filter_num = args.filter_num
        self.tau = args.tau
        self.encoder_tau = args.encoder_tau
        self.actor_update = args.actor_update
        self.critic_update = args.critic_update
        self.decoder_update = args.decoder_update
        self.decoder_latent_lambda = args.decoder_latent_lambda
        self.decoder_weight_lambda = args.decoder_weight_lambda

        self.training_start = args.training_start
        self.training_step = args.training_step
        self.train_alpha = args.train_alpha

        self.actor = Squashed_Gaussian_Actor(self.feature_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.feature_dim, self.action_dim,
                                 args.hidden_dim)
        self.target_critic1 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)
        self.target_critic2 = Q_network(self.feature_dim, self.action_dim,
                                        args.hidden_dim)

        self.encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)
        self.target_encoder = PixelEncoder(self.obs_dim, self.feature_dim,
                                           self.layer_num, self.filter_num)
        self.decoder = PixelDecoder(self.obs_dim, self.feature_dim,
                                    self.layer_num, self.filter_num)

        # Target copies start out identical to their online counterparts.
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)
        copy_weight(self.encoder, self.target_encoder)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.encoder_optimizer = tf.keras.optimizers.Adam(args.encoder_lr)
        self.decoder_optimizer = tfa.optimizers.AdamW(
            weight_decay=self.decoder_weight_lambda,
            learning_rate=args.decoder_lr)

        self.log_alpha_optimizer = tf.keras.optimizers.Adam(args.alpha_lr,
                                                            beta_1=0.5)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2,
            'Encoder': self.encoder,
            'Target_Encoder': self.target_encoder,
            'Decoder': self.decoder
        }
        self.name = 'SACv2_AE'

    @property
    def alpha(self):
        """Current temperature, i.e. exp(log_alpha)."""
        return tf.exp(self.log_alpha)

    def get_action(self, obs):
        """Return the actor's action for one observation (default actor mode)."""

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature)
        action = action.numpy()[0]

        return action

    def eval_action(self, obs):
        """Return the actor's action for one observation with deterministic=True."""

        obs = np.expand_dims(np.array(obs), axis=0)
        feature = self.encoder(obs)
        action, _ = self.actor(feature, deterministic=True)
        action = action.numpy()[0]

        return action

    def train(self, local_step):
        """Run one update step on a sampled batch and return the logged values.

        The critics (together with the encoder) are updated on every call.
        The actor, and the temperature when `train_alpha` is set, are updated
        every `actor_update` calls; the encoder/decoder reconstruction update
        runs every `decoder_update` calls; the target critics and target
        encoder are soft-updated every `critic_update` calls.

        `local_step` is not used by this method.

        Returns a list of `[name, value]` pairs containing the critic losses,
        the losses of whichever optional updates ran on this call, and the
        current temperature.
        """
        self.current_step += 1

        total_a_loss = 0
        total_c1_loss, total_c2_loss = 0, 0
        total_alpha_loss = 0
        total_ae_loss = 0
        loss_list = []

        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        # Next action comes from the online encoder + actor; its value comes
        # from the target encoder + target critics.
        ns_action, ns_logpi = self.actor(self.encoder(ns))

        target_min_aq = tf.minimum(
            self.target_critic1(self.target_encoder(ns), ns_action),
            self.target_critic2(self.target_encoder(ns), ns_action))

        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))

        # Critic update: each critic loss also trains the encoder.
        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(self.encoder(s), a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(self.encoder(s), a) - target_q))

        critic1_gradients = tape1.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_gradients, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_gradients, self.encoder.trainable_variables +
                self.critic2.trainable_variables))

        del tape1

        # Actor update: features are detached so the encoder is not trained
        # through the actor loss.
        if self.current_step % self.actor_update == 0:
            with tf.GradientTape() as tape2:

                s_action, s_logpi = self.actor(
                    tf.stop_gradient(self.encoder(s)))

                min_aq_rep = tf.minimum(
                    self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                    self.critic2(tf.stop_gradient(self.encoder(s)), s_action))

                actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                            min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            # Temperature update: only log_alpha receives a gradient.
            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    _, s_logpi = self.actor(self.encoder(s))
                    alpha_loss = -(
                        tf.exp(self.log_alpha) *
                        tf.stop_gradient(s_logpi + self.target_entropy))
                    alpha_loss = tf.nn.compute_average_loss(alpha_loss)

                log_alpha_gradients = tape3.gradient(alpha_loss,
                                                     [self.log_alpha])
                self.log_alpha_optimizer.apply_gradients(
                    zip(log_alpha_gradients, [self.log_alpha]))

                del tape3

        # Encoder/decoder update: reconstruction error plus a penalty on the
        # squared norm of the features.
        if self.current_step % self.decoder_update == 0:
            with tf.GradientTape(persistent=True) as tape4:
                feature = self.encoder(s)
                recovered_s = self.decoder(feature)
                real_s = preprocess_obs(s)

                rec_loss = tf.reduce_mean(tf.square(recovered_s - real_s))
                latent_loss = tf.reduce_mean(
                    0.5 * tf.reduce_sum(tf.square(feature), axis=1))

                ae_loss = rec_loss + self.decoder_latent_lambda * latent_loss

            encoder_gradients = tape4.gradient(
                ae_loss, self.encoder.trainable_variables)
            decoder_gradients = tape4.gradient(
                ae_loss, self.decoder.trainable_variables)

            self.encoder_optimizer.apply_gradients(
                zip(encoder_gradients, self.encoder.trainable_variables))
            self.decoder_optimizer.apply_gradients(
                zip(decoder_gradients, self.decoder.trainable_variables))

            # The tape only exists in this branch, so it must be released
            # here; deleting it unconditionally raises UnboundLocalError on
            # steps that skip the autoencoder update.
            del tape4

        if self.current_step % self.critic_update == 0:

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder, self.encoder_tau)

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

        if self.current_step % self.decoder_update == 0:
            total_ae_loss += ae_loss.numpy()
            loss_list.append(['Loss/AutoEncoder', total_ae_loss])

        if self.current_step % self.actor_update == 0:
            total_a_loss += actor_loss.numpy()
            loss_list.append(['Loss/Actor', total_a_loss])
            if self.train_alpha == True:
                total_alpha_loss += alpha_loss.numpy()
                loss_list.append(['Loss/Alpha', total_alpha_loss])

        loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

        return loss_list
Пример #7
0
class SAC_v2:
    """Soft Actor-Critic with twin critics, target critics and a temperature
    that is optionally trained towards a target entropy of `-action_dim`.

    Hyper-parameters are passed as keyword arguments. `reward_scale` is
    stored on the instance but is not used by `train`.
    """

    def __init__(self,
                 state_dim,
                 action_dim,
                 hidden_dim=256,
                 training_step=1,
                 alpha=0.1,
                 train_alpha=True,
                 batch_size=128,
                 buffer_size=1e6,
                 tau=0.005,
                 learning_rate=0.0003,
                 gamma=0.99,
                 reward_scale=1,
                 training_start=500):

        self.buffer = Buffer(buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.reward_scale = reward_scale
        self.training_start = training_start
        self.training_step = training_step

        # The temperature is optimised in log space; see the `alpha` property.
        self.log_alpha = tf.Variable(np.log(alpha),
                                     dtype=tf.float32,
                                     trainable=True)
        self.target_entropy = -action_dim
        self.alpha_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.train_alpha = train_alpha

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic1 = Q_network(self.state_dim, self.action_dim,
                                        (hidden_dim, hidden_dim))
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.target_critic2 = Q_network(self.state_dim, self.action_dim,
                                        (hidden_dim, hidden_dim))

        # Target critics start out identical to the online critics.
        copy_weight(self.critic1, self.target_critic1)
        copy_weight(self.critic2, self.target_critic2)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'Target_Critic1': self.target_critic1,
            'Target_Critic2': self.target_critic2
        }
        self.name = 'SAC_v2'

    @property
    def alpha(self):
        """Current temperature, i.e. exp(log_alpha)."""
        return tf.exp(self.log_alpha)

    def get_action(self, state):
        """Return the actor's output for a single state as a NumPy array."""
        state = np.expand_dims(np.array(state), axis=0)

        action = self.actor(state).numpy()[0]

        return action

    def train(self, training_num):
        """Run `training_num` update steps on batches sampled from the buffer.

        Every step updates both critics, the actor, the temperature (when
        `train_alpha` is set) and soft-updates the target critics. Nothing is
        returned.
        """
        for i in range(training_num):
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # Evaluate the actor once so that both target critics score the
            # same next action; calling it per critic could take the minimum
            # over values of two different actions.
            next_action = self.actor(ns)
            target_min_aq = tf.minimum(self.target_critic1(ns, next_action),
                                       self.target_critic2(ns, next_action))

            target_q = tf.stop_gradient(
                r + self.gamma * (1 - d) *
                (target_min_aq - self.alpha.numpy() * self.actor.log_pi(ns)))

            # Critic training: both losses share one persistent tape.
            with tf.GradientTape(persistent=True) as tape1:
                critic1_loss = tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape1.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))
            critic2_gradients = tape1.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            del tape1

            # Actor training with a reparameterised sample mu + eps * sigma.
            with tf.GradientTape() as tape2:
                mu, sigma = self.actor.mu_sigma(s)
                output = mu + tf.random.normal(shape=mu.shape) * sigma

                min_aq_rep = tf.minimum(self.critic1(s, output),
                                        self.critic2(s, output))

                actor_loss = tf.reduce_mean(self.alpha.numpy() *
                                            self.actor.log_pi(s) - min_aq_rep)

            actor_gradients = tape2.gradient(actor_loss,
                                             self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_gradients, self.actor.trainable_variables))

            del tape2

            # Temperature training: only log_alpha receives a gradient.
            if self.train_alpha == True:
                with tf.GradientTape() as tape3:
                    alpha_loss = -(tf.exp(self.log_alpha) * (tf.stop_gradient(
                        self.actor.log_pi(s) + self.target_entropy)))
                    alpha_loss = tf.nn.compute_average_loss(
                        alpha_loss)  # from softlearning package

                alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
                self.alpha_optimizer.apply_gradients(
                    zip(alpha_grad, [self.log_alpha]))

                del tape3

            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
Пример #8
0
class SAC_v1:
    """Soft Actor-Critic with a separate state-value network and a fixed
    temperature `alpha`.

    Twin critics are regressed onto a target built from a target copy of the
    value network. Hyper-parameters are passed as keyword arguments;
    `reward_scale` is stored on the instance but is not used by `train`.
    """

    def __init__(self,
                 state_dim,
                 action_dim,
                 hidden_dim=256,
                 training_step=1,
                 batch_size=128,
                 buffer_size=1e6,
                 tau=0.005,
                 learning_rate=0.0003,
                 gamma=0.99,
                 alpha=0.2,
                 reward_scale=1,
                 training_start=500):

        self.buffer = Buffer(buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic1_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.critic2_optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.v_network_optimizer = tf.keras.optimizers.Adam(learning_rate)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.alpha = alpha
        self.reward_scale = reward_scale
        self.training_start = training_start
        self.training_step = training_step

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             (hidden_dim, hidden_dim))
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 (hidden_dim, hidden_dim))
        self.v_network = V_network(self.state_dim, (hidden_dim, hidden_dim))
        self.target_v_network = V_network(self.state_dim,
                                          (hidden_dim, hidden_dim))

        # The target value network starts out identical to the online one.
        copy_weight(self.v_network, self.target_v_network)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'V_network': self.v_network,
            'Target_V_network': self.target_v_network
        }
        self.name = 'SAC_v1'

    def get_action(self, state):
        """Return the actor's output for a single state as a NumPy array."""
        state = np.expand_dims(np.array(state), axis=0)

        action = self.actor(state).numpy()[0]

        return action

    def train(self, training_num):
        """Run `training_num` update steps on batches sampled from the buffer.

        Every step updates the value network, both critics and the actor, and
        then soft-updates the target value network. Nothing is returned.
        """
        for i in range(training_num):
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # Evaluate the actor once so that both critics score the same
            # action; calling it per critic could take the minimum over
            # values of two different actions.
            policy_action = self.actor(s)
            min_aq = tf.minimum(self.critic1(s, policy_action),
                                self.critic2(s, policy_action))

            target_v = tf.stop_gradient(min_aq -
                                        self.alpha * self.actor.log_pi(s))
            # Value-network training.
            with tf.GradientTape(persistent=True) as tape1:
                v_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.v_network(s) - target_v))

            v_gradients = tape1.gradient(v_loss,
                                         self.v_network.trainable_variables)
            self.v_network_optimizer.apply_gradients(
                zip(v_gradients, self.v_network.trainable_variables))

            del tape1

            # The critic target bootstraps from the target value network.
            target_q = tf.stop_gradient(r + self.gamma *
                                        (1 - d) * self.target_v_network(ns))
            # Critic training: both losses share one persistent tape.
            with tf.GradientTape(persistent=True) as tape2:

                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic1(s, a) - target_q))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic2(s, a) - target_q))

            critic1_gradients = tape2.gradient(
                critic1_loss, self.critic1.trainable_variables)
            self.critic1_optimizer.apply_gradients(
                zip(critic1_gradients, self.critic1.trainable_variables))

            critic2_gradients = tape2.gradient(
                critic2_loss, self.critic2.trainable_variables)
            self.critic2_optimizer.apply_gradients(
                zip(critic2_gradients, self.critic2.trainable_variables))

            del tape2
            # Actor training with a reparameterised sample mu + eps * sigma.
            with tf.GradientTape() as tape3:
                mu, sigma = self.actor.mu_sigma(s)
                output = mu + tf.random.normal(shape=sigma.shape) * sigma

                min_aq_rep = tf.minimum(self.critic1(s, output),
                                        self.critic2(s, output))

                actor_loss = tf.reduce_mean(self.alpha * self.actor.log_pi(s) -
                                            min_aq_rep)

            actor_grad = tape3.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            del tape3

            soft_update(self.v_network, self.target_v_network, self.tau)
Пример #9
0
class SAC_v1:
    """Soft Actor-Critic with a separate state-value network and a fixed
    temperature, configured from an `args` namespace.

    Twin critics are regressed onto a target built from a target copy of the
    value network; the actor returns an `(action, log_pi)` pair.
    """

    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        # One optimizer per trained network.
        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic1_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.critic2_optimizer = tf.keras.optimizers.Adam(args.critic_lr)
        self.v_network_optimizer = tf.keras.optimizers.Adam(args.v_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.tau = args.tau
        self.gamma = args.gamma
        self.alpha = args.alpha
        self.training_start = args.training_start
        self.training_step = args.training_step
        # Incremented once per gradient step in train().
        self.current_step = 0

        self.actor = Squashed_Gaussian_Actor(self.state_dim, self.action_dim,
                                             args.hidden_dim, args.log_std_min,
                                             args.log_std_max)
        self.critic1 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.critic2 = Q_network(self.state_dim, self.action_dim,
                                 args.hidden_dim)
        self.v_network = V_network(self.state_dim, args.hidden_dim)
        self.target_v_network = V_network(self.state_dim, args.hidden_dim)

        # The target value network starts out identical to the online one.
        copy_weight(self.v_network, self.target_v_network)

        self.network_list = {
            'Actor': self.actor,
            'Critic1': self.critic1,
            'Critic2': self.critic2,
            'V_network': self.v_network,
            'Target_V_network': self.target_v_network
        }
        self.name = 'SAC_v1'

    def get_action(self, state):
        """Return the actor's action for one state, clipped to [-1, 1]."""
        batched_state = np.array(state)[np.newaxis, ...]
        sampled, _ = self.actor(batched_state)
        return np.clip(sampled.numpy()[0], -1, 1)

    def eval_action(self, state):
        """Return the actor's deterministic action for one state, clipped to
        [-1, 1]."""
        batched_state = np.array(state)[np.newaxis, ...]
        chosen, _ = self.actor(batched_state, deterministic=True)
        return np.clip(chosen.numpy()[0], -1, 1)

    def train(self, training_num):
        """Run `training_num` update steps and return the summed losses.

        Every step updates the value network, both critics and the actor, and
        then soft-updates the target value network.

        Returns a list of `[name, value]` pairs for the actor, critic 1,
        critic 2 and value losses, each summed over all steps.
        """
        actor_total = 0
        critic1_total = 0
        critic2_total = 0
        value_total = 0

        for _ in range(training_num):
            self.current_step += 1

            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # Value target: minimum critic value of the actor's action minus
            # the temperature-weighted log-probability.
            pi_action, pi_logp = self.actor(s)
            q_min = tf.minimum(self.critic1(s, pi_action),
                               self.critic2(s, pi_action))
            v_target = tf.stop_gradient(q_min - self.alpha * pi_logp)

            with tf.GradientTape() as v_tape:
                v_error = self.v_network(s) - v_target
                v_loss = 0.5 * tf.reduce_mean(tf.square(v_error))

            v_grads = v_tape.gradient(v_loss,
                                      self.v_network.trainable_variables)
            self.v_network_optimizer.apply_gradients(
                zip(v_grads, self.v_network.trainable_variables))

            # The critic target bootstraps from the target value network.
            q_target = tf.stop_gradient(
                r + self.gamma * (1 - d) * self.target_v_network(ns))

            with tf.GradientTape(persistent=True) as q_tape:
                critic1_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic1(s, a) - q_target))
                critic2_loss = 0.5 * tf.reduce_mean(
                    tf.square(self.critic2(s, a) - q_target))

            # Critic 1 is updated before critic 2; both read the same tape.
            critic_updates = (
                (critic1_loss, self.critic1, self.critic1_optimizer),
                (critic2_loss, self.critic2, self.critic2_optimizer),
            )
            for q_loss, critic, optimizer in critic_updates:
                q_grads = q_tape.gradient(q_loss, critic.trainable_variables)
                optimizer.apply_gradients(
                    zip(q_grads, critic.trainable_variables))

            with tf.GradientTape() as pi_tape:
                new_action, new_logp = self.actor(s)
                q_min_new = tf.minimum(self.critic1(s, new_action),
                                       self.critic2(s, new_action))
                actor_loss = tf.reduce_mean(self.alpha * new_logp - q_min_new)

            pi_grads = pi_tape.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(pi_grads, self.actor.trainable_variables))

            soft_update(self.v_network, self.target_v_network, self.tau)

            del v_tape, q_tape, pi_tape

            actor_total += actor_loss.numpy()
            critic1_total += critic1_loss.numpy()
            critic2_total += critic2_loss.numpy()
            value_total += v_loss.numpy()

        return [
            ['Loss/Actor', actor_total],
            ['Loss/Critic1', critic1_total],
            ['Loss/Critic2', critic2_total],
            ['Loss/V', value_total],
        ]
Пример #10
0
class DDPG:
    """Deterministic actor-critic agent with target copies of the actor and
    the critic and Gaussian exploration noise, configured from `args`."""

    def __init__(self, state_dim, action_dim, args):

        self.buffer = Buffer(args.buffer_size)

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        # Standard deviation of the noise added in get_action().
        self.noise_scale = args.noise_scale
        self.training_start = args.training_start
        self.training_step = args.training_step
        # Incremented once per gradient step in train().
        self.current_step = 0

        self.actor = Policy_network(self.state_dim, self.action_dim,
                                    args.hidden_dim)
        self.target_actor = Policy_network(self.state_dim, self.action_dim,
                                           args.hidden_dim)
        self.critic = Q_network(self.state_dim, self.action_dim,
                                args.hidden_dim)
        self.target_critic = Q_network(self.state_dim, self.action_dim,
                                       args.hidden_dim)

        # Target networks start out identical to the online networks.
        copy_weight(self.actor, self.target_actor)
        copy_weight(self.critic, self.target_critic)

        self.network_list = {
            'Actor': self.actor,
            'Target_Actor': self.target_actor,
            'Critic': self.critic,
            'Target_Critic': self.target_critic
        }
        self.name = 'DDPG'

    def get_action(self, state):
        """Return the actor's action plus Gaussian noise, clipped to [-1, 1]."""
        batched_state = np.array(state)[np.newaxis, ...]
        exploration = np.random.normal(loc=0,
                                       scale=self.noise_scale,
                                       size=self.action_dim)
        noisy_action = self.actor(batched_state).numpy()[0] + exploration
        return np.clip(noisy_action, -1, 1)

    def eval_action(self, state):
        """Return the actor's noise-free action, clipped to [-1, 1]."""
        batched_state = np.array(state)[np.newaxis, ...]
        return np.clip(self.actor(batched_state).numpy()[0], -1, 1)

    def train(self, training_num):
        """Run `training_num` update steps and return the summed losses.

        Every step updates the critic, then the actor, and soft-updates both
        target networks.

        Returns `[['Loss/Actor', a], ['Loss/Critic', c]]`, with each loss
        summed over all steps.
        """
        actor_total = 0
        critic_total = 0

        for _ in range(training_num):
            self.current_step += 1
            s, a, r, ns, d = self.buffer.sample(self.batch_size)

            # One-step target from the target actor/critic pair; no gradient
            # flows through the bootstrapped value.
            next_q = tf.stop_gradient(
                self.target_critic(ns, self.target_actor(ns)))
            td_target = r + (1 - d) * self.gamma * next_q

            # Both losses are recorded on one persistent tape, before either
            # network is updated.
            with tf.GradientTape(persistent=True) as tape:
                td_error = td_target - self.critic(s, a)
                critic_loss = 0.5 * tf.reduce_mean(tf.square(td_error))
                actor_loss = -tf.reduce_mean(self.critic(s, self.actor(s)))

            critic_grads = tape.gradient(critic_loss,
                                         self.critic.trainable_variables)
            self.critic_optimizer.apply_gradients(
                zip(critic_grads, self.critic.trainable_variables))

            actor_grads = tape.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grads, self.actor.trainable_variables))

            soft_update(self.actor, self.target_actor, self.tau)
            soft_update(self.critic, self.target_critic, self.tau)

            del tape

            actor_total += actor_loss.numpy()
            critic_total += critic_loss.numpy()

        return [['Loss/Actor', actor_total], ['Loss/Critic', critic_total]]