Example #1
    def __init__(self,
                 env,
                 actor_model,
                 critic_model,
                 gamma=0.99,
                 tau=1e-3,
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 critic_decay=0.):
        # Changed this to use generic env instead of Task
        super().__init__(env)

        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
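
Every example in this section draws exploration noise from an OUNoise(size, mu, theta, sigma) process whose definition is not shown. The sketch below is a minimal Ornstein-Uhlenbeck implementation assuming that constructor order and the sample()/reset() interface used here; it is illustrative, not the source's own class.

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (sketch; assumes the OUNoise(size, mu, theta, sigma)
    signature used in the examples)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)   # long-running mean
        self.theta = theta             # speed of mean reversion
        self.sigma = sigma             # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state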
Example #2
    def __init__(self,
                 actor_model,
                 tgt_actor_model,
                 critic_model,
                 tgt_critic_model,
                 action_limits,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 critic_decay=1e-2,
                 tau=1e-3,
                 gamma=0.99,
                 process=None,
                 rb_size=1e6,
                 minibatch_size=64,
                 warmup_episodes=0,
                 episodes_trained=0,
                 train_scores=None,
                 test_scores=None,
                 best_train_score=-np.inf):
        # Changed this to use generic env instead of Task
        super().__init__(warmup_episodes, episodes_trained, train_scores,
                         test_scores, best_train_score)
        self.actor = Actor(actor_model, critic_model, lr=actor_lr)
        self.tgt_actor = Actor(tgt_actor_model, tgt_critic_model, lr=actor_lr)
        self.tgt_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(critic_model, lr=critic_lr, decay=critic_decay)
        self.tgt_critic = Critic(tgt_critic_model,
                                 lr=critic_lr,
                                 decay=critic_decay)
        self.tgt_critic.set_weights(self.critic.get_weights())

        self.action_limits = action_limits
        self.process = process
        self.minibatch_size = minibatch_size
        self.buffer = ReplayBuffer(int(rb_size), self.minibatch_size)
        self.tau = tau
        self.gamma = gamma

        self.state_space = K.int_shape(critic_model.inputs[0])[1]
        self.action_space = K.int_shape(critic_model.inputs[1])[1]

        self.learning_phase = 1
        if process is None:
            self.process = OUNoise(size=self.action_space,
                                   theta=0.15,
                                   mu=0,
                                   sigma=0.2)
        else:
            self.process = process
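
The ReplayBuffer(buffer_size, batch_size) used throughout these examples is likewise not shown. A minimal sketch, assuming the interface the agents rely on (add, sample, __len__, and a memory attribute holding experience tuples with state, action, reward, next_state, done fields):

import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch, assuming the
    ReplayBuffer(buffer_size, batch_size) interface used in the examples)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are discarded first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        """Store one transition."""
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a uniformly random minibatch of experiences."""
        return random.sample(self.memory, batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)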
Example #3
    def __init__(self, env_reset, state_size, action_size, action_low, action_high):
        """Params:
        env_reset: callback function to reset environment at end of episode
        state_size: dimension of state space
        action_size: dimension of action space
        action_low: float - minimum action value
        action_high: float - maximum action value
        """
        self.training_steps = 0 # number of training steps run so far

        self.env_reset = env_reset
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 1e-3  # for soft update of target parameters
        self.critic_decay = 1e-2 # L2 weight decay for critic (regularization)
        self.critic_lr = 1e-3 # Learning rate for critic
        self.critic_alpha = 1e-2 # Leaky ReLU alpha for critic
        self.actor_lr = 1e-4 # Learning rate for actor
        self.actor_alpha = 1e-2 # Leaky ReLU alpha for actor

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_alpha)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_alpha)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay, self.critic_alpha)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay, self.critic_alpha)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = int(1e6)
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
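
This example configures its Critic with a learning rate, an L2 weight decay and a LeakyReLU alpha, but the network itself is defined elsewhere. The sketch below shows one plausible Keras critic under those assumptions; the hidden-layer sizes and the add-merge of the state and action paths are illustrative choices, not taken from the source.

from tensorflow.keras import layers, models, optimizers, regularizers

def build_critic_model(state_size, action_size, lr=1e-3, decay=1e-2, alpha=1e-2):
    """Q(s, a) network sketch: separate state/action paths merged before the Q head."""
    states = layers.Input(shape=(state_size,), name='states')
    actions = layers.Input(shape=(action_size,), name='actions')

    net_states = layers.Dense(400, kernel_regularizer=regularizers.l2(decay))(states)
    net_states = layers.LeakyReLU(alpha=alpha)(net_states)

    net_actions = layers.Dense(400, kernel_regularizer=regularizers.l2(decay))(actions)
    net_actions = layers.LeakyReLU(alpha=alpha)(net_actions)

    net = layers.Add()([net_states, net_actions])
    net = layers.Dense(300, kernel_regularizer=regularizers.l2(decay))(net)
    net = layers.LeakyReLU(alpha=alpha)(net)

    q_values = layers.Dense(1, name='q_values')(net)
    model = models.Model(inputs=[states, actions], outputs=q_values)
    model.compile(optimizer=optimizers.Adam(learning_rate=lr), loss='mse')
    return model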
Example #4
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
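
The Actor(state_size, action_size, action_low, action_high) wrapper used here must produce actions inside the environment's bounds. One common construction (a sketch only; the layer sizes and the sigmoid-plus-rescale output are assumptions, not the source's definition) is:

from tensorflow.keras import layers, models

def build_actor_model(state_size, action_size, action_low, action_high):
    """Policy network sketch: mu(s) with outputs rescaled into [action_low, action_high]."""
    action_range = action_high - action_low

    states = layers.Input(shape=(state_size,), name='states')
    net = layers.Dense(400, activation='relu')(states)
    net = layers.Dense(300, activation='relu')(net)

    # Sigmoid gives values in [0, 1]; rescale them to the environment's action bounds.
    raw_actions = layers.Dense(action_size, activation='sigmoid', name='raw_actions')(net)
    actions = layers.Lambda(lambda x: x * action_range + action_low, name='actions')(raw_actions)

    return models.Model(inputs=states, outputs=actions)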
Example #5
class DDPGAgent(Agent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self,
                 actor_model,
                 tgt_actor_model,
                 critic_model,
                 tgt_critic_model,
                 action_limits,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 critic_decay=1e-2,
                 tau=1e-3,
                 gamma=0.99,
                 process=None,
                 rb_size=1e6,
                 minibatch_size=64,
                 warmup_episodes=0,
                 episodes_trained=0,
                 train_scores=None,
                 test_scores=None,
                 best_train_score=-np.inf):
        # Changed this to use generic env instead of Task
        super().__init__(warmup_episodes, episodes_trained, train_scores,
                         test_scores, best_train_score)
        self.actor = Actor(actor_model, critic_model, lr=actor_lr)
        self.tgt_actor = Actor(tgt_actor_model, tgt_critic_model, lr=actor_lr)
        self.tgt_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(critic_model, lr=critic_lr, decay=critic_decay)
        self.tgt_critic = Critic(tgt_critic_model,
                                 lr=critic_lr,
                                 decay=critic_decay)
        self.tgt_critic.set_weights(self.critic.get_weights())

        self.action_limits = action_limits
        self.process = process
        self.minibatch_size = minibatch_size
        self.buffer = ReplayBuffer(int(rb_size), self.minibatch_size)
        self.tau = tau
        self.gamma = gamma

        self.state_space = K.int_shape(critic_model.inputs[0])[1]
        self.action_space = K.int_shape(critic_model.inputs[1])[1]

        self.learning_phase = 1
        if process is None:
            self.process = OUNoise(size=self.action_space,
                                   theta=0.15,
                                   mu=0,
                                   sigma=0.2)
        else:
            self.process = process

    def sense(self, s, a, r, s_new, done):
        s = np.reshape(s, [-1, self.state_space])
        s_new = np.reshape(s_new, [-1, self.state_space])
        self.buffer.add(s, a, r, s_new, done)

    def act(self, s):
        s = np.reshape(s, [-1, self.state_space])
        a = self.tgt_actor(s)
        # Cache.
        self.last_state = np.copy(s)
        self.last_action = np.copy(a)
        if self.learning_phase:
            a += self.process.sample()
        a = np.clip(a, self.action_limits[0], self.action_limits[1])

        self.last_action_noisy = np.copy(a)
        return a

    def new_episode(self):
        self.process.reset()

    def train_step(self):
        if len(self.buffer.memory) < self.minibatch_size:
            return

        minibatch = self.buffer.sample(self.minibatch_size)
        states = np.zeros([len(minibatch), self.state_space])
        states_new = np.zeros([len(minibatch), self.state_space])
        actions = np.zeros([len(minibatch), self.action_space])
        r = np.zeros([len(minibatch), 1])
        dones = np.zeros([len(minibatch), 1])

        for i in range(len(minibatch)):
            states[i], actions[i], r[i], states_new[i], dones[i] = minibatch[i]

        # Estimate Q-values (the local-critic estimate is computed but not used below)
        critic_out = self.critic(states_new, self.actor(states_new))
        tgt_critic_out = self.tgt_critic(states_new,
                                         self.tgt_actor(states_new))

        # TD targets from the target critic; terminal transitions do not bootstrap
        ys = r + self.gamma * tgt_critic_out * (1 - dones)

        # Train local critic and actor
        self.critic.step(states, actions, ys)
        self.actor.step(states)

        # Soft weight updates for target critic and actor
        critic_weights = self.critic.get_weights()
        tgt_critic_weights = self.tgt_critic.get_weights()
        for i in range(len(critic_weights)):
            tgt_critic_weights[i] = (1 - self.tau) * tgt_critic_weights[i] + \
                self.tau * critic_weights[i]
        self.tgt_critic.set_weights(tgt_critic_weights)

        actor_weights = self.actor.get_weights()
        tgt_actor_weights = self.tgt_actor.get_weights()
        for i in range(len(actor_weights)):
            tgt_actor_weights[i] = (1 - self.tau) * tgt_actor_weights[i] + \
                self.tau * actor_weights[i]
        self.tgt_actor.set_weights(tgt_actor_weights)
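
The two loops at the end of train_step implement DDPG's soft target update, theta_target <- tau * theta_local + (1 - tau) * theta_target. The same rule as a small reusable helper, operating on the list-of-arrays format that Keras get_weights/set_weights exchange:

import numpy as np

def soft_update_weights(local_weights, target_weights, tau):
    """Blend each target tensor toward its local counterpart: tau * local + (1 - tau) * target."""
    return [tau * lw + (1.0 - tau) * tw
            for lw, tw in zip(local_weights, target_weights)]

# Two weight tensors of different shapes, as Keras would return them.
local = [np.ones((2, 3)), np.zeros(3)]
target = [np.zeros((2, 3)), np.ones(3)]
blended = soft_update_weights(local, target, tau=1e-3)
assert np.allclose(blended[0], 1e-3) and np.allclose(blended[1], 1.0 - 1e-3)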
Example #6
class DDPG(Agent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, env):
        # Changed this to use generic env instead of Task
        super().__init__(env)

        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 1e-2  # for soft update of target parameters
        self.steps_trained = 0  # number of learning updates performed so far

        # Critic Params
        self.critic_lr = 1e-3
        self.critic_decay = 1e-2

        # Actor Params
        self.actor_lr = 1e-4
        self.actor_decay = 0

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_lr, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.actor_decay)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_lr, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_lr, self.critic_decay)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def reset_episode(self):
        self.noise.reset()
        state = self.env.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done, training=True):
        # Since DDPG is an off-policy learner, add a training flag

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if training and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
            self.steps_trained += 1

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, training=True):
        """Returns actions for given state(s) as per current policy."""
        # The training flag decides whether to add exploration noise
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if training:
            return list(action +
                        self.noise.sample())  # add some noise for exploration
        else:
            return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def save_model(self, filename):
        al = self.actor_local
        at = self.actor_target
        cl = self.critic_local
        ct = self.critic_target

        self.actor_local = None
        self.actor_target = None
        self.critic_local = None
        self.critic_target = None

        with open(filename + '.ddpg_agent', 'wb') as f:  # binary mode for pickle
            pickle.dump(self, f)

        al.save(filename + '.actor_local')
        at.save(filename + '.actor_target')
        cl.save(filename + '.critic_local')
        ct.save(filename + '.critic_target')

        self.actor_local = al
        self.actor_target = at
        self.critic_local = cl
        self.critic_target = ct

    @classmethod
    def load_model(cls, filename):
        with open(filename + '.ddpg_agent', 'rb') as f:  # binary mode for pickle
            m = pickle.load(f)
        m.actor_local = load_model(filename + '.actor_local')
        m.actor_target = load_model(filename + '.actor_target')
        m.critic_local = load_model(filename + '.critic_local')
        m.critic_target = load_model(filename + '.critic_target')
        return m
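
The DDPG class above is driven through reset_episode, act and step. A minimal training-loop sketch, assuming a continuous-action Gym environment with the classic 4-tuple step return; the environment name and episode budget are placeholders:

import gym

env = gym.make('Pendulum-v1')        # any continuous-action Gym environment
agent = DDPG(env)

num_episodes = 500                   # placeholder budget
for episode in range(num_episodes):
    state = agent.reset_episode()
    episode_reward, done = 0.0, False
    while not done:
        action = agent.act(state, training=True)
        next_state, reward, done, _ = env.step(action)
        agent.step(action, reward, next_state, done, training=True)
        state = next_state
        episode_reward += reward

agent.save_model('ddpg_pendulum')    # writes the .ddpg_agent, .actor_local, ... files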
Example #7
class DDPG():
    """Reinforcement Learning agent that learns using DDPG.    """
    def __init__(self, env_reset, state_size, action_size, action_low, action_high):
        """Params:
        env_reset: callback function to reset environemnt at end of episode
        state_size: dimension of state space
        action_size: dimension of action space
        action_low: float - minimum action value
        action_high: float - maximum action value
        """
        self.training_steps = 0 # number of training steps run so far

        self.env_reset = env_reset
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 1e-3  # for soft update of target parameters
        self.critic_decay = 1e-2 # L2 weight decay for critic (regularization)
        self.critic_lr = 1e-3 # Learning rate for critic
        self.critic_alpha = 1e-2 # Leaky ReLU alpha for critic
        self.actor_lr = 1e-4 # Learning rate for actor
        self.actor_alpha = 1e-2 # Leaky ReLU alpha for actor

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_alpha)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr, self.actor_alpha)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay, self.critic_alpha)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr, self.critic_decay, self.critic_alpha)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = int(1e6)
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    def reset_episode(self):
        self.noise.reset()
        state = self.env_reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done, training=True):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if training and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
            self.training_steps += 1

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, training=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if training: # add some noise for exploration
            return list(action + self.noise.sample())
        else:
            return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function


        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
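
The target line Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) is the TD(0) target y = r + gamma * Q'(s', mu'(s')), with bootstrapping switched off on terminal transitions. A small numeric check of the same computation:

import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.5], [2.0]])          # one row per sampled transition
q_targets_next = np.array([[10.0], [4.0], [8.0]])  # Q'(s', mu'(s')) from the target networks
dones = np.array([[0], [0], [1]], dtype=np.uint8)  # the last transition ends its episode

q_targets = rewards + gamma * q_targets_next * (1 - dones)
# The terminal row keeps only its immediate reward.
assert np.allclose(q_targets, [[1.0 + 0.99 * 10.0], [0.5 + 0.99 * 4.0], [2.0]])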
Example #8
class DDPGAgent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)