Example #1
class DDPG():
    """Reinforcement learning agent who learns using DDPG"""

    def __init__(self,task):
        """Initialize models"""
        self.env = task
        self.state_size = task.observation_space.shape[0]
        self.action_size = task.action_space.shape[0]
        self.action_high = task.action_space.high
        self.action_low = task.action_space.low

        # Initialize Actor (policy) models
        self.actor_local = Actor(self.state_size,self.action_size,self.action_low,self.action_high)
        self.actor_target = Actor(self.state_size,self.action_size,self.action_low,self.action_high)

        # Initialize Critic (value) models
        self.critic_local = Critic(self.state_size,self.action_size)
        self.critic_target = Critic(self.state_size,self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size,self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self,task):
        """Return state after reseting task"""
        self.noise.reset()
        state = task.reset()
        self.last_state = state
        return state

    def step(self,action,reward,next_state,done):
        # Add experience to memory
        self.memory.add_experience(self.last_state,action,reward,next_state,done)

        # Learn if memory holds more samples than the batch size
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        
        # Roll over state
        self.last_state = next_state

    def act(self,state):
        """Returns action using the policy network """
        state = np.reshape(state,[-1,self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action+self.noise.sample())

    def learn(self,experiences):
        # Convert experience tuples to separate arrays for each element

        states = np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size)
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.action_size) 
        next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1,1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1,1)

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict(next_states)
        Q_targets_next = self.critic_target.model.predict([next_states,actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma*Q_targets_next*(1-dones)
        self.critic_local.model.train_on_batch(x=[states,actions],y=Q_targets)
        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states,action_gradients,1])

        # Soft-update target models
        self.soft_update(self.actor_local.model,self.actor_target.model)
        self.soft_update(self.critic_local.model,self.critic_target.model)

    def soft_update(self,local_model,target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau*local_weights + (1-self.tau)*target_weights
        target_model.set_weights(new_weights)

    def save_model(self,path):
        self.actor_local.model.save_weights(path)

    def load_model(self,path):
        self.actor_local.model.load_weights(path)

    def act_only(self,state):
        state = np.reshape(state,[-1,self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action)
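
All of the examples on this page lean on helper classes (Actor, Critic, OUNoise, ReplayBuffer) that the snippets do not show. Below is a minimal sketch of the OUNoise and ReplayBuffer interfaces that Example #1 relies on; only the constructor arguments and method names (reset(), sample(), add_experience(), len()) are taken from the calls above, and the internals are assumptions based on the standard DDPG recipe.

# Minimal sketch of the OUNoise and ReplayBuffer helpers assumed by Example #1.
# Only the call signatures come from the example; the bodies are assumptions.
import random
from collections import deque, namedtuple

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state


class ReplayBuffer:
    """Fixed-size circular buffer of experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])

    def add_experience(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random batch of stored experiences."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)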
Example #2
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, verbose=False):
        self.verbose = verbose

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        #log_path = '/tmp/logs'
        #self.callback = callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
        #                        write_images=False, write_grads=True, write_graph=False)
        #self.callback.set_model(self.critic_local.model)

        #log_path = '/tmp/logs'
        #self.writer = tf.summary.FileWriter(log_path)

        #self.learn_counter = 0

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.1
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 512
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.015  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        #self.learn_counter = 0
        return state

    def mimic(self, experience_to_mimic):
        print("ready to mimic")
        self.memory.memory = experience_to_mimic

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        def save_grads(writer, model):
            for layer in model.layers:
                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf.summary.histogram(mapped_weight_name, weight)

                    grads = model.optimizer.get_gradients(
                        model.total_loss, weight)

                    def is_indexed_slices(grad):
                        return type(grad).__name__ == 'IndexedSlices'

                    grads = [
                        grad.values if is_indexed_slices(grad) else grad
                        for grad in grads
                    ]
                    tf.summary.histogram('{}_grad'.format(mapped_weight_name),
                                         grads)
                    merged = tf.summary.merge_all()
                    writer.flush()
                    writer.close()

        #save_grads(self.writer, self.critic_local.model)
        #def write_log(callback, names, logs, batch_no):
        #    for name, value in zip(names, logs):
        #        summary = tf.Summary()
        #        summary_value = summary.value.add()
        #        summary_value.simple_value = value
        #        summary_value.tag = name
        #        callback.writer.add_summary(summary, batch_no)
        #        callback.writer.flush()

        #train_names = ['train_loss', 'train_mae']
        #print("about to write log")
        #write_log(self.callback, train_names, logs, self.learn_counter)
        #trainable_weights = critic_local.model.trainable_weights
        #gradients = critic_local.model.optimizer.get_gradients(critic_local.model.total_loss, trainable_weights)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        #self.learn_counter += 1

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def _save_weight(self, model, directory_name, file_name):
        cwd = os.getcwd()
        directory_path = os.path.join(cwd, directory_name)
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)

        file_path = os.path.join(directory_path, file_name)

        mv_file_to_dir_with_date(file_path, directory_path)

        model.save_weights(file_path)

    def save_weights(self, location='weights_backup'):
        if self.verbose:
            print("start save_weights")

        self._save_weight(self.critic_local.model, location, "critic_local.h5")
        self._save_weight(self.critic_target.model, location,
                          "critic_target.h5")
        self._save_weight(self.actor_local.model, location, "actor_local.h5")
        self._save_weight(self.actor_target.model, location, "actor_target.h5")

        if self.verbose:
            print("done save_weights")

    def _h5(self, model, file_path):
        if os.path.exists(file_path):
            model.load_weights(file_path)
        else:
            print(f'could not find weight to load from [{file_path}]')

    def load_weights(self, location='weights_backup'):
        if self.verbose:
            print("start load_weights")

        cwd = os.getcwd()
        directory_path = os.path.join(cwd, location)

        self._h5(self.critic_local.model,
                 os.path.join(directory_path, "critic_local.h5"))
        self._h5(self.critic_target.model,
                 os.path.join(directory_path, "critic_target.h5"))
        self._h5(self.actor_local.model,
                 os.path.join(directory_path, "actor_local.h5"))
        self._h5(self.actor_target.model,
                 os.path.join(directory_path, "actor_target.h5"))

        if self.verbose:
            print("done load_weights")
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, basename):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # learning rates
        self.actor_learning_rate = 0.0001
        self.critic_learning_rate = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_learning_rate)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_learning_rate)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # keep track of the best run
        self.nEpisode = 0
        self.bestEpisode = []
        self.bestEpisodeAt = -1

        # logging business
        self.state_labels = self.task.get_state_labels()
        self.action_labels = [
            'ac{}'.format(i) for i in range(self.action_size)
        ]
        self.df_columns = [
            't'
        ] + self.state_labels.tolist() + self.action_labels + ['R']
        self.basename = os.path.join('log', basename)
        self.currentEpisode = []
        self.bestCumReward = -np.inf

    def reset_episode(self):
        self.noise.reset()
        self.last_state = self.task.reset()
        self.currentEpisode = []
        return self.last_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights),\
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def step(self, action):
        last_state_variables = self.task.get_state_variables()
        last_t = self.task.sim.get_time()

        # call the model for state transition
        next_state, reward, done = self.task.step(action)

        # logging the current episode
        self.currentEpisode += [
            np.hstack([last_t, last_state_variables, action, reward])
        ]

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        if done:
            # log the episode
            df = pd.DataFrame(self.currentEpisode, columns=self.df_columns)
            fn_i = '{}_{}'.format(self.basename, self.nEpisode)
            df.to_csv(fn_i + '.csv')
            cumR = df.R.sum()
            if len(df) > len(self.bestEpisode) or \
            (len(df) == len(self.bestEpisode) and cumR > self.bestCumReward):
                self.bestCumReward = cumR
                self.bestEpisode = df
                self.bestEpisodeAt = self.nEpisode
                self.plot_episode(df, self.nEpisode, fn_i)
            sys.stdout.write(
                "\rEp#{:4d} dur_{} cumR_{:5.3f} best@{} dur_{} cumR_{:5.3f} ".
                format(self.nEpisode,
                       len(df), cumR, self.bestEpisodeAt,
                       len(self.bestEpisode), self.bestCumReward))
            self.nEpisode += 1
        return next_state, done

    def train(self, num_episodes=1):
        for ep_i in range(num_episodes):
            state, done = self.reset_episode(), False
            while not done:
                action = self.act(state)
                state, done = self.step(action)

    def plot_episode(self, df, episNo, filename=''):
        fig = plt.figure(1)
        fig.clf()
        ax2 = fig.add_subplot(313)
        ax1 = fig.add_subplot(312, sharex=ax2)
        ax0 = fig.add_subplot(311, sharex=ax2)
        # plot selected state variables
        ax0.set_title('Ep#{} dur={:5.2f} sec'.format(episNo, df.t.iloc[-1]))
        df.plot(x='t', y=self.state_labels[:6], ax=ax0, style='.:')
        df.plot(x='t', y=self.state_labels[6:], ax=ax1, style='.:')
        df.plot(x='t', y=self.action_labels, ax=ax2, style='.:')
        df.plot(x='t', y='R', ax=ax2, secondary_y=True)
        plt.ylabel('Reward')
        plt.show()
        if len(filename) > 0:
            fig.savefig(filename)
Example #4
class DDPG():
    """ Reinforcement Learning Agent. """
    def __init__(self, task, exp_mu, exp_theta, exp_sigma, gamma, tau):
        self.task = task

        self.s_size = task.s_size
        self.a_size = task.a_size

        self.a_low = task.a_low
        self.a_high = task.a_high

        # Actor Model
        self.actor_local = Actor(self.s_size, self.a_size, self.a_low,
                                 self.a_high)
        self.actor_target = Actor(self.s_size, self.a_size, self.a_low,
                                  self.a_high)

        # Critic Model
        self.critic_local = Critic(self.s_size, self.a_size)
        self.critic_target = Critic(self.s_size, self.a_size)

        # Initialize target model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # initialize noise
        self.exp_mu = exp_mu
        self.exp_theta = exp_theta
        self.exp_sigma = exp_sigma
        self.noise = OUNoise(self.a_size, self.exp_mu, self.exp_theta,
                             self.exp_sigma)

        # For Replay buffer
        self.buff_size = 1024 * 1024
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buff_size, self.batch_size)

        # discount factor
        self.gamma = gamma

        # for soft update of target parameters
        self.tau = tau

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()

        # last state
        self.l_state = state

        return state

    # A - Action, R - Reward, D - Done
    def step(self, A, R, nState, D):
        # save experience to memory
        self.memory.add(self.l_state, A, R, nState, D)

        # Learn, if enough samples (experiences) are available in memory
        if len(self.memory) > self.batch_size:
            self.learn(self.memory.sample())

        self.l_state = nState

    def act(self, states):
        S = np.reshape(states, [-1, self.s_size])
        A = self.actor_local.model.predict(S)[0]
        return list(A + self.noise.sample())

    def learn(self, exp):
        """Update policy and value parameters using given batch of experience tuples."""

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        S = np.vstack([e.state for e in exp if e is not None])
        A = np.array([e.action for e in exp if e is not None
                      ]).astype(np.float32).reshape(-1, self.a_size)
        R = np.array([e.reward for e in exp
                      if e is not None]).astype(np.float32).reshape(-1, 1)
        D = np.array([e.done for e in exp
                      if e is not None]).astype(np.uint8).reshape(-1, 1)
        nS = np.vstack([e.next_state for e in exp if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        a_next = self.actor_target.model.predict_on_batch(nS)
        t_next = self.critic_target.model.predict_on_batch([nS, a_next])

        # Compute Q targets for current state and train critic model (local)
        Q_targets = R + self.gamma * t_next * (1 - D)
        self.critic_local.model.train_on_batch(x=[S, A], y=Q_targets)

        # Train actor model (local)
        a_grad = np.reshape(self.critic_local.get_action_gradients([S, A, 0]),
                            (-1, self.a_size))
        self.actor_local.train_fn([S, a_grad, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        l_weights = np.array(local_model.get_weights())
        t_weights = np.array(target_model.get_weights())

        assert len(l_weights) == len(
            t_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * l_weights + (1 - self.tau) * t_weights
        target_model.set_weights(new_weights)
class DDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

    def create_models(self,
                      hidden_sizes_actor=(512, 256),
                      hidden_sizes_critic=(512, 256, 256)):
        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 hidden_sizes=hidden_sizes_actor)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  hidden_sizes=hidden_sizes_actor)
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   hidden_sizes=hidden_sizes_critic)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    hidden_sizes=hidden_sizes_critic)
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

    def set_params(self,
                   mu=0.1,
                   sigma=0.1,
                   theta=0.1,
                   buffer_size=1e+8,
                   batch_size=128,
                   gamma=0.99,
                   tau=1e-3):
        self.exploration_mu = mu
        self.exploration_sigma = sigma
        self.exploration_theta = theta
        self.noise = noise(self.action_size, self.exploration_mu,
                           self.exploration_theta, self.exploration_sigma)

        self.buffer_size = int(buffer_size)
        self.batch_size = int(batch_size)
        self.buffer = ReplayBuffer(self.buffer_size)

        self.gamma = gamma
        self.tau = tau

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.calc_noise())

    def learn(self):
        states, actions, rewards, dones, next_states = self.buffer.sample(
            self.batch_size, self.action_size, self.state_size)

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # soft_update
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.buffer.add(self.last_state, action, reward, next_state, done)
        self.learn()
        self.last_state = next_state

    def soft_update(self, local_model, target_model):
        target_model.set_weights(
            self.tau * np.array(local_model.get_weights()) +
            (1 - self.tau) * np.array(target_model.get_weights()))
Example #6
class DDPG:
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.001

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        self.gamma = 0.99
        self.tau = 0.1
        self.learning_rate = 0.0005

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.actor_local = Actor(self.state_size,
                                 self.action_size,
                                 self.action_low,
                                 self.action_high,
                                 learning_rate=self.learning_rate)
        self.actor_target = Actor(self.state_size,
                                  self.action_size,
                                  self.action_low,
                                  self.action_high,
                                  learning_rate=self.learning_rate)

        self.critic_local = Critic(self.state_size,
                                   self.action_size,
                                   learning_rate=self.learning_rate)
        self.critic_target = Critic(self.state_size,
                                    self.action_size,
                                    learning_rate=self.learning_rate)

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        self.last_state = self.task.reset()
        return self.last_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward
        self.count += 1

        if self.memory.size() > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        self.last_state = next_state

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #7
class DDPG_Land():
    def __init__(self, task, seed=None, render=False):

        self.env = task.env
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        self.total_reward = 0
        self.steps = 0
        self.action_repeat = 3
        self.render = render

        # Score tracker and learning parameters
        self.score = -np.inf
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1

        #counter
        self.count = 0

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(1, self.exploration_mu, self.exploration_theta,
                             self.exploration_sigma)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def act(self, s):
        #         # print('act')
        #         # a = lunder.heuristic(self.env, s)
        #         # 1. Testing.
        #         # 2. Demonstration rollout.
        #         angle_targ = s[0]*0.5 + s[2]*1.0         # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed)
        #         if angle_targ >  0.4: angle_targ =  0.4  # more than 0.4 radians (22 degrees) is bad
        #         if angle_targ < -0.4: angle_targ = -0.4
        #         hover_targ = 0.55*np.abs(s[0])           # target y should be proporional to horizontal offset

        #         # PID controller: s[4] angle, s[5] angularSpeed
        #         angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0
        #         #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo))

        #         # PID controller: s[1] vertical coordinate s[3] vertical speed
        #         hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5
        #         #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo))

        #         if s[6] or s[7]: # legs have contact
        #             angle_todo = 0
        #             hover_todo = -(s[3])*0.5  # override to reduce fall speed, that's all we need after contact

        #         if self.env.continuous:
        #             a = np.array( [hover_todo*20 - 1, -angle_todo*20] )
        #             a = np.clip(a, -1, +1)
        #         else:
        #             a = 0
        #             if hover_todo > np.abs(angle_todo) and hover_todo > 0.05: a = 2
        #             elif angle_todo < -0.05: a = 3
        #             elif angle_todo > +0.05: a = 1
        #         # return a
        # state = s
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(s, [-1, 24])
        action = self.actor_local.model.predict(state)[0]

        return list(action + self.noise.sample())

    def step(self, action, reward, next_state, done):
        # print ("step")
        # ob, reward, done, info = self.env.step(action)
        # print(ob)
        # next_state = ob
        # Save experience / reward
        reward = np.clip(reward, a_min=-100, a_max=100)
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.count += 1
        self.total_reward += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        #from the tutorial SRC
        self.score += reward

        if done:
            # self.score = np.clip(self.score,a_min=-100,a_max=100)
            if self.score > self.best_score:
                self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

#         #from the tutorial SRC
#         self.score += reward

#         if done:

#             if self.score > self.best_score:
#                 self.best_score = self.score

#         # return ob, reward, done

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

#         # from policy search

#         # Learn by random policy search, using a reward-based score
#         # self.score = self.total_reward / float(self.count) if self.count else 0.0
#         # if self.score > self.best_score:
#         #     self.best_score = self.score
#         #     self.best_w = self.w
#         #     self.noise_scale = max(0.5 * self.noise_scale, 0.01)
#         # else:
#         #     self.w = self.best_w
#         #     self.noise_scale = min(2.0 * self.noise_scale, 3.2)
#         # self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape)  # equal noise in all directions

    def reset(self):
        self.steps = 0
        self.total_reward = 0
        self.count = 0
        self.score = 0
        # self.best_score = 0
        """Reset the sim to start a new episode."""
        ob = self.env.reset()
        state = np.concatenate([ob] * self.action_repeat)

        self.last_state = state
        return state
Example #8
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # from policy search
        
        self.action_range = self.action_high - self.action_low
        
        self.w = np.random.normal(
            size=(self.state_size, self.action_size),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size))) # start producing actions in a decent range
        
        # Score tracker and learning parameters
        self.score = -np.inf
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1
        
        #counter
        self.count = 0

    def reset_episode(self):
        self.noise.reset()
        self.count = 0
        self.total_reward = 0.0
        self.score = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        
        self.count += 1
        self.total_reward += reward
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        
        #from the tutorial SRC
        self.score += reward
        
        if done:
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)  
        
        # from policy search
        
        # Learn by random policy search, using a reward-based score
        # self.score = self.total_reward / float(self.count) if self.count else 0.0
        # if self.score > self.best_score:
        #     self.best_score = self.score
        #     self.best_w = self.w
        #     self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        # else:
        #     self.w = self.best_w
        #     self.noise_scale = min(2.0 * self.noise_scale, 3.2)
        # self.w = self.w + self.noise_scale * np.random.normal(size=self.w.shape)  # equal noise in all directions

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #9
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.gamma = 0.99
        self.tau = 0.001 

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""

        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])


        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
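
Most of the DDPG agents in Examples #1 through #9 expect the same outer driver: reset the episode, ask the actor for a noisy action, step the task, and hand the transition back via agent.step(). A minimal sketch of that loop is given below; the task interface (reset() returning a state, step(action) returning next_state, reward, done) is taken from Example #3 and is assumed to hold for the other tasks, and the run_training name is hypothetical.

# Hypothetical outer training loop for the agents above (not part of the
# original examples). `task` is assumed to expose reset() -> state and
# step(action) -> (next_state, reward, done), as in Example #3.
# Note: Example #1's reset_episode additionally takes the task as an argument.
def run_training(agent, task, num_episodes=1000):
    for episode in range(1, num_episodes + 1):
        state = agent.reset_episode()          # reset task and noise process
        episode_reward, done = 0.0, False
        while not done:
            action = agent.act(state)          # noisy action from the local actor
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)  # store transition and learn
            state = next_state
            episode_reward += reward
        print("\rEpisode {:4d}  reward {:8.3f}".format(episode, episode_reward), end="")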
Example #10
class TD3():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, lra, lrc, db):
        self.task = task
        self.s_sz = task.state_size
        self.a_sz = task.action_size
        self.a_max = task.max_action

        # Actor (Policy) Model
        self.actor_local = Actor(self.s_sz, self.a_sz, lra)
        self.actor_target = Actor(self.s_sz, self.a_sz, lra)

        # First Critic (Value) Model
        self.critic_local_1 = Critic(self.s_sz, self.a_sz, lrc)
        self.critic_target_1 = Critic(self.s_sz, self.a_sz, lrc)

        # Second Critic (Value) Model
        self.critic_local_2 = Critic(self.s_sz, self.a_sz, lrc)
        self.critic_target_2 = Critic(self.s_sz, self.a_sz, lrc)

        # Initialize target model parameters with local model parameters
        self.critic_target_1.model.set_weights(
            self.critic_local_1.model.get_weights())
        self.critic_target_2.model.set_weights(
            self.critic_local_2.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = GaussianNoise(self.a_sz)

        # Replay memory
        self.num_exp = 0
        self.batch = 32
        self.buffer = 10000
        labels = ["state", "action", "reward", "next_state", "done"]
        self.experience = namedtuple("Experience", field_names=labels)
        self.memory = PrioritizedReplayBuffer(self.buffer, self.batch, db)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.005  # for soft update of target parameters

    def reset_episode(self):
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done, PER_init=False):
        # Save experience / reward
        exp = self.experience(self.last_state, action, reward, next_state,
                              done)
        self.memory.add(exp)
        self.num_exp += 1

        # Roll over last state and action
        self.last_state = next_state

        # Learn, if enough samples are available in memory
        if PER_init:
            p_idx, weights, experiences = self.memory.sample()
            mean_abs_error, loss = self.learn(experiences, weights, p_idx)
            return mean_abs_error, loss

    def act(self, state, training=True):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.s_sz])
        action = self.actor_local.model.predict(state)
        if training:
            noise = self.noise.sample(0.1)
            return list((action + noise)[0])  # add some noise for exploration
        else:
            action = self.actor_target.model.predict(state)
            return list(action[0])

    def learn(self, exp, weights, p_idx):
        states = np.vstack([e.state for e in exp])
        actions = np.array([e.action for e in exp
                            ]).astype(np.float32).reshape(-1, self.a_sz)
        rewards = np.array([e.reward
                            for e in exp]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in exp]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in exp])
        weights = np.ndarray.flatten(
            np.array([w for w in weights]).astype(np.float32))

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        target_noise = self.noise.sample(0.2, self.batch, True)
        actions_next = np.clip(actions_next + target_noise, -self.a_max,
                               self.a_max)
        Q_targets_1 = self.critic_target_1.model.predict_on_batch(
            [next_states, actions_next]).reshape(-1, 1)
        Q_targets_2 = self.critic_target_2.model.predict_on_batch(
            [next_states, actions_next]).reshape(-1, 1)
        Q_targets_next = np.minimum(Q_targets_1, Q_targets_2)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        # Compute Q targets for current states and train critic model (local)
        Q_local_1 = self.critic_local_1.model.predict_on_batch(
            [states, actions])
        Q_local_2 = self.critic_local_2.model.predict_on_batch(
            [states, actions])
        loss_1 = self.critic_local_1.model.train_on_batch([states, actions],
                                                          Q_targets, weights)
        loss_2 = self.critic_local_2.model.train_on_batch([states, actions],
                                                          Q_targets, weights)
        Q_error_1 = np.absolute(Q_targets - Q_local_1)
        Q_error_2 = np.absolute(Q_targets - Q_local_2)
        Q_error = np.mean([Q_error_1, Q_error_2], axis=0)
        self.memory.update_weights(p_idx, Q_error)

        # Train actor model (local)
        actor_actions = self.actor_local.model.predict_on_batch(states)
        action_grads = self.critic_local_1.get_gradients(
            [states, actor_actions, 0])
        action_grads = np.reshape(action_grads, (-1, self.a_sz))
        self.actor_local.train_fn([states, action_grads,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local_1.model, self.critic_target_1.model)
        self.soft_update(self.critic_local_2.model, self.critic_target_2.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
        return np.mean(Q_error), np.mean([loss_1, loss_2])

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        message = "Local and target model parameters must have the same size"
        assert len(local_weights) == len(target_weights), message

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #11
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor
        self.tau = 0.002  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
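
None of these snippets include the surrounding training loop, so here is a minimal sketch of how an agent with this reset_episode/act/step interface, such as the DDPG class in Example #11 above, might be driven. The Task environment, its step() return signature, and the episode count are illustrative assumptions, not part of the original code.

# Minimal training-loop sketch for the DDPG agent above.
# Assumptions: a hypothetical Task environment exposing state_size, action_size,
# action_low, action_high, reset(), and step(action) -> (next_state, reward, done).
num_episodes = 500                                    # illustrative value

task = Task()                                         # hypothetical environment
agent = DDPG(task)

for episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    episode_reward, done = 0.0, False
    while not done:
        action = agent.act(state)                     # policy action plus exploration noise
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)  # store experience and (maybe) learn
        state = next_state
        episode_reward += reward
    print("episode {:4d}  reward {:8.2f}".format(episode, episode_reward))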
Example #12
class DDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # Exploration noise
        self.exploration_mu = 0.1
        self.exploration_sigma = 0.1
        self.exploration_theta = 0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Experience
        self.buffer_size = 100000000
        self.batch_size = 64
        self.buffer = ReplayBuffer(self.buffer_size)

        # Parameters
        self.gamma = 0.99
        self.tau = 0.001

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self):
        # Sample
        states, actions, rewards, dones, next_states = self.buffer.sample(
            self.batch_size, self.action_size, self.state_size)

        # Predict
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        # Train Critic
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train Actor
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Update weights
        self.update_target_weights(self.critic_local.model,
                                   self.critic_target.model)
        self.update_target_weights(self.actor_local.model,
                                   self.actor_target.model)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.buffer.add(self.last_state, action, reward, next_state, done)
        self.learn()
        self.last_state = next_state

    def update_target_weights(self, local_model, target_model):
        target_model.set_weights(
            self.tau * np.array(local_model.get_weights()) +
            (1 - self.tau) * np.array(target_model.get_weights()))
Example #13
class Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        
        self.exploration_mu = 0
        self.exploration_theta = 0.10 # mean-reversion rate of the OU noise
        self.exploration_sigma = 0.001 # scale of the random noise
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.gamma = 0.90  # discount factor
        self.tau = 0.1  # for soft update of target parameters

        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state
        
        self.score += reward
        if done:
            if self.score > self.best_score:
                self.best_score = self.score
                
    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
        
    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #14
class DDPG():
    '''reinforcement learning agent that learns using Deep Deterministic Policy Gradient'''
    def __init__(self, task):
        '''
        Params
        ======
        task (object)   : environment

        '''
        '''
        Reference: Continuous Control With Deep Reinforcement Learning(2016)
        Playing CartPole through Asynchronous Advantage Actor Critic (A3C) with tf.keras
        =========
        gamma   : 0.99
        tau     : 0.001
        buffer_size (ReplayBuffer)  : 1e6
        batch_size (ReplayBuffer)   : 64
        theta (Ornstein-Uhlenbeck process)  : 0.15
        sigma (Ornstein-Uhlenbeck process)  : 0.2


        '''

        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # actor (policy) model - use two copies of model for updating model and producing target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # critic (value) model - use two copies of model for updating model and producing target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        # noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # reward history
        self.best_avg_score = -np.inf
        self.accumulated_reward = 0
        self.count = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.accumulated_reward = 0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        # save experience and reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # roll over last state and action
        self.last_state = next_state

        # accumulate reward
        self.accumulated_reward += reward
        self.count += 1

        # record best average score
        if done:
            if float(self.accumulated_reward /
                     self.count) > self.best_avg_score:
                self.best_avg_score = float(self.accumulated_reward /
                                            self.count)

    def act(self, state):
        '''returns actions for given state(s) as per current policy'''
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration
        # both action and self.noise.sample() are NumPy arrays, so + performs
        # element-wise addition rather than list concatenation

    def learn(self, experiences):
        '''update policy and value parameters using given batch of experience tuples'''
        # convert experience tuples to separate arrays for each element(states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).\
                           astype(np.float32).reshape(-1,self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).\
                           astype(np.float32).reshape(-1,1)
        dones = np.array([e.done for e in experiences if e is not None]).\
                           astype(np.uint8).reshape(-1,1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # get predicted next-state actions and Q-values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # compute Q targets for current states and train critic model (local)
        # Value loss: L = Σ (R_{t+1} + γ·Q_target(s_{t+1}, a_{t+1}) - Q(s_t, a_t))²
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # train actor model (local)
        # Policy loss: L = -(1/N) * Σ Q(s, μ(s)), i.e. ascend the critic's value of the actor's actions
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function
        # The learning phase flag is a bool tensor (0 = test, 1 = train)
        # to be passed as input to any Keras function
        # that uses a different behavior at train time and test time.

        # soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        '''soft update model parameters'''
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights),\
            'Local and target model parameters must have the same size'

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #15
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low
        self.score = 0

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.action_low, self.action_high)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.action_low, self.action_high)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000  # Taken from paper - changed from 10000 originally
        self.batch_size = 64  # Taken from paper - changed from 64 originally
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.005  # Tau for soft update of target parameters. Taken from paper - changed from 0.01 originally

        # Reset the episode when model set up
        self.reset_episode()

    def reset_episode(self):
        self.total_reward = 0.0
        self.score = 0.0
        self.count = 0
        self.best_score = 0.0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def normalise_actions(self, actions):
        # Added square root to action to keep the actions closer to the middle of the rotor speed range
        normalised_actions = (np.sign(actions) * np.sqrt(np.abs(actions)) *
                              self.action_range /
                              2) + self.action_low + self.action_range / 2
        return normalised_actions

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > 10000:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        self.count += 1
        self.total_reward += reward

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        action = self.normalise_actions(action)
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def act_eval(self, state):
        """Returns actions without exploration for final evaluation."""
        state = np.reshape(state, [-1, self.state_size])
        # Scale [0, 1] output for each action dimension to proper range
        action = self.actor_local.model.predict(state)[0]
        action = self.normalise_actions(action)
        # No exploration noise: evaluate the final performance of the quadcopter
        return list(action)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        actions_next = self.normalise_actions(actions_next)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # Update the score metric so it can be tracked through training
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
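
To make the sign-square-root scaling in normalise_actions (Example #15 above) concrete, the short standalone check below applies the same transform with hypothetical rotor-speed bounds; the bounds and the sample outputs are illustrative assumptions only.

import numpy as np

action_low, action_high = 0.0, 900.0    # hypothetical rotor-speed range
action_range = action_high - action_low

def normalise_actions(actions):
    # same transform as in the class above: sign-preserving square root,
    # then rescaling into [action_low, action_high]
    return (np.sign(actions) * np.sqrt(np.abs(actions)) * action_range / 2
            + action_low + action_range / 2)

raw = np.array([-1.0, -0.25, 0.0, 0.25, 1.0])   # raw actor outputs in [-1, 1]
print(normalise_actions(raw))                   # [  0. 225. 450. 675. 900.]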
Example #16
class DDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        # Create the actor instances for local and target
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        # Create the critic instances for local and target
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        # Noise hyperparameters
        self.exploration_mu = 0
        self.exploration_theta = 0.35
        self.exploration_sigma = 0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)
        # Set the replay memory
        self.buffer_size = 100000
        self.batch_size = 32
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        # Update function hyperparameters
        self.gamma = 0.99
        self.tau = 0.001

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state,
                        done)  # Save current experience
        # Train once the memory holds enough samples for a batch
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        # Move up one state
        self.last_state = next_state

    def act(self, state):
        # NOTE: the agent acts according to the current policy, with exploration noise added
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        # Compute Q_targets
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)
        # Train the local actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        # Soft-update of target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #17
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    # name: base filename used when saving the neural network models
    # loadfile: load weights from existing model files or create an entirely new model
    def __init__(self, task, name, loadfile=False):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.name = name
        if loadfile:
            self.actor_local.model.load_weights("./weights/" + name +
                                                "_actor.h5")
            self.critic_local.model.load_weights("./weights/" + name +
                                                 "_critic.h5")

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  #0.3 #original 0.15
        self.exploration_sigma = 0.3  #0.3 #original 0.3

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)

        #rewards = np.interp(rewards, (rewards.min(), rewards.max()), (-1, +1)) #TESTING to scale rewards to a small number.

        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def save_weights(self):
        self.actor_local.model.save_weights("./weights/" + self.name +
                                            "_actor.h5")
        self.critic_local.model.save_weights("./weights/" + self.name +
                                             "_critic.h5")

    #Notice that after training over a batch of experiences, we could just copy our newly learned weights (from the local model) to the target model.
    #However, individual batches can introduce a lot of variance into the process, so it's better to perform a soft update, controlled by the parameter tau.

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
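
A brief usage sketch for the save/load hooks in Example #17 above: train, call save_weights(), then rebuild the agent with loadfile=True to resume from the saved actor/critic weights. The Task object, the episode loop and the creation of the ./weights/ directory are assumptions for illustration, not part of the original code.

import os

os.makedirs("./weights", exist_ok=True)       # the class writes to ./weights/<name>_*.h5

task = Task()                                 # hypothetical environment
agent = DDPG(task, name="quadcopter")         # fresh agent with random initial weights

for episode in range(100):                    # illustrative training loop
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state

agent.save_weights()                          # ./weights/quadcopter_actor.h5 and _critic.h5

# Later: rebuild the agent and continue from the saved weights
resumed_agent = DDPG(task, name="quadcopter", loadfile=True)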
Example #18
class DDPG_Agent:
    """Reinforcement learning agent that learns through DDPG."""
    def __init__(self, task):
        """Initialize DDPG Agent instance."""
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Initializing local and target Actor Models
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_high, self.action_low)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_high, self.action_low)

        # Initializing local and target Critic Models
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Additional Parameters
        self.best_score = -np.inf
        self.total_reward = 0.0
        self.count = 0
        self.score = 0

    def reset_episode(self):
        """Reset episode to initial state."""
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Take a step."""
        self.total_reward += reward
        self.count += 1
        # Save experience/reward
        self.memory.memorize(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are available in memory.
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

    def act(self, state):
        """Returns actions for state(s) according to given policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]

        # Add some noise to action for exploration and return
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience
        tuples."""

        self.score = self.total_reward / \
            float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.vstack([e.action for e in experiences
                             if e is not None]).astype(np.float32).reshape(
                                 -1, self.action_size)
        rewards = np.vstack([e.reward for e in experiences if e is not None
                             ]).astype(np.float32).reshape(-1, 1)
        dones = np.vstack([e.done for e in experiences
                           if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, next_actions])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        # [states, actions, 0]: the trailing 0 sets the Keras learning phase to test (inference) mode
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights), "Local and target model parameters must \
            have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights

        target_model.set_weights(new_weights)
Example #19
class DDPG():
    """Reinforcement learning agent that learns using DDPG."""
    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high


        # Set the learning rate suggested by paper:  https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf
        self.actor_learning_rate = 0.001
        self.actor_decay = 0.0
        self.critic_learning_rate = 0.001
        self.critic_decay = 0.0

        # Actor Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_learning_rate, self.actor_decay)

        # Critic Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)

        # initialize targets model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # self.exploration_theta = 0.15
        # self.exploration_sigma = 0.2
        self.exploration_theta = 0.01
        self.exploration_sigma = 0.02
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta,
                   self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000

        self.batch_size = 64

        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_w = None
        self.best_score = -np.inf
        # self.noise_scale = 0.7
        self.score = 0

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01 # for soft update of target parameters

        # Indicate if we want to learn (or use to predict without learn)
        self.set_train(train)

    def reset_episode(self):
        self.total_reward = 0.0
        self.score = 0
        self.step_count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        self.total_reward += reward
        self.step_count += 1
        # Save experience /reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.score = self.total_reward / float(self.step_count) if self.step_count else 0.0
        # Update the noise factor depending on the new score value
        if  self.score >= self.best_score:
            self.best_score = self.score
       
        # Learn, if enough samples are available in memory
        if self.train and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, done)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s)  as per current policy"""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample()) # add more noise for exploration

    def learn(self, experiences, done):
        """Update policy and value parameters using give batch experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)

        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_state = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_action = self.actor_target.model.predict_on_batch(next_state)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_state, next_action])

        # Compute Q targets for current states and train critic model(local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]),
                            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target method

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())


        assert len(local_weights) == len(target_weights), "Local and target model parameters mush have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def set_train(self, train):
        self.train = train
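
Example #19 above adds a train flag so the same agent can be reused for evaluation without further learning; the sketch below shows that pattern under the same assumed Task interface as the earlier sketches.

agent = DDPG(task, train=True)
# ... run training episodes as in the loop sketched after Example #11 ...

agent.set_train(False)            # freeze learning: step() no longer samples or trains
state = agent.reset_episode()
done, total_reward = False, 0.0
while not done:
    action = agent.act(state)     # note: act() still adds exploration noise
    state, reward, done = task.step(action)
    total_reward += reward
print("evaluation reward:", total_reward)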
Example #20
class Christophers_Agent():
    def __init__(self, task):
        # Task (environment) information
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        self.w = np.random.normal(
            size=(
                self.state_size, self.action_size
            ),  # weights for simple linear policy: state_space x action_space
            scale=(self.action_range / (2 * self.state_size)
                   ))  # start producing actions in a decent range

        self.actor = Actor(self.state_size, self.action_size, self.action_low,
                           self.action_high)
        self.critic = Critic(self.state_size, self.action_size)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.gamma = 0.95
        self.tau = 0.001

        self.best_w = None
        self.best_score = -np.inf

        self.exploration_mu = 0.5
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.4
        self.noise = Noise(self.action_size, self.exploration_mu,
                           self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 32
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_score = -np.inf
        self.num_steps = 0

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        if self.get_score() > self.best_score:
            self.best_score = self.get_score()
        self.total_reward = 0.0
        self.num_steps = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.num_steps += 1

        self.memory.add(self.last_state, action, reward, next_state, done)

        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor.model.predict(state)[0]
        action = list(action +
                      self.noise.sample())  # add some noise for exploration
        return action

    def get_score(self):
        return -np.inf if self.num_steps == 0 else self.total_reward / self.num_steps

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        done = np.array([e.done for e in experiences
                         if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - done)

        self.critic.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic.model, self.critic_target.model)
        self.soft_update(self.actor.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
Example #21
class Agent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15 # mean-reversion rate of the OU noise
        self.exploration_sigma = 0.001 # scale of the random noise
        
        #self.exploration_mu = 0
        #self.exploration_theta = 0.15
        #self.exploration_sigma = 0.2
        
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1  # for soft update of target parameters

        
        # Compute the ongoing top score
        self.top_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        # stats
        self.score += reward
        if done:
            if self.score > self.top_score:
                self.top_score = self.score

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
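
These agents also rely on a ReplayBuffer that supports add(), sample(), and len(); a minimal sketch consistent with that interface, and with the state/action/reward/next_state/done fields read in learn(), could be:

import random
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch matching the add/sample/len usage above)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # drop the oldest experiences once full
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store a new experience tuple."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random batch of experiences."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)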
Example #22
class RLA():
    """Reinforcement learning agent that learns using DDPG."""
    
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        #actor model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        
        #Critic model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        
        #Initialize target model params with local params
        self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())
        
        #Initialize noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
        
        #Replay memory Initialization
        self.buffer_size, self.batch_size = 2000000, 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        
        #Initialize algorithm parameters
        self.gamma, self.tau = 0.95, 0.001
        
        #Initialize scores
        self.score, self.best_score = -np.inf, -np.inf
    
    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.score = 0
        return state
    
    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        
        #Learn from a sampled batch once memory holds more samples than the batch size
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        
        #Preserve state as last_state
        self.last_state = next_state
        
        #Update score with reward from this step
        self.score += reward
        
        if done:
            #Preserve best score
            if self.score > self.best_score:
                self.best_score = self.score
        
    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())
    
    def learn(self, experiences):
        #Convert experience tuples to separate arrays for each element
        states = np.vstack([exp.state for exp in experiences if exp is not None])
        actions = np.array([exp.action for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([exp.reward for exp in experiences if exp is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([exp.done for exp in experiences if exp is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([exp.next_state for exp in experiences if exp is not None])
        
        #Get predicted next-state actions and Q values from the target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        
        #Compute Q targets for current states and train the local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)
        
        #Train local actor model
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])
        
        #Update target models
        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)
        
    def update(self, local_model, target_model):
        """Soft-update target model parameters toward the local model."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
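
For context, a driver loop for any of these agents follows the same reset_episode()/act()/step() cycle. A rough sketch, assuming the task exposes a step(action) method returning (next_state, reward, done) and that num_episodes is chosen by the caller:

# Hypothetical training loop; `task` and `num_episodes` are placeholders, not part of the examples above.
agent = RLA(task)
for episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)  # assumed task interface
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d}  score: {:8.3f}  best: {:8.3f}".format(
        episode, agent.score, agent.best_score))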