Example #1
class agent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters (was 0.01)

    def reset_episode(self):

        self.total_reward = 0.0
        self.count = 0

        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):

        self.total_reward += reward
        self.count += 1
        self.score = self.total_reward  # / float(self.count) if self.count else 0.0

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
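
Every example on this page constructs an OUNoise helper for exploration noise but never shows it. The class below is a minimal sketch of an Ornstein-Uhlenbeck process matching the (size, mu, theta, sigma) constructor and the reset()/sample() interface used above; it is an illustration, not the exact class from any of these repositories (Example #3's variant evidently names the first argument action_size and supplies its own defaults).

import copy
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta  # how strongly the process is pulled back to the mean
        self.sigma = sigma  # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
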
Example #2
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, state_size, action_size, train=True):

        self.train = train
        self.action_size = action_size
        self.state_size = state_size

        actor_lr = 0.001  # Learning rate for Actor (was 0.0001)
        critic_lr = 0.01  # Learning rate for Critic (was 0.001)

        deep_lr = 1e-3

        # Noise process
        self.exploration_mu = 0  # Mean
        self.exploration_theta = 0.6  # How fast the variable reverts to the mean (was 0.15)
        self.exploration_sigma = 0.3  # Degree of volatility (was 0.2)
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        if (self.train):
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size,
                                     actor_lr)
            self.actor_target = Actor(self.state_size, self.action_size,
                                      actor_lr)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size,
                                       critic_lr)
            self.critic_target = Critic(self.state_size, self.action_size,
                                        critic_lr)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())

            # Replay memory
            self.buffer_size = 300  # was 1024
            self.batch_size = 32

            # internal memory (deque)
            self.memory = deque(maxlen=self.buffer_size)
            #self.memory = []
            self.experience = namedtuple("Data",
                                         field_names=[
                                             "state", "action", "reward",
                                             "next_state", "done"
                                         ])

            # Algorithm parameters
            self.gamma = 0.99  # discount factor
            self.tau = 0.01  # for soft update of target parameters 0.001

            self.guide = False

            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            #print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            print(self.actor_local.model.summary())
            print(self.critic_local.model.summary())

            self.batch_id = 0
            self.critic_loss = 0
            self.actor_loss = 0

            self.C_loss = []
            self.A_loss = []

    def save_model(self, num):
        # Load the best checkpoint (e.g. weights-improvement--0.03.hdf5) and save the full model
        load_str = "weights-improvement--0.{}.hdf5".format(num)
        self.deep_NN.model.load_weights(load_str)
        self.deep_NN.model.save("./model/model.h5")
        print("Saved model with best weights to disk")

    def load_model(self, name):
        # Load the weights
        self.deep_NN.model.load_weights(name)

    def summarize_prediction(self, Y_true, Y_pred):
        mse = mean_squared_error(Y_true, Y_pred)
        r_squared = r2_score(Y_true, Y_pred)
        print("mse       = {0:.2f}".format(mse))
        print("r_squared = {0:.2f}%".format(r_squared))

    def predict_and_summarize(self, X, Y):
        model = load_model("./model/model.h5")
        Y_pred = model.predict(X).astype('int')
        self.summarize_prediction(Y, Y_pred)
        return Y_pred

    def predict(self, state):
        """Returns actions for given state(s) as per current policy."""
        #state = np.reshape(state, [-1, self.state_size])
        #action = self.trained.model.predict(state)[0]
        noise = self.noise.sample()
        action = self.actor_target.model.predict(state)
        return action, noise

    def get_sample(self, b_size=None):
        if (b_size is None):
            b_size = self.batch_size
        return rn.sample(self.memory, k=b_size)

    def conv_to_tensor(self, img):
        # Grayscale image, e.g. (244, 244) -> (1, 244, 244, 1)
        if (len(img.shape) == 2):
            img = np.expand_dims(img, axis=2)
            img = np.expand_dims(img, axis=0)
        # RGB Image or stacked image: 1, 244, 244, 3
        elif (len(img.shape) == 3):
            img = np.expand_dims(img, axis=0)
        return img

    def reset(self):
        self.critic_loss = 0
        self.memory.clear()

    def step(self, state, action, reward, next_state, done):
        d = self.experience(state, action, reward, next_state, done)
        if (len(self.memory) == self.buffer_size):
            self.memory.popleft()
        self.memory.append(d)

    def learn(self, verbose=False):  #experiences
        """Update policy and value parameters using given batch of experience tuples."""
        if (len(self.memory) < self.batch_size):
            return

        experiences = self.get_sample()

        if (verbose):
            print("Buffer Size: ", len(self.memory))
            print("Sample Size: ", len(experiences))

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([
            self.conv_to_tensor(e.state) for e in experiences if e is not None
        ])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([
            self.conv_to_tensor(e.next_state) for e in experiences
            if e is not None
        ])

        if (0):
            print("States", states.shape)
            print("Actions", actions.shape)
            print("Rewards", rewards.shape)
            print("Next States", next_states.shape)
            print("Dones", dones.shape)

        # keep training actor local and critic local
        # use values from target model to update and train local
        # don't train target models, we soft update target

        actions_next = self.actor_target.model.predict_on_batch(next_states)

        #print("Actions next", actions_next.shape)

        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        self.critic_loss = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_loss = self.actor_local.train_fn(
            [states, action_gradients, 1])  # custom training function

        self.A_loss.append(self.actor_loss)
        self.C_loss.append(self.critic_loss)
        #self.batch_id += 1

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
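
Example #2 manages its replay memory directly with a deque and a namedtuple; the other examples delegate the same bookkeeping to a ReplayBuffer class that is not shown here. The sketch below is one plausible implementation, assuming only the ReplayBuffer(buffer_size, batch_size) constructor and the add()/sample()/len() usage seen in these snippets.

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples random batches."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are evicted automatically
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store one transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a random batch of experiences (without replacement)."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        """Current number of stored experiences."""
        return len(self.memory)
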
Example #3
class DDPGAgent():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, state_size, action_size, action_low, action_high):
        # self.task = task
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high

        # learning rates
        self.lr_actor = 1e-4
        self.lr_critic = 1e-3

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.lr_actor)
        self.actor_target = Actor(self.state_size, self.action_size, self.lr_actor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.lr_critic)
        self.critic_target = Critic(self.state_size, self.action_size, self.lr_critic)

        # store model architecture of actor and critic locally
        # keras.utils.plot_model(self.actor_local.model, '/home/danie/catkin_ws/src/ddpg/src/actor.png', show_shapes=True)        
        # keras.utils.plot_model(self.critic_local.model, '/home/danie/catkin_ws/src/ddpg/src/critic.png', show_shapes=True)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Initialize OU noise
        self.noise = OUNoise(action_size=self.action_size)

        # Currently testing with Gaussian noise instead of OU. Parameters for Gaussian follow
        self.noise_mean = 0.0
        self.noise_stddev = 0.2

        # Initialize replay buffer
        self.buffer_size = int(1e6)
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for DDPG
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()

    def choose_action(self, state):
        """Returns actions for given state(s) as per current policy."""
        pure_action = self.actor_local.model.predict(state)[0]
        # add gaussian noise for exploration
        # noise = np.random.normal(self.noise_mean, self.noise_stddev, self.action_size)
        
        # add OU noise for exploration
        noise = self.noise.sample()

        # action = np.clip(pure_action + noise, self.action_low, self.action_high)
        # print("pure", pure_action)
        # print("noise", noise)
        # action = self.action_high * (pure_action + noise)
        # action = pure_action + noise
        action = np.clip(pure_action + noise, self.action_low, self.action_high)
        # print("action", action)
        return action.tolist()

    def store_transition(self, state, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def train_actor_and_critic(self):
        """
        Update policy and value parameters using given batch of experience
        tuples.
        """

        # if not enough transitions in memory, don't train!
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample() # sample a batch from memory

        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in transitions if e is not None])
        actions = np.array([
            e.action for e in transitions if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([
            e.reward for e in transitions if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([
            e.done for e in transitions if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in transitions if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states) #mu_marked in algo
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) #Q' in algo

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) #y_i in algo
        critic_loss = self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        # print("action_gradients",action_gradients)
        # custom training function
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        # self.soft_update(self.critic_local.model, self.critic_target.model, self.tau)
        # self.soft_update(self.actor_local.model, self.actor_target.model, self.tau)
        self.soft_update_critic()
        self.soft_update_actor()

        return critic_loss

    def soft_update_actor(self):
        """Soft update model parameters."""
        local_weights = np.array(self.actor_local.model.get_weights())
        target_weights = np.array(self.actor_target.model.get_weights())

        assert len(local_weights) == len(
            target_weights), ('Local and target model parameters must have '
                              'the same size')

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        self.actor_target.model.set_weights(new_weights)

    def soft_update_critic(self):
        """Soft update model parameters."""
        local_weights = np.array(self.critic_local.model.get_weights())
        target_weights = np.array(self.critic_target.model.get_weights())

        assert len(local_weights) == len(
            target_weights), ('Local and target model parameters must have '
                              'the same size')

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        self.critic_target.model.set_weights(new_weights)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights), ('Local and target model parameters must have '
                              'the same size')

        new_weights = tau * local_weights + (1 - tau) * target_weights
        target_model.set_weights(new_weights)
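
All six examples implement the same soft-update rule, theta_target <- tau * theta_local + (1 - tau) * theta_target, by wrapping get_weights() in np.array and relying on NumPy broadcasting over an object array of per-layer tensors. Newer NumPy releases reject that implicit ragged np.array(...) construction, so an equivalent per-layer formulation is safer; the helper below is a sketch, not code from any of these repositories.

def soft_update(local_model, target_model, tau):
    """Blend each weight tensor: theta_target <- tau * theta_local + (1 - tau) * theta_target."""
    new_weights = [tau * local_w + (1.0 - tau) * target_w
                   for local_w, target_w in zip(local_model.get_weights(),
                                                target_model.get_weights())]
    target_model.set_weights(new_weights)
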

        
Example #4
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_lr = 1e-5  # was 0.0001
        self.critic_lr = 1e-4  # was 0.0000001

        self.network = [128, 256, 128]

        self.train = train
        network = self.network
        actor_lr = self.actor_lr
        critic_lr = self.critic_lr

        if (self.train):
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size,
                                     self.action_low, self.action_high,
                                     actor_lr, network)
            self.actor_target = Actor(self.state_size, self.action_size,
                                      self.action_low, self.action_high,
                                      actor_lr, network)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size,
                                       critic_lr, network)
            self.critic_target = Critic(self.state_size, self.action_size,
                                        critic_lr, network)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(
                self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(
                self.actor_local.model.get_weights())

            # Noise process
            self.exploration_mu = 0  # Mean
            self.exploration_theta = 0.15  # How fast the variable reverts to the mean
            self.exploration_sigma = 0.2  # Degree of volatility
            self.noise = OUNoise(self.action_size, self.exploration_mu,
                                 self.exploration_theta,
                                 self.exploration_sigma)

            # Replay memory
            self.buffer_size = 5000
            self.batch_size = 16
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
            self.targets = ReplayBuffer(self.buffer_size, self.batch_size)

            # Algorithm parameters
            self.gamma = 0.99  # discount factor
            self.tau = 0.01  # for soft update of target parameters

            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            print(self.actor_local.model.summary())
            print(self.critic_local.model.summary())

            # https://stackoverflow.com/questions/44861149/keras-use-tensorboard-with-train-on-batch?rq=1
            # Create the TensorBoard callback,
            # which we will drive manually
            self.tensorboard = keras.callbacks.TensorBoard(
                log_dir='logdir',
                histogram_freq=0,
                batch_size=self.batch_size,
                write_graph=True,
                write_grads=True)

            self.tensorboard.set_model(self.critic_local.model)
            self.summary_writer = tf.summary.FileWriter("scores")

            self.batch_id = 0

    def reset_episode(self):
        if (self.train):
            self.noise.reset()
            self.noise_arr = []
            self.noise_matrix = [0., 0., 0., 0.]

        state = self.task.reset()
        self.last_state = state
        return state

    def save_initial_weights(self):
        self.actor_local.model.save_weights('actor_local.h5')
        self.actor_target.model.save_weights('actor_target.h5')
        self.critic_local.model.save_weights('critic_local.h5')
        self.critic_target.model.save_weights('critic_target.h5')

    def load_initial_weights(self):
        self.actor_local.model.load_weights('actor_local.h5')
        self.actor_target.model.load_weights('actor_target.h5')
        self.critic_local.model.load_weights('critic_local.h5')
        self.critic_target.model.load_weights('critic_target.h5')

    def save_model(self):
        # Save the weights
        self.actor_local.model.save_weights('model_weights.h5')

    def load_weights(self, option=None):
        self.trained = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high,
                             self.actor_lr, self.network)
        if option is None:
            self.trained.model.load_weights('model_weights.h5')
        else:
            self.trained.model.load_weights('weights-best.hdf5')
            print(self.trained.model.summary())

    def predict(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.trained.model.predict(state)[0]
        return action

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size * 2):
            experiences = self.memory.sample()
            self.learn(experiences)

        if (len(self.memory) == self.buffer_size):
            self.memory.memory.clear()
            print("buffer cleared")

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        noise = self.noise.sample()
        action = list(self.actor_local.model.predict(state)[0] + noise)

        return action, noise  # add some noise for exploration

    def learn(self, experiences):  #experiences
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        '''
        print("States", states.shape)
        print("actions", actions.shape)
        print("rewards", rewards.shape)
        print("dones", dones.shape)
        print("Next states", next_states.shape)
        '''
        # keep training actor local and critic local
        # use values from target model to update and train local
        # don't train target models, we soft update target

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))

        actions_next = self.actor_target.model.predict_on_batch(
            next_states)  #target

        # Q values predicted by the target critic for the next states
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])  #target

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        critic_loss = self.critic_local.model.train_on_batch(
            x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn([states, action_gradients,
                                                1])  # custom training function

        self.tensorboard.on_epoch_end(
            self.batch_id, named_logs(self.critic_local.model, [critic_loss]))
        self.batch_id += 1

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
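
Example #4 logs the per-batch critic loss to TensorBoard through a named_logs helper that is not shown. Following the Stack Overflow pattern linked in its constructor, it is presumably something close to the sketch below, which pairs the model's metric names with the scalar values returned by train_on_batch.

def named_logs(model, logs):
    """Map a model's metric names (e.g. 'loss') to the values returned by train_on_batch."""
    return {name: value for name, value in zip(model.metrics_names, logs)}

# Usage, mirroring the learn() method above:
#   tensorboard.on_epoch_end(batch_id, named_logs(critic_local.model, [critic_loss]))
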
Example #5
class DDPG_cartpole():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, state_size, action_size):
        self.epsilon = 0.8
        self.state_size = state_size
        self.action_size = action_size

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        #         self.exploration_mu = 0
        #         self.exploration_theta = 0.15
        #         self.exploration_sigma = 0.2
        #         self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 20000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor
        self.tau = 0.002  # for soft update of target parameters

        self.stats = np.array([])

#     def reset_episode(self):
#         self.noise.reset()
#         state = self.task.reset()
#         self.last_state = state
#         return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        # if score > 2:
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Roll over last state and action
        self.last_state = next_state

        # Learn, if enough samples are available in memory
        # if len(self.memory) > self.batch_size:
        if len(self.memory) > 200:
            experiences = self.memory.sample(batch_size=self.batch_size)
            lossarr = self.learn(experiences)
            return lossarr

        return 0

    def act(self, state, epsilon=0.1, epsilon_min=0.05):
        """Returns actions for given state(s) as per current policy."""
        self.epsilon = epsilon
        if self.epsilon < epsilon_min:
            # print('epsilon', epsilon)
            self.epsilon = epsilon_min

        state = np.reshape(state, [-1, self.state_size])
        act_prob = self.actor_local.model.predict(state)[0]

        # if np.random.rand() > self.epsilon:
        # 	action = np.argmax(act_prob)
        # else:
        # 	action = np.random.binomial(1, 0.5, 1)[0]

        if np.random.rand() <= self.epsilon:
            # act_prob = act_prob + np.random.randn(2)*self.epsilon*10
            action = np.random.binomial(1, 0.5, 1)[0]
            act_prob = np.zeros(2)
            act_prob[action] = 1

        # return list(action + self.noise.sample())  # add some noise for exploration
        # action_ = np.zeros(self.action_size)
        # action_[action] = 1
        return act_prob

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        #         self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)
        history = self.critic_local.model.fit(x=[states, actions],
                                              y=Q_targets,
                                              batch_size=self.batch_size,
                                              epochs=1,
                                              verbose=0)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))

        # calculate statistics
        statistics = np.array([
            action_gradients.max(),
            action_gradients.min(),
            action_gradients.mean()
        ])
        self.stats = np.vstack([self.stats, statistics
                                ]) if self.stats.size else statistics

        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        # self.update_target()
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        return history.history['loss']

    # separate this out for each episode, not each step.
    def update_target(self):
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
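
Every learn() method above trains the actor through critic_local.get_action_gradients([states, actions, 0]) and actor_local.train_fn([states, action_gradients, 1]), but neither hook is defined in these snippets. Assuming the Keras/TensorFlow 1.x backend style the code was evidently written against, the wiring looks roughly like the sketch below; the builder functions are illustrative and do not mirror the exact Actor/Critic classes used here, and the trailing 0/1 argument is the Keras learning-phase flag.

from keras import backend as K
from keras import layers, models, optimizers


def build_critic(state_size, action_size, lr=1e-3):
    """Minimal critic Q(s, a) plus a hook exposing dQ/da for the actor update."""
    states = layers.Input(shape=(state_size,), name='states')
    actions = layers.Input(shape=(action_size,), name='actions')
    net = layers.Concatenate()([states, actions])
    net = layers.Dense(64, activation='relu')(net)
    q_values = layers.Dense(1, name='q_values')(net)

    model = models.Model(inputs=[states, actions], outputs=q_values)
    model.compile(optimizer=optimizers.Adam(lr=lr), loss='mse')

    # get_action_gradients([states, actions, learning_phase]) -> [dQ/da]
    action_gradients = K.gradients(q_values, actions)
    get_action_gradients = K.function(
        inputs=[*model.input, K.learning_phase()], outputs=action_gradients)
    return model, get_action_gradients


def build_actor(state_size, action_size, lr=1e-4):
    """Minimal actor mu(s) plus a train_fn that ascends the critic's action gradient."""
    states = layers.Input(shape=(state_size,), name='states')
    net = layers.Dense(64, activation='relu')(states)
    actions = layers.Dense(action_size, activation='tanh', name='actions')(net)
    model = models.Model(inputs=states, outputs=actions)

    # train_fn([states, action_gradients, learning_phase]) performs one gradient-ascent step
    action_gradients = layers.Input(shape=(action_size,))
    loss = K.mean(-action_gradients * actions)
    updates_op = optimizers.Adam(lr=lr).get_updates(
        params=model.trainable_weights, loss=loss)
    train_fn = K.function(
        inputs=[model.input, action_gradients, K.learning_phase()],
        outputs=[], updates=updates_op)
    return model, train_fn
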
Example #6
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        #print("initializing DDPG agent")
        # Actor (Policy) Model
        #print("initializing actor_local")
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        #print("initializing actor_target")
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        #print("initializing critic_local")
        self.critic_local = Critic(self.state_size, self.action_size)
        #print("initializing critic_target")
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 1.0        # initial value was 0
        self.exploration_theta = 0.5     # initial value was 0.15
        self.exploration_sigma = 0.15    # initial value was 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor (initial value was 0.99)
        self.tau = 0.05  # for soft update of target parameters (initial value was 0.01)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        #print("\nactor act sees state as {}\n".format(state))
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        #print("we are about to learn from our experiences")
        #print(experiences)
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        # the below line was throwing an error when running with tg-gpu2 and python2.7
        #action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        print("agent.py in learn method")
        print(states[1])
        print(actions[0])
        print("Python2.7 agent.py saw those for states and actions")
        #a_g = self.critic_local.get_action_gradients([states, actions, 0])
        #print(type(a_g[0]))
        #print(a_g[0])
        #print("Python2.7 agent.py saw that action gradient")

        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))

        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)   

    def soft_update(self, local_model, target_model):
        #print("\ntime to update the target model")
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
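
For completeness, a sketch of the episode loop that drives the task-based agents above (Examples #1 and #6, whose act(state) returns a plain action). The Task interface (reset(), step(action) returning next_state, reward, done) is assumed in the style of the Udacity quadcopter project and is not part of the snippets on this page.

# Hypothetical driver loop; `task` and the DDPG class from Example #6 are assumed to exist.
num_episodes = 500
agent = DDPG(task)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()                     # also resets the OU noise
    episode_reward = 0.0
    done = False
    while not done:
        action = agent.act(state)                     # noisy action from the local actor
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)  # store transition and learn when possible
        state = next_state
        episode_reward += reward
    print("Episode {:4d}  reward: {:8.3f}".format(i_episode, episode_reward))
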