Example #1
           kernel_initializer=initializer),
    Conv2D(16, 2, activation='elu', padding='valid',
           input_shape=dummy_env.observation_space.shape,
           kernel_initializer=initializer),
    Flatten(),
    Dropout(0.5),
    Dense(512, activation='elu', kernel_initializer=initializer)
])

# Exploration and learning rate decay after each epoch
eps = 0.2
eps_decay = 0.9
learning_rate = 3e-3
learning_decay = 0.9
explore_policy = EpsGreedy(eps, gym_2048.check_valid)
exploit_policy = Greedy(gym_2048.check_valid)
# Create Deep Q-Learning Network agent
agent = DQN(model, actions=dummy_env.action_space.n, gamma=0.99,
            batch_size=64, nsteps=50, enable_double_dqn=True,
            enable_dueling_network=True, target_update=10,
            test_policy=exploit_policy)


def plot_rewards_show(episode_rewards, episode_steps, done=False,
                      title='Rewards'):
    plt.clf()
    plt.xlabel('Step')
    plt.ylabel('Reward')
    plt.title(title)
    for i, (ed, steps) in enumerate(zip(episode_rewards, episode_steps)):
        plt.plot(steps, ed, alpha=0.5 if i == 0 else 0.2,
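
The decay constants above imply an exponential schedule, but the part of the example that applies it is cut off here. A minimal, self-contained sketch (plain Python, not the library's API) of the values such a schedule produces when the decay is applied once per epoch:

eps, eps_decay = 0.2, 0.9
lr, lr_decay = 3e-3, 0.9
for epoch in range(5):
    print(f'epoch {epoch}: eps={eps:.4f}, lr={lr:.5f}')
    eps *= eps_decay  # exploration probability shrinks geometrically
    lr *= lr_decay    # learning rate shrinks geometrically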
Example #2
    def __init__(self,
                 model,
                 actions,
                 optimizer=None,
                 policy=None,
                 test_policy=None,
                 gamma=0.99,
                 instances=8,
                 nsteps=1,
                 value_loss=0.5,
                 entropy_loss=0.01):
        """
		TODO: Describe parameters
		"""
        self.actions = actions
        self.optimizer = Adam(lr=3e-3) if optimizer is None else optimizer
        self.memory = memory.OnPolicy(steps=nsteps, instances=instances)

        if policy is None:
            # Create one policy per instance, with varying exploration parameters
            self.policy = [Greedy()] + [
                GaussianEpsGreedy(eps, 0.1)
                for eps in np.arange(0, 1, 1 / (instances - 1))
            ]
        else:
            self.policy = policy
        self.test_policy = Greedy() if test_policy is None else test_policy

        self.gamma = gamma
        self.instances = instances
        self.nsteps = nsteps
        self.value_loss = value_loss
        self.entropy_loss = entropy_loss
        self.training = True

        # Create output model layers based on number of actions
        raw_output = model.layers[-1].output
        actor = Dense(actions, activation='softmax')(
            raw_output)  # Actor (Policy Network)
        critic = Dense(1, activation='linear')(
            raw_output)  # Critic (Value Network)
        output_layer = Concatenate()([actor, critic])
        self.model = Model(inputs=model.input, outputs=output_layer)

        def a2c_loss(targets_actions, y_pred):
            # Unpack input
            targets, actions = targets_actions[:, 0], targets_actions[:, 1:]
            probs, values = y_pred[:, :-1], y_pred[:, -1]
            # Compute advantages and log-probabilities
            adv = targets - values
            logprob = tf.math.log(
                tf.reduce_sum(probs * actions, axis=1, keepdims=False) + 1e-10)
            # Compute composite loss
            loss_policy = -adv * logprob
            loss_value = self.value_loss * tf.square(adv)
            entropy = self.entropy_loss * tf.reduce_sum(
                probs * tf.math.log(probs + 1e-10), axis=1, keepdims=False)
            return tf.reduce_mean(loss_policy + loss_value + entropy)

        self.model.compile(optimizer=self.optimizer, loss=a2c_loss)
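
The custom a2c_loss receives its inputs packed into single tensors: y_pred concatenates the actor's action probabilities with the critic's value in the last column, while the first argument carries the n-step return target in column 0 and a one-hot mask of the taken action in the remaining columns. A hedged, numpy-only sketch of that packing and unpacking (mirroring the train() code shown in the next example, not library code):

import numpy as np

instances, n_actions = 4, 3
target_qvals = np.array([1.0, 0.5, -0.2, 2.0])   # n-step return targets
action_batch = np.array([0, 2, 1, 1])            # actions that were taken

# Pack: column 0 = target, columns 1: = one-hot action mask
targets_actions = np.zeros((instances, n_actions + 1))
targets_actions[np.arange(instances), 0] = target_qvals
targets_actions[np.arange(instances), action_batch + 1] = 1

# Unpack the same way a2c_loss does
targets, actions = targets_actions[:, 0], targets_actions[:, 1:]
assert np.allclose(targets, target_qvals)
assert np.array_equal(actions.argmax(axis=1), action_batch)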
Example #3
class A2C(Agent):
    """Advantage Actor-Critic (A2C)
	A2C is a synchronous version of A3C which gives equal or better performance.
	For more information on A2C refer to the OpenAI blog post: https://blog.openai.com/baselines-acktr-a2c/.
	The A3C algorithm is described in "Asynchronous Methods for Deep Reinforcement Learning" (Mnih et al., 2016)
	Since this algorithm is on-policy, it can and should be trained with multiple simultaneous environment instances.
	The parallelism decorrelates the agents' data into a more stationary process which aids learning.
	"""
    def __init__(self,
                 model,
                 actions,
                 optimizer=None,
                 policy=None,
                 test_policy=None,
                 gamma=0.99,
                 instances=8,
                 nsteps=1,
                 value_loss=0.5,
                 entropy_loss=0.01):
        """
		TODO: Describe parameters
		"""
        self.actions = actions
        self.optimizer = Adam(lr=3e-3) if optimizer is None else optimizer
        self.memory = memory.OnPolicy(steps=nsteps, instances=instances)

        if policy is None:
            # Create one policy per instance, with varying exploration parameters
            self.policy = [Greedy()] + [
                GaussianEpsGreedy(eps, 0.1)
                for eps in np.arange(0, 1, 1 / (instances - 1))
            ]
        else:
            self.policy = policy
        self.test_policy = Greedy() if test_policy is None else test_policy

        self.gamma = gamma
        self.instances = instances
        self.nsteps = nsteps
        self.value_loss = value_loss
        self.entropy_loss = entropy_loss
        self.training = True

        # Create output model layers based on number of actions
        raw_output = model.layers[-1].output
        actor = Dense(actions, activation='softmax')(
            raw_output)  # Actor (Policy Network)
        critic = Dense(1, activation='linear')(
            raw_output)  # Critic (Value Network)
        output_layer = Concatenate()([actor, critic])
        self.model = Model(inputs=model.input, outputs=output_layer)

        def a2c_loss(targets_actions, y_pred):
            # Unpack input
            targets, actions = targets_actions[:, 0], targets_actions[:, 1:]
            probs, values = y_pred[:, :-1], y_pred[:, -1]
            # Compute advantages and log-probabilities
            adv = targets - values
            logprob = tf.math.log(
                tf.reduce_sum(probs * actions, axis=1, keepdims=False) + 1e-10)
            # Compute composite loss
            loss_policy = -adv * logprob
            loss_value = self.value_loss * tf.square(adv)
            entropy = self.entropy_loss * tf.reduce_sum(
                probs * tf.math.log(probs + 1e-10), axis=1, keepdims=False)
            return tf.reduce_mean(loss_policy + loss_value + entropy)

        self.model.compile(optimizer=self.optimizer, loss=a2c_loss)

    def save(self, filename, overwrite=False):
        """Saves the model parameters to the specified file."""
        self.model.save_weights(filename, overwrite=overwrite)

    def act(self, state, instance=0):
        """Returns the action to be taken given a state."""
        qvals = self.model.predict(np.array([state]))[0][:-1]
        if self.training:
            return self.policy[instance].act(qvals) if isinstance(
                self.policy, list) else self.policy.act(qvals)
        else:
            return self.test_policy[instance].act(qvals) if isinstance(
                self.test_policy, list) else self.test_policy.act(qvals)

    def push(self, transition, instance=0):
        """Stores the transition in memory."""
        self.memory.put(transition, instance)

    def train(self, step):
        """Trains the agent for one step."""
        if len(self.memory) < self.instances:
            return

        state_batch, action_batch, reward_batches, end_state_batch, not_done_mask = \
            self.memory.get()

        # Compute the value of the last next states
        target_qvals = np.zeros(self.instances)
        non_final_last_next_states = [
            es for es in end_state_batch if es is not None
        ]
        if len(non_final_last_next_states) > 0:
            non_final_mask = list(map(lambda s: s is not None,
                                      end_state_batch))
            target_qvals[non_final_mask] = self.model.predict_on_batch(
                np.array(non_final_last_next_states))[:, -1].squeeze()

        # Compute the n-step discounted return
        # If the episode ended within any sampled n-step trace, zero out the remaining rewards
        for n in reversed(range(self.nsteps)):
            rewards = np.array([b[n] for b in reward_batches])
            target_qvals *= np.array([t[n] for t in not_done_mask])
            target_qvals = rewards + (self.gamma * target_qvals)

        # Prepare loss data: target Q-values and actions taken (as a mask)
        ran = np.arange(self.instances)
        targets_actions = np.zeros((self.instances, self.actions + 1))
        targets_actions[ran, 0] = target_qvals
        targets_actions[ran, np.array(action_batch) + 1] = 1

        self.model.train_on_batch(np.array(state_batch), targets_actions)
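
The backward loop in train() folds the stored rewards into the bootstrapped critic value, producing the n-step return r_0 + gamma*r_1 + ... + gamma^n * V(s_n), with the not-done mask cutting the bootstrap off at terminal states. A hedged, numpy-only check of that loop on a single instance:

import numpy as np

gamma, nsteps = 0.99, 3
reward_batches = [[1.0, 0.0, 2.0]]   # one instance, three stored rewards
not_done_mask = [[1.0, 1.0, 1.0]]    # no terminal state inside the trace
target_qvals = np.array([5.0])       # bootstrap value V(s_n) from the critic

for n in reversed(range(nsteps)):
    rewards = np.array([b[n] for b in reward_batches])
    target_qvals *= np.array([t[n] for t in not_done_mask])
    target_qvals = rewards + (gamma * target_qvals)

expected = 1.0 + gamma * (0.0 + gamma * (2.0 + gamma * 5.0))
assert np.isclose(target_qvals[0], expected)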
Example #4
# Optimizer with scheduled learning rate decay
optimizer = Adam(lr=3e-3, decay=1e-5)
# Run multiple instances
instances = 8
# Exploration and learning rate decay after each epoch
eps_max = 0.2
eps_decay = 0.9
learning_rate = 3e-3
learning_decay = 0.9
# Create Advantage Actor-Critic agent
agent = A2C(model,
            actions=dummy_env.action_space.n,
            nsteps=20,
            instances=instances,
            optimizer=optimizer,
            test_policy=Greedy(gym_2048.check_valid))

# Run epochs
for epoch in range(20):
    # Create a policy for each instance with a different eps
    policy = [Greedy(gym_2048.check_valid)] + [
        EpsGreedy(eps, gym_2048.check_valid)
        for eps in np.arange(0, eps_max, eps_max / (instances - 1))
    ]
    # Update agent
    agent.policy = policy
    agent.model.optimizer.lr = learning_rate
    # Run epoch
    print(f'Epoch {epoch}')
    run_epoch(create_env,
              agent,
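
The policy list rebuilt at the start of each epoch mixes one purely greedy instance with EpsGreedy instances whose exploration rates are evenly spaced below eps_max. A hedged, numpy-only sketch of the eps values this produces with eps_max=0.2 and instances=8 as above:

import numpy as np

eps_max, instances = 0.2, 8
eps_values = np.arange(0, eps_max, eps_max / (instances - 1))
print(len(eps_values) + 1)  # 8 policies in total: one Greedy plus seven EpsGreedy
print(eps_values)           # 0.0, 0.0286, 0.0571, ..., 0.1714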
Example #5
    def __init__(self,
                 model,
                 actions,
                 optimizer=None,
                 policy=None,
                 test_policy=None,
                 memsize=100000,
                 target_update=10,
                 gamma=0.99,
                 batch_size=32,
                 nsteps=0,
                 enable_double_dqn=False,
                 enable_dueling_network=False,
                 dueling_type='avg'):
        """
		TODO: Describe parameters
		"""
        self.actions = actions
        self.optimizer = Adam(lr=3e-3) if optimizer is None else optimizer

        self.policy = EpsGreedy(0.1) if policy is None else policy
        self.test_policy = Greedy() if test_policy is None else test_policy

        self.memsize = memsize
        self.memory = PrioritizedExperienceReplay(memsize, nsteps)
        self.target_update = target_update
        self.gamma = gamma
        self.batch_size = batch_size
        self.nsteps = nsteps
        self.training = True

        # Extension options
        self.enable_double_dqn = enable_double_dqn
        self.enable_dueling_network = enable_dueling_network
        self.dueling_type = dueling_type

        # Create output layer based on number of actions and (optionally) a dueling architecture
        raw_output = model.layers[-1].output
        if self.enable_dueling_network:
            # "Dueling Network Architectures for Deep Reinforcement Learning" (Wang et al., 2016)
            # Output the state value (V) and the action-specific advantages (A) separately, then
            # compute the Q values: Q = A + V
            dueling_layer = Dense(self.actions + 1,
                                  activation='linear')(raw_output)
            if self.dueling_type == 'avg':
                f = lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - \
                    tf.reduce_mean(a[:, 1:], axis=1, keepdims=True)
            elif self.dueling_type == 'max':
                f = lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - \
                    tf.reduce_max(a[:, 1:], axis=1, keepdims=True)
            elif self.dueling_type == 'naive':
                f = lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:]
            else:
                raise HkException(
                    "dueling_type must be one of {'avg','max','naive'}")
            output_layer = Lambda(f,
                                  output_shape=(self.actions, ))(dueling_layer)
        else:
            output_layer = Dense(self.actions, activation='linear')(raw_output)

        self.model = Model(inputs=model.input, outputs=output_layer)

        # Define loss function that computes the MSE between target Q-values and cumulative discounted rewards
        # If using PrioritizedExperienceReplay, the loss function also computes the TD error
        # and updates the trace priorities
        def masked_q_loss(data, y_pred):
            """Computes the MSE between the Q-values of the actions that were taken and	the cumulative discounted
			rewards obtained after taking those actions. Updates trace priorities if using PrioritizedExperienceReplay.
			"""
            action_batch, target_qvals = data[:, 0], data[:, 1]
            seq = tf.cast(tf.range(0, tf.shape(action_batch)[0]), tf.int32)
            action_idxs = tf.transpose(
                tf.stack([seq, tf.cast(action_batch, tf.int32)]))
            qvals = tf.gather_nd(y_pred, action_idxs)
            if isinstance(self.memory, PrioritizedExperienceReplay):

                def update_priorities(_qvals, _target_qvals, _traces_idxs):
                    """Computes the TD error and updates memory priorities."""
                    td_error = np.abs((_target_qvals - _qvals).numpy())
                    _traces_idxs = (tf.cast(_traces_idxs, tf.int32)).numpy()
                    self.memory.update_priorities(_traces_idxs, td_error)
                    return _qvals

                qvals = tf.py_function(func=update_priorities,
                                       inp=[qvals, target_qvals, data[:, 2]],
                                       Tout=tf.float32)
            return tf.keras.losses.mse(qvals, target_qvals)

        self.model.compile(optimizer=self.optimizer, loss=masked_q_loss)

        # Clone model to use for delayed Q targets
        self.target_model = tf.keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())
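
The dueling head outputs the state value followed by the per-action advantages, and the Lambda layer recombines them into Q-values; with the default dueling_type='avg' that is Q = V + A - mean(A). A hedged, numpy-only sketch of that aggregation:

import numpy as np

dueling_out = np.array([[2.0, 1.0, 0.0, -1.0]])  # V = 2.0, advantages A = [1, 0, -1]
v, adv = dueling_out[:, :1], dueling_out[:, 1:]
q = v + adv - adv.mean(axis=1, keepdims=True)
print(q)  # [[3. 2. 1.]] -- advantages re-centered around the state value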
Example #6
class DQN(Agent):
    """Deep Q-Learning Network
	Base implementation:
		"Playing Atari with Deep Reinforcement Learning" (Mnih et al., 2013)
	Extensions:
		Multi-step returns: "Reinforcement Learning: An Introduction" 2nd ed. (Sutton & Barto, 2018)
		Double Q-Learning: "Deep Reinforcement Learning with Double Q-learning" (van Hasselt et al., 2015)
		Dueling Q-Network: "Dueling Network Architectures for Deep Reinforcement Learning" (Wang et al., 2016)
	"""
    def __init__(self,
                 model,
                 actions,
                 optimizer=None,
                 policy=None,
                 test_policy=None,
                 memsize=100000,
                 target_update=10,
                 gamma=0.99,
                 batch_size=32,
                 nsteps=0,
                 enable_double_dqn=False,
                 enable_dueling_network=False,
                 dueling_type='avg'):
        """
		TODO: Describe parameters
		"""
        self.actions = actions
        self.optimizer = Adam(lr=3e-3) if optimizer is None else optimizer

        self.policy = EpsGreedy(0.1) if policy is None else policy
        self.test_policy = Greedy() if test_policy is None else test_policy

        self.memsize = memsize
        self.memory = PrioritizedExperienceReplay(memsize, nsteps)
        self.target_update = target_update
        self.gamma = gamma
        self.batch_size = batch_size
        self.nsteps = nsteps
        self.training = True

        # Extension options
        self.enable_double_dqn = enable_double_dqn
        self.enable_dueling_network = enable_dueling_network
        self.dueling_type = dueling_type

        # Create output layer based on number of actions and (optionally) a dueling architecture
        raw_output = model.layers[-1].output
        if self.enable_dueling_network:
            # "Dueling Network Architectures for Deep Reinforcement Learning" (Wang et al., 2016)
            # Output the state value (V) and the action-specific advantages (A) separately, then
            # compute the Q values: Q = A + V
            dueling_layer = Dense(self.actions + 1,
                                  activation='linear')(raw_output)
            if self.dueling_type == 'avg':
                f = lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - \
                    tf.reduce_mean(a[:, 1:], axis=1, keepdims=True)
            elif self.dueling_type == 'max':
                f = lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:] - \
                    tf.reduce_max(a[:, 1:], axis=1, keepdims=True)
            elif self.dueling_type == 'naive':
                f = lambda a: tf.expand_dims(a[:, 0], -1) + a[:, 1:]
            else:
                raise HkException(
                    "dueling_type must be one of {'avg','max','naive'}")
            output_layer = Lambda(f,
                                  output_shape=(self.actions, ))(dueling_layer)
        else:
            output_layer = Dense(self.actions, activation='linear')(raw_output)

        self.model = Model(inputs=model.input, outputs=output_layer)

        # Define loss function that computes the MSE between target Q-values and cumulative discounted rewards
        # If using PrioritizedExperienceReplay, the loss function also computes the TD error
        # and updates the trace priorities
        def masked_q_loss(data, y_pred):
            """Computes the MSE between the Q-values of the actions that were taken and	the cumulative discounted
			rewards obtained after taking those actions. Updates trace priorities if using PrioritizedExperienceReplay.
			"""
            action_batch, target_qvals = data[:, 0], data[:, 1]
            seq = tf.cast(tf.range(0, tf.shape(action_batch)[0]), tf.int32)
            action_idxs = tf.transpose(
                tf.stack([seq, tf.cast(action_batch, tf.int32)]))
            qvals = tf.gather_nd(y_pred, action_idxs)
            if isinstance(self.memory, PrioritizedExperienceReplay):

                def update_priorities(_qvals, _target_qvals, _traces_idxs):
                    """Computes the TD error and updates memory priorities."""
                    td_error = np.abs((_target_qvals - _qvals).numpy())
                    _traces_idxs = (tf.cast(_traces_idxs, tf.int32)).numpy()
                    self.memory.update_priorities(_traces_idxs, td_error)
                    return _qvals

                qvals = tf.py_function(func=update_priorities,
                                       inp=[qvals, target_qvals, data[:, 2]],
                                       Tout=tf.float32)
            return tf.keras.losses.mse(qvals, target_qvals)

        self.model.compile(optimizer=self.optimizer, loss=masked_q_loss)

        # Clone model to use for delayed Q targets
        self.target_model = tf.keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

    def save(self, filename, overwrite=False):
        """Saves the model parameters to the specified file."""
        self.model.save_weights(filename, overwrite=overwrite)

    def act(self, state, instance=0):
        """Returns the action to be taken given a state."""
        qvals = self.model.predict(np.array([state]))[0]
        return self.policy.act(
            qvals) if self.training else self.test_policy.act(qvals)

    def push(self, transition, instance=0):
        """Stores the transition in memory."""
        self.memory.put(transition)

    def train(self, step):
        """Trains the agent for one step."""
        if len(self.memory) == 0:
            return

        # Update target network
        if self.target_update >= 1 and step % self.target_update == 0:
            # Perform a hard update
            self.target_model.set_weights(self.model.get_weights())
        elif self.target_update < 1:
            # Perform a soft update
            mw = np.array(self.model.get_weights())
            tmw = np.array(self.target_model.get_weights())
            self.target_model.set_weights(self.target_update * mw +
                                          (1 - self.target_update) * tmw)

        # Train even when the memory contains fewer traces than the specified batch_size
        batch_size = min(len(self.memory), self.batch_size)

        # Sample batch_size traces from memory
        state_batch, action_batch, reward_batches, end_state_batch, not_done_mask = \
            self.memory.get(batch_size)

        # Compute the value of the last next states
        target_qvals = np.zeros(batch_size)
        non_final_last_next_states = [
            es for es in end_state_batch if es is not None
        ]

        if len(non_final_last_next_states) > 0:
            if self.enable_double_dqn:
                # "Deep Reinforcement Learning with Double Q-learning" (van Hasselt et al., 2015)
                # The online network predicts the actions while the target network is used to estimate the Q-values
                q_values = self.model.predict_on_batch(
                    np.array(non_final_last_next_states))
                actions = np.argmax(q_values, axis=1)
                # Estimate Q-values using the target network, but select the values of the
                # actions with the highest Q-value w.r.t. the online model (as computed above).
                target_q_values = self.target_model.predict_on_batch(
                    np.array(non_final_last_next_states))
                selected_target_q_vals = target_q_values[
                    range(len(target_q_values)), actions]
            else:
                # Use delayed target network to compute target Q-values
                selected_target_q_vals = self.target_model.predict_on_batch(
                    np.array(non_final_last_next_states)).max(1)
            non_final_mask = list(map(lambda s: s is not None,
                                      end_state_batch))
            target_qvals[non_final_mask] = selected_target_q_vals

        # Compute the n-step discounted return
        # If the episode ended within any sampled n-step trace, zero out the remaining rewards
        for n in reversed(range(self.nsteps)):
            rewards = np.array([b[n] for b in reward_batches])
            target_qvals *= np.array([t[n] for t in not_done_mask])
            target_qvals = rewards + (self.gamma * target_qvals)

        # Compile information needed by the custom loss function
        loss_data = [action_batch, target_qvals]

        # If using PrioritizedExperienceReplay then we need to provide the trace indexes
        # to the loss function as well so we can update the priorities of the traces
        if isinstance(self.memory, PrioritizedExperienceReplay):
            loss_data.append(self.memory.last_traces_idxs())

        # Train model
        self.model.train_on_batch(np.array(state_batch),
                                  np.stack(loss_data).transpose())
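
For reference, a hedged, numpy-only sketch of the Double DQN target selection performed in train() above: the online model picks the argmax action for each non-final next state, and the target model supplies the Q-value of that action.

import numpy as np

online_q = np.array([[1.0, 3.0, 2.0],
                     [0.5, 0.1, 0.9]])  # online model's predictions for the next states
target_q = np.array([[0.8, 2.5, 2.9],
                     [0.4, 0.2, 1.1]])  # target model's predictions for the same states

actions = np.argmax(online_q, axis=1)                   # actions chosen by the online model
selected = target_q[np.arange(len(target_q)), actions]  # their Q-values under the target model
print(selected)  # [2.5 1.1]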