Example #1
class VPGSolver(StandardAgent):
    """
    A standard vpg_solver, inspired by:
      https://github.com/jachiam/rl-intro/blob/master/pg_cartpole.py
    NOTE:
        progress should be tracked in steps (total_t), not episodes, as VPG
        does not train once per episode.
    """
    can_graph = True  # tf.function graphing enabled via conditional_decorator

    def __init__(self, 
        experiment_name, 
        env_wrapper,
        gamma=0.99, 
        epsilon=None,
        epsilon_decay_rate=0.995,
        epsilon_min=0.1,
        batch_size=64,
        n_cycles=128,
        learning_rate=0.01,
        model_name="vpg", 
        saving=True):

        super(VPGSolver, self).__init__(
            env_wrapper,
            model_name,
            experiment_name,
            saving=saving)

        self.label = "Batch"  # not by episode, by arbitrary batch
        self.action_size_tensor = tf.constant(self.action_size)
        
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay_rate = epsilon_decay_rate
        self.epsilon_min = epsilon_min

        # TODO could go to standard..
        self.batch_size = batch_size
        self.n_cycles = n_cycles

        self.memory = []  # (state_batch, act_batch, advantages), set by remember()
        self.solved_on = None

        self.model = self.build_model()
        self.optimizer = Adam(lr=learning_rate)

        self.load_state()

        # TODO rollout steps

    @staticmethod
    def discount_future_cumsum(episode_rewards, gamma):
        """
        Takes: 
            A list of rewards per step for an episode
        Returns: 
            The future reward at each step, with the future discounting 
            rate applied from that step onwards.
        """
        ep_rwds = np.array(episode_rewards)
        n = len(ep_rwds)
        discounts = gamma ** np.arange(n)
        discounted_futures = np.zeros_like(ep_rwds, dtype=np.float64)
        for j in range(n):
            discounted_futures[j] = sum(ep_rwds[j:] * discounts[:(n-j)])

        assert len(discounted_futures) == len(episode_rewards)
        return discounted_futures
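
    # Worked example (illustrative numbers, not from the original source):
    # for episode_rewards = [1., 1., 1.] and gamma = 0.5 this returns
    #   [1 + 0.5 + 0.25, 1 + 0.5, 1] = [1.75, 1.5, 1.0],
    # i.e. the discounted return-to-go from each step onwards.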

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env = self.env_wrapper.env
        state, done, episode_rewards = env.reset(), False, []
        success_steps = 0

        for batch_num in range(max_iters):
            # Refresh every batch (on-policy)
            state_batch, act_batch, batch_future_rewards = [], [], []

            for step in range(self.n_cycles):
                if render:
                    env.render()

                action = self.act(self.model, state, epsilon=self.epsilon)
                state_next, reward, done, _ = env.step(action)

                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, state_next, reward, done, step)
                
                state_batch.append(state.copy())
                act_batch.append(np.int32(action))
                episode_rewards.append(reward)

                # NOTE: Removed copy
                state = state_next

                self.report_step(step, batch_num, max_iters)

                if done:

                    # At the end of each episode:
                    # Create a list of future rewards, 
                    #  discounting by how far in the future
                    batch_future_rewards += list(
                        self.discount_future_cumsum(
                            episode_rewards, self.gamma))
                    self.scores.append(success_steps)
                    state, done, episode_rewards = env.reset(), False, []
                    success_steps = 0
                else:
                    success_steps += 1
            
            # Add rewards from any unfinished trailing episode
            batch_future_rewards += list(
                self.discount_future_cumsum(
                    episode_rewards, self.gamma)
            )
            episode_rewards = []

            # HANDLE END OF BATCH
            batch_advs = np.array(batch_future_rewards)

            # Normalised rewards-to-go, used as the advantage estimate
            normalised_batch_advs = ( 
                (batch_advs - np.mean(batch_advs))
                / (np.std(batch_advs) + 1e-8)
            )

            self.remember(state_batch, act_batch, normalised_batch_advs)
            self.learn(*self.get_batch_to_train())

            solved = self.handle_episode_end(
                state, state_next, reward, 
                step, max_iters, verbose=verbose)

            if solved:
                break
        
        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def remember(self, state_batch, act_batch, batch_advs):

        self.memory = (state_batch, act_batch, batch_advs)

    def get_batch_to_train(self):

        assert len(self.memory[0]) == len(self.memory[1]), f"{len(self.memory[0])}, {len(self.memory[1])}"
        assert len(self.memory[1]) == len(self.memory[2]), f"{len(self.memory[1])}, {len(self.memory[2])}"

        minibatch_i = np.random.choice(len(self.memory[0]),
            min(self.batch_size, len(self.memory[0])),
            )
        
        sampled_memory = []
        for i in range(len(self.memory)):
            sampled_memory.append(tf.convert_to_tensor([self.memory[i][j] for j in minibatch_i]))

        self.memory = []  # Only learning from last set of trajectories

        return sampled_memory
    
    def learn(self, sts, acts, advs):
        """Updated the agent's decision network based
        on a sample of previous decisions it has seen.
        Here, we combine the target and action networks.
        """

        loss_value = self.take_training_step(sts, acts, advs)

        if self.epsilon:
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay_rate

        return loss_value

    @conditional_decorator(tf.function, can_graph)
    def take_training_step(self, sts, acts, advs):
        tf.debugging.assert_equal(tf.shape(sts)[0], tf.size(acts), summarize=1) 
        tf.debugging.assert_equal(tf.size(acts), tf.size(advs), summarize=1)

        with tf.GradientTape() as tape:
            
            # One step away from Pi_theta(at|st)
            pi_action_logits = self.model(sts)
            
            action_one_hots = tf.one_hot(
                acts, self.action_size_tensor, dtype=tf.float64)
            
            # This is log pi_theta(at|st), evaluated only at the action actually taken
            pi_action_log_probs = tf.math.reduce_sum(
                action_one_hots * tf.nn.log_softmax(pi_action_logits), 
                axis=1)

            tf.debugging.assert_equal(tf.size(advs), tf.size(pi_action_log_probs))

            loss_value = - tf.math.reduce_mean(
                advs * pi_action_log_probs
            )

        grads = tape.gradient(loss_value, self.model.trainable_variables)

        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        return loss_value

    def save_state(self, add_to_save={}):
        """Save a (trained) model with its weights to a specified file.
        Metadata should be passed to keep information available.
        """

        self.save_state_to_dict(append_dict={
            "optimizer_config": self.optimizer.get_config(),
            "epislon": self.epsilon,
        })

        self.model.save(self.model_location)

    def load_state(self):
        """Load a model with the specified name"""

        model_dict = self.load_state_from_dict()

        print("Loading weights from", self.model_location + "...", end="")
        
        if os.path.exists(self.model_location):
            self.model = tf.keras.models.load_model(self.model_location)

            self.optimizer = self.optimizer.from_config(self.optimizer_config)
            del model_dict["optimizer_config"], self.optimizer_config

            print(" Loaded.")
        
        else:
            print(" Model not yet saved at loaction.")

        if "memory" in model_dict:
            del model_dict["memory"]

        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)
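
For reference, the core of take_training_step above is the vanilla policy-gradient loss. The standalone sketch below reproduces just that computation with made-up logits, actions and advantages (in the class these come from self.model and the collected batch), so it is an illustration rather than part of the original code.

import tensorflow as tf

# Made-up stand-ins for (logits, acts, advs): 3 states, 2 possible actions
logits = tf.constant([[0.2, -0.1], [0.0, 0.3], [1.0, -1.0]], dtype=tf.float64)
acts = tf.constant([0, 1, 0], dtype=tf.int32)
advs = tf.constant([1.0, -0.5, 0.2], dtype=tf.float64)

# log pi_theta(a_t | s_t), picked out at the actions actually taken
action_one_hots = tf.one_hot(acts, 2, dtype=tf.float64)
pi_action_log_probs = tf.reduce_sum(
    action_one_hots * tf.nn.log_softmax(logits), axis=1)

# Minimising this maximises the expected advantage-weighted log-probability
loss_value = -tf.reduce_mean(advs * pi_action_log_probs)
print(float(loss_value))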
Example #2
class PPOSolver(StandardAgent):
    """
    PPO Solver
    Inspired by:
      https://github.com/anita-hu/TF2-RL/blob/master/PPO/TF2_PPO.py
      https://github.com/ajleite/basic-ppo/blob/master/ppo.py
    """

    can_graph = True

    def __init__(self, 
        experiment_name,
        env_wrapper,
        clip_ratio=0.2,
        val_coef=1.0,
        entropy_coef=0.01,
        lam=1.0,
        gamma=0.95,
        actors=1,
        cycle_length=128,
        minibatch_size_per_actor=64,
        cycle_epochs=4,
        learning_rate=5e-4,
        model_name="ppo",
        saving=True):

        super(PPOSolver, self).__init__(
            env_wrapper,
            model_name,
            experiment_name, 
            saving=saving)

        self.clip_ratio = clip_ratio
        self.gamma = gamma
        self.lam = lam
        self.val_coef = val_coef
        self.entropy_coef = entropy_coef

        self.actors = actors
        self.cycle_length = cycle_length  # Run this many per epoch
        self.batch_size = cycle_length * actors  # Sample from the memory
        self.minibatch_size = minibatch_size_per_actor * actors  # train on batch
        self.cycle_epochs = cycle_epochs  # Train for this many epochs

        # self.num_init_random_rollouts = num_init_random_rollouts
        self.model_name = model_name

        self.solved_on = None

        self.model = PPOModel(
            self.state_size, self.action_size, model_name=self.model_name)
        self.model.build(input_shape=(None, self.state_size))

        # self._random_dataset = self._gather_rollouts(
        #     env_wrapper, num_init_random_rollouts, epsilon=1.)

        self.optimizer = Adam(lr=learning_rate)

        head, _, _ = self.model_location.rpartition(".h5")
        self.model_location = head + ".weights"
        self.load_state()

    def show(self, render=False):
        raise NotImplementedError("self.model needs to be adapted in super")

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env_trackers = [EnvTracker(self.env_wrapper) for _ in range(self.actors)]
        solved = False

        # Every episode return ever
        all_episode_returns = []
        all_episode_steps = []

        for iteration in range(max_iters):
            data = []  # Refresh every batch (on-policy)

            for env_tracker in env_trackers:
                state = env_tracker.latest_state
                states, actions, log_probs, rewards, v_preds =\
                    [], [], [], [], []

                for step in range(self.cycle_length):
                    if render:
                        env_tracker.env.render()

                    action, value, log_prob = (
                        tf.squeeze(x).numpy() for x in
                        self.model.act_value_logprobs(
                            state, 
                            eps=None)
                    )
                    observation, reward, done, _ = env_tracker.env.step(action)
                    state_next = observation

                    # Custom reward if required by env wrapper
                    reward = self.env_wrapper.reward_on_step(
                        state, state_next, reward, done, step)

                    env_tracker.return_so_far += reward

                    states.append(state)
                    actions.append(action)
                    log_probs.append(log_prob)
                    rewards.append(np.float64(reward))
                    v_preds.append(value)

                    self.report_step(step, iteration, max_iters)
                    if done:
                        all_episode_returns.append(
                            env_tracker.return_so_far)
                        all_episode_steps.append(env_tracker.steps_so_far)
                        state = env_tracker.env.reset()
                        env_tracker.steps_so_far = 0
                        env_tracker.return_so_far = 0.
                    else:
                        env_tracker.steps_so_far += 1
                        state = observation

                next_v_preds = v_preds[1:] + [0.]  # TODO - both right float?
                gaes = self.get_norm_general_advantage_est(
                    rewards, v_preds, next_v_preds)

                # TODO make a handler object
                if not data:
                    data = [
                        states, actions, log_probs, next_v_preds, rewards, 
                        gaes
                    ]
                else:
                    for i, new in enumerate((
                            states, actions, log_probs, next_v_preds,
                            rewards, gaes)):
                        data[i] += new

                env_tracker.latest_state = state

            self.scores = all_episode_steps  # FIXME: does not handle resuming from a previous run
            solved = self.handle_episode_end(
                state, state_next, reward, 
                step, max_iters, verbose=verbose)
            if solved: 
                break

            self.take_training_step(
                *(tf.convert_to_tensor(lst) for lst in data)
                # *tuple(map(tf.convert_to_tensor, zip(*memory)))
            )
        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def get_norm_general_advantage_est(self, rewards, v_preds, next_v_preds):
        # Sources:
        #  https://github.com/uidilr/ppo_tf/blob/master/ppo.py#L98
        #  https://github.com/anita-hu/TF2-RL/blob/master/PPO/TF2_PPO.py
        deltas = [
            r_t + self.gamma * v_next - v for r_t, v_next, v in 
            zip(rewards, next_v_preds, v_preds)
        ]
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(gaes) - 1)):
            gaes[t] = gaes[t] + self.lam * self.gamma * gaes[t + 1]

        gaes = np.array(gaes).astype(np.float64)
        norm_gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)

        return norm_gaes
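
    # Illustrative numbers (not from the source): with gamma=0.95, lam=1.0,
    # rewards=[1, 1], v_preds=[0.5, 0.5], next_v_preds=[0.5, 0.0]:
    #   deltas = [1 + 0.95*0.5 - 0.5, 1 + 0.95*0.0 - 0.5] = [0.975, 0.5]
    #   gaes   = [0.975 + 1.0*0.95*0.5, 0.5]              = [1.45, 0.5]
    # which normalise to [1.0, -1.0].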

    @conditional_decorator(tf.function, can_graph)
    def take_training_step(self, sts, a, log_p, nxt_v_pred, r, adv):
        """
        Performs gradient DEscent on minibatches of minibatch_size, 
        sampled from a batch of batch_size, sampled from the memory

        Samples without replacement (to check)
        """

        assert self.batch_size == len(r)

        for _ in range(self.cycle_epochs):
            # Batch from the examples in the memory
            shuffled_indices = tf.random.shuffle(tf.range(self.batch_size))  # Every index of the cycle examples
            num_mb = self.batch_size // self.minibatch_size
            # Pick minibatch-sized samples from there
            for minibatch_i in tf.split(shuffled_indices, num_mb):
                minibatch = (
                    tf.gather(x, minibatch_i, axis=0) 
                    for x in (sts, a, log_p, nxt_v_pred, r, adv)
                )
                self.train_minibatch(*minibatch)

        # TODO used to be zip weights and assign
        # for pi_old_w, pi_w in zip(
        #         self.pi_model_old.weights, self.pi_model.weights):
        #     pi_old_w.assign(pi_w)
    
    @conditional_decorator(tf.function, can_graph)
    def train_minibatch(self, sts, a, log_p, nxt_v_pred, r, adv):
       
        # Convert from (64,) to (64, 1)
        r = tf.expand_dims(r, axis=-1)
        nxt_v_pred = tf.expand_dims(nxt_v_pred, axis=-1)

        with tf.GradientTape() as tape:
            new_log_p, entropy, sts_vals = self.model.evaluate_actions(sts, a)
            ratios = tf.exp(new_log_p - log_p)

            clipped_ratios = tf.clip_by_value(
                ratios, 
                clip_value_min=1-self.clip_ratio, 
                clip_value_max=1+self.clip_ratio
            )
            loss_clip = tf.reduce_mean(
                tf.minimum(adv * ratios, adv * clipped_ratios)
            )
            target_values = r + self.gamma * nxt_v_pred

            vf_loss = tf.reduce_mean(
                tf.math.square(sts_vals - target_values)
            )

            entropy = tf.reduce_mean(entropy)

            total_loss = ( 
                - loss_clip 
                + self.val_coef * vf_loss 
                - self.entropy_coef * entropy
            )
        train_variables = self.model.trainable_variables
        grads = tape.gradient(total_loss, train_variables)
        self.optimizer.apply_gradients(zip(grads, train_variables))

    def save_state(self, verbose=False):
        """
        Called at the end of saving-episodes.

        Save a (trained) model with its weights to a specified file.
        Passes the required information to add to the pickle dict for the 
         model.
        """

        add_to_save = {
            # "epsilon": self.epsilon,
            # "memory": self.memory,
            "optimizer_config": self.optimizer.get_config(),
            }

        self.save_state_to_dict(append_dict=add_to_save)

        if verbose:
            print("Saving to", self.model_location)

        self.model.save_weights(self.model_location) # , save_format='tf')

    def load_state(self):
        """Load a model with the specified name"""

        model_dict = self.load_state_from_dict()

        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            self.model.load_weights(self.model_location)
            self.optimizer = self.optimizer.from_config(self.optimizer_config)
            del model_dict["optimizer_config"], self.optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at loaction.")

        if "memory" in model_dict:
            del model_dict["memory"]

        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)
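
The heart of train_minibatch is PPO's clipped surrogate term. The snippet below isolates that term with illustrative ratios and advantages (in the class they come from evaluate_actions and the normalised GAEs); treat it as a sketch of the behaviour, not the original code.

import tensorflow as tf

# Illustrative probability ratios pi_new/pi_old and advantages
ratios = tf.constant([0.5, 1.0, 1.6], dtype=tf.float64)
adv = tf.constant([1.0, -1.0, 2.0], dtype=tf.float64)
clip_ratio = 0.2

clipped_ratios = tf.clip_by_value(ratios, 1 - clip_ratio, 1 + clip_ratio)
# Take the more pessimistic of the clipped and unclipped estimates
loss_clip = tf.reduce_mean(tf.minimum(adv * ratios, adv * clipped_ratios))
print(float(loss_clip))  # enters total_loss with a minus sign, so it is maximised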
Example #3
class DDPGSolver(StandardAgent):
    """
    A standard ddpg solver:
      https://github.com/openai/baselines/blob/master/baselines/a2c/a2c.py
    Inspired by
      https://github.com/anita-hu/TF2-RL/blob/master/DDPG/TF2_DDPG_Basic.py
    """
    def __init__(
        self,
        experiment_name,
        env_wrapper,
        ent_coef=1e-4,
        vf_coef=0.5,
        n_cycles=128,
        batch_size=64,
        max_grad_norm=0.5,
        learning_rate_actor=1e-5,
        learning_rate_critic=1e-3,
        memory_len=100000,
        gamma=0.99,
        epsilon=None,
        tau=0.125,
        lrschedule='linear',
        model_name="ddpg",
        saving=True,
        rollout_steps=5000,
    ):

        super(DDPGSolver, self).__init__(env_wrapper,
                                         model_name,
                                         experiment_name,
                                         saving=saving)

        self.n_cycles = n_cycles
        self.batch_size = batch_size

        self.gamma = gamma
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef

        # NOTE: new; still need to verify that a deque is safe here
        self.memory = deque(maxlen=memory_len)
        self.epsilon = epsilon  # new but should be in A2C
        self.tau = tau

        # TODO reimplement
        # self.max_grad_norm = max_grad_norm
        # self.epsilon = epsilon  # exploration rate

        self.solved_on = None

        self.actor = self.build_model(model_name=model_name + "_actor")
        self.actor.build(input_shape=(
            None,
            self.state_size,
        ))

        self.actor_dash = self.build_model(model_name=model_name +
                                           "_actor_target")
        self.actor_dash.build(input_shape=(
            None,
            self.state_size,
        ))

        self.actor_dash.set_weights(self.actor.get_weights())

        self.actor_optimizer = Adam(learning_rate=learning_rate_actor)
        self.actor.summary()

        self.critic = self.build_critic_model(self.state_size,
                                              self.action_size,
                                              model_name=model_name +
                                              "_critic")
        # self.critic.build(input_shape=[(state_size,), (action_size,)])
        self.critic_dash = self.build_critic_model(self.state_size,
                                                   self.action_size,
                                                   model_name=model_name +
                                                   "_critic_target")
        # self.critic_dash.build(input_shape=[(state_size,), (action_size,)])

        self.critic_dash.set_weights(self.critic.get_weights())

        self.critic_optimizer = Adam(learning_rate=learning_rate_critic)
        self.critic.summary()

        self.load_state()

        self.rollout_memory(rollout_steps - len(self.memory))

    def build_critic_model(self, input_size, action_size, model_name='critic'):
        """
        Returns Q(st+1 | a, s)
        """

        inputs = [Input(shape=(input_size,)), Input(shape=(action_size,))]
        concat = Concatenate(axis=-1)(inputs)
        x = Dense(24, name="hidden_1", activation='tanh')(concat)
        x = Dense(48, name="hidden_2", activation='tanh')(x)
        output = Dense(1, name="Out")(x)
        model = Model(inputs=inputs, outputs=output, name=model_name)
        model.build(input_shape=[(input_size, ), (action_size, )])

        return model

    def act_with_noise(self, state, add_noise=True):
        raise NotImplementedError(
            "Consider implementing from\nhttps://github.com/anita-hu/"
            "TF2-RL/blob/master/DDPG/TF2_DDPG_Basic.py")

    def show(self, render=False):
        raise NotImplementedError("self.model needs to be adapted in super")

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env = self.env_wrapper.env
        state = env.reset()

        success_steps = 0

        for iteration in range(max_iters):
            for step in range(self.n_cycles):  # itertools.count():
                if render:
                    env.render()

                # TODO implement act and add noise
                action_dist = self.actor(tf.expand_dims(state, axis=0))
                observation, reward, done, _ = env.step(np.argmax(action_dist))

                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, observation, reward, done, step)

                self.memory.append((state, tf.squeeze(action_dist),
                                    np.float64(reward), observation, done))
                state = observation

                self.report_step(step, iteration, max_iters)

                if done:
                    # OR env_wrapper.get_score(state, observation, reward, step)
                    self.scores.append(success_steps)
                    success_steps = 0
                    state = env.reset()
                else:
                    success_steps += 1

                self.take_training_step()

            solved = self.handle_episode_end(state,
                                             observation,
                                             reward,
                                             step,
                                             max_iters,
                                             verbose=verbose)

            if solved:
                break

        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def take_training_step(self):
        if len(self.memory) < self.batch_size:
            return

        # Note: min is actually unnecessary given the condition above
        minibatch_i = np.random.choice(
            len(self.memory),
            min(self.batch_size, len(self.memory)),
        )

        minibatch = [self.memory[i] for i in minibatch_i]

        # Obs on [adv, return]
        loss_value = self.train_on_minibatch(
            *tuple(map(tf.convert_to_tensor, zip(*minibatch))))

        # Update weights
        for model_name in "actor", "critic":
            self.update_weights(model_name, self.tau)

        # TODO decrease epsilon if not None

    @tf.function()
    def train_on_minibatch(self, sts, a, r, n_sts, d):

        # r + gam(1-d)Q_phi_targ(s_t+1, mu_theta_targ(s_t+1))
        n_a = self.actor_dash(n_sts)
        q_future_pred = self.critic_dash([n_sts, n_a])
        target_qs = r + tf.where(
            d, tf.zeros(shape=q_future_pred.shape, dtype=tf.dtypes.float64),
            self.gamma * q_future_pred)

        # Minimise (r + target on next state) - (current critic on sts and a)
        # Makes critic better at predicting future
        with tf.GradientTape() as tape:
            updated_q_values = self.critic([sts, a])
            critic_loss = tf.reduce_mean(
                tf.math.square(updated_q_values - target_qs))

        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        # Use the (improving) critic to rate the actor's updated decision
        # Minimising loss means maximising actor's expectation
        with tf.GradientTape() as tape:
            # mu_phi(s)
            updated_action_dist = self.actor(sts)
            # Works due to the chain rule: tracks mu's gradients to improve mu's prediction
            # TODO this is quite nuanced - check this through
            actor_loss = -tf.reduce_mean(
                self.critic([sts, updated_action_dist]))

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

    def update_weights(self, model_name, tau):
        weights = getattr(getattr(self, model_name), "weights")
        target_model = getattr(self, model_name + "_dash")
        target_weights = target_model.weights
        target_model.set_weights([
            weights[i] * tau + target_weights[i] * (1. - tau)
            for i in range(len(weights))
        ])

    def save_state(self):
        """
        Called at the end of saving-episodes.

        Save a (trained) model with its weights to a specified file.
        Passes the required information to add to the pickle dict for the 
         model.
        """

        add_to_save = {
            "memory": self.memory,
            "epsilon": self.epsilon,
            "actor_optimizer_config": self.actor_optimizer.get_config(),
            "critic_optimizer_config": self.critic_optimizer.get_config(),
        }

        self.save_state_to_dict(append_dict=add_to_save)

        for var in ("actor", "actor_dash", "critic", "critic_dash"):
            model = getattr(self, var)
            model.save_weights(
                self.model_location.replace(".h5", "_" + var + ".h5"))

    def load_state(self):
        """Load a model with the specified name"""

        model_dict = self.load_state_from_dict()

        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            for var in ("actor", "actor_dash", "critic", "critic_dash"):
                model = getattr(self, var)
                model.load_weights(
                    self.model_location.replace(".h5", "_" + var + ".h5"))
            self.actor_optimizer = self.actor_optimizer.from_config(
                self.actor_optimizer_config)
            self.critic_optimizer = self.critic_optimizer.from_config(
                self.critic_optimizer_config)
            del model_dict[
                "actor_optimizer_config"], self.actor_optimizer_config
            del model_dict[
                "critic_optimizer_config"], self.critic_optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at loaction.")

        if "memory" in model_dict:
            del model_dict["memory"]

        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)

    def rollout_memory(self, rollout_steps, render=False):
        if rollout_steps <= 0:
            return
        print("Rolling out steps", rollout_steps)
        env = self.env_wrapper.env
        state = env.reset()

        max_iters = rollout_steps // self.n_cycles

        for iteration in range(max_iters):
            for step in range(self.n_cycles):
                if render:
                    env.render()

                # TODO implement act and add noise
                action_dist = self.actor(tf.expand_dims(state, axis=0))
                observation, reward, done, _ = env.step(np.argmax(action_dist))

                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, observation, reward, done, step)

                self.memory.append((state, tf.squeeze(action_dist),
                                    np.float64(reward), observation, done))
                state = observation

                self.report_step(step, iteration, max_iters)

                if done:
                    state = env.reset()

        print("\nCompleted.")
Example #4
class DQNSolver(StandardAgent):
    """
    A standard dqn_solver, inspired by:
      https://gym.openai.com/evaluations/eval_EIcM1ZBnQW2LBaFN6FY65g/
    Implements a simple DNN that predicts values.
    """
    def __init__(self,
                 experiment_name,
                 env_wrapper,
                 memory_len=100000,
                 gamma=0.99,
                 batch_size=64,
                 n_cycles=128,
                 epsilon=1.,
                 epsilon_min=0.01,
                 epsilon_decay=0.995,
                 learning_rate=0.01,
                 learning_rate_decay=0.01,
                 rollout_steps=10000,
                 model_name="dqn",
                 saving=True):

        super(DQNSolver, self).__init__(env_wrapper,
                                        model_name,
                                        experiment_name,
                                        saving=saving)

        # Training
        self.batch_size = batch_size
        self.n_cycles = n_cycles

        self.memory = deque(maxlen=memory_len)
        self.solved_on = None

        self.gamma = gamma  # discount rate was 1
        self.epsilon = epsilon  # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay  # 0.995

        self.model = self.build_model()

        self.optimizer = Adam(lr=learning_rate, decay=learning_rate_decay)

        self.load_state()

        self.rollout_memory(rollout_steps - len(self.memory))

    def rollout_memory(self, rollout_steps, verbose=False, render=False):
        if rollout_steps <= 0:
            return
        env = self.env_wrapper.env
        state = env.reset()
        for step in range(rollout_steps):
            if render:
                env.render()

            action = self.act(self.model, state, epsilon=1.)  # Max random
            observation, reward, done, _ = env.step(action)
            state_next = observation

            # Custom reward if required by env wrapper
            reward = self.env_wrapper.reward_on_step(state, state_next, reward,
                                                     done, step)

            self.memory.append(
                (state, np.int32(action), reward, state_next, done))
            state = observation

            if done:
                state = env.reset()
                # OR env_wrapper.get_score(state, state_next, reward, step)
        print(f"Rolled out {len(self.memory)}")

    def solve(self, max_iters, verbose=False, render=False):
        start_time = datetime.datetime.now()
        env = self.env_wrapper.env
        state = env.reset()
        success_steps = 0

        for iteration in range(max_iters):
            for step in range(self.n_cycles):
                if render:
                    env.render()

                action = self.act(self.model, state, epsilon=self.epsilon)
                observation, reward, done, _ = env.step(action)
                state_next = observation

                # Custom reward if required by env wrapper
                reward = self.env_wrapper.reward_on_step(
                    state, state_next, reward, done, step)

                self.memory.append(
                    (state, np.int32(action), reward, state_next, done))
                state = observation

                self.report_step(step, iteration, max_iters)
                if done:
                    state = env.reset()
                    # OR env_wrapper.get_score(state, state_next, reward, step)
                    self.scores.append(success_steps)
                    success_steps = 0
                else:
                    success_steps += 1

                self.learn()

            score = step

            solved = self.handle_episode_end(state,
                                             state_next,
                                             reward,
                                             step,
                                             max_iters,
                                             verbose=verbose)

            if solved:
                break

        self.elapsed_time += (datetime.datetime.now() - start_time)
        return solved

    def learn(self):
        """
        Updates the agent's decision network based
        on a sample of previous decisions it has seen.
        Here, we combine the target and action networks.
        """
        if len(self.memory) < self.batch_size:
            return

        args_as_tuple = get_batch_from_memory(self.memory, self.batch_size)

        loss_value = self.take_training_step(*args_as_tuple)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    @tf.function
    def take_training_step(self, sts, a, r, n_sts, d):

        future_q_pred = tf.math.reduce_max(self.model(n_sts), axis=-1)
        future_q_pred = tf.where(d, tf.zeros((1, ), dtype=tf.dtypes.float64),
                                 future_q_pred)
        q_targets = tf.cast(r, tf.float64) + self.gamma * future_q_pred

        loss_value, grads = self.squared_diff_loss_at_a(sts, a, q_targets)

        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        return loss_value

    @tf.function
    def squared_diff_loss_at_a(self, sts, a, q_next):
        """
        A squared-difference loss function.
        Diffs the Q model's predicted value for the action taken in a state
        with the actual reward + the discounted prediction for the next state.
        """
        with tf.GradientTape() as tape:
            q_s = self.model(sts)  # Q(st)
            # Take only predicted value of the action taken for Q(st|at)
            gather_indices = tf.range(a.shape[0]) * tf.shape(q_s)[-1] + a
            q_s_a = tf.gather(tf.reshape(q_s, [-1]), gather_indices)

            # Q(st|at) diff Q(st+1)
            losses = tf.math.squared_difference(q_s_a, q_next)
            reduced_loss = tf.math.reduce_mean(losses)

        return (reduced_loss,
                tape.gradient(reduced_loss, self.model.trainable_variables))
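
    # E.g. with a batch of 3 states and 2 actions, q_s is flattened to a
    # vector of length 6 and gather_indices = [0, 2, 4] + a picks out
    # q_s[i, a[i]] for each row i.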

    def save_state(self):
        """
        Called at the end of saving-episodes.

        Save a (trained) model with its weights to a specified file.
        Passes the required information to add to the pickle dict for the 
         model.
        """

        add_to_save = {
            "epsilon": self.epsilon,
            "memory": self.memory,
            "optimizer_config": self.optimizer.get_config(),
        }

        self.save_state_to_dict(append_dict=add_to_save)

        self.model.save(self.model_location)

    def load_state(self):
        """Load a model with the specified name"""

        model_dict = self.load_state_from_dict()

        print("Loading weights from", self.model_location + "...", end="")
        if os.path.exists(self.model_location):
            self.model = tf.keras.models.load_model(self.model_location)
            self.optimizer = self.optimizer.from_config(self.optimizer_config)
            del model_dict["optimizer_config"], self.optimizer_config
            print(" Loaded.")
        else:
            print(" Model not yet saved at loaction.")

        if "memory" in model_dict:
            del model_dict["memory"]

        print("Loaded state:")
        pprint.pprint(model_dict, depth=1)
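
get_batch_from_memory, used in learn, is defined elsewhere in the repository. A minimal sketch of what it presumably does, assuming it samples transitions and stacks each field into tensors in the (sts, a, r, n_sts, d) order expected by take_training_step (this is an assumption, not the repository's actual helper):

import random
import numpy as np
import tensorflow as tf

def get_batch_from_memory(memory, batch_size):
    # Hypothetical sketch: sample (state, action, reward, next_state, done)
    # transitions and stack each field into a tensor
    minibatch = random.sample(list(memory), min(batch_size, len(memory)))
    sts, a, r, n_sts, d = zip(*minibatch)
    return (tf.convert_to_tensor(np.array(sts), dtype=tf.float64),
            tf.convert_to_tensor(a, dtype=tf.int32),
            tf.convert_to_tensor(r, dtype=tf.float64),
            tf.convert_to_tensor(np.array(n_sts), dtype=tf.float64),
            tf.convert_to_tensor(d, dtype=tf.bool))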