Example #1
    def compute_gae_advantage(self, batch, gamma, gae_lambda, use_gae=False):
        """
         Expects a batch containing at least one episode, sets advantages according to use_gae.

        :param batch: Sequence of observations for at least one episode.
        :param batch:
        :param gamma:
        :param gae_lambda:
        :param use_gae:
        :return:
        """

        for episode in batch:
            baseline = self.baseline_value_function.predict(episode)

            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])

            episode['returns'] = discount(episode['rewards'], gamma)

            if use_gae:
                deltas = episode['rewards'] + gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
                episode['advantage'] = discount(deltas, gamma * gae_lambda)
            else:
                episode['advantage'] = episode['returns'] - baseline
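
Both branches above rely on a discount helper that is not shown in these snippets. Below is a minimal sketch of what such a helper typically does, assuming it computes a reverse discounted cumulative sum; the name and signature come from the calls above, while the implementation itself is an assumption:

import numpy as np

def discount(rewards, gamma):
    # Assumed behaviour: reverse discounted cumulative sum, so that
    # out[t] = rewards[t] + gamma * out[t + 1], with out[T] = rewards[T].
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted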
Example #2
    def generalised_advantage_estimation(self, episode):
        """
         Expects an episode, returns advantages according to config.
        """
        baseline = self.baseline_value_function.predict(episode)

        if self.generalized_advantage_estimation:
            if episode['terminated']:
                adjusted_baseline = np.append(baseline, [0])
            else:
                adjusted_baseline = np.append(baseline, baseline[-1])
            deltas = episode['rewards'] + self.gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
            advantage = discount(deltas, self.gamma * self.gae_lambda)
        else:
            advantage = episode['returns'] - baseline

        if self.normalize_advantage:
            return zero_mean_unit_variance(advantage)
        else:
            return advantage
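
The normalisation branch calls a zero_mean_unit_variance helper that is also not shown. A minimal sketch, assuming it performs standard advantage normalisation (the epsilon guard is an added assumption to avoid dividing by zero):

import numpy as np

def zero_mean_unit_variance(x, epsilon=1e-8):
    # Standardise the advantages: subtract the mean and divide by the standard
    # deviation; epsilon keeps the division safe for near-constant inputs.
    x = np.asarray(x, dtype=np.float64)
    return (x - x.mean()) / (x.std() + epsilon)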
Example #3
    def update(self, batch):
        """
        Get global parameters, compute update, then send results to parameter server.
        :param batch:
        :return:
        """

        for episode in batch:
            episode['returns'] = discount(episode['rewards'], self.gamma)
            episode['advantages'] = self.generalised_advantage_estimation(episode)

        # Update linear value function for baseline prediction
        self.baseline_value_function.fit(batch)

        fetches = [self.loss, self.optimize_op, self.global_step]
        fetches.extend(self.local_network.internal_state_outputs)

        self.logger.debug('Batch size = {}, first episode length = {}'.format(
            len(batch), batch[0]['episode_length']))
        # Merge episode inputs into single arrays
        feed_dict = {
            self.episode_length:
            [episode['episode_length'] for episode in batch],
            self.state: [episode['states'] for episode in batch],
            self.actions: [episode['actions'] for episode in batch],
            self.advantage: [episode['advantages'] for episode in batch]
        }
        for n, internal_state in enumerate(
                self.local_network.internal_state_inputs):
            feed_dict[internal_state] = self.local_states[n]

        fetched = self.session.run(fetches, feed_dict)
        loss = fetched[0]
        # fetched[1] and fetched[2] are the optimize op and global step; the
        # remaining outputs are the network's updated internal (recurrent) states.
        self.local_states = fetched[3:]

        self.logger.debug('Distributed model loss = ' + str(loss))
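
For reference, update() expects each episode in the batch to be a dict carrying the keys used above. A toy batch of that shape could look like the sketch below; only the key names are taken from the snippets, while the shapes and values are made up for illustration:

import numpy as np

# Hypothetical toy episode; 'agent' stands in for an object exposing update().
episode = {
    'states': np.random.randn(5, 4).astype(np.float32),  # 5 steps of a 4-dim observation
    'actions': np.array([0, 1, 1, 0, 1]),
    'rewards': np.array([1.0, 0.0, 0.0, 1.0, 0.0]),
    'episode_length': 5,
    'terminated': True,
}
batch = [episode]
# agent.update(batch)  # would set 'returns' and 'advantages', fit the baseline, then run the graph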
Example #4
    def update(self, batch):
        """
        Compute update for one batch of experiences using general advantage estimation
        and the vanilla policy gradient.
        :param batch:
        :return:
        """

        # Set per episode return and advantage
        for episode in batch:
            episode['returns'] = discount(episode['rewards'], self.gamma)
            episode['advantages'] = self.advantage_estimation(episode)

        # Update linear value function for baseline prediction
        self.baseline_value_function.fit(batch)

        fetches = [self.optimize_op, self.log_probabilities, self.loss]
        fetches.extend(self.network.internal_state_outputs)

        feed_dict = {
            self.episode_length:
            [episode['episode_length'] for episode in batch],
            self.state: [episode['states'] for episode in batch],
            self.actions: [episode['actions'] for episode in batch],
            self.advantage: [episode['advantages'] for episode in batch]
        }

        for n, internal_state in enumerate(self.network.internal_state_inputs):
            feed_dict[internal_state] = self.internal_states[n]

        fetched = self.session.run(fetches, feed_dict)

        # fetched[0] and fetched[1] are the optimize op and log probabilities;
        # fetched[2] is the scalar loss and the rest are updated internal states.
        loss = fetched[2]
        self.internal_states = fetched[3:]

        self.logger.debug('Vanilla policy gradient loss = ' + str(loss))
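
As a quick sanity check of the GAE computation shared by all four snippets, the delta and discounting steps can be run on made-up numbers. This reuses the discount sketch shown after Example #1; gamma, gae_lambda, the rewards and the baseline values are arbitrary:

import numpy as np

gamma, gae_lambda = 0.99, 0.97
rewards = np.array([1.0, 0.0, 1.0])
baseline = np.array([0.5, 0.4, 0.6])            # V(s_t) predicted by the baseline model
adjusted_baseline = np.append(baseline, [0.0])  # terminated episode: bootstrap value of 0
deltas = rewards + gamma * adjusted_baseline[1:] - adjusted_baseline[:-1]
advantages = discount(deltas, gamma * gae_lambda)  # discounted sum of the TD errors
print(advantages)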