Example #1
    def _update_if_dataset_is_ready(self):
        dataset_size = (
            sum(len(episode) for episode in self.memory)
            + len(self.last_episode)
            + (0 if self.batch_last_episode is None else sum(
                len(episode) for episode in self.batch_last_episode)))
        if dataset_size >= self.update_interval:
            self._flush_last_episode()
            if self.recurrent:
                dataset = _make_dataset_recurrent(
                    episodes=self.memory,
                    model=self.model,
                    phi=self.phi,
                    batch_states=self.batch_states,
                    obs_normalizer=self.obs_normalizer,
                    gamma=self.gamma,
                    lambd=self.lambd,
                    max_recurrent_sequence_len=self.max_recurrent_sequence_len,
                )
                self._update_recurrent(dataset)
            else:
                dataset = _make_dataset(
                    episodes=self.memory,
                    model=self.model,
                    phi=self.phi,
                    batch_states=self.batch_states,
                    obs_normalizer=self.obs_normalizer,
                    gamma=self.gamma,
                    lambd=self.lambd,
                )
                assert len(dataset) == dataset_size
                self._update(dataset)
            self.explained_variance = _compute_explained_variance(
                list(itertools.chain.from_iterable(self.memory)))
            self.memory = []
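
Example #1 stores an explained-variance diagnostic after each update, but the helper itself is not part of the listing. Below is a minimal sketch of how such a statistic is commonly computed, assuming each transition carries the critic's prediction under a 'v_pred' key and the value target under a 'v_teacher' key (both key names are assumptions, not taken from the code above).

import numpy as np

def _compute_explained_variance(transitions):
    # 1 - Var(target - prediction) / Var(target): 1.0 means the critic
    # explains the value targets perfectly, 0.0 means it does no better
    # than predicting the mean target.
    targets = np.array([t['v_teacher'] for t in transitions], dtype=np.float64)
    preds = np.array([t['v_pred'] for t in transitions], dtype=np.float64)
    target_var = np.var(targets)
    if target_var == 0:
        return np.nan
    return float(1.0 - np.var(targets - preds) / target_var)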
Example #2
    def _update_if_dataset_is_ready(self):
        # override func
        dataset_size = (sum(len(episode) for episode in self.memory) +
                        len(self.last_episode) +
                        (0 if self.batch_last_episode is None else sum(
                            len(episode)
                            for episode in self.batch_last_episode)))
        if dataset_size >= self.update_interval:
            self._flush_last_episode()

            # update reward in self.memory
            transitions = list(chain(*self.memory))

            # Get agent's states and actions. Each list should be update_interval long
            saved_states = [
                transition['state'][None] for transition in transitions
            ]
            saved_actions = [
                transition['action'][None] for transition in transitions
            ]

            # Create state-action pairs by appending a one-hot encoding of the
            # discrete buy/not-buy action to each state, so each pair is the
            # state (n historical events) plus the action encoding.
            state_action = []
            for state, action in zip(saved_states, saved_actions):
                action = np.array([1, 0]) if action == 0 else np.array([0, 1])  # one-hot encode the action
                array = np.append(state, action)
                state_action.append(array.reshape((-1, 1)))

            # Get rewards for all s-a pairs
            with chainer.configuration.using_config(
                    'train', False), chainer.no_backprop_mode():
                rewards = self.discriminator.get_rewards(
                    self.xp.asarray([
                        s_a.T.astype('float32') for s_a in state_action
                    ])).array
                #rewards = self.discriminator.get_rewards(state_action.T.astype('float32')).data

            self.reward_mean_record.append(float(np.mean(rewards)))

            i = 0
            for episode in self.memory:
                for transition in episode:
                    transition['reward'] = float(rewards[i])
                    i += 1
            assert self.memory[0][0]['reward'] == float(
                rewards[0]), 'rewards is not replaced.'

            dataset = _make_dataset(
                episodes=self.memory,
                model=self.model,
                phi=self.phi,
                batch_states=self.batch_states,
                obs_normalizer=self.obs_normalizer,
                gamma=self.gamma,
                lambd=self.lambd,
            )
            assert len(dataset) == dataset_size
            self._update(dataset)
            self.memory = []
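
The per-transition loop above builds the state-action pairs one at a time. A vectorized sketch under the same assumptions (a discrete buy/not-buy action encoded one-hot via np.eye, states stacked row-wise; the function name and the n_actions parameter are illustrative only):

import numpy as np

def make_state_action_pairs(states, actions, n_actions=2):
    # states: (N, state_dim) array or list of per-transition states,
    # actions: (N,) integer action indices.
    states = np.asarray(states, dtype=np.float32).reshape(len(actions), -1)
    one_hot = np.eye(n_actions, dtype=np.float32)[np.asarray(actions, dtype=np.int64)]
    # One row per transition: the state followed by its one-hot action.
    return np.concatenate([states, one_hot], axis=1)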
Example #3
    def _update_if_dataset_is_ready(self):
        # override func
        dataset_size = (sum(len(episode) for episode in self.memory) +
                        len(self.last_episode) +
                        (0 if self.batch_last_episode is None else sum(
                            len(episode)
                            for episode in self.batch_last_episode)))
        if dataset_size >= self.update_interval:
            # update reward in self.memory
            self._flush_last_episode()
            transitions = list(chain.from_iterable(self.memory))
            states = self.xp.asarray(
                np.concatenate(
                    [transition['state'][None] for transition in transitions]))
            actions = self.xp.asarray(
                np.concatenate([
                    transition['action'][None] for transition in transitions
                ]))
            with chainer.configuration.using_config(
                    'train', False), chainer.no_backprop_mode():
                D_outputs = self.discriminator.get_rewards(
                    self.convert_data_to_feed_discriminator(states,
                                                            actions)).array

            self.D_output_mean.append(float(np.mean(D_outputs)))

            s_a = np.concatenate((states, actions.reshape((-1, 1))), axis=1)  # state-action matrix (not used below)

            mod_rewards_temp = []
            rewards_temp = []
            i = 0
            for episode in self.memory:
                for transition in episode:
                    rewards_temp.append(float(transition['reward']))  # environment reward
                    transition['reward'] = float(D_outputs[i])
                    mod_rewards_temp.append(transition['reward'])  # discriminator reward
                    i += 1
            dataset = _make_dataset(
                episodes=self.memory,
                model=self.model,
                phi=self.phi,
                batch_states=self.batch_states,
                obs_normalizer=self.obs_normalizer,
                gamma=self.gamma,
                lambd=self.lambd,
            )
            #dataset = self._make_dataset()
            assert len(dataset) == dataset_size
            self._update(dataset)
            self.memory = []

            self.mod_rewards.append(float(np.mean(mod_rewards_temp)))
            self.rewards.append(float(np.mean(rewards_temp)))
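
Examples #2 through #5 all write the discriminator outputs back into memory with a manual index counter. A small helper sketch that does the same thing with zip over the flattened episodes (the function name is illustrative; it assumes rewards is a flat array aligned with the transitions in episode order):

from itertools import chain

def replace_rewards(memory, rewards):
    # Overwrite each transition's reward with the corresponding
    # discriminator output, in episode order.
    transitions = list(chain.from_iterable(memory))
    assert len(transitions) == len(rewards), 'one reward per transition expected'
    for transition, reward in zip(transitions, rewards):
        transition['reward'] = float(reward)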
Example #4
    def _update_if_dataset_is_ready(self):
        # override func
        dataset_size = (sum(len(episode) for episode in self.memory) +
                        len(self.last_episode) +
                        (0 if self.batch_last_episode is None else sum(
                            len(episode)
                            for episode in self.batch_last_episode)))

        if dataset_size >= self.update_interval:
            self._flush_last_episode()

            # update reward in self.memory
            transitions = list(chain.from_iterable(self.memory))
            states = self.xp.asarray(
                np.concatenate([
                    transition['state'][None] for transition in transitions
                ]))  # [None] adds a leading batch axis so the states stack into one array
            actions = self.xp.asarray(
                np.concatenate([
                    transition['action'][None] for transition in transitions
                ]))

            with chainer.configuration.using_config(
                    'train', False), chainer.no_backprop_mode():
                rewards = self.discriminator.get_rewards(
                    self.convert_data_to_feed_discriminator(states,
                                                            actions)).array

            self.reward_mean_record.append(float(np.mean(rewards)))
            i = 0
            for episode in self.memory:
                for transition in episode:
                    transition['reward'] = float(rewards[i])
                    i += 1

            if self.recurrent:
                raise NotImplementedError
            else:
                dataset = _make_dataset(
                    episodes=self.memory,
                    model=self.model,
                    phi=self.phi,
                    batch_states=self.batch_states,
                    obs_normalizer=self.obs_normalizer,
                    gamma=self.gamma,
                    lambd=self.lambd,
                )
                assert len(dataset) == dataset_size
                self._update(dataset)
            self.memory = []
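
Examples #2 through #5 call self.discriminator.get_rewards(...), but the discriminator itself is not shown. A minimal Chainer sketch of one common GAIL-style formulation, where the reward is derived from the discriminator's logit as -log(1 - sigmoid(logit)); the architecture, layer sizes, and the epsilon term are assumptions, not the code used by these examples:

import chainer
import chainer.functions as F
import chainer.links as L

class Discriminator(chainer.Chain):
    def __init__(self, in_size, hidden_size=64):
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(in_size, hidden_size)
            self.l2 = L.Linear(hidden_size, 1)

    def __call__(self, state_action):
        # Logit of the probability that (s, a) comes from the expert data.
        return self.l2(F.relu(self.l1(state_action)))

    def get_rewards(self, state_action):
        # GAIL-style reward: -log(1 - D(s, a)), larger when the
        # discriminator believes the pair looks like expert behaviour.
        d = F.sigmoid(self(state_action))
        return -F.log(1.0 - d + 1e-8)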
Example #5
    def _update_if_dataset_is_ready(self):
        # override func
        dataset_size = (sum(len(episode) for episode in self.memory) +
                        len(self.last_episode) +
                        (0 if self.batch_last_episode is None else sum(
                            len(episode)
                            for episode in self.batch_last_episode)))
        if dataset_size >= self.update_interval:
            self._flush_last_episode()

            # update reward in self.memory
            transitions = list(chain(*self.memory))
            with chainer.configuration.using_config(
                    'train', False), chainer.no_backprop_mode():
                rewards = self.discriminator.get_rewards(
                    self.xp.asarray(
                        np.concatenate([
                            transition['state'][None]  # add a leading batch axis
                            for transition in transitions
                        ]))).array
            self.reward_mean_record.append(float(np.mean(rewards)))

            i = 0
            for episode in self.memory:
                for transition in episode:
                    transition['reward'] = float(rewards[i])
                    i += 1
            assert self.memory[0][0]['reward'] == float(
                rewards[0]), 'rewards is not replaced.'

            dataset = _make_dataset(
                episodes=self.memory,
                model=self.model,
                phi=self.phi,
                batch_states=self.batch_states,
                obs_normalizer=self.obs_normalizer,
                gamma=self.gamma,
                lambd=self.lambd,
            )
            assert len(dataset) == dataset_size
            self._update(dataset)
            self.memory = []
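
A side note on the `[None]` indexing that appears in Examples #2, #4, and #5: `transition['state'][None]` adds a leading axis of length 1, so concatenating the results stacks the states into a single batch. np.stack over the raw states produces the same array, as this small self-contained check shows:

import numpy as np

states = [np.arange(3, dtype=np.float32) + i for i in range(4)]  # four toy states
batched_a = np.concatenate([s[None] for s in states])  # [None] -> shape (1, 3) each
batched_b = np.stack(states)                           # same (4, 3) batch
assert np.array_equal(batched_a, batched_b)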