def _update_if_dataset_is_ready(self):
    dataset_size = (
        sum(len(episode) for episode in self.memory)
        + len(self.last_episode)
        + (0 if self.batch_last_episode is None
           else sum(len(episode) for episode in self.batch_last_episode)))
    if dataset_size >= self.update_interval:
        self._flush_last_episode()
        if self.recurrent:
            dataset = _make_dataset_recurrent(
                episodes=self.memory,
                model=self.model,
                phi=self.phi,
                batch_states=self.batch_states,
                obs_normalizer=self.obs_normalizer,
                gamma=self.gamma,
                lambd=self.lambd,
                max_recurrent_sequence_len=self.max_recurrent_sequence_len,
            )
            self._update_recurrent(dataset)
        else:
            dataset = _make_dataset(
                episodes=self.memory,
                model=self.model,
                phi=self.phi,
                batch_states=self.batch_states,
                obs_normalizer=self.obs_normalizer,
                gamma=self.gamma,
                lambd=self.lambd,
            )
            assert len(dataset) == dataset_size
            self._update(dataset)
        self.explained_variance = _compute_explained_variance(
            list(itertools.chain.from_iterable(self.memory)))
        self.memory = []
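# The variant above is essentially ChainerRL's base PPO method. Below is a
# minimal sketch of what _compute_explained_variance measures:
# 1 - Var(returns - value predictions) / Var(returns). It assumes each
# transition dict carries 'v_teacher' (return target) and 'v_pred' keys, as
# in ChainerRL's PPO dataset; treat it as illustrative, not the exact source.
import numpy as np

def _compute_explained_variance(transitions):
    t = np.array([tr['v_teacher'] for tr in transitions])
    y = np.array([tr['v_pred'] for tr in transitions])
    var_t = np.var(t)
    return float('nan') if var_t == 0 else float(1 - np.var(t - y) / var_t)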
def _update_if_dataset_is_ready(self):  # override func
    dataset_size = (
        sum(len(episode) for episode in self.memory)
        + len(self.last_episode)
        + (0 if self.batch_last_episode is None
           else sum(len(episode) for episode in self.batch_last_episode)))
    if dataset_size >= self.update_interval:
        self._flush_last_episode()
        # Update rewards in self.memory with discriminator outputs.
        transitions = list(chain(*self.memory))
        # Get the agent's states and actions. Each list should be
        # update_interval long.
        saved_states = [
            transition['state'][None] for transition in transitions
        ]
        saved_actions = [
            transition['action'][None] for transition in transitions
        ]
        # Create state-action pairs, i.e. append a one-hot encoding of the
        # action to each state. Each pair is n_historical_events + 2 long
        # for a binary action, i.e. buy/not buy.
        state_action = []
        for state, action in zip(saved_states, saved_actions):
            one_hot = np.array([1, 0]) if action == 0 else np.array([0, 1])
            state_action.append(np.append(state, one_hot).reshape((-1, 1)))
        # Get rewards for all state-action pairs.
        with chainer.configuration.using_config(
                'train', False), chainer.no_backprop_mode():
            rewards = self.discriminator.get_rewards(
                self.xp.asarray(
                    [s_a.T.astype('float32') for s_a in state_action])).array
        self.reward_mean_record.append(float(np.mean(rewards)))
        i = 0
        for episode in self.memory:
            for transition in episode:
                transition['reward'] = float(rewards[i])
                i += 1
        assert self.memory[0][0]['reward'] == float(rewards[0]), \
            'rewards is not replaced.'
        dataset = _make_dataset(
            episodes=self.memory,
            model=self.model,
            phi=self.phi,
            batch_states=self.batch_states,
            obs_normalizer=self.obs_normalizer,
            gamma=self.gamma,
            lambd=self.lambd,
        )
        assert len(dataset) == dataset_size
        self._update(dataset)
        self.memory = []
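# Illustrative toy run of the state-action construction above for a binary
# (buy / not buy) action: the one-hot action is appended to the flattened
# state, giving a column vector of length n_historical_events + 2. All
# values here are made up.
import numpy as np

state = np.arange(4, dtype=np.float32)[None]       # (1, 4) toy state
action = 1                                         # discrete action in {0, 1}
one_hot = np.array([1, 0]) if action == 0 else np.array([0, 1])
s_a = np.append(state, one_hot).reshape((-1, 1))   # shape (6, 1)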
def _update_if_dataset_is_ready(self):  # override func
    dataset_size = (
        sum(len(episode) for episode in self.memory)
        + len(self.last_episode)
        + (0 if self.batch_last_episode is None
           else sum(len(episode) for episode in self.batch_last_episode)))
    if dataset_size >= self.update_interval:
        self._flush_last_episode()
        # Update rewards in self.memory with discriminator outputs.
        transitions = list(chain.from_iterable(self.memory))
        states = self.xp.asarray(
            np.concatenate(
                [transition['state'][None] for transition in transitions]))
        actions = self.xp.asarray(
            np.concatenate(
                [transition['action'][None] for transition in transitions]))
        with chainer.configuration.using_config(
                'train', False), chainer.no_backprop_mode():
            D_outputs = self.discriminator.get_rewards(
                self.convert_data_to_feed_discriminator(states,
                                                        actions)).array
        self.D_output_mean.append(float(np.mean(D_outputs)))
        mod_rewards_temp = []
        rewards_temp = []
        i = 0
        for episode in self.memory:
            for transition in episode:
                # Record the original environment reward for logging, then
                # overwrite it with the discriminator output and record the
                # modified reward as well.
                rewards_temp.append(float(transition['reward']))
                transition['reward'] = float(D_outputs[i])
                mod_rewards_temp.append(transition['reward'])
                i += 1
        dataset = _make_dataset(
            episodes=self.memory,
            model=self.model,
            phi=self.phi,
            batch_states=self.batch_states,
            obs_normalizer=self.obs_normalizer,
            gamma=self.gamma,
            lambd=self.lambd,
        )
        assert len(dataset) == dataset_size
        self._update(dataset)
        self.memory = []
        self.mod_rewards.append(float(np.mean(mod_rewards_temp)))
        self.rewards.append(float(np.mean(rewards_temp)))
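# convert_data_to_feed_discriminator is defined elsewhere in the class; a
# plausible minimal sketch, assuming the discriminator consumes rows of
# concatenated (state, one-hot action) features. The name of the n_actions
# parameter and the one-hot encoding are assumptions, not the actual
# implementation.
def convert_data_to_feed_discriminator(self, states, actions, n_actions=2):
    xp = self.xp
    # One-hot encode the integer actions, then stack them next to the states.
    one_hot = xp.eye(n_actions, dtype=xp.float32)[actions.astype(xp.int32)]
    return xp.concatenate((states.astype(xp.float32), one_hot), axis=1)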
def _update_if_dataset_is_ready(self):  # override func
    dataset_size = (
        sum(len(episode) for episode in self.memory)
        + len(self.last_episode)
        + (0 if self.batch_last_episode is None
           else sum(len(episode) for episode in self.batch_last_episode)))
    if dataset_size >= self.update_interval:
        self._flush_last_episode()
        # Update rewards in self.memory with discriminator outputs.
        transitions = list(chain.from_iterable(self.memory))
        states = self.xp.asarray(
            np.concatenate(
                # [None] adds a leading batch axis so the per-transition
                # states concatenate into one (N, ...) array.
                [transition['state'][None] for transition in transitions]))
        actions = self.xp.asarray(
            np.concatenate(
                [transition['action'][None] for transition in transitions]))
        with chainer.configuration.using_config(
                'train', False), chainer.no_backprop_mode():
            rewards = self.discriminator.get_rewards(
                self.convert_data_to_feed_discriminator(states,
                                                        actions)).array
        self.reward_mean_record.append(float(np.mean(rewards)))
        i = 0
        for episode in self.memory:
            for transition in episode:
                transition['reward'] = float(rewards[i])
                i += 1
        if self.recurrent:
            raise NotImplementedError
        else:
            dataset = _make_dataset(
                episodes=self.memory,
                model=self.model,
                phi=self.phi,
                batch_states=self.batch_states,
                obs_normalizer=self.obs_normalizer,
                gamma=self.gamma,
                lambd=self.lambd,
            )
            assert len(dataset) == dataset_size
            self._update(dataset)
        self.memory = []
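# Hedged sketch of what _make_dataset does with gamma and lambd: generalized
# advantage estimation (GAE) plus value targets, computed per episode. The
# transition keys ('v_pred', 'next_v_pred', 'nonterminal') follow ChainerRL's
# PPO conventions; the details are illustrative rather than the exact source.
def add_advantage_and_value_target(episode, gamma, lambd):
    adv = 0.0
    for transition in reversed(episode):
        # One-step TD error, masked at terminal transitions.
        td_err = (transition['reward']
                  + gamma * transition['nonterminal'] * transition['next_v_pred']
                  - transition['v_pred'])
        # Exponentially weighted sum of TD errors (the GAE recursion).
        adv = td_err + gamma * lambd * adv
        transition['adv'] = adv
        transition['v_teacher'] = adv + transition['v_pred']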
def _update_if_dataset_is_ready(self):  # override func
    dataset_size = (
        sum(len(episode) for episode in self.memory)
        + len(self.last_episode)
        + (0 if self.batch_last_episode is None
           else sum(len(episode) for episode in self.batch_last_episode)))
    if dataset_size >= self.update_interval:
        self._flush_last_episode()
        # Update rewards in self.memory with discriminator outputs.
        transitions = list(chain(*self.memory))
        with chainer.configuration.using_config(
                'train', False), chainer.no_backprop_mode():
            rewards = self.discriminator.get_rewards(
                self.xp.asarray(
                    np.concatenate([
                        # [None] adds a leading batch axis around each state.
                        transition['state'][None]
                        for transition in transitions
                    ]))).array
        self.reward_mean_record.append(float(np.mean(rewards)))
        i = 0
        for episode in self.memory:
            for transition in episode:
                transition['reward'] = float(rewards[i])
                i += 1
        assert self.memory[0][0]['reward'] == float(rewards[0]), \
            'rewards is not replaced.'
        dataset = _make_dataset(
            episodes=self.memory,
            model=self.model,
            phi=self.phi,
            batch_states=self.batch_states,
            obs_normalizer=self.obs_normalizer,
            gamma=self.gamma,
            lambd=self.lambd,
        )
        assert len(dataset) == dataset_size
        self._update(dataset)
        self.memory = []
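# The discriminator itself is defined outside these snippets. A minimal
# sketch of the interface used above, assuming a GAIL-style reward
# r(x) = -log(1 - D(x)), which simplifies to softplus of the logit; the
# class name, architecture, and sizes are illustrative assumptions.
import chainer
import chainer.functions as F
import chainer.links as L

class Discriminator(chainer.Chain):
    def __init__(self, in_size, hidden=64):
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(in_size, hidden)
            self.l2 = L.Linear(hidden, 1)

    def __call__(self, x):
        # Raw logit z; D(x) = sigmoid(z) is only formed implicitly below.
        return self.l2(F.relu(self.l1(x)))

    def get_rewards(self, x):
        # -log(1 - sigmoid(z)) == softplus(z)
        return F.softplus(self(x))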