def __init__(self, base_name, config):
    a2c_common.DiscreteA2CBase.__init__(self, base_name, config)
    obs_shape = self.obs_shape

    # Network build parameters derived from the environment info.
    build_config = {
        'actions_num': self.actions_num,
        'input_shape': obs_shape,
        'num_seqs': self.num_actors * self.num_agents,
        'value_size': self.env_info.get('value_size', 1)
    }
    self.model = self.network.build(build_config)
    self.model.to(self.ppo_device)
    self.init_rnn_from_model(self.model)

    self.last_lr = float(self.last_lr)
    self.optimizer = optim.Adam(self.model.parameters(), float(self.last_lr),
                                eps=1e-08, weight_decay=self.weight_decay)

    # Optional observation normalization (dict observation spaces get a dict-aware normalizer).
    if self.normalize_input:
        if isinstance(self.observation_space, gym.spaces.Dict):
            self.running_mean_std = RunningMeanStdObs(obs_shape).to(self.ppo_device)
        else:
            self.running_mean_std = RunningMeanStd(obs_shape).to(self.ppo_device)

    # Optional centralized value function for multi-agent training.
    if self.has_central_value:
        cv_config = {
            'state_shape': self.state_shape,
            'value_size': self.value_size,
            'ppo_device': self.ppo_device,
            'num_agents': self.num_agents,
            'num_steps': self.steps_num,
            'num_actors': self.num_actors,
            'num_actions': self.actions_num,
            'seq_len': self.seq_len,
            'model': self.central_value_config['network'],
            'config': self.central_value_config,
            'writter': self.writer,
            'multi_gpu': self.multi_gpu
        }
        self.central_value_net = central_value.CentralValueTrain(**cv_config).to(self.ppo_device)

    self.use_experimental_cv = self.config.get('use_experimental_cv', False)
    self.dataset = datasets.PPODataset(self.batch_size, self.minibatch_size, self.is_discrete,
                                       self.is_rnn, self.ppo_device, self.seq_len)
    self.algo_observer.after_init(self)
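# Illustration only (not part of this module): when normalize_input is set above,
# the agent builds a RunningMeanStd / RunningMeanStdObs normalizer from rl_games.
# The minimal sketch below is NOT that implementation; it is a simplified,
# hypothetical stand-in showing the idea: accumulate running batch statistics
# and whiten observations with them. The imports are redundant if torch is
# already imported at module level.
import torch
import torch.nn as nn

class _RunningMeanStdSketch(nn.Module):
    def __init__(self, shape, epsilon=1e-5):
        super().__init__()
        self.register_buffer('mean', torch.zeros(shape))
        self.register_buffer('var', torch.ones(shape))
        self.register_buffer('count', torch.tensor(epsilon))

    def forward(self, obs):
        if self.training:
            # Merge the current batch statistics into the running ones (parallel-variance update).
            batch_mean = obs.mean(dim=0)
            batch_var = obs.var(dim=0, unbiased=False)
            batch_count = obs.shape[0]
            delta = batch_mean - self.mean
            total = self.count + batch_count
            self.mean += delta * batch_count / total
            m_a = self.var * self.count
            m_b = batch_var * batch_count
            self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
            self.count = total
        # Whiten observations with the running statistics.
        return (obs - self.mean) / torch.sqrt(self.var + 1e-5)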
def __init__(self, base_name, params):
    a2c_common.ContinuousA2CBase.__init__(self, base_name, params)
    obs_shape = self.obs_shape

    # Network build parameters derived from the environment info.
    build_config = {
        'actions_num': self.actions_num,
        'input_shape': obs_shape,
        'num_seqs': self.num_actors * self.num_agents,
        'value_size': self.env_info.get('value_size', 1),
        'normalize_value': self.normalize_value,
        'normalize_input': self.normalize_input,
    }
    self.model = self.network.build(build_config)
    self.model.to(self.ppo_device)
    self.states = None

    # Optional exponential moving average of the policy weights (EWMA PPO).
    if self.ewma_ppo:
        self.ewma_model = EwmaModel(self.model, ewma_decay=0.889)

    self.init_rnn_from_model(self.model)
    self.last_lr = float(self.last_lr)
    self.bound_loss_type = self.config.get('bound_loss_type', 'bound')  # 'regularisation' or 'bound'
    self.optimizer = optim.Adam(self.model.parameters(), float(self.last_lr),
                                eps=1e-08, weight_decay=self.weight_decay)

    # Optional centralized value function for multi-agent training.
    if self.has_central_value:
        cv_config = {
            'state_shape': self.state_shape,
            'value_size': self.value_size,
            'ppo_device': self.ppo_device,
            'num_agents': self.num_agents,
            'horizon_length': self.horizon_length,
            'num_actors': self.num_actors,
            'num_actions': self.actions_num,
            'seq_len': self.seq_len,
            'normalize_value': self.normalize_value,
            'network': self.central_value_config['network'],
            'config': self.central_value_config,
            'writter': self.writer,
            'max_epochs': self.max_epochs,
            'multi_gpu': self.multi_gpu
        }
        self.central_value_net = central_value.CentralValueTrain(**cv_config).to(self.ppo_device)

    self.use_experimental_cv = self.config.get('use_experimental_cv', True)
    self.dataset = datasets.PPODataset(self.batch_size, self.minibatch_size, self.is_discrete,
                                       self.is_rnn, self.ppo_device, self.seq_len)

    if self.normalize_value:
        self.value_mean_std = self.central_value_net.model.value_mean_std if self.has_central_value \
            else self.model.value_mean_std

    # Optional phasic policy gradients auxiliary phase.
    if 'phasic_policy_gradients' in self.config:
        self.has_phasic_policy_gradients = True
        self.ppg_aux_loss = ppg_aux.PPGAux(self, self.config['phasic_policy_gradients'])

    # The plain value loss is used unless the value is trained elsewhere
    # (central value network without experimental CV, or the PPG auxiliary phase).
    self.has_value_loss = (self.has_central_value and self.use_experimental_cv) \
        or (not self.has_phasic_policy_gradients and not self.has_central_value)
    self.algo_observer.after_init(self)
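# Illustration only (not part of this module): the EwmaModel built above when
# ewma_ppo is enabled maintains an exponentially-weighted moving average of the
# policy weights (decay 0.889 here). The rl_games implementation does more than
# this; the hedged sketch below only shows the core parameter update one would
# apply after each optimizer step, assuming torch is imported at module level.
def _ewma_update_sketch(ewma_model, online_model, decay=0.889):
    with torch.no_grad():
        for p_ema, p in zip(ewma_model.parameters(), online_model.parameters()):
            # p_ema <- decay * p_ema + (1 - decay) * p
            p_ema.mul_(decay).add_(p, alpha=1.0 - decay)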
def __init__(self, base_name, config):
    a2c_common.ContinuousA2CBase.__init__(self, base_name, config)
    # Convert the observation shape from (W, H, C) to (C, W, H) for the conv network.
    obs_shape = torch_ext.shape_whc_to_cwh(self.obs_shape)

    # Network build parameters derived from the environment info.
    build_config = {
        'actions_num': self.actions_num,
        'input_shape': obs_shape,
        'num_seqs': self.num_actors * self.num_agents,
        'value_size': self.env_info.get('value_size', 1)
    }
    self.model = self.network.build(build_config)
    self.model.to(self.ppo_device)
    self.states = None
    self.init_rnn_from_model(self.model)

    self.last_lr = float(self.last_lr)
    self.optimizer = optim.Adam(self.model.parameters(), float(self.last_lr),
                                eps=1e-07, weight_decay=self.weight_decay)

    # Optional observation normalization.
    if self.normalize_input:
        self.running_mean_std = RunningMeanStd(obs_shape).to(self.ppo_device)

    # Optional centralized value function for multi-agent training.
    if self.has_central_value:
        cv_config = {
            'state_shape': torch_ext.shape_whc_to_cwh(self.state_shape),
            'value_size': self.value_size,
            'ppo_device': self.ppo_device,
            'num_agents': self.num_agents,
            'num_steps': self.steps_num,
            'num_actors': self.num_actors,
            'num_actions': self.actions_num,
            'seq_len': self.seq_len,
            'model': self.central_value_config['network'],
            'config': self.central_value_config,
            'writter': self.writer
        }
        self.central_value_net = central_value.CentralValueTrain(**cv_config).to(self.ppo_device)

    self.use_experimental_cv = self.config.get('use_experimental_cv', True)
    self.dataset = datasets.PPODataset(self.batch_size, self.minibatch_size, self.is_discrete,
                                       self.is_rnn, self.ppo_device, self.seq_len)
    self.algo_observer.after_init(self)
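# Illustration only (not part of this module): these constructors are normally
# not called directly. In rl_games an agent is built and trained through the
# Runner from a training config. A minimal usage sketch, assuming 'config.yaml'
# is a standard rl_games training config (the file name is an assumption, and
# the exact keys expected by Runner.run vary across rl_games versions):
#
#     import yaml
#     from rl_games.torch_runner import Runner
#
#     with open('config.yaml') as f:
#         cfg = yaml.safe_load(f)
#
#     runner = Runner()
#     runner.load(cfg)  # registers the algo/network factories from the config
#     runner.run({'train': True, 'play': False, 'checkpoint': None})
#     # run() instantiates the agent (calling an __init__ like those above) and trains it.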