def __init__(self, config: Config) -> None:
    super().__init__()
    self.hparams = config
    self.env = env_selector(
        self.hparams
    )  # TODO: normalization is not required, but will it be needed?
    self.eval_env = env_selector(self.hparams, config.seed + 1)
    self.Da = self.env.action_space.flat_dim
    self.Do = self.env.observation_space.flat_dim  # includes the skill when the env is option-wrapped

    # ValueFunction builds an MLP with ReLU hidden non-linearities, no output
    # non-linearity, Xavier-initialized weights and zero-initialized biases.
    self.qf = ValueFunction(self.Do + self.Da,
                            [config.layer_size, config.layer_size])
    self.vf = ValueFunction(self.Do, [config.layer_size, config.layer_size])
    self.vf_target = ValueFunction(self.Do,
                                   [config.layer_size, config.layer_size])
    self.vf_target.load_state_dict(self.vf.state_dict())

    # Replay buffer storing (state + skill, action) transitions.
    self.pool = SimpleReplayBuffer(
        env_spec=self.env.spec,
        max_replay_buffer_size=config.max_pool_size,
    )

    # GMM policy with K mixture components, no reparametrization trick, and regularization.
    self.policy = GMMPolicy(
        env_spec=self.env.spec,
        K=config.K,
        hidden_layer_sizes=[config.layer_size, config.layer_size],
        qf=self.qf,
        reg=config.reg,
        device=self.hparams.device
    )

    self.modules = [
        "Policy", self.policy,
        "QF", self.qf,
        "VF", self.vf,
        "VF_Target", self.vf_target
    ]
    # TODO: add an assertion that the policy's qf and the model's qf are the same object.

    self.sampler = Sampler(self.env, config.max_path_length)
    self._policy_lr = config.lr
    self._qf_lr = config.lr
    self._vf_lr = config.lr
    # TODO: fix variable naming with a leading underscore
    self._scale_reward = config.scale_reward
    self._discount = config.discount
    self._tau = config.tau
    self.max_path_return = -np.inf
    self.last_path_return = 0
    self.val_path_return = 0
    self._scale_entropy = config.scale_entropy
    self._save_full_state = config.save_full_state

    # Runs on CPU: initial sampling was moved to on_train_start to avoid a bug in DIAYN and
    # to use the GPU instead of the CPU (no device logic needed), since the trainer transfers
    # the models to the GPU only after the LightningModule's __init__ has run.
    # This is also why the wandb logger is not available here.
    # TODO: remove the device logic in Policy.
    self.batch_idx = None
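# NOTE: `self._tau` and `self.vf_target` above imply a soft (Polyak-averaged) target update
# somewhere in the training step. The helper below is a minimal sketch of the standard form
# of that update, with a hypothetical name and call site; it is not the module's actual
# implementation.
import torch
import torch.nn as nn


def soft_update(target: nn.Module, source: nn.Module, tau: float) -> None:
    """Polyak-average the source parameters into the target network:
    target <- (1 - tau) * target + tau * source, applied parameter-wise."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)


# Hypothetical usage at the end of a training step:
# soft_update(self.vf_target, self.vf, self._tau)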
def __init__(self, config: Config) -> None:
    super().__init__(config)
    self.z = 0
    self._num_skills = self.hparams.num_skills
    self.env.set_reward_fn(HERF())
    self.eval_env.set_reward_fn(HERF())
    self.env.reset(state=None, skill=self.z)
    self.eval_env.reset(state=None, skill=self.z)
    self.batch_return = 0
    self.single_skill = config.single_skill
    self.double_skill = config.double_skill
    self.skilldata_val = [[] for i in range(self._num_skills)]
    if self.single_skill is None:
        # One evaluation environment per skill.
        self.batch_env = [
            env_selector(self.hparams, config.seed + 1)
            for i in range(self._num_skills)
        ]
        for i in range(self._num_skills):
            self.batch_env[i].set_reward_fn(HERF())
            self.batch_env[i].reset(skill=i)
    # TODO: HERF only supports up to 25 skills and uses a modulo beyond that.
    self.discriminator = Discriminator(
        self.Do - self._num_skills,
        [config.layer_size, config.layer_size],
        self._num_skills)
    self.distiller = [
        Discriminator(
            self.Do - self._num_skills,
            [self.hparams.disc_size[i], self.hparams.disc_size[i]],
            self._num_skills)
        for i in range(len(self.hparams.disc_size))
    ]
    self.sampler.reset()
    # Uniform prior p(z) over skills.
    self._p_z = torch.FloatTensor(
        np.full(self._num_skills, 1.0 / self._num_skills))
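# NOTE: the discriminator and the uniform skill prior `self._p_z` are the ingredients of the
# DIAYN-style intrinsic reward log q(z | s) - log p(z). The helper below is a minimal sketch
# of that computation, assuming `Discriminator` returns unnormalized logits over skills; the
# names and call signature are illustrative, not the repository's API.
import torch
import torch.nn.functional as F


def diayn_reward(disc_logits: torch.Tensor, z: torch.Tensor, p_z: torch.Tensor) -> torch.Tensor:
    """Intrinsic reward log q(z|s) - log p(z) for a batch of states.

    disc_logits: (batch, num_skills) unnormalized discriminator outputs.
    z:           (batch,) integer skill labels the states were collected with.
    p_z:         (num_skills,) prior over skills (uniform above).
    """
    log_q_z = F.log_softmax(disc_logits, dim=-1).gather(1, z.unsqueeze(1)).squeeze(1)
    log_p_z = torch.log(p_z[z])
    return log_q_z - log_p_z


# Hypothetical usage:
# logits = self.discriminator(obs_without_skill)
# reward = diayn_reward(logits, skill_ids, self._p_z)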
def __init__(self, config: Config) -> None:
    super().__init__()
    self.hparams = config
    self.env = env_selector(self.hparams)
    self.Da = self.env.action_space.flat_dim
    self.Do = self.env.observation_space.flat_dim

    # ValueFunction builds an MLP with ReLU hidden non-linearities, no output
    # non-linearity, Xavier-initialized weights and zero-initialized biases.
    self.q1 = ValueFunction(self.Do + self.Da,
                            [config.layer_size, config.layer_size])
    self.q2 = ValueFunction(self.Do + self.Da,
                            [config.layer_size, config.layer_size])
    self.q1_target = ValueFunction(self.Do + self.Da,
                                   [config.layer_size, config.layer_size])
    self.q2_target = ValueFunction(self.Do + self.Da,
                                   [config.layer_size, config.layer_size])
    self.stage = None

    # Replay buffers storing (state + skill, action) transitions.
    self.pool_train = SimpleReplayBuffer(
        env_spec=self.env.spec,
        max_replay_buffer_size=config.max_pool_size,
    )
    self.pool_val = SimpleReplayBuffer(
        env_spec=self.env.spec,
        max_replay_buffer_size=config.max_pool_size,
    )

    # GMM policy with K mixture components, the reparametrization trick, and regularization.
    # TODO: pass both Q functions so the policy can be used in deterministic mode.
    self.policy = GMMPolicy(
        env_spec=self.env.spec,
        K=config.K,
        hidden_layer_sizes=[config.layer_size, config.layer_size],
        qf=self.q1_target,
        reg=config.reg,
        device=self.hparams.device,
        reparametrization=True
    )
    # TODO: add an assertion that the policy's qf and the model's qf are the same object.

    self._policy_lr = config.lr
    self._qf_lr = config.lr
    self._vf_lr = config.lr
    # TODO: fix variable naming with a leading underscore
    self._scale_reward = config.scale_reward
    self._discount = config.discount
    self._tau = config.tau
    self.max_path_return = -np.inf
    self.last_path_return = 0
    self.val_path_return = 0
    self._scale_entropy = config.scale_entropy
    self._save_full_state = config.save_full_state
    self.modules = [
        "Policy", self.policy,
        "Q1", self.q1,
        "Q2", self.q2,
        "Q1_target", self.q1_target,
        "Q2_target", self.q2_target
    ]
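# NOTE: with two target critics, the usual twin-Q (clipped double-Q) trick takes the
# element-wise minimum of `q1_target` and `q2_target` when forming the soft Bellman target.
# The sketch below assumes the critics take the state-action pair concatenated along the
# last dimension (consistent with the `self.Do + self.Da` input size above); it is
# illustrative, not the module's actual training step.
import torch


def soft_q_target(q1_target, q2_target, next_obs, next_actions, next_log_probs,
                  rewards, dones, discount, scale_entropy):
    """Clipped double-Q soft Bellman target: r + (1 - done) * gamma * (min Q' - alpha * log pi)."""
    with torch.no_grad():
        next_inputs = torch.cat([next_obs, next_actions], dim=-1)
        min_q_next = torch.min(q1_target(next_inputs), q2_target(next_inputs))
        soft_value = min_q_next - scale_entropy * next_log_probs
        return rewards + (1.0 - dones) * discount * soft_value


# Hypothetical usage inside a training step:
# target = soft_q_target(self.q1_target, self.q2_target, next_obs, next_actions,
#                        next_log_probs, rewards, dones, self._discount, self._scale_entropy)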
def __init__(self, config: Config) -> None:
    super().__init__()
    self.hparams = config
    self.env = env_selector(self.hparams)  # TODO: ensure normalization is not required
    self.eval_env = env_selector(self.hparams, config.seed + 1)
    # TODO: add functionality to option-wrap the env for DIAYN
    # TODO: check all config names to ensure they are in the dict
    self.Da = self.env.action_space.flat_dim
    self.Do = self.env.observation_space.flat_dim

    # ValueFunction builds an MLP with ReLU hidden non-linearities, no output
    # non-linearity, Xavier-initialized weights and zero-initialized biases.
    self.qf = ValueFunction(self.Do + self.Da,
                            [config.layer_size, config.layer_size])
    self.vf = ValueFunction(self.Do, [config.layer_size, config.layer_size])
    self.vf_target = ValueFunction(self.Do,
                                   [config.layer_size, config.layer_size])
    self.vf_target.load_state_dict(self.vf.state_dict())

    # Replay buffer storing (state + skill, action) transitions.
    self.pool = SimpleReplayBuffer(
        env_spec=self.env.spec,
        max_replay_buffer_size=config.max_pool_size,
    )

    # GMM policy with K mixture components, no reparametrization trick, and regularization.
    self.policy = GMMPolicy(
        env_spec=self.env.spec,
        K=config.K,
        hidden_layer_sizes=[config.layer_size, config.layer_size],
        qf=self.qf,
        reg=config.reg,
        device="cpu"
    )
    # self.policy.cuda(config.device)
    # self.vf.cuda(config.device)
    # self.qf.cuda(config.device)
    # self.vf_target.cuda(config.device)
    # TODO: add an assertion that the policy's qf and the model's qf are the same object.

    self.sampler = Sampler(self.env, config.max_path_length)
    self._policy_lr = config.lr
    self._qf_lr = config.lr
    self._vf_lr = config.lr
    # TODO: fix variable naming with a leading underscore
    self._scale_reward = config.scale_reward
    self._discount = config.discount
    self._tau = config.tau
    self.max_path_return = -np.inf
    self.last_path_return = 0
    self.val_path_return = 0
    self._scale_entropy = config.scale_entropy
    self._save_full_state = config.save_full_state

    # self.z = self.get_best_skill(self.policy, self.env, self.config.num_skills, self.config.max_path_length)
    # self.env.reset(None, self.z)

    # Runs on CPU: the trainer transfers the models to the GPU only after the LightningModule's
    # __init__ has run. This is also why the wandb logger is not available here.
    self.pool.add_samples(self.sampler.sample(config.min_pool_size, self.policy))

    # self.optimizers = []
    # TODO: when combining vf and policy, figure out a more elegant way to decouple the learning
    # rates than a multiplicative factor in the loss sum. Also figure out why keeping them
    # separate does not increase compute time as expected.
    self.optimizer_policy = optim.Adam(list(self.policy.parameters())
                                       # + list(self.vf.parameters())
                                       , lr=self._policy_lr)
    self.optimizer_vf = optim.Adam(self.vf.parameters(), lr=self._vf_lr)
    self.optimizer_qf = optim.Adam(self.qf.parameters(), lr=self._qf_lr)
    self.optimizer = optim.Adam(list(self.policy.parameters())
                                + list(self.vf.parameters())
                                + list(self.qf.parameters()),
                                lr=self._policy_lr)
def __init__(self, config: Config) -> None:
    super().__init__()
    self.hparams = config
    self.env = env_selector(
        self.hparams
    )  # TODO: normalization is not required, but will it be needed?
    self.eval_env = env_selector(self.hparams, config.seed + 1)
    # TODO: check all config names to ensure they are in the dict
    self.Da = self.env.action_space.flat_dim
    self.Do = self.env.observation_space.flat_dim

    # ValueFunction builds an MLP with ReLU hidden non-linearities, no output
    # non-linearity, Xavier-initialized weights and zero-initialized biases.
    self.q1 = ValueFunction(self.Do + self.Da,
                            [config.layer_size, config.layer_size])
    self.q2 = ValueFunction(self.Do + self.Da,
                            [config.layer_size, config.layer_size])
    self.q1_target = ValueFunction(self.Do + self.Da,
                                   [config.layer_size, config.layer_size])
    self.q2_target = ValueFunction(self.Do + self.Da,
                                   [config.layer_size, config.layer_size])
    self.q1_target.load_state_dict(self.q1.state_dict())
    self.q2_target.load_state_dict(self.q2.state_dict())

    # Replay buffer storing (state + skill, action) transitions.
    self.pool = SimpleReplayBuffer(
        env_spec=self.env.spec,
        max_replay_buffer_size=config.max_pool_size,
    )

    # GMM policy with K mixture components, the reparametrization trick, and regularization.
    # TODO: pass both Q functions so the policy can be used in deterministic mode.
    self.policy = GMMPolicy(
        env_spec=self.env.spec,
        K=config.K,
        hidden_layer_sizes=[config.layer_size, config.layer_size],
        qf=self.q1_target,
        reg=config.reg,
        device=self.hparams.device,
        reparametrization=True
    )
    # TODO: add an assertion that the policy's qf and the model's qf are the same object.

    self.sampler = Sampler(self.env, config.max_path_length)
    self._policy_lr = config.lr
    self._qf_lr = config.lr
    self._vf_lr = config.lr
    # TODO: fix variable naming with a leading underscore
    self._scale_reward = config.scale_reward
    self._discount = config.discount
    self._tau = config.tau
    self.max_path_return = -np.inf
    self.last_path_return = 0
    self.val_path_return = 0
    self._scale_entropy = config.scale_entropy
    self._save_full_state = config.save_full_state
    self.modules = [
        "Policy", self.policy,
        "Q1", self.q1,
        "Q2", self.q2,
        "Q1_target", self.q1_target,
        "Q2_target", self.q2_target
    ]

    # self.z = self.get_best_skill(self.policy, self.env, self.config.num_skills, self.config.max_path_length)
    # self.env.reset(None, self.z)

    # Runs on CPU: the trainer transfers the models to the GPU only after the LightningModule's
    # __init__ has run. This is also why the wandb logger is not available here.
    self.batch_idx = None
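# NOTE: every constructor above reads the same attributes off `config`. Assuming `Config` is a
# plain attribute container (the real one may be built from a YAML file or CLI args, and
# `env_selector` presumably reads further fields, such as the environment name, that do not
# appear in this excerpt), a minimal stand-in with the fields referenced above is:
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Config:
    # Placeholder defaults for illustration only, not the repository's values.
    seed: int = 0
    device: str = "cpu"
    layer_size: int = 256
    max_pool_size: int = 1_000_000
    min_pool_size: int = 1000
    max_path_length: int = 1000
    K: int = 4                        # number of GMM mixture components
    reg: float = 1e-3                 # policy regularization weight
    lr: float = 3e-4
    scale_reward: float = 1.0
    scale_entropy: float = 1.0
    discount: float = 0.99
    tau: float = 0.005
    save_full_state: bool = False
    num_skills: int = 10              # DIAYN variants only
    single_skill: Optional[int] = None
    double_skill: Optional[int] = None
    disc_size: List[int] = field(default_factory=lambda: [64, 128])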