def __init__(self, config, ob_space, ac_space, actor, critic):
    super().__init__(config, ob_space)

    self._ac_space = ac_space  # e.g. ActionSpace(shape=OrderedDict([('default', 8)]), minimum=-1.0, maximum=1.0)

    # build up networks
    self._actor = actor(config, ob_space, ac_space, config.tanh_policy)  # actor model (MLP)
    self._old_actor = actor(config, ob_space, ac_space, config.tanh_policy)  # frozen snapshot of the pre-update policy for the PPO ratio
    self._critic = critic(config, ob_space)  # critic model (MLP)
    self._network_cuda(config.device)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic_optim = optim.Adam(self._critic.parameters(), lr=config.lr_critic)

    sampler = RandomSampler()
    buffer_keys = ['ob', 'ac', 'done', 'rew', 'ret', 'adv', 'ac_before_activation']
    self._buffer = ReplayBuffer(buffer_keys, config.buffer_size, sampler.sample_func)

    if config.is_chef:
        logger.info('Creating a PPO agent')
        logger.info('The actor has %d parameters', count_parameters(self._actor))
        logger.info('The critic has %d parameters', count_parameters(self._critic))
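
# Why the extra `_old_actor`: PPO's clipped surrogate weights the advantage by
# the ratio between the current policy and that frozen snapshot. A minimal,
# self-contained sketch of the objective (generic PyTorch, not this repo's
# update code; `logp`, `old_logp`, `adv`, and `clip_eps` are stand-ins):
import torch

def ppo_clipped_loss(logp, old_logp, adv, clip_eps=0.2):
    # ratio pi_new(a|s) / pi_old(a|s), computed in log space for stability
    ratio = torch.exp(logp - old_logp)
    # clip the ratio so one gradient step cannot move the policy too far
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    # pessimistic (min) objective, negated because optimizers minimize
    return -torch.min(ratio * adv, clipped * adv).mean()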

def __init__(self, config, ob_space, ac_space, actor, critic):
    super().__init__(config, ob_space)

    self._ob_space = ob_space
    self._ac_space = ac_space

    self._log_alpha = [torch.zeros(1, requires_grad=True, device=config.device)]
    self._alpha_optim = [optim.Adam([self._log_alpha[0]], lr=config.lr_actor)]

    # build up networks and their target copies
    self._actor = actor(self._config, self._ob_space, self._ac_space,
                        self._config.tanh_policy, deterministic=True)
    self._actor_target = actor(self._config, self._ob_space, self._ac_space,
                               self._config.tanh_policy, deterministic=True)
    self._actor_target.load_state_dict(self._actor.state_dict())
    self._critic = critic(config, ob_space, ac_space)
    self._critic_target = critic(config, ob_space, ac_space)
    self._critic_target.load_state_dict(self._critic.state_dict())
    self._network_cuda(config.device)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic_optim = optim.Adam(self._critic.parameters(), lr=config.lr_critic)

    sampler = RandomSampler()
    self._buffer = ReplayBuffer(config, sampler.sample_func, ob_space, ac_space)
    self._ounoise = OUNoise(action_size(ac_space))

    self._log_creation()
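
# `OUNoise` suggests temporally correlated exploration noise for the
# deterministic actor. A minimal sketch of an Ornstein-Uhlenbeck process
# (illustrative only; this repo's OUNoise may use different parameters):
import numpy as np

class OUNoiseSketch:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta  # pull strength back toward the mean
        self.sigma = sigma  # scale of the random perturbation
        self.state = self.mu.copy()

    def reset(self):
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); successive samples are correlated
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state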

def __init__(self, config, ob_space, ac_space, actor, critic):
    super().__init__(config, ob_space)

    self._ob_space = ob_space
    self._ac_space = ac_space

    self._target_entropy = -ac_space.size
    self._log_alpha = torch.zeros(1, requires_grad=True, device=config.device)
    self._alpha_optim = optim.Adam([self._log_alpha], lr=config.lr_actor)

    # build up networks
    self._build_actor(actor)
    self._critic1 = critic(config, ob_space, ac_space)
    self._critic2 = critic(config, ob_space, ac_space)

    # build up target networks
    self._critic1_target = critic(config, ob_space, ac_space)
    self._critic2_target = critic(config, ob_space, ac_space)
    self._critic1_target.load_state_dict(self._critic1.state_dict())
    self._critic2_target.load_state_dict(self._critic2.state_dict())
    self._network_cuda(config.device)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic1_optim = optim.Adam(self._critic1.parameters(), lr=config.lr_critic)
    self._critic2_optim = optim.Adam(self._critic2.parameters(), lr=config.lr_critic)

    sampler = RandomSampler()
    buffer_keys = ['ob', 'ac', 'done', 'rew']
    self._buffer = ReplayBuffer(buffer_keys, config.buffer_size, sampler.sample_func)

    self._log_creation()
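
# The critic targets start as exact copies (load_state_dict above) and are
# then typically tracked with a Polyak (soft) update each training step.
# A minimal sketch of that update (generic; `tau` in (0, 1] is a stand-in
# for whatever coefficient the config provides):
def soft_update(target, source, tau=0.005):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)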

def __init__(self, config, ob_space, ac_space, dqn):
    super().__init__(config, ob_space)

    self._ob_space = ob_space
    self._ac_space = ac_space

    # build up networks
    self._dqn = dqn(config, ob_space, ac_space)
    self._network_cuda(config.device)

    self._dqn_optim = optim.Adam(self._dqn.parameters(), lr=config.lr_actor)

    sampler = RandomSampler()
    self._buffer = ReplayBuffer(config.buffer_size, sampler.sample_func, ob_space, ac_space)
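
# A minimal sketch of the temporal-difference loss a DQN update computes from
# a sampled batch (generic PyTorch; `q_net` and `gamma` are stand-ins, and
# this agent may instead bootstrap from a separate target network):
import torch
import torch.nn.functional as F

def dqn_loss(q_net, ob, ac, rew, next_ob, done, gamma=0.99):
    q = q_net(ob).gather(1, ac.long().unsqueeze(1)).squeeze(1)  # Q(s, a) actually taken
    with torch.no_grad():
        next_q = q_net(next_ob).max(dim=1).values  # greedy bootstrap value
        target = rew + gamma * (1.0 - done) * next_q  # no bootstrap at terminals
    return F.smooth_l1_loss(q, target)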

def __init__(self, config, ob_space, ac_space, actor, critic):
    super().__init__(config, ob_space)

    self._ob_space = ob_space
    self._ac_space = ac_space

    self._log_alpha = torch.tensor(np.log(config.alpha), requires_grad=True, device=config.device)
    self._alpha_optim = optim.Adam([self._log_alpha], lr=config.lr_actor)

    # build up networks
    self._actor = actor(config, ob_space, ac_space, config.tanh_policy)
    self._critic1 = critic(config, ob_space, ac_space)
    self._critic2 = critic(config, ob_space, ac_space)
    self._target_entropy = -action_size(self._actor._ac_space)

    # build up target networks
    self._critic1_target = critic(config, ob_space, ac_space)
    self._critic2_target = critic(config, ob_space, ac_space)
    self._critic1_target.load_state_dict(self._critic1.state_dict())
    self._critic2_target.load_state_dict(self._critic2.state_dict())

    if config.policy == 'cnn':
        # reuse critic1's convolutional encoder for critic2 and the actor
        self._critic2.base.copy_conv_weights_from(self._critic1.base)
        self._actor.base.copy_conv_weights_from(self._critic1.base)

    if config.unsup_algo == 'curl':
        self._curl = CURL(config, ob_space, ac_space, self._critic1, self._critic1_target)
        self._encoder_optim = optim.Adam(self._critic1.base.parameters(), lr=config.lr_encoder)
        self._cpc_optim = optim.Adam(self._curl.parameters(), lr=config.lr_encoder)

    self._network_cuda(config.device)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic1_optim = optim.Adam(self._critic1.parameters(), lr=config.lr_critic)
    self._critic2_optim = optim.Adam(self._critic2.parameters(), lr=config.lr_critic)

    self._buffer = ReplayBuffer(config, ob_space, ac_space)
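
# CURL trains the image encoder with a contrastive loss: each anchor crop must
# score highest against its own augmented positive among the batch. A minimal
# sketch of the bilinear-similarity variant from the CURL paper (illustrative;
# `W` is the learned bilinear matrix, `z_a`/`z_pos` are encoded crops, and this
# repo's CURL module may organize the computation differently):
import torch
import torch.nn.functional as F

def curl_contrastive_loss(z_a, z_pos, W):
    # logits[i, j] = z_a[i]^T W z_pos[j]; the diagonal entries are the positives
    logits = z_a @ W @ z_pos.T
    logits = logits - logits.max(dim=1, keepdim=True).values  # numerical stability
    labels = torch.arange(z_a.shape[0], device=z_a.device)
    return F.cross_entropy(logits, labels)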

def __init__(self, config, ob_space, ac_space, actor, critic):
    super().__init__(config, ob_space)

    self._ac_space = ac_space

    # build up networks
    self._actor = actor(config, ob_space, ac_space, config.tanh_policy)
    self._old_actor = actor(config, ob_space, ac_space, config.tanh_policy)
    self._critic = critic(config, ob_space)
    self._network_cuda(config.device)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic_optim = optim.Adam(self._critic.parameters(), lr=config.lr_critic)

    sampler = RandomSampler()
    buffer_keys = ['ob', 'ac', 'done', 'rew', 'ret', 'adv', 'ac_before_activation']
    self._buffer = ReplayBuffer(buffer_keys, config.buffer_size, sampler.sample_func)

    if config.is_chef:
        logger.info('Creating a PPO agent')
        logger.info('The actor has %d parameters', count_parameters(self._actor))
        logger.info('The critic has %d parameters', count_parameters(self._critic))

def __init__(
    self,
    config,
    ob_space,
    ac_space,
    actor,
    critic,
    non_limited_idx=None,
    ref_joint_pos_indexes=None,
    joint_space=None,
    is_jnt_limited=None,
    jnt_indices=None,
):
    super().__init__(config, ob_space)

    self._ob_space = ob_space
    self._ac_space = ac_space
    self._jnt_indices = jnt_indices
    self._ref_joint_pos_indexes = ref_joint_pos_indexes
    self._joint_space = joint_space
    self._is_jnt_limited = is_jnt_limited
    if joint_space is not None:
        self._jnt_minimum = joint_space["default"].low
        self._jnt_maximum = joint_space["default"].high

    self._log_alpha = [torch.zeros(1, requires_grad=True, device=config.device)]
    self._alpha_optim = [optim.Adam([self._log_alpha[0]], lr=config.lr_actor)]

    # build up networks and their target copies
    self._actor = actor(
        self._config,
        self._ob_space,
        self._ac_space,
        self._config.tanh_policy,
        deterministic=True,
    )
    self._actor_target = actor(
        self._config,
        self._ob_space,
        self._ac_space,
        self._config.tanh_policy,
        deterministic=True,
    )
    self._actor_target.load_state_dict(self._actor.state_dict())
    self._critic1 = critic(config, ob_space, ac_space)
    self._critic2 = critic(config, ob_space, ac_space)
    self._critic1_target = critic(config, ob_space, ac_space)
    self._critic2_target = critic(config, ob_space, ac_space)
    self._critic1_target.load_state_dict(self._critic1.state_dict())
    self._critic2_target.load_state_dict(self._critic2.state_dict())
    self._network_cuda(config.device)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic1_optim = optim.Adam(self._critic1.parameters(), lr=config.lr_critic)
    self._critic2_optim = optim.Adam(self._critic2.parameters(), lr=config.lr_critic)

    self._update_steps = 0

    sampler = RandomSampler()
    buffer_keys = ["ob", "ac", "done", "rew"]
    if config.mopa or config.expand_ac_space:
        buffer_keys.append("intra_steps")
    self._buffer = ReplayBuffer(buffer_keys, config.buffer_size, sampler.sample_func)

    self._log_creation()

    self._planner = None
    self._is_planner_initialized = False
    if config.mopa:
        self._planner = PlannerAgent(
            config,
            ac_space,
            non_limited_idx,
            planner_type=config.planner_type,
            passive_joint_idx=config.passive_joint_idx,
            ignored_contacts=config.ignored_contact_geom_ids,
            is_simplified=config.is_simplified,
            simplified_duration=config.simplified_duration,
            allow_approximate=config.allow_approximate,
            range_=config.range,
        )
        self._simple_planner = PlannerAgent(
            config,
            ac_space,
            non_limited_idx,
            planner_type=config.simple_planner_type,
            passive_joint_idx=config.passive_joint_idx,
            ignored_contacts=config.ignored_contact_geom_ids,
            goal_bias=1.0,
            allow_approximate=False,
            is_simplified=config.simple_planner_simplified,
            simplified_duration=config.simple_planner_simplified_duration,
            range_=config.simple_planner_range,
        )
    self._omega = config.omega
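
# The deterministic target actor and twin target critics above are the
# ingredients of a TD3-style bootstrap target: the target actor proposes the
# next action (with clipped smoothing noise) and the two target critics are
# min-pooled. A hedged sketch (generic PyTorch; the noise and clip values are
# illustrative, and this repo's target computation may differ):
import torch

def td3_target(actor_target, critic1_target, critic2_target,
               next_ob, rew, done, gamma=0.99, noise_std=0.2, noise_clip=0.5):
    with torch.no_grad():
        next_ac = actor_target(next_ob)
        noise = (torch.randn_like(next_ac) * noise_std).clamp(-noise_clip, noise_clip)
        next_ac = (next_ac + noise).clamp(-1.0, 1.0)  # smoothed, bounded target action
        q_next = torch.min(critic1_target(next_ob, next_ac),
                           critic2_target(next_ob, next_ac))  # pessimistic twin-Q
        return rew + gamma * (1.0 - done) * q_next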

def __init__(
    self,
    config,
    ob_space,
    ac_space,
    actor,
    critic,
    non_limited_idx=None,
    ref_joint_pos_indexes=None,
    joint_space=None,
    is_jnt_limited=None,
    jnt_indices=None,
):
    super().__init__(config, ob_space)

    self._ob_space = ob_space
    self._ac_space = ac_space
    self._jnt_indices = jnt_indices
    self._ref_joint_pos_indexes = ref_joint_pos_indexes

    self._log_alpha = torch.tensor(np.log(config.alpha), requires_grad=True, device=config.device)
    self._alpha_optim = optim.Adam([self._log_alpha], lr=config.lr_actor)

    self._joint_space = joint_space
    self._is_jnt_limited = is_jnt_limited
    if joint_space is not None:
        self._jnt_minimum = joint_space["default"].low
        self._jnt_maximum = joint_space["default"].high

    # build up networks
    self._build_actor(actor)
    self._build_critic(critic)
    self._network_cuda(config.device)

    self._target_entropy = -action_size(self._actor._ac_space)

    self._actor_optim = optim.Adam(self._actor.parameters(), lr=config.lr_actor)
    self._critic1_optim = optim.Adam(self._critic1.parameters(), lr=config.lr_critic)
    self._critic2_optim = optim.Adam(self._critic2.parameters(), lr=config.lr_critic)

    sampler = RandomSampler()
    buffer_keys = ["ob", "ac", "meta_ac", "done", "rew"]
    if config.mopa or config.expand_ac_space:
        buffer_keys.append("intra_steps")
    self._buffer = ReplayBuffer(buffer_keys, config.buffer_size, sampler.sample_func)

    self._log_creation()

    self._planner = None
    self._is_planner_initialized = False
    if config.mopa:
        self._planner = PlannerAgent(
            config,
            ac_space,
            non_limited_idx,
            planner_type=config.planner_type,
            passive_joint_idx=config.passive_joint_idx,
            ignored_contacts=config.ignored_contact_geom_ids,
            is_simplified=config.is_simplified,
            simplified_duration=config.simplified_duration,
            range_=config.range,
        )
        self._simple_planner = PlannerAgent(
            config,
            ac_space,
            non_limited_idx,
            planner_type=config.simple_planner_type,
            passive_joint_idx=config.passive_joint_idx,
            ignored_contacts=config.ignored_contact_geom_ids,
            goal_bias=1.0,
            is_simplified=config.simple_planner_simplified,
            simplified_duration=config.simple_planner_simplified_duration,
            range_=config.simple_planner_range,
        )
    self._omega = config.omega
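
# `_omega` is the action-range threshold MoPA uses to decide between direct
# execution and motion planning: small joint-space displacements are executed
# directly, large ones are handed to the planner. A hedged sketch of that
# decision rule (illustrative only; `planner_plan` and `env_step` are
# hypothetical callables standing in for the planner rollout and a single
# environment step, and the repo's actual dispatch logic may differ):
import numpy as np

def dispatch_action(displacement, omega, planner_plan, env_step):
    # `displacement` is the joint-space action; `omega` comes from config.omega
    if np.max(np.abs(displacement)) > omega:
        return planner_plan(displacement)  # large action: plan a collision-free path
    return env_step(displacement)  # small action: execute directly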